]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/MDCache.cc
import 15.2.2 octopus source
[ceph.git] / ceph / src / mds / MDCache.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <errno.h>
16 #include <fstream>
17 #include <iostream>
18 #include <sstream>
19 #include <string>
20 #include <string_view>
21 #include <map>
22
23 #include "MDCache.h"
24 #include "MDSRank.h"
25 #include "Server.h"
26 #include "Locker.h"
27 #include "MDLog.h"
28 #include "MDBalancer.h"
29 #include "Migrator.h"
30 #include "ScrubStack.h"
31
32 #include "SnapClient.h"
33
34 #include "MDSMap.h"
35
36 #include "CInode.h"
37 #include "CDir.h"
38
39 #include "Mutation.h"
40
41 #include "include/ceph_fs.h"
42 #include "include/filepath.h"
43 #include "include/util.h"
44
45 #include "messages/MClientCaps.h"
46
47 #include "msg/Message.h"
48 #include "msg/Messenger.h"
49
50 #include "common/MemoryModel.h"
51 #include "common/errno.h"
52 #include "common/perf_counters.h"
53 #include "common/safe_io.h"
54
55 #include "osdc/Journaler.h"
56 #include "osdc/Filer.h"
57
58 #include "events/ESubtreeMap.h"
59 #include "events/EUpdate.h"
60 #include "events/ESlaveUpdate.h"
61 #include "events/EImportFinish.h"
62 #include "events/EFragment.h"
63 #include "events/ECommitted.h"
64 #include "events/EPurged.h"
65 #include "events/ESessions.h"
66
67 #include "InoTable.h"
68
69 #include "common/Timer.h"
70
71 #include "perfglue/heap_profiler.h"
72
73
74 #include "common/config.h"
75 #include "include/ceph_assert.h"
76
77 #define dout_context g_ceph_context
78 #define dout_subsys ceph_subsys_mds
79 #undef dout_prefix
80 #define dout_prefix _prefix(_dout, mds)
81 static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
82 return *_dout << "mds." << mds->get_nodeid() << ".cache ";
83 }
84
85 set<int> SimpleLock::empty_gather_set;
86
87
88 /**
89 * All non-I/O contexts that require a reference
90 * to an MDCache instance descend from this.
91 */
92 class MDCacheContext : public virtual MDSContext {
93 protected:
94 MDCache *mdcache;
95 MDSRank *get_mds() override
96 {
97 ceph_assert(mdcache != NULL);
98 return mdcache->mds;
99 }
100 public:
101 explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
102 };
103
104
105 /**
106 * Only for contexts called back from an I/O completion
107 *
108 * Note: duplication of members wrt MDCacheContext, because
109 * it'ls the lesser of two evils compared with introducing
110 * yet another piece of (multiple) inheritance.
111 */
112 class MDCacheIOContext : public virtual MDSIOContextBase {
113 protected:
114 MDCache *mdcache;
115 MDSRank *get_mds() override
116 {
117 ceph_assert(mdcache != NULL);
118 return mdcache->mds;
119 }
120 public:
121 explicit MDCacheIOContext(MDCache *mdc_, bool track=true) :
122 MDSIOContextBase(track), mdcache(mdc_) {}
123 };
124
125 class MDCacheLogContext : public virtual MDSLogContextBase {
126 protected:
127 MDCache *mdcache;
128 MDSRank *get_mds() override
129 {
130 ceph_assert(mdcache != NULL);
131 return mdcache->mds;
132 }
133 public:
134 explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
135 };
136
/*
 * Construct the cache for rank `m`.  Reads the cache tunables from the
 * config (kept up to date later by handle_conf_change()) and spawns the
 * background "upkeep" thread, which periodically trims the cache and
 * releases free heap memory.
 */
MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
  mds(m),
  open_file_table(m),
  filer(m->objecter, m->finisher),
  stray_manager(m, purge_queue_),
  recovery_queue(m),
  trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate"))
{
  migrator.reset(new Migrator(mds, this));

  // per-commit dir write cap; when unset, fall back to 90% of the OSD
  // max write size (both config values are in MiB, hence << 20)
  max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
    (g_conf()->mds_dir_max_commit_size << 20) :
    (0.9 *(g_conf()->osd_max_write_size << 20));

  // cache sizing / health tunables
  cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
  cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
  cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
  forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");

  lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));

  // bottom_lru gets no top/bottom split (midpoint 0)
  bottom_lru.lru_set_midpoint(0);

  decayrate.set_halflife(g_conf()->mds_decay_halflife);

  // Upkeep thread: loops until upkeep_trim_shutdown is set, re-reading
  // the trim/release intervals from config on every pass so changes
  // take effect without a restart.
  upkeeper = std::thread([this]() {
    std::unique_lock lock(upkeep_mutex);
    while (!upkeep_trim_shutdown.load()) {
      auto now = clock::now();
      auto since = now-upkeep_last_trim;
      auto trim_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_trim_interval"));
      if (since >= trim_interval*.90) {
        // lock order is mds_lock -> upkeep_mutex, so drop our lock
        // before acquiring mds_lock, then re-take it and re-check the
        // shutdown flag (it may have been set while unlocked)
        lock.unlock(); /* mds_lock -> upkeep_mutex */
        std::scoped_lock mds_lock(mds->mds_lock);
        lock.lock();
        if (upkeep_trim_shutdown.load())
          return;
        if (mds->is_cache_trimmable()) {
          dout(20) << "upkeep thread trimming cache; last trim " << since << " ago" << dendl;
          trim_client_leases();
          trim();
          check_memory_usage();
          auto flags = Server::RecallFlags::ENFORCE_MAX|Server::RecallFlags::ENFORCE_LIVENESS;
          mds->server->recall_client_state(nullptr, flags);
          upkeep_last_trim = now = clock::now();
        } else {
          dout(10) << "cache not ready for trimming" << dendl;
        }
      } else {
        trim_interval -= since;   // time left until the next trim is due
      }
      since = now-upkeep_last_release;
      auto release_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_release_free_interval"));
      if (since >= release_interval) {
        /* XXX not necessary once MDCache uses PriorityCache */
        dout(10) << "releasing free memory" << dendl;
        ceph_heap_release_free_memory();
        upkeep_last_release = clock::now();
      } else {
        release_interval -= since;   // time left until the next release
      }
      // sleep until whichever deadline comes first, or until notified
      auto interval = std::min(release_interval, trim_interval);
      dout(20) << "upkeep thread waiting interval " << interval << dendl;
      upkeep_cvar.wait_for(lock, interval);
    }
  });
}
204
205 MDCache::~MDCache()
206 {
207 if (logger) {
208 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
209 }
210 if (upkeeper.joinable())
211 upkeeper.join();
212 }
213
214 void MDCache::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mdsmap)
215 {
216 if (changed.count("mds_cache_memory_limit"))
217 cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
218 if (changed.count("mds_cache_reservation"))
219 cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
220 if (changed.count("mds_health_cache_threshold"))
221 cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
222 if (changed.count("mds_cache_mid"))
223 lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
224 if (changed.count("mds_cache_trim_decay_rate")) {
225 trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
226 }
227 if (changed.count("mds_forward_all_requests_to_auth")){
228 forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
229 }
230
231 migrator->handle_conf_change(changed, mdsmap);
232 mds->balancer->handle_conf_change(changed, mdsmap);
233 }
234
235 void MDCache::log_stat()
236 {
237 mds->logger->set(l_mds_inodes, lru.lru_get_size());
238 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
239 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
240 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
241 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
242 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
243 mds->logger->set(l_mds_caps, Capability::count());
244 if (root) {
245 mds->logger->set(l_mds_root_rfiles, root->inode.rstat.rfiles);
246 mds->logger->set(l_mds_root_rbytes, root->inode.rstat.rbytes);
247 mds->logger->set(l_mds_root_rsnaps, root->inode.rstat.rsnaps);
248 }
249 }
250
251
252 //
253
254 bool MDCache::shutdown()
255 {
256 {
257 std::scoped_lock lock(upkeep_mutex);
258 upkeep_trim_shutdown = true;
259 upkeep_cvar.notify_one();
260 }
261 if (lru.lru_get_size() > 0) {
262 dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
263 //show_cache();
264 show_subtrees();
265 //dump();
266 }
267 return true;
268 }
269
270
271 // ====================================================================
272 // some inode functions
273
274 void MDCache::add_inode(CInode *in)
275 {
276 // add to lru, inode map
277 if (in->last == CEPH_NOSNAP) {
278 auto &p = inode_map[in->ino()];
279 ceph_assert(!p); // should be no dup inos!
280 p = in;
281 } else {
282 auto &p = snap_inode_map[in->vino()];
283 ceph_assert(!p); // should be no dup inos!
284 p = in;
285 }
286
287 if (in->ino() < MDS_INO_SYSTEM_BASE) {
288 if (in->ino() == MDS_INO_ROOT)
289 root = in;
290 else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
291 myin = in;
292 else if (in->is_stray()) {
293 if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
294 strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
295 }
296 }
297 if (in->is_base())
298 base_inodes.insert(in);
299 }
300
301 if (cache_toofull()) {
302 exceeded_size_limit = true;
303 }
304 }
305
/*
 * Unlink `o` from all cache structures and delete it.  Preconditions:
 * the inode's ref count is 0 and its parent dentry (if any) is clean.
 */
void MDCache::remove_inode(CInode *o)
{
  dout(14) << "remove_inode " << *o << dendl;

  if (o->get_parent_dn()) {
    // FIXME: multiple parents?
    CDentry *dn = o->get_parent_dn();
    ceph_assert(!dn->is_dirty());
    dn->dir->unlink_inode(dn); // leave dentry ... FIXME?
  }

  // clear any residual dirty state before teardown
  if (o->is_dirty())
    o->mark_clean();
  if (o->is_dirty_parent())
    o->clear_dirty_parent();

  o->clear_scatter_dirty();

  o->item_open_file.remove_myself();

  // drop from the export-pin work queues, if queued
  if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
    export_pin_queue.erase(o);

  if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN))
    export_pin_delayed_queue.erase(o);

  // remove from inode map
  if (o->last == CEPH_NOSNAP) {
    inode_map.erase(o->ino());
  } else {
    o->item_caps.remove_myself();
    snap_inode_map.erase(o->vino());
  }

  // clear the special-inode bookkeeping (mirror of add_inode())
  if (o->ino() < MDS_INO_SYSTEM_BASE) {
    if (o == root) root = 0;
    if (o == myin) myin = 0;
    if (o->is_stray()) {
      if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
      }
    }
    if (o->is_base())
      base_inodes.erase(o);
  }

  // delete it
  ceph_assert(o->get_num_ref() == 0);
  delete o;
}
356
357 file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
358 {
359 file_layout_t result = file_layout_t::get_default();
360 result.pool_id = mdsmap.get_first_data_pool();
361 return result;
362 }
363
364 file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
365 {
366 file_layout_t result = file_layout_t::get_default();
367 result.pool_id = mdsmap.get_metadata_pool();
368 if (g_conf()->mds_log_segment_size > 0) {
369 result.object_size = g_conf()->mds_log_segment_size;
370 result.stripe_unit = g_conf()->mds_log_segment_size;
371 }
372 return result;
373 }
374
375 void MDCache::init_layouts()
376 {
377 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
378 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
379 }
380
/*
 * Initialize `in` as a system inode (root, mdsdir, stray, ...) with
 * inode number `ino` and the given mode bits.  The inode is NOT linked
 * into any directory here.
 */
void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
                                           int mode) const
{
  in->inode.ino = ino;
  in->inode.version = 1;
  in->inode.xattr_version = 1;
  in->inode.mode = 0500 | mode;   // owner r-x, plus caller's type/mode bits
  in->inode.size = 0;
  in->inode.ctime =
    in->inode.mtime =
    in->inode.btime = ceph_clock_now();
  in->inode.nlink = 1;
  in->inode.truncate_size = -1ull;
  in->inode.change_attr = 0;
  in->inode.export_pin = MDS_RANK_NONE;

  // FIPS zeroization audit 20191117: this memset is not security related.
  memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
  if (in->inode.is_dir()) {
    in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
    in->inode.rstat.rsubdirs = 1; /* itself */
    in->inode.rstat.rctime = in->inode.ctime;
  } else {
    in->inode.layout = default_file_layout;
    ++in->inode.rstat.rfiles;
  }
  // freshly created: accounted stats match live stats
  in->inode.accounted_rstat = in->inode.rstat;

  if (in->is_base()) {
    // base inodes (root, per-rank mdsdirs) carry their own inode auth
    // and their own snaprealm; non-root base inos encode the owning
    // rank as (ino - MDS_INO_MDSDIR_OFFSET)
    if (in->is_root())
      in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
    else
      in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
    in->open_snaprealm();  // empty snaprealm
    ceph_assert(!in->snaprealm->parent); // created its own
    in->snaprealm->srnode.seq = 1;
  }
}
419
/*
 * Allocate a new CInode, initialize it as a system inode with the given
 * ino/mode, register it in the cache, and return it.
 */
CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
{
  dout(0) << "creating system inode with ino:" << ino << dendl;
  CInode *in = new CInode(this);
  create_unlinked_system_inode(in, ino, mode);
  add_inode(in);
  return in;
}
428
429 CInode *MDCache::create_root_inode()
430 {
431 CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
432 i->inode.uid = g_conf()->mds_root_ino_uid;
433 i->inode.gid = g_conf()->mds_root_ino_gid;
434 i->inode.layout = default_file_layout;
435 i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
436 return i;
437 }
438
/*
 * Build the in-memory root hierarchy for a brand-new filesystem and
 * queue the initial commit/flush I/O on `gather`.
 */
void MDCache::create_empty_hierarchy(MDSGather *gather)
{
  // create root dir
  CInode *root = create_root_inode();

  // force empty root dir
  CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(rootdir, mds->get_nodeid());
  rootdir->dir_rep = CDir::REP_ALL; //NONE;

  // fresh dir: live and accounted stats must already agree
  ceph_assert(rootdir->fnode.accounted_fragstat == rootdir->fnode.fragstat);
  ceph_assert(rootdir->fnode.fragstat == root->inode.dirstat);
  ceph_assert(rootdir->fnode.accounted_rstat == rootdir->fnode.rstat);
  /* Do no update rootdir rstat information of the fragment, rstat upkeep magic
   * assume version 0 is stale/invalid.
   */

  rootdir->mark_complete();
  rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
  rootdir->commit(0, gather->new_sub());

  // NOTE(review): mark_clean() immediately before mark_dirty() appears
  // to reset creation-time dirty state before establishing the
  // journaled dirty epoch -- confirm.
  root->mark_clean();
  root->mark_dirty(root->pre_dirty(), mds->mdlog->get_current_segment());
  root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
  root->flush(gather->new_sub());
}
465
/*
 * Build this rank's ~mdsN hierarchy (the mdsdir plus NUM_STRAY stray
 * directories) for a brand-new filesystem, queuing the commit/store
 * I/O on `gather`.
 */
void MDCache::create_mydir_hierarchy(MDSGather *gather)
{
  // create mds dir
  CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);

  CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(mydir, mds->get_nodeid());

  LogSegment *ls = mds->mdlog->get_current_segment();

  // stray dirs: strayN inode + dirfrag each, linked as "stray<i>"
  for (int i = 0; i < NUM_STRAY; ++i) {
    CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
    CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
    stringstream name;
    name << "stray" << i;
    CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
    sdn->_mark_dirty(mds->mdlog->get_current_segment());

    stray->inode.dirstat = straydir->fnode.fragstat;

    // account the new subdir into mydir's stats
    mydir->fnode.rstat.add(stray->inode.rstat);
    mydir->fnode.fragstat.nsubdirs++;
    // save them
    straydir->mark_complete();
    straydir->mark_dirty(straydir->pre_dirty(), ls);
    straydir->commit(0, gather->new_sub());
    stray->mark_dirty_parent(ls, true);
    stray->store_backtrace(gather->new_sub());
  }

  // everything was created just now; accounted == live
  mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
  mydir->fnode.accounted_rstat = mydir->fnode.rstat;

  myin->inode.dirstat = mydir->fnode.fragstat;
  myin->inode.rstat = mydir->fnode.rstat;
  ++myin->inode.rstat.rsubdirs;   // count myin itself
  myin->inode.accounted_rstat = myin->inode.rstat;

  mydir->mark_complete();
  mydir->mark_dirty(mydir->pre_dirty(), ls);
  mydir->commit(0, gather->new_sub());

  myin->store(gather->new_sub());
}
511
512 struct C_MDC_CreateSystemFile : public MDCacheLogContext {
513 MutationRef mut;
514 CDentry *dn;
515 version_t dpv;
516 MDSContext *fin;
517 C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSContext *f) :
518 MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
519 void finish(int r) override {
520 mdcache->_create_system_file_finish(mut, dn, dpv, fin);
521 }
522 };
523
/*
 * Link system inode `in` into `dir` under `name`, journaling the
 * create.  `fin` is completed from _create_system_file_finish() once
 * the journal entry is durable.
 */
void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin)
{
  dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
  CDentry *dn = dir->add_null_dentry(name);

  dn->push_projected_linkage(in);
  version_t dpv = dn->pre_dirty();

  // for directories, also open and pre-dirty the (single) dirfrag
  CDir *mdir = 0;
  if (in->inode.is_dir()) {
    in->inode.rstat.rsubdirs = 1;

    mdir = in->get_or_open_dirfrag(this, frag_t());
    mdir->mark_complete();
    mdir->pre_dirty();
  } else
    in->inode.rstat.rfiles = 1;
  in->inode.version = dn->pre_dirty();

  // new dentry/inode start at the realm's next snap seq
  SnapRealm *realm = dir->get_inode()->find_snaprealm();
  dn->first = in->first = realm->get_newest_seq() + 1;

  MutationRef mut(new MutationImpl());

  // force some locks. hacky.
  mds->locker->wrlock_force(&dir->inode->filelock, mut);
  mds->locker->wrlock_force(&dir->inode->nestlock, mut);

  mut->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "create system file");
  mds->mdlog->start_entry(le);

  if (!in->is_mdsdir()) {
    // normal case: journal as a primary dentry
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
    le->metablob.add_primary_dentry(dn, in, true);
  } else {
    // mdsdir: journal as a remote dentry plus the root inode record
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
    journal_dirty_inode(mut.get(), &le->metablob, in);
    dn->push_projected_linkage(in->ino(), in->d_type());
    le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
    le->metablob.add_root(true, in);
  }
  if (mdir)
    le->metablob.add_new_dir(mdir); // dirty AND complete AND new

  mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
  mds->mdlog->flush();
}
572
/*
 * Journal-durable half of _create_system_file(): apply the projected
 * linkage, mark everything dirty in the mutation's log segment, drop
 * the forced locks, and complete the caller's context.
 */
void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin)
{
  dout(10) << "_create_system_file_finish " << *dn << dendl;

  dn->pop_projected_linkage();
  dn->mark_dirty(dpv, mut->ls);

  CInode *in = dn->get_linkage()->get_inode();
  // decrement then mark_dirty(version + 1): re-uses the version that
  // was pre-dirtied in _create_system_file()
  in->inode.version--;
  in->mark_dirty(in->inode.version + 1, mut->ls);

  if (in->inode.is_dir()) {
    CDir *dir = in->get_dirfrag(frag_t());
    ceph_assert(dir);
    dir->mark_dirty(1, mut->ls);
    dir->mark_new(mut->ls);
  }

  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  fin->complete(0);

  //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
  //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
}
600
601
602
603 struct C_MDS_RetryOpenRoot : public MDSInternalContext {
604 MDCache *cache;
605 explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
606 void finish(int r) override {
607 if (r < 0) {
608 // If we can't open root, something disastrous has happened: mark
609 // this rank damaged for operator intervention. Note that
610 // it is not okay to call suicide() here because we are in
611 // a Finisher callback.
612 cache->mds->damaged();
613 ceph_abort(); // damaged should never return
614 } else {
615 cache->open_root();
616 }
617 }
618 };
619
620 void MDCache::open_root_inode(MDSContext *c)
621 {
622 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
623 CInode *in;
624 in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
625 in->fetch(c);
626 } else {
627 discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
628 }
629 }
630
631 void MDCache::open_mydir_inode(MDSContext *c)
632 {
633 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
634 in->fetch(c);
635 }
636
637 void MDCache::open_mydir_frag(MDSContext *c)
638 {
639 open_mydir_inode(
640 new MDSInternalContextWrapper(mds,
641 new LambdaContext([this, c](int r) {
642 if (r < 0) {
643 c->complete(r);
644 return;
645 }
646 CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
647 ceph_assert(mydir);
648 adjust_subtree_auth(mydir, mds->get_nodeid());
649 mydir->fetch(c);
650 })
651 )
652 );
653 }
654
/*
 * Drive the multi-step root open.  Each asynchronous step re-enters
 * this function via C_MDS_RetryOpenRoot until everything is in place:
 * root inode, root dirfrag, mydir inode, mydir dirfrag; then
 * populate_mydir() takes over.
 */
void MDCache::open_root()
{
  dout(10) << "open_root" << dendl;

  // step 1: root inode
  if (!root) {
    open_root_inode(new C_MDS_RetryOpenRoot(this));
    return;
  }
  // step 2: root dirfrag (fetch locally if auth, else open remotely)
  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
    ceph_assert(root->is_auth());
    CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
    ceph_assert(rootdir);
    if (!rootdir->is_subtree_root())
      adjust_subtree_auth(rootdir, mds->get_nodeid());
    if (!rootdir->is_complete()) {
      rootdir->fetch(new C_MDS_RetryOpenRoot(this));
      return;
    }
  } else {
    ceph_assert(!root->is_auth());
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (!rootdir) {
      open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
      return;
    }
  }

  // step 3: our own ~mdsN inode (initially inaccurate placeholder)
  if (!myin) {
    CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
    in->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }
  // step 4: mydir dirfrag, then populate it
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  ceph_assert(mydir);
  adjust_subtree_auth(mydir, mds->get_nodeid());

  populate_mydir();
}
693
/*
 * Ensure mydir (~mdsN) is complete and all stray directories exist,
 * are pinned, and have their dirfrags loaded; then mark the cache
 * open and kick the stray manager.  Like open_root(), this re-enters
 * itself via C_MDS_RetryOpenRoot after every asynchronous step.
 */
void MDCache::populate_mydir()
{
  ceph_assert(myin);
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  ceph_assert(mydir);

  dout(10) << "populate_mydir " << *mydir << dendl;

  if (!mydir->is_complete()) {
    mydir->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }

  if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
    // A missing dirfrag, we will recreate it. Before that, we must dirty
    // it before dirtying any of the strays we create within it.
    mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
                         "recreating it now";
    LogSegment *ls = mds->mdlog->get_current_segment();
    mydir->state_clear(CDir::STATE_BADFRAG);
    mydir->mark_complete();
    mydir->mark_dirty(mydir->pre_dirty(), ls);
  }

  // open or create stray
  uint64_t num_strays = 0;
  for (int i = 0; i < NUM_STRAY; ++i) {
    stringstream name;
    name << "stray" << i;
    CDentry *straydn = mydir->lookup(name.str());

    // allow for older fs's with stray instead of stray0
    if (straydn == NULL && i == 0)
      straydn = mydir->lookup("stray");

    if (!straydn || !straydn->get_linkage()->get_inode()) {
      // missing: create it, journal it, and restart this method
      _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
                          new C_MDS_RetryOpenRoot(this));
      return;
    }
    ceph_assert(straydn);
    ceph_assert(strays[i]);
    // we make multiple passes through this method; make sure we only pin each stray once.
    if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->get(CInode::PIN_STRAY);
      strays[i]->state_set(CInode::STATE_STRAYPINNED);
      strays[i]->get_stickydirs();
    }
    dout(20) << " stray num " << i << " is " << *strays[i] << dendl;

    // open all frags
    frag_vec_t leaves;
    strays[i]->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = strays[i]->get_dirfrag(leaf);
      if (!dir) {
        dir = strays[i]->get_or_open_dirfrag(this, leaf);
      }

      // DamageTable applies special handling to strays: it will
      // have damaged() us out if one is damaged.
      ceph_assert(!dir->state_test(CDir::STATE_BADFRAG));

      if (dir->get_version() == 0) {
        dir->fetch(new C_MDS_RetryOpenRoot(this));
        return;
      }

      if (dir->get_frag_size() > 0)
        num_strays += dir->get_frag_size();
    }
  }

  // okay!
  dout(10) << "populate_mydir done" << dendl;
  ceph_assert(!open);
  open = true;
  mds->queue_waiters(waiting_for_open);

  stray_manager.set_num_strays(num_strays);
  stray_manager.activate();

  scan_stray_dir();
}
778
/*
 * Fetch another rank's ~mdsN inode via discover.  The owning rank is
 * recovered from the low bits of the inode number.  NOTE(review):
 * assumes MAX_MDS is a power of two so the mask extracts the rank --
 * confirm against the mdsdir ino encoding.
 */
void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin)
{
  discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
}
783
784 CDir *MDCache::get_stray_dir(CInode *in)
785 {
786 string straydname;
787 in->name_stray_dentry(straydname);
788
789 CInode *strayi = get_stray();
790 ceph_assert(strayi);
791 frag_t fg = strayi->pick_dirfrag(straydname);
792 CDir *straydir = strayi->get_dirfrag(fg);
793 ceph_assert(straydir);
794 return straydir;
795 }
796
797 CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
798 {
799 CDir *straydir = get_stray_dir(in);
800 string straydname;
801 in->name_stray_dentry(straydname);
802 CDentry *straydn = straydir->lookup(straydname);
803 if (!straydn) {
804 straydn = straydir->add_null_dentry(straydname);
805 straydn->mark_new();
806 } else {
807 ceph_assert(straydn->get_projected_linkage()->is_null());
808 }
809
810 straydn->state_set(CDentry::STATE_STRAY);
811 return straydn;
812 }
813
814
815
816 MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info)
817 {
818 // inode?
819 if (info.ino)
820 return get_inode(info.ino, info.snapid);
821
822 // dir or dentry.
823 CDir *dir = get_dirfrag(info.dirfrag);
824 if (!dir) return 0;
825
826 if (info.dname.length())
827 return dir->lookup(info.dname, info.snapid);
828 else
829 return dir;
830 }
831
832
833
834
835 // ====================================================================
836 // subtree management
837
838 /*
839 * adjust the dir_auth of a subtree.
840 * merge with parent and/or child subtrees, if is it appropriate.
841 * merge can ONLY happen if both parent and child have unambiguous auth.
842 */
/*
 * Set the dir_auth of the subtree rooted at `dir`.  If `dir` is not
 * yet a subtree root, a new subtree is carved out of its enclosing
 * one: nested bounds are reparented under `dir`, and `dir` becomes a
 * bound of its former root.  With `adjust_pop`, `dir`'s auth-subtree
 * popularity is subtracted from its ancestors up to the subtree root.
 */
void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
{
  dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir << dendl;

  show_subtrees();

  CDir *root;
  if (dir->inode->is_base()) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];   // default-construct an empty bounds set
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir];      // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    // (iterate with a saved `next` because we erase as we go)
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        // move under me
        dout(10) << " claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;

    // adjust recursive pop counters
    if (adjust_pop && dir->is_auth()) {
      CDir *p = dir->get_parent_dir();
      while (p) {
        p->pop_auth_subtree.sub(dir->pop_auth_subtree);
        if (p->is_subtree_root()) break;
        p = p->inode->get_parent_dir();
      }
    }
  }

  show_subtrees();
}
910
911
912 void MDCache::try_subtree_merge(CDir *dir)
913 {
914 dout(7) << "try_subtree_merge " << *dir << dendl;
915 // record my old bounds
916 auto oldbounds = subtrees.at(dir);
917
918 set<CInode*> to_eval;
919 // try merge at my root
920 try_subtree_merge_at(dir, &to_eval);
921
922 // try merge at my old bounds
923 for (auto bound : oldbounds)
924 try_subtree_merge_at(bound, &to_eval);
925
926 if (!(mds->is_any_replay() || mds->is_resolve())) {
927 for(auto in : to_eval)
928 eval_subtree_root(in);
929 }
930 }
931
932 class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
933 CInode *in;
934 MutationRef mut;
935 public:
936 C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
937 void finish(int r) override {
938 mdcache->subtree_merge_writebehind_finish(in, mut);
939 }
940 };
941
/*
 * Merge the subtree at `dir` into its parent subtree if both have the
 * same unambiguous auth.  No-op when `dir`'s auth is ambiguous
 * (second != CDIR_AUTH_UNKNOWN) or it is an export bound / aux
 * subtree.  On merge, `dir`'s bounds move to the parent, `dir`'s
 * popularity is folded back into its ancestors (when `adjust_pop`),
 * and its auth inode is queued on `to_eval` for lock re-evaluation.
 */
void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
{
  dout(10) << "try_subtree_merge_at " << *dir << dendl;

  if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
      dir->state_test(CDir::STATE_EXPORTBOUND) ||
      dir->state_test(CDir::STATE_AUXSUBTREE))
    return;

  auto it = subtrees.find(dir);
  ceph_assert(it != subtrees.end());

  // merge with parent?
  CDir *parent = dir;
  if (!dir->inode->is_base())
    parent = get_subtree_root(dir->get_parent_dir());

  if (parent != dir &&                        // we have a parent,
      parent->dir_auth == dir->dir_auth) {    // auth matches,
    // merge with parent.
    dout(10) << " subtree merge at " << *dir << dendl;
    dir->set_dir_auth(CDIR_AUTH_DEFAULT);

    // move our bounds under the parent
    subtrees[parent].insert(it->second.begin(), it->second.end());

    // we are no longer a subtree or bound
    dir->put(CDir::PIN_SUBTREE);
    subtrees.erase(it);
    subtrees[parent].erase(dir);

    // adjust popularity?
    if (adjust_pop && dir->is_auth()) {
      CDir *cur = dir;
      CDir *p = dir->get_parent_dir();
      while (p) {
        p->pop_auth_subtree.add(dir->pop_auth_subtree);
        p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
        if (p->is_subtree_root()) break;
        cur = p;
        p = p->inode->get_parent_dir();
      }
    }

    if (to_eval && dir->get_inode()->is_auth())
      to_eval->insert(dir->get_inode());

    show_subtrees(15);
  }
}
992
/*
 * Completion for C_MDC_SubtreeMergeWB: commit the projected inode,
 * apply the mutation, release its locks, and drop the auth pin taken
 * for the writeback.
 */
void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
{
  dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
  in->pop_and_dirty_projected_inode(mut->ls);

  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  in->auth_unpin(this);
}
1004
/*
 * Re-evaluate the scatter locks of a subtree root inode after a merge.
 * `diri` must be auth here.
 */
void MDCache::eval_subtree_root(CInode *diri)
{
  // evaluate subtree inode filelock?
  //  (we should scatter the filelock on subtree bounds)
  ceph_assert(diri->is_auth());
  mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
}
1012
1013
/*
 * Make 'dir' the root of a subtree with authority 'auth' whose bounds are
 * exactly 'bounds'.  Any subtrees lying between dir and the bounds are
 * swallowed (re-authed to 'auth' and merged), and bounds that do not yet
 * exist as subtree roots are created with the previous authority.
 * Inode locks of any merged-away subtree roots are re-evaluated at the
 * end (unless we are replaying/resolving).
 */
void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
	  << " on " << *dir
	  << " bounds " << bounds
	  << dendl;

  show_subtrees();

  // find the subtree root that currently contains dir
  CDir *root;
  if (dir->ino() == MDS_INO_ROOT) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];  // instantiate an empty bound set for the fs root
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  mds_authority_t oldauth = dir->authority();

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir];      // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      // grab the successor first: erase(p) below invalidates p
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
	// move under me
	dout(10) << " claiming child bound " << **p << dendl;
	subtrees[dir].insert(*p);
	subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;
  }

  // subtree-root inodes merged away below; their locks get eval'd at the end
  set<CInode*> to_eval;

  // verify/adjust bounds.
  // - these may be new, or
  // - beneath existing ambiguous bounds (which will be collapsed),
  // - but NOT beneath unambiguous bounds.
  for (const auto& bound : bounds) {
    // new bound?
    if (subtrees[dir].count(bound) == 0) {
      if (get_subtree_root(bound) == dir) {
	dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
	adjust_subtree_auth(bound, oldauth);       // otherwise, adjust at bound.
      }
      else {
	dout(10) << " want bound " << *bound << dendl;
	CDir *t = get_subtree_root(bound->get_parent_dir());
	// make the bound a subtree root itself if it isn't one yet
	if (subtrees[t].count(bound) == 0) {
	  ceph_assert(t != dir);
	  dout(10) << " new bound " << *bound << dendl;
	  adjust_subtree_auth(bound, t->authority());
	}
	// make sure it's nested beneath ambiguous subtree(s)
	while (1) {
	  // climb until t is a direct bound of dir, then swallow it;
	  // repeat until the bound's enclosing subtree is dir itself
	  while (subtrees[dir].count(t) == 0)
	    t = get_subtree_root(t->get_parent_dir());
	  dout(10) << " swallowing intervening subtree at " << *t << dendl;
	  adjust_subtree_auth(t, auth);
	  try_subtree_merge_at(t, &to_eval);
	  t = get_subtree_root(bound->get_parent_dir());
	  if (t == dir) break;
	}
      }
    }
    else {
      dout(10) << " already have bound " << *bound << dendl;
    }
  }
  // merge stray bounds?
  while (!subtrees[dir].empty()) {
    // iterate over a copy: try_subtree_merge_at() mutates subtrees[dir]
    set<CDir*> copy = subtrees[dir];
    for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
      if (bounds.count(*p) == 0) {
	CDir *stray = *p;
	dout(10) << " swallowing extra subtree at " << *stray << dendl;
	adjust_subtree_auth(stray, auth);
	try_subtree_merge_at(stray, &to_eval);
      }
    }
    // swallowing subtree may add new subtree bounds
    if (copy == subtrees[dir])
      break;
  }

  // bound should now match.
  verify_subtree_bounds(dir, bounds);

  show_subtrees();

  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for(auto in : to_eval)
      eval_subtree_root(in);
  }
}
1136
1137
1138 /*
1139 * return a set of CDir*'s that correspond to the given bound set. Only adjust
1140 * fragmentation as necessary to get an equivalent bounding set. That is, only
1141 * split if one of our frags spans the provided bounding set. Never merge.
1142 */
void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
{
  dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;

  // sort by ino
  map<inodeno_t, fragset_t> byino;
  for (auto& frag : dfs) {
    byino[frag.ino].insert_raw(frag.frag);
  }
  dout(10) << " by ino: " << byino << dendl;

  for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
    p->second.simplify();
    CInode *diri = get_inode(p->first);
    if (!diri)
      continue;  // inode not in cache; nothing to bound here
    dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;

    // build a reference fragtree where each requested frag is a leaf
    fragtree_t tmpdft;
    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
      tmpdft.force_to_leaf(g_ceph_context, *q);

    for (const auto& fg : p->second) {
      frag_vec_t leaves;
      diri->dirfragtree.get_leaves_under(fg, leaves);
      if (leaves.empty()) {
	// our local tree has no leaves under fg: the local frag covering
	// fg is coarser than requested.  split it only where the request
	// demands it (never merge).
	bool all = true;
	frag_t approx_fg = diri->dirfragtree[fg.value()];
	frag_vec_t approx_leaves;
	tmpdft.get_leaves_under(approx_fg, approx_leaves);
	for (const auto& leaf : approx_leaves) {
	  if (p->second.get().count(leaf) == 0) {
	    // not bound, so the resolve message is from auth MDS of the dirfrag
	    force_dir_fragment(diri, leaf);
	    all = false;
	  }
	}
	if (all)
	  leaves.push_back(approx_fg);  // coarse frag covers only requested frags
	else
	  diri->dirfragtree.get_leaves_under(fg, leaves);  // re-read after split
      }
      dout(10) << " frag " << fg << " contains " << leaves << dendl;
      for (const auto& leaf : leaves) {
	CDir *dir = diri->get_dirfrag(leaf);
	if (dir)
	  bounds.insert(dir);
      }
    }
  }
}
1194
1195 void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
1196 {
1197 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1198 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1199
1200 set<CDir*> bounds;
1201 get_force_dirfrag_bound_set(bound_dfs, bounds);
1202 adjust_bounded_subtree_auth(dir, bounds, auth);
1203 }
1204
1205 void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
1206 {
1207 dout(10) << "map_dirfrag_set " << dfs << dendl;
1208
1209 // group by inode
1210 map<inodeno_t, fragset_t> ino_fragset;
1211 for (const auto &df : dfs) {
1212 ino_fragset[df.ino].insert_raw(df.frag);
1213 }
1214 // get frags
1215 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1216 p != ino_fragset.end();
1217 ++p) {
1218 p->second.simplify();
1219 CInode *in = get_inode(p->first);
1220 if (!in)
1221 continue;
1222
1223 frag_vec_t fgs;
1224 for (const auto& fg : p->second) {
1225 in->dirfragtree.get_leaves_under(fg, fgs);
1226 }
1227
1228 dout(15) << "map_dirfrag_set " << p->second << " -> " << fgs
1229 << " on " << *in << dendl;
1230
1231 for (const auto& fg : fgs) {
1232 CDir *dir = in->get_dirfrag(fg);
1233 if (dir)
1234 result.insert(dir);
1235 }
1236 }
1237 }
1238
1239
1240
1241 CDir *MDCache::get_subtree_root(CDir *dir)
1242 {
1243 // find the underlying dir that delegates (or is about to delegate) auth
1244 while (true) {
1245 if (dir->is_subtree_root())
1246 return dir;
1247 dir = dir->get_inode()->get_parent_dir();
1248 if (!dir)
1249 return 0; // none
1250 }
1251 }
1252
1253 CDir *MDCache::get_projected_subtree_root(CDir *dir)
1254 {
1255 // find the underlying dir that delegates (or is about to delegate) auth
1256 while (true) {
1257 if (dir->is_subtree_root())
1258 return dir;
1259 dir = dir->get_inode()->get_projected_parent_dir();
1260 if (!dir)
1261 return 0; // none
1262 }
1263 }
1264
1265 void MDCache::remove_subtree(CDir *dir)
1266 {
1267 dout(10) << "remove_subtree " << *dir << dendl;
1268 ceph_assert(subtrees.count(dir));
1269 ceph_assert(subtrees[dir].empty());
1270 subtrees.erase(dir);
1271 dir->put(CDir::PIN_SUBTREE);
1272 if (dir->get_parent_dir()) {
1273 CDir *p = get_subtree_root(dir->get_parent_dir());
1274 ceph_assert(subtrees[p].count(dir));
1275 subtrees[p].erase(dir);
1276 }
1277 }
1278
1279 void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1280 {
1281 ceph_assert(subtrees.count(dir));
1282 bounds = subtrees[dir];
1283 }
1284
1285 void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1286 {
1287 if (subtrees.count(dir)) {
1288 // just copy them, dir is a subtree.
1289 get_subtree_bounds(dir, bounds);
1290 } else {
1291 // find them
1292 CDir *root = get_subtree_root(dir);
1293 for (set<CDir*>::iterator p = subtrees[root].begin();
1294 p != subtrees[root].end();
1295 ++p) {
1296 CDir *t = *p;
1297 while (t != root) {
1298 t = t->get_parent_dir();
1299 ceph_assert(t);
1300 if (t == dir) {
1301 bounds.insert(*p);
1302 continue;
1303 }
1304 }
1305 }
1306 }
1307 }
1308
1309 void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1310 {
1311 // for debugging only.
1312 ceph_assert(subtrees.count(dir));
1313 if (bounds != subtrees[dir]) {
1314 dout(0) << "verify_subtree_bounds failed" << dendl;
1315 set<CDir*> b = bounds;
1316 for (auto &cd : subtrees[dir]) {
1317 if (bounds.count(cd)) {
1318 b.erase(cd);
1319 continue;
1320 }
1321 dout(0) << " missing bound " << *cd << dendl;
1322 }
1323 for (const auto &cd : b)
1324 dout(0) << " extra bound " << *cd << dendl;
1325 }
1326 ceph_assert(bounds == subtrees[dir]);
1327 }
1328
1329 void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1330 {
1331 // for debugging only.
1332 ceph_assert(subtrees.count(dir));
1333
1334 // make sure that any bounds i do have are properly noted as such.
1335 int failed = 0;
1336 for (const auto &fg : bounds) {
1337 CDir *bd = get_dirfrag(fg);
1338 if (!bd) continue;
1339 if (subtrees[dir].count(bd) == 0) {
1340 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1341 failed++;
1342 }
1343 }
1344 ceph_assert(failed == 0);
1345 }
1346
1347 void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1348 {
1349 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1350 << " to " << *newdir << dendl;
1351 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1352 }
1353
/*
 * Fix up the subtree map after directory inode 'diri' moved from
 * 'olddir' to its (already linked) new parent dir.  If 'pop', the move
 * must match the front entry queued by project_subtree_rename(), which
 * is consumed here.
 */
void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
{
  dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;

  CDir *newdir = diri->get_parent_dir();

  if (pop) {
    map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
    ceph_assert(p != projected_subtree_renames.end());
    ceph_assert(!p->second.empty());
    ceph_assert(p->second.front().first == olddir);
    ceph_assert(p->second.front().second == newdir);
    p->second.pop_front();
    if (p->second.empty())
      projected_subtree_renames.erase(p);
  }

  // adjust total auth pin of freezing subtree
  if (olddir != newdir) {
    auto&& dfls = diri->get_nested_dirfrags();
    for (const auto& dir : dfls)
      olddir->adjust_freeze_after_rename(dir);
  }

  // adjust subtree
  // N.B. make sure subtree dirfrags are at the front of the list
  auto dfls = diri->get_subtree_dirfrags();
  diri->get_nested_dirfrags(dfls);
  for (const auto& dir : dfls) {
    dout(10) << "dirfrag " << *dir << dendl;
    CDir *oldparent = get_subtree_root(olddir);
    dout(10) << " old parent " << *oldparent << dendl;
    CDir *newparent = get_subtree_root(newdir);
    dout(10) << " new parent " << *newparent << dendl;

    auto& oldbounds = subtrees[oldparent];
    auto& newbounds = subtrees[newparent];

    if (olddir != newdir)
      mds->balancer->adjust_pop_for_rename(olddir, dir, false);

    if (oldparent == newparent) {
      // still inside the same subtree; nothing to move
      dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
    } else if (dir->is_subtree_root()) {
      // children are fine. change parent.
      dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
      {
	auto n = oldbounds.erase(dir);
	ceph_assert(n == 1);
      }
      newbounds.insert(dir);
      // caller is responsible for 'eval diri'
      try_subtree_merge_at(dir, NULL, false);
    } else {
      // mid-subtree.

      // see if any old bounds move to the new parent.
      // (collect first: erasing while iterating oldbounds would
      // invalidate the iterator)
      std::vector<CDir*> tomove;
      for (const auto& bound : oldbounds) {
	CDir *broot = get_subtree_root(bound->get_parent_dir());
	if (broot != oldparent) {
	  ceph_assert(broot == newparent);
	  tomove.push_back(bound);
	}
      }
      for (const auto& bound : tomove) {
	dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
	oldbounds.erase(bound);
	newbounds.insert(bound);
      }

      // did auth change?
      if (oldparent->authority() != newparent->authority()) {
	adjust_subtree_auth(dir, oldparent->authority(), false);
	// caller is responsible for 'eval diri'
	try_subtree_merge_at(dir, NULL, false);
      }
    }

    if (olddir != newdir)
      mds->balancer->adjust_pop_for_rename(newdir, dir, true);
  }

  show_subtrees();
}
1439
1440 // ===================================
1441 // journal and snap/cow helpers
1442
1443
1444 /*
1445 * find first inode in cache that follows given snapid. otherwise, return current.
1446 */
1447 CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1448 {
1449 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
1450 ceph_assert(in->last == CEPH_NOSNAP);
1451
1452 auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
1453 if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
1454 dout(10) << "pick_inode_snap found " << *p->second << dendl;
1455 in = p->second;
1456 }
1457
1458 return in;
1459 }
1460
1461
/*
 * note: i'm currently cheating wrt dirty and inode.version on cow
 * items. instead of doing a full dir predirty, i just take the
 * original item's version, and set the dirty flag (via
 * mutation::add_cow_{inode,dentry}() and mutation::apply()). that
 * means a special case in the dir commit clean sweep assertions.
 * bah.
 */
/*
 * Clone 'in' into a new snapped CInode covering [in->first, last], and
 * advance in->first past 'last'.  Sets up client_snap_caps / snapflush
 * bookkeeping (and the LOCK_SNAP_SYNC gathering state) on the clone so
 * that clients holding writable caps flush their snapped state.
 * Returns the new (old-version) inode, already added to the cache.
 */
CInode *MDCache::cow_inode(CInode *in, snapid_t last)
{
  ceph_assert(last >= in->first);

  // raw new: CInode lifetime is managed by the cache via pins/refs
  CInode *oldin = new CInode(this, true, in->first, last);
  oldin->inode = *in->get_previous_projected_inode();
  oldin->xattrs = *in->get_previous_projected_xattrs();
  oldin->symlink = in->symlink;
  oldin->inode.trim_client_ranges(last);

  if (in->first < in->oldest_snap)
    in->oldest_snap = in->first;

  in->first = last+1;

  dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
  add_inode(oldin);

  if (in->last != CEPH_NOSNAP) {
    // 'in' is itself a snapped inode: split its pending snapflush
    // tracking between the new clone and itself via the head inode.
    CInode *head_in = get_inode(in->ino());
    ceph_assert(head_in);
    auto ret = head_in->split_need_snapflush(oldin, in);
    if (ret.first) {
      // oldin still needs flushes: copy the cap set and put its locks
      // into the snapflush gathering state
      oldin->client_snap_caps = in->client_snap_caps;
      if (!oldin->client_snap_caps.empty()) {
	for (int i = 0; i < num_cinode_locks; i++) {
	  SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
	  ceph_assert(lock);
	  if (lock->get_state() != LOCK_SNAP_SYNC) {
	    ceph_assert(lock->is_stable());
	    lock->set_state(LOCK_SNAP_SYNC);  // gathering
	    oldin->auth_pin(lock);
	  }
	  lock->get_wrlock(true);
	}
      }
    }
    if (!ret.second) {
      // 'in' no longer needs flushes: tear down its gathering state and
      // wake anyone waiting on its locks
      auto client_snap_caps = std::move(in->client_snap_caps);
      in->client_snap_caps.clear();
      in->item_open_file.remove_myself();
      in->item_caps.remove_myself();

      if (!client_snap_caps.empty()) {
	MDSContext::vec finished;
	for (int i = 0; i < num_cinode_locks; i++) {
	  SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
	  ceph_assert(lock);
	  ceph_assert(lock->get_state() == LOCK_SNAP_SYNC);  // gathering
	  lock->put_wrlock();
	  if (!lock->get_num_wrlocks()) {
	    lock->set_state(LOCK_SYNC);
	    lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, finished);
	    in->auth_unpin(lock);
	  }
	}
	mds->queue_waiters(finished);
      }
    }
    return oldin;
  }

  // head inode case: decide per-client whether a snapflush is needed
  if (!in->client_caps.empty()) {
    const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
    // clone caps?
    for (auto &p : in->client_caps) {
      client_t client = p.first;
      Capability *cap = &p.second;
      int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued();
      if ((issued & CEPH_CAP_ANY_WR) &&
	  cap->client_follows < last) {
	// client may hold dirty state for the snapped range
	dout(10) << " client." << client << " cap " << ccap_string(issued) << dendl;
	oldin->client_snap_caps.insert(client);
	cap->client_follows = last;

	// we need snapflushes for any intervening snaps
	dout(10) << " snaps " << snaps << dendl;
	for (auto q = snaps.lower_bound(oldin->first);
	     q != snaps.end() && *q <= last;
	     ++q) {
	  in->add_need_snapflush(oldin, *q, client);
	}
      } else {
	dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
      }
    }

    if (!oldin->client_snap_caps.empty()) {
      // put the clone's locks into the gathering state until the
      // snapflushes arrive
      for (int i = 0; i < num_cinode_locks; i++) {
	SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
	ceph_assert(lock);
	if (lock->get_state() != LOCK_SNAP_SYNC) {
	  ceph_assert(lock->is_stable());
	  lock->set_state(LOCK_SNAP_SYNC);  // gathering
	  oldin->auth_pin(lock);
	}
	lock->get_wrlock(true);
      }
    }
  }
  return oldin;
}
1572
/*
 * Copy-on-write a dentry (and its primary inode, if any) so that state
 * as of snapshots up to 'follows' is preserved before a new change is
 * journaled.  Multiversion inodes keep old state inside the inode
 * (cow_old_inode); otherwise an old [first,follows] dentry (+cloned
 * inode for primary links) is inserted beside the head dentry.
 * 'pcow_inode', if non-null, receives the cloned inode.  'dnl' lets the
 * caller supply a specific linkage; defaults to the projected one.
 */
void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
                                 CDentry *dn, snapid_t follows,
                                 CInode **pcow_inode, CDentry::linkage_t *dnl)
{
  if (!dn) {
    dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
    return;
  }
  dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
  ceph_assert(dn->is_auth());

  // nothing to cow on a null dentry, fix caller
  if (!dnl)
    dnl = dn->get_projected_linkage();
  ceph_assert(!dnl->is_null());

  CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
  bool cow_head = false;
  if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
    // mid-migration: cow the head version rather than a projected one
    ceph_assert(in->is_frozen_inode());
    cow_head = true;
  }
  if (in && (in->is_multiversion() || cow_head)) {
    // multiversion inode.
    SnapRealm *realm = NULL;

    if (in->get_projected_parent_dn() != dn) {
      // dn is a remote link (not the primary parent): snapshot the
      // dentry itself against the dir's realm
      ceph_assert(follows == CEPH_NOSNAP);
      realm = dn->dir->inode->find_snaprealm();
      snapid_t dir_follows = get_global_snaprealm()->get_newest_seq();
      ceph_assert(dir_follows >= realm->get_newest_seq());

      if (dir_follows+1 > dn->first) {
	snapid_t oldfirst = dn->first;
	dn->first = dir_follows+1;
	if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
	  // preserve the old remote link for [oldfirst, dir_follows]
	  CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(),
						      oldfirst, dir_follows);
	  olddn->pre_dirty();
	  dout(10) << " olddn " << *olddn << dendl;
	  metablob->add_remote_dentry(olddn, true);
	  mut->add_cow_dentry(olddn);
	  // FIXME: adjust link count here? hmm.

	  if (dir_follows+1 > in->first)
	    in->cow_old_inode(dir_follows, cow_head);
	}
      }

      follows = dir_follows;
      if (in->snaprealm) {
	realm = in->snaprealm;
	ceph_assert(follows >= realm->get_newest_seq());
      }
    } else {
      realm = in->find_snaprealm();
      if (follows == CEPH_NOSNAP) {
	follows = get_global_snaprealm()->get_newest_seq();
	ceph_assert(follows >= realm->get_newest_seq());
      }
    }

    // already cloned?
    if (follows < in->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
      return;
    }

    if (!realm->has_snaps_in_range(in->first, follows)) {
      // no snapshot in the span: just advance first, nothing to keep
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
      in->first = follows + 1;
      return;
    }

    in->cow_old_inode(follows, cow_head);

  } else {
    // normal (non-multiversion) dentry/inode: cow by inserting an old
    // dentry (and cloned inode) into the dir
    SnapRealm *realm = dn->dir->inode->find_snaprealm();
    if (follows == CEPH_NOSNAP) {
      follows = get_global_snaprealm()->get_newest_seq();
      ceph_assert(follows >= realm->get_newest_seq());
    }

    // already cloned?
    if (follows < dn->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
      return;
    }

    // update dn.first before adding old dentry to cdir's map
    snapid_t oldfirst = dn->first;
    dn->first = follows+1;

    if (!realm->has_snaps_in_range(oldfirst, follows)) {
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
      if (in)
	in->first = follows+1;
      return;
    }

    dout(10) << " dn " << *dn << dendl;
    if (in) {
      // primary link: clone the inode and journal the old dentry
      CInode *oldin = cow_inode(in, follows);
      mut->add_cow_inode(oldin);
      if (pcow_inode)
	*pcow_inode = oldin;
      CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, follows);
      oldin->inode.version = olddn->pre_dirty();
      dout(10) << " olddn " << *olddn << dendl;
      bool need_snapflush = !oldin->client_snap_caps.empty();
      if (need_snapflush) {
	mut->ls->open_files.push_back(&oldin->item_open_file);
	mds->locker->mark_need_snapflush_inode(oldin);
      }
      metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
      mut->add_cow_dentry(olddn);
    } else {
      // remote link: just preserve the old link
      ceph_assert(dnl->is_remote());
      CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
						  oldfirst, follows);
      olddn->pre_dirty();
      dout(10) << " olddn " << *olddn << dendl;
      metablob->add_remote_dentry(olddn, true);
      mut->add_cow_dentry(olddn);
    }
  }
}
1700
1701
1702 void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
1703 CInode *in, snapid_t follows,
1704 CInode **pcow_inode)
1705 {
1706 dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
1707 CDentry *dn = in->get_projected_parent_dn();
1708 journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
1709 }
1710
1711 void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1712 {
1713 if (in->is_base()) {
1714 metablob->add_root(true, in);
1715 } else {
1716 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1717 follows = in->first - 1;
1718 CDentry *dn = in->get_projected_parent_dn();
1719 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1720 journal_cow_dentry(mut, metablob, dn, follows);
1721 if (in->get_projected_inode()->is_backtrace_updated()) {
1722 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1723 in->get_previous_projected_inode()->layout.pool_id;
1724 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1725 } else {
1726 metablob->add_primary_dentry(dn, in, true);
1727 }
1728 }
1729 }
1730
1731
1732
1733 // nested ---------------------------------------------------------------
1734
/*
 * Propagate cur's (projected) rstat delta, plus any dirty old_inode
 * rstats, into its parent dirfrag's projected fnode, splitting by snap
 * interval.  'linkunlink' is +1/-1/0 for link/unlink/plain update;
 * 'prealm' optionally supplies the parent's snaprealm.
 */
void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
					  int linkunlink, SnapRealm *prealm)
{
  CDentry *parentdn = cur->get_projected_parent_dn();
  CInode::mempool_inode *curi = cur->get_projected_inode();

  if (cur->first > first)
    first = cur->first;

  dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
	   << " " << *cur << dendl;
  dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
  dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;

  /*
   * FIXME. this incompletely propagates rstats to _old_ parents
   * (i.e. shortly after a directory rename). but we need full
   * blown hard link backpointers to make this work properly...
   */
  snapid_t floor = parentdn->first;
  dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;

  if (!prealm)
    prealm = parent->inode->find_snaprealm();
  const set<snapid_t> snaps = prealm->get_snaps();

  if (cur->last != CEPH_NOSNAP) {
    // snapped inode: skip entirely if no snapshot falls in its span
    ceph_assert(cur->dirty_old_rstats.empty());
    set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
    if (q == snaps.end() || *q > cur->last)
      return;
  }

  if (cur->last >= floor) {
    bool update = true;
    if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
      // rename src inode is not projected in the slave rename prep case. so we should
      // avoid updating the inode.
      ceph_assert(linkunlink < 0);
      ceph_assert(cur->is_frozen_inode());
      update = false;
    }
    _project_rstat_inode_to_frag(*curi, std::max(first, floor), cur->last, parent,
				 linkunlink, update);
  }

  if (g_conf()->mds_snap_rstat) {
    // also flush any dirty per-snapshot rstats on old_inodes
    for (const auto &p : cur->dirty_old_rstats) {
      auto &old = cur->old_inodes[p];
      snapid_t ofirst = std::max(old.first, floor);
      auto it = snaps.lower_bound(ofirst);
      if (it == snaps.end() || *it > p)
	continue;  // no snapshot in this old interval
      if (p >= floor)
	_project_rstat_inode_to_frag(old.inode, ofirst, p, parent, 0, false);
    }
  }
  cur->dirty_old_rstats.clear();
}
1794
1795
/*
 * Apply one inode's rstat delta (rstat - accounted_rstat, adjusted for
 * linkunlink) to the parent frag's fnode over the snap interval
 * [ofirst,last], splitting the frag's head rstat / dirty_old_rstat
 * entries as needed so each journaled segment ends exactly at 'last'.
 * If 'update_inode', accounted_rstat is brought up to date.
 */
void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
					   CDir *parent, int linkunlink, bool update_inode)
{
  dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
  dout(20) << " inode rstat " << inode.rstat << dendl;
  dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
  // delta = what this inode has not yet contributed to the parent
  nest_info_t delta;
  if (linkunlink == 0) {
    delta.add(inode.rstat);
    delta.sub(inode.accounted_rstat);
  } else if (linkunlink < 0) {
    // unlink: retract the previously accounted contribution
    delta.sub(inode.accounted_rstat);
  } else {
    // link: contribute the full current rstat
    delta.add(inode.rstat);
  }
  dout(20) << " delta " << delta << dendl;

  if (update_inode)
    inode.accounted_rstat = inode.rstat;

  while (last >= ofirst) {
    /*
     * pick fnode version to update. at each iteration, we want to
     * pick a segment ending in 'last' to update. split as necessary
     * to make that work. then, adjust first up so that we only
     * update one segment at a time. then loop to cover the whole
     * [ofirst,last] interval.
     */
    nest_info_t *prstat;
    snapid_t first;
    fnode_t *pf = parent->get_projected_fnode();
    if (last == CEPH_NOSNAP) {
      // head segment
      if (g_conf()->mds_snap_rstat)
	first = std::max(ofirst, parent->first);
      else
	first = parent->first;
      prstat = &pf->rstat;
      dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;

      if (first > parent->first &&
	  !(pf->rstat == pf->accounted_rstat)) {
	dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
		 << parent->first << "," << (first-1) << "] "
		 << " " << *prstat << "/" << pf->accounted_rstat
		 << dendl;
	parent->dirty_old_rstat[first-1].first = parent->first;
	parent->dirty_old_rstat[first-1].rstat = pf->rstat;
	parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
      }
      parent->first = first;
    } else if (!g_conf()->mds_snap_rstat) {
      // drop snapshots' rstats
      break;
    } else if (last >= parent->first) {
      // split the head segment at 'last'
      first = parent->first;
      parent->dirty_old_rstat[last].first = first;
      parent->dirty_old_rstat[last].rstat = pf->rstat;
      parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
      prstat = &parent->dirty_old_rstat[last].rstat;
      dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
	       << " " << *prstat << "/" << pf->accounted_rstat << dendl;
    } else {
      // be careful, dirty_old_rstat is a _sparse_ map.
      // sorry, this is ugly.
      first = ofirst;

      // find any intersection with last
      auto it = parent->dirty_old_rstat.lower_bound(last);
      if (it == parent->dirty_old_rstat.end()) {
	dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
	if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
	  dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
	  first = parent->dirty_old_rstat.rbegin()->first+1;
	}
      } else {
	// *it last is >= last
	if (it->second.first <= last) {
	  // *it intersects [first,last]
	  if (it->second.first < first) {
	    dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
	    parent->dirty_old_rstat[first-1] = it->second;
	    it->second.first = first;
	  }
	  if (it->second.first > first)
	    first = it->second.first;
	  if (last < it->first) {
	    dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
	    parent->dirty_old_rstat[last] = it->second;
	    it->second.first = last+1;
	  }
	} else {
	  // *it is to the _right_ of [first,last]
	  it = parent->dirty_old_rstat.lower_bound(first);
	  // new *it last is >= first
	  if (it->second.first <= last &&  // new *it isn't also to the right, and
	      it->first >= first) {        // it intersects our first bit,
	    dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
	    first = it->first+1;
	  }
	  dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
	}
      }
      dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
      parent->dirty_old_rstat[last].first = first;
      prstat = &parent->dirty_old_rstat[last].rstat;
    }

    // apply
    dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
    ceph_assert(last >= first);
    prstat->add(delta);
    if (update_inode)
      inode.accounted_rstat = inode.rstat;
    dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;

    // move on to the next-older segment
    last = first-1;
  }
}
1914
/*
 * Apply a frag's rstat delta (rstat - accounted_rstat) up into the
 * parent inode 'pin' over snap interval [ofirst,last], cowing/splitting
 * the inode's old_inodes so each updated segment ends at 'last'.
 * 'cow_head' is forwarded to cow_old_inode().
 */
void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
					  snapid_t ofirst, snapid_t last,
					  CInode *pin, bool cow_head)
{
  dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
  dout(20) << " frag rstat " << rstat << dendl;
  dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
  nest_info_t delta = rstat;
  delta.sub(accounted_rstat);
  dout(20) << " delta " << delta << dendl;

  while (last >= ofirst) {
    CInode::mempool_inode *pi;
    snapid_t first;
    if (last == pin->last) {
      // head (current) segment
      pi = pin->get_projected_inode();
      first = std::max(ofirst, pin->first);
      if (first > pin->first) {
	// preserve the pre-split interval as an old_inode
	auto &old = pin->cow_old_inode(first-1, cow_head);
	dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
      }
    } else {
      if (last >= pin->first) {
	// split the head interval at 'last' into an old_inode
	first = pin->first;
	pin->cow_old_inode(last, cow_head);
      } else {
	// our life is easier here because old_inodes is not sparse
	// (although it may not begin at snapid 1)
	auto it = pin->old_inodes.lower_bound(last);
	if (it == pin->old_inodes.end()) {
	  dout(10) << " no old_inode <= " << last << ", done." << dendl;
	  break;
	}
	first = it->second.first;
	if (first > last) {
	  dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
	  //assert(p == pin->old_inodes.begin());
	  break;
	}
	if (it->first > last) {
	  // old_inode extends past 'last'; split off the right piece
	  dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
		   << (last+1) << "," << it->first << "]" << dendl;
	  pin->old_inodes[last] = it->second;
	  it->second.first = last+1;
	  pin->dirty_old_rstats.insert(it->first);
	}
      }
      if (first < ofirst) {
	// old_inode starts before our range; split off the left piece
	dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
		 << first << "," << ofirst-1 << "]" << dendl;
	pin->old_inodes[ofirst-1] = pin->old_inodes[last];
	pin->dirty_old_rstats.insert(ofirst-1);
	pin->old_inodes[last].first = first = ofirst;
      }
      pi = &pin->old_inodes[last].inode;
      pin->dirty_old_rstats.insert(last);
    }
    dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
    pi->rstat.add(delta);
    dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;

    // continue with the next-older segment
    last = first-1;
  }
}
1979
/*
 * Push the current rstat/quota of 'in' to clients holding caps on it,
 * and ask replica MDSs to gather their caps so quota is tracked here.
 *
 * @param in inode whose quota/rstat to broadcast (we must be auth)
 * @param exclude_ct if >= 0, force an update to every client except this
 *        one (their caps jump straight to the 'update' path below)
 * @param quota_change true if the quota value itself just changed; forces
 *        a broadcast even when no quota is currently enabled
 */
void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change)
{
  // only a live (active/stopping) MDS sends quota updates
  if (!(mds->is_active() || mds->is_stopping()))
    return;

  // only the auth MDS broadcasts, and not while the inode is frozen
  if (!in->is_auth() || in->is_frozen())
    return;

  auto i = in->get_projected_inode();

  // nothing to tell clients: no quota set and quota didn't just change
  if (!i->quota.is_enable() &&
      !quota_change)
    return;

  // create snaprealm for quota inode (quota was set before mimic)
  if (!in->get_projected_srnode())
    mds->server->create_quota_realm(in);

  for (auto &p : in->client_caps) {
    Capability *cap = &p.second;
    if (cap->is_noquota())
      continue;

    // force-update everyone except the excluded client
    if (exclude_ct >= 0 && exclude_ct != p.first)
      goto update;

    // skip if nothing changed since the last update we sent this client
    if (cap->last_rbytes == i->rstat.rbytes &&
        cap->last_rsize == i->rstat.rsize())
      continue;

    if (i->quota.max_files > 0) {
      // at/over the file-count quota: always notify
      if (i->rstat.rsize() >= i->quota.max_files)
        goto update;

      // notify when usage moved by more than 1/16th of the headroom
      // between the last value we reported and the quota limit
      if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
          abs(cap->last_rsize - i->rstat.rsize()))
        goto update;
    }

    if (i->quota.max_bytes > 0) {
      // within 1/8th of the byte quota: always notify
      if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
        goto update;

      // same 1/16th-of-headroom heuristic as for max_files
      if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
          abs(cap->last_rbytes - i->rstat.rbytes))
        goto update;
    }

    continue;

update:
    // remember what we told this client so future deltas are relative
    cap->last_rsize = i->rstat.rsize();
    cap->last_rbytes = i->rstat.rbytes;

    auto msg = make_message<MClientQuota>();
    msg->ino = in->ino();
    msg->rstat = i->rstat;
    msg->quota = i->quota;
    mds->send_message_client_counted(msg, cap->get_session());
  }
  // ask replicas to gather caps back to us
  for (const auto &it : in->get_replicas()) {
    auto msg = make_message<MGatherCaps>();
    msg->ino = in->ino();
    mds->send_message_mds(msg, it.first);
  }
}
2046
2047 /*
2048 * NOTE: we _have_ to delay the scatter if we are called during a
2049 * rejoin, because we can't twiddle locks between when the
2050 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2051 * normally, this isn't a problem: a recovering mds doesn't twiddle locks
2052 * (no requests), and a survivor acks immediately. _except_ that
2053 * during rejoin_(weak|strong) processing, we may complete a lock
2054 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2055 * scatterlock state in that case or the lock states will get out of
2056 * sync between the auth and replica.
2057 *
2058 * the simple solution is to never do the scatter here. instead, put
2059 * the scatterlock on a list if it isn't already wrlockable. this is
2060 * probably the best plan anyway, since we avoid too many
2061 * scatters/locks under normal usage.
2062 */
2063 /*
2064 * some notes on dirlock/nestlock scatterlock semantics:
2065 *
2066 * the fragstat (dirlock) will never be updated without
2067 * dirlock+nestlock wrlock held by the caller.
2068 *
2069 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2070 * data is pushed up the tree. this could be changed with some
2071 * restructuring here, but in its current form we ensure that the
2072 * fragstat+rstat _always_ reflect an accurate summation over the dir
2073 * frag, which is nice. and, we only need to track frags that need to
2074 * be nudged (and not inodes with pending rstat changes that need to
2075 * be pushed into the frag). a consequence of this is that the
2076 * accounted_rstat on scatterlock sync may not match our current
2077 * rstat. this is normal and expected.
2078 */
/**
 * Project dirty fragstat/rstat from 'in' up its chain of parent dirfrags
 * and parent inodes, pre-dirtying each projected fnode/inode and finally
 * journaling the affected dirs/inodes into 'blob'.
 *
 * Walks inode -> dirfrag -> parent inode -> ... until a base inode is
 * reached or propagation must stop (non-auth parent, unpinnable parent,
 * rate limit, or an unobtainable nestlock/versionlock — see the comment
 * block above about never initiating scatters from here).
 *
 * @param mut mutation holding/acquiring the locks and pins we need
 * @param blob metablob of an already-open log entry to journal into
 * @param in inode whose stats are being propagated
 * @param parent dirfrag to start from; if NULL, in's projected parent dir
 *        (PREDIRTY_PRIMARY must be set in that case)
 * @param flags PREDIRTY_PRIMARY | PREDIRTY_DIR | PREDIRTY_SHALLOW
 * @param linkunlink +1/-1 when a link is added/removed, else 0
 * @param cfollows snapid the client op follows, or CEPH_NOSNAP
 */
void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
				       CInode *in, CDir *parent,
				       int flags, int linkunlink,
				       snapid_t cfollows)
{
  bool primary_dn = flags & PREDIRTY_PRIMARY;
  bool do_parent_mtime = flags & PREDIRTY_DIR;
  bool shallow = flags & PREDIRTY_SHALLOW;  // note: only referenced by the dout below in this function

  ceph_assert(mds->mdlog->entry_is_open());

  // make sure stamp is set
  if (mut->get_mds_stamp() == utime_t())
    mut->set_mds_stamp(ceph_clock_now());

  // base inodes (root, mdsdir) have no parent to propagate into
  if (in->is_base())
    return;

  dout(10) << "predirty_journal_parents"
	   << (do_parent_mtime ? " do_parent_mtime":"")
	   << " linkunlink=" << linkunlink
	   << (primary_dn ? " primary_dn":" remote_dn")
	   << (shallow ? " SHALLOW":"")
	   << " follows " << cfollows
	   << " " << *in << dendl;

  if (!parent) {
    ceph_assert(primary_dn);
    parent = in->get_projected_parent_dn()->get_dir();
  }

  if (flags == 0 && linkunlink == 0) {
    dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
    blob->add_dir_context(parent);
    return;
  }

  // build list of inodes to wrlock, dirty, and update
  list<CInode*> lsi;
  CInode *cur = in;
  CDentry *parentdn = NULL;
  bool first = true;  // true only on the initial (lowest) level of the walk
  while (parent) {
    //assert(cur->is_auth() || !primary_dn);  // this breaks the rename auth twiddle hack
    ceph_assert(parent->is_auth());

    // opportunistically adjust parent dirfrag
    CInode *pin = parent->get_inode();

    // inode -> dirfrag
    mut->auth_pin(parent);
    mut->add_projected_fnode(parent);

    fnode_t *pf = parent->project_fnode();
    pf->version = parent->pre_dirty();

    if (do_parent_mtime || linkunlink) {
      // these updates require the caller to already hold the scatterlocks
      ceph_assert(mut->is_wrlocked(&pin->filelock));
      ceph_assert(mut->is_wrlocked(&pin->nestlock));
      ceph_assert(cfollows == CEPH_NOSNAP);

      // update stale fragstat/rstat?
      parent->resync_accounted_fragstat();
      parent->resync_accounted_rstat();

      if (do_parent_mtime) {
	pf->fragstat.mtime = mut->get_op_stamp();
	pf->fragstat.change_attr++;
	dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
	if (pf->fragstat.mtime > pf->rstat.rctime) {
	  dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
	  pf->rstat.rctime = pf->fragstat.mtime;
	} else {
	  // op stamp is older than rctime; rctime stays put
	  dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
	}
      }
      if (linkunlink) {
	dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
	if (in->is_dir()) {
	  pf->fragstat.nsubdirs += linkunlink;
	  //pf->rstat.rsubdirs += linkunlink;
	} else {
	  pf->fragstat.nfiles += linkunlink;
	  //pf->rstat.rfiles += linkunlink;
	}
      }
    }

    // rstat
    if (!primary_dn) {
      // don't update parent this pass
    } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
				pin->versionlock.can_wrlock())) {
      // can't write the parent's rstat now; remember the child is dirty
      // so a later pass (or scatter nudge) picks it up
      dout(20) << " unwritable parent nestlock " << pin->nestlock
	       << ", marking dirty rstat on " << *cur << dendl;
      cur->mark_dirty_rstat();
    } else {
      // if we don't hold a wrlock reference on this nestlock, take one,
      // because we are about to write into the dirfrag fnode and that needs
      // to commit before the lock can cycle.
      if (linkunlink) {
	ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
      }

      if (!mut->is_wrlocked(&pin->nestlock)) {
	dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
	mds->locker->wrlock_force(&pin->nestlock, mut);
      }

      // now we can project the inode rstat diff the dirfrag
      SnapRealm *prealm = pin->find_snaprealm();

      snapid_t follows = cfollows;
      if (follows == CEPH_NOSNAP)
	follows = prealm->get_newest_seq();

      snapid_t first = follows+1;

      // first, if the frag is stale, bring it back in sync.
      parent->resync_accounted_rstat();

      // now push inode rstats into frag
      project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
      cur->clear_dirty_rstat();
    }

    bool stop = false;
    if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
      dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
      stop = true;
    }

    // delay propagating until later?  (rate-limit propagation above the
    // first level using mds_dirstat_min_interval)
    if (!stop && !first &&
	g_conf()->mds_dirstat_min_interval > 0) {
      double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
      if (since_last_prop < g_conf()->mds_dirstat_min_interval) {
	dout(10) << "predirty_journal_parents last prop " << since_last_prop
		 << " < " << g_conf()->mds_dirstat_min_interval
		 << ", stopping" << dendl;
	stop = true;
      } else {
	dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
      }
    }

    // can cast only because i'm passing nowait=true in the sole user
    if (!stop &&
	!mut->is_wrlocked(&pin->nestlock) &&
	(!pin->versionlock.can_wrlock() ||  // make sure we can take versionlock, too
	 !mds->locker->wrlock_try(&pin->nestlock, mut)
	 )) {  // ** do not initiate.. see above comment **
      dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
	       << " on " << *pin << dendl;
      stop = true;
    }
    if (stop) {
      // leave the remaining propagation to the scatterlock machinery:
      // mark the locks updated so they get nudged later
      dout(10) << "predirty_journal_parents stop.  marking nestlock on " << *pin << dendl;
      mds->locker->mark_updated_scatterlock(&pin->nestlock);
      mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
      mut->add_updated_lock(&pin->nestlock);
      if (do_parent_mtime || linkunlink) {
	mds->locker->mark_updated_scatterlock(&pin->filelock);
	mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
	mut->add_updated_lock(&pin->filelock);
      }
      break;
    }
    if (!mut->is_wrlocked(&pin->versionlock))
      mds->locker->local_wrlock_grab(&pin->versionlock, mut);

    ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_slave());

    pin->last_dirstat_prop = mut->get_mds_stamp();

    // dirfrag -> diri
    mut->auth_pin(pin);
    mut->add_projected_inode(pin);
    lsi.push_front(pin);  // push_front: journal from the top of the chain down

    pin->pre_cow_old_inode();  // avoid cow mayhem!

    auto &pi = pin->project_inode();
    pi.inode.version = pin->pre_dirty();

    // dirstat
    if (do_parent_mtime || linkunlink) {
      dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
      dout(20) << "predirty_journal_parents         - " << pf->accounted_fragstat << dendl;
      bool touched_mtime = false, touched_chattr = false;
      pi.inode.dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
      pf->accounted_fragstat = pf->fragstat;
      if (touched_mtime)
	pi.inode.mtime = pi.inode.ctime = pi.inode.dirstat.mtime;
      if (touched_chattr)
	pi.inode.change_attr = pi.inode.dirstat.change_attr;
      dout(20) << "predirty_journal_parents     gives " << pi.inode.dirstat << " on " << *pin << dendl;

      if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
	// single-frag dirs let us cross-check the inode's dirstat
	// against the frag's fragstat
	if (pi.inode.dirstat.size() < 0)
	  ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter);
	if (pi.inode.dirstat.size() != pf->fragstat.size()) {
	  mds->clog->error() << "unmatched fragstat size on single dirfrag "
	     << parent->dirfrag() << ", inode has " << pi.inode.dirstat
	     << ", dirfrag has " << pf->fragstat;

	  // trust the dirfrag for now
	  pi.inode.dirstat = pf->fragstat;

	  ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter);
	}
      }
    }

    /*
     * the rule here is to follow the _oldest_ parent with dirty rstat
     * data.  if we don't propagate all data, we add ourselves to the
     * nudge list.  that way all rstat data will (eventually) get
     * pushed up the tree.
     *
     * actually, no.  for now, silently drop rstats for old parents.  we need
     * hard link backpointers to do the above properly.
     */

    // stop?
    if (pin->is_base())
      break;
    parentdn = pin->get_projected_parent_dn();
    ceph_assert(parentdn);

    // rstat
    dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;

    // first, if the frag is stale, bring it back in sync.
    parent->resync_accounted_rstat();

    if (g_conf()->mds_snap_rstat) {
      // also fold per-snapshot rstat deltas into the parent inode
      for (auto &p : parent->dirty_old_rstat) {
	project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
				    p.first, pin, true);
      }
    }
    parent->dirty_old_rstat.clear();
    project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);

    pf->accounted_rstat = pf->rstat;

    if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
      if (pi.inode.rstat.rbytes != pf->rstat.rbytes) {
	mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
	  << parent->dirfrag() << ", inode has " << pi.inode.rstat
	  << ", dirfrag has " << pf->rstat;

	// trust the dirfrag for now
	pi.inode.rstat = pf->rstat;

	ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter);
      }
    }

    parent->check_rstats();
    broadcast_quota_to_client(pin);
    // next parent!
    cur = pin;
    parent = parentdn->get_dir();
    linkunlink = 0;
    do_parent_mtime = false;
    primary_dn = true;
    first = false;
  }

  // now, stick it in the blob
  ceph_assert(parent);
  ceph_assert(parent->is_auth());
  blob->add_dir_context(parent);
  blob->add_dir(parent, true);
  for (const auto& in : lsi) {
    journal_dirty_inode(mut.get(), blob, in);
  }

}
2360
2361
2362
2363
2364
2365 // ===================================
2366 // slave requests
2367
2368
2369 /*
2370 * some handlers for master requests with slaves. we need to make
2371 * sure slaves journal commits before we forget we mastered them and
2372 * remove them from the uncommitted_masters map (used during recovery
2373 * to commit|abort slaves).
2374 */
// Log context: once the ECommitted(reqid) event is safely journaled,
// retire the mastered request from uncommitted_masters.
struct C_MDC_CommittedMaster : public MDCacheLogContext {
  metareqid_t reqid;  // mastered request being committed
  C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
  void finish(int r) override {
    mdcache->_logged_master_commit(reqid);
  }
};
2382
// Journal an ECommitted event for a mastered request.  When it is
// safely logged, C_MDC_CommittedMaster finishes the commit.
void MDCache::log_master_commit(metareqid_t reqid)
{
  dout(10) << "log_master_commit " << reqid << dendl;
  // set committing so finish_committed_masters() won't commit it twice
  uncommitted_masters[reqid].committing = true;
  mds->mdlog->start_submit_entry(new ECommitted(reqid),
				 new C_MDC_CommittedMaster(this, reqid));
}
2390
2391 void MDCache::_logged_master_commit(metareqid_t reqid)
2392 {
2393 dout(10) << "_logged_master_commit " << reqid << dendl;
2394 ceph_assert(uncommitted_masters.count(reqid));
2395 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2396 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2397 uncommitted_masters.erase(reqid);
2398 }
2399
2400 // while active...
2401
2402 void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2403 {
2404 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
2405 ceph_assert(uncommitted_masters.count(r));
2406 uncommitted_masters[r].slaves.erase(from);
2407 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2408 log_master_commit(r);
2409 }
2410
2411 void MDCache::logged_master_update(metareqid_t reqid)
2412 {
2413 dout(10) << "logged_master_update " << reqid << dendl;
2414 ceph_assert(uncommitted_masters.count(reqid));
2415 uncommitted_masters[reqid].safe = true;
2416 auto p = pending_masters.find(reqid);
2417 if (p != pending_masters.end()) {
2418 pending_masters.erase(p);
2419 if (pending_masters.empty())
2420 process_delayed_resolve();
2421 }
2422 }
2423
2424 /*
2425 * Master may crash after receiving all slaves' commit acks, but before journalling
2426 * the final commit. Slaves may crash after journalling the slave commit, but before
2427 * sending commit ack to the master. Commit masters with no uncommitted slave when
2428 * resolve finishes.
2429 */
2430 void MDCache::finish_committed_masters()
2431 {
2432 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2433 p != uncommitted_masters.end();
2434 ++p) {
2435 p->second.recovering = false;
2436 if (!p->second.committing && p->second.slaves.empty()) {
2437 dout(10) << "finish_committed_masters " << p->first << dendl;
2438 log_master_commit(p->first);
2439 }
2440 }
2441 }
2442
2443 /*
2444 * at end of resolve... we must journal a commit|abort for all slave
2445 * updates, before moving on.
2446 *
2447 * this is so that the master can safely journal ECommitted on ops it
2448 * masters when it reaches up:active (all other recovering nodes must
2449 * complete resolve before that happens).
2450 */
// Log context: once our slave commit is durable, ack it to the master.
struct C_MDC_SlaveCommit : public MDCacheLogContext {
  mds_rank_t from;    // master MDS to ack
  metareqid_t reqid;  // request that was committed
  // note: 'f' arrives as int and is converted to mds_rank_t
  C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
  void finish(int r) override {
    mdcache->_logged_slave_commit(from, reqid);
  }
};
2459
// Our slave commit for 'reqid' is durable (invoked via C_MDC_SlaveCommit):
// send OP_COMMITTED to the master so it can retire the request.
void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
{
  dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;

  // send a message
  auto req = make_message<MMDSSlaveRequest>(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
  mds->send_message_mds(req, from);
}
2468
2469
2470
2471
2472
2473
2474 // ====================================================================
2475 // import map, recovery
2476
2477 void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2478 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2479 {
2480 if (subtrees.count(oldparent)) {
2481 vector<dirfrag_t>& v = subtrees[oldparent];
2482 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2483 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2484 if (*it == df) {
2485 v.erase(it);
2486 break;
2487 }
2488 }
2489 if (subtrees.count(newparent)) {
2490 vector<dirfrag_t>& v = subtrees[newparent];
2491 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2492 v.push_back(df);
2493 }
2494 }
2495
/**
 * Build an ESubtreeMap journal event describing our current subtree
 * authority: every fullauth subtree we claim with its bounds, the set of
 * ambiguous (mid-migration) imports, plus enough dirfrag metadata to tie
 * everything back to the root on replay.  Projected renames are applied
 * so the event matches the post-rename view, and the map is simplified
 * so replay can compare it directly against the live map.
 *
 * @return the new (started, not yet submitted) ESubtreeMap log entry
 */
ESubtreeMap *MDCache::create_subtree_map()
{
  dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
	   << num_subtrees_fullauth() << " fullauth"
	   << dendl;

  show_subtrees();

  ESubtreeMap *le = new ESubtreeMap();
  mds->mdlog->_start_entry(le);

  // dirfrags whose context/metadata must be journaled with the event
  map<dirfrag_t, CDir*> dirs_to_add;

  if (myin) {
    CDir* mydir = myin->get_dirfrag(frag_t());
    dirs_to_add[mydir->dirfrag()] = mydir;
  }

  // include all auth subtrees, and their bounds.
  // and a spanning tree to tie it to the root.
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;

    // journal subtree as "ours" if we are
    //   me, -2
    //   me, me
    //   me, !me (may be importing and ambiguous!)

    // so not
    //   !me, *
    if (dir->get_dir_auth().first != mds->get_nodeid())
      continue;

    if (migrator->is_ambiguous_import(dir->dirfrag()) ||
	my_ambiguous_imports.count(dir->dirfrag())) {
      dout(15) << " ambig subtree " << *dir << dendl;
      le->ambiguous_subtrees.insert(dir->dirfrag());
    } else {
      dout(15) << " subtree " << *dir << dendl;
    }

    dirs_to_add[dir->dirfrag()] = dir;
    le->subtrees[dir->dirfrag()].clear();


    // bounds
    for (set<CDir*>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      CDir *bound = *q;
      dout(15) << " subtree bound " << *bound << dendl;
      dirs_to_add[bound->dirfrag()] = bound;
      le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
    }
  }

  // apply projected renames: rewrite the event so it reflects where each
  // renamed directory inode will land once its rename is journaled
  for (const auto& [diri, renames] : projected_subtree_renames) {
    for (const auto& [olddir, newdir] : renames) {
      dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;

      auto&& dfls = diri->get_dirfrags();
      for (const auto& dir : dfls) {
	dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
	CDir *oldparent = get_projected_subtree_root(olddir);
	dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
	CDir *newparent = get_projected_subtree_root(newdir);
	dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;

	if (oldparent == newparent) {
	  dout(10) << "parent unchanged for " << dir->dirfrag() << " at "
		   << oldparent->dirfrag() << dendl;
	  continue;
	}

	if (dir->is_subtree_root()) {
	  if (le->subtrees.count(newparent->dirfrag()) &&
	      oldparent->get_dir_auth() != newparent->get_dir_auth())
	    dirs_to_add[dir->dirfrag()] = dir;
	  // children are fine.  change parent.
	  _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
				  le->subtrees);
	} else {
	  // mid-subtree.

	  if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
	    // crossing an auth boundary: this dirfrag becomes a subtree
	    // root (and/or a bound) in the journaled map
	    dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
	    // if oldparent is auth, subtree is mine; include it.
	    if (le->subtrees.count(oldparent->dirfrag())) {
	      dirs_to_add[dir->dirfrag()] = dir;
	      le->subtrees[dir->dirfrag()].clear();
	    }
	    // if newparent is auth, subtree is a new bound
	    if (le->subtrees.count(newparent->dirfrag())) {
	      dirs_to_add[dir->dirfrag()] = dir;
	      le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag());  // newparent is auth; new bound
	    }
	    newparent = dir;
	  }

	  // see if any old bounds move to the new parent.
	  for (set<CDir*>::iterator p = subtrees[oldparent].begin();
	       p != subtrees[oldparent].end();
	       ++p) {
	    CDir *bound = *p;
	    if (dir->contains(bound->get_parent_dir()))
	      _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
				      le->subtrees);
	  }
	}
      }
    }
  }

  // simplify the journaled map.  our in memory map may have more
  // subtrees than needed due to migrations that are just getting
  // started or just completing.  but on replay, the "live" map will
  // be simple and we can do a straight comparison.
  for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
    if (le->ambiguous_subtrees.count(p->first))
      continue;
    unsigned i = 0;
    while (i < p->second.size()) {
      dirfrag_t b = p->second[i];
      // a bound that is itself an unambiguous journaled subtree gets
      // swallowed into its parent; its own bounds become ours
      if (le->subtrees.count(b) &&
	  le->ambiguous_subtrees.count(b) == 0) {
	vector<dirfrag_t>& bb = le->subtrees[b];
	dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
	for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
	  p->second.push_back(*r);
	dirs_to_add.erase(b);
	le->subtrees.erase(b);
	p->second.erase(p->second.begin() + i);
      } else {
	++i;
      }
    }
  }

  for (auto &p : dirs_to_add) {
    CDir *dir = p.second;
    le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
    le->metablob.add_dir(dir, false);
  }

  dout(15) << " subtrees " << le->subtrees << dendl;
  dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;

  //le->metablob.print(cout);
  le->expire_pos = mds->mdlog->journaler->get_expire_pos();
  return le;
}
2650
2651 void MDCache::dump_resolve_status(Formatter *f) const
2652 {
2653 f->open_object_section("resolve_status");
2654 f->dump_stream("resolve_gather") << resolve_gather;
2655 f->dump_stream("resolve_ack_gather") << resolve_gather;
2656 f->close_section();
2657 }
2658
/*
 * Enter the resolve stage of recovery.  Stashes the completion context
 * (fired when resolve finishes), demotes our view of the root subtree's
 * authority if we are not root's auth, and records which peers we must
 * gather resolves from plus our journaled snaptable commits.
 */
void MDCache::resolve_start(MDSContext *resolve_done_)
{
  dout(10) << "resolve_start" << dendl;
  ceph_assert(!resolve_done);
  resolve_done.reset(resolve_done_);

  if (mds->mdsmap->get_root() != mds->get_nodeid()) {
    // if we don't have the root dir, adjust it to UNKNOWN.  during
    // resolve we want mds0 to explicit claim the portion of it that
    // it owns, so that anything beyond its bounds get left as
    // unknown.
    // NOTE(review): assumes the root inode is instantiated by this
    // point — confirm replay guarantees 'root' is non-null here.
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (rootdir)
      adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
  }
  // we must hear a resolve from every other recovering rank
  resolve_gather = recovery_set;

  resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
}
2678
/*
 * Drive the resolve exchange.  Slave resolves always go out first.
 * Subtree resolves are deferred until:
 *  - (survivor case, resolve_done unset) the snapclient cache has been
 *    synced; maybe_finish_slave_resolve() is re-run from the sync
 *    callback and sends them, and
 *  - all slave-resolve acks have arrived and pending rollbacks are done.
 */
void MDCache::send_resolves()
{
  send_slave_resolves();

  if (!resolve_done) {
    // I'm survivor: refresh snap cache
    mds->snapclient->sync(
	new MDSInternalContextWrapper(mds,
	  new LambdaContext([this](int r) {
	    maybe_finish_slave_resolve();
	    })
	  )
	);
    dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
    return;
  }
  if (!resolve_ack_gather.empty()) {
    dout(10) << "send_resolves still waiting for resolve ack from ("
	     << resolve_ack_gather << ")" << dendl;
    return;
  }
  if (!resolve_need_rollback.empty()) {
    dout(10) << "send_resolves still waiting for rollback to commit on ("
	     << resolve_need_rollback << ")" << dendl;
    return;
  }

  send_subtree_resolves();
}
2708
/*
 * Report our uncommitted slave updates to their master MDSs so each
 * master can decide, during resolve, whether the update commits or
 * aborts.  Every master we send to is added to resolve_ack_gather;
 * we expect an ack back from each.
 */
void MDCache::send_slave_resolves()
{
  dout(10) << "send_slave_resolves" << dendl;

  map<mds_rank_t, ref_t<MMDSResolve>> resolves;

  if (mds->is_resolve()) {
    // I'm recovering: report every uncommitted slave update replayed
    // from my journal, keyed by its master
    for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
	 p != uncommitted_slave_updates.end();
	 ++p) {
      resolves[p->first] = make_message<MMDSResolve>();
      for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
	   q != p->second.end();
	   ++q) {
	dout(10) << " including uncommitted " << q->first << dendl;
	resolves[p->first]->add_slave_request(q->first, false);
      }
    }
  } else {
    // I'm a survivor: only report active slave requests whose master is
    // recovering (or whose update is flagged ambiguous)
    set<mds_rank_t> resolve_set;
    mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
	 p != active_requests.end();
	 ++p) {
      MDRequestRef& mdr = p->second;
      if (!mdr->is_slave())
	continue;
      // only prepared-but-uncommitted or committing updates matter
      if (!mdr->slave_did_prepare() && !mdr->committing) {
	continue;
      }
      mds_rank_t master = mdr->slave_to_mds;
      if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
	dout(10) << " including uncommitted " << *mdr << dendl;
	if (!resolves.count(master))
	  resolves[master] = make_message<MMDSResolve>();
	if (!mdr->committing &&
	    mdr->has_more() && mdr->more()->is_inode_exporter) {
	  // re-send cap exports
	  CInode *in = mdr->more()->rename_inode;
	  map<client_t, Capability::Export> cap_map;
	  in->export_client_caps(cap_map);
	  bufferlist bl;
	  MMDSResolve::slave_inode_cap inode_caps(in->ino(), cap_map);
	  encode(inode_caps, bl);
	  resolves[master]->add_slave_request(p->first, bl);
	} else {
	  resolves[master]->add_slave_request(p->first, mdr->committing);
	}
      }
    }
  }

  for (auto &p : resolves) {
    dout(10) << "sending slave resolve to mds." << p.first << dendl;
    mds->send_message_mds(p.second, p.first);
    // expect an ack from each master we reported to
    resolve_ack_gather.insert(p.first);
  }
}
2767
2768 void MDCache::send_subtree_resolves()
2769 {
2770 dout(10) << "send_subtree_resolves" << dendl;
2771
2772 if (migrator->is_exporting() || migrator->is_importing()) {
2773 dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
2774 migrator->show_importing();
2775 migrator->show_exporting();
2776 resolves_pending = true;
2777 return; // not now
2778 }
2779
2780 map<mds_rank_t, ref_t<MMDSResolve>> resolves;
2781 for (set<mds_rank_t>::iterator p = recovery_set.begin();
2782 p != recovery_set.end();
2783 ++p) {
2784 if (*p == mds->get_nodeid())
2785 continue;
2786 if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
2787 resolves[*p] = make_message<MMDSResolve>();
2788 }
2789
2790 map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
2791 map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
2792
2793 // known
2794 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
2795 p != subtrees.end();
2796 ++p) {
2797 CDir *dir = p->first;
2798
2799 // only our subtrees
2800 if (dir->authority().first != mds->get_nodeid())
2801 continue;
2802
2803 if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
2804 continue; // we'll add it below
2805
2806 if (migrator->is_ambiguous_import(dir->dirfrag())) {
2807 // ambiguous (mid-import)
2808 set<CDir*> bounds;
2809 get_subtree_bounds(dir, bounds);
2810 vector<dirfrag_t> dfls;
2811 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
2812 dfls.push_back((*q)->dirfrag());
2813
2814 my_ambig_imports[dir->dirfrag()] = dfls;
2815 dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
2816 } else {
2817 // not ambiguous.
2818 for (auto &q : resolves) {
2819 resolves[q.first]->add_subtree(dir->dirfrag());
2820 }
2821 // bounds too
2822 vector<dirfrag_t> dfls;
2823 for (set<CDir*>::iterator q = subtrees[dir].begin();
2824 q != subtrees[dir].end();
2825 ++q) {
2826 CDir *bound = *q;
2827 dfls.push_back(bound->dirfrag());
2828 }
2829
2830 my_subtrees[dir->dirfrag()] = dfls;
2831 dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
2832 }
2833 }
2834
2835 // ambiguous
2836 for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
2837 p != my_ambiguous_imports.end();
2838 ++p) {
2839 my_ambig_imports[p->first] = p->second;
2840 dout(10) << " ambig " << p->first << " " << p->second << dendl;
2841 }
2842
2843 // simplify the claimed subtree.
2844 for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
2845 unsigned i = 0;
2846 while (i < p->second.size()) {
2847 dirfrag_t b = p->second[i];
2848 if (my_subtrees.count(b)) {
2849 vector<dirfrag_t>& bb = my_subtrees[b];
2850 dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
2851 for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
2852 p->second.push_back(*r);
2853 my_subtrees.erase(b);
2854 p->second.erase(p->second.begin() + i);
2855 } else {
2856 ++i;
2857 }
2858 }
2859 }
2860
2861 // send
2862 for (auto &p : resolves) {
2863 const ref_t<MMDSResolve> &m = p.second;
2864 if (mds->is_resolve()) {
2865 m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits);
2866 } else {
2867 m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
2868 }
2869 m->subtrees = my_subtrees;
2870 m->ambiguous_imports = my_ambig_imports;
2871 dout(10) << "sending subtee resolve to mds." << p.first << dendl;
2872 mds->send_message_mds(m, p.first);
2873 }
2874 resolves_pending = false;
2875 }
2876
// Once every slave-resolve ack has arrived and all rollbacks are done,
// move on: send our subtree resolves (if the snap cache is synced, or
// we are in resolve) and process any resolves we had delayed.
void MDCache::maybe_finish_slave_resolve() {
  if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) {
    // snap cache get synced or I'm in resolve state
    if (mds->snapclient->is_synced() || resolve_done)
      send_subtree_resolves();
    process_delayed_resolve();
  }
}
2885
/*
 * handle_mds_failure - called when mds.who is removed from the cluster.
 *
 * Records that we must gather resolve and rejoin messages from the failed
 * rank once it restarts, aborts or re-queues master/slave requests that
 * involved it, and cancels any in-flight fragment operations it was
 * participating in.
 */
void MDCache::handle_mds_failure(mds_rank_t who)
{
  dout(7) << "handle_mds_failure mds." << who << dendl;

  dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;

  // expect a (new) resolve from the failed rank when it restarts
  resolve_gather.insert(who);
  discard_delayed_resolve(who);
  ambiguous_slave_updates.erase(who);

  // and a rejoin; any rejoin traffic already exchanged with it is void
  rejoin_gather.insert(who);
  rejoin_sent.erase(who);        // i need to send another
  rejoin_ack_sent.erase(who);    // i need to send another
  rejoin_ack_gather.erase(who);  // i'll need/get another.

  dout(10) << " resolve_gather " << resolve_gather << dendl;
  dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
  dout(10) << " rejoin_sent " << rejoin_sent << dendl;
  dout(10) << " rejoin_gather " << rejoin_gather << dendl;
  dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;


  // tell the migrator too.
  migrator->handle_mds_failure_or_stop(who);

  // tell the balancer too.
  mds->balancer->handle_mds_failure(who);

  // clean up any requests slave to/from this node
  list<MDRequestRef> finish;
  for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
       p != active_requests.end();
       ++p) {
    MDRequestRef& mdr = p->second;
    // slave to the failed node?
    if (mdr->slave_to_mds == who) {
      if (mdr->slave_did_prepare()) {
        // prepare was journaled: keep it; the resolve stage decides commit/abort
        dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
        if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
          remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);

        if (!mdr->more()->waiting_on_slave.empty()) {
          ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
          // will rollback, no need to wait
          mdr->reset_slave_request();
          mdr->more()->waiting_on_slave.clear();
        }
      } else if (!mdr->committing) {
        dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
        if (mdr->slave_request || mdr->slave_rolling_back())
          mdr->aborted = true;   // let the in-progress dispatch/rollback clean up
        else
          finish.push_back(mdr); // nothing in flight, finish below
      }
    }

    if (mdr->is_slave() && mdr->slave_did_prepare()) {
      if (mdr->more()->waiting_on_slave.count(who)) {
        ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
        dout(10) << " slave request " << *mdr << " no longer need rename notity ack from mds."
                 << who << dendl;
        mdr->more()->waiting_on_slave.erase(who);
        if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
          mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
      }

      if (mdr->more()->srcdn_auth_mds == who &&
          mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
        // rename srcdn's auth mds failed, resolve even I'm a survivor.
        dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
        add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
      }
    } else if (mdr->slave_request) {
      const cref_t<MMDSSlaveRequest> &slave_req = mdr->slave_request;
      // FIXME: Slave rename request can arrive after we notice mds failure.
      // This can cause mds to crash (does not affect integrity of FS).
      if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
          slave_req->srcdn_auth == who)
        slave_req->mark_interrupted();
    }

    // failed node is slave?
    if (mdr->is_master() && !mdr->committing) {
      if (mdr->more()->srcdn_auth_mds == who) {
        dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
                 << who << " to recover" << dendl;
        ceph_assert(mdr->more()->witnessed.count(who) == 0);
        if (mdr->more()->is_ambiguous_auth)
          mdr->clear_ambiguous_auth();
        // rename srcdn's auth mds failed, all witnesses will rollback
        mdr->more()->witnessed.clear();
        pending_masters.erase(p->first);
      }

      if (mdr->more()->witnessed.count(who)) {
        mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
        if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
          dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
                   << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
          // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
          // until either the request is committing or the slave also fails.
          ceph_assert(mdr->more()->waiting_on_slave.size() == 1);
          pending_masters.insert(p->first);
        } else {
          dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
                   << who << " to recover" << dendl;
          if (srcdn_auth >= 0)
            ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0);

          // discard this peer's prepare (if any)
          mdr->more()->witnessed.erase(who);
        }
      }

      if (mdr->more()->waiting_on_slave.count(who)) {
        dout(10) << " master request " << *mdr << " waiting for slave mds." << who
                 << " to recover" << dendl;
        // retry request when peer recovers
        mdr->more()->waiting_on_slave.erase(who);
        if (mdr->more()->waiting_on_slave.empty())
          mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
      }

      if (mdr->locking && mdr->locking_target_mds == who)
        mdr->finish_locking(mdr->locking);
    }
  }

  for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
       p != uncommitted_masters.end();
       ++p) {
    // The failed MDS may have already committed the slave update
    if (p->second.slaves.count(who)) {
      p->second.recovering = true;
      p->second.slaves.erase(who);
    }
  }

  while (!finish.empty()) {
    dout(10) << "cleaning up slave request " << *finish.front() << dendl;
    request_finish(finish.front());
    finish.pop_front();
  }

  kick_find_ino_peers(who);
  kick_open_ino_peers(who);

  // cancel or advance fragment operations the failed rank participated in
  for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
       p != fragments.end(); ) {
    dirfrag_t df = p->first;
    fragment_info_t& info = p->second;

    if (info.is_fragmenting()) {
      // already past the point of no return: just stop waiting for the
      // failed rank's notify ack
      if (info.notify_ack_waiting.erase(who) &&
          info.notify_ack_waiting.empty()) {
        fragment_drop_locks(info);
        fragment_maybe_finish(p++);
      } else {
        ++p;
      }
      continue;
    }

    ++p;
    dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
    std::vector<CDir*> dirs;
    info.dirs.swap(dirs);
    fragments.erase(df);
    fragment_unmark_unfreeze_dirs(dirs);
  }

  // MDCache::shutdown_export_strays() always exports strays to mds.0
  if (who == mds_rank_t(0))
    shutdown_exporting_strays.clear();

  show_subtrees();
}
3063
/*
 * handle_mds_recovery - called on another node's transition
 * from resolve -> active.
 *
 * Wakes waiters parked on replica subtrees whose auth is the recovered
 * rank, so operations blocked on the failure can retry.
 */
void MDCache::handle_mds_recovery(mds_rank_t who)
{
  dout(7) << "handle_mds_recovery mds." << who << dendl;

  // exclude all discover waiters. kick_discovers() will do the job
  static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
  static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;

  MDSContext::vec waiters;

  // wake up any waiters in their subtrees
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;

    // only subtrees authed by the recovered rank, excluding any we
    // (even ambiguously) claim ourselves
    if (dir->authority().first != who ||
        dir->authority().second == mds->get_nodeid())
      continue;
    ceph_assert(!dir->is_auth());

    // wake any waiters: breadth-first walk of the subtree
    std::queue<CDir*> q;
    q.push(dir);

    while (!q.empty()) {
      CDir *d = q.front();
      q.pop();
      d->take_waiting(d_mask, waiters);

      // inode waiters too
      for (auto &p : d->items) {
        CDentry *dn = p.second;
        CDentry::linkage_t *dnl = dn->get_linkage();
        if (dnl->is_primary()) {
          dnl->get_inode()->take_waiting(i_mask, waiters);

          // recurse? stop at nested subtree roots (they have their own auth)
          auto&& ls = dnl->get_inode()->get_dirfrags();
          for (const auto& subdir : ls) {
            if (!subdir->is_subtree_root())
              q.push(subdir);
          }
        }
      }
    }
  }

  kick_open_ino_peers(who);
  kick_find_ino_peers(who);

  // queue them up.
  mds->queue_waiters(waiters);
}
3122
// Record the set of ranks we must exchange resolve/rejoin messages with
// (the mds cluster membership at the time of the first failure).
void MDCache::set_recovery_set(set<mds_rank_t>& s)
{
  dout(7) << "set_recovery_set " << s << dendl;
  recovery_set = s;
}
3128
3129
/*
 * during resolve state, we share resolves to determine who
 * is authoritative for which trees. we expect to get a resolve
 * from _everyone_ in the recovery_set (the mds cluster at the time of
 * the first failure).
 *
 * This function may park the passed message (in delayed_resolve) and
 * reprocess it later instead of handling it immediately.
 */
void MDCache::handle_resolve(const cref_t<MMDSResolve> &m)
{
  dout(7) << "handle_resolve from " << m->get_source() << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (mds->get_state() < MDSMap::STATE_RESOLVE) {
    if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
      mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
      return;
    }
    // wait until we reach the resolve stage!
    return;
  }

  // any earlier resolve we delayed from this rank is superseded
  discard_delayed_resolve(from);

  // ambiguous slave requests?
  if (!m->slave_requests.empty()) {
    if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
      // survivor: delay replying while any of the named master updates
      // are still not safely journaled locally
      for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
        if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
          ceph_assert(!p->second.committing);
          pending_masters.insert(p->first);
        }
      }

      if (!pending_masters.empty()) {
        dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
        delayed_resolve[from] = m;
        return;
      }
    }

    // answer each ambiguous slave request with COMMIT or ABORT
    auto ack = make_message<MMDSResolveAck>();
    for (const auto &p : m->slave_requests) {
      if (uncommitted_masters.count(p.first)) {  //mds->sessionmap.have_completed_request(p.first)) {
        // COMMIT
        if (p.second.committing) {
          // already committing, waiting for the OP_COMMITTED slave reply
          dout(10) << " already committing slave request " << p << " noop "<< dendl;
        } else {
          dout(10) << " ambiguous slave request " << p << " will COMMIT" << dendl;
          ack->add_commit(p.first);
        }
        uncommitted_masters[p.first].slaves.insert(from);   // wait for slave OP_COMMITTED before we log ECommitted

        if (p.second.inode_caps.length() > 0) {
          // slave wants to export caps (rename)
          ceph_assert(mds->is_resolve());
          MMDSResolve::slave_inode_cap inode_caps;
          auto q = p.second.inode_caps.cbegin();
          decode(inode_caps, q);
          inodeno_t ino = inode_caps.ino;
          map<client_t,Capability::Export> cap_exports = inode_caps.cap_exports;
          ceph_assert(get_inode(ino));

          // register each exported client cap as an import on our side
          for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
               q != cap_exports.end();
               ++q) {
            Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
            im.cap_id = ++last_cap_id; // assign a new cap ID
            im.issue_seq = 1;
            im.mseq = q->second.mseq;

            Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
            if (session)
              rejoin_client_map.emplace(q->first, session->info.inst);
          }

          // will process these caps in rejoin stage
          rejoin_slave_exports[ino].first = from;
          rejoin_slave_exports[ino].second.swap(cap_exports);

          // send information of imported caps back to slave
          encode(rejoin_imported_caps[from][ino], ack->commit[p.first]);
        }
      } else {
        // ABORT
        dout(10) << " ambiguous slave request " << p << " will ABORT" << dendl;
        ceph_assert(!p.second.committing);
        ack->add_abort(p.first);
      }
    }
    mds->send_message(ack, m->get_connection());
    return;
  }

  // subtree resolves can't be applied while slave updates are unresolved
  if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) {
    dout(10) << "delay processing subtree resolve" << dendl;
    delayed_resolve[from] = m;
    return;
  }

  bool survivor = false;
  // am i a surviving ambiguous importer?
  if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
    survivor = true;
    // check for any import success/failure (from this node)
    map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
    while (p != my_ambiguous_imports.end()) {
      map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
      ++next;
      CDir *dir = get_dirfrag(p->first);
      ceph_assert(dir);
      dout(10) << "checking ambiguous import " << *dir << dendl;
      if (migrator->is_importing(dir->dirfrag()) &&
          migrator->get_import_peer(dir->dirfrag()) == from) {
        ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);

        // check if sender claims the subtree
        bool claimed_by_sender = false;
        for (const auto &q : m->subtrees) {
          // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
          CDir *base = get_force_dirfrag(q.first, false);
          if (!base || !base->contains(dir))
            continue;  // base not dir or an ancestor of dir, clearly doesn't claim dir.

          bool inside = true;
          set<CDir*> bounds;
          get_force_dirfrag_bound_set(q.second, bounds);
          for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
            CDir *bound = *p;
            if (bound->contains(dir)) {
              inside = false;  // nope, bound is dir or parent of dir, not inside.
              break;
            }
          }
          if (inside)
            claimed_by_sender = true;
        }

        my_ambiguous_imports.erase(p);  // no longer ambiguous.
        if (claimed_by_sender) {
          dout(7) << "ambiguous import failed on " << *dir << dendl;
          migrator->import_reverse(dir);
        } else {
          dout(7) << "ambiguous import succeeded on " << *dir << dendl;
          migrator->import_finish(dir, true);
        }
      }
      p = next;
    }
  }

  // update my dir_auth values
  //   need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
  //   migrations between other nodes)
  for (const auto& p : m->subtrees) {
    dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl;
    CDir *dir = get_force_dirfrag(p.first, !survivor);
    if (!dir)
      continue;
    adjust_bounded_subtree_auth(dir, p.second, from);
    try_subtree_merge(dir);
  }

  show_subtrees();

  // note ambiguous imports too
  for (const auto& p : m->ambiguous_imports) {
    dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl;
    other_ambiguous_imports[from][p.first] = p.second;
  }

  // learn other mds' pending snaptable commits. later when resolve finishes, we will reload
  // snaptable cache from snapserver. By this way, snaptable cache get synced among all mds
  for (const auto& p : m->table_clients) {
    dout(10) << " noting " << get_mdstable_name(p.type)
             << " pending_commits " << p.pending_commits << dendl;
    MDSTableClient *client = mds->get_table_client(p.type);
    for (const auto& q : p.pending_commits)
      client->notify_commit(q);
  }

  // did i get them all?
  resolve_gather.erase(from);

  maybe_resolve_finish();
}
3317
3318 void MDCache::process_delayed_resolve()
3319 {
3320 dout(10) << "process_delayed_resolve" << dendl;
3321 map<mds_rank_t, cref_t<MMDSResolve>> tmp;
3322 tmp.swap(delayed_resolve);
3323 for (auto &p : tmp) {
3324 handle_resolve(p.second);
3325 }
3326 }
3327
// Drop any resolve message queued from 'who' (e.g. the sender failed
// again, or sent a fresh resolve that supersedes it).
void MDCache::discard_delayed_resolve(mds_rank_t who)
{
  delayed_resolve.erase(who);
}
3332
// Once every rank's subtree resolve has arrived (and all slave-update
// acks/rollbacks are done -- asserted below), disambiguate imports and
// either complete the resolve stage (if we are recovering) or resume
// sending pending rejoins (if we are a survivor).
void MDCache::maybe_resolve_finish()
{
  ceph_assert(resolve_ack_gather.empty());
  ceph_assert(resolve_need_rollback.empty());

  if (!resolve_gather.empty()) {
    dout(10) << "maybe_resolve_finish still waiting for resolves ("
             << resolve_gather << ")" << dendl;
    return;
  }

  dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
  disambiguate_my_imports();
  finish_committed_masters();

  if (resolve_done) {
    ceph_assert(mds->is_resolve());
    trim_unlinked_inodes();
    recalc_auth_bits(false);
    // hand completion back to MDSRank; release() transfers ownership
    resolve_done.release()->complete(0);
  } else {
    // I am survivor.
    maybe_send_pending_rejoins();
  }
}
3358
/*
 * Process a resolve ack: for each ambiguous slave update we reported,
 * the master tells us whether to COMMIT it or ABORT (roll it back).
 */
void MDCache::handle_resolve_ack(const cref_t<MMDSResolveAck> &ack)
{
  dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // ignore stale acks from ranks we are not currently gathering from
  if (!resolve_ack_gather.count(from) ||
      mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
    return;
  }

  if (ambiguous_slave_updates.count(from)) {
    // only a survivor tracks ambiguous slave updates for a live master
    ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
    ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
  }

  for (const auto &p : ack->commit) {
    dout(10) << " commit on slave " << p.first << dendl;

    if (ambiguous_slave_updates.count(from)) {
      remove_ambiguous_slave_update(p.first, from);
      continue;
    }

    if (mds->is_resolve()) {
      // replay
      MDSlaveUpdate *su = get_uncommitted_slave_update(p.first, from);
      ceph_assert(su);

      // log commit
      mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p.first, from,
                                                      ESlaveUpdate::OP_COMMIT, su->origop),
                                     new C_MDC_SlaveCommit(this, from, p.first));
      mds->mdlog->flush();

      finish_uncommitted_slave_update(p.first, from);
    } else {
      MDRequestRef mdr = request_get(p.first);
      // information about master imported caps
      if (p.second.length() > 0)
        mdr->more()->inode_import.share(p.second);

      ceph_assert(mdr->slave_request == 0);  // shouldn't be doing anything!
      request_finish(mdr);
    }
  }

  for (const auto &metareq : ack->abort) {
    dout(10) << " abort on slave " << metareq << dendl;

    if (mds->is_resolve()) {
      MDSlaveUpdate *su = get_uncommitted_slave_update(metareq, from);
      ceph_assert(su);

      // perform rollback (and journal a rollback entry)
      // note: this will hold up the resolve a bit, until the rollback entries journal.
      MDRequestRef null_ref;
      switch (su->origop) {
      case ESlaveUpdate::LINK:
        mds->server->do_link_rollback(su->rollback, from, null_ref);
        break;
      case ESlaveUpdate::RENAME:
        mds->server->do_rename_rollback(su->rollback, from, null_ref);
        break;
      case ESlaveUpdate::RMDIR:
        mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
        break;
      default:
        ceph_abort();
      }
    } else {
      MDRequestRef mdr = request_get(metareq);
      mdr->aborted = true;
      if (mdr->slave_request) {
        if (mdr->slave_did_prepare()) // journaling slave prepare ?
          add_rollback(metareq, from);
      } else {
        request_finish(mdr);
      }
    }
  }

  // if updates are still ambiguous, keep gathering from this rank
  if (!ambiguous_slave_updates.count(from)) {
    resolve_ack_gather.erase(from);
    maybe_finish_slave_resolve();
  }
}
3445
3446 void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su)
3447 {
3448 ceph_assert(uncommitted_slave_updates[master].count(reqid) == 0);
3449 uncommitted_slave_updates[master][reqid] = su;
3450 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3451 uncommitted_slave_rename_olddir[*p]++;
3452 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3453 uncommitted_slave_unlink[*p]++;
3454 }
3455
// Forget the logged slave update for (master, reqid) once it has been
// committed or rolled back: drop the refcounts taken in
// add_uncommitted_slave_update(), trim subtrees/inodes that were only
// pinned by it, and free the update record.
void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
{
  ceph_assert(uncommitted_slave_updates[master].count(reqid));
  MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid];

  uncommitted_slave_updates[master].erase(reqid);
  if (uncommitted_slave_updates[master].empty())
    uncommitted_slave_updates.erase(master);
  // discard the non-auth subtree we renamed out of
  for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
    CInode *diri = *p;
    map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
    ceph_assert(it != uncommitted_slave_rename_olddir.end());
    it->second--;
    if (it->second == 0) {
      // last reference dropped; try to trim any now-authless subtree
      uncommitted_slave_rename_olddir.erase(it);
      auto&& ls = diri->get_dirfrags();
      for (const auto& dir : ls) {
        CDir *root = get_subtree_root(dir);
        if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
          try_trim_non_auth_subtree(root);
          // NOTE(review): stops after the first trimmed non-root frag --
          // presumably the remaining frags share that subtree root; confirm
          if (dir != root)
            break;
        }
      }
    } else
      ceph_assert(it->second > 0);
  }
  // remove the inodes that were unlinked by slave update
  for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
    CInode *in = *p;
    map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
    ceph_assert(it != uncommitted_slave_unlink.end());
    it->second--;
    if (it->second == 0) {
      uncommitted_slave_unlink.erase(it);
      // only drop the inode if nothing (projected) links to it anymore
      if (!in->get_projected_parent_dn())
        mds->mdcache->remove_inode_recursive(in);
    } else
      ceph_assert(it->second > 0);
  }
  delete su;
}
3499
3500 MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master)
3501 {
3502
3503 MDSlaveUpdate* su = NULL;
3504 if (uncommitted_slave_updates.count(master) &&
3505 uncommitted_slave_updates[master].count(reqid)) {
3506 su = uncommitted_slave_updates[master][reqid];
3507 ceph_assert(su);
3508 }
3509 return su;
3510 }
3511
3512 void MDCache::finish_rollback(metareqid_t reqid) {
3513 auto p = resolve_need_rollback.find(reqid);
3514 ceph_assert(p != resolve_need_rollback.end());
3515 if (mds->is_resolve())
3516 finish_uncommitted_slave_update(reqid, p->second);
3517 resolve_need_rollback.erase(p);
3518 maybe_finish_slave_resolve();
3519 }
3520
3521 void MDCache::disambiguate_other_imports()
3522 {
3523 dout(10) << "disambiguate_other_imports" << dendl;
3524
3525 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3526 // other nodes' ambiguous imports
3527 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3528 p != other_ambiguous_imports.end();
3529 ++p) {
3530 mds_rank_t who = p->first;
3531 dout(10) << "ambiguous imports for mds." << who << dendl;
3532
3533 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3534 q != p->second.end();
3535 ++q) {
3536 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3537 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3538 CDir *dir = get_force_dirfrag(q->first, recovering);
3539 if (!dir) continue;
3540
3541 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3542 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3543 dout(10) << " mds." << who << " did import " << *dir << dendl;
3544 adjust_bounded_subtree_auth(dir, q->second, who);
3545 try_subtree_merge(dir);
3546 } else {
3547 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3548 }
3549 }
3550 }
3551 other_ambiguous_imports.clear();
3552 }
3553
// After all resolves are gathered: decide for each of our own ambiguous
// imports whether it completed (nobody else claims the subtree -> keep
// it) or failed (another rank claims it -> drop it), journaling an
// EImportFinish either way. Only does work while in the resolve state.
void MDCache::disambiguate_my_imports()
{
  dout(10) << "disambiguate_my_imports" << dendl;

  if (!mds->is_resolve()) {
    ceph_assert(my_ambiguous_imports.empty());
    return;
  }

  disambiguate_other_imports();

  // my ambiguous imports
  mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
  while (!my_ambiguous_imports.empty()) {
    map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();

    CDir *dir = get_dirfrag(q->first);
    ceph_assert(dir);

    if (dir->authority() != me_ambig) {
      // someone else claimed it: the import failed
      dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
      cancel_ambiguous_import(dir);

      mds->mdlog->start_submit_entry(new EImportFinish(dir, false));

      // subtree may have been swallowed by another node claiming dir
      // as their own.
      CDir *root = get_subtree_root(dir);
      if (root != dir)
        dout(10) << " subtree root is " << *root << dendl;
      ceph_assert(root->dir_auth.first != mds->get_nodeid()); // no us!
      try_trim_non_auth_subtree(root);
    } else {
      dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
      finish_ambiguous_import(q->first);
      mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
    }
  }
  ceph_assert(my_ambiguous_imports.empty());
  mds->mdlog->flush();

  // verify all my subtrees are unambiguous!
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    if (dir->is_ambiguous_dir_auth()) {
      dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
    }
    ceph_assert(!dir->is_ambiguous_dir_auth());
  }

  show_subtrees();
}
3608
3609
3610 void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3611 {
3612 ceph_assert(my_ambiguous_imports.count(base) == 0);
3613 my_ambiguous_imports[base] = bounds;
3614 }
3615
3616
3617 void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3618 {
3619 // make a list
3620 vector<dirfrag_t> binos;
3621 for (set<CDir*>::iterator p = bounds.begin();
3622 p != bounds.end();
3623 ++p)
3624 binos.push_back((*p)->dirfrag());
3625
3626 // note: this can get called twice if the exporter fails during recovery
3627 if (my_ambiguous_imports.count(base->dirfrag()))
3628 my_ambiguous_imports.erase(base->dirfrag());
3629
3630 add_ambiguous_import(base->dirfrag(), binos);
3631 }
3632
3633 void MDCache::cancel_ambiguous_import(CDir *dir)
3634 {
3635 dirfrag_t df = dir->dirfrag();
3636 ceph_assert(my_ambiguous_imports.count(df));
3637 dout(10) << "cancel_ambiguous_import " << df
3638 << " bounds " << my_ambiguous_imports[df]
3639 << " " << *dir
3640 << dendl;
3641 my_ambiguous_imports.erase(df);
3642 }
3643
3644 void MDCache::finish_ambiguous_import(dirfrag_t df)
3645 {
3646 ceph_assert(my_ambiguous_imports.count(df));
3647 vector<dirfrag_t> bounds;
3648 bounds.swap(my_ambiguous_imports[df]);
3649 my_ambiguous_imports.erase(df);
3650
3651 dout(10) << "finish_ambiguous_import " << df
3652 << " bounds " << bounds
3653 << dendl;
3654 CDir *dir = get_dirfrag(df);
3655 ceph_assert(dir);
3656
3657 // adjust dir_auth, import maps
3658 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3659 try_subtree_merge(dir);
3660 }
3661
// Tear down 'in' and everything beneath it: unlink and delete every
// dentry/inode in each of its dirfrags, deregister any subtree roots,
// then drop 'in' itself from the cache.
void MDCache::remove_inode_recursive(CInode *in)
{
  dout(10) << "remove_inode_recursive " << *in << dendl;
  auto&& ls = in->get_dirfrags();
  for (const auto& subdir : ls) {
    dout(10) << " removing dirfrag " << *subdir << dendl;
    auto it = subdir->items.begin();
    while (it != subdir->items.end()) {
      CDentry *dn = it->second;
      // advance before remove_dentry() invalidates this entry
      ++it;
      CDentry::linkage_t *dnl = dn->get_linkage();
      if (dnl->is_primary()) {
        CInode *tin = dnl->get_inode();
        subdir->unlink_inode(dn, false);
        remove_inode_recursive(tin);
      }
      subdir->remove_dentry(dn);
    }

    if (subdir->is_subtree_root())
      remove_subtree(subdir);
    in->close_dirfrag(subdir->dirfrag().frag);
  }
  remove_inode(in);
}
3687
// Returns true if the (non-auth) tree rooted at 'in' must NOT be expired
// (it still contains a subtree root, a still-linked hardlink target, or
// an unexpireable dentry). Otherwise queues every dentry beneath 'in'
// for trimming into 'expiremap' and returns false.
bool MDCache::expire_recursive(CInode *in, expiremap &expiremap)
{
  ceph_assert(!in->is_auth());

  dout(10) << __func__ << ":" << *in << dendl;

  // Recurse into any dirfrags beneath this inode
  auto&& ls = in->get_dirfrags();
  for (const auto& subdir : ls) {
    if (!in->is_mdsdir() && subdir->is_subtree_root()) {
      dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
      return true;
    }

    for (auto &it : subdir->items) {
      CDentry *dn = it.second;
      CDentry::linkage_t *dnl = dn->get_linkage();
      if (dnl->is_primary()) {
        CInode *tin = dnl->get_inode();

        /* Remote strays with linkage (i.e. hardlinks) should not be
         * expired, because they may be the target of
         * a rename() as the owning MDS shuts down */
        if (!tin->is_stray() && tin->inode.nlink) {
          dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
          return true;
        }

        const bool abort = expire_recursive(tin, expiremap);
        if (abort) {
          return true;
        }
      }
      if (dn->lru_is_expireable()) {
        trim_dentry(dn, expiremap);
      } else {
        dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
        return true;
      }
    }
  }

  return false;
}
3732
3733 void MDCache::trim_unlinked_inodes()
3734 {
3735 dout(7) << "trim_unlinked_inodes" << dendl;
3736 int count = 0;
3737 vector<CInode*> q;
3738 for (auto &p : inode_map) {
3739 CInode *in = p.second;
3740 if (in->get_parent_dn() == NULL && !in->is_base()) {
3741 dout(7) << " will trim from " << *in << dendl;
3742 q.push_back(in);
3743 }
3744
3745 if (!(++count % 1000))
3746 mds->heartbeat_reset();
3747 }
3748 for (auto& in : q) {
3749 remove_inode_recursive(in);
3750
3751 if (!(++count % 1000))
3752 mds->heartbeat_reset();
3753 }
3754 }
3755
/** recalc_auth_bits()
 * once subtree auth is disambiguated, we need to adjust all the
 * auth and dirty bits in our cache before moving on.
 *
 * replay == true means we are invoked during journal replay: only the
 * AUTH bits are flipped; the REJOINING marking and dirty-state cleanup
 * for non-auth items is skipped.
 */
void MDCache::recalc_auth_bits(bool replay)
{
  dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;

  if (root) {
    root->inode_auth.first = mds->mdsmap->get_root();
    bool auth = mds->get_nodeid() == root->inode_auth.first;
    if (auth) {
      root->state_set(CInode::STATE_AUTH);
    } else {
      root->state_clear(CInode::STATE_AUTH);
      if (!replay)
        root->state_set(CInode::STATE_REJOINING);
    }
  }

  // inodes at the root of one of *our* subtrees; their scatterlocks
  // must be left alone below
  set<CInode*> subtree_inodes;
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    if (p->first->dir_auth.first == mds->get_nodeid())
      subtree_inodes.insert(p->first->inode);
  }

  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    if (p->first->inode->is_mdsdir()) {
      CInode *in = p->first->inode;
      bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
      if (auth) {
        in->state_set(CInode::STATE_AUTH);
      } else {
        in->state_clear(CInode::STATE_AUTH);
        if (!replay)
          in->state_set(CInode::STATE_REJOINING);
      }
    }

    std::queue<CDir*> dfq;  // dirfrag queue
    dfq.push(p->first);

    bool auth = p->first->authority().first == mds->get_nodeid();
    dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;

    // breadth-first over all dirfrags of this subtree
    while (!dfq.empty()) {
      CDir *dir = dfq.front();
      dfq.pop();

      // dir
      if (auth) {
        dir->state_set(CDir::STATE_AUTH);
      } else {
        dir->state_clear(CDir::STATE_AUTH);
        if (!replay) {
          // close empty non-auth dirfrag
          if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
            dir->inode->close_dirfrag(dir->get_frag());
            continue;
          }
          dir->state_set(CDir::STATE_REJOINING);
          dir->state_clear(CDir::STATE_COMPLETE);
          if (dir->is_dirty())
            dir->mark_clean();
        }
      }

      // dentries in this dir
      for (auto &p : dir->items) {
        // dn
        CDentry *dn = p.second;
        CDentry::linkage_t *dnl = dn->get_linkage();
        if (auth) {
          dn->state_set(CDentry::STATE_AUTH);
        } else {
          dn->state_clear(CDentry::STATE_AUTH);
          if (!replay) {
            dn->state_set(CDentry::STATE_REJOINING);
            if (dn->is_dirty())
              dn->mark_clean();
          }
        }

        if (dnl->is_primary()) {
          // inode
          CInode *in = dnl->get_inode();
          if (auth) {
            in->state_set(CInode::STATE_AUTH);
          } else {
            in->state_clear(CInode::STATE_AUTH);
            if (!replay) {
              in->state_set(CInode::STATE_REJOINING);
              if (in->is_dirty())
                in->mark_clean();
              if (in->is_dirty_parent())
                in->clear_dirty_parent();
              // avoid touching scatterlocks for our subtree roots!
              if (subtree_inodes.count(in) == 0)
                in->clear_scatter_dirty();
            }
          }
          // recurse?
          if (in->is_dir()) {
            auto&& dfv = in->get_nested_dirfrags();
            for (const auto& dir : dfv) {
              dfq.push(dir);
            }
          }
        }
      }
    }
  }

  show_subtrees();
  show_cache();
}
3876
3877
3878
3879 // ===========================================================================
3880 // REJOIN
3881
3882 /*
3883 * notes on scatterlock recovery:
3884 *
3885 * - recovering inode replica sends scatterlock data for any subtree
3886 * roots (the only ones that are possibly dirty).
3887 *
3888 * - surviving auth incorporates any provided scatterlock data. any
3889 * pending gathers are then finished, as with the other lock types.
3890 *
3891 * that takes care of surviving auth + (recovering replica)*.
3892 *
3893 * - surviving replica sends strong_inode, which includes current
3894 * scatterlock state, AND any dirty scatterlock data. this
3895 * provides the recovering auth with everything it might need.
3896 *
3897 * - recovering auth must pick initial scatterlock state based on
3898 * (weak|strong) rejoins.
3899 * - always assimilate scatterlock data (it can't hurt)
3900 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3901 * - include base inode in ack for all inodes that saw scatterlock content
3902 *
3903 * also, for scatter gather,
3904 *
3905 * - auth increments {frag,r}stat.version on completion of any gather.
3906 *
3907 * - auth incorporates changes in a gather _only_ if the version
3908 * matches.
3909 *
3910 * - replica discards changes any time the scatterlock syncs, and
3911 * after recovery.
3912 */
3913
// Emit current rejoin progress (e.g. for an admin-socket status query).
void MDCache::dump_rejoin_status(Formatter *f) const
{
  f->open_object_section("rejoin_status");
  // ranks we still expect a cache rejoin message from
  f->dump_stream("rejoin_gather") << rejoin_gather;
  // ranks (and possibly ourselves) we still expect a rejoin ack from
  f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
  // count of inodes still being opened so imported caps can be attached
  f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
  f->close_section();
}
3922
// Begin the rejoin phase.  Stash the completion callback and seed
// rejoin_gather with every rank we expect a rejoin from; our own rank is
// included so that rejoins are not sent until imported caps are processed.
void MDCache::rejoin_start(MDSContext *rejoin_done_)
{
  dout(10) << "rejoin_start" << dendl;
  ceph_assert(!rejoin_done);  // rejoin must not already be in progress
  rejoin_done.reset(rejoin_done_);

  rejoin_gather = recovery_set;
  // need finish opening cap inodes before sending cache rejoins
  rejoin_gather.insert(mds->get_nodeid());
  process_imported_caps();
}
3934
3935 /*
3936 * rejoin phase!
3937 *
3938 * this initiates rejoin. it should be called before we get any
3939 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
3940 *
3941 * we start out by sending rejoins to everyone in the recovery set.
3942 *
3943 * if we are rejoin, send for all regions in our cache.
3944 * if we are active|stopping, send only to nodes that are rejoining.
3945 */
void MDCache::rejoin_send_rejoins()
{
  dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;

  // can't send yet if we're still processing our own imported caps;
  // rejoins_pending makes a later caller retry.
  if (rejoin_gather.count(mds->get_nodeid())) {
    dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
    rejoins_pending = true;
    return;
  }
  // likewise, the resolve stage must be fully complete first.
  if (!resolve_gather.empty()) {
    dout(7) << "rejoin_send_rejoins still waiting for resolves ("
	    << resolve_gather << ")" << dendl;
    rejoins_pending = true;
    return;
  }

  ceph_assert(!migrator->is_importing());
  ceph_assert(!migrator->is_exporting());

  if (!mds->is_rejoin()) {
    disambiguate_other_imports();
  }

  // per-target rejoin messages, built up below and sent at the end
  map<mds_rank_t, ref_t<MMDSCacheRejoin>> rejoins;


  // if i am rejoining, send a rejoin to everyone.
  // otherwise, just send to others who are rejoining.
  for (const auto& rank : recovery_set) {
    if (rank == mds->get_nodeid()) continue;  // nothing to myself!
    if (rejoin_sent.count(rank)) continue;     // already sent a rejoin to this node!
    if (mds->is_rejoin())
      rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_WEAK);
    else if (mds->mdsmap->is_rejoin(rank))
      rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_STRONG);
  }

  if (mds->is_rejoin()) {
    // bundle cap export info into the weak rejoins.  client_exports
    // caches each client's session and the set of targets it appears in,
    // so the session lookup happens at most once per client.
    map<client_t, pair<Session*, set<mds_rank_t> > > client_exports;
    for (auto& p : cap_exports) {
      mds_rank_t target = p.second.first;
      if (rejoins.count(target) == 0)
	continue;
      for (auto q = p.second.second.begin(); q != p.second.second.end(); ) {
	Session *session = nullptr;
	auto it = client_exports.find(q->first);
	if (it != client_exports.end()) {
	  session = it->second.first;
	  if (session)
	    it->second.second.insert(target);
	} else {
	  session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
	  auto& r = client_exports[q->first];
	  r.first = session;
	  if (session)
	    r.second.insert(target);
	}
	if (session) {
	  ++q;
	} else {
	  // remove reconnect with no session
	  p.second.second.erase(q++);
	}
      }
      rejoins[target]->cap_exports[p.first] = p.second.second;
    }
    // attach client identity/metadata for every client referenced by each
    // target's cap exports.
    for (auto& p : client_exports) {
      Session *session = p.second.first;
      for (auto& q : p.second.second) {
	auto rejoin = rejoins[q];
	rejoin->client_map[p.first] = session->info.inst;
	rejoin->client_metadata_map[p.first] = session->info.client_metadata;
      }
    }
  }


  // check all subtrees: walk each non-auth subtree into its auth's rejoin.
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    ceph_assert(dir->is_subtree_root());
    if (dir->is_ambiguous_dir_auth()) {
      // exporter is recovering, importer is survivor.
      ceph_assert(rejoins.count(dir->authority().first));
      ceph_assert(!rejoins.count(dir->authority().second));
      continue;
    }

    // my subtree?
    if (dir->is_auth())
      continue;  // skip my own regions!

    mds_rank_t auth = dir->get_dir_auth().first;
    ceph_assert(auth >= 0);
    if (rejoins.count(auth) == 0)
      continue;   // don't care about this node's subtrees

    rejoin_walk(dir, rejoins[auth]);
  }

  // rejoin root inodes, too
  for (auto &p : rejoins) {
    if (mds->is_rejoin()) {
      // weak: root goes to rank 0, each target gets its own mdsdir
      if (p.first == 0 && root) {
	p.second->add_weak_inode(root->vino());
	if (root->is_dirty_scattered()) {
	  dout(10) << " sending scatterlock state on root " << *root << dendl;
	  p.second->add_scatterlock_state(root);
	}
      }
      if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
	if (in)
	  p.second->add_weak_inode(in->vino());
      }
    } else {
      // strong: include replica nonce, caps wanted and scatterlock states
      if (p.first == 0 && root) {
	p.second->add_strong_inode(root->vino(),
				   root->get_replica_nonce(),
				   root->get_caps_wanted(),
				   root->filelock.get_state(),
				   root->nestlock.get_state(),
				   root->dirfragtreelock.get_state());
	root->state_set(CInode::STATE_REJOINING);
	if (root->is_dirty_scattered()) {
	  dout(10) << " sending scatterlock state on root " << *root << dendl;
	  p.second->add_scatterlock_state(root);
	}
      }

      if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
	p.second->add_strong_inode(in->vino(),
				   in->get_replica_nonce(),
				   in->get_caps_wanted(),
				   in->filelock.get_state(),
				   in->nestlock.get_state(),
				   in->dirfragtreelock.get_state());
	in->state_set(CInode::STATE_REJOINING);
      }
    }
  }

  if (!mds->is_rejoin()) {
    // i am survivor.  send strong rejoin.
    // note request remote_auth_pins, xlocks
    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
	 p != active_requests.end();
	 ++p) {
      MDRequestRef& mdr = p->second;
      if (mdr->is_slave())
	continue;
      // auth pins held remotely on objects whose auth is rejoining
      for (const auto& q : mdr->object_states) {
	if (q.second.remote_auth_pinned == MDS_RANK_NONE)
	  continue;
	if (!q.first->is_auth()) {
	  mds_rank_t target = q.second.remote_auth_pinned;
	  ceph_assert(target == q.first->authority().first);
	  if (rejoins.count(target) == 0) continue;
	  const auto& rejoin = rejoins[target];

	  dout(15) << " " << *mdr << " authpin on " << *q.first << dendl;
	  MDSCacheObjectInfo i;
	  q.first->set_object_info(i);
	  // i.ino set => the object is an inode, else a dentry
	  if (i.ino)
	    rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
	  else
	    rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);

	  if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
	      mdr->more()->rename_inode == q.first)
	    rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
					     mdr->reqid, mdr->attempt);
	}
      }
      // xlocks (and remote wrlocks) we hold on the rejoining rank's objects
      for (const auto& q : mdr->locks) {
	auto lock = q.lock;
	auto obj = lock->get_parent();
	if (q.is_xlock() && !obj->is_auth()) {
	  mds_rank_t who = obj->authority().first;
	  if (rejoins.count(who) == 0) continue;
	  const auto& rejoin = rejoins[who];

	  dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl;
	  MDSCacheObjectInfo i;
	  obj->set_object_info(i);
	  if (i.ino)
	    rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
				    mdr->reqid, mdr->attempt);
	  else
	    rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
				     mdr->reqid, mdr->attempt);
	} else if (q.is_remote_wrlock()) {
	  mds_rank_t who = q.wrlock_target;
	  if (rejoins.count(who) == 0) continue;
	  const auto& rejoin = rejoins[who];

	  dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl;
	  MDSCacheObjectInfo i;
	  obj->set_object_info(i);
	  ceph_assert(i.ino);  // remote wrlocks are only taken on inodes
	  rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
				   mdr->reqid, mdr->attempt);
	}
      }
    }
  }

  // send the messages
  for (auto &p : rejoins) {
    ceph_assert(rejoin_sent.count(p.first) == 0);
    ceph_assert(rejoin_ack_gather.count(p.first) == 0);
    rejoin_sent.insert(p.first);
    rejoin_ack_gather.insert(p.first);
    mds->send_message_mds(p.second, p.first);
  }
  rejoin_ack_gather.insert(mds->get_nodeid());   // we need to complete rejoin_gather_finish, too
  rejoins_pending = false;

  // nothing?
  if (mds->is_rejoin() && rejoin_gather.empty()) {
    dout(10) << "nothing to rejoin" << dendl;
    rejoin_gather_finish();
  }
}
4175
4176
4177 /**
4178 * rejoin_walk - build rejoin declarations for a subtree
4179 *
4180 * @param dir subtree root
4181 * @param rejoin rejoin message
4182 *
4183 * from a rejoining node:
4184 * weak dirfrag
4185 * weak dentries (w/ connectivity)
4186 *
4187 * from a surviving node:
4188 * strong dirfrag
4189 * strong dentries (no connectivity!)
4190 * strong inodes
4191 */
void MDCache::rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin)
{
  dout(10) << "rejoin_walk " << *dir << dendl;

  std::vector<CDir*> nested;  // finish this dir, then do nested items

  if (mds->is_rejoin()) {
    // WEAK: declare the dirfrag and its (all primary, all dir) dentries.
    rejoin->add_weak_dirfrag(dir->dirfrag());
    for (auto &p : dir->items) {
      CDentry *dn = p.second;
      ceph_assert(dn->last == CEPH_NOSNAP);  // no snap dentries expected here
      CDentry::linkage_t *dnl = dn->get_linkage();
      dout(15) << " add_weak_primary_dentry " << *dn << dendl;
      ceph_assert(dnl->is_primary());
      CInode *in = dnl->get_inode();
      ceph_assert(dnl->get_inode()->is_dir());
      rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
      {
	// queue child dirfrags for the recursion at the bottom
	auto&& dirs = in->get_nested_dirfrags();
	nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
      }
      if (in->is_dirty_scattered()) {
	dout(10) << " sending scatterlock state on " << *in << dendl;
	rejoin->add_scatterlock_state(in);
      }
    }
  } else {
    // STRONG: declare dirfrag, dentries and inodes with nonces/lock states.
    dout(15) << " add_strong_dirfrag " << *dir << dendl;
    rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
    dir->state_set(CDir::STATE_REJOINING);

    // iterator is advanced before use because dentries may be removed below
    for (auto it = dir->items.begin(); it != dir->items.end(); ) {
      CDentry *dn = it->second;
      ++it;
      dn->state_set(CDentry::STATE_REJOINING);
      CDentry::linkage_t *dnl = dn->get_linkage();
      CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;

      // trim snap dentries. because they may have been pruned by
      // their auth mds (snap deleted)
      if (dn->last != CEPH_NOSNAP) {
	if (in && !in->remote_parents.empty()) {
	  // unlink any stale remote snap dentry.
	  for (auto it2 = in->remote_parents.begin(); it2 != in->remote_parents.end(); ) {
	    CDentry *remote_dn = *it2;
	    ++it2;
	    ceph_assert(remote_dn->last != CEPH_NOSNAP);
	    remote_dn->unlink_remote(remote_dn->get_linkage());
	  }
	}
	if (dn->lru_is_expireable()) {
	  // drop the expireable snap dentry (and its inode) entirely
	  if (!dnl->is_null())
	    dir->unlink_inode(dn, false);
	  if (in)
	    remove_inode(in);
	  dir->remove_dentry(dn);
	  continue;
	} else {
	  // Inventing null/remote dentry shouldn't cause problem
	  ceph_assert(!dnl->is_primary());
	}
      }

      dout(15) << " add_strong_dentry " << *dn << dendl;
      rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
				dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
				dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
				dnl->is_remote() ? dnl->get_remote_d_type():0,
				dn->get_replica_nonce(),
				dn->lock.get_state());
      dn->state_set(CDentry::STATE_REJOINING);
      if (dnl->is_primary()) {
	CInode *in = dnl->get_inode();
	dout(15) << " add_strong_inode " << *in << dendl;
	rejoin->add_strong_inode(in->vino(),
				 in->get_replica_nonce(),
				 in->get_caps_wanted(),
				 in->filelock.get_state(),
				 in->nestlock.get_state(),
				 in->dirfragtreelock.get_state());
	in->state_set(CInode::STATE_REJOINING);
	{
	  // queue child dirfrags for the recursion at the bottom
	  auto&& dirs = in->get_nested_dirfrags();
	  nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
	}
	if (in->is_dirty_scattered()) {
	  dout(10) << " sending scatterlock state on " << *in << dendl;
	  rejoin->add_scatterlock_state(in);
	}
      }
    }
  }

  // recurse into nested dirs
  for (const auto& dir : nested) {
    rejoin_walk(dir, rejoin);
  }
}
4292
4293
4294 /*
4295 * i got a rejoin.
4296 * - reply with the lockstate
4297 *
4298 * if i am active|stopping,
4299 * - remove source from replica list for everything not referenced here.
4300 */
4301 void MDCache::handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m)
4302 {
4303 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4304 << " (" << m->get_payload().length() << " bytes)"
4305 << dendl;
4306
4307 switch (m->op) {
4308 case MMDSCacheRejoin::OP_WEAK:
4309 handle_cache_rejoin_weak(m);
4310 break;
4311 case MMDSCacheRejoin::OP_STRONG:
4312 handle_cache_rejoin_strong(m);
4313 break;
4314 case MMDSCacheRejoin::OP_ACK:
4315 handle_cache_rejoin_ack(m);
4316 break;
4317
4318 default:
4319 ceph_abort();
4320 }
4321 }
4322
4323
4324 /*
4325 * handle_cache_rejoin_weak
4326 *
4327 * the sender
4328 * - is recovering from their journal.
4329 * - may have incorrect (out of date) inode contents
4330 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4331 *
4332 * if the sender didn't trim_non_auth(), they
4333 * - may have incorrect (out of date) dentry/inode linkage
4334 * - may have deleted/purged inodes
4335 * and i may have to go to disk to get accurate inode contents. yuck.
4336 */
4337 void MDCache::handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &weak)
4338 {
4339 mds_rank_t from = mds_rank_t(weak->get_source().num());
4340
4341 // possible response(s)
4342 ref_t<MMDSCacheRejoin> ack; // if survivor
4343 set<vinodeno_t> acked_inodes; // if survivor
4344 set<SimpleLock *> gather_locks; // if survivor
4345 bool survivor = false; // am i a survivor?
4346
4347 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
4348 survivor = true;
4349 dout(10) << "i am a surivivor, and will ack immediately" << dendl;
4350 ack = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
4351
4352 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
4353
4354 // check cap exports
4355 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4356 CInode *in = get_inode(p->first);
4357 ceph_assert(!in || in->is_auth());
4358 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4359 dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
4360 Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
4361 Capability::Import& im = imported_caps[p->first][q->first];
4362 if (cap) {
4363 im.cap_id = cap->get_cap_id();
4364 im.issue_seq = cap->get_last_seq();
4365 im.mseq = cap->get_mseq();
4366 } else {
4367 // all are zero
4368 }
4369 }
4370 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
4371 }
4372
4373 encode(imported_caps, ack->imported_caps);
4374 } else {
4375 ceph_assert(mds->is_rejoin());
4376
4377 // we may have already received a strong rejoin from the sender.
4378 rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
4379 ceph_assert(gather_locks.empty());
4380
4381 // check cap exports.
4382 rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
4383 rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(),
4384 weak->client_metadata_map.end());
4385
4386 for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
4387 CInode *in = get_inode(p->first);
4388 ceph_assert(!in || in->is_auth());
4389 // note
4390 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
4391 dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
4392 cap_imports[p->first][q->first][from] = q->second;
4393 }
4394 }
4395 }
4396
4397 // assimilate any potentially dirty scatterlock state
4398 for (const auto &p : weak->inode_scatterlocks) {
4399 CInode *in = get_inode(p.first);
4400 ceph_assert(in);
4401 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4402 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4403 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
4404 if (!survivor)
4405 rejoin_potential_updated_scatterlocks.insert(in);
4406 }
4407
4408 // recovering peer may send incorrect dirfrags here. we need to
4409 // infer which dirfrag they meant. the ack will include a
4410 // strong_dirfrag that will set them straight on the fragmentation.
4411
4412 // walk weak map
4413 set<CDir*> dirs_to_share;
4414 for (const auto &p : weak->weak_dirfrags) {
4415 CInode *diri = get_inode(p.ino);
4416 if (!diri)
4417 dout(0) << " missing dir ino " << p.ino << dendl;
4418 ceph_assert(diri);
4419
4420 frag_vec_t leaves;
4421 if (diri->dirfragtree.is_leaf(p.frag)) {
4422 leaves.push_back(p.frag);
4423 } else {
4424 diri->dirfragtree.get_leaves_under(p.frag, leaves);
4425 if (leaves.empty())
4426 leaves.push_back(diri->dirfragtree[p.frag.value()]);
4427 }
4428 for (const auto& leaf : leaves) {
4429 CDir *dir = diri->get_dirfrag(leaf);
4430 if (!dir) {
4431 dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl;
4432 continue;
4433 }
4434 ceph_assert(dir);
4435 if (dirs_to_share.count(dir)) {
4436 dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl;
4437 } else {
4438 dirs_to_share.insert(dir);
4439 unsigned nonce = dir->add_replica(from);
4440 dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl;
4441 if (ack) {
4442 ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
4443 ack->add_dirfrag_base(dir);
4444 }
4445 }
4446 }
4447 }
4448
4449 for (const auto &p : weak->weak) {
4450 CInode *diri = get_inode(p.first);
4451 if (!diri)
4452 dout(0) << " missing dir ino " << p.first << dendl;
4453 ceph_assert(diri);
4454
4455 // weak dentries
4456 CDir *dir = 0;
4457 for (const auto &q : p.second) {
4458 // locate proper dirfrag.
4459 // optimize for common case (one dirfrag) to avoid dirs_to_share set check
4460 frag_t fg = diri->pick_dirfrag(q.first.name);
4461 if (!dir || dir->get_frag() != fg) {
4462 dir = diri->get_dirfrag(fg);
4463 if (!dir)
4464 dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
4465 ceph_assert(dir);
4466 ceph_assert(dirs_to_share.count(dir));
4467 }
4468
4469 // and dentry
4470 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
4471 ceph_assert(dn);
4472 CDentry::linkage_t *dnl = dn->get_linkage();
4473 ceph_assert(dnl->is_primary());
4474
4475 if (survivor && dn->is_replica(from))
4476 dentry_remove_replica(dn, from, gather_locks);
4477 unsigned dnonce = dn->add_replica(from);
4478 dout(10) << " have " << *dn << dendl;
4479 if (ack)
4480 ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
4481 dnl->get_inode()->ino(), inodeno_t(0), 0,
4482 dnonce, dn->lock.get_replica_state());
4483
4484 // inode
4485 CInode *in = dnl->get_inode();
4486 ceph_assert(in);
4487
4488 if (survivor && in->is_replica(from))
4489 inode_remove_replica(in, from, true, gather_locks);
4490 unsigned inonce = in->add_replica(from);
4491 dout(10) << " have " << *in << dendl;
4492
4493 // scatter the dirlock, just in case?
4494 if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
4495 in->filelock.set_state(LOCK_MIX);
4496
4497 if (ack) {
4498 acked_inodes.insert(in->vino());
4499 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4500 bufferlist bl;
4501 in->_encode_locks_state_for_rejoin(bl, from);
4502 ack->add_inode_locks(in, inonce, bl);
4503 }
4504 }
4505 }
4506
4507 // weak base inodes? (root, stray, etc.)
4508 for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
4509 p != weak->weak_inodes.end();
4510 ++p) {
4511 CInode *in = get_inode(*p);
4512 ceph_assert(in); // hmm fixme wrt stray?
4513 if (survivor && in->is_replica(from))
4514 inode_remove_replica(in, from, true, gather_locks);
4515 unsigned inonce = in->add_replica(from);
4516 dout(10) << " have base " << *in << dendl;
4517
4518 if (ack) {
4519 acked_inodes.insert(in->vino());
4520 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4521 bufferlist bl;
4522 in->_encode_locks_state_for_rejoin(bl, from);
4523 ack->add_inode_locks(in, inonce, bl);
4524 }
4525 }
4526
4527 ceph_assert(rejoin_gather.count(from));
4528 rejoin_gather.erase(from);
4529 if (survivor) {
4530 // survivor. do everything now.
4531 for (const auto &p : weak->inode_scatterlocks) {
4532 CInode *in = get_inode(p.first);
4533 ceph_assert(in);
4534 dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
4535 acked_inodes.insert(in->vino());
4536 ack->add_inode_base(in, mds->mdsmap->get_up_features());
4537 }
4538
4539 rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
4540 mds->send_message(ack, weak->get_connection());
4541
4542 for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
4543 if (!(*p)->is_stable())
4544 mds->locker->eval_gather(*p);
4545 }
4546 } else {
4547 // done?
4548 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
4549 rejoin_gather_finish();
4550 } else {
4551 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4552 }
4553 }
4554 }
4555
4556 /*
4557 * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects
4558 *
4559 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4560 * ack, the replica dne, and we can remove it from our replica maps.
4561 */
4562 void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack,
4563 set<vinodeno_t>& acked_inodes,
4564 set<SimpleLock *>& gather_locks)
4565 {
4566 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4567
4568 auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
4569 // inode?
4570 if (in->is_auth() &&
4571 in->is_replica(from) &&
4572 (ack == NULL || acked_inodes.count(in->vino()) == 0)) {
4573 inode_remove_replica(in, from, false, gather_locks);
4574 dout(10) << " rem " << *in << dendl;
4575 }
4576
4577 if (!in->is_dir())
4578 return;
4579
4580 const auto&& dfs = in->get_dirfrags();
4581 for (const auto& dir : dfs) {
4582 if (!dir->is_auth())
4583 continue;
4584
4585 if (dir->is_replica(from) &&
4586 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4587 dir->remove_replica(from);
4588 dout(10) << " rem " << *dir << dendl;
4589 }
4590
4591 // dentries
4592 for (auto &p : dir->items) {
4593 CDentry *dn = p.second;
4594
4595 if (dn->is_replica(from)) {
4596 if (ack) {
4597 const auto it = ack->strong_dentries.find(dir->dirfrag());
4598 if (it != ack->strong_dentries.end() && it->second.count(string_snap_t(dn->get_name(), dn->last)) > 0) {
4599 continue;
4600 }
4601 }
4602 dentry_remove_replica(dn, from, gather_locks);
4603 dout(10) << " rem " << *dn << dendl;
4604 }
4605 }
4606 }
4607 };
4608
4609 for (auto &p : inode_map)
4610 scour_func(p.second);
4611 for (auto &p : snap_inode_map)
4612 scour_func(p.second);
4613 }
4614
4615
4616 CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4617 {
4618 CInode *in = new CInode(this, true, 1, last);
4619 in->inode.ino = ino;
4620 in->state_set(CInode::STATE_REJOINUNDEF);
4621 add_inode(in);
4622 rejoin_undef_inodes.insert(in);
4623 dout(10) << " invented " << *in << dendl;
4624 return in;
4625 }
4626
4627 CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4628 {
4629 CInode *in = get_inode(df.ino);
4630 if (!in)
4631 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4632 if (!in->is_dir()) {
4633 ceph_assert(in->state_test(CInode::STATE_REJOINUNDEF));
4634 in->inode.mode = S_IFDIR;
4635 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
4636 }
4637 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4638 dir->state_set(CDir::STATE_REJOINUNDEF);
4639 rejoin_undef_dirfrags.insert(dir);
4640 dout(10) << " invented " << *dir << dendl;
4641 return dir;
4642 }
4643
4644 void MDCache::handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &strong)
4645 {
4646 mds_rank_t from = mds_rank_t(strong->get_source().num());
4647
4648 // only a recovering node will get a strong rejoin.
4649 if (!mds->is_rejoin()) {
4650 if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
4651 mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
4652 return;
4653 }
4654 ceph_abort_msg("got unexpected rejoin message during recovery");
4655 }
4656
4657 // assimilate any potentially dirty scatterlock state
4658 for (const auto &p : strong->inode_scatterlocks) {
4659 CInode *in = get_inode(p.first);
4660 ceph_assert(in);
4661 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4662 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4663 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
4664 rejoin_potential_updated_scatterlocks.insert(in);
4665 }
4666
4667 rejoin_unlinked_inodes[from].clear();
4668
4669 // surviving peer may send incorrect dirfrag here (maybe they didn't
4670 // get the fragment notify, or maybe we rolled back?). we need to
4671 // infer the right frag and get them with the program. somehow.
4672 // we don't normally send ACK.. so we'll need to bundle this with
4673 // MISSING or something.
4674
4675 // strong dirfrags/dentries.
4676 // also process auth_pins, xlocks.
4677 for (const auto &p : strong->strong_dirfrags) {
4678 auto& dirfrag = p.first;
4679 CInode *diri = get_inode(dirfrag.ino);
4680 if (!diri)
4681 diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP);
4682 CDir *dir = diri->get_dirfrag(dirfrag.frag);
4683 bool refragged = false;
4684 if (dir) {
4685 dout(10) << " have " << *dir << dendl;
4686 } else {
4687 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4688 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4689 else if (diri->dirfragtree.is_leaf(dirfrag.frag))
4690 dir = rejoin_invent_dirfrag(dirfrag);
4691 }
4692 if (dir) {
4693 dir->add_replica(from, p.second.nonce);
4694 dir->dir_rep = p.second.dir_rep;
4695 } else {
4696 dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl;
4697 frag_vec_t leaves;
4698 diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves);
4699 if (leaves.empty())
4700 leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]);
4701 dout(10) << " maps to frag(s) " << leaves << dendl;
4702 for (const auto& leaf : leaves) {
4703 CDir *dir = diri->get_dirfrag(leaf);
4704 if (!dir)
4705 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf));
4706 else
4707 dout(10) << " have(approx) " << *dir << dendl;
4708 dir->add_replica(from, p.second.nonce);
4709 dir->dir_rep = p.second.dir_rep;
4710 }
4711 refragged = true;
4712 }
4713
4714 const auto it = strong->strong_dentries.find(dirfrag);
4715 if (it != strong->strong_dentries.end()) {
4716 const auto& dmap = it->second;
4717 for (const auto &q : dmap) {
4718 const string_snap_t& ss = q.first;
4719 const MMDSCacheRejoin::dn_strong& d = q.second;
4720 CDentry *dn;
4721 if (!refragged)
4722 dn = dir->lookup(ss.name, ss.snapid);
4723 else {
4724 frag_t fg = diri->pick_dirfrag(ss.name);
4725 dir = diri->get_dirfrag(fg);
4726 ceph_assert(dir);
4727 dn = dir->lookup(ss.name, ss.snapid);
4728 }
4729 if (!dn) {
4730 if (d.is_remote()) {
4731 dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, d.first, ss.snapid);
4732 } else if (d.is_null()) {
4733 dn = dir->add_null_dentry(ss.name, d.first, ss.snapid);
4734 } else {
4735 CInode *in = get_inode(d.ino, ss.snapid);
4736 if (!in) in = rejoin_invent_inode(d.ino, ss.snapid);
4737 dn = dir->add_primary_dentry(ss.name, in, d.first, ss.snapid);
4738 }
4739 dout(10) << " invented " << *dn << dendl;
4740 }
4741 CDentry::linkage_t *dnl = dn->get_linkage();
4742
4743 // dn auth_pin?
4744 const auto pinned_it = strong->authpinned_dentries.find(dirfrag);
4745 if (pinned_it != strong->authpinned_dentries.end()) {
4746 const auto slave_reqid_it = pinned_it->second.find(ss);
4747 if (slave_reqid_it != pinned_it->second.end()) {
4748 for (const auto &r : slave_reqid_it->second) {
4749 dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
4750
4751 // get/create slave mdrequest
4752 MDRequestRef mdr;
4753 if (have_request(r.reqid))
4754 mdr = request_get(r.reqid);
4755 else
4756 mdr = request_start_slave(r.reqid, r.attempt, strong);
4757 mdr->auth_pin(dn);
4758 }
4759 }
4760 }
4761
4762 // dn xlock?
4763 const auto xlocked_it = strong->xlocked_dentries.find(dirfrag);
4764 if (xlocked_it != strong->xlocked_dentries.end()) {
4765 const auto ss_req_it = xlocked_it->second.find(ss);
4766 if (ss_req_it != xlocked_it->second.end()) {
4767 const MMDSCacheRejoin::slave_reqid& r = ss_req_it->second;
4768 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4769 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4770 ceph_assert(mdr->is_auth_pinned(dn));
4771 if (!mdr->is_xlocked(&dn->versionlock)) {
4772 ceph_assert(dn->versionlock.can_xlock_local());
4773 dn->versionlock.get_xlock(mdr, mdr->get_client());
4774 mdr->emplace_lock(&dn->versionlock, MutationImpl::LockOp::XLOCK);
4775 }
4776 if (dn->lock.is_stable())
4777 dn->auth_pin(&dn->lock);
4778 dn->lock.set_state(LOCK_XLOCK);
4779 dn->lock.get_xlock(mdr, mdr->get_client());
4780 mdr->emplace_lock(&dn->lock, MutationImpl::LockOp::XLOCK);
4781 }
4782 }
4783
4784 dn->add_replica(from, d.nonce);
4785 dout(10) << " have " << *dn << dendl;
4786
4787 if (dnl->is_primary()) {
4788 if (d.is_primary()) {
4789 if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) {
4790 // the survivor missed MDentryUnlink+MDentryLink messages ?
4791 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4792 CInode *in = get_inode(d.ino, ss.snapid);
4793 ceph_assert(in);
4794 ceph_assert(in->get_parent_dn());
4795 rejoin_unlinked_inodes[from].insert(in);
4796 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4797 }
4798 } else {
4799 // the survivor missed MDentryLink message ?
4800 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4801 dout(7) << " sender doesn't have primay dentry" << dendl;
4802 }
4803 } else {
4804 if (d.is_primary()) {
4805 // the survivor missed MDentryUnlink message ?
4806 CInode *in = get_inode(d.ino, ss.snapid);
4807 ceph_assert(in);
4808 ceph_assert(in->get_parent_dn());
4809 rejoin_unlinked_inodes[from].insert(in);
4810 dout(7) << " sender has primary dentry but we don't" << dendl;
4811 }
4812 }
4813 }
4814 }
4815 }
4816
4817 for (const auto &p : strong->strong_inodes) {
4818 CInode *in = get_inode(p.first);
4819 ceph_assert(in);
4820 in->add_replica(from, p.second.nonce);
4821 dout(10) << " have " << *in << dendl;
4822
4823 const MMDSCacheRejoin::inode_strong& is = p.second;
4824
4825 // caps_wanted
4826 if (is.caps_wanted) {
4827 in->set_mds_caps_wanted(from, is.caps_wanted);
4828 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4829 << " on " << *in << dendl;
4830 }
4831
4832 // scatterlocks?
4833 // infer state from replica state:
4834 // * go to MIX if they might have wrlocks
4835 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4836 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4837 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4838 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4839
4840 // auth pin?
4841 const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino());
4842 if (authpinned_inodes_it != strong->authpinned_inodes.end()) {
4843 for (const auto& r : authpinned_inodes_it->second) {
4844 dout(10) << " inode authpin by " << r << " on " << *in << dendl;
4845
4846 // get/create slave mdrequest
4847 MDRequestRef mdr;
4848 if (have_request(r.reqid))
4849 mdr = request_get(r.reqid);
4850 else
4851 mdr = request_start_slave(r.reqid, r.attempt, strong);
4852 if (strong->frozen_authpin_inodes.count(in->vino())) {
4853 ceph_assert(!in->get_num_auth_pins());
4854 mdr->freeze_auth_pin(in);
4855 } else {
4856 ceph_assert(!in->is_frozen_auth_pin());
4857 }
4858 mdr->auth_pin(in);
4859 }
4860 }
4861 // xlock(s)?
4862 const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino());
4863 if (xlocked_inodes_it != strong->xlocked_inodes.end()) {
4864 for (const auto &q : xlocked_inodes_it->second) {
4865 SimpleLock *lock = in->get_lock(q.first);
4866 dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl;
4867 MDRequestRef mdr = request_get(q.second.reqid); // should have this from auth_pin above.
4868 ceph_assert(mdr->is_auth_pinned(in));
4869 if (!mdr->is_xlocked(&in->versionlock)) {
4870 ceph_assert(in->versionlock.can_xlock_local());
4871 in->versionlock.get_xlock(mdr, mdr->get_client());
4872 mdr->emplace_lock(&in->versionlock, MutationImpl::LockOp::XLOCK);
4873 }
4874 if (lock->is_stable())
4875 in->auth_pin(lock);
4876 lock->set_state(LOCK_XLOCK);
4877 if (lock == &in->filelock)
4878 in->loner_cap = -1;
4879 lock->get_xlock(mdr, mdr->get_client());
4880 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
4881 }
4882 }
4883 }
4884 // wrlock(s)?
4885 for (const auto &p : strong->wrlocked_inodes) {
4886 CInode *in = get_inode(p.first);
4887 for (const auto &q : p.second) {
4888 SimpleLock *lock = in->get_lock(q.first);
4889 for (const auto &r : q.second) {
4890 dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl;
4891 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4892 if (in->is_auth())
4893 ceph_assert(mdr->is_auth_pinned(in));
4894 lock->set_state(LOCK_MIX);
4895 if (lock == &in->filelock)
4896 in->loner_cap = -1;
4897 lock->get_wrlock(true);
4898 mdr->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
4899 }
4900 }
4901 }
4902
4903 // done?
4904 ceph_assert(rejoin_gather.count(from));
4905 rejoin_gather.erase(from);
4906 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
4907 rejoin_gather_finish();
4908 } else {
4909 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4910 }
4911 }
4912
// Handle an MMDSCacheRejoin ACK from rank 'from'.  The ack describes,
// for everything we replicate from the sender: the authoritative
// dirfrag layout, dentry linkage, replica nonces, lock states, full
// inode bases, and the disposition of caps the sender imported from
// us.  Once the last expected ack has arrived, advance the rejoin
// state machine (open snaprealms / wake waiters).
4913 void MDCache::handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &ack)
4914 {
4915 dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
4916 mds_rank_t from = mds_rank_t(ack->get_source().num());
4917
4918 ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN);
// if we are not in rejoin state ourselves, we are a survivor handling
// a recovering peer's ack
4919 bool survivor = !mds->is_rejoin();
4920
4921 // for sending cache expire message
4922 set<CInode*> isolated_inodes;
4923 set<CInode*> refragged_inodes;
4924 list<pair<CInode*,int> > updated_realms;
4925
4926 // dirs
4927 for (const auto &p : ack->strong_dirfrags) {
4928 // we may have had incorrect dir fragmentation; refragment based
4929 // on what they auth tells us.
4930 CDir *dir = get_dirfrag(p.first);
4931 if (!dir) {
4932 dir = get_force_dirfrag(p.first, false);
4933 if (dir)
4934 refragged_inodes.insert(dir->get_inode());
4935 }
4936 if (!dir) {
4937 CInode *diri = get_inode(p.first.ino);
4938 if (!diri) {
4939 // barebones inode; the full inode loop below will clean up.
4940 diri = new CInode(this, false);
4941 diri->inode.ino = p.first.ino;
4942 diri->inode.mode = S_IFDIR;
4943 diri->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
4944 add_inode(diri);
4945 if (MDS_INO_MDSDIR(from) == p.first.ino) {
4946 diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
4947 dout(10) << " add inode " << *diri << dendl;
4948 } else {
4949 diri->inode_auth = CDIR_AUTH_DEFAULT;
4950 isolated_inodes.insert(diri);
4951 dout(10) << " unconnected dirfrag " << p.first << dendl;
4952 }
4953 }
4954 // barebones dirfrag; the full dirfrag loop below will clean up.
4955 dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false));
4956 if (MDS_INO_MDSDIR(from) == p.first.ino ||
4957 (dir->authority() != CDIR_AUTH_UNDEF &&
4958 dir->authority().first != from))
4959 adjust_subtree_auth(dir, from);
4960 dout(10) << " add dirfrag " << *dir << dendl;
4961 }
4962
4963 dir->set_replica_nonce(p.second.nonce);
4964 dir->state_clear(CDir::STATE_REJOINING);
4965 dout(10) << " got " << *dir << dendl;
4966
4967 // dentries
4968 auto it = ack->strong_dentries.find(p.first);
4969 if (it != ack->strong_dentries.end()) {
4970 for (const auto &q : it->second) {
4971 CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
4972 if(!dn)
4973 dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid);
4974
4975 CDentry::linkage_t *dnl = dn->get_linkage();
4976
4977 ceph_assert(dn->last == q.first.snapid);
4978 if (dn->first != q.second.first) {
4979 dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl;
4980 dn->first = q.second.first;
4981 }
4982
4983 // may have bad linkage if we missed dentry link/unlink messages
4984 if (dnl->is_primary()) {
4985 CInode *in = dnl->get_inode();
4986 if (!q.second.is_primary() ||
4987 vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) {
4988 dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
4989 dir->unlink_inode(dn);
4990 }
4991 } else if (dnl->is_remote()) {
4992 if (!q.second.is_remote() ||
4993 q.second.remote_ino != dnl->get_remote_ino() ||
4994 q.second.remote_d_type != dnl->get_remote_d_type()) {
4995 dout(10) << " had bad linkage for " << *dn << dendl;
4996 dir->unlink_inode(dn);
4997 }
4998 } else {
4999 if (!q.second.is_null())
5000 dout(10) << " had bad linkage for " << *dn << dendl;
5001 }
5002
5003 // hmm, did we have the proper linkage here?
5004 if (dnl->is_null() && !q.second.is_null()) {
5005 if (q.second.is_remote()) {
5006 dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type);
5007 } else {
5008 CInode *in = get_inode(q.second.ino, q.first.snapid);
5009 if (!in) {
5010 // barebones inode; assume it's dir, the full inode loop below will clean up.
5011 in = new CInode(this, false, q.second.first, q.first.snapid);
5012 in->inode.ino = q.second.ino;
5013 in->inode.mode = S_IFDIR;
5014 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
5015 add_inode(in);
5016 dout(10) << " add inode " << *in << dendl;
5017 } else if (in->get_parent_dn()) {
5018 dout(10) << " had bad linkage for " << *(in->get_parent_dn())
5019 << ", unlinking " << *in << dendl;
5020 in->get_parent_dir()->unlink_inode(in->get_parent_dn());
5021 }
5022 dn->dir->link_primary_inode(dn, in);
5023 isolated_inodes.erase(in);
5024 }
5025 }
5026
5027 dn->set_replica_nonce(q.second.nonce);
5028 dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor);
5029 dn->state_clear(CDentry::STATE_REJOINING);
5030 dout(10) << " got " << *dn << dendl;
5031 }
5032 }
5033 }
5034
// close out replica dirfrags the auth no longer has after
// refragmentation; they must be empty by now
5035 for (const auto& in : refragged_inodes) {
5036 auto&& ls = in->get_nested_dirfrags();
5037 for (const auto& dir : ls) {
5038 if (dir->is_auth() || ack->strong_dirfrags.count(dir->dirfrag()))
5039 continue;
5040 ceph_assert(dir->get_num_any() == 0);
5041 in->close_dirfrag(dir->get_frag());
5042 }
5043 }
5044
5045 // full dirfrags
5046 for (const auto &p : ack->dirfrag_bases) {
5047 CDir *dir = get_dirfrag(p.first);
5048 ceph_assert(dir);
5049 auto q = p.second.cbegin();
5050 dir->_decode_base(q);
5051 dout(10) << " got dir replica " << *dir << dendl;
5052 }
5053
5054 // full inodes
5055 auto p = ack->inode_base.cbegin();
5056 while (!p.end()) {
5057 inodeno_t ino;
5058 snapid_t last;
5059 bufferlist basebl;
5060 decode(ino, p);
5061 decode(last, p);
5062 decode(basebl, p);
5063 CInode *in = get_inode(ino, last);
5064 ceph_assert(in);
5065 auto q = basebl.cbegin();
5066 snapid_t sseq = 0;
5067 if (in->snaprealm)
5068 sseq = in->snaprealm->srnode.seq;
5069 in->_decode_base(q);
// if decoding the base bumped the snaprealm's seq, remember to
// propagate the realm change below (UPDATE if the realm already
// existed here, SPLIT if it is new to us)
5070 if (in->snaprealm && in->snaprealm->srnode.seq != sseq) {
5071 int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
5072 updated_realms.push_back(pair<CInode*,int>(in, snap_op));
5073 }
5074 dout(10) << " got inode base " << *in << dendl;
5075 }
5076
5077 // inodes
5078 p = ack->inode_locks.cbegin();
5079 //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
5080 while (!p.end()) {
5081 inodeno_t ino;
5082 snapid_t last;
5083 __u32 nonce;
5084 bufferlist lockbl;
5085 decode(ino, p);
5086 decode(last, p);
5087 decode(nonce, p);
5088 decode(lockbl, p);
5089
5090 CInode *in = get_inode(ino, last);
5091 ceph_assert(in);
5092 in->set_replica_nonce(nonce);
5093 auto q = lockbl.cbegin();
5094 in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
5095 in->state_clear(CInode::STATE_REJOINING);
5096 dout(10) << " got inode locks " << *in << dendl;
5097 }
5098
5099 // FIXME: This can happen if entire subtree, together with the inode subtree root
5100 // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
5101 ceph_assert(isolated_inodes.empty());
5102
// caps the sender imported from us: tell each client its cap has been
// exported to 'from', then drop our export bookkeeping
5103 map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
5104 auto bp = ack->imported_caps.cbegin();
5105 decode(peer_imported, bp);
5106
5107 for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
5108 p != peer_imported.end();
5109 ++p) {
5110 auto& ex = cap_exports.at(p->first);
5111 ceph_assert(ex.first == from);
5112 for (map<client_t,Capability::Import>::iterator q = p->second.begin();
5113 q != p->second.end();
5114 ++q) {
5115 auto r = ex.second.find(q->first);
5116 ceph_assert(r != ex.second.end());
5117
5118 dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
5119 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5120 if (!session) {
5121 dout(10) << " no session for client." << p->first << dendl;
5122 ex.second.erase(r);
5123 continue;
5124 }
5125
5126 // mark client caps stale.
5127 auto m = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first, 0,
5128 r->second.capinfo.cap_id, 0,
5129 mds->get_osd_epoch_barrier());
5130 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
5131 (q->second.cap_id > 0 ? from : -1), 0);
5132 mds->send_message_client_counted(m, session);
5133
5134 ex.second.erase(r);
5135 }
5136 ceph_assert(ex.second.empty());
5137 }
5138
// propagate the snaprealm changes noticed while decoding inode bases;
// a recovering MDS defers client notification until snaprealms open
5139 for (auto p : updated_realms) {
5140 CInode *in = p.first;
5141 bool notify_clients;
5142 if (mds->is_rejoin()) {
5143 if (!rejoin_pending_snaprealms.count(in)) {
5144 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5145 rejoin_pending_snaprealms.insert(in);
5146 }
5147 notify_clients = false;
5148 } else {
5149 // notify clients if I'm survivor
5150 notify_clients = true;
5151 }
5152 do_realm_invalidate_and_update_notify(in, p.second, notify_clients);
5153 }
5154
5155 // done?
5156 ceph_assert(rejoin_ack_gather.count(from));
5157 rejoin_ack_gather.erase(from);
5158 if (!survivor) {
5159 if (rejoin_gather.empty()) {
5160 // eval unstable scatter locks after all wrlocks are rejoined.
5161 while (!rejoin_eval_locks.empty()) {
5162 SimpleLock *lock = rejoin_eval_locks.front();
5163 rejoin_eval_locks.pop_front();
5164 if (!lock->is_stable())
5165 mds->locker->eval_gather(lock);
5166 }
5167 }
5168
5169 if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
5170 rejoin_ack_gather.empty()) {
5171 // finally, kickstart past snap parent opens
5172 open_snaprealms();
5173 } else {
5174 dout(7) << "still need rejoin from (" << rejoin_gather << ")"
5175 << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
5176 }
5177 } else {
5178 // survivor.
5179 mds->queue_waiters(rejoin_waiters);
5180 }
5181 }
5182
5183 /**
5184 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5185 *
5186 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5187 * messages that clean these guys up...
5188 */
5189 void MDCache::rejoin_trim_undef_inodes()
5190 {
5191 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5192
5193 while (!rejoin_undef_inodes.empty()) {
5194 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5195 CInode *in = *p;
5196 rejoin_undef_inodes.erase(p);
5197
5198 in->clear_replica_map();
5199
5200 // close out dirfrags
5201 if (in->is_dir()) {
5202 const auto&& dfls = in->get_dirfrags();
5203 for (const auto& dir : dfls) {
5204 dir->clear_replica_map();
5205
5206 for (auto &p : dir->items) {
5207 CDentry *dn = p.second;
5208 dn->clear_replica_map();
5209
5210 dout(10) << " trimming " << *dn << dendl;
5211 dir->remove_dentry(dn);
5212 }
5213
5214 dout(10) << " trimming " << *dir << dendl;
5215 in->close_dirfrag(dir->dirfrag().frag);
5216 }
5217 }
5218
5219 CDentry *dn = in->get_parent_dn();
5220 if (dn) {
5221 dn->clear_replica_map();
5222 dout(10) << " trimming " << *dn << dendl;
5223 dn->dir->remove_dentry(dn);
5224 } else {
5225 dout(10) << " trimming " << *in << dendl;
5226 remove_inode(in);
5227 }
5228 }
5229
5230 ceph_assert(rejoin_undef_inodes.empty());
5231 }
5232
// Post-gather stage of rejoin (recovering MDS only): resolve undef
// inodes/dirfrags and imported caps -- either step may return early
// and re-invoke us when its asynchronous work completes -- then pick
// lock states from the reconnected caps, queue file recovery, and send
// rejoin acks to our peers.
5233 void MDCache::rejoin_gather_finish()
5234 {
5235 dout(10) << "rejoin_gather_finish" << dendl;
5236 ceph_assert(mds->is_rejoin());
5237 ceph_assert(rejoin_ack_gather.count(mds->get_nodeid()));
5238
// true while undef inodes/dirfrags are still being fetched; we will
// be called again once that finishes
5239 if (open_undef_inodes_dirfrags())
5240 return;
5241
// true while cap-import processing still has asynchronous work pending
5242 if (process_imported_caps())
5243 return;
5244
5245 choose_lock_states_and_reconnect_caps();
5246
5247 identify_files_to_recover();
5248 rejoin_send_acks();
5249
5250 // signal completion of fetches, rejoin_gather_finish, etc.
5251 rejoin_ack_gather.erase(mds->get_nodeid());
5252
5253 // did we already get our acks too?
5254 if (rejoin_ack_gather.empty()) {
5255 // finally, open snaprealms
5256 open_snaprealms();
5257 }
5258 }
5259
5260 class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5261 inodeno_t ino;
5262 public:
5263 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5264 void finish(int r) override {
5265 mdcache->rejoin_open_ino_finish(ino, r);
5266 }
5267 };
5268
// Completion callback for open_ino() issued from process_imported_caps().
//   ret < 0         : the ino could not be located -> remember it missing.
//   ret == our rank : the inode must now be in our cache.
//   otherwise       : the ino is auth on rank 'ret' -> hand the
//                     reconnected client caps over to that rank.
// When the last outstanding open finishes, resume the rejoin pipeline.
5269 void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5270 {
5271 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5272
5273 if (ret < 0) {
5274 cap_imports_missing.insert(ino);
5275 } else if (ret == mds->get_nodeid()) {
5276 ceph_assert(get_inode(ino));
5277 } else {
5278 auto p = cap_imports.find(ino);
5279 ceph_assert(p != cap_imports.end());
// at this point each client entry must only carry the placeholder
// MDS_RANK_NONE source record
5280 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5281 ceph_assert(q->second.count(MDS_RANK_NONE));
5282 ceph_assert(q->second.size() == 1);
5283 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5284 }
5285 cap_imports.erase(p);
5286 }
5287
5288 ceph_assert(cap_imports_num_opening > 0);
5289 cap_imports_num_opening--;
5290
// last outstanding open_ino: continue with the next rejoin step
5291 if (cap_imports_num_opening == 0) {
5292 if (rejoin_gather.empty())
5293 rejoin_gather_finish();
5294 else if (rejoin_gather.count(mds->get_nodeid()))
5295 process_imported_caps();
5296 }
5297 }
5298
5299 class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
5300 public:
5301 map<client_t,pair<Session*,uint64_t> > session_map;
5302 C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
5303 void finish(int r) override {
5304 ceph_assert(r == 0);
5305 mdcache->rejoin_open_sessions_finish(session_map);
5306 }
5307 };
5308
// Invoked after the ESessions entry for force-opened client sessions
// has been journaled: finalize the sessions, stash them in
// rejoin_session_map for cap processing, and resume
// rejoin_gather_finish() if all rejoin messages have already arrived.
5309 void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
5310 {
5311 dout(10) << "rejoin_open_sessions_finish" << dendl;
5312 mds->server->finish_force_open_sessions(session_map);
5313 rejoin_session_map.swap(session_map);
5314 if (rejoin_gather.empty())
5315 rejoin_gather_finish();
5316 }
5317
// Open-file-table prefetch result for 'ino'.  Mirrors the logic of
// rejoin_open_ino_finish(), but only acts if we still have pending cap
// imports for this inode, and does not touch the
// cap_imports_num_opening counter (prefetches are not counted there).
5318 void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
5319 {
5320 auto p = cap_imports.find(ino);
5321 if (p != cap_imports.end()) {
5322 dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
5323 if (ret < 0) {
// lookup failed everywhere: record as missing
5324 cap_imports_missing.insert(ino);
5325 } else if (ret != mds->get_nodeid()) {
// auth is another rank: export the reconnected caps to it
5326 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5327 ceph_assert(q->second.count(MDS_RANK_NONE));
5328 ceph_assert(q->second.size() == 1);
5329 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5330 }
5331 cap_imports.erase(p);
5332 }
5333 }
5334 }
5335
// Reconnect the client caps gathered during rejoin.  Returns true if
// asynchronous work is pending (open file table prefetch, open_ino
// lookups, or journaling force-opened sessions) -- in that case this
// function will be re-invoked by the corresponding completion -- and
// false once cap processing for this pass is complete.
5336 bool MDCache::process_imported_caps()
5337 {
5338 dout(10) << "process_imported_caps" << dendl;
5339
// wait for the open file table prefetch before anything else; the
// completion re-enters this function
5340 if (!open_file_table.is_prefetched() &&
5341 open_file_table.prefetch_inodes()) {
5342 open_file_table.wait_for_prefetch(
5343 new MDSInternalContextWrapper(mds,
5344 new LambdaContext([this](int r) {
5345 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
5346 process_imported_caps();
5347 })
5348 )
5349 );
5350 return true;
5351 }
5352
// locate the inode for every pending cap import; kick off open_ino
// for the ones not in cache (unless already known missing)
5353 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5354 CInode *in = get_inode(p->first);
5355 if (in) {
5356 ceph_assert(in->is_auth());
5357 cap_imports_missing.erase(p->first);
5358 continue;
5359 }
5360 if (cap_imports_missing.count(p->first) > 0)
5361 continue;
5362
5363 cap_imports_num_opening++;
5364 dout(10) << " opening missing ino " << p->first << dendl;
5365 open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
5366 if (!(cap_imports_num_opening % 1000))
5367 mds->heartbeat_reset();
5368 }
5369
// outstanding open_ino lookups will re-enter via rejoin_open_ino_finish()
5370 if (cap_imports_num_opening > 0)
5371 return true;
5372
5373 // called by rejoin_gather_finish() ?
5374 if (rejoin_gather.count(mds->get_nodeid()) == 0) {
// force-open sessions for reconnecting clients first; the journaled
// ESessions completion re-enters this function
5375 if (!rejoin_client_map.empty() &&
5376 rejoin_session_map.empty()) {
5377 C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
5378 version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
5379 rejoin_client_metadata_map,
5380 finish->session_map);
5381 ESessions *le = new ESessions(pv, std::move(rejoin_client_map),
5382 std::move(rejoin_client_metadata_map));
5383 mds->mdlog->start_submit_entry(le, finish);
5384 mds->mdlog->flush();
5385 rejoin_client_map.clear();
5386 rejoin_client_metadata_map.clear();
5387 return true;
5388 }
5389
5390 // process caps that were exported by slave rename
5391 for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
5392 p != rejoin_slave_exports.end();
5393 ++p) {
5394 CInode *in = get_inode(p->first);
5395 ceph_assert(in);
5396 for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
5397 q != p->second.second.end();
5398 ++q) {
5399 auto r = rejoin_session_map.find(q->first);
5400 if (r == rejoin_session_map.end())
5401 continue;
5402
5403 Session *session = r->second.first;
5404 Capability *cap = in->get_client_cap(q->first);
5405 if (!cap) {
5406 cap = in->add_client_cap(q->first, session);
5407 // add empty item to reconnected_caps
5408 (void)reconnected_caps[p->first][q->first];
5409 }
5410 cap->merge(q->second, true);
5411
5412 Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
5413 ceph_assert(cap->get_last_seq() == im.issue_seq);
5414 ceph_assert(cap->get_mseq() == im.mseq);
5415 cap->set_cap_id(im.cap_id);
5416 // send cap import because we assigned a new cap ID
5417 do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
5418 p->second.first, CEPH_CAP_FLAG_AUTH);
5419 }
5420 }
5421 rejoin_slave_exports.clear();
5422 rejoin_imported_caps.clear();
5423
5424 // process cap imports
5425 // ino -> client -> frommds -> capex
5426 for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
5427 CInode *in = get_inode(p->first);
5428 if (!in) {
5429 dout(10) << " still missing ino " << p->first
5430 << ", will try again after replayed client requests" << dendl;
5431 ++p;
5432 continue;
5433 }
5434 ceph_assert(in->is_auth());
5435 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5436 Session *session;
5437 {
5438 auto r = rejoin_session_map.find(q->first);
5439 session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
5440 }
5441
5442 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
// no session: still record (zeroed) import state for the source
// rank so it can be acked
5443 if (!session) {
5444 if (r->first >= 0)
5445 (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
5446 continue;
5447 }
5448
5449 Capability *cap = in->reconnect_cap(q->first, r->second, session);
5450 add_reconnected_cap(q->first, in->ino(), r->second);
5451 if (r->first >= 0) {
5452 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5453 cap->inc_mseq();
5454 do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);
5455
5456 Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
5457 im.cap_id = cap->get_cap_id();
5458 im.issue_seq = cap->get_last_seq();
5459 im.mseq = cap->get_mseq();
5460 }
5461 }
5462 }
5463 cap_imports.erase(p++); // remove and move on
5464 }
5465 } else {
// still in the gather phase (not called from rejoin_gather_finish()):
// drop non-auth cache and mark our own rejoin as complete
5466 trim_non_auth();
5467
5468 ceph_assert(rejoin_gather.count(mds->get_nodeid()));
5469 rejoin_gather.erase(mds->get_nodeid());
5470 ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid()));
5471 maybe_send_pending_rejoins();
5472 }
5473 return false;
5474 }
5475
// Reinstate "need snapflush" bookkeeping for a client that reconnected
// a cap whose flush of snapped data (snapids after 'snap_follows')
// never completed before the failover: register each affected snapped
// inode on the head and put the cap-related locks into flushing state.
5476 void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
5477 client_t client, snapid_t snap_follows)
5478 {
5479 dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;
5480
// nothing to do unless a snapshot exists in the unflushed interval
5481 if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1))
5482 return;
5483
5484 const set<snapid_t>& snaps = realm->get_snaps();
5485 snapid_t follows = snap_follows;
5486
// walk the chain of snapped inodes from 'follows' up to the head
5487 while (true) {
5488 CInode *in = pick_inode_snap(head_in, follows);
5489 if (in == head_in)
5490 break;
5491
// register a pending snapflush for every realm snapid that falls
// inside this snapped inode's [first, last] range
5492 bool need_snapflush = false;
5493 for (auto p = snaps.lower_bound(std::max<snapid_t>(in->first, (follows + 1)));
5494 p != snaps.end() && *p <= in->last;
5495 ++p) {
5496 head_in->add_need_snapflush(in, *p, client);
5497 need_snapflush = true;
5498 }
5499 follows = in->last;
5500 if (!need_snapflush)
5501 continue;
5502
5503 dout(10) << " need snapflush from client." << client << " on " << *in << dendl;
5504
// first client needing a snapflush on this inode: wrlock the
// cap-related locks so they sit in LOCK_SNAP_SYNC until the flush
5505 if (in->client_snap_caps.empty()) {
5506 for (int i = 0; i < num_cinode_locks; i++) {
5507 int lockid = cinode_lock_info[i].lock;
5508 SimpleLock *lock = in->get_lock(lockid);
5509 ceph_assert(lock);
5510 in->auth_pin(lock);
5511 lock->set_state(LOCK_SNAP_SYNC);
5512 lock->get_wrlock(true);
5513 }
5514 }
5515 in->client_snap_caps.insert(client);
5516 mds->locker->mark_need_snapflush_inode(in);
5517 }
5518 }
5519
5520 /*
5521 * choose lock states based on reconnected caps
5522 */
5523 void MDCache::choose_lock_states_and_reconnect_caps()
5524 {
5525 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5526
5527 int count = 0;
5528 for (auto p : inode_map) {
5529 CInode *in = p.second;
5530 if (in->last != CEPH_NOSNAP)
5531 continue;
5532
5533 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5534 in->mark_dirty_rstat();
5535
5536 int dirty_caps = 0;
5537 auto q = reconnected_caps.find(in->ino());
5538 if (q != reconnected_caps.end()) {
5539 for (const auto &it : q->second)
5540 dirty_caps |= it.second.dirty_caps;
5541 }
5542 in->choose_lock_states(dirty_caps);
5543 dout(15) << " chose lock states on " << *in << dendl;
5544
5545 if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) {
5546 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5547 rejoin_pending_snaprealms.insert(in);
5548 }
5549
5550 if (!(++count % 1000))
5551 mds->heartbeat_reset();
5552 }
5553 }
5554
5555 void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5556 map<client_t,ref_t<MClientSnap>>& splits)
5557 {
5558 ref_t<MClientSnap> snap;
5559 auto it = splits.find(client);
5560 if (it != splits.end()) {
5561 snap = it->second;
5562 snap->head.op = CEPH_SNAP_OP_SPLIT;
5563 } else {
5564 snap = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
5565 splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
5566 snap->head.split = realm->inode->ino();
5567 snap->bl = realm->get_snap_trace();
5568
5569 for (const auto& child : realm->open_children)
5570 snap->split_realms.push_back(child->inode->ino());
5571 }
5572 snap->split_inos.push_back(ino);
5573 }
5574
// Build per-client CEPH_SNAP_OP_SPLIT notifications for merging
// 'realm' into 'parent_realm': every capped inode in the realm and
// every open child realm is reported against the parent's split point.
5575 void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm,
5576 map<client_t,ref_t<MClientSnap>>& splits)
5577 {
5578 ceph_assert(parent_realm);
5579
// collected once, then copied into each client's message below
5580 vector<inodeno_t> split_inos;
5581 vector<inodeno_t> split_realms;
5582
5583 for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
5584 !p.end();
5585 ++p)
5586 split_inos.push_back((*p)->ino());
5587 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5588 p != realm->open_children.end();
5589 ++p)
5590 split_realms.push_back((*p)->inode->ino());
5591
// one message per client holding caps in this realm; only build it if
// one wasn't already queued for that client
5592 for (const auto& p : realm->client_caps) {
5593 ceph_assert(!p.second->empty());
5594 auto em = splits.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple());
5595 if (em.second) {
5596 auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
5597 update->head.split = parent_realm->inode->ino();
5598 update->split_inos = split_inos;
5599 update->split_realms = split_realms;
5600 update->bl = parent_realm->get_snap_trace();
5601 em.first->second = std::move(update);
5602 }
5603 }
5604 }
5605
5606 void MDCache::send_snaps(map<client_t,ref_t<MClientSnap>>& splits)
5607 {
5608 dout(10) << "send_snaps" << dendl;
5609
5610 for (auto &p : splits) {
5611 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v));
5612 if (session) {
5613 dout(10) << " client." << p.first
5614 << " split " << p.second->head.split
5615 << " inos " << p.second->split_inos
5616 << dendl;
5617 mds->send_message_client_counted(p.second, session);
5618 } else {
5619 dout(10) << " no session for client." << p.first << dendl;
5620 }
5621 }
5622 splits.clear();
5623 }
5624
5625
5626 /*
5627 * remove any items from logsegment open_file lists that don't have
5628 * any caps
5629 */
5630 void MDCache::clean_open_file_lists()
5631 {
5632 dout(10) << "clean_open_file_lists" << dendl;
5633
5634 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5635 p != mds->mdlog->segments.end();
5636 ++p) {
5637 LogSegment *ls = p->second;
5638
5639 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5640 while (!q.end()) {
5641 CInode *in = *q;
5642 ++q;
5643 if (in->last == CEPH_NOSNAP) {
5644 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5645 in->item_open_file.remove_myself();
5646 } else {
5647 if (in->client_snap_caps.empty()) {
5648 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5649 in->item_open_file.remove_myself();
5650 }
5651 }
5652 }
5653 }
5654 }
5655
5656 void MDCache::dump_openfiles(Formatter *f)
5657 {
5658 f->open_array_section("openfiles");
5659 for (auto p = mds->mdlog->segments.begin();
5660 p != mds->mdlog->segments.end();
5661 ++p) {
5662 LogSegment *ls = p->second;
5663
5664 auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
5665 while (!q.end()) {
5666 CInode *in = *q;
5667 ++q;
5668 if ((in->last == CEPH_NOSNAP && !in->is_any_caps_wanted())
5669 || (in->last != CEPH_NOSNAP && in->client_snap_caps.empty()))
5670 continue;
5671 f->open_object_section("file");
5672 in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS);
5673 f->close_section();
5674 }
5675 }
5676 f->close_section();
5677 }
5678
// Re-establish a client's cap on 'in' from reconnect data during
// rejoin.  Returns the cap, or NULL if the client no longer has a
// session.  If the cap previously lived on another MDS (frommds >= 0),
// also send the client a cap IMPORT so it learns the cap moved here.
5679 Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
5680 {
5681 dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
5682 << " on " << *in << dendl;
5683 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
5684 if (!session) {
5685 dout(10) << " no session for client." << client << dendl;
5686 return NULL;
5687 }
5688
5689 Capability *cap = in->reconnect_cap(client, icr, session);
5690
5691 if (frommds >= 0) {
5692 if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
5693 cap->inc_mseq();
5694 do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
5695 }
5696
5697 return cap;
5698 }
5699
// Give up on cap imports whose inodes never turned up: tell each
// client its cap is gone (an EXPORT with peer -1), wake anyone waiting
// on a cap reconnect, and emit a cluster-log warning listing the
// missing inodes.
5700 void MDCache::export_remaining_imported_caps()
5701 {
5702 dout(10) << "export_remaining_imported_caps" << dendl;
5703
// accumulates one " ino ..." line per unresolved inode for the warning
5704 stringstream warn_str;
5705
5706 int count = 0;
5707 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5708 warn_str << " ino " << p->first << "\n";
5709 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5710 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5711 if (session) {
5712 // mark client caps stale.
5713 auto stale = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first,
5714 0, 0, 0,
5715 mds->get_osd_epoch_barrier());
// peer -1: the cap has no new home
5716 stale->set_cap_peer(0, 0, 0, -1, 0);
5717 mds->send_message_client_counted(stale, q->first);
5718 }
5719 }
5720
// potentially a long walk; keep the MDS heartbeat alive
5721 if (!(++count % 1000))
5722 mds->heartbeat_reset();
5723 }
5724
// nothing further to wait for: wake all cap-reconnect waiters
5725 for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin();
5726 p != cap_reconnect_waiters.end();
5727 ++p)
5728 mds->queue_waiters(p->second);
5729
5730 cap_imports.clear();
5731 cap_reconnect_waiters.clear();
5732
// peek() != EOF means at least one inode line was written above
5733 if (warn_str.peek() != EOF) {
5734 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5735 mds->clog->warn(warn_str);
5736 }
5737 }
5738
// If a replay-time cap reconnect was recorded for (in, client), apply
// it: reinstate the cap, adjust the inode's lock states (or poke the
// locker if the inode is replicated), and wake any waiters blocked on
// this reconnect.  Returns the reconnected cap, or nullptr if no
// reconnect was recorded.
5739 Capability* MDCache::try_reconnect_cap(CInode *in, Session *session)
5740 {
5741 client_t client = session->info.get_client();
5742 Capability *cap = nullptr;
5743 const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
5744 if (rc) {
5745 cap = in->reconnect_cap(client, *rc, session);
5746 dout(10) << "try_reconnect_cap client." << client
5747 << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
5748 << " issue " << ccap_string(rc->capinfo.issued)
5749 << " on " << *in << dendl;
// consume the recorded reconnect so it is applied only once
5750 remove_replay_cap_reconnect(in->ino(), client);
5751
5752 if (in->is_replicated()) {
5753 mds->locker->try_eval(in, CEPH_CAP_LOCKS);
5754 } else {
// unreplicated: choose lock states directly from the caps this
// client reported dirty
5755 int dirty_caps = 0;
5756 auto p = reconnected_caps.find(in->ino());
5757 if (p != reconnected_caps.end()) {
5758 auto q = p->second.find(client);
5759 if (q != p->second.end())
5760 dirty_caps = q->second.dirty_caps;
5761 }
5762 in->choose_lock_states(dirty_caps);
5763 dout(15) << " chose lock states on " << *in << dendl;
5764 }
5765
// wake anything that was waiting for this inode's cap reconnect
5766 map<inodeno_t, MDSContext::vec >::iterator it =
5767 cap_reconnect_waiters.find(in->ino());
5768 if (it != cap_reconnect_waiters.end()) {
5769 mds->queue_waiters(it->second);
5770 cap_reconnect_waiters.erase(it);
5771 }
5772 }
5773 return cap;
5774 }
5775
5776
5777
5778 // -------
5779 // cap imports and delayed snap parent opens
5780
/*
 * Send an MClientCaps IMPORT message to `session`, telling the client that
 * cap `cap` on inode `in` has been imported by this MDS (e.g. after subtree
 * migration).  p_cap_id / p_seq / p_mseq / peer / p_flags describe the cap
 * on the exporting peer so the client can match up the old and new caps.
 *
 * Precondition (enforced by ceph_abort below): the inode's snaprealm must
 * already have its past parents open so a valid snap trace can be attached.
 */
void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
			    uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
			    int peer, int p_flags)
{
  SnapRealm *realm = in->find_snaprealm();
  if (realm->have_past_parents_open()) {
    dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
    if (cap->get_last_seq() == 0) // reconnected cap
      cap->inc_last_seq();
    cap->set_last_issue();
    cap->set_last_issue_stamp(ceph_clock_now());
    cap->clear_new();
    auto reap = make_message<MClientCaps>(
      CEPH_CAP_OP_IMPORT, in->ino(), realm->inode->ino(), cap->get_cap_id(),
      cap->get_last_seq(), cap->pending(), cap->wanted(), 0, cap->get_mseq(),
      mds->get_osd_epoch_barrier());
    in->encode_cap_message(reap, cap);
    // include the snap trace so the client can attach the cap to the realm
    reap->snapbl = realm->get_snap_trace();
    reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
    mds->send_message_client_counted(reap, session);
  } else {
    // callers must only import caps once past snap parents are open
    ceph_abort();
  }
}
5805
5806 void MDCache::do_delayed_cap_imports()
5807 {
5808 dout(10) << "do_delayed_cap_imports" << dendl;
5809
5810 ceph_assert(delayed_imported_caps.empty());
5811 }
5812
// Retry context: re-enters open_snaprealms() once pending snaprealm
// parents have finished being fetched.
struct C_MDC_OpenSnapRealms : public MDCacheContext {
  explicit C_MDC_OpenSnapRealms(MDCache *c) : MDCacheContext(c) {}
  void finish(int r) override {
    mdcache->open_snaprealms();
  }
};
5819
5820 void MDCache::open_snaprealms()
5821 {
5822 dout(10) << "open_snaprealms" << dendl;
5823
5824 MDSGatherBuilder gather(g_ceph_context);
5825
5826 auto it = rejoin_pending_snaprealms.begin();
5827 while (it != rejoin_pending_snaprealms.end()) {
5828 CInode *in = *it;
5829 SnapRealm *realm = in->snaprealm;
5830 ceph_assert(realm);
5831 if (realm->have_past_parents_open() ||
5832 realm->open_parents(gather.new_sub())) {
5833 dout(10) << " past parents now open on " << *in << dendl;
5834
5835 map<client_t,ref_t<MClientSnap>> splits;
5836 // finish off client snaprealm reconnects?
5837 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5838 if (q != reconnected_snaprealms.end()) {
5839 for (const auto& r : q->second)
5840 finish_snaprealm_reconnect(r.first, realm, r.second, splits);
5841 reconnected_snaprealms.erase(q);
5842 }
5843
5844 for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
5845 !p.end(); ++p) {
5846 CInode *child = *p;
5847 auto q = reconnected_caps.find(child->ino());
5848 ceph_assert(q != reconnected_caps.end());
5849 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5850 Capability *cap = child->get_client_cap(r->first);
5851 if (!cap)
5852 continue;
5853 if (r->second.snap_follows > 0) {
5854 if (r->second.snap_follows < child->first - 1) {
5855 rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
5856 } else if (r->second.snapflush) {
5857 // When processing a cap flush message that is re-sent, it's possble
5858 // that the sender has already released all WR caps. So we should
5859 // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
5860 cap->mark_needsnapflush();
5861 }
5862 }
5863 // make sure client's cap is in the correct snaprealm.
5864 if (r->second.realm_ino != in->ino()) {
5865 prepare_realm_split(realm, r->first, child->ino(), splits);
5866 }
5867 }
5868 }
5869
5870 rejoin_pending_snaprealms.erase(it++);
5871 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5872
5873 send_snaps(splits);
5874 } else {
5875 dout(10) << " opening past parents on " << *in << dendl;
5876 ++it;
5877 }
5878 }
5879
5880 if (gather.has_subs()) {
5881 if (gather.num_subs_remaining() == 0) {
5882 // cleanup gather
5883 gather.set_finisher(new C_MDSInternalNoop);
5884 gather.activate();
5885 } else {
5886 // for multimds, must succeed the first time
5887 ceph_assert(recovery_set.empty());
5888
5889 dout(10) << "open_snaprealms - waiting for "
5890 << gather.num_subs_remaining() << dendl;
5891 gather.set_finisher(new C_MDC_OpenSnapRealms(this));
5892 gather.activate();
5893 return;
5894 }
5895 }
5896
5897 notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE);
5898
5899 if (!reconnected_snaprealms.empty()) {
5900 dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl;
5901 for (auto& p : reconnected_snaprealms) {
5902 stringstream warn_str;
5903 warn_str << " " << p.first << " {";
5904 bool first = true;
5905 for (auto& q : p.second) {
5906 if (!first)
5907 warn_str << ", ";
5908 warn_str << "client." << q.first << "/" << q.second;
5909 }
5910 warn_str << "}";
5911 dout(5) << warn_str.str() << dendl;
5912 }
5913 }
5914 ceph_assert(rejoin_waiters.empty());
5915 ceph_assert(rejoin_pending_snaprealms.empty());
5916 dout(10) << "open_snaprealms - all open" << dendl;
5917 do_delayed_cap_imports();
5918
5919 ceph_assert(rejoin_done);
5920 rejoin_done.release()->complete(0);
5921 reconnected_caps.clear();
5922 }
5923
5924 bool MDCache::open_undef_inodes_dirfrags()
5925 {
5926 dout(10) << "open_undef_inodes_dirfrags "
5927 << rejoin_undef_inodes.size() << " inodes "
5928 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
5929
5930 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
5931
5932 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5933 p != rejoin_undef_inodes.end();
5934 ++p) {
5935 CInode *in = *p;
5936 ceph_assert(!in->is_base());
5937 fetch_queue.insert(in->get_parent_dir());
5938 }
5939
5940 if (fetch_queue.empty())
5941 return false;
5942
5943 MDSGatherBuilder gather(g_ceph_context,
5944 new MDSInternalContextWrapper(mds,
5945 new LambdaContext([this](int r) {
5946 if (rejoin_gather.empty())
5947 rejoin_gather_finish();
5948 })
5949 )
5950 );
5951
5952 for (set<CDir*>::iterator p = fetch_queue.begin();
5953 p != fetch_queue.end();
5954 ++p) {
5955 CDir *dir = *p;
5956 CInode *diri = dir->get_inode();
5957 if (diri->state_test(CInode::STATE_REJOINUNDEF))
5958 continue;
5959 if (dir->state_test(CDir::STATE_REJOINUNDEF))
5960 ceph_assert(diri->dirfragtree.is_leaf(dir->get_frag()));
5961 dir->fetch(gather.new_sub());
5962 }
5963 ceph_assert(gather.has_subs());
5964 gather.activate();
5965 return true;
5966 }
5967
/*
 * Called once a previously-undefined (rejoin placeholder) inode has been
 * fetched.  Removes it from rejoin_undef_inodes; for a directory whose real
 * fragtree is not the single root frag, swaps the placeholder frag_t()
 * dirfrag for the real fragment set and registers those fragments as
 * undefined so they get fetched as well.
 */
void MDCache::opened_undef_inode(CInode *in) {
  dout(10) << "opened_undef_inode " << *in << dendl;
  rejoin_undef_inodes.erase(in);
  if (in->is_dir()) {
    // FIXME: re-hash dentries if necessary
    ceph_assert(in->inode.dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash);
    if (in->get_num_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
      CDir *dir = in->get_dirfrag(frag_t());
      ceph_assert(dir);
      // drop the placeholder root frag and re-register the real fragments
      rejoin_undef_dirfrags.erase(dir);
      in->force_dirfrags();
      auto&& ls = in->get_dirfrags();
      for (const auto& dir : ls) {
	rejoin_undef_dirfrags.insert(dir);
      }
    }
  }
}
5986
5987 void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
5988 map<client_t,ref_t<MClientSnap>>& updates)
5989 {
5990 if (seq < realm->get_newest_seq()) {
5991 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
5992 << realm->get_newest_seq() << " on " << *realm << dendl;
5993 auto snap = make_message<MClientSnap>(CEPH_SNAP_OP_UPDATE);
5994 snap->bl = realm->get_snap_trace();
5995 for (const auto& child : realm->open_children)
5996 snap->split_realms.push_back(child->inode->ino());
5997 updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
5998 } else {
5999 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
6000 << " on " << *realm << dendl;
6001 }
6002 }
6003
6004
6005
/*
 * Send MMDSCacheRejoin OP_ACK messages to every recovering MDS in the
 * recovery set that has not been acked yet.  Each ack carries strong
 * dirfrag/dentry/inode-lock state for everything that peer replicates from
 * our auth subtrees, plus any cap import data we gathered during rejoin.
 */
void MDCache::rejoin_send_acks()
{
  dout(7) << "rejoin_send_acks" << dendl;

  // replicate stray
  for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
       p != rejoin_unlinked_inodes.end();
       ++p) {
    for (set<CInode*>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      CInode *in = *q;
      dout(7) << " unlinked inode " << *in << dendl;
      // inode expired
      if (!in->is_replica(p->first))
	continue;
      // walk up the ancestry, replicating dentry/dir/inode until we reach
      // something the peer already replicates (or a base inode)
      while (1) {
	CDentry *dn = in->get_parent_dn();
	if (dn->is_replica(p->first))
	  break;
	dn->add_replica(p->first);
	CDir *dir = dn->get_dir();
	if (dir->is_replica(p->first))
	  break;
	dir->add_replica(p->first);
	in = dir->get_inode();
	if (in->is_replica(p->first))
	  break;
	in->add_replica(p->first);
	if (in->is_base())
	  break;
      }
    }
  }
  rejoin_unlinked_inodes.clear();

  // send acks to everyone in the recovery set
  map<mds_rank_t,ref_t<MMDSCacheRejoin>> acks;
  for (set<mds_rank_t>::iterator p = recovery_set.begin();
       p != recovery_set.end();
       ++p) {
    if (rejoin_ack_sent.count(*p))
      continue;
    acks[*p] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
  }

  rejoin_ack_sent = recovery_set;

  // walk subtrees
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    if (!dir->is_auth())
      continue;
    dout(10) << "subtree " << *dir << dendl;

    // auth items in this subtree (breadth-first via dq)
    std::queue<CDir*> dq;
    dq.push(dir);

    while (!dq.empty()) {
      CDir *dir = dq.front();
      dq.pop();

      // dir
      for (auto &r : dir->get_replicas()) {
	auto it = acks.find(r.first);
	if (it == acks.end())
	  continue;
	it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
	it->second->add_dirfrag_base(dir);
      }

      for (auto &p : dir->items) {
	CDentry *dn = p.second;
	CDentry::linkage_t *dnl = dn->get_linkage();

	// inode
	CInode *in = NULL;
	if (dnl->is_primary())
	  in = dnl->get_inode();

	// dentry
	for (auto &r : dn->get_replicas()) {
	  auto it = acks.find(r.first);
	  if (it == acks.end())
	    continue;
	  it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
					dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
					dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
					dnl->is_remote() ? dnl->get_remote_d_type():0,
					++r.second,
					dn->lock.get_replica_state());
	  // peer missed MDentrylink message ?
	  if (in && !in->is_replica(r.first))
	    in->add_replica(r.first);
	}

	if (!in)
	  continue;

	for (auto &r : in->get_replicas()) {
	  auto it = acks.find(r.first);
	  if (it == acks.end())
	    continue;
	  it->second->add_inode_base(in, mds->mdsmap->get_up_features());
	  bufferlist bl;
	  in->_encode_locks_state_for_rejoin(bl, r.first);
	  it->second->add_inode_locks(in, ++r.second, bl);
	}

	// subdirs in this subtree?
	{
	  auto&& dirs = in->get_nested_dirfrags();
	  for (const auto& dir : dirs) {
	    dq.push(dir);
	  }
	}
      }
    }
  }

  // base inodes too
  if (root && root->is_auth())
    for (auto &r : root->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(root, mds->mdsmap->get_up_features());
      bufferlist bl;
      root->_encode_locks_state_for_rejoin(bl, r.first);
      it->second->add_inode_locks(root, ++r.second, bl);
    }
  if (myin)
    for (auto &r : myin->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
      bufferlist bl;
      myin->_encode_locks_state_for_rejoin(bl, r.first);
      it->second->add_inode_locks(myin, ++r.second, bl);
    }

  // include inode base for any inodes whose scatterlocks may have updated
  for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
       p != rejoin_potential_updated_scatterlocks.end();
       ++p) {
    CInode *in = *p;
    for (const auto &r : in->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(in, mds->mdsmap->get_up_features());
    }
  }

  // send acks
  for (auto p = acks.begin(); p != acks.end(); ++p) {
    encode(rejoin_imported_caps[p->first], p->second->imported_caps);
    mds->send_message_mds(p->second, p->first);
  }

  rejoin_imported_caps.clear();
}
6172
// Waiter used by reissue_all_caps() for frozen inodes: once the inode is
// unfrozen, re-evaluate its cap-related locks and (if eval did nothing)
// reissue caps directly.  Holds a PTRWAITER pin so the CInode cannot be
// trimmed while the waiter is queued.
class C_MDC_ReIssueCaps : public MDCacheContext {
  CInode *in;
public:
  C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
    MDCacheContext(mdc), in(i)
  {
    in->get(CInode::PIN_PTRWAITER);
  }
  void finish(int r) override {
    if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
      mdcache->mds->locker->issue_caps(in);
    in->put(CInode::PIN_PTRWAITER);
  }
};
6187
/*
 * Re-evaluate and reissue caps on every head inode that has any client
 * caps.  Called from MDSRank::active_start(); frozen inodes get a waiter
 * instead (there should not be frozen subtrees at this point).
 */
void MDCache::reissue_all_caps()
{
  dout(10) << "reissue_all_caps" << dendl;

  int count = 0;
  for (auto &p : inode_map) {
    int n = 1;
    CInode *in = p.second;
    if (in->is_head() && in->is_any_caps()) {
      // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
      if (in->is_frozen_inode()) {
	in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
	continue;
      }
      if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
	n += mds->locker->issue_caps(in);
    }

    // reset the heartbeat roughly every 1000 units of work so a large
    // cache walk doesn't trip the internal heartbeat timeout
    if ((count % 1000) + n >= 1000)
      mds->heartbeat_reset();
    count += n;
  }
}
6211
6212
6213 // ===============================================================================
6214
// Journal-commit context for the (currently commented-out) copy-on-write
// branch of queue_file_recover(); completes via _queued_file_recover_cow().
struct C_MDC_QueuedCow : public MDCacheContext {
  CInode *in;
  MutationRef mut;
  C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
    MDCacheContext(mdc), in(i), mut(m) {}
  void finish(int r) override {
    mdcache->_queued_file_recover_cow(in, mut);
  }
};
6224
6225
/*
 * Queue an auth file inode for file-size recovery (see RecoveryQueue).
 * The snapshot copy-on-write handling below is disabled (commented out);
 * the inode is simply enqueued directly.
 */
void MDCache::queue_file_recover(CInode *in)
{
  dout(10) << "queue_file_recover " << *in << dendl;
  ceph_assert(in->is_auth());

  // cow?
  /*
  SnapRealm *realm = in->find_snaprealm();
  set<snapid_t> s = realm->get_snaps();
  while (!s.empty() && *s.begin() < in->first)
    s.erase(s.begin());
  while (!s.empty() && *s.rbegin() > in->last)
    s.erase(*s.rbegin());
  dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
  if (s.size() > 1) {
    CInode::mempool_inode pi = in->project_inode();
    pi->version = in->pre_dirty();

    auto mut(std::make_shared<MutationImpl>());
    mut->ls = mds->mdlog->get_current_segment();
    EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
    mds->mdlog->start_entry(le);
    predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);

    s.erase(*s.begin());
    while (!s.empty()) {
      snapid_t snapid = *s.begin();
      CInode *cow_inode = 0;
      journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
      ceph_assert(cow_inode);
      recovery_queue.enqueue(cow_inode);
      s.erase(*s.begin());
    }

    in->parent->first = in->first;
    le->metablob.add_primary_dentry(in->parent, in, true);
    mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
    mds->mdlog->flush();
  }
  */

  recovery_queue.enqueue(in);
}
6269
// Journal-commit callback for the (disabled) cow path of
// queue_file_recover(): apply the projected inode, then release the
// mutation's locks and clean it up.
void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
{
  in->pop_and_dirty_projected_inode(mut->ls);
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();
}
6277
6278
6279 /*
6280 * called after recovery to recover file sizes for previously opened (for write)
6281 * files. that is, those where max_size > size.
6282 */
6283 void MDCache::identify_files_to_recover()
6284 {
6285 dout(10) << "identify_files_to_recover" << dendl;
6286 int count = 0;
6287 for (auto &p : inode_map) {
6288 CInode *in = p.second;
6289 if (!in->is_auth())
6290 continue;
6291
6292 if (in->last != CEPH_NOSNAP)
6293 continue;
6294
6295 // Only normal files need file size recovery
6296 if (!in->is_file()) {
6297 continue;
6298 }
6299
6300 bool recover = false;
6301 for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
6302 p != in->inode.client_ranges.end();
6303 ++p) {
6304 Capability *cap = in->get_client_cap(p->first);
6305 if (cap) {
6306 cap->mark_clientwriteable();
6307 } else {
6308 dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
6309 recover = true;
6310 break;
6311 }
6312 }
6313
6314 if (recover) {
6315 if (in->filelock.is_stable()) {
6316 in->auth_pin(&in->filelock);
6317 } else {
6318 ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
6319 }
6320 in->filelock.set_state(LOCK_PRE_SCAN);
6321 rejoin_recover_q.push_back(in);
6322 } else {
6323 rejoin_check_q.push_back(in);
6324 }
6325
6326 if (!(++count % 1000))
6327 mds->heartbeat_reset();
6328 }
6329 }
6330
6331 void MDCache::start_files_to_recover()
6332 {
6333 for (CInode *in : rejoin_check_q) {
6334 if (in->filelock.get_state() == LOCK_XLOCKSNAP)
6335 mds->locker->issue_caps(in);
6336 mds->locker->check_inode_max_size(in);
6337 }
6338 rejoin_check_q.clear();
6339 for (CInode *in : rejoin_recover_q) {
6340 mds->locker->file_recover(&in->filelock);
6341 }
6342 if (!rejoin_recover_q.empty()) {
6343 rejoin_recover_q.clear();
6344 do_file_recover();
6345 }
6346 }
6347
// Advance the file recovery queue (probes size/mtime of recovering files).
void MDCache::do_file_recover()
{
  recovery_queue.advance();
}
6352
6353 // ===============================================================================
6354
6355
6356 // ----------------------------
6357 // truncate
6358
// Retry context: re-enters _truncate_inode() once the pending snapflush
// that blocked the truncate (see truncate_inode()) has completed.
class C_MDC_RetryTruncate : public MDCacheContext {
  CInode *in;
  LogSegment *ls;
public:
  C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheContext(c), in(i), ls(l) {}
  void finish(int r) override {
    mdcache->_truncate_inode(in, ls);
  }
};
6369
/*
 * Begin truncating `in` from truncate_from down to truncate_size (per the
 * projected inode).  Registers the inode with log segment `ls` and pins it
 * until the truncate completes.  If clients still owe snapflushes while
 * holding buffered-write caps, defer the actual object truncation until
 * the snapflush is done (via C_MDC_RetryTruncate).
 */
void MDCache::truncate_inode(CInode *in, LogSegment *ls)
{
  auto pi = in->get_projected_inode();
  dout(10) << "truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " on " << *in
	   << dendl;

  // track in the segment + pin until truncate_inode_logged()
  ls->truncating_inodes.insert(in);
  in->get(CInode::PIN_TRUNCATING);
  in->auth_pin(this);

  if (!in->client_need_snapflush.empty() &&
      (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
    // wait for the snapflush before touching objects
    ceph_assert(in->filelock.is_xlocked());
    in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
    mds->locker->issue_caps(in);
    return;
  }

  _truncate_inode(in, ls);
}
6392
// I/O completion for the OSD-side truncation issued by _truncate_inode();
// -ENOENT is tolerated (objects may already be gone).
struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
  CInode *in;
  LogSegment *ls;
  C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheIOContext(c, false), in(i), ls(l) {
  }
  void finish(int r) override {
    ceph_assert(r == 0 || r == -ENOENT);
    mdcache->truncate_inode_finish(in, ls);
  }
  void print(ostream& out) const override {
    out << "file_truncate(" << in->ino() << ")";
  }
};
6407
/*
 * Perform the actual OSD object truncation for `in`: punch out the byte
 * range [truncate_size, truncate_from) using the inode's snap context.
 * Completion continues in truncate_inode_finish() via the finisher.
 */
void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
{
  auto pi = &in->inode;
  dout(10) << "_truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " on " << *in << dendl;

  // sanity: a truncate must be pending and sizes must be sane/ordered
  ceph_assert(pi->is_truncating());
  ceph_assert(pi->truncate_size < (1ULL << 63));
  ceph_assert(pi->truncate_from < (1ULL << 63));
  ceph_assert(pi->truncate_size < pi->truncate_from);


  // use the realm's snap context so truncation cows snapshotted data;
  // fall back to a null context for non-snapped inodes
  SnapRealm *realm = in->find_snaprealm();
  SnapContext nullsnap;
  const SnapContext *snapc;
  if (realm) {
    dout(10) << " realm " << *realm << dendl;
    snapc = &realm->get_snap_context();
  } else {
    dout(10) << " NO realm, using null context" << dendl;
    snapc = &nullsnap;
    ceph_assert(in->last == CEPH_NOSNAP);
  }
  dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
  filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
		 pi->truncate_size, pi->truncate_from-pi->truncate_size,
		 pi->truncate_seq, ceph::real_time::min(), 0,
		 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
				  mds->finisher));
}
6439
// Journal-commit context for the "truncate finish" event written by
// truncate_inode_finish(); completes via truncate_inode_logged().
struct C_MDC_TruncateLogged : public MDCacheLogContext {
  CInode *in;
  MutationRef mut;
  C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
    MDCacheLogContext(m), in(i), mut(mu) {}
  void finish(int r) override {
    mdcache->truncate_inode_logged(in, mut);
  }
};
6449
/*
 * OSD truncation finished: drop the inode from the segment's truncating
 * set, project an inode update that clears truncate_from and decrements
 * truncate_pending, and journal a "truncate finish" event.  Completion
 * continues in truncate_inode_logged().
 */
void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
{
  dout(10) << "truncate_inode_finish " << *in << dendl;

  set<CInode*>::iterator p = ls->truncating_inodes.find(in);
  ceph_assert(p != ls->truncating_inodes.end());
  ls->truncating_inodes.erase(p);

  // update
  auto &pi = in->project_inode();
  pi.inode.version = in->pre_dirty();
  pi.inode.truncate_from = 0;
  pi.inode.truncate_pending--;

  MutationRef mut(new MutationImpl());
  mut->ls = mds->mdlog->get_current_segment();
  mut->add_projected_inode(in);

  EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
  mds->mdlog->start_entry(le);
  CDentry *dn = in->get_projected_parent_dn();
  le->metablob.add_dir_context(dn->get_dir());
  le->metablob.add_primary_dentry(dn, in, true);
  le->metablob.add_truncate_finish(in->ino(), ls->seq);

  journal_dirty_inode(mut.get(), &le->metablob, in);
  mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));

  // flush immediately if there are readers/writers waiting
  if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
      (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
    mds->mdlog->flush();
}
6483
6484 void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
6485 {
6486 dout(10) << "truncate_inode_logged " << *in << dendl;
6487 mut->apply();
6488 mds->locker->drop_locks(mut.get());
6489 mut->cleanup();
6490
6491 in->put(CInode::PIN_TRUNCATING);
6492 in->auth_unpin(this);
6493
6494 MDSContext::vec waiters;
6495 in->take_waiting(CInode::WAIT_TRUNC, waiters);
6496 mds->queue_waiters(waiters);
6497 }
6498
6499
6500 void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
6501 {
6502 dout(20) << "add_recovered_truncate " << *in << " in log segment "
6503 << ls->seq << "/" << ls->offset << dendl;
6504 ls->truncating_inodes.insert(in);
6505 in->get(CInode::PIN_TRUNCATING);
6506 }
6507
6508 void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6509 {
6510 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6511 << ls->seq << "/" << ls->offset << dendl;
6512 // if we have the logseg the truncate started in, it must be in our list.
6513 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6514 ceph_assert(p != ls->truncating_inodes.end());
6515 ls->truncating_inodes.erase(p);
6516 in->put(CInode::PIN_TRUNCATING);
6517 }
6518
/*
 * Resume every truncate recorded in the journal's log segments (populated
 * during replay via add_recovered_truncate()).  Truncates blocked by
 * pending snapflushes are deferred just like in truncate_inode().
 */
void MDCache::start_recovered_truncates()
{
  dout(10) << "start_recovered_truncates" << dendl;
  for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
       p != mds->mdlog->segments.end();
       ++p) {
    LogSegment *ls = p->second;
    for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
	 q != ls->truncating_inodes.end();
	 ++q) {
      CInode *in = *q;
      in->auth_pin(this);

      if (!in->client_need_snapflush.empty() &&
	  (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
	// re-establish the xlock state truncate_inode() would have had and
	// wait for the snapflush before touching objects
	ceph_assert(in->filelock.is_stable());
	in->filelock.set_state(LOCK_XLOCKDONE);
	in->auth_pin(&in->filelock);
	in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
	// start_files_to_recover will revoke caps
	continue;
      }
      _truncate_inode(in, ls);
    }
  }
}
6545
6546
6547 class C_MDS_purge_completed_finish : public MDCacheLogContext {
6548 interval_set<inodeno_t> inos;
6549 version_t inotablev;
6550 LogSegment *ls;
6551 public:
6552 C_MDS_purge_completed_finish(MDCache *m,
6553 interval_set<inodeno_t> i,
6554 version_t iv,
6555 LogSegment *_ls)
6556 : MDCacheLogContext(m),
6557 inos(std::move(i)),
6558 inotablev(iv),
6559 ls(_ls) {}
6560 void finish(int r) override {
6561 assert(r == 0);
6562 if (inotablev) {
6563 ls->purge_inodes_finish(inos);
6564 mdcache->mds->inotable->apply_release_ids(inos);
6565 assert(mdcache->mds->inotable->get_version() == inotablev);
6566 }
6567 }
6568 };
6569
6570 void MDCache::start_purge_inodes(){
6571 dout(10) << "start_purge_inodes" << dendl;
6572 for (auto& p : mds->mdlog->segments){
6573 LogSegment *ls = p.second;
6574 if (ls->purge_inodes.size()){
6575 purge_inodes(ls->purge_inodes, ls);
6576 }
6577 }
6578 }
6579
6580 void MDCache::purge_inodes(const interval_set<inodeno_t>& inos, LogSegment *ls)
6581 {
6582 auto cb = new LambdaContext([this, inos, ls](int r){
6583 assert(r == 0 || r == -2);
6584 mds->inotable->project_release_ids(inos);
6585 version_t piv = mds->inotable->get_projected_version();
6586 assert(piv != 0);
6587 mds->mdlog->start_submit_entry(new EPurged(inos, piv, ls->seq),
6588 new C_MDS_purge_completed_finish(this, inos, piv, ls));
6589 mds->mdlog->flush();
6590 });
6591
6592 dout(10) << __func__ << " start purge data : " << inos << dendl;
6593 C_GatherBuilder gather(g_ceph_context,
6594 new C_OnFinisher( new MDSIOContextWrapper(mds, cb), mds->finisher));
6595 SnapContext nullsnapc;
6596 uint64_t num = Striper::get_num_objects(default_file_layout, default_file_layout.get_period());
6597 for (auto p = inos.begin();
6598 p != inos.end();
6599 ++p){
6600 dout(10) << __func__
6601 << " prealloc_inos : " << inos.size()
6602 << " start : " << p.get_start().val
6603 << " length : " << p.get_len() << " "
6604 << " seq : " << ls->seq << dendl;
6605
6606 for (_inodeno_t i = 0; i < p.get_len(); i++){
6607 dout(20) << __func__ << " : " << p.get_start() + i << dendl;
6608 filer.purge_range(p.get_start() + i,
6609 &default_file_layout,
6610 nullsnapc,
6611 0, num,
6612 ceph::real_clock::now(),
6613 0, gather.new_sub());
6614 }
6615 }
6616 gather.activate();
6617 }
6618
6619 // ================================================================================
6620 // cache trimming
6621
/*
 * Trim up to `count` dentries from the LRUs (bottom_lru first, then the
 * main lru), collecting expire messages in `expiremap`.  Trimming is
 * throttled by mds_cache_trim_threshold per trim_counter window.
 *
 * Returns {throttled, trimmed}: whether the throttle cut the pass short,
 * and how many dentries were actually trimmed.
 */
std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
{
  bool is_standby_replay = mds->is_standby_replay();
  std::vector<CDentry *> unexpirables;
  uint64_t trimmed = 0;

  auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");

  dout(7) << "trim_lru trimming " << count
          << " items from LRU"
          << " size=" << lru.lru_get_size()
          << " mid=" << lru.lru_get_top()
          << " pintail=" << lru.lru_get_pintail()
          << " pinned=" << lru.lru_get_num_pinned()
          << dendl;

  // drain bottom_lru first (cheap, low-value entries)
  const uint64_t trim_counter_start = trim_counter.get();
  bool throttled = false;
  while (1) {
    throttled |= trim_counter_start+trimmed >= trim_threshold;
    if (throttled) break;
    CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
    if (!dn)
      break;
    if (trim_dentry(dn, expiremap)) {
      // could not be trimmed right now; re-insert after the pass
      unexpirables.push_back(dn);
    } else {
      trimmed++;
    }
  }

  for (auto &dn : unexpirables) {
    bottom_lru.lru_insert_mid(dn);
  }
  unexpirables.clear();

  // trim dentries from the LRU until count is reached
  // if mds is in standbyreplay and will trim all inodes which aren't in segments
  while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
    throttled |= trim_counter_start+trimmed >= trim_threshold;
    if (throttled) break;
    CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
    if (!dn) {
      break;
    }
    if ((is_standby_replay && dn->get_linkage()->inode &&
        dn->get_linkage()->inode->item_open_file.is_on_list())) {
      // we move the inodes that need to be trimmed to the end of the lru queue.
      // refer to MDCache::standby_trim_segment
      lru.lru_insert_bot(dn);
      break;
    } else if (trim_dentry(dn, expiremap)) {
      unexpirables.push_back(dn);
    } else {
      trimmed++;
      if (count > 0) count--;
    }
  }
  trim_counter.hit(trimmed);

  for (auto &dn : unexpirables) {
    lru.lru_insert_mid(dn);
  }
  unexpirables.clear();

  dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
  return std::pair<bool, uint64_t>(throttled, trimmed);
}
6690
6691 /*
6692 * note: only called while MDS is active or stopping... NOT during recovery.
6693 * however, we may expire a replica whose authority is recovering.
6694 *
6695 * @param count is number of dentries to try to expire
6696 */
6697 std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
6698 {
6699 uint64_t used = cache_size();
6700 uint64_t limit = cache_memory_limit;
6701 expiremap expiremap;
6702
6703 dout(7) << "trim bytes_used=" << bytes2str(used)
6704 << " limit=" << bytes2str(limit)
6705 << " reservation=" << cache_reservation
6706 << "% count=" << count << dendl;
6707
6708 // process delayed eval_stray()
6709 stray_manager.advance_delayed();
6710
6711 auto result = trim_lru(count, expiremap);
6712 auto& trimmed = result.second;
6713
6714 // trim non-auth, non-bound subtrees
6715 for (auto p = subtrees.begin(); p != subtrees.end();) {
6716 CDir *dir = p->first;
6717 ++p;
6718 CInode *diri = dir->get_inode();
6719 if (dir->is_auth()) {
6720 if (!diri->is_auth() && !diri->is_base() &&
6721 dir->get_num_head_items() == 0) {
6722 if (dir->state_test(CDir::STATE_EXPORTING) ||
6723 !(mds->is_active() || mds->is_stopping()) ||
6724 dir->is_freezing() || dir->is_frozen())
6725 continue;
6726
6727 migrator->export_empty_import(dir);
6728 ++trimmed;
6729 }
6730 } else {
6731 if (!diri->is_auth()) {
6732 if (dir->get_num_ref() > 1) // only subtree pin
6733 continue;
6734 auto&& ls = diri->get_subtree_dirfrags();
6735 if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees
6736 continue;
6737
6738 // don't trim subtree root if its auth MDS is recovering.
6739 // This simplify the cache rejoin code.
6740 if (dir->is_subtree_root() &&
6741 rejoin_ack_gather.count(dir->get_dir_auth().first))
6742 continue;
6743 trim_dirfrag(dir, 0, expiremap);
6744 ++trimmed;
6745 }
6746 }
6747 }
6748
6749 // trim root?
6750 if (mds->is_stopping() && root) {
6751 auto&& ls = root->get_dirfrags();
6752 for (const auto& dir : ls) {
6753 if (dir->get_num_ref() == 1) { // subtree pin
6754 trim_dirfrag(dir, 0, expiremap);
6755 ++trimmed;
6756 }
6757 }
6758 if (root->get_num_ref() == 0) {
6759 trim_inode(0, root, 0, expiremap);
6760 ++trimmed;
6761 }
6762 }
6763
6764 std::set<mds_rank_t> stopping;
6765 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6766 stopping.erase(mds->get_nodeid());
6767 for (auto rank : stopping) {
6768 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6769 if (!mdsdir_in)
6770 continue;
6771
6772 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple());
6773 if (em.second) {
6774 em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
6775 }
6776
6777 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds << dendl;
6778
6779 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6780 if (!aborted) {
6781 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6782 auto&& ls = mdsdir_in->get_dirfrags();
6783 for (auto dir : ls) {
6784 if (dir->get_num_ref() == 1) { // subtree pin
6785 trim_dirfrag(dir, dir, expiremap);
6786 ++trimmed;
6787 }
6788 }
6789 if (mdsdir_in->get_num_ref() == 0) {
6790 trim_inode(NULL, mdsdir_in, NULL, expiremap);
6791 ++trimmed;
6792 }
6793 } else {
6794 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6795 }
6796 }
6797
6798 // Other rank's base inodes (when I'm stopping)
6799 if (mds->is_stopping()) {
6800 for (set<CInode*>::iterator p = base_inodes.begin();
6801 p != base_inodes.end();) {
6802 CInode *base_in = *p;
6803 ++p;
6804 if (MDS_INO_IS_MDSDIR(base_in->ino()) &&
6805 MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) {
6806 dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
6807 if (base_in->get_num_ref() == 0) {
6808 trim_inode(NULL, base_in, NULL, expiremap);
6809 ++trimmed;
6810 }
6811 }
6812 }
6813 }
6814
6815 // send any expire messages
6816 send_expire_messages(expiremap);
6817
6818 return result;
6819 }
6820
6821 void MDCache::send_expire_messages(expiremap& expiremap)
6822 {
6823 // send expires
6824 for (const auto &p : expiremap) {
6825 if (mds->is_cluster_degraded() &&
6826 (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
6827 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
6828 rejoin_sent.count(p.first) == 0))) {
6829 continue;
6830 }
6831 dout(7) << "sending cache_expire to " << p.first << dendl;
6832 mds->send_message_mds(p.second, p.first);
6833 }
6834 expiremap.clear();
6835 }
6836
6837
/**
 * Expire a single dentry (and, for a primary link, its inode) from cache.
 *
 * @param dn candidate dentry to trim
 * @param expiremap per-rank MCacheExpire messages are accumulated here so
 *                  remote authorities learn our replica went away
 * @return true if the dentry was NOT removed (trim aborted or deferred),
 *         false if it was removed from its CDir
 */
bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
{
  dout(12) << "trim_dentry " << *dn << dendl;

  CDentry::linkage_t *dnl = dn->get_linkage();

  CDir *dir = dn->get_dir();
  ceph_assert(dir);

  // the enclosing subtree root is the "container" used to address any
  // expire message for this dentry
  CDir *con = get_subtree_root(dir);
  if (con)
    dout(12) << " in container " << *con << dendl;
  else {
    dout(12) << " no container; under a not-yet-linked dir" << dendl;
    ceph_assert(dn->is_auth());
  }

  // If replica dentry is not readable, it's likely we will receive
  // MDentryLink/MDentryUnlink message soon (It's possible we first
  // receive a MDentryUnlink message, then MDentryLink message)
  // MDentryLink message only replicates an inode, so we should
  // avoid trimming the inode's parent dentry. This is because that
  // unconnected replicas are problematic for subtree migration.
  if (!dn->is_auth() && !dn->lock.can_read(-1) &&
      !dn->get_dir()->get_inode()->is_stray())
    return true;

  // adjust the dir state
  // NOTE: we can safely remove a clean, null dentry without effecting
  //       directory completeness.
  // (check this _before_ we unlink the inode, below!)
  bool clear_complete = false;
  if (!(dnl->is_null() && dn->is_clean()))
    clear_complete = true;

  // unlink the dentry
  if (dnl->is_remote()) {
    // just unlink.
    dir->unlink_inode(dn, false);
  } else if (dnl->is_primary()) {
    // expire the inode, too.
    CInode *in = dnl->get_inode();
    ceph_assert(in);
    if (trim_inode(dn, in, con, expiremap))
      return true; // purging stray instead of trimming
  } else {
    ceph_assert(dnl->is_null());
  }

  if (!dn->is_auth()) {
    // notify dentry authority.
    mds_authority_t auth = dn->authority();

    // visit both members of the authority pair; stop early when the
    // second is unset (< 0) or equal to the first
    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (mds->get_nodeid() == auth.second &&
	  con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue; // on export, ignore myself.

      dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
      ceph_assert(a != mds->get_nodeid());
      // lazily allocate the per-rank expire message on first use
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
	em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
      em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
    }
  }

  // remove dentry
  if (dn->last == CEPH_NOSNAP && dir->is_auth())
    dir->add_to_bloom(dn);  // remember the name so future lookups can miss cheaply
  dir->remove_dentry(dn);

  if (clear_complete)
    dir->state_clear(CDir::STATE_COMPLETE);

  if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
  return false;
}
6919
6920
/**
 * Expire a dirfrag from the cache, closing it on its inode.
 *
 * Precondition: the dirfrag holds no references (asserted below); callers
 * have already trimmed its dentries.
 *
 * @param dir dirfrag to close
 * @param con subtree container used to address any expire message
 * @param expiremap per-rank MCacheExpire messages are accumulated here
 */
void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
{
  dout(15) << "trim_dirfrag " << *dir << dendl;

  if (dir->is_subtree_root()) {
    ceph_assert(!dir->is_auth() ||
		(!dir->is_replicated() && dir->inode->is_base()));
    remove_subtree(dir);	// remove from subtree map
  }
  ceph_assert(dir->get_num_ref() == 0);

  CInode *in = dir->get_inode();

  if (!dir->is_auth()) {
    mds_authority_t auth = dir->authority();

    // was this an auth delegation? (if so, slightly modified container)
    dirfrag_t condf;
    if (dir->is_subtree_root()) {
      dout(12) << " subtree root, container is " << *dir << dendl;
      con = dir;
      condf = dir->dirfrag();
    } else {
      condf = con->dirfrag();
    }

    // visit both members of the authority pair; stop early when the
    // second is unset (< 0) or equal to the first
    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (mds->get_nodeid() == auth.second &&
	  con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue; // on export, ignore myself.

      dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
      ceph_assert(a != mds->get_nodeid());
      // lazily allocate the per-rank expire message on first use
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
	em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
      em.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
    }
  }

  in->close_dirfrag(dir->dirfrag().frag);
}
6966
/**
 * Try trimming an inode from the cache
 *
 * @param dn primary dentry linking the inode, or NULL for an inode with
 *           no parent dentry (e.g. a base inode)
 * @param in inode to trim; must currently hold no references (asserted)
 * @param con subtree container used to address expire messages, or NULL
 *            for a root/stray inode
 * @param expiremap per-rank MCacheExpire messages are accumulated here
 * @return true if the inode is still in cache, else false if it was trimmed
 */
bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap)
{
  dout(15) << "trim_inode " << *in << dendl;
  ceph_assert(in->get_num_ref() == 0);

  if (in->is_dir()) {
    // If replica inode's dirfragtreelock is not readable, it's likely
    // some dirfrags of the inode are being fragmented and we will receive
    // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
    // dirfrags, so we should avoid trimming these dirfrags' parent inode.
    // This is because that unconnected replicas are problematic for
    // subtree migration.
    //
    if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1)) {
      return true;
    }

    // DIR
    auto&& dfls = in->get_dirfrags();
    for (const auto& dir : dfls) {
      ceph_assert(!dir->is_subtree_root());
      trim_dirfrag(dir, con ? con:dir, expiremap);  // if no container (e.g. root dirfrag), use *p
    }
  }

  // INODE
  if (in->is_auth()) {
    // eval stray after closing dirfrags
    if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
      maybe_eval_stray(in);
      // the stray eval may have queued a purge or re-pinned the dentry;
      // in either case keep the inode in cache for now
      if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
	return true;
    }
  } else {
    mds_authority_t auth = in->authority();

    dirfrag_t df;
    if (con)
      df = con->dirfrag();
    else
      df = dirfrag_t(0,frag_t());   // must be a root or stray inode.

    // visit both members of the authority pair; stop early when the
    // second is unset (< 0) or equal to the first
    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (con && mds->get_nodeid() == auth.second &&
	  con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue; // on export, ignore myself.

      dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
      ceph_assert(a != mds->get_nodeid());
      // lazily allocate the per-rank expire message on first use
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
	em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
      em.first->second->add_inode(df, in->vino(), in->get_replica_nonce());
    }
  }

  /*
  if (in->is_auth()) {
    if (in->hack_accessed)
      mds->logger->inc("outt");
    else {
      mds->logger->inc("outut");
      mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
    }
  }
  */

  // unlink
  if (dn)
    dn->get_dir()->unlink_inode(dn, false);
  remove_inode(in);
  return false;
}
7048
7049
/**
 * trim_non_auth - remove any non-auth items from our cache
 *
 * this reduces the amount of non-auth metadata in our cache, reducing the
 * load incurred by the rejoin phase.
 *
 * the only non-auth items that remain are those that are needed to
 * attach our own subtrees to the root.
 *
 * when we are done, all dentries will be in the top bit of the lru.
 *
 * why we have to do this:
 *  we may not have accurate linkage for non-auth items.  which means we
 *  will not know which subtree it falls into, and can not be sure to
 *  declare it to the correct authority.
 */
void MDCache::trim_non_auth()
{
  dout(7) << "trim_non_auth" << dendl;

  // temporarily pin all subtree roots so expiring their contents cannot
  // drop them from cache while we walk the LRU
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p)
    p->first->get(CDir::PIN_SUBTREETEMP);

  list<CDentry*> auth_list;

  // trim non-auth items from the lru
  for (;;) {
    CDentry *dn = NULL;
    if (bottom_lru.lru_get_size() > 0)
      dn = static_cast<CDentry*>(bottom_lru.lru_expire());
    if (!dn && lru.lru_get_size() > 0)
      dn = static_cast<CDentry*>(lru.lru_expire());
    if (!dn)
      break;

    CDentry::linkage_t *dnl = dn->get_linkage();

    if (dn->is_auth()) {
      // add back into lru (at the top)
      auth_list.push_back(dn);

      // sever remote links that point at non-auth inodes; per the header
      // comment the linkage may not be accurate for non-auth items
      if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
	dn->unlink_remote(dnl);
    } else {
      // non-auth.  expire.
      CDir *dir = dn->get_dir();
      ceph_assert(dir);

      // unlink the dentry
      dout(10) << " removing " << *dn << dendl;
      if (dnl->is_remote()) {
	dir->unlink_inode(dn, false);
      }
      else if (dnl->is_primary()) {
	// primary link: drop the inode and all its dirfrags too
	CInode *in = dnl->get_inode();
	dout(10) << " removing " << *in << dendl;
	auto&& ls = in->get_dirfrags();
	for (const auto& subdir : ls) {
	  ceph_assert(!subdir->is_subtree_root());
	  in->close_dirfrag(subdir->dirfrag().frag);
	}
	dir->unlink_inode(dn, false);
	remove_inode(in);
      }
      else {
	ceph_assert(dnl->is_null());
      }

      ceph_assert(!dir->has_bloom());
      dir->remove_dentry(dn);
      // adjust the dir state
      dir->state_clear(CDir::STATE_COMPLETE);  // dir incomplete!
      // close empty non-auth dirfrag
      if (!dir->is_subtree_root() && dir->get_num_any() == 0)
	dir->inode->close_dirfrag(dir->get_frag());
    }
  }

  // reinsert the auth dentries expired above, preserving their LRU tier
  for (const auto& dn : auth_list) {
    if (dn->state_test(CDentry::STATE_BOTTOMLRU))
      bottom_lru.lru_insert_mid(dn);
    else
      lru.lru_insert_top(dn);
  }

  // move everything in the pintail to the top bit of the lru.
  lru.lru_touch_entire_pintail();

  // unpin all subtrees
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p)
    p->first->put(CDir::PIN_SUBTREETEMP);

  if (lru.lru_get_size() == 0 &&
      bottom_lru.lru_get_size() == 0) {
    // root, stray, etc.?
    auto p = inode_map.begin();
    while (p != inode_map.end()) {
      CInode *in = p->second;
      ++p;  // advance before remove_inode() can invalidate our entry
      if (!in->is_auth()) {
	auto&& ls = in->get_dirfrags();
	for (const auto& dir : ls) {
	  dout(10) << " removing " << *dir << dendl;
	  ceph_assert(dir->get_num_ref() == 1);  // SUBTREE
	  remove_subtree(dir);
	  in->close_dirfrag(dir->dirfrag().frag);
	}
	dout(10) << " removing " << *in << dendl;
	ceph_assert(!in->get_parent_dn());
	ceph_assert(in->get_num_ref() == 0);
	remove_inode(in);
      }
    }
  }

  show_subtrees();
}
7172
/**
 * Recursively trim the subtree rooted at directory to remove all
 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
 * of those links. This is used to clear invalid data out of the cache.
 * Note that it doesn't clear the passed-in directory, since that's not
 * always safe.
 *
 * @return true if the caller must keep this dir (it cannot be trimmed,
 *         or it still contains something), false if it is now empty and
 *         removable
 */
bool MDCache::trim_non_auth_subtree(CDir *dir)
{
  dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;

  bool keep_dir = !can_trim_non_auth_dirfrag(dir);

  // iterate with a lookahead iterator: j is advanced before i's dentry
  // may be removed, so removal never invalidates our position
  auto j = dir->begin();
  auto i = j;
  while (j != dir->end()) {
    i = j++;
    CDentry *dn = i->second;
    dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
    CDentry::linkage_t *dnl = dn->get_linkage();
    if (dnl->is_primary()) { // check for subdirectories, etc
      CInode *in = dnl->get_inode();
      bool keep_inode = false;
      if (in->is_dir()) {
	auto&& subdirs = in->get_dirfrags();
	for (const auto& subdir : subdirs) {
	  if (subdir->is_subtree_root()) {
	    // a bound of another subtree hangs below us; keep the chain
	    keep_inode = true;
	    dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << *subdir << dendl;
	  } else {
	    if (trim_non_auth_subtree(subdir))
	      keep_inode = true;
	    else {
	      in->close_dirfrag(subdir->get_frag());
	      dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
	    }
	  }
	}

      }
      if (!keep_inode) { // remove it!
	dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
	dir->unlink_inode(dn, false);
	remove_inode(in);
	ceph_assert(!dir->has_bloom());
	dir->remove_dentry(dn);
      } else {
	dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
	// surviving items are no longer ours
	dn->state_clear(CDentry::STATE_AUTH);
	in->state_clear(CInode::STATE_AUTH);
      }
    } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
      dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
    } else { // just remove it
      dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
      if (dnl->is_remote())
	dir->unlink_inode(dn, false);
      dir->remove_dentry(dn);
    }
  }
  dir->state_clear(CDir::STATE_AUTH);
  /**
   * We've now checked all our children and deleted those that need it.
   * Now return to caller, and tell them if *we're* a keeper.
   */
  return keep_dir || dir->get_num_any();
}
7240
/*
 * during replay, when we determine a subtree is no longer ours, we
 * try to trim it from our cache.  because subtrees must be connected
 * to the root, the fact that we can trim this tree may mean that our
 * children or parents can also be trimmed.
 */
void MDCache::try_trim_non_auth_subtree(CDir *dir)
{
  dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;

  // can we now trim child subtrees?
  set<CDir*> bounds;
  get_subtree_bounds(dir, bounds);
  for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
    CDir *bd = *p;
    if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth
	bd->get_num_any() == 0 && // and empty
	can_trim_non_auth_dirfrag(bd)) {
      CInode *bi = bd->get_inode();
      dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
      remove_subtree(bd);
      bd->mark_clean();
      bi->close_dirfrag(bd->get_frag());
    }
  }

  if (trim_non_auth_subtree(dir)) {
    // keep
    try_subtree_merge(dir);
  } else {
    // can we trim this subtree (and possibly our ancestors) too?
    // walk upward as long as each parent subtree is also non-auth and
    // becomes trimmable once its child subtree is gone
    while (true) {
      CInode *diri = dir->get_inode();
      if (diri->is_base()) {
	// base inode: nowhere further up to walk
	if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
	  dout(10) << " closing empty non-auth subtree " << *dir << dendl;
	  remove_subtree(dir);
	  dir->mark_clean();
	  diri->close_dirfrag(dir->get_frag());

	  dout(10) << " removing " << *diri << dendl;
	  ceph_assert(!diri->get_parent_dn());
	  ceph_assert(diri->get_num_ref() == 0);
	  remove_inode(diri);
	}
	break;
      }

      CDir *psub = get_subtree_root(diri->get_parent_dir());
      dout(10) << " parent subtree is " << *psub << dendl;
      if (psub->get_dir_auth().first == mds->get_nodeid())
	break;  // we are auth, keep.

      dout(10) << " closing empty non-auth subtree " << *dir << dendl;
      remove_subtree(dir);
      dir->mark_clean();
      diri->close_dirfrag(dir->get_frag());

      dout(10) << " parent subtree also non-auth: " << *psub << dendl;
      if (trim_non_auth_subtree(psub))
	break;
      dir = psub;
    }
  }

  show_subtrees();
}
7308
/**
 * Discard a trimmed log segment's dirty state (standby replay path, per
 * the function name — nothing here is written back).
 *
 * Every object attached to the segment is marked clean and, when it holds
 * no other references, its (parent) dentry is pushed to the bottom of the
 * LRU so normal cache trimming can reclaim it.
 *
 * NOTE: each while-loop below relies on mark_clean()/remove_dirty()/
 * clear_dirty_parent() removing the front item from the segment's list;
 * otherwise the loop would never terminate.
 */
void MDCache::standby_trim_segment(LogSegment *ls)
{
  // if the inode and its primary dentry are otherwise unreferenced,
  // demote the dentry to the bottom of the LRU
  auto try_trim_inode = [this](CInode *in) {
    if (in->get_num_ref() == 0 &&
	!in->item_open_file.is_on_list() &&
	in->parent != NULL &&
	in->parent->get_num_ref() == 0){
      touch_dentry_bottom(in->parent);
    }
  };

  // same for a bare dentry, unless its inode is on an open_files list
  auto try_trim_dentry = [this](CDentry *dn) {
    if (dn->get_num_ref() > 0)
      return;
    auto in = dn->get_linkage()->inode;
    if(in && in->item_open_file.is_on_list())
      return;
    touch_dentry_bottom(dn);
  };

  ls->new_dirfrags.clear_list();
  ls->open_files.clear_list();

  while (!ls->dirty_dirfrags.empty()) {
    CDir *dir = ls->dirty_dirfrags.front();
    dir->mark_clean();
    if (dir->inode)
      try_trim_inode(dir->inode);
  }
  while (!ls->dirty_inodes.empty()) {
    CInode *in = ls->dirty_inodes.front();
    in->mark_clean();
    try_trim_inode(in);
  }
  while (!ls->dirty_dentries.empty()) {
    CDentry *dn = ls->dirty_dentries.front();
    dn->mark_clean();
    try_trim_dentry(dn);
  }
  while (!ls->dirty_parent_inodes.empty()) {
    CInode *in = ls->dirty_parent_inodes.front();
    in->clear_dirty_parent();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_dir.empty()) {
    CInode *in = ls->dirty_dirfrag_dir.front();
    in->filelock.remove_dirty();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_nest.empty()) {
    CInode *in = ls->dirty_dirfrag_nest.front();
    in->nestlock.remove_dirty();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_dirfragtree.empty()) {
    CInode *in = ls->dirty_dirfrag_dirfragtree.front();
    in->dirfragtreelock.remove_dirty();
    try_trim_inode(in);
  }
  while (!ls->truncating_inodes.empty()) {
    // drop the truncation pin so the inode can be reclaimed
    auto it = ls->truncating_inodes.begin();
    CInode *in = *it;
    ls->truncating_inodes.erase(it);
    in->put(CInode::PIN_TRUNCATING);
    try_trim_inode(in);
  }
}
7376
/**
 * Handle an MCacheExpire from another rank: the sender dropped its
 * replicas of the listed inodes/dirfrags/dentries, so remove it from the
 * corresponding replica sets — unless the expire raced with replication
 * or migration and carries a stale nonce, in which case it is dropped.
 */
void MDCache::handle_cache_expire(const cref_t<MCacheExpire> &m)
{
  mds_rank_t from = mds_rank_t(m->get_from());

  dout(7) << "cache_expire from mds." << from << dendl;

  if (mds->get_state() < MDSMap::STATE_REJOIN) {
    return;
  }

  set<SimpleLock *> gather_locks;
  // loop over realms
  for (const auto &p : m->realms) {
    // check container?
    if (p.first.ino > 0) {
      CInode *expired_inode = get_inode(p.first.ino);
      ceph_assert(expired_inode);  // we had better have this.
      CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag);
      ceph_assert(parent_dir);

      int export_state = -1;
      if (parent_dir->is_auth() && parent_dir->is_exporting()) {
	export_state = migrator->get_export_state(parent_dir);
	ceph_assert(export_state >= 0);
      }

      // If we are not (or no longer clearly) the authority for this
      // container — mid-export in a state where the sender already sees
      // someone else as auth — park the expires on the frozen dirfrag;
      // they are replayed by process_delayed_expire() or discarded.
      if (!parent_dir->is_auth() ||
	  (export_state != -1 &&
	   ((export_state == Migrator::EXPORT_WARNING &&
	     migrator->export_has_warned(parent_dir,from)) ||
	    export_state == Migrator::EXPORT_EXPORTING ||
	    export_state == Migrator::EXPORT_LOGGINGFINISH ||
	    (export_state == Migrator::EXPORT_NOTIFYING &&
	     !migrator->export_has_notified(parent_dir,from))))) {

	// not auth.
	dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
	ceph_assert(parent_dir->is_frozen_tree_root());

	// make a message container
	auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple());
	if (em.second)
	  em.first->second = make_message<MCacheExpire>(from); /* new */

	// merge these expires into it
	em.first->second->add_realm(p.first, p.second);
	continue;
      }
      ceph_assert(export_state <= Migrator::EXPORT_PREPPING ||
		  (export_state == Migrator::EXPORT_WARNING &&
		   !migrator->export_has_warned(parent_dir, from)));

      dout(7) << "expires for " << *parent_dir << dendl;
    } else {
      dout(7) << "containerless expires (root, stray inodes)" << dendl;
    }

    // INODES
    for (const auto &q : p.second.inodes) {
      CInode *in = get_inode(q.first);
      unsigned nonce = q.second;

      if (!in) {
	dout(0) << " inode expire on " << q.first << " from " << from
		<< ", don't have it" << dendl;
	ceph_assert(in);
      }
      ceph_assert(in->is_auth());
      dout(20) << __func__ << ": expiring inode " << *in << dendl;

      // check nonce: a mismatch means we re-replicated to the sender
      // after it queued this expire, so its replica is still live
      if (nonce == in->get_replica_nonce(from)) {
	// remove from our cached_by
	dout(7) << " inode expire on " << *in << " from mds." << from
		<< " cached_by was " << in->get_replicas() << dendl;
	inode_remove_replica(in, from, false, gather_locks);
      }
      else {
	// this is an old nonce, ignore expire.
	dout(7) << " inode expire on " << *in << " from mds." << from
		<< " with old nonce " << nonce
		<< " (current " << in->get_replica_nonce(from) << "), dropping"
		<< dendl;
      }
    }

    // DIRS
    for (const auto &q : p.second.dirs) {
      CDir *dir = get_dirfrag(q.first);
      unsigned nonce = q.second;

      if (!dir) {
	// exact dirfrag is gone; it may have been refragmented
	CInode *diri = get_inode(q.first.ino);
	if (diri) {
	  if (mds->is_rejoin() &&
	      rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
	      !diri->is_replica(from)) {
	    auto&& ls = diri->get_nested_dirfrags();
	    dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
		    << " while rejoining, inode isn't replicated" << dendl;
	    for (const auto& d : ls) {
	      dir = d;
	      if (dir->is_replica(from)) {
		dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
		dir->remove_replica(from);
	      }
	    }
	    continue;
	  }
	  CDir *other = diri->get_approx_dirfrag(q.first.frag);
	  if (other) {
	    dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
		    << " have " << *other << ", mismatched frags, dropping" << dendl;
	    continue;
	  }
	}
	dout(0) << " dir expire on " << q.first << " from " << from
		<< ", don't have it" << dendl;
	ceph_assert(dir);
      }
      dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;

      ceph_assert(dir->is_auth());

      // check nonce
      if (nonce == dir->get_replica_nonce(from)) {
	// remove from our cached_by
	dout(7) << " dir expire on " << *dir << " from mds." << from
		<< " replicas was " << dir->get_replicas() << dendl;
	dir->remove_replica(from);
      }
      else {
	// this is an old nonce, ignore expire.
	dout(7) << " dir expire on " << *dir << " from mds." << from
		<< " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
		<< "), dropping" << dendl;
      }
    }

    // DENTRIES
    for (const auto &pd : p.second.dentries) {
      dout(10) << " dn expires in dir " << pd.first << dendl;
      CInode *diri = get_inode(pd.first.ino);
      ceph_assert(diri);
      CDir *dir = diri->get_dirfrag(pd.first.frag);

      if (!dir) {
	dout(0) << " dn expires on " << pd.first << " from " << from
		<< ", must have refragmented" << dendl;
      } else {
	ceph_assert(dir->is_auth());
      }

      for (const auto &p : pd.second) {
	unsigned nonce = p.second;
	CDentry *dn;

	if (dir) {
	  dn = dir->lookup(p.first.first, p.first.second);
	} else {
	  // which dirfrag for this dentry?
	  CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first));
	  ceph_assert(dir);
	  ceph_assert(dir->is_auth());
	  dn = dir->lookup(p.first.first, p.first.second);
	}

	if (!dn) {
	  if (dir)
	    dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl;
	  else
	    dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << dendl;
	}
	ceph_assert(dn);

	// check nonce (same staleness rule as for inodes above)
	if (nonce == dn->get_replica_nonce(from)) {
	  dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
	  dentry_remove_replica(dn, from, gather_locks);
	}
	else {
	  dout(7) << " dentry_expire on " << *dn << " from mds." << from
		  << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
		  << "), dropping" << dendl;
	}
      }
    }
  }

  // locks whose gather sets shrank may now be able to finish their
  // state transitions
  for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
    if (!(*p)->is_stable())
      mds->locker->eval_gather(*p);
  }
}
7571
7572 void MDCache::process_delayed_expire(CDir *dir)
7573 {
7574 dout(7) << "process_delayed_expire on " << *dir << dendl;
7575 for (const auto &p : delayed_expire[dir]) {
7576 handle_cache_expire(p.second);
7577 }
7578 delayed_expire.erase(dir);
7579 }
7580
7581 void MDCache::discard_delayed_expire(CDir *dir)
7582 {
7583 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7584 delayed_expire.erase(dir);
7585 }
7586
7587 void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7588 set<SimpleLock *>& gather_locks)
7589 {
7590 in->remove_replica(from);
7591 in->set_mds_caps_wanted(from, 0);
7592
7593 // note: this code calls _eval more often than it needs to!
7594 // fix lock
7595 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7596 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7597 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7598 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7599 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7600 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7601
7602 // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
7603 // Don't remove the recovering mds from lock's gathering list because
7604 // it may hold rejoined wrlocks.
7605 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7606 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7607 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7608 }
7609
7610 void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7611 {
7612 dn->remove_replica(from);
7613
7614 // fix lock
7615 if (dn->lock.remove_replica(from))
7616 gather_locks.insert(&dn->lock);
7617
7618 // Replicated strays might now be elegible for purge
7619 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7620 if (dnl->is_primary()) {
7621 maybe_eval_stray(dnl->get_inode());
7622 }
7623 }
7624
/**
 * Revoke client dentry leases whose ttl has passed.
 *
 * Each per-pool list is scanned from the front, stopping at the first
 * lease whose ttl is still in the future (the break below assumes lists
 * are kept in expiry order).  remove_client_lease() unlinks the lease
 * from the list, which is what advances the inner loop.
 */
void MDCache::trim_client_leases()
{
  utime_t now = ceph_clock_now();

  dout(10) << "trim_client_leases" << dendl;

  std::size_t pool = 0;
  for (const auto& list : client_leases) {
    pool += 1;
    if (list.empty())
      continue;

    auto before = list.size();
    while (!list.empty()) {
      ClientLease *r = list.front();
      if (r->ttl > now) break;
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    auto after = list.size();
    dout(10) << "trim_client_leases pool " << pool << " trimmed "
	     << (before-after) << " leases, " << after << " left" << dendl;
  }
}
7650
7651
/**
 * Periodic memory accounting: sample process memory, log/report usage,
 * ask clients to release caps when the cache is too full, and return
 * freed heap to the OS once we are back under the limit.
 */
void MDCache::check_memory_usage()
{
  // one sampler for the process lifetime; 'baseline' latches the first
  // sample (function-local static initializes only once) so later logs
  // show drift from startup
  static MemoryModel mm(g_ceph_context);
  static MemoryModel::snap last;
  mm.sample(&last);
  static MemoryModel::snap baseline = last;

  // check client caps
  ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
  double caps_per_inode = 0.0;
  if (CInode::count())
    caps_per_inode = (double)Capability::count() / (double)CInode::count();

  dout(2) << "Memory usage: "
	  << " total " << last.get_total()
	  << ", rss " << last.get_rss()
	  << ", heap " << last.get_heap()
	  << ", baseline " << baseline.get_heap()
	  << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
	  << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
	  << dendl;

  mds->update_mlogger();
  mds->mlogger->set(l_mdm_rss, last.get_rss());
  mds->mlogger->set(l_mdm_heap, last.get_heap());

  // over the cache limit: recall client state so caps (and then inodes)
  // can be trimmed
  if (cache_toofull()) {
    mds->server->recall_client_state(nullptr, Server::RecallFlags::TRIM);
  }

  // If the cache size had exceeded its limit, but we're back in bounds
  // now, free any unused pool memory so that our memory usage isn't
  // permanently bloated.
  if (exceeded_size_limit && !cache_toofull()) {
    // Only do this once we are back in bounds: otherwise the releases would
    // slow down whatever process caused us to exceed bounds to begin with
    if (ceph_using_tcmalloc()) {
      dout(5) << "check_memory_usage: releasing unused space from tcmalloc"
	      << dendl;
      ceph_heap_release_free_memory();
    }
    exceeded_size_limit = false;
  }
}
7696
7697
7698
7699 // =========================================================================================
7700 // shutdown
7701
// Timer callback that re-runs MDCache::shutdown_check(); shutdown_check()
// itself re-arms the timer with a fresh instance, giving a periodic check.
class C_MDC_ShutdownCheck : public MDCacheContext {
public:
  explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
  void finish(int) override {
    mdcache->shutdown_check();
  }
};
7709
/**
 * Periodic shutdown diagnostic: dump the cache at high debug level and
 * report LRU/log/objecter status, then re-arm itself via the timer.
 */
void MDCache::shutdown_check()
{
  dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;

  // cache
  // temporarily raise debug_mds to 10 so show_cache() is verbose, then
  // restore the previous level
  char old_val[32] = { 0 };
  char *o = old_val;
  g_conf().get_val("debug_mds", &o, sizeof(old_val));
  g_conf().set_val("debug_mds", "10");
  g_conf().apply_changes(nullptr);
  show_cache();
  g_conf().set_val("debug_mds", old_val);
  g_conf().apply_changes(nullptr);
  // re-arm the periodic check
  mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));

  // this
  dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
  dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;


  if (mds->objecter->is_active()) {
    dout(0) << "objecter still active" << dendl;
    mds->objecter->dump_active();
  }
}
7735
7736
7737 void MDCache::shutdown_start()
7738 {
7739 dout(5) << "shutdown_start" << dendl;
7740
7741 if (g_conf()->mds_shutdown_check)
7742 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7743
7744 // g_conf()->debug_mds = 10;
7745 }
7746
7747
7748
// One pass of the shutdown state machine.  Called repeatedly while the rank
// is stopping.  Each pass pushes the cache a bit further toward empty:
// export strays, trim, hand auth subtrees to another rank, close sessions,
// trim/cap the journal, and finally tear down mydir/myin.  Returns true
// only when shutdown is fully complete; false means "call me again".
bool MDCache::shutdown_pass()
{
  dout(7) << "shutdown_pass" << dendl;

  if (mds->is_stopped()) {
    dout(7) << " already shut down" << dendl;
    show_cache();
    show_subtrees();
    return true;
  }

  // empty stray dir
  bool strays_all_exported = shutdown_export_strays();

  // trim cache
  trim(UINT64_MAX);
  dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;

  // Export all subtrees to another active (usually rank 0) if not rank 0
  int num_auth_subtree = 0;
  if (!subtrees.empty() &&
      mds->get_nodeid() != 0) {
    dout(7) << "looking for subtrees to export to mds0" << dendl;
    std::vector<CDir*> ls;
    for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin();
         it != subtrees.end();
         ++it) {
      CDir *dir = it->first;
      if (dir->get_inode()->is_mdsdir())
        continue;
      if (dir->is_auth()) {
        num_auth_subtree++;
        // skip subtrees that can't be exported right now
        if (dir->is_frozen() ||
            dir->is_freezing() ||
            dir->is_ambiguous_dir_auth() ||
            dir->state_test(CDir::STATE_EXPORTING))
          continue;
        ls.push_back(dir);
      }
    }

    migrator->clear_export_queue();
    for (const auto& dir : ls) {
      mds_rank_t dest = dir->get_inode()->authority().first;
      // fall back to rank 0 if the preferred destination isn't active
      if (dest > 0 && !mds->mdsmap->is_active(dest))
        dest = 0;
      dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
      migrator->export_dir_nicely(dir, dest);
    }
  }

  if (!strays_all_exported) {
    dout(7) << "waiting for strays to migrate" << dendl;
    return false;
  }

  if (num_auth_subtree > 0) {
    // rank 0 keeps its subtrees; only non-zero ranks wait here
    ceph_assert(mds->get_nodeid() > 0);
    dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
    show_subtrees();
    return false;
  }

  // close out any sessions (and open files!) before we try to trim the log, etc.
  if (mds->sessionmap.have_unclosed_sessions()) {
    if (!mds->server->terminating_sessions)
      mds->server->terminate_sessions();
    return false;
  }

  // Fully trim the log so that all objects in cache are clean and may be
  // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
  // trim the log such that the cache eventually becomes clean.
  if (mds->mdlog->get_num_segments() > 0) {
    auto ls = mds->mdlog->get_current_segment();
    if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
      // Current segment contains events other than subtreemap or
      // there are dirty dirfrags (see CDir::log_mark_dirty())
      mds->mdlog->start_new_segment();
      mds->mdlog->flush();
    }
  }
  mds->mdlog->trim_all();
  if (mds->mdlog->get_num_segments() > 1) {
    dout(7) << "still >1 segments, waiting for log to trim" << dendl;
    return false;
  }

  // drop our reference to our stray dir inode
  for (int i = 0; i < NUM_STRAY; ++i) {
    if (strays[i] &&
        strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->state_clear(CInode::STATE_STRAYPINNED);
      strays[i]->put(CInode::PIN_STRAY);
      strays[i]->put_stickydirs();
    }
  }

  CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
  if (mydir && !mydir->is_subtree_root())
    mydir = NULL;

  // subtrees map not empty yet?  (mydir itself is allowed to remain)
  if (subtrees.size() > (mydir ? 1 : 0)) {
    dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
    show_subtrees();
    migrator->show_importing();
    migrator->show_exporting();
    if (!migrator->is_importing() && !migrator->is_exporting())
      show_cache();
    return false;
  }
  ceph_assert(!migrator->is_exporting());
  ceph_assert(!migrator->is_importing());

  // replicas may dirty scatter locks
  if (myin && myin->is_replicated()) {
    dout(7) << "still have replicated objects" << dendl;
    return false;
  }

  if ((myin && myin->get_num_auth_pins()) ||
      (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) {
    dout(7) << "still have auth pinned objects" << dendl;
    return false;
  }

  // (only do this once!)
  if (!mds->mdlog->is_capped()) {
    dout(7) << "capping the log" << dendl;
    mds->mdlog->cap();
  }

  if (!mds->mdlog->empty())
    mds->mdlog->trim(0);

  if (!mds->mdlog->empty()) {
    dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
            << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
    return false;
  }

  if (!did_shutdown_log_cap) {
    // flush journal header
    dout(7) << "writing header for (now-empty) journal" << dendl;
    ceph_assert(mds->mdlog->empty());
    mds->mdlog->write_head(0);
    // NOTE: filer active checker below will block us until this completes.
    did_shutdown_log_cap = true;
    return false;
  }

  // filer active?
  if (mds->objecter->is_active()) {
    dout(7) << "objecter still active" << dendl;
    mds->objecter->dump_active();
    return false;
  }

  // trim what we can from the cache
  if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
    dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
    show_cache();
    //dump();
    return false;
  }

  // make mydir subtree go away
  if (mydir) {
    if (mydir->get_num_ref() > 1) { // subtree pin
      dout(7) << "there's still reference to mydir " << *mydir << dendl;
      show_cache();
      return false;
    }

    remove_subtree(mydir);
    myin->close_dirfrag(mydir->get_frag());
  }
  ceph_assert(subtrees.empty());

  if (myin) {
    remove_inode(myin);
    ceph_assert(!myin);
  }

  if (global_snaprealm) {
    remove_inode(global_snaprealm->inode);
    global_snaprealm = nullptr;
  }

  // done!
  dout(5) << "shutdown done." << dendl;
  return true;
}
7943
// Export the contents of this rank's stray directories to rank 0 so the
// rank can stop.  Progress is resumable: shutdown_export_next is a
// (dirfrag, dentry-name) cursor, and shutdown_exporting_strays tracks
// inodes currently being exported/purged (throttled to MAX_EXPORTING in
// flight).  Returns true only once everything has been exported.
bool MDCache::shutdown_export_strays()
{
  static const unsigned MAX_EXPORTING = 100;

  // rank 0 is the destination; it has nothing to export
  if (mds->get_nodeid() == 0)
    return true;

  // throttle: don't start more work while >= 2/3 of the budget is in flight
  if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
    return false;

  dout(10) << "shutdown_export_strays " << shutdown_export_next.first
           << " '" << shutdown_export_next.second << "'" << dendl;

  bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
  bool all_exported = false;

again:
  auto next = shutdown_export_next;

  for (int i = 0; i < NUM_STRAY; ++i) {
    CInode *strayi = strays[i];
    if (!strayi ||
        !strayi->state_test(CInode::STATE_STRAYPINNED))
      continue;
    if (strayi->ino() < next.first.ino)
      continue;  // already past this stray dir

    deque<CDir*> dfls;
    strayi->get_dirfrags(dfls);

    while (!dfls.empty()) {
      CDir *dir = dfls.front();
      dfls.pop_front();

      if (dir->dirfrag() < next.first)
        continue;  // cursor is beyond this dirfrag
      if (next.first < dir->dirfrag()) {
        // entering a new dirfrag: reset the dentry-name cursor
        next.first = dir->dirfrag();
        next.second.clear();
      }

      if (!dir->is_complete()) {
        // need the full dirfrag contents; fetch and retry from the
        // completion (only hook a callback if nothing else is in flight)
        MDSContext *fin = nullptr;
        if (shutdown_exporting_strays.empty()) {
          fin = new MDSInternalContextWrapper(mds,
                  new LambdaContext([this](int r) {
                    shutdown_export_strays();
                  })
                );
        }
        dir->fetch(fin);
        goto done;
      }

      // position the iterator at the dentry-name cursor
      CDir::dentry_key_map::iterator it;
      if (next.second.empty()) {
        it = dir->begin();
      } else {
        auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
        it = dir->lower_bound(dentry_key_t(0, next.second, hash));
      }

      for (; it != dir->end(); ++it) {
        CDentry *dn = it->second;
        CDentry::linkage_t *dnl = dn->get_projected_linkage();
        if (dnl->is_null())
          continue;

        // can't migrate to an inactive rank 0; remember where we stopped
        if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
          next.second = it->first.name;
          goto done;
        }

        auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
        if (!ret.second) {
          dout(10) << "already exporting/purging " << *dn << dendl;
          continue;
        }

        // Don't try to migrate anything that is actually
        // being purged right now
        if (!dn->state_test(CDentry::STATE_PURGING))
          stray_manager.migrate_stray(dn, mds_rank_t(0));  // send to root!

        if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
          // budget exhausted: record the resume point and stop
          ++it;
          if (it != dir->end()) {
            next.second = it->first.name;
          } else {
            if (dfls.empty())
              next.first.ino.val++;  // advance to the next stray dir
            else
              next.first = dfls.front()->dirfrag();
            next.second.clear();
          }
          goto done;
        }
      }
    }
  }

  if (shutdown_exporting_strays.empty()) {
    // nothing in flight; if the cursor ever advanced, rewind to the first
    // stray dirfrag and re-scan once to make sure nothing was missed
    dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
    if (first_df < shutdown_export_next.first ||
        !shutdown_export_next.second.empty()) {
      shutdown_export_next.first = first_df;
      shutdown_export_next.second.clear();
      goto again;
    }
    all_exported = true;
  }

done:
  shutdown_export_next = next;
  return all_exported;
}
8060
8061 // ========= messaging ==============
8062
// Top-level dispatch for inter-MDS cache messages: resolve, rejoin,
// discover, dir update, cache expire, dentry link/unlink, fragment
// notify, find/open ino, and snap updates.  Unknown types abort.
void MDCache::dispatch(const cref_t<Message> &m)
{
  switch (m->get_type()) {

    // RESOLVE
  case MSG_MDS_RESOLVE:
    handle_resolve(ref_cast<MMDSResolve>(m));
    break;
  case MSG_MDS_RESOLVEACK:
    handle_resolve_ack(ref_cast<MMDSResolveAck>(m));
    break;

    // REJOIN
  case MSG_MDS_CACHEREJOIN:
    handle_cache_rejoin(ref_cast<MMDSCacheRejoin>(m));
    break;

  case MSG_MDS_DISCOVER:
    handle_discover(ref_cast<MDiscover>(m));
    break;
  case MSG_MDS_DISCOVERREPLY:
    handle_discover_reply(ref_cast<MDiscoverReply>(m));
    break;

  case MSG_MDS_DIRUPDATE:
    handle_dir_update(ref_cast<MDirUpdate>(m));
    break;

  case MSG_MDS_CACHEEXPIRE:
    handle_cache_expire(ref_cast<MCacheExpire>(m));
    break;

  case MSG_MDS_DENTRYLINK:
    handle_dentry_link(ref_cast<MDentryLink>(m));
    break;
  case MSG_MDS_DENTRYUNLINK:
    handle_dentry_unlink(ref_cast<MDentryUnlink>(m));
    break;

  case MSG_MDS_FRAGMENTNOTIFY:
    handle_fragment_notify(ref_cast<MMDSFragmentNotify>(m));
    break;
  case MSG_MDS_FRAGMENTNOTIFYACK:
    handle_fragment_notify_ack(ref_cast<MMDSFragmentNotifyAck>(m));
    break;

  case MSG_MDS_FINDINO:
    handle_find_ino(ref_cast<MMDSFindIno>(m));
    break;
  case MSG_MDS_FINDINOREPLY:
    handle_find_ino_reply(ref_cast<MMDSFindInoReply>(m));
    break;

  case MSG_MDS_OPENINO:
    handle_open_ino(ref_cast<MMDSOpenIno>(m));
    break;
  case MSG_MDS_OPENINOREPLY:
    handle_open_ino_reply(ref_cast<MMDSOpenInoReply>(m));
    break;

  case MSG_MDS_SNAPUPDATE:
    handle_snap_update(ref_cast<MMDSSnapUpdate>(m));
    break;

  default:
    derr << "cache unknown message " << m->get_type() << dendl;
    ceph_abort_msg("cache unknown message");
  }
}
8132
// Walk `path` from its base ino through the cache, acquiring locks and
// issuing discovers/fetches/forwards as dictated by `flags`
// (MDS_TRAVERSE_*).  Return values (as established by the code below):
//   0  - success; *pin is the final inode, *pdnvec the dentry trace
//   1  - in progress; a waiter/discover/fetch was queued and the caller
//        will be retried via `cf`
//   2  - the request was forwarded to another rank
//  <0  - error (-ENOTDIR, -ENOENT, -ESTALE, -EINVAL, -EIO)
int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
                           const filepath& path, int flags,
                           vector<CDentry*> *pdnvec, CInode **pin)
{
  // decode flags into behavior switches
  bool discover = (flags & MDS_TRAVERSE_DISCOVER);
  bool forward = !discover;
  bool path_locked = (flags & MDS_TRAVERSE_PATH_LOCKED);
  bool want_dentry = (flags & MDS_TRAVERSE_WANT_DENTRY);
  bool want_auth = (flags & MDS_TRAVERSE_WANT_AUTH);
  bool rdlock_snap = (flags & (MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_SNAP2));
  bool rdlock_path = (flags & MDS_TRAVERSE_RDLOCK_PATH);
  bool xlock_dentry = (flags & MDS_TRAVERSE_XLOCK_DENTRY);
  bool rdlock_authlock = (flags & MDS_TRAVERSE_RDLOCK_AUTHLOCK);

  if (forward)
    ceph_assert(mdr);  // forward requires a request

  snapid_t snapid = CEPH_NOSNAP;
  if (mdr)
    mdr->snapid = snapid;

  client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;

  if (mds->logger) mds->logger->inc(l_mds_traverse);

  dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
  CInode *cur = get_inode(path.get_ino());
  if (!cur) {
    // base inode not in cache: special-case mdsdir and stray inos,
    // otherwise the path is stale
    if (MDS_INO_IS_MDSDIR(path.get_ino())) {
      open_foreign_mdsdir(path.get_ino(), cf.build());
      return 1;
    }
    if (MDS_INO_IS_STRAY(path.get_ino())) {
      // re-resolve a stray dir via its owner's mdsdir
      mds_rank_t rank = MDS_INO_STRAY_OWNER(path.get_ino());
      unsigned idx = MDS_INO_STRAY_INDEX(path.get_ino());
      filepath path(strays[idx]->get_parent_dn()->get_name(),
                    MDS_INO_MDSDIR(rank));
      MDRequestRef null_ref;
      return path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, nullptr);
    }
    return -ESTALE;
  }
  if (cur->state_test(CInode::STATE_PURGING))
    return -ESTALE;

  // make sure snaprealm are open...
  if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
      !cur->snaprealm->open_parents(cf.build())) {
    return 1;
  }

  if (flags & MDS_TRAVERSE_CHECK_LOCKCACHE)
    mds->locker->find_and_attach_lock_cache(mdr, cur);

  if (mdr && mdr->lock_cache) {
    // a lock cache covers the snap/layout locks for us
    if (flags & MDS_TRAVERSE_WANT_DIRLAYOUT)
      mdr->dir_layout = mdr->lock_cache->get_dir_layout();
  } else if (rdlock_snap) {
    int n = (flags & MDS_TRAVERSE_RDLOCK_SNAP2) ? 1 : 0;
    if ((n == 0 && !(mdr->locking_state & MutationImpl::SNAP_LOCKED)) ||
        (n == 1 && !(mdr->locking_state & MutationImpl::SNAP2_LOCKED))) {
      bool want_layout = (flags & MDS_TRAVERSE_WANT_DIRLAYOUT);
      if (!mds->locker->try_rdlock_snap_layout(cur, mdr, n, want_layout))
        return 1;
    }
  }

  // start trace
  if (pdnvec)
    pdnvec->clear();
  if (pin)
    *pin = cur;

  MutationImpl::LockOpVec lov;

  for (unsigned depth = 0; depth < path.depth(); ) {
    dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
             << "' snapid " << snapid << dendl;

    if (!cur->is_dir()) {
      dout(7) << "traverse: " << *cur << " not a dir " << dendl;
      return -ENOTDIR;
    }

    // walk into snapdir?
    if (path[depth].length() == 0) {
      dout(10) << "traverse: snapdir" << dendl;
      if (!mdr || depth > 0) // snapdir must be the first component
        return -EINVAL;
      snapid = CEPH_SNAPDIR;
      mdr->snapid = snapid;
      depth++;
      continue;
    }
    // walk thru snapdir?
    if (snapid == CEPH_SNAPDIR) {
      if (!mdr)
        return -EINVAL;
      // resolve the snapshot name to a concrete snapid
      SnapRealm *realm = cur->find_snaprealm();
      snapid = realm->resolve_snapname(path[depth], cur->ino());
      dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
      if (!snapid) {
        if (pdnvec)
          pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
        return -ENOENT;
      }
      mdr->snapid = snapid;
      depth++;
      continue;
    }

    // open dir
    frag_t fg = cur->pick_dirfrag(path[depth]);
    CDir *curdir = cur->get_dirfrag(fg);
    if (!curdir) {
      if (cur->is_auth()) {
        // parent dir frozen_dir?
        if (cur->is_frozen()) {
          dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
          cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
          return 1;
        }
        curdir = cur->get_or_open_dirfrag(this, fg);
      } else {
        // discover?
        dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
        discover_path(cur, snapid, path.postfixpath(depth), cf.build(),
                      path_locked);
        if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
        return 1;
      }
    }
    ceph_assert(curdir);

#ifdef MDS_VERIFY_FRAGSTAT
    if (curdir->is_complete())
      curdir->verify_fragstat();
#endif

    // frozen?
    /*
    if (curdir->is_frozen()) {
    // doh!
      // FIXME: traverse is allowed?
      dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
      curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
      if (onfinish) delete onfinish;
      return 1;
    }
    */

    // when the caller wants the (auth) dentry itself, ensure we are auth
    // for the final component's dirfrag before looking it up
    if (want_auth && want_dentry && depth == path.depth() - 1) {
      if (curdir->is_ambiguous_auth()) {
        dout(10) << "waiting for single auth on " << *curdir << dendl;
        curdir->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
        return 1;
      }
      if (!curdir->is_auth()) {
        dout(10) << "fw to auth for " << *curdir << dendl;
        request_forward(mdr, curdir->authority().first);
        return 2;
      }
    }

    // Before doing dirfrag->dn lookup, compare with DamageTable's
    // record of which dentries were unreadable
    if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
      dout(4) << "traverse: stopped lookup at damaged dentry "
              << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
      return -EIO;
    }

    // dentry
    CDentry *dn = curdir->lookup(path[depth], snapid);
    if (dn) {
      if (dn->state_test(CDentry::STATE_PURGING))
        return -ENOENT;

      if (rdlock_path) {
        lov.clear();
        if (xlock_dentry && depth == path.depth() - 1) {
          // last component and caller wants it xlocked; also wrlock the
          // parent's file/nest locks unless a lock cache covers them
          if (depth > 0 || !mdr->lock_cache) {
            lov.add_wrlock(&cur->filelock);
            lov.add_wrlock(&cur->nestlock);
            if (rdlock_authlock)
              lov.add_rdlock(&cur->authlock);
          }
          lov.add_xlock(&dn->lock);
        } else {
          // force client to flush async dir operation if necessary
          if (cur->filelock.is_cached())
            lov.add_wrlock(&cur->filelock);
          lov.add_rdlock(&dn->lock);
        }
        if (!mds->locker->acquire_locks(mdr, lov)) {
          dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
          return 1;
        }
      } else if (!path_locked &&
                 !dn->lock.can_read(client) &&
                 !(dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
        dout(10) << "traverse: non-readable dentry at " << *dn << dendl;
        dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
        if (mds->logger)
          mds->logger->inc(l_mds_traverse_lock);
        if (dn->is_auth() && dn->lock.is_unstable_and_locked())
          mds->mdlog->flush();
        return 1;
      }

      if (pdnvec)
        pdnvec->push_back(dn);

      CDentry::linkage_t *dnl = dn->get_projected_linkage();
      // can we conclude ENOENT?
      if (dnl->is_null()) {
        dout(10) << "traverse: null+readable dentry at " << *dn << dendl;
        if (depth == path.depth() - 1) {
          if (want_dentry)
            break;
        } else {
          if (pdnvec)
            pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
        }
        return -ENOENT;
      }

      // do we have inode?
      CInode *in = dnl->get_inode();
      if (!in) {
        ceph_assert(dnl->is_remote());
        // do i have it?
        in = get_inode(dnl->get_remote_ino());
        if (in) {
          dout(7) << "linking in remote in " << *in << dendl;
          dn->link_remote(dnl, in);
        } else {
          dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
          ceph_assert(mdr);  // we shouldn't hit non-primary dentries doing a non-mdr traversal!
          if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
            dout(4) << "traverse: remote dentry points to damaged ino "
                    << *dn << dendl;
            return -EIO;
          }
          open_remote_dentry(dn, true, cf.build(),
                             (path_locked && depth == path.depth() - 1));
          if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
          return 1;
        }
      }

      cur = in;
      // make sure snaprealm are open...
      if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
          !cur->snaprealm->open_parents(cf.build())) {
        return 1;
      }

      if (rdlock_snap && !(want_dentry && depth == path.depth() - 1)) {
        lov.clear();
        lov.add_rdlock(&cur->snaplock);
        if (!mds->locker->acquire_locks(mdr, lov)) {
          dout(10) << "traverse: failed to rdlock " << cur->snaplock << " " << *cur << dendl;
          return 1;
        }
      }

      // add to trace, continue.
      touch_inode(cur);
      if (pin)
        *pin = cur;
      depth++;
      continue;
    }

    ceph_assert(!dn);

    // MISS.  dentry doesn't exist.
    dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;

    if (curdir->is_auth()) {
      // dentry is mine.
      if (curdir->is_complete() ||
          (snapid == CEPH_NOSNAP &&
           curdir->has_bloom() &&
           !curdir->is_in_bloom(path[depth]))) {
        // file not found
        if (pdnvec) {
          // instantiate a null dn?
          if (depth < path.depth() - 1) {
            dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
          } else if (snapid < CEPH_MAXSNAP) {
            dout(20) << " not adding null for snapid " << snapid << dendl;
          } else if (curdir->is_frozen()) {
            dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
            curdir->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
            return 1;
          } else {
            // create a null dentry
            dn = curdir->add_null_dentry(path[depth]);
            dout(20) << " added null " << *dn << dendl;

            if (rdlock_path) {
              lov.clear();
              if (xlock_dentry) {
                if (depth > 0 || !mdr->lock_cache) {
                  lov.add_wrlock(&cur->filelock);
                  lov.add_wrlock(&cur->nestlock);
                  if (rdlock_authlock)
                    lov.add_rdlock(&cur->authlock);
                }
                lov.add_xlock(&dn->lock);
              } else {
                // force client to flush async dir operation if necessary
                if (cur->filelock.is_cached())
                  lov.add_wrlock(&cur->filelock);
                lov.add_rdlock(&dn->lock);
              }
              if (!mds->locker->acquire_locks(mdr, lov)) {
                dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
                return 1;
              }
            }
          }
          if (dn) {
            pdnvec->push_back(dn);
            if (want_dentry)
              break;
          } else {
            pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
          }
        }
        return -ENOENT;
      } else {

        // Check DamageTable for missing fragments before trying to fetch
        // this
        if (mds->damage_table.is_dirfrag_damaged(curdir)) {
          dout(4) << "traverse: damaged dirfrag " << *curdir
                  << ", blocking fetch" << dendl;
          return -EIO;
        }

        // directory isn't complete; reload
        dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
        touch_inode(cur);
        curdir->fetch(cf.build(), path[depth]);
        if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
        return 1;
      }
    } else {
      // dirfrag/dentry is not mine.
      mds_authority_t dauth = curdir->authority();

      if (!forward_all_requests_to_auth &&
          forward &&
          mdr && mdr->client_request &&
          (int)depth < mdr->client_request->get_num_fwd()){
        dout(7) << "traverse: snap " << snapid << " and depth " << depth
                << " < fwd " << mdr->client_request->get_num_fwd()
                << ", discovering instead of forwarding" << dendl;
        discover = true;
      }

      if ((discover)) {
        dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
        discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
                      path_locked);
        if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
        return 1;
      }
      if (forward) {
        // forward
        dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;

        if (curdir->is_ambiguous_auth()) {
          // wait
          dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
          curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
          return 1;
        }

        dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;

        request_forward(mdr, dauth.first);

        if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
        return 2;
      }
    }

    ceph_abort();  // i shouldn't get here
  }

  // if the caller needs auth on the final inode (and not just the dentry),
  // wait out ambiguous auth or forward to the auth rank
  if (want_auth && !want_dentry) {
    if (cur->is_ambiguous_auth()) {
      dout(10) << "waiting for single auth on " << *cur << dendl;
      cur->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
      return 1;
    }
    if (!cur->is_auth()) {
      dout(10) << "fw to auth for " << *cur << dendl;
      request_forward(mdr, cur->authority().first);
      return 2;
    }
  }

  // success.
  if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
  dout(10) << "path_traverse finish on snapid " << snapid << dendl;
  if (mdr)
    ceph_assert(mdr->snapid == snapid);

  // record which locks were taken so callers don't re-acquire them
  if (flags & MDS_TRAVERSE_RDLOCK_SNAP)
    mdr->locking_state |= MutationImpl::SNAP_LOCKED;
  else if (flags & MDS_TRAVERSE_RDLOCK_SNAP2)
    mdr->locking_state |= MutationImpl::SNAP2_LOCKED;

  if (rdlock_path)
    mdr->locking_state |= MutationImpl::PATH_LOCKED;

  return 0;
}
8556
8557 CInode *MDCache::cache_traverse(const filepath& fp)
8558 {
8559 dout(10) << "cache_traverse " << fp << dendl;
8560
8561 CInode *in;
8562 if (fp.get_ino())
8563 in = get_inode(fp.get_ino());
8564 else
8565 in = root;
8566 if (!in)
8567 return NULL;
8568
8569 for (unsigned i = 0; i < fp.depth(); i++) {
8570 std::string_view dname = fp[i];
8571 frag_t fg = in->pick_dirfrag(dname);
8572 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8573 CDir *curdir = in->get_dirfrag(fg);
8574 if (!curdir)
8575 return NULL;
8576 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8577 if (!dn)
8578 return NULL;
8579 in = dn->get_linkage()->get_inode();
8580 if (!in)
8581 return NULL;
8582 }
8583 dout(10) << " got " << *in << dendl;
8584 return in;
8585 }
8586
8587
8588 /**
8589 * open_remote_dir -- open up a remote dirfrag
8590 *
8591 * @param diri base inode
8592 * @param approxfg approximate fragment.
8593 * @param fin completion callback
8594 */
void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin)
{
  dout(10) << "open_remote_dir on " << *diri << dendl;
  // only valid for a non-auth directory inode whose dirfrag isn't open yet
  ceph_assert(diri->is_dir());
  ceph_assert(!diri->is_auth());
  ceph_assert(diri->get_dirfrag(approxfg) == 0);

  discover_dir_frag(diri, approxfg, fin);
}
8604
8605
8606 /**
8607 * get_dentry_inode - get or open inode
8608 *
8609 * @param dn the dentry
8610 * @param mdr current request
8611 *
8612 * will return inode for primary, or link up/open up remote link's inode as necessary.
8613 * If it's not available right now, puts mdr on wait list and returns null.
8614 */
8615 CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8616 {
8617 CDentry::linkage_t *dnl;
8618 if (projected)
8619 dnl = dn->get_projected_linkage();
8620 else
8621 dnl = dn->get_linkage();
8622
8623 ceph_assert(!dnl->is_null());
8624
8625 if (dnl->is_primary())
8626 return dnl->inode;
8627
8628 ceph_assert(dnl->is_remote());
8629 CInode *in = get_inode(dnl->get_remote_ino());
8630 if (in) {
8631 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8632 dn->link_remote(dnl, in);
8633 return in;
8634 } else {
8635 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8636 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8637 return 0;
8638 }
8639 }
8640
// Completion for open_remote_dentry(): holds a PIN_PTRWAITER ref on the
// dentry for the duration of the open, then hands the result to
// _open_remote_dentry_finish().
struct C_MDC_OpenRemoteDentry : public MDCacheContext {
  CDentry *dn;          // dentry whose remote linkage is being opened
  inodeno_t ino;        // remote ino recorded at dispatch time
  MDSContext *onfinish; // caller's completion, invoked by the finish path
  bool want_xlocked;
  C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSContext *f, bool wx) :
    MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
    dn->get(MDSCacheObject::PIN_PTRWAITER);  // keep dn alive until finish()
  }
  void finish(int r) override {
    mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
    dn->put(MDSCacheObject::PIN_PTRWAITER);
  }
};
8655
// Open the inode a remote dentry points at, by ino.  Directories carry a
// backtrace in the metadata pool; everything else uses pool -1 (default
// data pool lookup inside open_ino).
void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
{
  dout(10) << "open_remote_dentry " << *dn << dendl;
  CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
  inodeno_t ino = dnl->get_remote_ino();
  int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
  open_ino(ino, pool,
           new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
}
8665
// Completion for open_remote_dentry().  On failure, if the dentry still
// points at the same remote ino, mark it bad and report the damage (which
// may respawn the daemon); if the linkage changed underneath us, the error
// is stale and is swallowed (r reset to 0).
void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
                                         bool want_xlocked, int r)
{
  if (r < 0) {
    CDentry::linkage_t *dnl = dn->get_projected_linkage();
    if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
      dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
      dn->state_set(CDentry::STATE_BADREMOTEINO);

      // build a human-readable path for the damage table entry
      std::string path;
      CDir *dir = dn->get_dir();
      if (dir) {
        dir->get_inode()->make_path_string(path);
        path += "/";
        path += dn->get_name();
      }

      bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
      if (fatal) {
        mds->damaged();
        ceph_abort();  // unreachable, damaged() respawns us
      }
    } else {
      // linkage changed while the open was in flight; error no longer applies
      r = 0;
    }
  }
  fin->complete(r < 0 ? r : 0);
}
8694
8695
8696 void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8697 {
8698 // empty trace if we're a base inode
8699 if (in->is_base())
8700 return;
8701
8702 CInode *parent = in->get_parent_inode();
8703 ceph_assert(parent);
8704 make_trace(trace, parent);
8705
8706 CDentry *dn = in->get_parent_dn();
8707 dout(15) << "make_trace adding " << *dn << dendl;
8708 trace.push_back(dn);
8709 }
8710
8711
8712 // -------------------------------------------------------------------------------
8713 // Open inode by inode number
8714
8715 class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8716 inodeno_t ino;
8717 public:
8718 bufferlist bl;
8719 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8720 MDCacheIOContext(c), ino(i) {}
8721 void finish(int r) override {
8722 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8723 }
8724 void print(ostream& out) const override {
8725 out << "openino_backtrace_fetch" << ino << ")";
8726 }
8727 };
8728
// Completion used while traversing ancestor directories during open-by-ino.
// If a message `msg` is attached, the result is fed back into
// handle_open_ino(); otherwise the locally tracked open_ino_info_t entry
// continues via _open_ino_traverse_dir().
struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
  inodeno_t ino;
  cref_t<MMDSOpenIno> msg;  // set when servicing a peer's open-ino request
  bool parent;              // true if this step was opening a parent dir
  public:
  C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const cref_t<MMDSOpenIno> &m, bool p) :
    MDCacheContext(c), ino(i), msg(m), parent(p) {}
  void finish(int r) override {
    // a failure on a non-parent step is retryable, not fatal
    if (r < 0 && !parent)
      r = -EAGAIN;
    if (msg) {
      mdcache->handle_open_ino(msg, r);
      return;
    }
    auto& info = mdcache->opening_inodes.at(ino);
    mdcache->_open_ino_traverse_dir(ino, info, r);
  }
};
8747
// Completion fired once the parent inode of an open-by-ino target has been
// opened; forwards the result to _open_ino_parent_opened().
struct C_MDC_OpenInoParentOpened : public MDCacheContext {
  inodeno_t ino;
  public:
  C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
  void finish(int r) override {
    mdcache->_open_ino_parent_opened(ino, r);
  }
};
8756
8757 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8758 {
8759 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8760
8761 open_ino_info_t& info = opening_inodes.at(ino);
8762
8763 CInode *in = get_inode(ino);
8764 if (in) {
8765 dout(10) << " found cached " << *in << dendl;
8766 open_ino_finish(ino, info, in->authority().first);
8767 return;
8768 }
8769
8770 inode_backtrace_t backtrace;
8771 if (err == 0) {
8772 try {
8773 decode(backtrace, bl);
8774 } catch (const buffer::error &decode_exc) {
8775 derr << "corrupt backtrace on ino x0" << std::hex << ino
8776 << std::dec << ": " << decode_exc << dendl;
8777 open_ino_finish(ino, info, -EIO);
8778 return;
8779 }
8780 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8781 dout(10) << " old object in pool " << info.pool
8782 << ", retrying pool " << backtrace.pool << dendl;
8783 info.pool = backtrace.pool;
8784 C_IO_MDC_OpenInoBacktraceFetched *fin =
8785 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8786 fetch_backtrace(ino, info.pool, fin->bl,
8787 new C_OnFinisher(fin, mds->finisher));
8788 return;
8789 }
8790 } else if (err == -ENOENT) {
8791 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8792 if (info.pool != meta_pool) {
8793 dout(10) << " no object in pool " << info.pool
8794 << ", retrying pool " << meta_pool << dendl;
8795 info.pool = meta_pool;
8796 C_IO_MDC_OpenInoBacktraceFetched *fin =
8797 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8798 fetch_backtrace(ino, info.pool, fin->bl,
8799 new C_OnFinisher(fin, mds->finisher));
8800 return;
8801 }
8802 err = 0; // backtrace.ancestors.empty() is checked below
8803 }
8804
8805 if (err == 0) {
8806 if (backtrace.ancestors.empty()) {
8807 dout(10) << " got empty backtrace " << dendl;
8808 err = -ESTALE;
8809 } else if (!info.ancestors.empty()) {
8810 if (info.ancestors[0] == backtrace.ancestors[0]) {
8811 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8812 err = -EINVAL;
8813 } else {
8814 info.last_err = 0;
8815 }
8816 }
8817 }
8818 if (err) {
8819 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8820 if (info.last_err)
8821 err = info.last_err;
8822 open_ino_finish(ino, info, err);
8823 return;
8824 }
8825
8826 dout(10) << " got backtrace " << backtrace << dendl;
8827 info.ancestors = backtrace.ancestors;
8828
8829 _open_ino_traverse_dir(ino, info, 0);
8830 }
8831
/*
 * Callback for when the parent inode of 'ino' has been opened (see
 * C_MDC_OpenInoParentOpened).  'ret' is either a negative error or the
 * rank of the MDS that is authoritative for the parent.
 */
void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
{
  dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;

  open_ino_info_t& info = opening_inodes.at(ino);

  // the inode may have been brought into cache while we were opening
  // the parent; if so, we are done
  CInode *in = get_inode(ino);
  if (in) {
    dout(10) << " found cached " << *in << dendl;
    open_ino_finish(ino, info, in->authority().first);
    return;
  }

  if (ret == mds->get_nodeid()) {
    // we are auth for the parent; walk down from it locally
    _open_ino_traverse_dir(ino, info, 0);
  } else {
    if (ret >= 0) {
      // parent lives on another rank: remember it as a hint and make
      // it eligible to be queried again
      mds_rank_t checked_rank = mds_rank_t(ret);
      info.check_peers = true;
      info.auth_hint = checked_rank;
      info.checked.erase(checked_rank);
    }
    do_open_ino(ino, info, ret);
  }
}
8857
/*
 * Continue resolving 'ino' by traversing its recorded ancestor chain.
 * 'ret' carries the result of the previous async step; a nonzero value
 * aborts the traversal and falls back to do_open_ino().
 */
void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
{
  dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;

  CInode *in = get_inode(ino);
  if (in) {
    dout(10) << " found cached " << *in << dendl;
    open_ino_finish(ino, info, in->authority().first);
    return;
  }

  if (ret) {
    do_open_ino(ino, info, ret);
    return;
  }

  mds_rank_t hint = info.auth_hint;
  ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
			      info.discover, info.want_xlocked, &hint);
  if (ret > 0)
    return;  // blocked waiting on an async fetch/discover; will re-enter
  if (hint != mds->get_nodeid())
    info.auth_hint = hint;
  do_open_ino(ino, info, ret);
}
8883
/*
 * Fetch a dirfrag in order to make progress opening 'ino'.  The passed
 * MMDSOpenIno message (may be null for a local open) is re-dispatched
 * once the fetch completes; 'parent' indicates whether this dirfrag is
 * the one expected to contain 'ino' itself.
 */
void MDCache::_open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, CDir *dir, bool parent)
{
  if (dir->state_test(CDir::STATE_REJOINUNDEF))
    ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
  dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
  if (mds->logger)
    mds->logger->inc(l_mds_openino_dir_fetch);
}
8892
/*
 * Walk down the ancestor chain (deepest ancestor first) trying to reach
 * the dirfrag that should contain 'ino'.
 *
 * Returns > 0 if the traversal is blocked waiting for an async event
 * (dir fetch, discover, unfreeze, lock); a C_MDC_OpenInoTraverseDir
 * context re-enters when it fires.  Returns 0 when no more local
 * progress is possible, or a negative error (-ENOTDIR/-ENOENT) if the
 * immediate parent is known not to contain the inode.  '*hint' is set
 * to the likely auth rank of the containing dirfrag when we cannot
 * resolve it ourselves.
 */
int MDCache::open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m,
				   const vector<inode_backpointer_t>& ancestors,
				   bool discover, bool want_xlocked, mds_rank_t *hint)
{
  dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
  int err = 0;
  for (unsigned i = 0; i < ancestors.size(); i++) {
    const auto& ancestor = ancestors.at(i);
    CInode *diri = get_inode(ancestor.dirino);

    if (!diri) {
      // other ranks' mdsdirs can always be opened remotely
      if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) {
	open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
	return 1;
      }
      continue;  // try a shallower (closer to root) ancestor
    }

    if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
      // rejoin placeholder inode: fetch the nearest well-defined
      // ancestor dirfrag so it becomes defined
      CDir *dir = diri->get_parent_dir();
      while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
	     dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
	dir = dir->get_inode()->get_parent_dir();
      _open_ino_fetch_dir(ino, m, dir, i == 0);
      return 1;
    }

    if (!diri->is_dir()) {
      dout(10) << " " << *diri << " is not dir" << dendl;
      if (i == 0)
	err = -ENOTDIR;
      break;
    }

    const string& name = ancestor.dname;
    frag_t fg = diri->pick_dirfrag(name);
    CDir *dir = diri->get_dirfrag(fg);
    if (!dir) {
      if (diri->is_auth()) {
	if (diri->is_frozen()) {
	  dout(10) << " " << *diri << " is frozen, waiting " << dendl;
	  diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
	  return 1;
	}
	dir = diri->get_or_open_dirfrag(this, fg);
      } else if (discover) {
	open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
	return 1;
      }
    }
    if (dir) {
      // the inode this dirfrag's dentry should link to: the previous
      // (deeper) ancestor's dir, or 'ino' itself at the tail
      inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino;
      CDentry *dn = dir->lookup(name);
      CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
      if (dir->is_auth()) {
	if (dnl && dnl->is_primary() &&
	    dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
	  dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
	  _open_ino_fetch_dir(ino, m, dir, i == 0);
	  return 1;
	}

	// the dentry may exist on disk but not be loaded yet
	if (!dnl && !dir->is_complete() &&
	    (!dir->has_bloom() || dir->is_in_bloom(name))) {
	  dout(10) << " fetching incomplete " << *dir << dendl;
	  _open_ino_fetch_dir(ino, m, dir, i == 0);
	  return 1;
	}

	dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
	if (i == 0)
	  err = -ENOENT;
      } else if (discover) {
	if (!dnl) {
	  filepath path(name, 0);
	  discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
			(i == 0 && want_xlocked));
	  return 1;
	}
	if (dnl->is_null() && !dn->lock.can_read(-1)) {
	  dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
	  dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
	  return 1;
	}
	dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
	if (i == 0)
	  err = -ENOENT;
      }
    }
    if (hint && i == 0)
      *hint = dir ? dir->authority().first : diri->authority().first;
    break;
  }
  return err;
}
8988
8989 void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
8990 {
8991 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
8992
8993 MDSContext::vec waiters;
8994 waiters.swap(info.waiters);
8995 opening_inodes.erase(ino);
8996 finish_contexts(g_ceph_context, waiters, ret);
8997 }
8998
/*
 * Advance the open_ino state machine for 'ino' after the previous step
 * finished with 'err'.  Strategy order: query peer MDSs (optionally
 * with ancestry discovered so far), fetch the backtrace object, then
 * open the parent inode and traverse down from it.
 */
void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
{
  if (err < 0 && err != -EAGAIN) {
    // hard error: reset and start over from the backtrace, remembering
    // the error unless it merely reflects stale ancestry info
    info.checked.clear();
    info.checking = MDS_RANK_NONE;
    info.check_peers = true;
    info.fetch_backtrace = true;
    if (info.discover) {
      info.discover = false;
      info.ancestors.clear();
    }
    if (err != -ENOENT && err != -ENOTDIR)
      info.last_err = err;
  }

  if (info.check_peers || info.discover) {
    if (info.discover) {
      // got backtrace from peer, but failed to find inode. re-check peers
      info.discover = false;
      info.ancestors.clear();
      info.checked.clear();
    }
    info.check_peers = false;
    info.checking = MDS_RANK_NONE;
    do_open_ino_peer(ino, info);
  } else if (info.fetch_backtrace) {
    // fetch the backtrace object from info.pool; arrange to re-check
    // peers afterwards if the backtrace doesn't pan out
    info.check_peers = true;
    info.fetch_backtrace = false;
    info.checking = mds->get_nodeid();
    info.checked.clear();
    C_IO_MDC_OpenInoBacktraceFetched *fin =
      new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
    fetch_backtrace(ino, info.pool, fin->bl,
		    new C_OnFinisher(fin, mds->finisher));
  } else {
    // we have ancestry: open the immediate parent, then traverse down
    ceph_assert(!info.ancestors.empty());
    info.checking = mds->get_nodeid();
    open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
	     new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
  }
}
9040
/*
 * Pick the next MDS peer to ask about 'ino' and send it an MMDSOpenIno.
 * Prefers the recorded auth hint, otherwise the first active rank not
 * yet checked.  If every rank has been checked and none remain to come
 * up, fall back to do_open_ino(); if some ranks are not yet active,
 * wait (kick_open_ino_peers will retry on map changes).
 */
void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
{
  set<mds_rank_t> all, active;
  mds->mdsmap->get_mds_set(all);
  // during our own rejoin, peers at rejoin can answer; otherwise
  // require clientreplay or later
  if (mds->get_state() == MDSMap::STATE_REJOIN)
    mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
  else
    mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);

  dout(10) << "do_open_ino_peer " << ino << " active " << active
	   << " all " << all << " checked " << info.checked << dendl;

  mds_rank_t whoami = mds->get_nodeid();
  mds_rank_t peer = MDS_RANK_NONE;
  if (info.auth_hint >= 0 && info.auth_hint != whoami) {
    if (active.count(info.auth_hint)) {
      peer = info.auth_hint;
      info.auth_hint = MDS_RANK_NONE;  // hint is consumed once
    }
  } else {
    for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
      if (*p != whoami && info.checked.count(*p) == 0) {
	peer = *p;
	break;
      }
  }
  if (peer < 0) {
    all.erase(whoami);
    if (all != info.checked) {
      dout(10) << " waiting for more peers to be active" << dendl;
    } else {
      dout(10) << " all MDS peers have been checked " << dendl;
      do_open_ino(ino, info, 0);
    }
  } else {
    info.checking = peer;
    vector<inode_backpointer_t> *pa = NULL;
    // got backtrace from peer or backtrace just fetched
    if (info.discover || !info.fetch_backtrace)
      pa = &info.ancestors;
    mds->send_message_mds(make_message<MMDSOpenIno>(info.tid, ino, pa), peer);
    if (mds->logger)
      mds->logger->inc(l_mds_openino_peer_discover);
  }
}
9086
/*
 * Handle an open-ino query from a peer MDS.  Reply with the full
 * ancestry if we hold the inode and are auth, an auth hint if we only
 * have a replica, or the result of traversing the ancestry the sender
 * supplied.  'err' is nonzero when a previous dispatch of this message
 * already failed.
 */
void MDCache::handle_open_ino(const cref_t<MMDSOpenIno> &m, int err)
{
  // not ready to answer until rejoin; sender retries on map changes
  if (mds->get_state() < MDSMap::STATE_REJOIN &&
      mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
    return;
  }

  dout(10) << "handle_open_ino " << *m << " err " << err << dendl;

  auto from = mds_rank_t(m->get_source().num());
  inodeno_t ino = m->ino;
  ref_t<MMDSOpenInoReply> reply;
  CInode *in = get_inode(ino);
  if (in) {
    dout(10) << " have " << *in << dendl;
    reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, mds_rank_t(0));
    if (in->is_auth()) {
      touch_inode(in);
      // collect backpointers all the way up to an unparented inode
      while (1) {
	CDentry *pdn = in->get_parent_dn();
	if (!pdn)
	  break;
	CInode *diri = pdn->get_dir()->get_inode();
	reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
						       in->inode.version));
	in = diri;
      }
    } else {
      reply->hint = in->authority().first;
    }
  } else if (err < 0) {
    reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, MDS_RANK_NONE, err);
  } else {
    // try to resolve using the ancestry supplied by the sender
    mds_rank_t hint = MDS_RANK_NONE;
    int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
    if (ret > 0)
      return;  // blocked on async fetch; message will be re-dispatched
    reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, hint, ret);
  }
  mds->send_message_mds(reply, from);
}
9128
/*
 * Handle a peer's reply to our MMDSOpenIno query and advance the state
 * machine.  Replies for inos we are no longer checking against that
 * peer (e.g. after a kick) are ignored.
 */
void MDCache::handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m)
{
  dout(10) << "handle_open_ino_reply " << *m << dendl;

  inodeno_t ino = m->ino;
  mds_rank_t from = mds_rank_t(m->get_source().num());
  auto it = opening_inodes.find(ino);
  if (it != opening_inodes.end() && it->second.checking == from) {
    open_ino_info_t& info = it->second;
    info.checking = MDS_RANK_NONE;
    info.checked.insert(from);

    CInode *in = get_inode(ino);
    if (in) {
      dout(10) << " found cached " << *in << dendl;
      open_ino_finish(ino, info, in->authority().first);
    } else if (!m->ancestors.empty()) {
      dout(10) << " found ino " << ino << " on mds." << from << dendl;
      if (!info.want_replica) {
	// caller only needed to locate the inode; report the rank
	open_ino_finish(ino, info, from);
	return;
      }

      // we want a local replica: adopt the peer's ancestry and traverse
      info.ancestors = m->ancestors;
      info.auth_hint = from;
      info.checking = mds->get_nodeid();
      info.discover = true;
      _open_ino_traverse_dir(ino, info, 0);
    } else if (m->error) {
      dout(10) << " error " << m->error << " from mds." << from << dendl;
      do_open_ino(ino, info, m->error);
    } else {
      // peer didn't have it; follow its auth hint if it offered one
      if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
	info.auth_hint = m->hint;
	info.checked.erase(m->hint);
      }
      do_open_ino_peer(ino, info);
    }
  }
}
9169
9170 void MDCache::kick_open_ino_peers(mds_rank_t who)
9171 {
9172 dout(10) << "kick_open_ino_peers mds." << who << dendl;
9173
9174 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
9175 p != opening_inodes.end();
9176 ++p) {
9177 open_ino_info_t& info = p->second;
9178 if (info.checking == who) {
9179 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
9180 info.checking = MDS_RANK_NONE;
9181 do_open_ino_peer(p->first, info);
9182 } else if (info.checking == MDS_RANK_NONE) {
9183 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
9184 do_open_ino_peer(p->first, info);
9185 }
9186 }
9187 }
9188
/*
 * Entry point: open inode 'ino', whose backtrace lives in 'pool' (or
 * the default file layout pool when pool < 0), completing 'fin' when
 * done.  If a request for the same ino is already in flight, 'fin' is
 * added to its waiter list and the want_replica/want_xlocked flags are
 * upgraded as needed.
 */
void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin,
		       bool want_replica, bool want_xlocked)
{
  dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
	   << want_replica << dendl;

  auto it = opening_inodes.find(ino);
  if (it != opening_inodes.end()) {
    open_ino_info_t& info = it->second;
    if (want_replica) {
      info.want_replica = true;
      if (want_xlocked && !info.want_xlocked) {
	if (!info.ancestors.empty()) {
	  // re-issue the discover for the tail dentry with want_xlocked
	  // set, since the in-flight one didn't request it
	  CInode *diri = get_inode(info.ancestors[0].dirino);
	  if (diri) {
	    frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
	    CDir *dir = diri->get_dirfrag(fg);
	    if (dir && !dir->is_auth()) {
	      filepath path(info.ancestors[0].dname, 0);
	      discover_path(dir, CEPH_NOSNAP, path, NULL, true);
	    }
	  }
	}
	info.want_xlocked = true;
      }
    }
    info.waiters.push_back(fin);
  } else {
    // start a new open_ino request
    open_ino_info_t& info = opening_inodes[ino];
    info.want_replica = want_replica;
    info.want_xlocked = want_xlocked;
    info.tid = ++open_ino_last_tid;
    info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
    info.waiters.push_back(fin);
    if (mds->is_rejoin() &&
	open_file_table.get_ancestors(ino, info.ancestors, info.auth_hint)) {
      // during rejoin the open file table may already know the
      // ancestry, letting us skip the backtrace fetch
      info.fetch_backtrace = false;
      info.checking = mds->get_nodeid();
      _open_ino_traverse_dir(ino, info, 0);
    } else {
      do_open_ino(ino, info, 0);
    }
  }
}
9233
9234 /* ---------------------------- */
9235
/*
 * Search for a given inode on MDS peers, optionally starting with the
 * given node.  'c' is completed once the inode has been found (and
 * replicated locally via the reply path), or with -ESTALE if no peer
 * has it.

 TODO
  - recover from mds node failure, recovery
  - traverse path

 */
void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c,
			     mds_rank_t hint, bool path_locked)
{
  dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
  CInode *in = get_inode(ino);
  // a purging inode will not be found anywhere; fail immediately
  if (in && in->state_test(CInode::STATE_PURGING)) {
    c->complete(-ESTALE);
    return;
  }
  // otherwise callers must only ask about inodes we don't already have
  ceph_assert(!in);

  ceph_tid_t tid = ++find_ino_peer_last_tid;
  find_ino_peer_info_t& fip = find_ino_peer[tid];
  fip.ino = ino;
  fip.tid = tid;
  fip.fin = c;
  fip.path_locked = path_locked;
  fip.hint = hint;
  _do_find_ino_peer(fip);
}
9265
/*
 * Send the next MMDSFindIno query for 'fip': first to the caller's
 * hint (tried once), then to each active rank not yet checked.  When
 * all ranks have been checked the request completes with -ESTALE; if
 * some ranks are not yet active we wait (kick_find_ino_peers retries).
 */
void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
{
  set<mds_rank_t> all, active;
  mds->mdsmap->get_mds_set(all);
  mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);

  dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
	   << " active " << active << " all " << all
	   << " checked " << fip.checked
	   << dendl;

  mds_rank_t m = MDS_RANK_NONE;
  if (fip.hint >= 0) {
    m = fip.hint;
    fip.hint = MDS_RANK_NONE;  // hint is consumed on first use
  } else {
    for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
      if (*p != mds->get_nodeid() &&
	  fip.checked.count(*p) == 0) {
	m = *p;
	break;
      }
  }
  if (m == MDS_RANK_NONE) {
    all.erase(mds->get_nodeid());
    if (all != fip.checked) {
      dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
    } else {
      // exhausted every peer; give up
      dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
      fip.fin->complete(-ESTALE);
      find_ino_peer.erase(fip.tid);
    }
  } else {
    fip.checking = m;
    mds->send_message_mds(make_message<MMDSFindIno>(fip.tid, fip.ino), m);
  }
}
9303
9304 void MDCache::handle_find_ino(const cref_t<MMDSFindIno> &m)
9305 {
9306 if (mds->get_state() < MDSMap::STATE_REJOIN) {
9307 return;
9308 }
9309
9310 dout(10) << "handle_find_ino " << *m << dendl;
9311 auto r = make_message<MMDSFindInoReply>(m->tid);
9312 CInode *in = get_inode(m->ino);
9313 if (in) {
9314 in->make_path(r->path);
9315 dout(10) << " have " << r->path << " " << *in << dendl;
9316 }
9317 mds->send_message_mds(r, mds_rank_t(m->get_source().num()));
9318 }
9319
9320
/*
 * Handle a reply to one of our MMDSFindIno queries.  If the peer
 * returned a path, traverse it (with discover) to pull the inode into
 * our cache; otherwise move on to the next peer.
 */
void MDCache::handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m)
{
  auto p = find_ino_peer.find(m->tid);
  if (p != find_ino_peer.end()) {
    dout(10) << "handle_find_ino_reply " << *m << dendl;
    find_ino_peer_info_t& fip = p->second;

    // success?
    if (get_inode(fip.ino)) {
      dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
      mds->queue_waiter(fip.fin);
      find_ino_peer.erase(p);
      return;
    }

    mds_rank_t from = mds_rank_t(m->get_source().num());
    if (fip.checking == from)
      fip.checking = MDS_RANK_NONE;
    fip.checked.insert(from);

    if (!m->path.empty()) {
      // we got a path!
      vector<CDentry*> trace;
      CF_MDS_RetryMessageFactory cf(mds, m);
      MDRequestRef null_ref;
      int flags = MDS_TRAVERSE_DISCOVER;
      if (fip.path_locked)
	flags |= MDS_TRAVERSE_PATH_LOCKED;
      int r = path_traverse(null_ref, cf, m->path, flags, &trace);
      if (r > 0)
	return;  // in progress; this message is re-dispatched on completion
      dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
	      << ", retrying" << dendl;
      // the traversal failed outright; retry with all peers eligible again
      fip.checked.clear();
      _do_find_ino_peer(fip);
    } else {
      // nope, continue.
      _do_find_ino_peer(fip);
    }
  } else {
    dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
  }
}
9364
9365 void MDCache::kick_find_ino_peers(mds_rank_t who)
9366 {
9367 // find_ino_peers requests we should move on from
9368 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
9369 p != find_ino_peer.end();
9370 ++p) {
9371 find_ino_peer_info_t& fip = p->second;
9372 if (fip.checking == who) {
9373 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
9374 fip.checking = MDS_RANK_NONE;
9375 _do_find_ino_peer(fip);
9376 } else if (fip.checking == MDS_RANK_NONE) {
9377 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
9378 _do_find_ino_peer(fip);
9379 }
9380 }
9381 }
9382
9383 /* ---------------------------- */
9384
9385 int MDCache::get_num_client_requests()
9386 {
9387 int count = 0;
9388 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
9389 p != active_requests.end();
9390 ++p) {
9391 MDRequestRef& mdr = p->second;
9392 if (mdr->reqid.name.is_client() && !mdr->is_slave())
9393 count++;
9394 }
9395 return count;
9396 }
9397
/*
 * Register a new client request.  If a request with the same reqid is
 * already active (we won a forward race against a slave, or this is a
 * duplicate), the message is queued or dropped and a null ref is
 * returned.
 */
MDRequestRef MDCache::request_start(const cref_t<MClientRequest>& req)
{
  // did we win a forward race against a slave?
  if (active_requests.count(req->get_reqid())) {
    MDRequestRef& mdr = active_requests[req->get_reqid()];
    ceph_assert(mdr);
    if (mdr->is_slave()) {
      // retry this message once the slave request finishes
      dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
      mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
    } else {
      dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
    }
    return MDRequestRef();
  }

  // register new client request
  MDRequestImpl::Params params;
  params.reqid = req->get_reqid();
  params.attempt = req->get_num_fwd();
  params.client_req = req;
  // timestamps for op tracking / latency accounting
  params.initiated = req->get_recv_stamp();
  params.throttled = req->get_throttle_stamp();
  params.all_read = req->get_recv_complete_stamp();
  params.dispatched = req->get_dispatch_stamp();

  MDRequestRef mdr =
      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
  active_requests[params.reqid] = mdr;
  mdr->set_op_stamp(req->get_stamp());
  dout(7) << "request_start " << *mdr << dendl;
  return mdr;
}
9430
/*
 * Register a slave request: a request we perform on behalf of master
 * mds 'm->get_source()', identified by the master's reqid/attempt.
 */
MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, const cref_t<Message> &m)
{
  int by = m->get_source().num();  // the master mds rank
  MDRequestImpl::Params params;
  params.reqid = ri;
  params.attempt = attempt;
  params.triggering_slave_req = m;
  params.slave_to = by;
  // timestamps for op tracking / latency accounting
  params.initiated = m->get_recv_stamp();
  params.throttled = m->get_throttle_stamp();
  params.all_read = m->get_recv_complete_stamp();
  params.dispatched = m->get_dispatch_stamp();
  MDRequestRef mdr =
      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
  ceph_assert(active_requests.count(mdr->reqid) == 0);
  active_requests[mdr->reqid] = mdr;
  dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
  return mdr;
}
9450
/*
 * Register an internally-generated request (fragment, scrub, flush,
 * repair, ...).  'op' is one of the CEPH_MDS_OP_* internal op codes.
 */
MDRequestRef MDCache::request_start_internal(int op)
{
  utime_t now = ceph_clock_now();
  MDRequestImpl::Params params;
  // internal requests are named after ourselves with a fresh tid
  params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
  params.reqid.tid = mds->issue_tid();
  // all op-tracker timestamps collapse to "now" for internal ops
  params.initiated = now;
  params.throttled = now;
  params.all_read = now;
  params.dispatched = now;
  params.internal_op = op;
  MDRequestRef mdr =
      mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);

  ceph_assert(active_requests.count(mdr->reqid) == 0);
  active_requests[mdr->reqid] = mdr;
  dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
  return mdr;
}
9470
9471 MDRequestRef MDCache::request_get(metareqid_t rid)
9472 {
9473 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
9474 ceph_assert(p != active_requests.end());
9475 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9476 return p->second;
9477 }
9478
/*
 * Finish a request.  If a slave-commit context is pending it is
 * completed first (with failure if the request was aborted) and is
 * expected to re-invoke request_finish(); otherwise internal-op
 * counters are bumped and the request is cleaned up.
 */
void MDCache::request_finish(MDRequestRef& mdr)
{
  dout(7) << "request_finish " << *mdr << dendl;
  mdr->mark_event("finishing request");

  // slave finisher?
  if (mdr->has_more() && mdr->more()->slave_commit) {
    Context *fin = mdr->more()->slave_commit;
    mdr->more()->slave_commit = 0;
    int ret;
    if (mdr->aborted) {
      // master asked us to abort: roll back rather than commit
      mdr->aborted = false;
      ret = -1;
      mdr->more()->slave_rolling_back = true;
    } else {
      ret = 0;
      mdr->committing = true;
    }
    fin->complete(ret); // this must re-call request_finish.
    return;
  }

  // account finished internal ops in the perf counters
  switch(mdr->internal_op) {
    case CEPH_MDS_OP_FRAGMENTDIR:
      logger->inc(l_mdss_ireq_fragmentdir);
      break;
    case CEPH_MDS_OP_EXPORTDIR:
      logger->inc(l_mdss_ireq_exportdir);
      break;
    case CEPH_MDS_OP_ENQUEUE_SCRUB:
      logger->inc(l_mdss_ireq_enqueue_scrub);
      break;
    case CEPH_MDS_OP_FLUSH:
      logger->inc(l_mdss_ireq_flush);
      break;
    case CEPH_MDS_OP_REPAIR_FRAGSTATS:
      logger->inc(l_mdss_ireq_fragstats);
      break;
    case CEPH_MDS_OP_REPAIR_INODESTATS:
      logger->inc(l_mdss_ireq_inodestats);
      break;
  }

  request_cleanup(mdr);
}
9524
9525
/*
 * Forward a client request to mds 'who' ('port' is currently unused by
 * the visible logic).  Batched getattr/lookup heads forward their whole
 * batch; internal ops cannot be forwarded and are cancelled with
 * -EXDEV; requests that came from another mds are simply dropped.
 */
void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
{
  mdr->mark_event("forwarding request");
  if (mdr->client_request && mdr->client_request->get_source().is_client()) {
    dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
	    << *mdr->client_request << dendl;
    if (mdr->is_batch_head) {
      // forward every batched op that piggy-backed on this head
      int mask = mdr->client_request->head.args.getattr.mask;

      switch (mdr->client_request->get_op()) {
	case CEPH_MDS_OP_GETATTR:
	  {
	    CInode* in = mdr->in[0];
	    if (in) {
	      auto it = in->batch_ops.find(mask);
	      if (it != in->batch_ops.end()) {
		it->second->forward(who);
		in->batch_ops.erase(it);
	      }
	    }
	    break;
	  }
	case CEPH_MDS_OP_LOOKUP:
	  {
	    if (mdr->dn[0].size()) {
	      CDentry* dn = mdr->dn[0].back();
	      auto it = dn->batch_ops.find(mask);
	      if (it != dn->batch_ops.end()) {
		it->second->forward(who);
		dn->batch_ops.erase(it);
	      }
	    }
	    break;
	  }
	default:
	  ceph_abort();  // only getattr/lookup are ever batch heads
      }
    } else {
      mds->forward_message_mds(mdr->release_client_request(), who);
    }
    if (mds->logger) mds->logger->inc(l_mds_forward);
  } else if (mdr->internal_op >= 0) {
    dout(10) << "request_forward on internal op; cancelling" << dendl;
    mdr->internal_op_finish->complete(-EXDEV);
  } else {
    dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
	    << " was from mds" << dendl;
  }
  request_cleanup(mdr);
}
9576
9577
/*
 * Dispatch a request to the appropriate handler: client requests and
 * slave requests go to the Server; internal ops are routed by op code.
 */
void MDCache::dispatch_request(MDRequestRef& mdr)
{
  if (mdr->client_request) {
    mds->server->dispatch_client_request(mdr);
  } else if (mdr->slave_request) {
    mds->server->dispatch_slave_request(mdr);
  } else {
    switch (mdr->internal_op) {
      case CEPH_MDS_OP_FRAGMENTDIR:
	dispatch_fragment_dir(mdr);
	break;
      case CEPH_MDS_OP_EXPORTDIR:
	migrator->dispatch_export_dir(mdr, 0);
	break;
      case CEPH_MDS_OP_ENQUEUE_SCRUB:
	enqueue_scrub_work(mdr);
	break;
      case CEPH_MDS_OP_FLUSH:
	flush_dentry_work(mdr);
	break;
      case CEPH_MDS_OP_REPAIR_FRAGSTATS:
	repair_dirfrag_stats_work(mdr);
	break;
      case CEPH_MDS_OP_REPAIR_INODESTATS:
	repair_inode_stats_work(mdr);
	break;
      case CEPH_MDS_OP_UPGRADE_SNAPREALM:
	upgrade_inode_snaprealm_work(mdr);
	break;
      default:
	ceph_abort();  // unknown internal op is a programming error
    }
  }
}
9612
9613
/*
 * Release everything this request holds on other MDS ranks: notify all
 * slaves with OP_FINISH (optionally marked abort, or carrying exported
 * inode caps for a rename), then strip foreign xlocks and remote
 * wrlocks out of the local lock lists.
 */
void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
{
  if (!mdr->has_more())
    return;

  // clean up slaves
  // (will implicitly drop remote dn pins)
  for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
       p != mdr->more()->slaves.end();
       ++p) {
    auto r = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt,
					    MMDSSlaveRequest::OP_FINISH);

    if (mdr->killed && !mdr->committing) {
      r->mark_abort();
    } else if (mdr->more()->srcdn_auth_mds == *p &&
	       mdr->more()->inode_import.length() > 0) {
      // information about rename imported caps
      r->inode_export.claim(mdr->more()->inode_import);
    }

    mds->send_message_mds(r, *p);
  }

  /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
   * implicitly. Note that we don't call the finishers -- there shouldn't
   * be any on a remote lock and the request finish wakes up all
   * the waiters anyway! */

  for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
    SimpleLock *lock = it->lock;
    if (it->is_xlock() && !lock->get_parent()->is_auth()) {
      dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
	       << " on " << lock->get_parent() << dendl;
      lock->put_xlock();
      // post-increment erase keeps the iterator valid
      mdr->locks.erase(it++);
    } else if (it->is_remote_wrlock()) {
      dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
	       << " on mds." << it->wrlock_target << " on " << *lock->get_parent() << dendl;
      if (it->is_wrlock()) {
	// also locally wrlocked: just clear the remote part
	it->clear_remote_wrlock();
	++it;
      } else {
	mdr->locks.erase(it++);
      }
    } else {
      ++it;
    }
  }

  mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
                                * leaving them in can cause double-notifies as
                                * this function can get called more than once */
}
9668
/*
 * Drop all locks except rdlocks.  Foreign locks are dropped first so
 * the local Locker only sees locks it actually owns.
 */
void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
{
  request_drop_foreign_locks(mdr);
  mds->locker->drop_non_rdlocks(mdr.get());
}
9674
/*
 * Drop every lock this request holds.  Foreign locks are dropped first
 * so the local Locker only sees locks it actually owns.
 */
void MDCache::request_drop_locks(MDRequestRef& mdr)
{
  request_drop_foreign_locks(mdr);
  mds->locker->drop_locks(mdr.get());
}
9680
/*
 * Tear down a request: wake anyone waiting for it to finish, drop all
 * locks, pins and stickydirs it holds, and remove it from the session
 * and the active request map.
 */
void MDCache::request_cleanup(MDRequestRef& mdr)
{
  dout(15) << "request_cleanup " << *mdr << dendl;

  if (mdr->has_more()) {
    if (mdr->more()->is_ambiguous_auth)
      mdr->clear_ambiguous_auth();
    // wake messages queued behind this request (e.g. forward races)
    if (!mdr->more()->waiting_for_finish.empty())
      mds->queue_waiters(mdr->more()->waiting_for_finish);
  }

  request_drop_locks(mdr);

  // drop (local) auth pins
  mdr->drop_local_auth_pins();

  // drop stickydirs
  mdr->put_stickydirs();

  // give cap releases held up by this request a chance to proceed
  mds->locker->kick_cap_releases(mdr);

  // drop cache pins
  mdr->drop_pins();

  // remove from session
  mdr->item_session_request.remove_myself();

  // remove from map
  active_requests.erase(mdr->reqid);

  if (mds->logger)
    log_stat();

  mdr->mark_event("cleaned up request");
}
9716
/*
 * Kill a request (e.g. because its client session died).  Requests that
 * already involve slaves cannot simply be rolled back, so they are
 * detached from the session and allowed to run to completion; others
 * are cleaned up immediately unless already committing.
 */
void MDCache::request_kill(MDRequestRef& mdr)
{
  // rollback slave requests is tricky. just let the request proceed.
  if (mdr->has_more() &&
      (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
    if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
      // not fully locked yet: flag for abort once the slave replies
      ceph_assert(mdr->more()->witnessed.empty());
      mdr->aborted = true;
      dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
    } else {
      dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
    }

    // the dead session cannot be credited with allocated inos
    ceph_assert(mdr->used_prealloc_ino == 0);
    ceph_assert(mdr->prealloc_inos.empty());

    mdr->session = NULL;
    mdr->item_session_request.remove_myself();
    return;
  }

  mdr->killed = true;
  mdr->mark_event("killing request");

  if (mdr->committing) {
    dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
  } else {
    dout(10) << "request_kill " << *mdr << dendl;
    request_cleanup(mdr);
  }
}
9748
9749 // -------------------------------------------------------------------------------
9750 // SNAPREALMS
9751
/*
 * Create the global snaprealm, rooted at a dummy system inode with the
 * reserved MDS_INO_GLOBAL_SNAPREALM ino.
 */
void MDCache::create_global_snaprealm()
{
  CInode *in = new CInode(this); // dummy inode
  create_unlinked_system_inode(in, MDS_INO_GLOBAL_SNAPREALM, S_IFDIR|0755);
  add_inode(in);
  global_snaprealm = in->snaprealm;
}
9759
/*
 * After a snaprealm change on 'in' (snapop is a CEPH_SNAP_OP_*),
 * invalidate cached snap state in the realm and all open descendant
 * realms, and (if notify_clients) send MClientSnap updates to every
 * client with caps in the affected realms.  For UPDATE/DESTROY, past
 * children (and their descendants) are invalidated as well; DESTROY
 * additionally re-evaluates stray inodes under past children.
 */
void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients)
{
  dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;

  vector<inodeno_t> split_inos;
  vector<inodeno_t> split_realms;

  if (notify_clients) {
    ceph_assert(in->snaprealm->have_past_parents_open());
    if (snapop == CEPH_SNAP_OP_SPLIT) {
      // notify clients of update|split
      for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
	   !p.end(); ++p)
	split_inos.push_back((*p)->ino());

      for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
	   p != in->snaprealm->open_children.end();
	   ++p)
	split_realms.push_back((*p)->inode->ino());
    }
  }

  set<SnapRealm*> past_children;
  map<client_t, ref_t<MClientSnap>> updates;
  list<SnapRealm*> q;
  q.push_back(in->snaprealm);
  // breadth-first walk over the realm and its open children
  while (!q.empty()) {
    SnapRealm *realm = q.front();
    q.pop_front();

    dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
    realm->invalidate_cached_snaps();

    if (notify_clients) {
      // build (at most) one update message per client with caps here
      for (const auto& p : realm->client_caps) {
	const auto& client = p.first;
	const auto& caps = p.second;
	ceph_assert(!caps->empty());

	auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple());
	if (em.second) {
	  auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
	  update->head.split = in->ino();
	  update->split_inos = split_inos;
	  update->split_realms = split_realms;
	  update->bl = in->snaprealm->get_snap_trace();
	  em.first->second = std::move(update);
	}
      }
    }

    if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
      // remember past children for the second pass below
      for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
	   p != realm->open_past_children.end();
	   ++p)
	past_children.insert(*p);
    }

    // notify for active children, too.
    dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
    for (set<SnapRealm*>::iterator p = realm->open_children.begin();
	 p != realm->open_children.end();
	 ++p)
      q.push_back(*p);
  }

  if (notify_clients)
    send_snaps(updates);

  // notify past children and their descendants if we update/delete old snapshots
  for (set<SnapRealm*>::iterator p = past_children.begin();
       p != past_children.end();
       ++p)
    q.push_back(*p);

  // second pass: invalidate past children's subtrees, skipping realms
  // already visited via past_children
  while (!q.empty()) {
    SnapRealm *realm = q.front();
    q.pop_front();

    realm->invalidate_cached_snaps();

    for (set<SnapRealm*>::iterator p = realm->open_children.begin();
	 p != realm->open_children.end();
	 ++p) {
      if (past_children.count(*p) == 0)
	q.push_back(*p);
    }

    for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
	 p != realm->open_past_children.end();
	 ++p) {
      if (past_children.count(*p) == 0) {
	q.push_back(*p);
	past_children.insert(*p);
      }
    }
  }

  if (snapop == CEPH_SNAP_OP_DESTROY) {
    // eval stray inodes if we delete snapshot from their past ancestor snaprealm
    for (set<SnapRealm*>::iterator p = past_children.begin();
	 p != past_children.end();
	 ++p)
      maybe_eval_stray((*p)->inode, true);
  }
}
9866
9867 void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op)
9868 {
9869 dout(10) << __func__ << " " << *in << " stid " << stid << dendl;
9870 ceph_assert(in->is_auth());
9871
9872 set<mds_rank_t> mds_set;
9873 if (stid > 0) {
9874 mds->mdsmap->get_mds_set_lower_bound(mds_set, MDSMap::STATE_RESOLVE);
9875 mds_set.erase(mds->get_nodeid());
9876 } else {
9877 in->list_replicas(mds_set);
9878 }
9879
9880 if (!mds_set.empty()) {
9881 bufferlist snap_blob;
9882 in->encode_snap(snap_blob);
9883
9884 for (auto p : mds_set) {
9885 auto m = make_message<MMDSSnapUpdate>(in->ino(), stid, snap_op);
9886 m->snap_blob = snap_blob;
9887 mds->send_message_mds(m, p);
9888 }
9889 }
9890
9891 if (stid > 0)
9892 notify_global_snaprealm_update(snap_op);
9893 }
9894
// Handle an MMDSSnapUpdate from another rank: apply the new snap state to our
// replica of the inode and propagate the invalidation (and, when appropriate,
// client notifications) through the realm hierarchy.
void MDCache::handle_snap_update(const cref_t<MMDSSnapUpdate> &m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  dout(10) << __func__ << " " << *m << " from mds." << from << dendl;

  // drop the message if we are too early in startup to act on it
  if (mds->get_state() < MDSMap::STATE_RESOLVE &&
      mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) {
    return;
  }

  // null rejoin_done means open_snaprealms() has already been called
  bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN ||
                        (mds->is_rejoin() && !rejoin_done);

  if (m->get_tid() > 0) {
    // acknowledge the snap-table transaction before acting on it
    mds->snapclient->notify_commit(m->get_tid());
    if (notify_clients)
      notify_global_snaprealm_update(m->get_snap_op());
  }

  CInode *in = get_inode(m->get_ino());
  if (in) {
    ceph_assert(!in->is_auth());
    if (mds->get_state() > MDSMap::STATE_REJOIN ||
        (mds->is_rejoin() && !in->is_rejoining())) {
      // apply the sender's encoded snap state to our replica
      auto p = m->snap_blob.cbegin();
      in->decode_snap(p);

      if (!notify_clients) {
        // still rejoining: defer client notification until
        // open_snaprealms() processes rejoin_pending_snaprealms
        if (!rejoin_pending_snaprealms.count(in)) {
          in->get(CInode::PIN_OPENINGSNAPPARENTS);
          rejoin_pending_snaprealms.insert(in);
        }
      }
      do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients);
    }
  }
}
9933
9934 void MDCache::notify_global_snaprealm_update(int snap_op)
9935 {
9936 if (snap_op != CEPH_SNAP_OP_DESTROY)
9937 snap_op = CEPH_SNAP_OP_UPDATE;
9938 set<Session*> sessions;
9939 mds->sessionmap.get_client_session_set(sessions);
9940 for (auto &session : sessions) {
9941 if (!session->is_open() && !session->is_stale())
9942 continue;
9943 auto update = make_message<MClientSnap>(snap_op);
9944 update->head.split = global_snaprealm->inode->ino();
9945 update->bl = global_snaprealm->get_snap_trace();
9946 mds->send_message_client_counted(update, session);
9947 }
9948 }
9949
9950 // -------------------------------------------------------------------------------
9951 // STRAYS
9952
// Continuation that resumes scan_stray_dir() at a given dirfrag once an
// asynchronous dirfrag fetch completes.
struct C_MDC_RetryScanStray : public MDCacheContext {
  dirfrag_t next;  // dirfrag to resume scanning from
  C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
  void finish(int r) override {
    mdcache->scan_stray_dir(next);
  }
};
9960
9961 void MDCache::scan_stray_dir(dirfrag_t next)
9962 {
9963 dout(10) << "scan_stray_dir " << next << dendl;
9964
9965 std::vector<CDir*> ls;
9966 for (int i = 0; i < NUM_STRAY; ++i) {
9967 if (strays[i]->ino() < next.ino)
9968 continue;
9969 strays[i]->get_dirfrags(ls);
9970 }
9971
9972 for (const auto& dir : ls) {
9973 if (dir->dirfrag() < next)
9974 continue;
9975 if (!dir->is_complete()) {
9976 dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
9977 return;
9978 }
9979 for (auto &p : dir->items) {
9980 CDentry *dn = p.second;
9981 dn->state_set(CDentry::STATE_STRAY);
9982 CDentry::linkage_t *dnl = dn->get_projected_linkage();
9983 if (dnl->is_primary()) {
9984 CInode *in = dnl->get_inode();
9985 if (in->inode.nlink == 0)
9986 in->state_set(CInode::STATE_ORPHAN);
9987 maybe_eval_stray(in);
9988 }
9989 }
9990 }
9991 }
9992
9993 void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
9994 {
9995 object_t oid = CInode::get_object_name(ino, frag_t(), "");
9996 mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
9997 if (mds->logger)
9998 mds->logger->inc(l_mds_openino_backtrace_fetch);
9999 }
10000
10001
10002
10003
10004
10005 // ========================================================================================
10006 // DISCOVER
10007 /*
10008
10009 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
10010 to the parent metadata object in the cache (pinning it).
10011
10012 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
10013
10014 */
10015
10016 void MDCache::_send_discover(discover_info_t& d)
10017 {
10018 auto dis = make_message<MDiscover>(d.ino, d.frag, d.snap, d.want_path,
10019 d.want_base_dir, d.path_locked);
10020 dis->set_tid(d.tid);
10021 mds->send_message_mds(dis, d.mds);
10022 }
10023
10024 void MDCache::discover_base_ino(inodeno_t want_ino,
10025 MDSContext *onfinish,
10026 mds_rank_t from)
10027 {
10028 dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
10029 if (waiting_for_base_ino[from].count(want_ino) == 0) {
10030 discover_info_t& d = _create_discover(from);
10031 d.ino = want_ino;
10032 _send_discover(d);
10033 }
10034 waiting_for_base_ino[from][want_ino].push_back(onfinish);
10035 }
10036
10037
10038 void MDCache::discover_dir_frag(CInode *base,
10039 frag_t approx_fg,
10040 MDSContext *onfinish,
10041 mds_rank_t from)
10042 {
10043 if (from < 0)
10044 from = base->authority().first;
10045
10046 dirfrag_t df(base->ino(), approx_fg);
10047 dout(7) << "discover_dir_frag " << df
10048 << " from mds." << from << dendl;
10049
10050 if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
10051 discover_info_t& d = _create_discover(from);
10052 d.pin_base(base);
10053 d.ino = base->ino();
10054 d.frag = approx_fg;
10055 d.want_base_dir = true;
10056 _send_discover(d);
10057 }
10058
10059 if (onfinish)
10060 base->add_dir_waiter(approx_fg, onfinish);
10061 }
10062
// Continuation that retries discover_path(CInode*, ...) once the base inode's
// authority becomes unambiguous.
struct C_MDC_RetryDiscoverPath : public MDCacheContext {
  CInode *base;
  snapid_t snapid;
  filepath path;
  mds_rank_t from;
  C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
    MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
  void finish(int r) override {
    mdcache->discover_path(base, snapid, path, 0, from);
  }
};
10074
// Discover a path relative to a base *inode*: fetch the base dirfrag plus the
// dentries/inodes along want_path from the base's auth MDS.
//
// @param base        inode to start from
// @param snap        snapid the traversal is for
// @param want_path   relative path to discover
// @param onfinish    optional waiter, queued when the first dirfrag arrives
// @param path_locked allow discovery of an xlocked tail dentry
// @param from        target rank; <0 means use base's auth
void MDCache::discover_path(CInode *base,
                            snapid_t snap,
                            filepath want_path,
                            MDSContext *onfinish,
                            bool path_locked,
                            mds_rank_t from)
{
  if (from < 0)
    from = base->authority().first;

  dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
          << (path_locked ? " path_locked":"")
          << dendl;

  if (base->is_ambiguous_auth()) {
    // auth is being migrated; retry once it settles
    dout(10) << " waiting for single auth on " << *base << dendl;
    if (!onfinish)
      onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
    base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
    return;
  } else if (from == mds->get_nodeid()) {
    // we became the auth ourselves; just wake anyone waiting for the dir
    MDSContext::vec finished;
    base->take_waiting(CInode::WAIT_DIR, finished);
    mds->queue_waiters(finished);
    return;
  }

  frag_t fg = base->pick_dirfrag(want_path[0]);
  // Send a new discover if this is an explicitly path-locked single-dentry
  // request, or if nobody is already waiting on the frag, or if we have no
  // waiter to piggyback on.
  if ((path_locked && want_path.depth() == 1) ||
      !base->is_waiting_for_dir(fg) || !onfinish) {
    discover_info_t& d = _create_discover(from);
    d.ino = base->ino();
    d.pin_base(base);
    d.frag = fg;
    d.snap = snap;
    d.want_path = want_path;
    d.want_base_dir = true;
    d.path_locked = path_locked;
    _send_discover(d);
  }

  // register + wait
  if (onfinish)
    base->add_dir_waiter(fg, onfinish);
}
10120
// Continuation that retries discover_path(CDir*, ...) once the base dirfrag's
// authority becomes unambiguous.
struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
  CDir *base;
  snapid_t snapid;
  filepath path;
  C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
    MDCacheContext(c), base(b), snapid(s), path(p) {}
  void finish(int r) override {
    mdcache->discover_path(base, snapid, path, 0);
  }
};
10131
// Discover a path relative to a base *dirfrag* (the frag itself is already
// replicated here): fetch the dentries/inodes along want_path from the
// dirfrag's auth MDS.
//
// @param base        dirfrag to start from
// @param snap        snapid the traversal is for
// @param want_path   relative path to discover
// @param onfinish    optional waiter, queued when the first dentry arrives
// @param path_locked allow discovery of an xlocked tail dentry
void MDCache::discover_path(CDir *base,
                            snapid_t snap,
                            filepath want_path,
                            MDSContext *onfinish,
                            bool path_locked)
{
  mds_rank_t from = base->authority().first;

  dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
          << (path_locked ? " path_locked":"")
          << dendl;

  if (base->is_ambiguous_auth()) {
    // auth is being migrated; retry once it settles
    dout(7) << " waiting for single auth on " << *base << dendl;
    if (!onfinish)
      onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
    base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
    return;
  } else if (from == mds->get_nodeid()) {
    // we became the auth ourselves; just wake all sub-waiters
    MDSContext::vec finished;
    base->take_sub_waiting(finished);
    mds->queue_waiters(finished);
    return;
  }

  // Send a new discover if this is an explicitly path-locked single-dentry
  // request, or if nobody is already waiting on the dentry, or if we have no
  // waiter to piggyback on.
  if ((path_locked && want_path.depth() == 1) ||
      !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
    discover_info_t& d = _create_discover(from);
    d.ino = base->ino();
    d.pin_base(base->inode);
    d.frag = base->get_frag();
    d.snap = snap;
    d.want_path = want_path;
    d.want_base_dir = false;
    d.path_locked = path_locked;
    _send_discover(d);
  }

  // register + wait
  if (onfinish)
    base->add_dentry_waiter(want_path[0], snap, onfinish);
}
10174
10175 void MDCache::kick_discovers(mds_rank_t who)
10176 {
10177 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
10178 p != discovers.end();
10179 ++p) {
10180 if (p->second.mds != who)
10181 continue;
10182 _send_discover(p->second);
10183 }
10184 }
10185
10186
// Serve an MDiscover from another MDS: walk the requested path starting at the
// base inode, appending replicated dirs/dentries/inodes to the reply trace.
// The traversal stops (or blocks and retries) at non-auth boundaries, frozen
// objects, incomplete dirfrags and xlocked dentries; a partial reply is legal
// and the requester continues from where the trace ends.
void MDCache::handle_discover(const cref_t<MDiscover> &dis)
{
  mds_rank_t whoami = mds->get_nodeid();
  mds_rank_t from = mds_rank_t(dis->get_source().num());

  ceph_assert(from != whoami);

  if (mds->get_state() <= MDSMap::STATE_REJOIN) {
    // too early in startup to serve anything
    if (mds->get_state() < MDSMap::STATE_REJOIN &&
        mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
      return;
    }

    // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
    // delay processing request from survivor because we may not yet choose lock states.
    if (!mds->mdsmap->is_rejoin(from)) {
      dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
      return;
    }
  }


  CInode *cur = 0;
  auto reply = make_message<MDiscoverReply>(*dis);

  snapid_t snapid = dis->get_snapid();

  // get started.
  if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
      !dis->wants_base_dir() && dis->get_want().depth() == 0) {
    // wants root
    dout(7) << "handle_discover from mds." << from
            << " wants base + " << dis->get_want().get_path()
            << " snap " << snapid
            << dendl;

    cur = get_inode(dis->get_base_ino());
    ceph_assert(cur);

    // add root
    reply->starts_with = MDiscoverReply::INODE;
    encode_replica_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
    dout(10) << "added base " << *cur << dendl;
  }
  else {
    // there's a base inode
    cur = get_inode(dis->get_base_ino(), snapid);
    if (!cur && snapid != CEPH_NOSNAP) {
      // fall back to the head inode if it spans this snap
      cur = get_inode(dis->get_base_ino());
      if (cur && !cur->is_multiversion())
        cur = NULL;  // nope!
    }

    if (!cur) {
      dout(7) << "handle_discover mds." << from
              << " don't have base ino " << dis->get_base_ino() << "." << snapid
              << dendl;
      if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
        reply->set_error_dentry(dis->get_dentry(0));
      reply->set_flag_error_dir();
    } else if (dis->wants_base_dir()) {
      dout(7) << "handle_discover mds." << from
              << " wants basedir+" << dis->get_want().get_path()
              << " has " << *cur
              << dendl;
    } else {
      dout(7) << "handle_discover mds." << from
              << " wants " << dis->get_want().get_path()
              << " has " << *cur
              << dendl;
    }
  }

  ceph_assert(reply);

  // add content
  // do some fidgeting to include a dir if they asked for the base dir, or just root.
  for (unsigned i = 0;
       cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
       i++) {

    // -- figure out the dir

    // is *cur even a dir at all?
    if (!cur->is_dir()) {
      dout(7) << *cur << " not a dir" << dendl;
      reply->set_flag_error_dir();
      break;
    }

    // pick frag
    frag_t fg;
    if (dis->get_want().depth()) {
      // dentry specifies
      fg = cur->pick_dirfrag(dis->get_dentry(i));
    } else {
      // requester explicitly specified the frag
      ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
      fg = dis->get_base_dir_frag();
      // resolve to an actual leaf if the fragtree changed since the request
      if (!cur->dirfragtree.is_leaf(fg))
        fg = cur->dirfragtree[fg.value()];
    }
    CDir *curdir = cur->get_dirfrag(fg);

    // not our dirfrag: stop here and hint at the (likely) auth
    if ((!curdir && !cur->is_auth()) ||
        (curdir && !curdir->is_auth())) {

      /* before:
       * ONLY set flag if empty!!
       * otherwise requester will wake up waiter(s) _and_ continue with discover,
       * resulting in duplicate discovers in flight,
       * which can wreak havoc when discovering rename srcdn (which may move)
       */

      if (reply->is_empty()) {
        // only hint if empty.
        //  someday this could be better, but right now the waiter logic isn't smart enough.

        // hint
        if (curdir) {
          dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
          reply->set_dir_auth_hint(curdir->authority().first);
        } else {
          dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
                  << *cur << dendl;
          reply->set_dir_auth_hint(cur->authority().first);
        }

        // note error dentry, if any
        //  NOTE: important, as it allows requester to issue an equivalent discover
        //        to whomever we hint at.
        if (dis->get_want().depth() > i)
          reply->set_error_dentry(dis->get_dentry(i));
      }

      break;
    }

    if (!curdir) { // open dir?
      if (cur->is_frozen()) {
        if (!reply->is_empty()) {
          dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
          break;
        }
        dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
        cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
        return;
      }
      curdir = cur->get_or_open_dirfrag(this, fg);
    } else if (curdir->is_frozen_tree() ||
               (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
      if (!reply->is_empty()) {
        dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
        break;
      }
      if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
        dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
        reply->set_flag_error_dir();
        break;
      }
      dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
      curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
      return;
    }

    // add dir
    if (curdir->get_version() == 0) {
      // fetch newly opened dir
    } else if (reply->is_empty() && !dis->wants_base_dir()) {
      dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
      // make sure the base frag is correct, though, in case there was a refragment
      // since the original request was sent.
      reply->set_base_dir_frag(curdir->get_frag());
    } else {
      ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen.
      if (!reply->trace.length())
        reply->starts_with = MDiscoverReply::DIR;
      encode_replica_dir(curdir, from, reply->trace);
      dout(7) << "handle_discover added dir " << *curdir << dendl;
    }

    // lookup
    CDentry *dn = 0;
    if (curdir->get_version() == 0) {
      // fetch newly opened dir
      ceph_assert(!curdir->has_bloom());
    } else if (dis->get_want().depth() > 0) {
      // lookup dentry
      dn = curdir->lookup(dis->get_dentry(i), snapid);
    } else
      break; // done!

    // incomplete dir?
    if (!dn) {
      // Need to fetch unless the bloom filter proves the name is absent.
      if (!curdir->is_complete() &&
          !(snapid == CEPH_NOSNAP &&
            curdir->has_bloom() &&
            !curdir->is_in_bloom(dis->get_dentry(i)))) {
        // readdir
        dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
        if (reply->is_empty()) {
          // fetch and wait
          curdir->fetch(new C_MDS_RetryMessage(mds, dis),
                        dis->wants_base_dir() && curdir->get_version() == 0);
          return;
        } else {
          // initiate fetch, but send what we have so far
          curdir->fetch(0);
          break;
        }
      }

      if (snapid != CEPH_NOSNAP && !reply->is_empty()) {
        dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid
                << " dne, non-empty reply, stopping" << dendl;
        break;
      }

      // send null dentry
      dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
              << *curdir << dendl;
      if (snapid == CEPH_NOSNAP)
        dn = curdir->add_null_dentry(dis->get_dentry(i));
      else
        dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid);
    }
    ceph_assert(dn);

    // don't add replica to purging dentry/inode
    if (dn->state_test(CDentry::STATE_PURGING)) {
      if (reply->is_empty())
        reply->set_flag_error_dn(dis->get_dentry(i));
      break;
    }

    CDentry::linkage_t *dnl = dn->get_linkage();

    // xlocked dentry?
    //  ...always block on non-tail items (they are unrelated)
    //  ...allow xlocked tail discovery _only_ if explicitly requested
    if (dn->lock.is_xlocked()) {
      // is this the last (tail) item in the discover traversal?
      if (dis->is_path_locked()) {
        dout(7) << "handle_discover allowing discovery of xlocked " << *dn << dendl;
      } else if (reply->is_empty()) {
        dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
        dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
        return;
      } else {
        dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
        break;
      }
    }

    // frozen inode?
    bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
    if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
      if (tailitem && dis->is_path_locked()) {
        dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
      } else if (reply->is_empty()) {
        dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
        dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
        return;
      } else {
        dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
        break;
      }
    }

    // add dentry
    if (!reply->trace.length())
      reply->starts_with = MDiscoverReply::DENTRY;
    encode_replica_dentry(dn, from, reply->trace);
    dout(7) << "handle_discover added dentry " << *dn << dendl;

    if (!dnl->is_primary()) break; // stop on null or remote link.

    // add inode
    CInode *next = dnl->get_inode();
    ceph_assert(next->is_auth());

    encode_replica_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
    dout(7) << "handle_discover added inode " << *next << dendl;

    // descend, keep going.
    cur = next;
    continue;
  }

  // how did we do?
  ceph_assert(!reply->is_empty());
  dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
  mds->send_message(reply, dis->get_connection());
}
10482
// Process an MDiscoverReply: decode the ([dir] dentry inode)* trace into our
// cache, wake waiters attached to the objects we received, and — on errors or
// auth hints — either retry the discover at the hinted rank or fail the
// waiters with -ENOENT.
void MDCache::handle_discover_reply(const cref_t<MDiscoverReply> &m)
{
  /*
  if (mds->get_state() < MDSMap::STATE_ACTIVE) {
    dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
    return;
  }
  */
  dout(7) << "discover_reply " << *m << dendl;
  if (m->is_flag_error_dir())
    dout(7) << " flag error, dir" << dendl;
  if (m->is_flag_error_dn())
    dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;

  // 'finished' is queued normally; 'error' is completed with -ENOENT
  MDSContext::vec finished, error;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  // starting point
  CInode *cur = get_inode(m->get_base_ino());
  auto p = m->trace.cbegin();

  int next = m->starts_with;

  // decrement discover counters
  if (m->get_tid()) {
    // NOTE: this iterator intentionally shadows the bufferlist iterator 'p'
    // within this scope only.
    map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
    if (p != discovers.end()) {
      dout(10) << " found tid " << m->get_tid() << dendl;
      discovers.erase(p);
    } else {
      dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
    }
  }

  // discover may start with an inode
  if (!p.end() && next == MDiscoverReply::INODE) {
    decode_replica_inode(cur, p, NULL, finished);
    dout(7) << "discover_reply got base inode " << *cur << dendl;
    ceph_assert(cur->is_base());

    next = MDiscoverReply::DIR;

    // take waiters?
    if (cur->is_base() &&
        waiting_for_base_ino[from].count(cur->ino())) {
      finished.swap(waiting_for_base_ino[from][cur->ino()]);
      waiting_for_base_ino[from].erase(cur->ino());
    }
  }
  ceph_assert(cur);

  // loop over discover results.
  // indexes follow each ([[dir] dentry] inode)
  // can start, end with any type.
  while (!p.end()) {
    // dir
    frag_t fg;
    CDir *curdir = nullptr;
    if (next == MDiscoverReply::DIR) {
      decode_replica_dir(curdir, p, cur, mds_rank_t(m->get_source().num()), finished);
      // the sender may have corrected our stale base frag; move those waiters
      if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
        ceph_assert(m->get_wanted_base_dir());
        cur->take_dir_waiting(m->get_base_dir_frag(), finished);
      }
    } else {
      // note: this can only happen our first way around this loop.
      if (p.end() && m->is_flag_error_dn()) {
        fg = cur->pick_dirfrag(m->get_error_dentry());
        curdir = cur->get_dirfrag(fg);
      } else
        curdir = cur->get_dirfrag(m->get_base_dir_frag());
    }

    if (p.end())
      break;

    // dentry
    CDentry *dn = nullptr;
    decode_replica_dentry(dn, p, curdir, finished);

    if (p.end())
      break;

    // inode
    decode_replica_inode(cur, p, dn, finished);

    next = MDiscoverReply::DIR;
  }

  // dir error?
  // or dir_auth hint?
  if (m->is_flag_error_dir() && !cur->is_dir()) {
    // not a dir.
    cur->take_waiting(CInode::WAIT_DIR, error);
  } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
    mds_rank_t who = m->get_dir_auth_hint();
    if (who == mds->get_nodeid()) who = -1;
    if (who >= 0)
      dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;


    if (m->get_wanted_base_dir()) {
      frag_t fg = m->get_base_dir_frag();
      CDir *dir = cur->get_dirfrag(fg);

      if (cur->is_waiting_for_dir(fg)) {
        if (cur->is_auth())
          cur->take_waiting(CInode::WAIT_DIR, finished);
        else if (dir || !cur->dirfragtree.is_leaf(fg))
          cur->take_dir_waiting(fg, finished);
        else
          // still missing: retry against the hinted auth
          discover_dir_frag(cur, fg, 0, who);
      } else
        dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
    }

    // try again?
    if (m->get_error_dentry().length()) {
      frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
      CDir *dir = cur->get_dirfrag(fg);
      // wanted a dentry
      if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
        if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
          dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
                                   m->get_wanted_snapid(), finished);
        } else {
          // re-issue the discover for the remaining path component
          filepath relpath(m->get_error_dentry(), 0);
          discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->is_path_locked());
        }
      } else
        dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
                << m->get_error_dentry() << dendl;
    }
  } else if (m->is_flag_error_dn()) {
    // dentry-level error (e.g. purging dentry): fail the dentry waiters
    frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
    CDir *dir = cur->get_dirfrag(fg);
    if (dir) {
      if (dir->is_auth()) {
        dir->take_sub_waiting(finished);
      } else {
        dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
                                 m->get_wanted_snapid(), error);
      }
    }
  }

  // waiters
  finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly
  mds->queue_waiters(finished);
}
10633
10634
10635
10636 // ----------------------------
10637 // REPLICAS
10638
10639
// Encode a replica of 'dir' for rank 'to', registering the replica and its
// nonce.  Field order is the wire format consumed by decode_replica_dir().
void MDCache::encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
{
  ENCODE_START(1, 1, bl);
  dirfrag_t df = dir->dirfrag();
  encode(df, bl);
  __u32 nonce = dir->add_replica(to);
  encode(nonce, bl);
  dir->_encode_base(bl);
  ENCODE_FINISH(bl);
}
10650
// Encode a replica of 'dn' for rank 'to', registering the replica and its
// nonce.  Field order is the wire format consumed by decode_replica_dentry().
void MDCache::encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
{
  ENCODE_START(1, 1, bl);
  encode(dn->get_name(), bl);
  encode(dn->last, bl);

  __u32 nonce = dn->add_replica(to);
  encode(nonce, bl);
  encode(dn->first, bl);
  // remote linkage (zero ino for null/primary dentries)
  encode(dn->linkage.remote_ino, bl);
  encode(dn->linkage.remote_d_type, bl);
  dn->lock.encode_state_for_replica(bl);
  // replica should mark the lock for recovery if we are not yet active
  bool need_recover = mds->get_state() < MDSMap::STATE_ACTIVE;
  encode(need_recover, bl);
  ENCODE_FINISH(bl);
}
10667
// Encode a replica of auth inode 'in' for rank 'to', registering the replica
// and its nonce.  Field order is the wire format consumed by
// decode_replica_inode().
void MDCache::encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
                                   uint64_t features)
{
  ENCODE_START(1, 1, bl);
  ceph_assert(in->is_auth());
  encode(in->inode.ino, bl);  // bleh, minor asymmetry here
  encode(in->last, bl);

  __u32 nonce = in->add_replica(to);
  encode(nonce, bl);

  in->_encode_base(bl, features);
  in->_encode_locks_state_for_replica(bl, mds->get_state() < MDSMap::STATE_ACTIVE);
  ENCODE_FINISH(bl);
}
10683
// Decode a replicated dirfrag under 'diri' that was sent by rank 'from'.
// On success 'dir' points to the (new or refreshed) replica; waiters on the
// frag are moved into 'finished'.
void MDCache::decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from,
                                 MDSContext::vec& finished)
{
  DECODE_START(1, p);
  dirfrag_t df;
  decode(df, p);

  ceph_assert(diri->ino() == df.ino);

  // add it (_replica_)
  dir = diri->get_dirfrag(df.frag);

  if (dir) {
    // had replica. update w/ new nonce.
    __u32 nonce;
    decode(nonce, p);
    dir->set_replica_nonce(nonce);
    dir->_decode_base(p);
    dout(7) << __func__ << " had " << *dir << " nonce " << dir->replica_nonce << dendl;
  } else {
    // force frag to leaf in the diri tree
    if (!diri->dirfragtree.is_leaf(df.frag)) {
      dout(7) << __func__ << " forcing frag " << df.frag << " to leaf in the fragtree "
              << diri->dirfragtree << dendl;
      diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
    }
    // add replica.
    dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
    __u32 nonce;
    decode(nonce, p);
    dir->set_replica_nonce(nonce);
    dir->_decode_base(p);
    // is this a dir_auth delegation boundary?
    if (from != diri->authority().first ||
        diri->is_ambiguous_auth() ||
        diri->is_base())
      adjust_subtree_auth(dir, from);

    dout(7) << __func__ << " added " << *dir << " nonce " << dir->replica_nonce << dendl;
    // get waiters
    diri->take_dir_waiting(df.frag, finished);
  }
  DECODE_FINISH(p);
}
10728
// Decode a replicated dentry in 'dir'.  On success 'dn' points to the (new or
// refreshed) replica; waiters on the name/snap range are moved into 'finished'.
void MDCache::decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished)
{
  DECODE_START(1, p);
  string name;
  snapid_t last;
  decode(name, p);
  decode(last, p);

  dn = dir->lookup(name, last);

  // have it?
  bool is_new = false;
  if (dn) {
    is_new = false;
    dout(7) << __func__ << " had " << *dn << dendl;
  } else {
    is_new = true;
    dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
    dout(7) << __func__ << " added " << *dn << dendl;
  }

  __u32 nonce;
  decode(nonce, p);
  dn->set_replica_nonce(nonce);
  decode(dn->first, p);

  // remote linkage (zero rino means null/primary)
  inodeno_t rino;
  unsigned char rdtype;
  decode(rino, p);
  decode(rdtype, p);
  dn->lock.decode_state(p, is_new);

  bool need_recover;
  decode(need_recover, p);

  if (is_new) {
    if (rino)
      dir->link_remote_inode(dn, rino, rdtype);
    if (need_recover)
      dn->lock.mark_need_recover();
  }

  dir->take_dentry_waiting(name, dn->first, dn->last, finished);
  DECODE_FINISH(p);
}
10774
// Decode a replicated inode.  If 'dn' is non-null and the inode is new, the
// inode is linked as the dentry's primary; waiters are moved into 'finished'
// (via add_inode side effects upstream — none are taken here directly).
void MDCache::decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
{
  DECODE_START(1, p);
  inodeno_t ino;
  snapid_t last;
  __u32 nonce;
  decode(ino, p);
  decode(last, p);
  decode(nonce, p);
  in = get_inode(ino, last);
  if (!in) {
    // new replica
    in = new CInode(this, false, 1, last);
    in->set_replica_nonce(nonce);
    in->_decode_base(p);
    in->_decode_locks_state_for_replica(p, true);
    add_inode(in);
    // base inodes have well-known auth ranks
    if (in->ino() == MDS_INO_ROOT)
      in->inode_auth.first = 0;
    else if (in->is_mdsdir())
      in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
    dout(10) << __func__ << " added " << *in << dendl;
    if (dn) {
      ceph_assert(dn->get_linkage()->is_null());
      dn->dir->link_primary_inode(dn, in);
    }
  } else {
    // existing replica: refresh nonce, base fields and lock state
    in->set_replica_nonce(nonce);
    in->_decode_base(p);
    in->_decode_locks_state_for_replica(p, false);
    dout(10) << __func__ << " had " << *in << dendl;
  }

  if (dn) {
    if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
      dout(10) << __func__ << " different linkage in dentry " << *dn << dendl;
  }
  DECODE_FINISH(p);
}
10813
10814
/*
 * Encode everything a peer needs to replicate 'straydn': the chain
 * myin -> mdsdir frag -> straydir dentry -> straydir inode -> stray
 * dirfrag -> stray dentry.
 *
 * The encode order here must exactly mirror decode_replica_stray().
 */
void MDCache::encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
{
  ENCODE_START(1, 1, bl);
  uint64_t features = mds->mdsmap->get_up_features();
  encode_replica_inode(get_myin(), who, bl, features);
  encode_replica_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
  encode_replica_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
  encode_replica_inode(straydn->get_dir()->inode, who, bl, features);
  encode_replica_dir(straydn->get_dir(), who, bl);
  encode_replica_dentry(straydn, who, bl);
  ENCODE_FINISH(bl);
}
10827
/*
 * Decode a stray dentry (and the whole ancestry chain needed to hold it)
 * sent by rank 'from'. Must decode in exactly the order produced by
 * encode_replica_stray(): mdsdir inode, mdsdir frag, straydir dentry,
 * straydir inode, stray dirfrag, stray dentry.
 */
void MDCache::decode_replica_stray(CDentry *&straydn, const bufferlist &bl, mds_rank_t from)
{
  MDSContext::vec finished;
  auto p = bl.cbegin();

  DECODE_START(1, p);
  CInode *mdsin = nullptr;
  decode_replica_inode(mdsin, p, NULL, finished);
  CDir *mdsdir = nullptr;
  decode_replica_dir(mdsdir, p, mdsin, from, finished);
  CDentry *straydirdn = nullptr;
  decode_replica_dentry(straydirdn, p, mdsdir, finished);
  CInode *strayin = nullptr;
  decode_replica_inode(strayin, p, straydirdn, finished);
  CDir *straydir = nullptr;
  decode_replica_dir(straydir, p, strayin, from, finished);

  decode_replica_dentry(straydn, p, straydir, finished);
  // wake anyone who was waiting for these objects to appear
  if (!finished.empty())
    mds->queue_waiters(finished);
  DECODE_FINISH(p);
}
10850
10851
10852 int MDCache::send_dir_updates(CDir *dir, bool bcast)
10853 {
10854 // this is an FYI, re: replication
10855
10856 set<mds_rank_t> who;
10857 if (bcast) {
10858 mds->get_mds_map()->get_active_mds_set(who);
10859 } else {
10860 for (const auto &p : dir->get_replicas()) {
10861 who.insert(p.first);
10862 }
10863 }
10864
10865 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10866
10867 filepath path;
10868 dir->inode->make_path(path);
10869
10870 mds_rank_t whoami = mds->get_nodeid();
10871 for (set<mds_rank_t>::iterator it = who.begin();
10872 it != who.end();
10873 ++it) {
10874 if (*it == whoami) continue;
10875 //if (*it == except) continue;
10876 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10877
10878 std::set<int32_t> s;
10879 for (const auto &r : dir->dir_rep_by) {
10880 s.insert(r);
10881 }
10882 mds->send_message_mds(make_message<MDirUpdate>(mds->get_nodeid(), dir->dirfrag(), dir->dir_rep, s, path, bcast), *it);
10883 }
10884
10885 return 0;
10886 }
10887
/*
 * Handle an advisory MDirUpdate from a peer. If we don't have the
 * dirfrag we may (once) try to discover it; if we do have it and did
 * not just discover it, refresh its replication hints.
 */
void MDCache::handle_dir_update(const cref_t<MDirUpdate> &m)
{
  dirfrag_t df = m->get_dirfrag();
  CDir *dir = get_dirfrag(df);
  if (!dir) {
    dout(5) << "dir_update on " << df << ", don't have it" << dendl;

    // discover it?
    if (m->should_discover()) {
      // only try once!
      // this is key to avoid a fragtree update race, among other things.
      m->inc_tried_discover();
      vector<CDentry*> trace;
      CInode *in;
      filepath path = m->get_path();
      dout(5) << "trying discover on dir_update for " << path << dendl;
      CF_MDS_RetryMessageFactory cf(mds, m);
      MDRequestRef null_ref;
      int r = path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, &trace, &in);
      if (r > 0)
        // traversal in progress; message will be retried by the factory
        return;
      if (r == 0 &&
          in->ino() == df.ino &&
          in->get_approx_dirfrag(df.frag) == NULL) {
        // we found the inode but not the frag; open it and retry
        open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    return;
  }

  if (!m->has_tried_discover()) {
    // Update if it already exists. Otherwise it got updated by discover reply.
    dout(5) << "dir_update on " << *dir << dendl;
    dir->dir_rep = m->get_dir_rep();
    dir->dir_rep_by.clear();
    for (const auto &e : m->get_dir_rep_by()) {
      dir->dir_rep_by.insert(e);
    }
  }
}
10930
10931
10932
10933
10934
10935 // LINK
10936
10937 void MDCache::encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl)
10938 {
10939 ENCODE_START(1, 1, bl);
10940 inodeno_t ino = dnl->get_remote_ino();
10941 encode(ino, bl);
10942 __u8 d_type = dnl->get_remote_d_type();
10943 encode(d_type, bl);
10944 ENCODE_FINISH(bl);
10945 }
10946
10947 void MDCache::decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p)
10948 {
10949 DECODE_START(1, p);
10950 inodeno_t ino;
10951 __u8 d_type;
10952 decode(ino, p);
10953 decode(d_type, p);
10954 dout(10) << __func__ << " remote " << ino << " " << d_type << dendl;
10955 dir->link_remote_inode(dn, ino, d_type);
10956 DECODE_FINISH(p);
10957 }
10958
10959 void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
10960 {
10961 dout(7) << __func__ << " " << *dn << dendl;
10962
10963 CDir *subtree = get_subtree_root(dn->get_dir());
10964 for (const auto &p : dn->get_replicas()) {
10965 // don't tell (rename) witnesses; they already know
10966 if (mdr.get() && mdr->more()->witnessed.count(p.first))
10967 continue;
10968 if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
10969 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
10970 rejoin_gather.count(p.first)))
10971 continue;
10972 CDentry::linkage_t *dnl = dn->get_linkage();
10973 auto m = make_message<MDentryLink>(subtree->dirfrag(), dn->get_dir()->dirfrag(), dn->get_name(), dnl->is_primary());
10974 if (dnl->is_primary()) {
10975 dout(10) << __func__ << " primary " << *dnl->get_inode() << dendl;
10976 encode_replica_inode(dnl->get_inode(), p.first, m->bl,
10977 mds->mdsmap->get_up_features());
10978 } else if (dnl->is_remote()) {
10979 encode_remote_dentry_link(dnl, m->bl);
10980 } else
10981 ceph_abort(); // aie, bad caller!
10982 mds->send_message_mds(m, p.first);
10983 }
10984 }
10985
/*
 * Handle an MDentryLink from the auth: decode the new (primary or
 * remote) linkage into our replica dentry. The dentry is expected to
 * exist and be null-linked; otherwise we abort, since the auth should
 * never send a link for a dentry we don't replicate.
 */
void MDCache::handle_dentry_link(const cref_t<MDentryLink> &m)
{
  CDentry *dn = NULL;
  CDir *dir = get_dirfrag(m->get_dirfrag());
  if (!dir) {
    dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
  } else {
    dn = dir->lookup(m->get_dn());
    if (!dn) {
      dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
    } else {
      dout(7) << __func__ << " on " << *dn << dendl;
      CDentry::linkage_t *dnl = dn->get_linkage();

      // must be a null replica dentry; auth tracks our replicas
      ceph_assert(!dn->is_auth());
      ceph_assert(dnl->is_null());
    }
  }

  auto p = m->bl.cbegin();
  MDSContext::vec finished;
  if (dn) {
    if (m->get_is_primary()) {
      // primary link.
      CInode *in = nullptr;
      decode_replica_inode(in, p, dn, finished);
    } else {
      // remote link, easy enough.
      decode_remote_dentry_link(dir, dn, p);
    }
  } else {
    // auth sent us a link for a dentry we don't have: protocol violation
    ceph_abort();
  }

  if (!finished.empty())
    mds->queue_waiters(finished);

  return;
}
11025
11026
11027 // UNLINK
11028
11029 void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
11030 {
11031 dout(10) << __func__ << " " << *dn << dendl;
11032 // share unlink news with replicas
11033 set<mds_rank_t> replicas;
11034 dn->list_replicas(replicas);
11035 bufferlist snapbl;
11036 if (straydn) {
11037 straydn->list_replicas(replicas);
11038 CInode *strayin = straydn->get_linkage()->get_inode();
11039 strayin->encode_snap_blob(snapbl);
11040 }
11041 for (set<mds_rank_t>::iterator it = replicas.begin();
11042 it != replicas.end();
11043 ++it) {
11044 // don't tell (rmdir) witnesses; they already know
11045 if (mdr.get() && mdr->more()->witnessed.count(*it))
11046 continue;
11047
11048 if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
11049 (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
11050 rejoin_gather.count(*it)))
11051 continue;
11052
11053 auto unlink = make_message<MDentryUnlink>(dn->get_dir()->dirfrag(), dn->get_name());
11054 if (straydn) {
11055 encode_replica_stray(straydn, *it, unlink->straybl);
11056 unlink->snapbl = snapbl;
11057 }
11058 mds->send_message_mds(unlink, *it);
11059 }
11060 }
11061
/*
 * Handle an MDentryUnlink from the auth. For a primary linkage the
 * inode is relinked under the (freshly decoded) stray dentry; for a
 * remote linkage the dentry is simply unlinked. Any stray dentry we
 * decoded but did not use is trimmed again immediately.
 */
void MDCache::handle_dentry_unlink(const cref_t<MDentryUnlink> &m)
{
  // straydn
  CDentry *straydn = nullptr;
  if (m->straybl.length())
    decode_replica_stray(straydn, m->straybl, mds_rank_t(m->get_source().num()));

  CDir *dir = get_dirfrag(m->get_dirfrag());
  if (!dir) {
    dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
  } else {
    CDentry *dn = dir->lookup(m->get_dn());
    if (!dn) {
      dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
    } else {
      dout(7) << __func__ << " on " << *dn << dendl;
      CDentry::linkage_t *dnl = dn->get_linkage();

      // open inode?
      if (dnl->is_primary()) {
        // primary: move the inode to the stray dentry
        CInode *in = dnl->get_inode();
        dn->dir->unlink_inode(dn);
        ceph_assert(straydn);
        straydn->dir->link_primary_inode(straydn, in);

        // in->first is lazily updated on replica; drag it forward so
        // that we always keep it in sync with the dnq
        ceph_assert(straydn->first >= in->first);
        in->first = straydn->first;

        // update subtree map?
        if (in->is_dir())
          adjust_subtree_after_rename(in, dir, false);

        if (m->snapbl.length()) {
          // the stray inode carries a snaprealm; decode it and notify
          // clients if the realm is new to us
          bool hadrealm = (in->snaprealm ? true : false);
          in->decode_snap_blob(m->snapbl);
          ceph_assert(in->snaprealm);
          ceph_assert(in->snaprealm->have_past_parents_open());
          if (!hadrealm)
            do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
        }

        // send caps to auth (if we're not already)
        if (in->is_any_caps() &&
            !in->state_test(CInode::STATE_EXPORTINGCAPS))
          migrator->export_caps(in);

        // stray dentry consumed; don't trim it below
        straydn = NULL;
      } else {
        // remote linkage: no stray involved, just drop the link
        ceph_assert(!straydn);
        ceph_assert(dnl->is_remote());
        dn->dir->unlink_inode(dn);
      }
      ceph_assert(dnl->is_null());
    }
  }

  // race with trim_dentry()
  if (straydn) {
    // unused stray replica: trim it right away so it doesn't linger
    ceph_assert(straydn->get_num_ref() == 0);
    ceph_assert(straydn->get_linkage()->is_null());
    expiremap ex;
    trim_dentry(straydn, ex);
    send_expire_messages(ex);
  }
}
11129
11130
11131
11132
11133
11134
11135 // ===================================================================
11136
11137
11138
11139 // ===================================================================
11140 // FRAGMENT
11141
11142
11143 /**
11144 * adjust_dir_fragments -- adjust fragmentation for a directory
11145 *
11146 * @param diri directory inode
11147 * @param basefrag base fragment
11148 * @param bits bit adjustment. positive for split, negative for merge.
11149 */
11150 void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
11151 std::vector<CDir*>* resultfrags,
11152 MDSContext::vec& waiters,
11153 bool replay)
11154 {
11155 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
11156 << " on " << *diri << dendl;
11157
11158 auto&& p = diri->get_dirfrags_under(basefrag);
11159
11160 adjust_dir_fragments(diri, p.second, basefrag, bits, resultfrags, waiters, replay);
11161 }
11162
/*
 * Materialize the dirfrag 'fg' of 'diri', creating it by splitting an
 * ancestor frag or merging descendant frags as needed. Returns the
 * dirfrag, or nullptr if nothing could produce it.
 */
CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
{
  CDir *dir = diri->get_dirfrag(fg);
  if (dir)
    return dir;

  dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;

  std::vector<CDir*> src, result;
  MDSContext::vec waiters;

  // split a parent?
  // walk up the fragtree from fg looking for an open ancestor frag we
  // can split down to fg
  frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
  while (1) {
    CDir *pdir = diri->get_dirfrag(parent);
    if (pdir) {
      int split = fg.bits() - parent.bits();
      dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
      src.push_back(pdir);
      adjust_dir_fragments(diri, src, parent, split, &result, waiters, replay);
      dir = diri->get_dirfrag(fg);
      if (dir) {
        dout(10) << "force_dir_fragment result " << *dir << dendl;
        break;
      }
    }
    if (parent == frag_t())
      break;  // reached the root frag without success
    frag_t last = parent;
    parent = parent.parent();
    dout(10) << " " << last << " parent is " << parent << dendl;
  }

  if (!dir) {
    // hoover up things under fg?
    // no splittable ancestor; try merging any open frags beneath fg
    {
      auto&& p = diri->get_dirfrags_under(fg);
      src.insert(std::end(src), std::cbegin(p.second), std::cend(p.second));
    }
    if (src.empty()) {
      dout(10) << "force_dir_fragment no frags under " << fg << dendl;
    } else {
      dout(10) << " will combine frags under " << fg << ": " << src << dendl;
      adjust_dir_fragments(diri, src, fg, 0, &result, waiters, replay);
      dir = result.front();
      dout(10) << "force_dir_fragment result " << *dir << dendl;
    }
  }
  if (!replay)
    mds->queue_waiters(waiters);
  return dir;
}
11215
/*
 * Core refragmentation: replace 'srcfrags' of 'diri' with the frags
 * implied by (basefrag, bits), appending the new frags to resultfrags.
 * bits > 0 splits the single srcfrag; bits <= 0 merges the srcfrags
 * into one. The subtree map is kept consistent throughout.
 */
void MDCache::adjust_dir_fragments(CInode *diri,
				   const std::vector<CDir*>& srcfrags,
				   frag_t basefrag, int bits,
				   std::vector<CDir*>* resultfrags,
				   MDSContext::vec& waiters,
				   bool replay)
{
  dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
	   << " srcfrags " << srcfrags
	   << " on " << *diri << dendl;

  // adjust fragtree
  // yuck. we may have discovered the inode while it was being fragmented.
  if (!diri->dirfragtree.is_leaf(basefrag))
    diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);

  if (bits > 0)
    diri->dirfragtree.split(basefrag, bits);
  dout(10) << " new fragtree is " << diri->dirfragtree << dendl;

  if (srcfrags.empty())
    return;

  // split
  CDir *parent_dir = diri->get_parent_dir();
  CDir *parent_subtree = 0;
  if (parent_dir)
    parent_subtree = get_subtree_root(parent_dir);

  ceph_assert(srcfrags.size() >= 1);
  if (bits > 0) {
    // SPLIT
    ceph_assert(srcfrags.size() == 1);
    CDir *dir = srcfrags.front();

    dir->split(bits, resultfrags, waiters, replay);

    // did i change the subtree map?
    if (dir->is_subtree_root()) {
      // new frags are now separate subtrees
      for (const auto& dir : *resultfrags) {
	subtrees[dir].clear();   // new frag is now its own subtree
      }

      // was i a bound?
      // replace the old frag with the new frags in the parent's bounds
      if (parent_subtree) {
	ceph_assert(subtrees[parent_subtree].count(dir));
	subtrees[parent_subtree].erase(dir);
	for (const auto& dir : *resultfrags) {
	  ceph_assert(dir->is_subtree_root());
	  subtrees[parent_subtree].insert(dir);
	}
      }

      // adjust my bounds.
      // redistribute the old frag's bounds to whichever new frag now
      // contains each bound's parent
      set<CDir*> bounds;
      bounds.swap(subtrees[dir]);
      subtrees.erase(dir);
      for (set<CDir*>::iterator p = bounds.begin();
	   p != bounds.end();
	   ++p) {
	CDir *frag = get_subtree_root((*p)->get_parent_dir());
	subtrees[frag].insert(*p);
      }

      show_subtrees(10);
    }

    diri->close_dirfrag(dir->get_frag());

  } else {
    // MERGE

    // are my constituent bits subtrees?  if so, i will be too.
    // (it's all or none, actually.)
    bool any_subtree = false, any_non_subtree = false;
    for (const auto& dir : srcfrags) {
      if (dir->is_subtree_root())
	any_subtree = true;
      else
	any_non_subtree = true;
    }
    ceph_assert(!any_subtree || !any_non_subtree);

    set<CDir*> new_bounds;
    if (any_subtree) {
      for (const auto& dir : srcfrags) {
	// this simplifies the code that find subtrees underneath the dirfrag
	if (!dir->is_subtree_root()) {
	  dir->state_set(CDir::STATE_AUXSUBTREE);
	  adjust_subtree_auth(dir, mds->get_nodeid());
	}
      }

      // collect all srcfrags' bounds into new_bounds and drop the
      // srcfrags from the subtree map (and from the parent's bounds)
      for (const auto& dir : srcfrags) {
	ceph_assert(dir->is_subtree_root());
	dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
	map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
	set<CDir*>::iterator r = q->second.begin();
	while (r != subtrees[dir].end()) {
	  new_bounds.insert(*r);
	  subtrees[dir].erase(r++);
	}
	subtrees.erase(q);

	// remove myself as my parent's bound
	if (parent_subtree)
	  subtrees[parent_subtree].erase(dir);
      }
    }

    // merge
    CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
    f->merge(srcfrags, waiters, replay);

    if (any_subtree) {
      // the merged frag inherits all the collected bounds
      ceph_assert(f->is_subtree_root());
      subtrees[f].swap(new_bounds);
      if (parent_subtree)
	subtrees[parent_subtree].insert(f);

      show_subtrees(10);
    }

    resultfrags->push_back(f);
  }
}
11343
11344
11345 class C_MDC_FragmentFrozen : public MDSInternalContext {
11346 MDCache *mdcache;
11347 MDRequestRef mdr;
11348 public:
11349 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
11350 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
11351 void finish(int r) override {
11352 mdcache->fragment_frozen(mdr, r);
11353 }
11354 };
11355
11356 bool MDCache::can_fragment(CInode *diri, const std::vector<CDir*>& dirs)
11357 {
11358 if (is_readonly()) {
11359 dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
11360 return false;
11361 }
11362 if (mds->is_cluster_degraded()) {
11363 dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
11364 return false;
11365 }
11366 if (diri->get_parent_dir() &&
11367 diri->get_parent_dir()->get_inode()->is_stray()) {
11368 dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
11369 return false;
11370 }
11371 if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
11372 dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
11373 return false;
11374 }
11375
11376 if (diri->scrub_is_in_progress()) {
11377 dout(7) << "can_fragment: scrub in progress" << dendl;
11378 return false;
11379 }
11380
11381 for (const auto& dir : dirs) {
11382 if (dir->state_test(CDir::STATE_FRAGMENTING)) {
11383 dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
11384 return false;
11385 }
11386 if (!dir->is_auth()) {
11387 dout(7) << "can_fragment: not auth on " << *dir << dendl;
11388 return false;
11389 }
11390 if (dir->is_bad()) {
11391 dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
11392 return false;
11393 }
11394 if (dir->is_frozen() ||
11395 dir->is_freezing()) {
11396 dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." << dendl;
11397 return false;
11398 }
11399 }
11400
11401 return true;
11402 }
11403
/*
 * Start an asynchronous split of 'dir' into 2^bits child frags.
 * Registers a fragment_info_t keyed by the base dirfrag, freezes the
 * dir, then kicks off the mark+complete phase. Drops the request
 * silently if fragmentation isn't currently possible.
 */
void MDCache::split_dir(CDir *dir, int bits)
{
  dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
  ceph_assert(dir->is_auth());
  CInode *diri = dir->inode;

  std::vector<CDir*> dirs;
  dirs.push_back(dir);

  if (!can_fragment(diri, dirs)) {
    dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
    return;
  }

  // frag_t cannot represent more than 24 bits of fragmentation depth
  if (dir->frag.bits() + bits > 24) {
    dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
    return;
  }

  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
  mdr->more()->fragment_base = dir->dirfrag();

  // track the in-flight operation; fragment_* callbacks look this up
  ceph_assert(fragments.count(dir->dirfrag()) == 0);
  fragment_info_t& info = fragments[dir->dirfrag()];
  info.mdr = mdr;
  info.dirs.push_back(dir);
  info.bits = bits;
  info.last_cum_auth_pins_change = ceph_clock_now();

  fragment_freeze_dirs(dirs);
  // initial mark+complete pass
  fragment_mark_and_complete(mdr);
}
11437
/*
 * Start an asynchronous merge of all frags of 'diri' under 'frag' into
 * a single dirfrag. Requires every constituent frag to be open in
 * cache; otherwise the request is dropped.
 */
void MDCache::merge_dir(CInode *diri, frag_t frag)
{
  dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;

  auto&& [all, dirs] = diri->get_dirfrags_under(frag);
  if (!all) {
    dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
    return;
  }

  if (diri->dirfragtree.is_leaf(frag)) {
    // nothing to merge
    dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
    return;
  }

  if (!can_fragment(diri, dirs))
    return;

  // bits is derived from the first frag; all frags under a common base
  // share the same depth
  CDir *first = dirs.front();
  int bits = first->get_frag().bits() - frag.bits();
  dout(10) << " we are merging by " << bits << " bits" << dendl;

  dirfrag_t basedirfrag(diri->ino(), frag);
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
  mdr->more()->fragment_base = basedirfrag;

  // track the in-flight operation; negative bits means merge
  ceph_assert(fragments.count(basedirfrag) == 0);
  fragment_info_t& info = fragments[basedirfrag];
  info.mdr = mdr;
  info.dirs = dirs;
  info.bits = -bits;
  info.last_cum_auth_pins_change = ceph_clock_now();

  fragment_freeze_dirs(dirs);
  // initial mark+complete pass
  fragment_mark_and_complete(mdr);
}
11475
/*
 * Begin freezing every dirfrag involved in a fragment op. Each dir is
 * auth-pinned (released later by the mark pass) and flagged
 * STATE_FRAGMENTING. If the set mixes subtree roots with non-roots,
 * the non-roots are promoted to aux subtrees so the whole set is
 * uniform for the later subtree-map adjustment.
 */
void MDCache::fragment_freeze_dirs(const std::vector<CDir*>& dirs)
{
  bool any_subtree = false, any_non_subtree = false;
  for (const auto& dir : dirs) {
    dir->auth_pin(dir);  // until we mark and complete them
    dir->state_set(CDir::STATE_FRAGMENTING);
    dir->freeze_dir();
    ceph_assert(dir->is_freezing_dir());

    if (dir->is_subtree_root())
      any_subtree = true;
    else
      any_non_subtree = true;
  }

  if (any_subtree && any_non_subtree) {
    // either all dirfrags are subtree roots or all are not.
    for (const auto& dir : dirs) {
      if (dir->is_subtree_root()) {
	ceph_assert(dir->state_test(CDir::STATE_AUXSUBTREE));
      } else {
	// promote to aux subtree root under our own authority
	dir->state_set(CDir::STATE_AUXSUBTREE);
	adjust_subtree_auth(dir, mds->get_nodeid());
      }
    }
  }
}
11503
11504 class C_MDC_FragmentMarking : public MDCacheContext {
11505 MDRequestRef mdr;
11506 public:
11507 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11508 void finish(int r) override {
11509 mdcache->fragment_mark_and_complete(mdr);
11510 }
11511 };
11512
/*
 * Second phase of a fragment op: make every dirfrag complete in memory,
 * pin all its dentries (STATE_FRAGMENTING), then wait for the freezes
 * started earlier to finish. Re-entered via C_MDC_FragmentMarking until
 * all dirs are ready; finally calls fragment_frozen() directly or via
 * C_MDC_FragmentFrozen.
 */
void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  if (it == fragments.end() || it->second.mdr != mdr) {
    // the op was cancelled (e.g. by find_stale_fragment_freeze)
    dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
    request_finish(mdr);
    return;
  }

  fragment_info_t& info = it->second;
  CInode *diri = info.dirs.front()->get_inode();
  dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;

  MDSGatherBuilder gather(g_ceph_context);

  for (const auto& dir : info.dirs) {
    bool ready = true;
    if (!dir->is_complete()) {
      dout(15) << " fetching incomplete " << *dir << dendl;
      dir->fetch(gather.new_sub(), true);  // ignore authpinnability
      ready = false;
    } else if (dir->get_frag() == frag_t()) {
      // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
      // the operation. To avoid CDir::fetch() complaining about missing object,
      // we commit new dirfrag first.
      if (dir->state_test(CDir::STATE_CREATING)) {
	dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
	dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
	ready = false;
      } else if (dir->is_new()) {
	dout(15) << " committing new " << *dir << dendl;
	ceph_assert(dir->is_dirty());
	dir->commit(0, gather.new_sub(), true);
	ready = false;
      }
    }
    if (!ready)
      continue;

    if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
      // pin every dentry so nothing is trimmed while we refragment
      dout(15) << " marking " << *dir << dendl;
      for (auto &p : dir->items) {
	CDentry *dn = p.second;
	dn->get(CDentry::PIN_FRAGMENTING);
	ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
	dn->state_set(CDentry::STATE_FRAGMENTING);
      }
      dir->state_set(CDir::STATE_DNPINNEDFRAG);
      // drop the pin taken in fragment_freeze_dirs(); freezing can proceed
      dir->auth_unpin(dir);
    } else {
      dout(15) << " already marked " << *dir << dendl;
    }
  }
  if (gather.has_subs()) {
    // some dirs still fetching/committing; retry after they complete
    gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
    gather.activate();
    return;
  }

  // all dirs marked; now wait for the freezes to finish
  for (const auto& dir : info.dirs) {
    if (!dir->is_frozen_dir()) {
      ceph_assert(dir->is_freezing_dir());
      dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
    }
  }
  if (gather.has_subs()) {
    gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
    gather.activate();
    // flush log so that request auth_pins are retired
    mds->mdlog->flush();
    return;
  }

  // everything already frozen; proceed immediately
  fragment_frozen(mdr, 0);
}
11589
11590 void MDCache::fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs)
11591 {
11592 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
11593 for (const auto& dir : dirs) {
11594 dout(10) << " frag " << *dir << dendl;
11595
11596 ceph_assert(dir->state_test(CDir::STATE_FRAGMENTING));
11597 dir->state_clear(CDir::STATE_FRAGMENTING);
11598
11599 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11600 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11601
11602 for (auto &p : dir->items) {
11603 CDentry *dn = p.second;
11604 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11605 dn->state_clear(CDentry::STATE_FRAGMENTING);
11606 dn->put(CDentry::PIN_FRAGMENTING);
11607 }
11608 } else {
11609 dir->auth_unpin(dir);
11610 }
11611
11612 dir->unfreeze_dir();
11613 }
11614 }
11615
11616 bool MDCache::fragment_are_all_frozen(CDir *dir)
11617 {
11618 ceph_assert(dir->is_frozen_dir());
11619 map<dirfrag_t,fragment_info_t>::iterator p;
11620 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11621 p != fragments.end() && p->first.ino == dir->ino();
11622 ++p) {
11623 if (p->first.frag.contains(dir->get_frag()))
11624 return p->second.all_frozen;
11625 }
11626 ceph_abort();
11627 return false;
11628 }
11629
11630 void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11631 {
11632 map<dirfrag_t,fragment_info_t>::iterator p;
11633 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11634 p != fragments.end() && p->first.ino == dir->ino();
11635 ++p) {
11636 if (p->first.frag.contains(dir->get_frag())) {
11637 p->second.num_remote_waiters++;
11638 return;
11639 }
11640 }
11641 ceph_abort();
11642 }
11643
/*
 * Periodic check for fragment ops whose freeze has stalled: if a dir's
 * cumulative auth pins haven't changed since before the timeout cutoff
 * and something (a remote waiter, or a freezing parent) could deadlock
 * against us, cancel the op and roll the dirs back.
 */
void MDCache::find_stale_fragment_freeze()
{
  dout(10) << "find_stale_fragment_freeze" << dendl;
  // see comment in Migrator::find_stale_export_freeze()
  utime_t now = ceph_clock_now();
  utime_t cutoff = now;
  cutoff -= g_conf()->mds_freeze_tree_timeout;

  for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
       p != fragments.end(); ) {
    dirfrag_t df = p->first;
    fragment_info_t& info = p->second;
    // advance before possibly erasing df below
    ++p;
    if (info.all_frozen)
      continue;
    CDir *dir;
    int total_auth_pins = 0;
    for (const auto& d : info.dirs) {
      dir = d;
      if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
	// mark pass hasn't run yet; pin count not meaningful
	total_auth_pins = -1;
	break;
      }
      if (dir->is_frozen_dir())
	continue;
      total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
    }
    if (total_auth_pins < 0)
      continue;
    if (info.last_cum_auth_pins != total_auth_pins) {
      // pins still draining; remember the new baseline and keep waiting
      info.last_cum_auth_pins = total_auth_pins;
      info.last_cum_auth_pins_change = now;
      continue;
    }
    if (info.last_cum_auth_pins_change >= cutoff)
      continue;
    dir = info.dirs.front();
    if (info.num_remote_waiters > 0 ||
	(!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
      // stalled and likely deadlocked: abort the fragment op
      dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
      std::vector<CDir*> dirs;
      info.dirs.swap(dirs);
      fragments.erase(df);
      fragment_unmark_unfreeze_dirs(dirs);
    }
  }
}
11691
11692 class C_MDC_FragmentPrep : public MDCacheLogContext {
11693 MDRequestRef mdr;
11694 public:
11695 C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
11696 void finish(int r) override {
11697 mdcache->_fragment_logged(mdr);
11698 }
11699 };
11700
11701 class C_MDC_FragmentStore : public MDCacheContext {
11702 MDRequestRef mdr;
11703 public:
11704 C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11705 void finish(int r) override {
11706 mdcache->_fragment_stored(mdr);
11707 }
11708 };
11709
11710 class C_MDC_FragmentCommit : public MDCacheLogContext {
11711 dirfrag_t basedirfrag;
11712 MDRequestRef mdr;
11713 public:
11714 C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, const MDRequestRef& r) :
11715 MDCacheLogContext(m), basedirfrag(df), mdr(r) {}
11716 void finish(int r) override {
11717 mdcache->_fragment_committed(basedirfrag, mdr);
11718 }
11719 };
11720
11721 class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext {
11722 dirfrag_t basedirfrag;
11723 int bits;
11724 MDRequestRef mdr;
11725 public:
11726 C_IO_MDC_FragmentPurgeOld(MDCache *m, dirfrag_t f, int b,
11727 const MDRequestRef& r) :
11728 MDCacheIOContext(m), basedirfrag(f), bits(b), mdr(r) {}
11729 void finish(int r) override {
11730 ceph_assert(r == 0 || r == -ENOENT);
11731 mdcache->_fragment_old_purged(basedirfrag, bits, mdr);
11732 }
11733 void print(ostream& out) const override {
11734 out << "fragment_purge_old(" << basedirfrag << ")";
11735 }
11736 };
11737
/*
 * Called once every dirfrag of the op is frozen (directly or via
 * C_MDC_FragmentFrozen). Marks the op all_frozen and moves on to
 * dispatch_fragment_dir(). Finishes the request if the op was
 * cancelled in the meantime.
 */
void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  if (it == fragments.end() || it->second.mdr != mdr) {
    dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
    request_finish(mdr);
    return;
  }

  // freezing cannot fail
  ceph_assert(r == 0);
  fragment_info_t& info = it->second;
  dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
	   << " on " << info.dirs.front()->get_inode() << dendl;

  info.all_frozen = true;
  dispatch_fragment_dir(mdr);
}
11756
// Drive a frozen fragment (split/merge) operation: acquire the needed
// scatterlocks, journal an EFragment OP_PREPARE entry, and perform the
// in-memory refragmentation. On abort, requeue the split/merge with the
// balancer and unwind.
void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  if (it == fragments.end() || it->second.mdr != mdr) {
    // operation was cancelled out from under us; clean up the request
    dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
    request_finish(mdr);
    return;
  }

  fragment_info_t& info = it->second;
  CInode *diri = info.dirs.front()->get_inode();

  dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
	   << " on " << *diri << dendl;

  // a slave error (from remote MDS participation) aborts the operation
  if (mdr->more()->slave_error)
    mdr->aborted = true;

  if (!mdr->aborted) {
    MutationImpl::LockOpVec lov;
    lov.add_wrlock(&diri->dirfragtreelock);
    // prevent a racing gather on any other scatterlocks too
    lov.lock_scatter_gather(&diri->nestlock);
    lov.lock_scatter_gather(&diri->filelock);
    if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) {
      // not aborted: acquire_locks queued a retry; wait for it
      if (!mdr->aborted)
	return;
    }
  }

  if (mdr->aborted) {
    // requeue the split/merge so the balancer can retry it later,
    // then undo freezing/marking and drop the operation
    dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
	     << info.dirs.front()->dirfrag() << dendl;
    if (info.bits > 0)
      mds->balancer->queue_split(info.dirs.front(), false);
    else
      mds->balancer->queue_merge(info.dirs.front());
    fragment_unmark_unfreeze_dirs(info.dirs);
    fragments.erase(it);
    request_finish(mdr);
    return;
  }

  mdr->ls = mds->mdlog->get_current_segment();
  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
  mds->mdlog->start_entry(le);

  // record the pre-fragment fnodes so the operation can be rolled back
  for (const auto& dir : info.dirs) {
    dirfrag_rollback rollback;
    rollback.fnode = dir->fnode;
    le->add_orig_frag(dir->get_frag(), &rollback);
  }

  // refragment
  MDSContext::vec waiters;
  adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
		       &info.resultfrags, waiters, false);
  if (g_conf()->mds_debug_frag)
    diri->verify_dirfrags();
  mds->queue_waiters(waiters);

  // the original frags must no longer be leaves of the dirfragtree
  for (const auto& fg : le->orig_frags)
    ceph_assert(!diri->dirfragtree.is_leaf(fg));

  le->metablob.add_dir_context(info.resultfrags.front());
  for (const auto& dir : info.resultfrags) {
    if (diri->is_auth()) {
      le->metablob.add_fragmented_dir(dir, false, false);
    } else {
      // non-auth: mark the new frags as carrying dirty dirfragtree state
      dir->state_set(CDir::STATE_DIRTYDFT);
      le->metablob.add_fragmented_dir(dir, false, true);
    }
  }

  // dft lock
  if (diri->is_auth()) {
    // journal dirfragtree
    auto &pi = diri->project_inode();
    pi.inode.version = diri->pre_dirty();
    journal_dirty_inode(mdr.get(), &le->metablob, diri);
  } else {
    // not auth for the inode: just flag the scatterlock dirty so the
    // change propagates to the auth MDS
    mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
    mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
    mdr->add_updated_lock(&diri->dirfragtreelock);
  }

  /*
  // filelock
  mds->locker->mark_updated_scatterlock(&diri->filelock);
  mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
  mut->add_updated_lock(&diri->filelock);

  // dirlock
  mds->locker->mark_updated_scatterlock(&diri->nestlock);
  mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
  mut->add_updated_lock(&diri->nestlock);
  */

  // remember the prepared-but-uncommitted fragment for rollback/replay
  add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
  mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
                                  mdr, __func__);
  mds->mdlog->flush();
}
11861
// Called once the EFragment OP_PREPARE entry is safely journalled.
// Applies the projected inode/scatterlock state, then writes each
// resulting dirfrag out to the metadata pool; the gather completes in
// the C_MDC_FragmentStore callback.
void MDCache::_fragment_logged(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  auto& info = fragments.at(basedirfrag);
  CInode *diri = info.resultfrags.front()->get_inode();

  dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
	   << " on " << *diri << dendl;
  mdr->mark_event("prepare logged");

  if (diri->is_auth())
    diri->pop_and_dirty_projected_inode(mdr->ls);

  mdr->apply();  // mark scatterlock

  // store resulting frags
  MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));

  for (const auto& dir : info.resultfrags) {
    dout(10) << " storing result frag " << *dir << dendl;

    // freeze and store them too
    dir->auth_pin(this);
    dir->state_set(CDir::STATE_FRAGMENTING);
    dir->commit(0, gather.new_sub(), true); // ignore authpinnability
  }

  gather.activate();
}
11891
// Called once all resulting dirfrags have been stored to the metadata
// pool: notify replicas of the new layout, journal EFragment OP_COMMIT,
// and unfreeze the new frags. Lock release is deferred if we must wait
// for notify acks (see the race comment below).
void MDCache::_fragment_stored(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  fragment_info_t &info = fragments.at(basedirfrag);
  CDir *first = info.resultfrags.front();
  CInode *diri = first->get_inode();

  dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
	   << " on " << *diri << dendl;
  mdr->mark_event("new frags stored");

  // tell peers
  // diri_auth is only meaningful when we fragmented a subtree root that we
  // do not own the inode for; otherwise it stays CDIR_AUTH_UNKNOWN.
  mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ?
			  diri->authority().first : CDIR_AUTH_UNKNOWN;
  for (const auto &p : first->get_replicas()) {
    // skip peers that are too early in rejoin to process the notify
    if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
	(mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
	 rejoin_gather.count(p.first)))
      continue;

    auto notify = make_message<MMDSFragmentNotify>(basedirfrag, info.bits, mdr->reqid.tid);
    if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
	diri_auth != p.first) { // not auth mds of diri
      /*
       * In the nornal case, mds does not trim dir inode whose child dirfrags
       * are likely being fragmented (see trim_inode()). But when fragmenting
       * subtree roots, following race can happen:
       *
       * - mds.a (auth mds of dirfrag) sends fragment_notify message to
       *   mds.c and drops wrlock on dirfragtreelock.
       * - mds.b (auth mds of dir inode) changes dirfragtreelock state to
       *   SYNC and send lock message mds.c
       * - mds.c receives the lock message and changes dirfragtreelock state
       *   to SYNC
       * - mds.c trim dirfrag and dir inode from its cache
       * - mds.c receives the fragment_notify message
       *
       * So we need to ensure replicas have received the notify, then unlock
       * the dirfragtreelock.
       */
      notify->mark_ack_wanted();
      info.notify_ack_waiting.insert(p.first);
    }

    // freshly replicate new dirs to peers
    for (const auto& dir : info.resultfrags) {
      encode_replica_dir(dir, p.first, notify->basebl);
    }

    mds->send_message_mds(notify, p.first);
  }

  // journal commit
  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
  mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr));


  // unfreeze resulting frags
  for (const auto& dir : info.resultfrags) {
    dout(10) << " result frag " << *dir << dendl;

    // drop the per-dentry FRAGMENTING pins taken while freezing
    for (auto &p : dir->items) {
      CDentry *dn = p.second;
      ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
      dn->state_clear(CDentry::STATE_FRAGMENTING);
      dn->put(CDentry::PIN_FRAGMENTING);
    }

    // unfreeze
    dir->unfreeze_dir();
  }

  if (info.notify_ack_waiting.empty()) {
    // no acks outstanding: safe to drop all locks now
    fragment_drop_locks(info);
  } else {
    // keep dirfragtreelock until all replicas ack the notify; only drop
    // what is needed to let the unfreeze proceed
    mds->locker->drop_locks_for_fragment_unfreeze(mdr.get());
  }
}
11970
// Called once the EFragment OP_COMMIT entry is journalled (or during
// rollback replay with a null mdr). Deletes the pre-fragment dirfrag
// objects from the metadata pool; the gather completes in
// C_IO_MDC_FragmentPurgeOld -> _fragment_old_purged().
void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr)
{
  dout(10) << "fragment_committed " << basedirfrag << dendl;
  if (mdr)
    mdr->mark_event("commit logged");

  ufragment &uf = uncommitted_fragments.at(basedirfrag);

  // remove old frags
  C_GatherBuilder gather(
    g_ceph_context,
    new C_OnFinisher(
      new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr),
      mds->finisher));

  SnapContext nullsnapc;
  object_locator_t oloc(mds->mdsmap->get_metadata_pool());
  for (const auto& fg : uf.old_frags) {
    object_t oid = CInode::get_object_name(basedirfrag.ino, fg, "");
    ObjectOperation op;
    if (fg == frag_t()) {
      // backtrace object: shared with the inode backtrace, so truncate
      // and clear rather than remove the whole object
      dout(10) << " truncate orphan dirfrag " << oid << dendl;
      op.truncate(0);
      op.omap_clear();
    } else {
      dout(10) << " removing orphan dirfrag " << oid << dendl;
      op.remove();
    }
    mds->objecter->mutate(oid, oloc, op, nullsnapc,
			  ceph::real_clock::now(),
			  0, gather.new_sub());
  }

  ceph_assert(gather.has_subs());
  gather.activate();
}
12008
12009 void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr)
12010 {
12011 dout(10) << "fragment_old_purged " << basedirfrag << dendl;
12012 if (mdr)
12013 mdr->mark_event("old frags purged");
12014
12015 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits);
12016 mds->mdlog->start_submit_entry(le);
12017
12018 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
12019
12020 if (mds->logger) {
12021 if (bits > 0) {
12022 mds->logger->inc(l_mds_dir_split);
12023 } else {
12024 mds->logger->inc(l_mds_dir_merge);
12025 }
12026 }
12027
12028 if (mdr) {
12029 auto it = fragments.find(basedirfrag);
12030 ceph_assert(it != fragments.end());
12031 it->second.finishing = true;
12032 if (it->second.notify_ack_waiting.empty())
12033 fragment_maybe_finish(it);
12034 else
12035 mdr->mark_event("wating for notify acks");
12036 }
12037 }
12038
// Release every lock held by the fragment operation's request and finish
// the internal request itself.
void MDCache::fragment_drop_locks(fragment_info_t& info)
{
  mds->locker->drop_locks(info.mdr.get());
  request_finish(info.mdr);
  //info.mdr.reset();
}
12045
// Finish the fragment operation if it has reached the finishing stage:
// unmark/auth_unpin the result frags, possibly queue further splits, and
// erase the entry from the fragments map (invalidating 'it').
void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
{
  if (!it->second.finishing)
    return;

  // unmark & auth_unpin
  for (const auto &dir : it->second.resultfrags) {
    dir->state_clear(CDir::STATE_FRAGMENTING);
    dir->auth_unpin(this);

    // In case the resulting fragments are beyond the split size,
    // we might need to split them again right away (they could
    // have been taking inserts between unfreezing and getting
    // here)
    mds->balancer->maybe_fragment(dir, false);
  }

  fragments.erase(it);
}
12065
12066
12067 void MDCache::handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &ack)
12068 {
12069 dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl;
12070 mds_rank_t from = mds_rank_t(ack->get_source().num());
12071
12072 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
12073 return;
12074 }
12075
12076 auto it = fragments.find(ack->get_base_dirfrag());
12077 if (it == fragments.end() ||
12078 it->second.get_tid() != ack->get_tid()) {
12079 dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl;
12080 return;
12081 }
12082
12083 if (it->second.notify_ack_waiting.erase(from) &&
12084 it->second.notify_ack_waiting.empty()) {
12085 fragment_drop_locks(it->second);
12086 fragment_maybe_finish(it);
12087 }
12088 }
12089
// Handle a fragment notify from the auth MDS of a dirfrag: replay the
// refragmentation on our replica, decode the freshly replicated dirfrags,
// and send an ack back if one was requested.
void MDCache::handle_fragment_notify(const cref_t<MMDSFragmentNotify> &notify)
{
  dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
  mds_rank_t from = mds_rank_t(notify->get_source().num());

  if (mds->get_state() < MDSMap::STATE_REJOIN) {
    return;
  }

  CInode *diri = get_inode(notify->get_ino());
  if (diri) {
    frag_t base = notify->get_basefrag();
    int bits = notify->get_bits();

    /*
    if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
	(bits > 0 && !diri->dirfragtree.is_leaf(base))) {
      dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
	       << ", must have found out during resolve/rejoin?  ignoring. " << *diri << dendl;
      return;
    }
    */

    // refragment
    MDSContext::vec waiters;
    std::vector<CDir*> resultfrags;
    adjust_dir_fragments(diri, base, bits, &resultfrags, waiters, false);
    if (g_conf()->mds_debug_frag)
      diri->verify_dirfrags();

    // collect waiters that were parked on the (now real) child frags
    for (const auto& dir : resultfrags) {
      diri->take_dir_waiting(dir->get_frag(), waiters);
    }

    // add new replica dirs values
    auto p = notify->basebl.cbegin();
    while (!p.end()) {
      CDir *tmp_dir = nullptr;
      decode_replica_dir(tmp_dir, p, diri, from, waiters);
    }

    mds->queue_waiters(waiters);
  } else {
    // the dir inode must be in cache here (see the race discussion in
    // _fragment_stored); a miss indicates a protocol violation
    ceph_abort();
  }

  if (notify->is_ack_wanted()) {
    auto ack = make_message<MMDSFragmentNotifyAck>(notify->get_base_dirfrag(),
					  notify->get_bits(), notify->get_tid());
    mds->send_message_mds(ack, from);
  }
}
12142
12143 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags,
12144 LogSegment *ls, bufferlist *rollback)
12145 {
12146 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
12147 ceph_assert(!uncommitted_fragments.count(basedirfrag));
12148 ufragment& uf = uncommitted_fragments[basedirfrag];
12149 uf.old_frags = old_frags;
12150 uf.bits = bits;
12151 uf.ls = ls;
12152 ls->uncommitted_fragments.insert(basedirfrag);
12153 if (rollback)
12154 uf.rollback.swap(*rollback);
12155 }
12156
// Advance an uncommitted fragment's state for journal op 'op'. A commit
// with old frags still pending purge only marks it committed; otherwise
// (OP_FINISH, or nothing left to purge) the entry is fully retired and
// its waiters are queued.
void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
{
  dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
	   << " op " << EFragment::op_name(op) << dendl;
  map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
  if (it != uncommitted_fragments.end()) {
    ufragment& uf = it->second;
    if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
      uf.committed = true;
    } else {
      uf.ls->uncommitted_fragments.erase(basedirfrag);
      mds->queue_waiters(uf.waiters);
      uncommitted_fragments.erase(it);
    }
  }
}
12173
// Record the outcome of rolling back a single uncommitted fragment: if
// old frags still need purging, remember them and mark the entry
// committed (purge will finish it); otherwise retire the entry.
void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags)
{
  dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
           << " old_frags (" << old_frags << ")" << dendl;
  map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
  if (it != uncommitted_fragments.end()) {
    ufragment& uf = it->second;
    if (!uf.old_frags.empty()) {
      uf.old_frags = std::move(old_frags);
      uf.committed = true;
    } else {
      uf.ls->uncommitted_fragments.erase(basedirfrag);
      uncommitted_fragments.erase(it);
    }
  }
}
12190
// Roll back every uncommitted fragment operation (run during recovery).
// Already-committed entries only need their old frags purged; others are
// restored to their pre-fragment dirfrag layout from the journalled
// rollback data, and an EFragment OP_ROLLBACK entry is journalled.
void MDCache::rollback_uncommitted_fragments()
{
  dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
  for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
       p != uncommitted_fragments.end();
       ++p) {
    ufragment &uf = p->second;
    CInode *diri = get_inode(p->first.ino);
    ceph_assert(diri);

    if (uf.committed) {
      // commit already journalled; just purge the stale objects
      _fragment_committed(p->first, MDRequestRef());
      continue;
    }

    dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;

    LogSegment *ls = mds->mdlog->get_current_segment();
    EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
    mds->mdlog->start_entry(le);
    bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);

    // the leaves currently under the base frag become the "old frags"
    // to purge once the rollback commits
    frag_vec_t old_frags;
    diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);

    std::vector<CDir*> resultfrags;
    if (uf.old_frags.empty()) {
      // created by old format EFragment
      MDSContext::vec waiters;
      adjust_dir_fragments(diri, p->first.frag, -uf.bits, &resultfrags, waiters, true);
    } else {
      // restore each original frag from the journalled rollback fnodes
      auto bp = uf.rollback.cbegin();
      for (const auto& fg : uf.old_frags) {
	CDir *dir = force_dir_fragment(diri, fg);
	resultfrags.push_back(dir);

	dirfrag_rollback rollback;
	decode(rollback, bp);

	dir->set_version(rollback.fnode.version);
	dir->fnode = rollback.fnode;

	dir->_mark_dirty(ls);

	// re-flag scatterlock dirty state implied by unaccounted stats
	if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
	  dout(10) << "    dirty nestinfo on " << *dir << dendl;
	  mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
	  ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
	}
	if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
	  dout(10) << "    dirty fragstat on " << *dir << dendl;
	  mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
	  ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
	}

	le->add_orig_frag(dir->get_frag());
	le->metablob.add_dir_context(dir);
	if (diri_auth) {
	  le->metablob.add_fragmented_dir(dir, true, false);
	} else {
	  dout(10) << "    dirty dirfragtree on " << *dir << dendl;
	  dir->state_set(CDir::STATE_DIRTYDFT);
	  le->metablob.add_fragmented_dir(dir, true, true);
	}
      }
    }

    if (diri_auth) {
      auto &pi = diri->project_inode();
      pi.inode.version = diri->pre_dirty();
      diri->pop_and_dirty_projected_inode(ls); // hacky
      le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
    } else {
      mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
      ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
    }

    if (g_conf()->mds_debug_frag)
      diri->verify_dirfrags();

    // sanity: the now-purgeable frags must no longer be tree leaves
    for (const auto& leaf : old_frags) {
      ceph_assert(!diri->dirfragtree.is_leaf(leaf));
    }

    mds->mdlog->submit_entry(le);

    uf.old_frags.swap(old_frags);
    _fragment_committed(p->first, MDRequestRef());
  }
}
12281
12282 void MDCache::force_readonly()
12283 {
12284 if (is_readonly())
12285 return;
12286
12287 dout(1) << "force file system read-only" << dendl;
12288 mds->clog->warn() << "force file system read-only";
12289
12290 set_readonly();
12291
12292 mds->server->force_clients_readonly();
12293
12294 // revoke write caps
12295 int count = 0;
12296 for (auto &p : inode_map) {
12297 CInode *in = p.second;
12298 if (in->is_head())
12299 mds->locker->eval(in, CEPH_CAP_LOCKS);
12300 if (!(++count % 1000))
12301 mds->heartbeat_reset();
12302 }
12303
12304 mds->mdlog->flush();
12305 }
12306
12307
12308 // ==============================================================
12309 // debug crap
12310
// Debug printer for the subtree map. Does a first pass to compute the
// maximum nesting depth (and bail out on oversized trees unless
// force_print), a second pass to print an indented tree, then verifies
// every subtree map entry was reachable from a base inode.
void MDCache::show_subtrees(int dbl, bool force_print)
{
  if (g_conf()->mds_thrash_exports)
    dbl += 15;

  //dout(10) << "show_subtrees" << dendl;

  if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl))
    return;  // i won't print anything.

  if (subtrees.empty()) {
    dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees"
					<< dendl;
    return;
  }

  // skip the (expensive) dump for very large subtree maps unless forced
  // or debugging at a very high level
  if (!force_print && subtrees.size() > SUBTREES_COUNT_THRESHOLD &&
      !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
    dout(ceph::dout::need_dynamic(dbl)) << "number of subtrees = " << subtrees.size() << "; not "
		"printing subtrees" << dendl;
    return;
  }

  // root frags
  std::vector<CDir*> basefrags;
  for (set<CInode*>::iterator p = base_inodes.begin();
       p != base_inodes.end();
       ++p)
    (*p)->get_dirfrags(basefrags);
  //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
  dout(15) << "show_subtrees" << dendl;

  // queue stuff
  list<pair<CDir*,int> > q;
  string indent;
  set<CDir*> seen;

  // calc max depth
  for (const auto& dir : basefrags) {
    q.emplace_back(dir, 0);
  }

  set<CDir*> subtrees_seen;

  // first pass: depth-first walk to find the maximum depth and record
  // which subtree roots are reachable
  unsigned int depth = 0;
  while (!q.empty()) {
    CDir *dir = q.front().first;
    unsigned int d = q.front().second;
    q.pop_front();

    if (subtrees.count(dir) == 0) continue;

    subtrees_seen.insert(dir);

    if (d > depth) depth = d;

    // sanity check
    //dout(25) << "saw depth " << d << " " << *dir << dendl;
    if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
    ceph_assert(seen.count(dir) == 0);
    seen.insert(dir);

    // nested items?
    if (!subtrees[dir].empty()) {
      for (set<CDir*>::iterator p = subtrees[dir].begin();
	   p != subtrees[dir].end();
	   ++p) {
	//dout(25) << " saw sub " << **p << dendl;
	q.push_front(pair<CDir*,int>(*p, d+1));
      }
    }
  }

  if (!force_print && depth > SUBTREES_DEPTH_THRESHOLD &&
      !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
    dout(ceph::dout::need_dynamic(dbl)) << "max depth among subtrees = " << depth << "; not printing "
		"subtrees" << dendl;
    return;
  }

  // print tree
  for (const auto& dir : basefrags) {
    q.emplace_back(dir, 0);
  }

  // second pass: same walk, now emitting one line per subtree root with
  // ascii-art indentation
  while (!q.empty()) {
    CDir *dir = q.front().first;
    int d = q.front().second;
    q.pop_front();

    if (subtrees.count(dir) == 0) continue;

    // adjust indenter
    while ((unsigned)d < indent.size())
      indent.resize(d);

    // pad
    string pad = "______________________________________";
    pad.resize(depth*2+1-indent.size());
    if (!subtrees[dir].empty())
      pad[0] = '.'; // parent


    string auth;
    if (dir->is_auth())
      auth = "auth ";
    else
      auth = " rep ";

    char s[10];
    if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
      snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
    else
      snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));

    // print
    dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s
					<< " " << auth << *dir << dendl;

    // cross-check well-known inodes against cached pointers
    if (dir->ino() == MDS_INO_ROOT)
      ceph_assert(dir->inode == root);
    if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
      ceph_assert(dir->inode == myin);
    if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
      ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);

    // nested items?
    if (!subtrees[dir].empty()) {
      // more at my level?
      if (!q.empty() && q.front().second == d)
	indent += "| ";
      else
	indent += " ";

      for (set<CDir*>::iterator p = subtrees[dir].begin();
	   p != subtrees[dir].end();
	   ++p)
	q.push_front(pair<CDir*,int>(*p, d+2));
    }
  }

  // verify there isn't stray crap in subtree map
  int lost = 0;
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    if (subtrees_seen.count(p->first)) continue;
    dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
    lost++;
  }
  ceph_assert(lost == 0);
}
12463
12464 void MDCache::show_cache()
12465 {
12466 dout(7) << "show_cache" << dendl;
12467
12468 auto show_func = [this](CInode *in) {
12469 // unlinked?
12470 if (!in->parent)
12471 dout(7) << " unlinked " << *in << dendl;
12472
12473 // dirfrags?
12474 auto&& dfs = in->get_dirfrags();
12475 for (const auto& dir : dfs) {
12476 dout(7) << " dirfrag " << *dir << dendl;
12477
12478 for (auto &p : dir->items) {
12479 CDentry *dn = p.second;
12480 dout(7) << " dentry " << *dn << dendl;
12481 CDentry::linkage_t *dnl = dn->get_linkage();
12482 if (dnl->is_primary() && dnl->get_inode())
12483 dout(7) << " inode " << *dnl->get_inode() << dendl;
12484 }
12485 }
12486 };
12487
12488 for (auto &p : inode_map)
12489 show_func(p.second);
12490 for (auto &p : snap_inode_map)
12491 show_func(p.second);
12492 }
12493
12494 void MDCache::cache_status(Formatter *f)
12495 {
12496 f->open_object_section("cache");
12497
12498 f->open_object_section("pool");
12499 mempool::get_pool(mempool::mds_co::id).dump(f);
12500 f->close_section();
12501
12502 f->close_section();
12503 }
12504
12505 void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f)
12506 {
12507 ceph_assert(in);
12508 if ((max_depth >= 0) && (cur_depth > max_depth)) {
12509 return;
12510 }
12511 auto&& ls = in->get_dirfrags();
12512 for (const auto &subdir : ls) {
12513 for (const auto &p : subdir->items) {
12514 CDentry *dn = p.second;
12515 CInode *in = dn->get_linkage()->get_inode();
12516 if (in) {
12517 dump_tree(in, cur_depth + 1, max_depth, f);
12518 }
12519 }
12520 }
12521 f->open_object_section("inode");
12522 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12523 f->close_section();
12524 }
12525
// Convenience overload: dump the cache to the named file (plain text).
int MDCache::dump_cache(std::string_view file_name)
{
  return dump_cache(file_name, NULL);
}
12530
// Convenience overload: dump the cache to a Formatter (structured output).
int MDCache::dump_cache(Formatter *f)
{
  return dump_cache(std::string_view(""), f);
}
12535
12536 /**
12537 * Dump the metadata cache, either to a Formatter, if
12538 * provided, else to a plain text file.
12539 */
/**
 * Dump the metadata cache, either to a Formatter, if
 * provided, else to a plain text file (fn, or an auto-generated
 * "cachedump.<epoch>.mds<rank>" name).
 *
 * Returns 0 on success; a negative errno (or raw errno from open(2),
 * see below) on failure.
 */
int MDCache::dump_cache(std::string_view fn, Formatter *f)
{
  int r = 0;

  // dumping large caches may cause mds to hang or worse get killed.
  // so, disallow the dump if the cache size exceeds the configured
  // threshold, which is 1G for formatter and unlimited for file (note
  // that this can be jacked up by the admin... and is nothing but foot
  // shooting, but the option itself is for devs and hence dangerous to
  // tune). TODO: remove this when fixed.
  uint64_t threshold = f ?
    g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") :
    g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file");

  if (threshold && cache_size() > threshold) {
    if (f) {
      // formatter output reports the refusal in-band rather than failing
      std::stringstream ss;
      ss << "cache usage exceeds dump threshold";
      f->open_object_section("result");
      f->dump_string("error", ss.str());
      f->close_section();
    } else {
      derr << "cache usage exceeds dump threshold" << dendl;
      r = -EINVAL;
    }
    return r;
  }

  r = 0;
  int fd = -1;

  if (f) {
    f->open_array_section("inodes");
  } else {
    // file mode: build the output path (explicit name or default)
    char path[PATH_MAX] = "";
    if (fn.length()) {
      snprintf(path, sizeof path, "%s", fn.data());
    } else {
      snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
    }

    dout(1) << "dump_cache to " << path << dendl;

    // O_EXCL: refuse to clobber an existing dump
    fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
    if (fd < 0) {
      derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
      return errno;  // NOTE(review): positive errno, unlike -EINVAL above
    }
  }

  // dump one inode (plus its dirfrags and dentries); returns 1 on
  // success, or a negative value from safe_write on I/O error
  auto dump_func = [fd, f](CInode *in) {
    int r;
    if (f) {
      f->open_object_section("inode");
      in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
      f->close_section();
      return 1;
    }
    ostringstream ss;
    ss << *in << std::endl;
    std::string s = ss.str();
    r = safe_write(fd, s.c_str(), s.length());
    if (r < 0)
      return r;
    auto&& dfs = in->get_dirfrags();
    for (auto &dir : dfs) {
      ostringstream tt;
      tt << " " << *dir << std::endl;
      std::string t = tt.str();
      r = safe_write(fd, t.c_str(), t.length());
      if (r < 0)
	return r;
      for (auto &p : dir->items) {
	CDentry *dn = p.second;
	ostringstream uu;
	uu << " " << *dn << std::endl;
	std::string u = uu.str();
	r = safe_write(fd, u.c_str(), u.length());
	if (r < 0)
	  return r;
      }
      dir->check_rstats();
    }
    return 1;
  };

  for (auto &p : inode_map) {
    r = dump_func(p.second);
    if (r < 0)
      goto out;
  }
  for (auto &p : snap_inode_map) {
    r = dump_func(p.second);
    if (r < 0)
      goto out;
  }
  r = 0;

 out:
  if (f) {
    f->close_section();  // inodes
  } else {
    ::close(fd);
  }
  return r;
}
12646
12647
12648
// Completion context that re-dispatches an MDS request; used to retry an
// operation once whatever it was blocked on becomes available.
C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
  : MDSInternalContext(c->mds), cache(c), mdr(r)
{}

void C_MDS_RetryRequest::finish(int r)
{
  mdr->retry++;  // bump the request's retry counter before re-dispatching
  cache->dispatch_request(mdr);
}
12658
12659
12660 class C_MDS_EnqueueScrub : public Context
12661 {
12662 std::string tag;
12663 Formatter *formatter;
12664 Context *on_finish;
12665 public:
12666 ScrubHeaderRef header;
12667 C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) :
12668 tag(tag), formatter(f), on_finish(fin), header(nullptr) {}
12669
12670 Context *take_finisher() {
12671 Context *fin = on_finish;
12672 on_finish = NULL;
12673 return fin;
12674 }
12675
12676 void finish(int r) override {
12677 if (r == 0) {
12678 // since recursive scrub is asynchronous, dump minimal output
12679 // to not upset cli tools.
12680 if (header && header->get_recursive()) {
12681 formatter->open_object_section("results");
12682 formatter->dump_int("return_code", 0);
12683 formatter->dump_string("scrub_tag", tag);
12684 formatter->dump_string("mode", "asynchronous");
12685 formatter->close_section(); // results
12686 }
12687 } else { // we failed the lookup or something; dump ourselves
12688 formatter->open_object_section("results");
12689 formatter->dump_int("return_code", r);
12690 formatter->close_section(); // results
12691 r = 0; // already dumped in formatter
12692 }
12693 if (on_finish)
12694 on_finish->complete(r);
12695 }
12696 };
12697
12698 void MDCache::enqueue_scrub(
12699 std::string_view path,
12700 std::string_view tag,
12701 bool force, bool recursive, bool repair,
12702 Formatter *f, Context *fin)
12703 {
12704 dout(10) << __func__ << " " << path << dendl;
12705 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
12706 if (path == "~mdsdir") {
12707 filepath fp(MDS_INO_MDSDIR(mds->get_nodeid()));
12708 mdr->set_filepath(fp);
12709 } else {
12710 filepath fp(path);
12711 mdr->set_filepath(path);
12712 }
12713
12714 bool is_internal = false;
12715 std::string tag_str(tag);
12716 if (tag_str.empty()) {
12717 uuid_d uuid_gen;
12718 uuid_gen.generate_random();
12719 tag_str = uuid_gen.to_string();
12720 is_internal = true;
12721 }
12722
12723 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin);
12724 cs->header = std::make_shared<ScrubHeader>(
12725 tag_str, is_internal, force, recursive, repair, f);
12726
12727 mdr->internal_op_finish = cs;
12728 enqueue_scrub_work(mdr);
12729 }
12730
// Resolve the scrub target path, reject concurrent scrubs of the same
// inode, wire up completion contexts (including a journal flush when the
// scrub repaired anything), and push the inode onto the scrub stack.
void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
{
  CInode *in = mds->server->rdlock_path_pin_ref(mdr, true);
  if (NULL == in)
    return;  // path traversal is retrying or failed; nothing to do yet

  // TODO: Remove this restriction
  ceph_assert(in->is_auth());

  C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
  ScrubHeaderRef header = cs->header;

  // Cannot scrub same dentry twice at same time
  if (in->scrub_is_in_progress()) {
    mds->server->respond_to_request(mdr, -EBUSY);
    return;
  } else {
    in->scrub_info();
  }

  header->set_origin(in);

  Context *fin;
  if (header->get_recursive()) {
    // recursive scrub: pin the origin inode for the lifetime of the scrub
    // and run recursive_scrub_finish() when it completes
    header->get_origin()->get(CInode::PIN_SCRUBQUEUE);
    fin = new MDSInternalContextWrapper(mds,
	    new LambdaContext([this, header](int r) {
	      recursive_scrub_finish(header);
	      header->get_origin()->put(CInode::PIN_SCRUBQUEUE);
	    })
	  );
  } else {
    fin = cs->take_finisher();
  }

  // If the scrub did some repair, then flush the journal at the end of
  // the scrub. Otherwise in the case of e.g. rewriting a backtrace
  // the on disk state will still look damaged.
  auto scrub_finish = new LambdaContext([this, header, fin](int r){
    if (!header->get_repaired()) {
      // nothing repaired: complete immediately, no flush needed
      if (fin)
        fin->complete(r);
      return;
    }

    auto flush_finish = new LambdaContext([this, fin](int r){
      dout(4) << "Expiring log segments because scrub did some repairs" << dendl;
      mds->mdlog->trim_all();

      if (fin) {
	// wait for all expiring segments before signalling completion
	MDSGatherBuilder gather(g_ceph_context);
	auto& expiring_segments = mds->mdlog->get_expiring_segments();
	for (auto logseg : expiring_segments)
	  logseg->wait_for_expiry(gather.new_sub());
	ceph_assert(gather.has_subs());
	gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
	gather.activate();
      }
    });

    dout(4) << "Flushing journal because scrub did some repairs" << dendl;
    mds->mdlog->start_new_segment();
    mds->mdlog->flush();
    mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish));
  });

  if (!header->get_recursive()) {
    mds->scrubstack->enqueue_inode_top(in, header,
				       new MDSInternalContextWrapper(mds, scrub_finish));
  } else {
    mds->scrubstack->enqueue_inode_bottom(in, header,
					  new MDSInternalContextWrapper(mds, scrub_finish));
  }

  // the admin command returns immediately; the scrub proceeds async
  mds->server->respond_to_request(mdr, 0);
  return;
}
12808
12809 void MDCache::recursive_scrub_finish(const ScrubHeaderRef& header)
12810 {
12811 if (header->get_origin()->is_base() &&
12812 header->get_force() && header->get_repair()) {
12813 // notify snapserver that base directory is recursively scrubbed.
12814 // After both root and mdsdir are recursively scrubbed, snapserver
12815 // knows that all old format snaprealms are converted to the new
12816 // format.
12817 if (mds->mdsmap->get_num_in_mds() == 1 &&
12818 mds->mdsmap->get_num_failed_mds() == 0 &&
12819 mds->mdsmap->get_tableserver() == mds->get_nodeid()) {
12820 mds->mark_base_recursively_scrubbed(header->get_origin()->ino());
12821 }
12822 }
12823 }
12824
/**
 * Journal-commit context for internal requests: once the log entry is
 * safe, apply the request's projected inode/fnode updates and reply.
 */
struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
  MDRequestRef mdr;
  C_MDC_RespondInternalRequest(MDCache *c, MDRequestRef& m) :
    MDCacheLogContext(c), mdr(m) {}
  void finish(int r) override {
    mdr->apply();  // commit projected state before responding
    get_mds()->server->respond_to_request(mdr, r);
  }
};
12834
12835 void MDCache::repair_dirfrag_stats(CDir *dir)
12836 {
12837 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12838 mdr->pin(dir);
12839 mdr->internal_op_private = dir;
12840 mdr->internal_op_finish = new C_MDSInternalNoop;
12841 repair_dirfrag_stats_work(mdr);
12842 }
12843
void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
{
  // Recount fragstat/rstat for the dirfrag stashed in internal_op_private
  // by repair_dirfrag_stats(), and journal corrections if they differ.
  CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *dir << dendl;

  if (!dir->is_auth()) {
    mds->server->respond_to_request(mdr, -ESTALE);
    return;
  }

  if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
    // Frozen/freezing dirfrag: schedule a retry on unfreeze and drop our
    // locks and local pins so we don't hold the freeze up.
    dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));

    mds->locker->drop_locks(mdr.get());
    mdr->drop_local_auth_pins();
    if (mdr->is_any_remote_auth_pin())
      mds->locker->notify_freeze_waiter(dir);
    return;
  }

  mdr->auth_pin(dir);

  MutationImpl::LockOpVec lov;
  CInode *diri = dir->inode;
  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_wrlock(&diri->nestlock);
  lov.add_wrlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  // All dentries must be cached before we can recount them.
  if (!dir->is_complete()) {
    dir->fetch(new C_MDS_RetryRequest(this, mdr));
    return;
  }

  // Recount fragstat (file/subdir counts) and rstat (recursive stats)
  // from the head dentries of this dirfrag.
  frag_info_t frag_info;
  nest_info_t nest_info;
  for (auto it = dir->begin(); it != dir->end(); ++it) {
    CDentry *dn = it->second;
    if (dn->last != CEPH_NOSNAP)
      continue;  // only head (non-snapshot) dentries are counted
    CDentry::linkage_t *dnl = dn->get_projected_linkage();
    if (dnl->is_primary()) {
      CInode *in = dnl->get_inode();
      nest_info.add(in->get_projected_inode()->accounted_rstat);
      if (in->is_dir())
	frag_info.nsubdirs++;
      else
	frag_info.nfiles++;
    } else if (dnl->is_remote())
      frag_info.nfiles++;
  }

  fnode_t *pf = dir->get_projected_fnode();
  bool good_fragstat = frag_info.same_sums(pf->fragstat);
  bool good_rstat = nest_info.same_sums(pf->rstat);
  if (good_fragstat && good_rstat) {
    dout(10) << __func__ << " no corruption found" << dendl;
    mds->server->respond_to_request(mdr, 0);
    return;
  }

  // Corruption found: project a corrected fnode and journal it.
  pf = dir->project_fnode();
  pf->version = dir->pre_dirty();
  mdr->add_projected_fnode(dir);

  mdr->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
  mds->mdlog->start_entry(le);

  if (!good_fragstat) {
    // Never move mtime/change_attr backwards while repairing.
    if (pf->fragstat.mtime > frag_info.mtime)
      frag_info.mtime = pf->fragstat.mtime;
    if (pf->fragstat.change_attr > frag_info.change_attr)
      frag_info.change_attr = pf->fragstat.change_attr;
    pf->fragstat = frag_info;
    mds->locker->mark_updated_scatterlock(&diri->filelock);
    mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
    mdr->add_updated_lock(&diri->filelock);
  }

  if (!good_rstat) {
    // Never move rctime backwards while repairing.
    if (pf->rstat.rctime > nest_info.rctime)
      nest_info.rctime = pf->rstat.rctime;
    pf->rstat = nest_info;
    mds->locker->mark_updated_scatterlock(&diri->nestlock);
    mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
    mdr->add_updated_lock(&diri->nestlock);
  }

  le->metablob.add_dir_context(dir);
  le->metablob.add_dir(dir, true);

  // C_MDC_RespondInternalRequest applies the projection and replies once
  // the entry is journaled.
  mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
}
12939
12940 void MDCache::repair_inode_stats(CInode *diri)
12941 {
12942 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
12943 mdr->pin(diri);
12944 mdr->internal_op_private = diri;
12945 mdr->internal_op_finish = new C_MDSInternalNoop;
12946 repair_inode_stats_work(mdr);
12947 }
12948
void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
{
  // Repair a directory inode's dirstat/rstat by marking its file/nest
  // scatterlocks dirty and letting the normal scatter-gather machinery
  // re-aggregate per-dirfrag accounted values back into the inode.
  CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *diri << dendl;

  if (!diri->is_auth()) {
    mds->server->respond_to_request(mdr, -ESTALE);
    return;
  }
  if (!diri->is_dir()) {
    mds->server->respond_to_request(mdr, -ENOTDIR);
    return;
  }

  MutationImpl::LockOpVec lov;

  if (mdr->ls) // already marked filelock/nestlock dirty ?
    goto do_rdlocks;

  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_wrlock(&diri->nestlock);
  lov.add_wrlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
  // the scatter-gather process, which will fix any fragstat/rstat errors.
  {
    frag_vec_t leaves;
    diri->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = diri->get_dirfrag(leaf);
      if (!dir) {
	ceph_assert(mdr->is_auth_pinned(diri));
	dir = diri->get_or_open_dirfrag(this, leaf);
      }
      if (dir->get_version() == 0) {
	// dirfrag not loaded yet; retry once the fetch completes
	ceph_assert(dir->is_auth());
	dir->fetch(new C_MDS_RetryRequest(this, mdr));
	return;
      }
    }
  }

  diri->state_set(CInode::STATE_REPAIRSTATS);
  mdr->ls = mds->mdlog->get_current_segment();
  mds->locker->mark_updated_scatterlock(&diri->filelock);
  mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
  mds->locker->mark_updated_scatterlock(&diri->nestlock);
  mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);

  // Drop wrlocks; the rdlocks taken below force the gather.
  mds->locker->drop_locks(mdr.get());

do_rdlocks:
  // force the scatter-gather process
  lov.clear();
  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_rdlock(&diri->nestlock);
  lov.add_rdlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  diri->state_clear(CInode::STATE_REPAIRSTATS);

  // Verify the repair by recounting from the (now gathered) dirfrags.
  frag_info_t dir_info;
  nest_info_t nest_info;
  nest_info.rsubdirs = 1; // it gets one to account for self
  if (const sr_t *srnode = diri->get_projected_srnode(); srnode)
    nest_info.rsnaps = srnode->snaps.size();

  {
    frag_vec_t leaves;
    diri->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = diri->get_dirfrag(leaf);
      ceph_assert(dir);
      ceph_assert(dir->get_version() > 0);
      dir_info.add(dir->fnode.accounted_fragstat);
      nest_info.add(dir->fnode.accounted_rstat);
    }
  }

  if (!dir_info.same_sums(diri->inode.dirstat) ||
      !nest_info.same_sums(diri->inode.rstat)) {
    // Best effort: log the mismatch but still complete the request.
    dout(10) << __func__ << " failed to fix fragstat/rstat on "
	     << *diri << dendl;
  }

  mds->server->respond_to_request(mdr, 0);
}
13039
13040 void MDCache::upgrade_inode_snaprealm(CInode *in)
13041 {
13042 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_UPGRADE_SNAPREALM);
13043 mdr->pin(in);
13044 mdr->internal_op_private = in;
13045 mdr->internal_op_finish = new C_MDSInternalNoop;
13046 upgrade_inode_snaprealm_work(mdr);
13047 }
13048
void MDCache::upgrade_inode_snaprealm_work(MDRequestRef& mdr)
{
  // Upgrade an inode's snaprealm format by projecting the inode (which
  // performs the conversion) and journaling the projected state.
  CInode *in = static_cast<CInode*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *in << dendl;

  if (!in->is_auth()) {
    mds->server->respond_to_request(mdr, -ESTALE);
    return;
  }

  // Exclusive snaplock: no concurrent snaprealm readers/writers.
  MutationImpl::LockOpVec lov;
  lov.add_xlock(&in->snaplock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  // project_snaprealm() upgrades snaprealm format
  auto &pi = in->project_inode(false, true);
  mdr->add_projected_inode(in);
  pi.inode.version = in->pre_dirty();

  mdr->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "upgrade_snaprealm");
  mds->mdlog->start_entry(le);

  // Base inodes (root/mdsdir) are journaled as root entries; everything
  // else via its primary dentry with full dir context.
  if (in->is_base()) {
    le->metablob.add_root(true, in);
  } else {
    CDentry *pdn = in->get_projected_parent_dn();
    le->metablob.add_dir_context(pdn->get_dir());
    le->metablob.add_primary_dentry(pdn, in, true);
  }

  mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
}
13083
13084 void MDCache::flush_dentry(std::string_view path, Context *fin)
13085 {
13086 if (is_readonly()) {
13087 dout(10) << __func__ << ": read-only FS" << dendl;
13088 fin->complete(-EROFS);
13089 return;
13090 }
13091 dout(10) << "flush_dentry " << path << dendl;
13092 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
13093 filepath fp(path);
13094 mdr->set_filepath(fp);
13095 mdr->internal_op_finish = fin;
13096 flush_dentry_work(mdr);
13097 }
13098
/**
 * Generic I/O completion for internal requests: replies to the request
 * with the operation's result code.
 */
class C_FinishIOMDR : public MDSContext {
protected:
  MDSRank *mds;
  MDRequestRef mdr;
  MDSRank *get_mds() override { return mds; }
public:
  C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
  void finish(int r) override { mds->server->respond_to_request(mdr, r); }
};
13108
13109 void MDCache::flush_dentry_work(MDRequestRef& mdr)
13110 {
13111 MutationImpl::LockOpVec lov;
13112 CInode *in = mds->server->rdlock_path_pin_ref(mdr, true);
13113 if (!in)
13114 return;
13115
13116 ceph_assert(in->is_auth());
13117 in->flush(new C_FinishIOMDR(mds, mdr));
13118 }
13119
13120
13121 /**
13122 * Initialize performance counters with global perfcounter
13123 * collection.
13124 */
void MDCache::register_perfcounters()
{
  // Build the "mds_cache" counter set, register it with the global
  // collection, and hand the logger to the subsystems that update it.
  PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);

  // Stray/purge statistics
  pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry",
              PerfCountersBuilder::PRIO_INTERESTING);
  pcb.add_u64(l_mdc_num_recovering_enqueued,
              "num_recovering_enqueued", "Files waiting for recovery", "recy",
              PerfCountersBuilder::PRIO_INTERESTING);
  pcb.add_u64_counter(l_mdc_recovery_completed,
                      "recovery_completed", "File recoveries completed", "recd",
                      PerfCountersBuilder::PRIO_INTERESTING);

  // useful recovery queue statistics
  pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing",
              "Files currently being recovered");
  pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized",
              "Files waiting for recovery with elevated priority");
  pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started",
                      "File recoveries started");

  // along with other stray dentries stats
  pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed",
              "Stray dentries delayed");
  pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing",
              "Stray dentries enqueuing for purge");
  pcb.add_u64_counter(l_mdc_strays_created, "strays_created",
                      "Stray dentries created");
  pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
                      "Stray dentries enqueued for purge");
  pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated",
                      "Stray dentries reintegrated");
  pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated",
                      "Stray dentries migrated");

  // low prio internal request stats
  pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
                      "Internal Request type enqueue scrub");
  pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
                      "Internal Request type export dir");
  pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
                      "Internal Request type flush");
  pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
                      "Internal Request type fragmentdir");
  pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
                      "Internal Request type frag stats");
  pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
                      "Internal Request type inode stats");

  // Logger is owned here; recovery_queue and stray_manager only borrow it.
  logger.reset(pcb.create_perf_counters());
  g_ceph_context->get_perfcounters_collection()->add(logger.get());
  recovery_queue.set_logger(logger.get());
  stray_manager.set_logger(logger.get());
}
13181
13182 /**
13183 * Call this when putting references to an inode/dentry or
13184 * when attempting to trim it.
13185 *
13186 * If this inode is no longer linked by anyone, and this MDS
13187 * rank holds the primary dentry, and that dentry is in a stray
13188 * directory, then give up the dentry to the StrayManager, never
13189 * to be seen again by MDCache.
13190 *
13191 * @param delay if true, then purgeable inodes are stashed til
13192 * the next trim(), rather than being purged right
13193 * away.
13194 */
13195 void MDCache::maybe_eval_stray(CInode *in, bool delay) {
13196 if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
13197 mds->get_state() <= MDSMap::STATE_REJOIN)
13198 return;
13199
13200 CDentry *dn = in->get_projected_parent_dn();
13201
13202 if (dn->state_test(CDentry::STATE_PURGING)) {
13203 /* We have already entered the purging process, no need
13204 * to re-evaluate me ! */
13205 return;
13206 }
13207
13208 if (dn->get_dir()->get_inode()->is_stray()) {
13209 if (delay)
13210 stray_manager.queue_delayed(dn);
13211 else
13212 stray_manager.eval_stray(dn);
13213 }
13214 }
13215
13216 void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
13217 dout(10) << __func__ << " " << *diri << dendl;
13218 ceph_assert(diri->get_projected_parent_dir()->inode->is_stray());
13219 auto&& ls = diri->get_dirfrags();
13220 for (auto &p : ls) {
13221 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
13222 p->try_remove_dentries_for_stray();
13223 }
13224 if (!diri->snaprealm) {
13225 if (diri->is_auth())
13226 diri->clear_dirty_rstat();
13227 diri->clear_scatter_dirty();
13228 }
13229 }
13230
13231 bool MDCache::dump_inode(Formatter *f, uint64_t number) {
13232 CInode *in = get_inode(number);
13233 if (!in) {
13234 return false;
13235 }
13236 f->open_object_section("inode");
13237 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH);
13238 f->close_section();
13239 return true;
13240 }
13241
13242 void MDCache::handle_mdsmap(const MDSMap &mdsmap) {
13243 // process export_pin_delayed_queue whenever a new MDSMap received
13244 auto &q = export_pin_delayed_queue;
13245 for (auto it = q.begin(); it != q.end(); ) {
13246 auto *in = *it;
13247 mds_rank_t export_pin = in->get_export_pin(false);
13248 dout(10) << " delayed export_pin=" << export_pin << " on " << *in
13249 << " max_mds=" << mdsmap.get_max_mds() << dendl;
13250 if (export_pin >= mdsmap.get_max_mds()) {
13251 it++;
13252 continue;
13253 }
13254
13255 in->state_clear(CInode::STATE_DELAYEDEXPORTPIN);
13256 it = q.erase(it);
13257 in->maybe_export_pin();
13258 }
13259 }
13260