]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/MDCache.cc
88eece19f00bb485c42581b7cc838641626986bd
[ceph.git] / ceph / src / mds / MDCache.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <errno.h>
16 #include <fstream>
17 #include <iostream>
18 #include <sstream>
19 #include <string>
20 #include <string_view>
21 #include <map>
22
23 #include "MDCache.h"
24 #include "MDSRank.h"
25 #include "Server.h"
26 #include "Locker.h"
27 #include "MDLog.h"
28 #include "MDBalancer.h"
29 #include "Migrator.h"
30 #include "ScrubStack.h"
31
32 #include "SnapClient.h"
33
34 #include "MDSMap.h"
35
36 #include "CInode.h"
37 #include "CDir.h"
38
39 #include "Mutation.h"
40
41 #include "include/ceph_fs.h"
42 #include "include/filepath.h"
43 #include "include/util.h"
44
45 #include "messages/MClientCaps.h"
46
47 #include "msg/Message.h"
48 #include "msg/Messenger.h"
49
50 #include "common/MemoryModel.h"
51 #include "common/errno.h"
52 #include "common/perf_counters.h"
53 #include "common/safe_io.h"
54
55 #include "osdc/Journaler.h"
56 #include "osdc/Filer.h"
57
58 #include "events/ESubtreeMap.h"
59 #include "events/EUpdate.h"
60 #include "events/ESlaveUpdate.h"
61 #include "events/EImportFinish.h"
62 #include "events/EFragment.h"
63 #include "events/ECommitted.h"
64 #include "events/EPurged.h"
65 #include "events/ESessions.h"
66
67 #include "InoTable.h"
68
69 #include "common/Timer.h"
70
71 #include "perfglue/heap_profiler.h"
72
73
74 #include "common/config.h"
75 #include "include/ceph_assert.h"
76
77 #define dout_context g_ceph_context
78 #define dout_subsys ceph_subsys_mds
79 #undef dout_prefix
80 #define dout_prefix _prefix(_dout, mds)
// Prefix every debug line emitted from this file with "mds.<rank>.cache ".
static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
  return *_dout << "mds." << mds->get_nodeid() << ".cache ";
}
84
85 set<int> SimpleLock::empty_gather_set;
86
87
88 /**
89 * All non-I/O contexts that require a reference
90 * to an MDCache instance descend from this.
91 */
class MDCacheContext : public virtual MDSContext {
protected:
  MDCache *mdcache;  // non-owning back-pointer to the owning cache
  // Resolve the MDSRank through the cache; asserts the pointer is valid.
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
};
103
104
105 /**
106 * Only for contexts called back from an I/O completion
107 *
108 * Note: duplication of members wrt MDCacheContext, because
109 * it'ls the lesser of two evils compared with introducing
110 * yet another piece of (multiple) inheritance.
111 */
class MDCacheIOContext : public virtual MDSIOContextBase {
protected:
  MDCache *mdcache;  // non-owning back-pointer to the owning cache
  // Resolve the MDSRank through the cache; asserts the pointer is valid.
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  // track: forwarded to MDSIOContextBase (I/O-context tracking toggle).
  explicit MDCacheIOContext(MDCache *mdc_, bool track=true) :
    MDSIOContextBase(track), mdcache(mdc_) {}
};
124
// Contexts completed from MDLog (journal) flushes that need the MDCache.
class MDCacheLogContext : public virtual MDSLogContextBase {
protected:
  MDCache *mdcache;  // non-owning back-pointer to the owning cache
  // Resolve the MDSRank through the cache; asserts the pointer is valid.
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
};
136
MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
  mds(m),
  open_file_table(m),
  filer(m->objecter, m->finisher),
  stray_manager(m, purge_queue_),
  recovery_queue(m),
  trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate"))
{
  migrator.reset(new Migrator(mds, this));

  // Per-commit directory write cap: mds_dir_max_commit_size (MiB) when set,
  // otherwise 90% of the OSD's maximum write size.
  max_dir_commit_size = g_conf()->mds_dir_max_commit_size ?
                        (g_conf()->mds_dir_max_commit_size << 20) :
                        (0.9 *(g_conf()->osd_max_write_size << 20));

  // Cache sizing/health and request-forwarding knobs; cached in members and
  // refreshed at runtime by handle_conf_change().
  cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
  cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
  cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
  forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");

  // Ephemeral export-pin policy knobs.
  export_ephemeral_distributed_config =  g_conf().get_val<bool>("mds_export_ephemeral_distributed");
  export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
  export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");

  lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));

  // bottom_lru holds items that should be trimmed first; no midpoint split.
  bottom_lru.lru_set_midpoint(0);

  decayrate.set_halflife(g_conf()->mds_decay_halflife);

  // Background upkeep thread: periodically trims the cache and releases free
  // heap memory while the MDS is in a trimmable state. Woken early by
  // shutdown() via upkeep_cvar.
  upkeeper = std::thread([this]() {
    std::unique_lock lock(upkeep_mutex);
    while (!upkeep_trim_shutdown.load()) {
      auto now = clock::now();
      auto since = now-upkeep_last_trim;
      auto trim_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_trim_interval"));
      if (since >= trim_interval*.90) {
        // Lock ordering is mds_lock -> upkeep_mutex, so drop upkeep_mutex
        // before taking mds_lock, then re-take it and re-check shutdown.
        lock.unlock(); /* mds_lock -> upkeep_mutex */
        std::scoped_lock mds_lock(mds->mds_lock);
        lock.lock();
        if (upkeep_trim_shutdown.load())
          return;
        if (mds->is_cache_trimmable()) {
          dout(20) << "upkeep thread trimming cache; last trim " << since << " ago" << dendl;
          trim_client_leases();
          trim();
          check_memory_usage();
          // Also nudge clients to release caps so trimming can make progress.
          auto flags = Server::RecallFlags::ENFORCE_MAX|Server::RecallFlags::ENFORCE_LIVENESS;
          mds->server->recall_client_state(nullptr, flags);
          upkeep_last_trim = now = clock::now();
        } else {
          dout(10) << "cache not ready for trimming" << dendl;
        }
      } else {
        trim_interval -= since;  // time remaining until the next trim is due
      }
      since = now-upkeep_last_release;
      auto release_interval = clock::duration(g_conf().get_val<std::chrono::seconds>("mds_cache_release_free_interval"));
      if (since >= release_interval) {
        /* XXX not necessary once MDCache uses PriorityCache */
        dout(10) << "releasing free memory" << dendl;
        ceph_heap_release_free_memory();
        upkeep_last_release = clock::now();
      } else {
        release_interval -= since;  // time remaining until the next release
      }
      // Sleep until whichever periodic task is due first, or until notified.
      auto interval = std::min(release_interval, trim_interval);
      dout(20) << "upkeep thread waiting interval " << interval << dendl;
      upkeep_cvar.wait_for(lock, interval);
    }
  });
}
208
209 MDCache::~MDCache()
210 {
211 if (logger) {
212 g_ceph_context->get_perfcounters_collection()->remove(logger.get());
213 }
214 if (upkeeper.joinable())
215 upkeeper.join();
216 }
217
// Re-read runtime-changeable config options that this cache mirrors in
// members, then forward the change set to the migrator and balancer.
void MDCache::handle_conf_change(const std::set<std::string>& changed, const MDSMap& mdsmap)
{
  dout(20) << "config changes: " << changed << dendl;
  if (changed.count("mds_cache_memory_limit"))
    cache_memory_limit = g_conf().get_val<Option::size_t>("mds_cache_memory_limit");
  if (changed.count("mds_cache_reservation"))
    cache_reservation = g_conf().get_val<double>("mds_cache_reservation");
  if (changed.count("mds_export_ephemeral_distributed")) {
    export_ephemeral_distributed_config = g_conf().get_val<bool>("mds_export_ephemeral_distributed");
    dout(10) << "Migrating any ephemeral distributed pinned inodes" << dendl;
    /* copy to vector to avoid removals during iteration */
    std::vector<CInode*> migrate;
    migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end());
    for (auto& in : migrate) {
      in->maybe_ephemeral_dist();
    }
    mds->balancer->handle_export_pins();
  }
  if (changed.count("mds_export_ephemeral_random")) {
    export_ephemeral_random_config = g_conf().get_val<bool>("mds_export_ephemeral_random");
    dout(10) << "Migrating any ephemeral random pinned inodes" << dendl;
    /* copy to vector to avoid removals during iteration */
    std::vector<CInode*> migrate;
    migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end());
    for (auto& in : migrate) {
      in->maybe_ephemeral_rand();
    }
    mds->balancer->handle_export_pins();
  }
  if (changed.count("mds_export_ephemeral_random_max")) {
    export_ephemeral_random_max = g_conf().get_val<double>("mds_export_ephemeral_random_max");
  }
  if (changed.count("mds_health_cache_threshold"))
    cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
  if (changed.count("mds_cache_mid"))
    lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
  if (changed.count("mds_cache_trim_decay_rate")) {
    // Replacing the counter resets its accumulated decay state.
    trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
  }
  if (changed.count("mds_forward_all_requests_to_auth")){
    forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
  }

  // Let subsystems react to options they own.
  migrator->handle_conf_change(changed, mdsmap);
  mds->balancer->handle_conf_change(changed, mdsmap);
}
264
265 void MDCache::log_stat()
266 {
267 mds->logger->set(l_mds_inodes, lru.lru_get_size());
268 mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
269 mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
270 mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
271 mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
272 mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
273 mds->logger->set(l_mds_caps, Capability::count());
274 if (root) {
275 mds->logger->set(l_mds_root_rfiles, root->inode.rstat.rfiles);
276 mds->logger->set(l_mds_root_rbytes, root->inode.rstat.rbytes);
277 mds->logger->set(l_mds_root_rsnaps, root->inode.rstat.rsnaps);
278 }
279 }
280
281
282 //
283
// Stop the background upkeep thread and warn if the cache is not empty.
// The flag must be set under upkeep_mutex before notifying so the upkeep
// thread cannot miss the wakeup between its shutdown check and its wait.
bool MDCache::shutdown()
{
  {
    std::scoped_lock lock(upkeep_mutex);
    upkeep_trim_shutdown = true;
    upkeep_cvar.notify_one();
  }
  if (lru.lru_get_size() > 0) {
    dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
    //show_cache();
    show_subtrees();
    //dump();
  }
  return true;
}
299
300
301 // ====================================================================
302 // some inode functions
303
// Register a newly created/loaded inode with the cache: index it, record
// special system inodes (root/mdsdir/strays), and flag cache pressure.
void MDCache::add_inode(CInode *in)
{
  // add to lru, inode map: head inodes and snapped inodes are indexed in
  // separate maps (by ino vs. by vinodeno).
  if (in->last == CEPH_NOSNAP) {
    auto &p = inode_map[in->ino()];
    ceph_assert(!p); // should be no dup inos!
    p = in;
  } else {
    auto &p = snap_inode_map[in->vino()];
    ceph_assert(!p); // should be no dup inos!
    p = in;
  }

  // Track well-known system inodes in dedicated members.
  if (in->ino() < MDS_INO_SYSTEM_BASE) {
    if (in->ino() == MDS_INO_ROOT)
      root = in;
    else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
      myin = in;
    else if (in->is_stray()) {
      // only index strays owned by this rank
      if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
      }
    }
    if (in->is_base())
      base_inodes.insert(in);
  }

  // Note memory pressure so trimming/recall can be escalated.
  if (cache_toofull()) {
    exceeded_size_limit = true;
  }

  // Apply the ephemeral distributed pin policy if it applies to this inode.
  in->maybe_ephemeral_dist(false);
}
337
// Unregister and delete an inode. The inode must be fully clean and
// unreferenced; this undoes all the bookkeeping done by add_inode().
void MDCache::remove_inode(CInode *o)
{
  dout(14) << "remove_inode " << *o << dendl;

  if (o->get_parent_dn()) {
    // FIXME: multiple parents?
    CDentry *dn = o->get_parent_dn();
    ceph_assert(!dn->is_dirty());
    dn->dir->unlink_inode(dn);  // leave dentry ... FIXME?
  }

  // Drop dirty state; the inode is going away, not being written back.
  if (o->is_dirty())
    o->mark_clean();
  if (o->is_dirty_parent())
    o->clear_dirty_parent();

  o->clear_scatter_dirty();

  o->item_open_file.remove_myself();

  // Remove from any pending export-pin queues.
  if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
    export_pin_queue.erase(o);

  if (o->state_test(CInode::STATE_DELAYEDEXPORTPIN))
    export_pin_delayed_queue.erase(o);

  // Drop ephemeral pin membership.
  o->set_ephemeral_dist(false);
  o->set_ephemeral_rand(false);

  // remove from inode map (head vs. snapped inodes live in separate maps)
  if (o->last == CEPH_NOSNAP) {
    inode_map.erase(o->ino());
  } else {
    o->item_caps.remove_myself();
    snap_inode_map.erase(o->vino());
  }

  // Clear special-inode bookkeeping, mirroring add_inode().
  if (o->ino() < MDS_INO_SYSTEM_BASE) {
    if (o == root) root = 0;
    if (o == myin) myin = 0;
    if (o->is_stray()) {
      if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
        strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
      }
    }
    if (o->is_base())
      base_inodes.erase(o);
  }

  // delete it
  ceph_assert(o->get_num_ref() == 0);
  delete o;
}
391
392 file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
393 {
394 file_layout_t result = file_layout_t::get_default();
395 result.pool_id = mdsmap.get_first_data_pool();
396 return result;
397 }
398
399 file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
400 {
401 file_layout_t result = file_layout_t::get_default();
402 result.pool_id = mdsmap.get_metadata_pool();
403 if (g_conf()->mds_log_segment_size > 0) {
404 result.object_size = g_conf()->mds_log_segment_size;
405 result.stripe_unit = g_conf()->mds_log_segment_size;
406 }
407 return result;
408 }
409
410 void MDCache::init_layouts()
411 {
412 default_file_layout = gen_default_file_layout(*(mds->mdsmap));
413 default_log_layout = gen_default_log_layout(*(mds->mdsmap));
414 }
415
// Initialize an in-memory system inode (root, mdsdir, stray dir, ...) that
// has no parent dentry yet. Sets mode/ownership-independent metadata, the
// appropriate layout, and for base inodes the authority and a fresh snaprealm.
void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
                                           int mode) const
{
  in->inode.ino = ino;
  in->inode.version = 1;
  in->inode.xattr_version = 1;
  in->inode.mode = 0500 | mode;  // owner read/execute only, plus type bits
  in->inode.size = 0;
  in->inode.ctime =
    in->inode.mtime =
    in->inode.btime = ceph_clock_now();
  in->inode.nlink = 1;
  in->inode.truncate_size = -1ull;
  in->inode.change_attr = 0;
  in->inode.export_pin = MDS_RANK_NONE;

  // FIPS zeroization audit 20191117: this memset is not security related.
  memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
  if (in->inode.is_dir()) {
    in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
    in->inode.rstat.rsubdirs = 1; /* itself */
    in->inode.rstat.rctime = in->inode.ctime;
  } else {
    in->inode.layout = default_file_layout;
    ++in->inode.rstat.rfiles;
  }
  // Freshly created: accounted stats match live stats by definition.
  in->inode.accounted_rstat = in->inode.rstat;

  if (in->is_base()) {
    // Base inodes (root, per-rank mdsdirs) have no parent; authority is
    // derived from the inode number for mdsdirs.
    if (in->is_root())
      in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
    else
      in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
    in->open_snaprealm();  // empty snaprealm
    ceph_assert(!in->snaprealm->parent); // created its own
    in->snaprealm->srnode.seq = 1;
  }
}
454
455 CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
456 {
457 dout(0) << "creating system inode with ino:" << ino << dendl;
458 CInode *in = new CInode(this);
459 create_unlinked_system_inode(in, ino, mode);
460 add_inode(in);
461 return in;
462 }
463
464 CInode *MDCache::create_root_inode()
465 {
466 CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
467 i->inode.uid = g_conf()->mds_root_ino_uid;
468 i->inode.gid = g_conf()->mds_root_ino_gid;
469 i->inode.layout = default_file_layout;
470 i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
471 return i;
472 }
473
// Create a brand-new, empty root directory for a fresh filesystem, mark it
// dirty/complete, and queue the initial commits on the gather.
void MDCache::create_empty_hierarchy(MDSGather *gather)
{
  // create root dir
  CInode *root = create_root_inode();

  // force empty root dir
  CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(rootdir, mds->get_nodeid());
  rootdir->dir_rep = CDir::REP_ALL;   //NONE;

  // Fresh dirfrag: accounted and live stats must agree, and must match the
  // inode's dirstat.
  ceph_assert(rootdir->fnode.accounted_fragstat == rootdir->fnode.fragstat);
  ceph_assert(rootdir->fnode.fragstat == root->inode.dirstat);
  ceph_assert(rootdir->fnode.accounted_rstat == rootdir->fnode.rstat);
  /* Do not update rootdir rstat information of the fragment, rstat upkeep magic
   * assume version 0 is stale/invalid.
   */

  rootdir->mark_complete();
  rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
  rootdir->commit(0, gather->new_sub());

  // NOTE(review): mark_clean() immediately before mark_dirty() looks like it
  // resets dirty state before re-dirtying with a fresh projected version —
  // confirm intent before simplifying.
  root->mark_clean();
  root->mark_dirty(root->pre_dirty(), mds->mdlog->get_current_segment());
  root->mark_dirty_parent(mds->mdlog->get_current_segment(), true);
  root->flush(gather->new_sub());
}
500
// Create this rank's private ~mdsdir hierarchy (the mdsdir inode plus all
// NUM_STRAY stray directories), wire up their stats, and queue the initial
// commits/backtraces on the gather.
void MDCache::create_mydir_hierarchy(MDSGather *gather)
{
  // create mds dir
  CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);

  CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
  adjust_subtree_auth(mydir, mds->get_nodeid());

  LogSegment *ls = mds->mdlog->get_current_segment();

  // stray dir: one "strayN" child per stray slot
  for (int i = 0; i < NUM_STRAY; ++i) {
    CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
    CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
    stringstream name;
    name << "stray" << i;
    CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
    sdn->_mark_dirty(mds->mdlog->get_current_segment());

    stray->inode.dirstat = straydir->fnode.fragstat;

    // roll each stray's stats up into mydir
    mydir->fnode.rstat.add(stray->inode.rstat);
    mydir->fnode.fragstat.nsubdirs++;
    // save them
    straydir->mark_complete();
    straydir->mark_dirty(straydir->pre_dirty(), ls);
    straydir->commit(0, gather->new_sub());
    stray->mark_dirty_parent(ls, true);
    stray->store_backtrace(gather->new_sub());
  }

  // Fresh hierarchy: accounted stats match live stats.
  mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
  mydir->fnode.accounted_rstat = mydir->fnode.rstat;

  myin->inode.dirstat = mydir->fnode.fragstat;
  myin->inode.rstat = mydir->fnode.rstat;
  ++myin->inode.rstat.rsubdirs;  // count mydir itself
  myin->inode.accounted_rstat = myin->inode.rstat;

  mydir->mark_complete();
  mydir->mark_dirty(mydir->pre_dirty(), ls);
  mydir->commit(0, gather->new_sub());

  myin->store(gather->new_sub());
}
546
// Journal-completion context for _create_system_file(): once the EUpdate is
// durable, finish applying the mutation and fire the caller's continuation.
struct C_MDC_CreateSystemFile : public MDCacheLogContext {
  MutationRef mut;   // mutation holding the locks taken for the create
  CDentry *dn;       // dentry being created
  version_t dpv;     // projected dentry version to mark dirty
  MDSContext *fin;   // caller's continuation
  C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSContext *f) :
    MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
  void finish(int r) override {
    mdcache->_create_system_file_finish(mut, dn, dpv, fin);
  }
};
558
// Link a pre-built system inode into `dir` under `name`, journal the create
// as an EUpdate, and complete `fin` once it is durable (via
// C_MDC_CreateSystemFile). mdsdir inodes are linked as remote dentries.
void MDCache::_create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin)
{
  dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
  CDentry *dn = dir->add_null_dentry(name);

  dn->push_projected_linkage(in);
  version_t dpv = dn->pre_dirty();

  CDir *mdir = 0;
  if (in->inode.is_dir()) {
    in->inode.rstat.rsubdirs = 1;

    // open the new directory's (empty, complete) dirfrag now
    mdir = in->get_or_open_dirfrag(this, frag_t());
    mdir->mark_complete();
    mdir->pre_dirty();
  } else
    in->inode.rstat.rfiles = 1;
  in->inode.version = dn->pre_dirty();

  // Start the dentry/inode at the next snap sequence of the parent realm.
  SnapRealm *realm = dir->get_inode()->find_snaprealm();
  dn->first = in->first = realm->get_newest_seq() + 1;

  MutationRef mut(new MutationImpl());

  // force some locks. hacky.
  mds->locker->wrlock_force(&dir->inode->filelock, mut);
  mds->locker->wrlock_force(&dir->inode->nestlock, mut);

  mut->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "create system file");
  mds->mdlog->start_entry(le);

  if (!in->is_mdsdir()) {
    // normal case: primary dentry owns the inode
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
    le->metablob.add_primary_dentry(dn, in, true);
  } else {
    // mdsdir: the inode is a base inode journaled as a root; link remotely
    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
    journal_dirty_inode(mut.get(), &le->metablob, in);
    dn->push_projected_linkage(in->ino(), in->d_type());
    le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
    le->metablob.add_root(true, in);
  }
  if (mdir)
    le->metablob.add_new_dir(mdir); // dirty AND complete AND new

  mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
  mds->mdlog->flush();
}
607
// Journal-durable half of _create_system_file(): apply the projected
// linkage/versions, drop the forced locks, and run the caller's continuation.
void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSContext *fin)
{
  dout(10) << "_create_system_file_finish " << *dn << dendl;

  dn->pop_projected_linkage();
  dn->mark_dirty(dpv, mut->ls);

  CInode *in = dn->get_linkage()->get_inode();
  // NOTE(review): version-- then mark_dirty(version + 1) nets out to the same
  // version; presumably mark_dirty requires pv > inode.version — confirm.
  in->inode.version--;
  in->mark_dirty(in->inode.version + 1, mut->ls);

  if (in->inode.is_dir()) {
    CDir *dir = in->get_dirfrag(frag_t());
    ceph_assert(dir);
    dir->mark_dirty(1, mut->ls);
    dir->mark_new(mut->ls);
  }

  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  fin->complete(0);

  //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
  //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
}
635
636
637
// Retry context for the multi-pass open_root()/populate_mydir() state
// machines: on success, re-enter open_root(); on any error, mark the rank
// damaged (root being unreadable is unrecoverable here).
struct C_MDS_RetryOpenRoot : public MDSInternalContext {
  MDCache *cache;  // non-owning back-pointer
  explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
  void finish(int r) override {
    if (r < 0) {
      // If we can't open root, something disastrous has happened: mark
      // this rank damaged for operator intervention. Note that
      // it is not okay to call suicide() here because we are in
      // a Finisher callback.
      cache->mds->damaged();
      ceph_abort();  // damaged should never return
    } else {
      cache->open_root();
    }
  }
};
654
655 void MDCache::open_root_inode(MDSContext *c)
656 {
657 if (mds->get_nodeid() == mds->mdsmap->get_root()) {
658 CInode *in;
659 in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755); // initially inaccurate!
660 in->fetch(c);
661 } else {
662 discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
663 }
664 }
665
666 void MDCache::open_mydir_inode(MDSContext *c)
667 {
668 CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
669 in->fetch(c);
670 }
671
// Open the ~mdsdir inode, then (on success) open its root dirfrag, claim
// subtree authority over it, and fetch its contents; errors propagate to `c`.
void MDCache::open_mydir_frag(MDSContext *c)
{
  open_mydir_inode(
      new MDSInternalContextWrapper(mds,
	new LambdaContext([this, c](int r) {
	  if (r < 0) {
	    c->complete(r);
	    return;
	  }
	  // myin was set by add_inode() during open_mydir_inode()
	  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
	  ceph_assert(mydir);
	  adjust_subtree_auth(mydir, mds->get_nodeid());
	  mydir->fetch(c);
	})
      )
    );
}
689
// Multi-pass state machine to open root and ~mdsdir. Each missing piece is
// fetched/discovered asynchronously with a C_MDS_RetryOpenRoot that re-enters
// this function, so each call makes progress until populate_mydir() runs.
void MDCache::open_root()
{
  dout(10) << "open_root" << dendl;

  if (!root) {
    open_root_inode(new C_MDS_RetryOpenRoot(this));
    return;
  }
  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
    // we are auth for root: ensure the dirfrag exists, is a subtree root,
    // and is fully loaded
    ceph_assert(root->is_auth());
    CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
    ceph_assert(rootdir);
    if (!rootdir->is_subtree_root())
      adjust_subtree_auth(rootdir, mds->get_nodeid());
    if (!rootdir->is_complete()) {
      rootdir->fetch(new C_MDS_RetryOpenRoot(this));
      return;
    }
  } else {
    // replica: make sure we have a replica of the root dirfrag
    ceph_assert(!root->is_auth());
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (!rootdir) {
      open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
      return;
    }
  }

  if (!myin) {
    CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755); // initially inaccurate!
    in->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  ceph_assert(mydir);
  adjust_subtree_auth(mydir, mds->get_nodeid());

  populate_mydir();
}
728
// Load (or create) everything under ~mdsdir: the dirfrag itself, all stray
// directories and their fragments. Like open_root(), this is re-entered via
// C_MDS_RetryOpenRoot after each async fetch/create until everything is in
// cache, then the cache is marked open and the stray manager activated.
void MDCache::populate_mydir()
{
  ceph_assert(myin);
  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
  ceph_assert(mydir);

  dout(10) << "populate_mydir " << *mydir << dendl;

  if (!mydir->is_complete()) {
    mydir->fetch(new C_MDS_RetryOpenRoot(this));
    return;
  }

  if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
    // A missing dirfrag, we will recreate it.  Before that, we must dirty
    // it before dirtying any of the strays we create within it.
    mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
      "recreating it now";
    LogSegment *ls = mds->mdlog->get_current_segment();
    mydir->state_clear(CDir::STATE_BADFRAG);
    mydir->mark_complete();
    mydir->mark_dirty(mydir->pre_dirty(), ls);
  }

  // open or create stray
  uint64_t num_strays = 0;
  for (int i = 0; i < NUM_STRAY; ++i) {
    stringstream name;
    name << "stray" << i;
    CDentry *straydn = mydir->lookup(name.str());

    // allow for older fs's with stray instead of stray0
    if (straydn == NULL && i == 0)
      straydn = mydir->lookup("stray");

    if (!straydn || !straydn->get_linkage()->get_inode()) {
      // missing stray dir: create it asynchronously and retry this pass
      _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
			  new C_MDS_RetryOpenRoot(this));
      return;
    }
    ceph_assert(straydn);
    ceph_assert(strays[i]);
    // we make multiple passes through this method; make sure we only pin each stray once.
    if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->get(CInode::PIN_STRAY);
      strays[i]->state_set(CInode::STATE_STRAYPINNED);
      strays[i]->get_stickydirs();
    }
    dout(20) << " stray num " << i << " is " << *strays[i] << dendl;

    // open all frags
    frag_vec_t leaves;
    strays[i]->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = strays[i]->get_dirfrag(leaf);
      if (!dir) {
	dir = strays[i]->get_or_open_dirfrag(this, leaf);
      }

      // DamageTable applies special handling to strays: it will
      // have damaged() us out if one is damaged.
      ceph_assert(!dir->state_test(CDir::STATE_BADFRAG));

      if (dir->get_version() == 0) {
	dir->fetch(new C_MDS_RetryOpenRoot(this));
	return;
      }

      if (dir->get_frag_size() > 0)
	num_strays += dir->get_frag_size();
    }
  }

  // okay!
  dout(10) << "populate_mydir done" << dendl;
  ceph_assert(!open);
  open = true;
  mds->queue_waiters(waiting_for_open);

  stray_manager.set_num_strays(num_strays);
  stray_manager.activate();

  scan_stray_dir();
}
813
// Discover another rank's ~mdsdir inode from its owner. The owning rank is
// presumably encoded in the low bits of the mdsdir ino (ino & (MAX_MDS-1)) —
// confirm against the MDS_INO_MDSDIR encoding.
void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSContext *fin)
{
  discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1)));
}
818
819 CDir *MDCache::get_stray_dir(CInode *in)
820 {
821 string straydname;
822 in->name_stray_dentry(straydname);
823
824 CInode *strayi = get_stray();
825 ceph_assert(strayi);
826 frag_t fg = strayi->pick_dirfrag(straydname);
827 CDir *straydir = strayi->get_dirfrag(fg);
828 ceph_assert(straydir);
829 return straydir;
830 }
831
832 CDentry *MDCache::get_or_create_stray_dentry(CInode *in)
833 {
834 CDir *straydir = get_stray_dir(in);
835 string straydname;
836 in->name_stray_dentry(straydname);
837 CDentry *straydn = straydir->lookup(straydname);
838 if (!straydn) {
839 straydn = straydir->add_null_dentry(straydname);
840 straydn->mark_new();
841 } else {
842 ceph_assert(straydn->get_projected_linkage()->is_null());
843 }
844
845 straydn->state_set(CDentry::STATE_STRAY);
846 return straydn;
847 }
848
849
850
851 MDSCacheObject *MDCache::get_object(const MDSCacheObjectInfo &info)
852 {
853 // inode?
854 if (info.ino)
855 return get_inode(info.ino, info.snapid);
856
857 // dir or dentry.
858 CDir *dir = get_dirfrag(info.dirfrag);
859 if (!dir) return 0;
860
861 if (info.dname.length())
862 return dir->lookup(info.dname, info.snapid);
863 else
864 return dir;
865 }
866
867
868 // ====================================================================
869 // consistent hash ring
870
871 /*
872 * hashing implementation based on Lamping and Veach's Jump Consistent Hash: https://arxiv.org/pdf/1406.2294.pdf
873 */
// Map an inode number deterministically onto [0, max_mds) using Jump
// Consistent Hash, so bucket assignments stay mostly stable as max_mds grows.
mds_rank_t MDCache::hash_into_rank_bucket(inodeno_t ino)
{
  const mds_rank_t max_mds = mds->mdsmap->get_max_mds();
  uint64_t hash = rjhash64(ino);
  // b = last accepted bucket; j = next candidate jump destination
  int64_t b = -1, j = 0;
  while (j < max_mds) {
    b = j;
    hash = hash*2862933555777941757ULL + 1;  // LCG step from the paper
    // jump forward; (hash >> 33) + 1 yields a uniform value in (0, 2^31]
    j = (b + 1) * (double(1LL << 31) / double((hash >> 33) + 1));
  }
  // verify bounds before returning
  auto result = mds_rank_t(b);
  ceph_assert(result >= 0 && result < max_mds);
  return result;
}
889
890
891 // ====================================================================
892 // subtree management
893
894 /*
895 * adjust the dir_auth of a subtree.
896 * merge with parent and/or child subtrees, if is it appropriate.
897 * merge can ONLY happen if both parent and child have unambiguous auth.
898 */
void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth, bool adjust_pop)
{
  dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth
	  << " on " << *dir << dendl;

  show_subtrees();

  // Find the subtree root that currently contains `dir`.
  CDir *root;
  if (dir->inode->is_base()) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir];  // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me: any bound of the old root
    // whose nearest subtree root is now `dir` becomes my bound instead.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
	// move under me
	dout(10) << " claiming child bound " << **p << dendl;
	subtrees[dir].insert(*p);
	subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;

    // adjust recursive pop counters: my popularity no longer counts
    // towards my old parent subtree's auth-subtree totals.
    if (adjust_pop && dir->is_auth()) {
      CDir *p = dir->get_parent_dir();
      while (p) {
	p->pop_auth_subtree.sub(dir->pop_auth_subtree);
	if (p->is_subtree_root()) break;
	p = p->inode->get_parent_dir();
      }
    }
  }

  if (dir->is_auth()) {
    /* do this now that we are auth for the CDir */
    dir->inode->maybe_pin();
  }

  show_subtrees();
}
971
972
// Attempt to merge `dir` and each of its (former) bounds into their parent
// subtrees, then re-evaluate locks on affected subtree-root inodes (skipped
// during replay/resolve, when lock state must not be touched).
void MDCache::try_subtree_merge(CDir *dir)
{
  dout(7) << "try_subtree_merge " << *dir << dendl;
  // record my old bounds (copy: merging below mutates the subtrees map)
  auto oldbounds = subtrees.at(dir);

  set<CInode*> to_eval;
  // try merge at my root
  try_subtree_merge_at(dir, &to_eval);

  // try merge at my old bounds
  for (auto bound : oldbounds)
    try_subtree_merge_at(bound, &to_eval);

  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for(auto in : to_eval)
      eval_subtree_root(in);
  }
}
992
// Journal-completion context for a subtree-merge writebehind: once durable,
// apply the projected inode and release the mutation's locks.
class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
  CInode *in;       // inode whose projected state is being written back
  MutationRef mut;  // mutation holding the writeback's locks
public:
  C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
  void finish(int r) override {
    mdcache->subtree_merge_writebehind_finish(in, mut);
  }
};
1002
// Merge the subtree rooted at `dir` into its parent subtree when both have
// the same unambiguous auth. No-op for ambiguous auth, export bounds, or
// auxiliary subtrees. Auth subtree-root inodes are collected into *to_eval
// for later lock re-evaluation.
void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval, bool adjust_pop)
{
  dout(10) << "try_subtree_merge_at " << *dir << dendl;

  // second == CDIR_AUTH_UNKNOWN means auth is unambiguous (not mid-migration)
  if (dir->dir_auth.second != CDIR_AUTH_UNKNOWN ||
      dir->state_test(CDir::STATE_EXPORTBOUND) ||
      dir->state_test(CDir::STATE_AUXSUBTREE))
    return;

  auto it = subtrees.find(dir);
  ceph_assert(it != subtrees.end());

  // merge with parent?
  CDir *parent = dir;
  if (!dir->inode->is_base())
    parent = get_subtree_root(dir->get_parent_dir());

  if (parent != dir &&			     // we have a parent,
      parent->dir_auth == dir->dir_auth) {   // auth matches,
    // merge with parent.
    dout(10) << " subtree merge at " << *dir << dendl;
    dir->set_dir_auth(CDIR_AUTH_DEFAULT);

    // move our bounds under the parent
    subtrees[parent].insert(it->second.begin(), it->second.end());

    // we are no longer a subtree or bound
    dir->put(CDir::PIN_SUBTREE);
    subtrees.erase(it);
    subtrees[parent].erase(dir);

    // adjust popularity? our auth-subtree popularity now counts towards
    // every ancestor up to the (new) enclosing subtree root.
    if (adjust_pop && dir->is_auth()) {
      CDir *cur = dir;
      CDir *p = dir->get_parent_dir();
      while (p) {
	p->pop_auth_subtree.add(dir->pop_auth_subtree);
	p->pop_lru_subdirs.push_front(&cur->get_inode()->item_pop_lru);
	if (p->is_subtree_root()) break;
	cur = p;
	p = p->inode->get_parent_dir();
      }
    }

    if (to_eval && dir->get_inode()->is_auth())
      to_eval->insert(dir->get_inode());

    show_subtrees(15);
  }
}
1053
/*
 * Completion of a subtree-merge write-behind (see C_MDC_SubtreeMergeWB):
 * the journal entry has committed, so make the projected inode state
 * live, apply the mutation, and release its locks.
 */
void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
{
  dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
  in->pop_and_dirty_projected_inode(mut->ls);

  // order matters: apply the mutation before dropping locks and
  // cleaning it up.
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  // balance the auth_pin taken when the write-behind was queued
  // (presumably by the code that created the C_MDC_SubtreeMergeWB —
  // confirm against the caller).
  in->auth_unpin(this);
}
1065
1066 void MDCache::eval_subtree_root(CInode *diri)
1067 {
1068 // evaluate subtree inode filelock?
1069 // (we should scatter the filelock on subtree bounds)
1070 ceph_assert(diri->is_auth());
1071 mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
1072 }
1073
1074
/*
 * Set the authority of the subtree rooted at 'dir' to 'auth', bounded by
 * the given set of dirfrags.  New bounds are carved out (keeping the old
 * auth), intervening ambiguous subtrees are swallowed, and stray bounds
 * not in 'bounds' are absorbed, so that on return the recorded bound set
 * for 'dir' matches 'bounds' exactly (asserted via
 * verify_subtree_bounds()).
 */
void MDCache::adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth)
{
  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
          << " on " << *dir
          << " bounds " << bounds
          << dendl;

  show_subtrees();

  // find (or, for the root inode, bootstrap) the enclosing subtree root.
  CDir *root;
  if (dir->ino() == MDS_INO_ROOT) {
    root = dir;  // bootstrap hack.
    if (subtrees.count(root) == 0) {
      subtrees[root];
      root->get(CDir::PIN_SUBTREE);
    }
  } else {
    root = get_subtree_root(dir);  // subtree root
  }
  ceph_assert(root);
  ceph_assert(subtrees.count(root));
  dout(7) << " current root is " << *root << dendl;

  mds_authority_t oldauth = dir->authority();

  if (root == dir) {
    // i am already a subtree.
    dir->set_dir_auth(auth);
  } else {
    // i am a new subtree.
    dout(10) << " new subtree at " << *dir << dendl;
    ceph_assert(subtrees.count(dir) == 0);
    subtrees[dir];      // create empty subtree bounds list for me.
    dir->get(CDir::PIN_SUBTREE);

    // set dir_auth
    dir->set_dir_auth(auth);

    // move items nested beneath me, under me.
    set<CDir*>::iterator p = subtrees[root].begin();
    while (p != subtrees[root].end()) {
      set<CDir*>::iterator next = p;
      ++next;
      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
        // move under me
        dout(10) << " claiming child bound " << **p << dendl;
        subtrees[dir].insert(*p);
        subtrees[root].erase(p);
      }
      p = next;
    }

    // i am a bound of the parent subtree.
    subtrees[root].insert(dir);

    // i am now the subtree root.
    root = dir;
  }

  set<CInode*> to_eval;

  // verify/adjust bounds.
  // - these may be new, or
  // - beneath existing ambiguous bounds (which will be collapsed),
  // - but NOT beneath unambiguous bounds.
  for (const auto& bound : bounds) {
    // new bound?
    if (subtrees[dir].count(bound) == 0) {
      if (get_subtree_root(bound) == dir) {
        // the bound sits directly inside our subtree: carve it out,
        // restoring the authority 'dir' had before this call.
        dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
        adjust_subtree_auth(bound, oldauth);       // otherwise, adjust at bound.
      }
      else {
        dout(10) << " want bound " << *bound << dendl;
        CDir *t = get_subtree_root(bound->get_parent_dir());
        if (subtrees[t].count(bound) == 0) {
          ceph_assert(t != dir);
          dout(10) << " new bound " << *bound << dendl;
          adjust_subtree_auth(bound, t->authority());
        }
        // make sure it's nested beneath ambiguous subtree(s): keep
        // swallowing (re-authing and merging) intervening subtrees
        // until the bound's parent subtree is 'dir' itself.
        while (1) {
          while (subtrees[dir].count(t) == 0)
            t = get_subtree_root(t->get_parent_dir());
          dout(10) << " swallowing intervening subtree at " << *t << dendl;
          adjust_subtree_auth(t, auth);
          try_subtree_merge_at(t, &to_eval);
          t = get_subtree_root(bound->get_parent_dir());
          if (t == dir) break;
        }
      }
    }
    else {
      dout(10) << " already have bound " << *bound << dendl;
    }
  }
  // merge stray bounds?  absorb recorded bounds that the caller did not
  // ask for; swallowing may expose new bounds, so iterate to a fixpoint.
  while (!subtrees[dir].empty()) {
    set<CDir*> copy = subtrees[dir];
    for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
      if (bounds.count(*p) == 0) {
        CDir *stray = *p;
        dout(10) << " swallowing extra subtree at " << *stray << dendl;
        adjust_subtree_auth(stray, auth);
        try_subtree_merge_at(stray, &to_eval);
      }
    }
    // swallowing subtree may add new subtree bounds
    if (copy == subtrees[dir])
      break;
  }

  // bound should now match.
  verify_subtree_bounds(dir, bounds);

  show_subtrees();

  // lock evaluation is deferred while replaying or resolving.
  if (!(mds->is_any_replay() || mds->is_resolve())) {
    for(auto in : to_eval)
      eval_subtree_root(in);
  }
}
1197
1198
/*
 * return a set of CDir*'s that correspond to the given bound set.  Only adjust
 * fragmentation as necessary to get an equivalent bounding set.  That is, only
 * split if one of our frags spans the provided bounding set.  Never merge.
 */
void MDCache::get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds)
{
  dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;

  // sort by ino
  map<inodeno_t, fragset_t> byino;
  for (auto& frag : dfs) {
    byino[frag.ino].insert_raw(frag.frag);
  }
  dout(10) << " by ino: " << byino << dendl;

  for (map<inodeno_t,fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
    p->second.simplify();
    CInode *diri = get_inode(p->first);
    if (!diri)
      continue;
    dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;

    // build a fragtree whose leaves are exactly the requested frags;
    // used below to decide how an approximate frag must be split.
    fragtree_t tmpdft;
    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
      tmpdft.force_to_leaf(g_ceph_context, *q);

    for (const auto& fg : p->second) {
      frag_vec_t leaves;
      diri->dirfragtree.get_leaves_under(fg, leaves);
      if (leaves.empty()) {
        // no leaf under fg: our local fragtree is coarser here.  find
        // the larger frag that contains fg and split it only as much as
        // the requested bound set requires.
        bool all = true;
        frag_t approx_fg = diri->dirfragtree[fg.value()];
        frag_vec_t approx_leaves;
        tmpdft.get_leaves_under(approx_fg, approx_leaves);
        for (const auto& leaf : approx_leaves) {
          if (p->second.get().count(leaf) == 0) {
            // not bound, so the resolve message is from auth MDS of the dirfrag
            force_dir_fragment(diri, leaf);
            all = false;
          }
        }
        if (all)
          leaves.push_back(approx_fg);
        else
          diri->dirfragtree.get_leaves_under(fg, leaves);
      }
      dout(10) << " frag " << fg << " contains " << leaves << dendl;
      // collect the CDir for each leaf that is actually open in cache.
      for (const auto& leaf : leaves) {
        CDir *dir = diri->get_dirfrag(leaf);
        if (dir)
          bounds.insert(dir);
      }
    }
  }
}
1255
1256 void MDCache::adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bound_dfs, const mds_authority_t &auth)
1257 {
1258 dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
1259 << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
1260
1261 set<CDir*> bounds;
1262 get_force_dirfrag_bound_set(bound_dfs, bounds);
1263 adjust_bounded_subtree_auth(dir, bounds, auth);
1264 }
1265
1266 void MDCache::map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result)
1267 {
1268 dout(10) << "map_dirfrag_set " << dfs << dendl;
1269
1270 // group by inode
1271 map<inodeno_t, fragset_t> ino_fragset;
1272 for (const auto &df : dfs) {
1273 ino_fragset[df.ino].insert_raw(df.frag);
1274 }
1275 // get frags
1276 for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
1277 p != ino_fragset.end();
1278 ++p) {
1279 p->second.simplify();
1280 CInode *in = get_inode(p->first);
1281 if (!in)
1282 continue;
1283
1284 frag_vec_t fgs;
1285 for (const auto& fg : p->second) {
1286 in->dirfragtree.get_leaves_under(fg, fgs);
1287 }
1288
1289 dout(15) << "map_dirfrag_set " << p->second << " -> " << fgs
1290 << " on " << *in << dendl;
1291
1292 for (const auto& fg : fgs) {
1293 CDir *dir = in->get_dirfrag(fg);
1294 if (dir)
1295 result.insert(dir);
1296 }
1297 }
1298 }
1299
1300
1301
1302 CDir *MDCache::get_subtree_root(CDir *dir)
1303 {
1304 // find the underlying dir that delegates (or is about to delegate) auth
1305 while (true) {
1306 if (dir->is_subtree_root())
1307 return dir;
1308 dir = dir->get_inode()->get_parent_dir();
1309 if (!dir)
1310 return 0; // none
1311 }
1312 }
1313
1314 CDir *MDCache::get_projected_subtree_root(CDir *dir)
1315 {
1316 // find the underlying dir that delegates (or is about to delegate) auth
1317 while (true) {
1318 if (dir->is_subtree_root())
1319 return dir;
1320 dir = dir->get_inode()->get_projected_parent_dir();
1321 if (!dir)
1322 return 0; // none
1323 }
1324 }
1325
1326 void MDCache::remove_subtree(CDir *dir)
1327 {
1328 dout(10) << "remove_subtree " << *dir << dendl;
1329 auto it = subtrees.find(dir);
1330 ceph_assert(it != subtrees.end());
1331 subtrees.erase(it);
1332 dir->put(CDir::PIN_SUBTREE);
1333 if (dir->get_parent_dir()) {
1334 CDir *p = get_subtree_root(dir->get_parent_dir());
1335 auto it = subtrees.find(p);
1336 ceph_assert(it != subtrees.end());
1337 auto count = it->second.erase(dir);
1338 ceph_assert(count == 1);
1339 }
1340 }
1341
1342 void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1343 {
1344 ceph_assert(subtrees.count(dir));
1345 bounds = subtrees[dir];
1346 }
1347
1348 void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
1349 {
1350 if (subtrees.count(dir)) {
1351 // just copy them, dir is a subtree.
1352 get_subtree_bounds(dir, bounds);
1353 } else {
1354 // find them
1355 CDir *root = get_subtree_root(dir);
1356 for (set<CDir*>::iterator p = subtrees[root].begin();
1357 p != subtrees[root].end();
1358 ++p) {
1359 CDir *t = *p;
1360 while (t != root) {
1361 t = t->get_parent_dir();
1362 ceph_assert(t);
1363 if (t == dir) {
1364 bounds.insert(*p);
1365 continue;
1366 }
1367 }
1368 }
1369 }
1370 }
1371
1372 void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
1373 {
1374 // for debugging only.
1375 ceph_assert(subtrees.count(dir));
1376 if (bounds != subtrees[dir]) {
1377 dout(0) << "verify_subtree_bounds failed" << dendl;
1378 set<CDir*> b = bounds;
1379 for (auto &cd : subtrees[dir]) {
1380 if (bounds.count(cd)) {
1381 b.erase(cd);
1382 continue;
1383 }
1384 dout(0) << " missing bound " << *cd << dendl;
1385 }
1386 for (const auto &cd : b)
1387 dout(0) << " extra bound " << *cd << dendl;
1388 }
1389 ceph_assert(bounds == subtrees[dir]);
1390 }
1391
1392 void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
1393 {
1394 // for debugging only.
1395 ceph_assert(subtrees.count(dir));
1396
1397 // make sure that any bounds i do have are properly noted as such.
1398 int failed = 0;
1399 for (const auto &fg : bounds) {
1400 CDir *bd = get_dirfrag(fg);
1401 if (!bd) continue;
1402 if (subtrees[dir].count(bd) == 0) {
1403 dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
1404 failed++;
1405 }
1406 }
1407 ceph_assert(failed == 0);
1408 }
1409
1410 void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
1411 {
1412 dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
1413 << " to " << *newdir << dendl;
1414 projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
1415 }
1416
/*
 * Fix up subtree bookkeeping after diri has been renamed out of olddir
 * into its (new) current parent dir.  If 'pop' is set, consume the
 * matching entry queued by project_subtree_rename().  Re-evaluating
 * diri's locks is left to the caller.
 */
void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
{
  dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;

  CDir *newdir = diri->get_parent_dir();

  if (pop) {
    // consume the projected rename recorded earlier; it must be the
    // oldest pending entry and must match this (olddir, newdir) move.
    map<CInode*,list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
    ceph_assert(p != projected_subtree_renames.end());
    ceph_assert(!p->second.empty());
    ceph_assert(p->second.front().first == olddir);
    ceph_assert(p->second.front().second == newdir);
    p->second.pop_front();
    if (p->second.empty())
      projected_subtree_renames.erase(p);
  }

  // adjust total auth pin of freezing subtree
  if (olddir != newdir) {
    auto&& dfls = diri->get_nested_dirfrags();
    for (const auto& dir : dfls)
      olddir->adjust_freeze_after_rename(dir);
  }

  // adjust subtree
  // N.B. make sure subtree dirfrags are at the front of the list
  auto dfls = diri->get_subtree_dirfrags();
  diri->get_nested_dirfrags(dfls);
  for (const auto& dir : dfls) {
    dout(10) << "dirfrag " << *dir << dendl;
    CDir *oldparent = get_subtree_root(olddir);
    dout(10) << " old parent " << *oldparent << dendl;
    CDir *newparent = get_subtree_root(newdir);
    dout(10) << " new parent " << *newparent << dendl;

    auto& oldbounds = subtrees[oldparent];
    auto& newbounds = subtrees[newparent];

    if (olddir != newdir)
      mds->balancer->adjust_pop_for_rename(olddir, dir, false);

    if (oldparent == newparent) {
      dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
    } else if (dir->is_subtree_root()) {
      // children are fine. change parent.  move this dirfrag from the
      // old parent's bound set to the new parent's.
      dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
      {
        auto n = oldbounds.erase(dir);
        ceph_assert(n == 1);
      }
      newbounds.insert(dir);
      // caller is responsible for 'eval diri'
      try_subtree_merge_at(dir, NULL, false);
    } else {
      // mid-subtree.

      // see if any old bounds move to the new parent.
      std::vector<CDir*> tomove;
      for (const auto& bound : oldbounds) {
        CDir *broot = get_subtree_root(bound->get_parent_dir());
        if (broot != oldparent) {
          ceph_assert(broot == newparent);
          tomove.push_back(bound);
        }
      }
      for (const auto& bound : tomove) {
        dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
        oldbounds.erase(bound);
        newbounds.insert(bound);
      }

      // did auth change?  if so, make this dirfrag a subtree root with
      // the old authority (then try to merge it back).
      if (oldparent->authority() != newparent->authority()) {
        adjust_subtree_auth(dir, oldparent->authority(), false);
        // caller is responsible for 'eval diri'
        try_subtree_merge_at(dir, NULL, false);
      }
    }

    if (olddir != newdir)
      mds->balancer->adjust_pop_for_rename(newdir, dir, true);
  }

  show_subtrees();
}
1502
1503 // ===================================
1504 // journal and snap/cow helpers
1505
1506
1507 /*
1508 * find first inode in cache that follows given snapid. otherwise, return current.
1509 */
1510 CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
1511 {
1512 dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
1513 ceph_assert(in->last == CEPH_NOSNAP);
1514
1515 auto p = snap_inode_map.upper_bound(vinodeno_t(in->ino(), follows));
1516 if (p != snap_inode_map.end() && p->second->ino() == in->ino()) {
1517 dout(10) << "pick_inode_snap found " << *p->second << dendl;
1518 in = p->second;
1519 }
1520
1521 return in;
1522 }
1523
1524
/*
 * note: i'm currently cheating wrt dirty and inode.version on cow
 * items.  instead of doing a full dir predirty, i just take the
 * original item's version, and set the dirty flag (via
 * mutation::add_cow_{inode,dentry}() and mutation::apply().  that
 * means a special case in the dir commit clean sweep assertions.
 * bah.
 */
CInode *MDCache::cow_inode(CInode *in, snapid_t last)
{
  ceph_assert(last >= in->first);

  // clone the previous projected state into a new inode covering
  // [in->first, last]; 'in' then starts at last+1.
  CInode *oldin = new CInode(this, true, in->first, last);
  oldin->inode = *in->get_previous_projected_inode();
  oldin->xattrs = *in->get_previous_projected_xattrs();
  oldin->symlink = in->symlink;
  oldin->inode.trim_client_ranges(last);

  if (in->first < in->oldest_snap)
    in->oldest_snap = in->first;

  in->first = last+1;

  dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
  add_inode(oldin);

  if (in->last != CEPH_NOSNAP) {
    // 'in' is itself a snapped (non-head) inode: split its pending
    // snapflush state between the new clone and itself.
    CInode *head_in = get_inode(in->ino());
    ceph_assert(head_in);
    auto ret = head_in->split_need_snapflush(oldin, in);
    if (ret.first) {
      // the clone still needs a snapflush: take over the snap caps and
      // put the relevant locks into the gathering LOCK_SNAP_SYNC state.
      oldin->client_snap_caps = in->client_snap_caps;
      if (!oldin->client_snap_caps.empty()) {
        for (int i = 0; i < num_cinode_locks; i++) {
          SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
          ceph_assert(lock);
          if (lock->get_state() != LOCK_SNAP_SYNC) {
            ceph_assert(lock->is_stable());
            lock->set_state(LOCK_SNAP_SYNC);  // gathering
            oldin->auth_pin(lock);
          }
          lock->get_wrlock(true);
        }
      }
    }
    if (!ret.second) {
      // 'in' no longer needs a snapflush: release its snap caps and
      // unwind the corresponding wrlocks and auth pins.
      auto client_snap_caps = std::move(in->client_snap_caps);
      in->client_snap_caps.clear();
      in->item_open_file.remove_myself();
      in->item_caps.remove_myself();

      if (!client_snap_caps.empty()) {
        MDSContext::vec finished;
        for (int i = 0; i < num_cinode_locks; i++) {
          SimpleLock *lock = in->get_lock(cinode_lock_info[i].lock);
          ceph_assert(lock);
          ceph_assert(lock->get_state() == LOCK_SNAP_SYNC);  // gathering
          lock->put_wrlock();
          if (!lock->get_num_wrlocks()) {
            lock->set_state(LOCK_SYNC);
            lock->take_waiting(SimpleLock::WAIT_STABLE|SimpleLock::WAIT_RD, finished);
            in->auth_unpin(lock);
          }
        }
        mds->queue_waiters(finished);
      }
    }
    return oldin;
  }

  if (!in->client_caps.empty()) {
    const set<snapid_t>& snaps = in->find_snaprealm()->get_snaps();
    // clone caps?
    for (auto &p : in->client_caps) {
      client_t client = p.first;
      Capability *cap = &p.second;
      int issued = cap->need_snapflush() ? CEPH_CAP_ANY_WR : cap->issued();
      if ((issued & CEPH_CAP_ANY_WR) &&
          cap->client_follows < last) {
        // client may have writable caps over the cloned range: it will
        // need to snapflush the clone.
        dout(10) << " client." << client << " cap " << ccap_string(issued) << dendl;
        oldin->client_snap_caps.insert(client);
        cap->client_follows = last;

        // we need snapflushes for any intervening snaps
        dout(10) << " snaps " << snaps << dendl;
        for (auto q = snaps.lower_bound(oldin->first);
             q != snaps.end() && *q <= last;
             ++q) {
          in->add_need_snapflush(oldin, *q, client);
        }
      } else {
        dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
      }
    }

    if (!oldin->client_snap_caps.empty()) {
      // gather the clone's locks until the snapflushes arrive.
      for (int i = 0; i < num_cinode_locks; i++) {
        SimpleLock *lock = oldin->get_lock(cinode_lock_info[i].lock);
        ceph_assert(lock);
        if (lock->get_state() != LOCK_SNAP_SYNC) {
          ceph_assert(lock->is_stable());
          lock->set_state(LOCK_SNAP_SYNC);  // gathering
          oldin->auth_pin(lock);
        }
        lock->get_wrlock(true);
      }
    }
  }
  return oldin;
}
1635
/*
 * Copy-on-write a dentry (and its primary inode, if any) before
 * journaling an update, so snapshotted state is preserved.  'follows'
 * names the last snapid the old state covers; CEPH_NOSNAP means "use
 * the newest snap seq".  If an inode clone is made, *pcow_inode is set
 * to it (when pcow_inode is non-null).  'dnl' optionally overrides the
 * dentry linkage to consider.
 */
void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
                                 CDentry *dn, snapid_t follows,
                                 CInode **pcow_inode, CDentry::linkage_t *dnl)
{
  if (!dn) {
    dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
    return;
  }
  dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
  ceph_assert(dn->is_auth());

  // nothing to cow on a null dentry, fix caller
  if (!dnl)
    dnl = dn->get_projected_linkage();
  ceph_assert(!dnl->is_null());

  CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
  bool cow_head = false;
  if (in && in->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
    ceph_assert(in->is_frozen_inode());
    cow_head = true;
  }
  if (in && (in->is_multiversion() || cow_head)) {
    // multiversion inode.
    SnapRealm *realm = NULL;

    if (in->get_projected_parent_dn() != dn) {
      // 'dn' is not the (projected) primary link: cow the dentry
      // against the directory's realm first, then the inode below.
      ceph_assert(follows == CEPH_NOSNAP);
      realm = dn->dir->inode->find_snaprealm();
      snapid_t dir_follows = get_global_snaprealm()->get_newest_seq();
      ceph_assert(dir_follows >= realm->get_newest_seq());

      if (dir_follows+1 > dn->first) {
        snapid_t oldfirst = dn->first;
        dn->first = dir_follows+1;
        if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
          // preserve the old link as a remote dentry over [oldfirst, dir_follows].
          CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), in->ino(), in->d_type(),
                                                      oldfirst, dir_follows);
          olddn->pre_dirty();
          dout(10) << " olddn " << *olddn << dendl;
          metablob->add_remote_dentry(olddn, true);
          mut->add_cow_dentry(olddn);
          // FIXME: adjust link count here? hmm.

          if (dir_follows+1 > in->first)
            in->cow_old_inode(dir_follows, cow_head);
        }
      }

      follows = dir_follows;
      if (in->snaprealm) {
        realm = in->snaprealm;
        ceph_assert(follows >= realm->get_newest_seq());
      }
    } else {
      realm = in->find_snaprealm();
      if (follows == CEPH_NOSNAP) {
        follows = get_global_snaprealm()->get_newest_seq();
        ceph_assert(follows >= realm->get_newest_seq());
      }
    }

    // already cloned?
    if (follows < in->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
      return;
    }

    if (!realm->has_snaps_in_range(in->first, follows)) {
      // no snapshot covers the interval: just advance 'first'.
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
      in->first = follows + 1;
      return;
    }

    in->cow_old_inode(follows, cow_head);

  } else {
    // simple (non-multiversion) case: cow the dentry itself.
    SnapRealm *realm = dn->dir->inode->find_snaprealm();
    if (follows == CEPH_NOSNAP) {
      follows = get_global_snaprealm()->get_newest_seq();
      ceph_assert(follows >= realm->get_newest_seq());
    }

    // already cloned?
    if (follows < dn->first) {
      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
      return;
    }

    // update dn.first before adding old dentry to cdir's map
    snapid_t oldfirst = dn->first;
    dn->first = follows+1;

    if (!realm->has_snaps_in_range(oldfirst, follows)) {
      // no snapshot covers the interval: nothing to preserve.
      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
      if (in)
        in->first = follows+1;
      return;
    }

    dout(10) << " dn " << *dn << dendl;
    if (in) {
      // primary dentry: clone the inode and link the clone under an
      // old primary dentry covering [oldfirst, follows].
      CInode *oldin = cow_inode(in, follows);
      mut->add_cow_inode(oldin);
      if (pcow_inode)
        *pcow_inode = oldin;
      CDentry *olddn = dn->dir->add_primary_dentry(dn->get_name(), oldin, oldfirst, follows);
      oldin->inode.version = olddn->pre_dirty();
      dout(10) << " olddn " << *olddn << dendl;
      bool need_snapflush = !oldin->client_snap_caps.empty();
      if (need_snapflush) {
        mut->ls->open_files.push_back(&oldin->item_open_file);
        mds->locker->mark_need_snapflush_inode(oldin);
      }
      metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
      mut->add_cow_dentry(olddn);
    } else {
      // remote dentry: just duplicate the link over the old range.
      ceph_assert(dnl->is_remote());
      CDentry *olddn = dn->dir->add_remote_dentry(dn->get_name(), dnl->get_remote_ino(), dnl->get_remote_d_type(),
                                                  oldfirst, follows);
      olddn->pre_dirty();
      dout(10) << " olddn " << *olddn << dendl;
      metablob->add_remote_dentry(olddn, true);
      mut->add_cow_dentry(olddn);
    }
  }
}
1763
1764
1765 void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
1766 CInode *in, snapid_t follows,
1767 CInode **pcow_inode)
1768 {
1769 dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
1770 CDentry *dn = in->get_projected_parent_dn();
1771 journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
1772 }
1773
1774 void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
1775 {
1776 if (in->is_base()) {
1777 metablob->add_root(true, in);
1778 } else {
1779 if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
1780 follows = in->first - 1;
1781 CDentry *dn = in->get_projected_parent_dn();
1782 if (!dn->get_projected_linkage()->is_null()) // no need to cow a null dentry
1783 journal_cow_dentry(mut, metablob, dn, follows);
1784 if (in->get_projected_inode()->is_backtrace_updated()) {
1785 bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
1786 in->get_previous_projected_inode()->layout.pool_id;
1787 metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
1788 } else {
1789 metablob->add_primary_dentry(dn, in, true);
1790 }
1791 }
1792 }
1793
1794
1795
1796 // nested ---------------------------------------------------------------
1797
/*
 * Project cur's rstat deltas (head inode and any dirty old per-snap
 * rstats) into its parent dirfrag's fnode over the covered snapid
 * range(s).  linkunlink: 0 = plain update, >0 = link, <0 = unlink.
 */
void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
                                          int linkunlink, SnapRealm *prealm)
{
  CDentry *parentdn = cur->get_projected_parent_dn();
  CInode::mempool_inode *curi = cur->get_projected_inode();

  if (cur->first > first)
    first = cur->first;

  dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
           << " " << *cur << dendl;
  dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
  dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;

  /*
   * FIXME. this incompletely propagates rstats to _old_ parents
   * (i.e. shortly after a directory rename). but we need full
   * blown hard link backpointers to make this work properly...
   */
  snapid_t floor = parentdn->first;
  dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;

  if (!prealm)
    prealm = parent->inode->find_snaprealm();
  const set<snapid_t> snaps = prealm->get_snaps();

  if (cur->last != CEPH_NOSNAP) {
    // snapped (non-head) inode: only project if some snapshot actually
    // covers the update interval.
    ceph_assert(cur->dirty_old_rstats.empty());
    set<snapid_t>::const_iterator q = snaps.lower_bound(std::max(first, floor));
    if (q == snaps.end() || *q > cur->last)
      return;
  }

  if (cur->last >= floor) {
    bool update = true;
    if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
      // rename src inode is not projected in the slave rename prep case. so we should
      // avoid updating the inode.
      ceph_assert(linkunlink < 0);
      ceph_assert(cur->is_frozen_inode());
      update = false;
    }
    _project_rstat_inode_to_frag(*curi, std::max(first, floor), cur->last, parent,
                                 linkunlink, update);
  }

  if (g_conf()->mds_snap_rstat) {
    // also project any dirty old (per-snap) rstats that sit at or above
    // the floor and are covered by a snapshot.
    for (const auto &p : cur->dirty_old_rstats) {
      auto &old = cur->old_inodes[p];
      snapid_t ofirst = std::max(old.first, floor);
      auto it = snaps.lower_bound(ofirst);
      if (it == snaps.end() || *it > p)
        continue;
      if (p >= floor)
        _project_rstat_inode_to_frag(old.inode, ofirst, p, parent, 0, false);
    }
  }
  cur->dirty_old_rstats.clear();
}
1857
1858
/*
 * Apply one inode version's rstat delta (rstat - accounted_rstat,
 * adjusted for link/unlink) to the parent dirfrag's fnode, splitting
 * the fnode's head / dirty_old_rstat segments as needed so each update
 * lands on a segment ending exactly at 'last'.
 */
void MDCache::_project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
                                           CDir *parent, int linkunlink, bool update_inode)
{
  dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
  dout(20) << " inode rstat " << inode.rstat << dendl;
  dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
  // compute the delta to apply: full diff for a plain update, otherwise
  // only the removed (unlink) or added (link) side.
  nest_info_t delta;
  if (linkunlink == 0) {
    delta.add(inode.rstat);
    delta.sub(inode.accounted_rstat);
  } else if (linkunlink < 0) {
    delta.sub(inode.accounted_rstat);
  } else {
    delta.add(inode.rstat);
  }
  dout(20) << " delta " << delta << dendl;

  if (update_inode)
    inode.accounted_rstat = inode.rstat;

  while (last >= ofirst) {
    /*
     * pick fnode version to update. at each iteration, we want to
     * pick a segment ending in 'last' to update. split as necessary
     * to make that work. then, adjust first up so that we only
     * update one segment at a time. then loop to cover the whole
     * [ofirst,last] interval.
     */
    nest_info_t *prstat;
    snapid_t first;
    fnode_t *pf = parent->get_projected_fnode();
    if (last == CEPH_NOSNAP) {
      // updating the live head fnode.
      if (g_conf()->mds_snap_rstat)
        first = std::max(ofirst, parent->first);
      else
        first = parent->first;
      prstat = &pf->rstat;
      dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;

      if (first > parent->first &&
          !(pf->rstat == pf->accounted_rstat)) {
        // head is snapped and not fully accounted: preserve the
        // pre-snap rstat state in dirty_old_rstat before advancing.
        dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
                 << parent->first << "," << (first-1) << "] "
                 << " " << *prstat << "/" << pf->accounted_rstat
                 << dendl;
        parent->dirty_old_rstat[first-1].first = parent->first;
        parent->dirty_old_rstat[first-1].rstat = pf->rstat;
        parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
      }
      parent->first = first;
    } else if (!g_conf()->mds_snap_rstat) {
      // drop snapshots' rstats
      break;
    } else if (last >= parent->first) {
      // split off a dirty_old_rstat segment ending at 'last' from the head.
      first = parent->first;
      parent->dirty_old_rstat[last].first = first;
      parent->dirty_old_rstat[last].rstat = pf->rstat;
      parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
      prstat = &parent->dirty_old_rstat[last].rstat;
      dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
               << " " << *prstat << "/" << pf->accounted_rstat << dendl;
    } else {
      // be careful, dirty_old_rstat is a _sparse_ map.
      // sorry, this is ugly.
      first = ofirst;

      // find any intersection with last
      auto it = parent->dirty_old_rstat.lower_bound(last);
      if (it == parent->dirty_old_rstat.end()) {
        dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
        if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
          dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
          first = parent->dirty_old_rstat.rbegin()->first+1;
        }
      } else {
        // *it last is >= last
        if (it->second.first <= last) {
          // *it intersects [first,last]
          if (it->second.first < first) {
            dout(10) << " splitting off left bit [" << it->second.first << "," << first-1 << "]" << dendl;
            parent->dirty_old_rstat[first-1] = it->second;
            it->second.first = first;
          }
          if (it->second.first > first)
            first = it->second.first;
          if (last < it->first) {
            dout(10) << " splitting off right bit [" << last+1 << "," << it->first << "]" << dendl;
            parent->dirty_old_rstat[last] = it->second;
            it->second.first = last+1;
          }
        } else {
          // *it is to the _right_ of [first,last]
          it = parent->dirty_old_rstat.lower_bound(first);
          // new *it last is >= first
          if (it->second.first <= last &&   // new *it isn't also to the right, and
              it->first >= first) {         // it intersects our first bit,
            dout(10) << " staying to the right of [" << it->second.first << "," << it->first << "]..." << dendl;
            first = it->first+1;
          }
          dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
        }
      }
      dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
      parent->dirty_old_rstat[last].first = first;
      prstat = &parent->dirty_old_rstat[last].rstat;
    }

    // apply the delta to the chosen segment.
    dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
    ceph_assert(last >= first);
    prstat->add(delta);
    if (update_inode)
      inode.accounted_rstat = inode.rstat;
    dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;

    // continue with the next-older segment.
    last = first-1;
  }
}
1977
/*
 * Propagate a dirfrag's unaccounted rstat delta (rstat - accounted_rstat)
 * up into its inode, over the snapid range [ofirst,last].  Walks the range
 * newest-to-oldest; each iteration applies the delta to one inode version
 * (the live projected inode, or an old_inode clone) and then steps left.
 *
 * @param rstat           the frag's current rstat
 * @param accounted_rstat what has already been folded into the inode
 * @param ofirst,last     snapid range to project over
 * @param pin             the inode receiving the delta
 * @param cow_head        passed through to cow_old_inode when cloning
 */
void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
                                          snapid_t ofirst, snapid_t last,
                                          CInode *pin, bool cow_head)
{
  dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
  dout(20) << " frag rstat " << rstat << dendl;
  dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
  nest_info_t delta = rstat;
  delta.sub(accounted_rstat);
  dout(20) << " delta " << delta << dendl;

  while (last >= ofirst) {
    CInode::mempool_inode *pi;
    snapid_t first;
    if (last == pin->last) {
      // head (live) inode version; clone an old_inode first if the range
      // doesn't reach all the way back to pin->first
      pi = pin->get_projected_inode();
      first = std::max(ofirst, pin->first);
      if (first > pin->first) {
        auto &old = pin->cow_old_inode(first-1, cow_head);
        dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
      }
    } else {
      if (last >= pin->first) {
        // range overlaps the head interval: snapshot it at 'last'
        first = pin->first;
        pin->cow_old_inode(last, cow_head);
      } else {
        // our life is easier here because old_inodes is not sparse
        // (although it may not begin at snapid 1)
        auto it = pin->old_inodes.lower_bound(last);
        if (it == pin->old_inodes.end()) {
          dout(10) << " no old_inode <= " << last << ", done." << dendl;
          break;
        }
        first = it->second.first;
        if (first > last) {
          dout(10) << " oldest old_inode is [" << first << "," << it->first << "], done." << dendl;
          //assert(p == pin->old_inodes.begin());
          break;
        }
        if (it->first > last) {
          // *it extends past 'last': split it so [first,last] stands alone
          dout(10) << " splitting right old_inode [" << first << "," << it->first << "] to ["
                   << (last+1) << "," << it->first << "]" << dendl;
          pin->old_inodes[last] = it->second;
          it->second.first = last+1;
          pin->dirty_old_rstats.insert(it->first);
        }
      }
      if (first < ofirst) {
        // *it extends before 'ofirst': split off the left bit
        dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
                 << first << "," << ofirst-1 << "]" << dendl;
        pin->old_inodes[ofirst-1] = pin->old_inodes[last];
        pin->dirty_old_rstats.insert(ofirst-1);
        pin->old_inodes[last].first = first = ofirst;
      }
      pi = &pin->old_inodes[last].inode;
      pin->dirty_old_rstats.insert(last);
    }
    dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
    pi->rstat.add(delta);
    dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;

    // step left to the next older interval
    last = first-1;
  }
}
2042
/*
 * Push the inode's current quota + rstat to clients holding caps on it, so
 * they can enforce quota locally.  Updates are throttled: a client is only
 * refreshed when its cached values have drifted far enough toward a limit.
 * Replica MDSs are asked (via MGatherCaps) to do the same for their clients.
 *
 * NOTE(review): when exclude_ct >= 0, the drift checks are bypassed and an
 * update is forced to every client whose id differs from exclude_ct — the
 * name suggests the opposite; confirm intent against callers.
 */
void MDCache::broadcast_quota_to_client(CInode *in, client_t exclude_ct, bool quota_change)
{
  if (!(mds->is_active() || mds->is_stopping()))
    return;

  if (!in->is_auth() || in->is_frozen())
    return;

  auto i = in->get_projected_inode();

  if (!i->quota.is_enable() &&
      !quota_change)
    return;

  // create snaprealm for quota inode (quota was set before mimic)
  if (!in->get_projected_srnode())
    mds->server->create_quota_realm(in);

  for (auto &p : in->client_caps) {
    Capability *cap = &p.second;
    if (cap->is_noquota())
      continue;

    // forced update for all clients other than exclude_ct (see note above)
    if (exclude_ct >= 0 && exclude_ct != p.first)
      goto update;

    // nothing changed since the last message to this client
    if (cap->last_rbytes == i->rstat.rbytes &&
        cap->last_rsize == i->rstat.rsize())
      continue;

    if (i->quota.max_files > 0) {
      if (i->rstat.rsize() >= i->quota.max_files)
        goto update;

      // update when the drift since the last report exceeds 1/16 of the
      // client's remaining headroom to the file-count limit
      if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
          abs(cap->last_rsize - i->rstat.rsize()))
        goto update;
    }

    if (i->quota.max_bytes > 0) {
      // within 1/8 of the byte limit: always update
      if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
        goto update;

      // same 1/16-of-headroom drift rule for bytes
      if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
          abs(cap->last_rbytes - i->rstat.rbytes))
        goto update;
    }

    continue;

update:
    // record what we sent so the drift checks above have a baseline
    cap->last_rsize = i->rstat.rsize();
    cap->last_rbytes = i->rstat.rbytes;

    auto msg = make_message<MClientQuota>();
    msg->ino = in->ino();
    msg->rstat = i->rstat;
    msg->quota = i->quota;
    mds->send_message_client_counted(msg, cap->get_session());
  }
  // ask each replica to gather its clients' caps so they learn the quota too
  for (const auto &it : in->get_replicas()) {
    auto msg = make_message<MGatherCaps>();
    msg->ino = in->ino();
    mds->send_message_mds(msg, it.first);
  }
}
2109
2110 /*
2111 * NOTE: we _have_ to delay the scatter if we are called during a
2112 * rejoin, because we can't twiddle locks between when the
2113 * rejoin_(weak|strong) is received and when we send the rejoin_ack.
2114 * normally, this isn't a problem: a recover mds doesn't twiddle locks
2115 * (no requests), and a survivor acks immediately. _except_ that
2116 * during rejoin_(weak|strong) processing, we may complete a lock
2117 * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
2118 * scatterlock state in that case or the lock states will get out of
2119 * sync between the auth and replica.
2120 *
2121 * the simple solution is to never do the scatter here. instead, put
2122 * the scatterlock on a list if it isn't already wrlockable. this is
2123 * probably the best plan anyway, since we avoid too many
2124 * scatters/locks under normal usage.
2125 */
2126 /*
2127 * some notes on dirlock/nestlock scatterlock semantics:
2128 *
2129 * the fragstat (dirlock) will never be updated without
2130 * dirlock+nestlock wrlock held by the caller.
2131 *
2132 * the rstat (nestlock) _may_ get updated without a wrlock when nested
2133 * data is pushed up the tree. this could be changed with some
2134 * restructuring here, but in its current form we ensure that the
2135 * fragstat+rstat _always_ reflect an accurate summation over the dir
2136 * frag, which is nice. and, we only need to track frags that need to
2137 * be nudged (and not inodes with pending rstat changes that need to
2138 * be pushed into the frag). a consequence of this is that the
2139 * accounted_rstat on scatterlock sync may not match our current
2140 * rstat. this is normal and expected.
2141 */
/*
 * Project dirty fragstat/rstat from 'in' up through its chain of parent
 * dirfrags and inodes, pre-dirtying and auth-pinning each one under 'mut',
 * and finally add the affected dirs/inodes to the journal blob.  The walk
 * stops early when a parent isn't auth, can't be pinned, or its nestlock /
 * versionlock can't be taken (in which case the scatterlock is marked for a
 * later nudge instead — see the long comment above about rejoin safety).
 *
 * @param mut        mutation that accumulates pins, locks and projections
 * @param blob       journal metablob to populate
 * @param in         inode whose parents are being predirtied
 * @param parent     starting dirfrag (nullptr: use in's projected parent)
 * @param flags      PREDIRTY_PRIMARY / PREDIRTY_DIR / PREDIRTY_SHALLOW
 * @param linkunlink +1/-1 when a link is being added/removed, else 0
 * @param cfollows   snapid the change follows (CEPH_NOSNAP for head)
 */
void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
                                       CInode *in, CDir *parent,
                                       int flags, int linkunlink,
                                       snapid_t cfollows)
{
  bool primary_dn = flags & PREDIRTY_PRIMARY;
  bool do_parent_mtime = flags & PREDIRTY_DIR;
  bool shallow = flags & PREDIRTY_SHALLOW;

  ceph_assert(mds->mdlog->entry_is_open());

  // make sure stamp is set
  if (mut->get_mds_stamp() == utime_t())
    mut->set_mds_stamp(ceph_clock_now());

  if (in->is_base())
    return;

  dout(10) << "predirty_journal_parents"
           << (do_parent_mtime ? " do_parent_mtime":"")
           << " linkunlink=" << linkunlink
           << (primary_dn ? " primary_dn":" remote_dn")
           << (shallow ? " SHALLOW":"")
           << " follows " << cfollows
           << " " << *in << dendl;

  if (!parent) {
    ceph_assert(primary_dn);
    parent = in->get_projected_parent_dn()->get_dir();
  }

  if (flags == 0 && linkunlink == 0) {
    dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
    blob->add_dir_context(parent);
    return;
  }

  // build list of inodes to wrlock, dirty, and update
  list<CInode*> lsi;
  CInode *cur = in;
  CDentry *parentdn = NULL;
  bool first = true;
  while (parent) {
    //assert(cur->is_auth() || !primary_dn); // this breaks the rename auth twiddle hack
    ceph_assert(parent->is_auth());

    // opportunistically adjust parent dirfrag
    CInode *pin = parent->get_inode();

    // inode -> dirfrag
    mut->auth_pin(parent);
    mut->add_projected_fnode(parent);

    fnode_t *pf = parent->project_fnode();
    pf->version = parent->pre_dirty();

    if (do_parent_mtime || linkunlink) {
      // caller must already hold the scatterlocks for a fragstat change
      ceph_assert(mut->is_wrlocked(&pin->filelock));
      ceph_assert(mut->is_wrlocked(&pin->nestlock));
      ceph_assert(cfollows == CEPH_NOSNAP);

      // update stale fragstat/rstat?
      parent->resync_accounted_fragstat();
      parent->resync_accounted_rstat();

      if (do_parent_mtime) {
        pf->fragstat.mtime = mut->get_op_stamp();
        pf->fragstat.change_attr++;
        dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl;
        if (pf->fragstat.mtime > pf->rstat.rctime) {
          dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl;
          pf->rstat.rctime = pf->fragstat.mtime;
        } else {
          dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl;
        }
      }
      if (linkunlink) {
        dout(10) << "predirty_journal_parents updating size on " << *parent << dendl;
        if (in->is_dir()) {
          pf->fragstat.nsubdirs += linkunlink;
          //pf->rstat.rsubdirs += linkunlink;
        } else {
          pf->fragstat.nfiles += linkunlink;
          //pf->rstat.rfiles += linkunlink;
        }
      }
    }

    // rstat
    if (!primary_dn) {
      // don't update parent this pass
    } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) &&
                                pin->versionlock.can_wrlock())) {
      // can't push the rstat up now; leave it marked dirty on the child
      dout(20) << " unwritable parent nestlock " << pin->nestlock
               << ", marking dirty rstat on " << *cur << dendl;
      cur->mark_dirty_rstat();
    } else {
      // if we don't hold a wrlock reference on this nestlock, take one,
      // because we are about to write into the dirfrag fnode and that needs
      // to commit before the lock can cycle.
      if (linkunlink) {
        ceph_assert(pin->nestlock.get_num_wrlocks() || mut->is_slave());
      }

      if (!mut->is_wrlocked(&pin->nestlock)) {
        dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl;
        mds->locker->wrlock_force(&pin->nestlock, mut);
      }

      // now we can project the inode rstat diff the dirfrag
      SnapRealm *prealm = pin->find_snaprealm();

      snapid_t follows = cfollows;
      if (follows == CEPH_NOSNAP)
        follows = prealm->get_newest_seq();

      snapid_t first = follows+1;

      // first, if the frag is stale, bring it back in sync.
      parent->resync_accounted_rstat();

      // now push inode rstats into frag
      project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm);
      cur->clear_dirty_rstat();
    }

    bool stop = false;
    if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) {
      dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl;
      stop = true;
    }

    // delay propagating until later?
    // (throttle: only on ancestors beyond the first hop)
    if (!stop && !first &&
        g_conf()->mds_dirstat_min_interval > 0) {
      double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop;
      if (since_last_prop < g_conf()->mds_dirstat_min_interval) {
        dout(10) << "predirty_journal_parents last prop " << since_last_prop
                 << " < " << g_conf()->mds_dirstat_min_interval
                 << ", stopping" << dendl;
        stop = true;
      } else {
        dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl;
      }
    }

    // can cast only because i'm passing nowait=true in the sole user
    if (!stop &&
        !mut->is_wrlocked(&pin->nestlock) &&
        (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too
         !mds->locker->wrlock_try(&pin->nestlock, mut)
         )) { // ** do not initiate.. see above comment **
      dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock
               << " on " << *pin << dendl;
      stop = true;
    }
    if (stop) {
      // mark the scatterlocks so the data gets nudged up later instead
      dout(10) << "predirty_journal_parents stop. marking nestlock on " << *pin << dendl;
      mds->locker->mark_updated_scatterlock(&pin->nestlock);
      mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest);
      mut->add_updated_lock(&pin->nestlock);
      if (do_parent_mtime || linkunlink) {
        mds->locker->mark_updated_scatterlock(&pin->filelock);
        mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir);
        mut->add_updated_lock(&pin->filelock);
      }
      break;
    }
    if (!mut->is_wrlocked(&pin->versionlock))
      mds->locker->local_wrlock_grab(&pin->versionlock, mut);

    ceph_assert(mut->is_wrlocked(&pin->nestlock) || mut->is_slave());

    pin->last_dirstat_prop = mut->get_mds_stamp();

    // dirfrag -> diri
    mut->auth_pin(pin);
    mut->add_projected_inode(pin);
    lsi.push_front(pin);

    pin->pre_cow_old_inode(); // avoid cow mayhem!

    auto &pi = pin->project_inode();
    pi.inode.version = pin->pre_dirty();

    // dirstat
    if (do_parent_mtime || linkunlink) {
      dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl;
      dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl;
      bool touched_mtime = false, touched_chattr = false;
      pi.inode.dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr);
      pf->accounted_fragstat = pf->fragstat;
      if (touched_mtime)
        pi.inode.mtime = pi.inode.ctime = pi.inode.dirstat.mtime;
      if (touched_chattr)
        pi.inode.change_attr = pi.inode.dirstat.change_attr;
      dout(20) << "predirty_journal_parents gives " << pi.inode.dirstat << " on " << *pin << dendl;

      if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
        if (pi.inode.dirstat.size() < 0)
          ceph_assert(!"negative dirstat size" == g_conf()->mds_verify_scatter);
        if (pi.inode.dirstat.size() != pf->fragstat.size()) {
          mds->clog->error() << "unmatched fragstat size on single dirfrag "
                             << parent->dirfrag() << ", inode has " << pi.inode.dirstat
                             << ", dirfrag has " << pf->fragstat;

          // trust the dirfrag for now
          pi.inode.dirstat = pf->fragstat;

          ceph_assert(!"unmatched fragstat size" == g_conf()->mds_verify_scatter);
        }
      }
    }

    /*
     * the rule here is to follow the _oldest_ parent with dirty rstat
     * data. if we don't propagate all data, we add ourselves to the
     * nudge list. that way all rstat data will (eventually) get
     * pushed up the tree.
     *
     * actually, no. for now, silently drop rstats for old parents. we need
     * hard link backpointers to do the above properly.
     */

    // stop?
    if (pin->is_base())
      break;
    parentdn = pin->get_projected_parent_dn();
    ceph_assert(parentdn);

    // rstat
    dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl;

    // first, if the frag is stale, bring it back in sync.
    parent->resync_accounted_rstat();

    if (g_conf()->mds_snap_rstat) {
      // per-snapshot rstats are being maintained: project each dirty old
      // interval into the inode before the head interval
      for (auto &p : parent->dirty_old_rstat) {
        project_rstat_frag_to_inode(p.second.rstat, p.second.accounted_rstat, p.second.first,
                                    p.first, pin, true);
      }
    }
    parent->dirty_old_rstat.clear();
    project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false);

    pf->accounted_rstat = pf->rstat;

    if (parent->get_frag() == frag_t()) { // i.e., we are the only frag
      if (pi.inode.rstat.rbytes != pf->rstat.rbytes) {
        mds->clog->error() << "unmatched rstat rbytes on single dirfrag "
                           << parent->dirfrag() << ", inode has " << pi.inode.rstat
                           << ", dirfrag has " << pf->rstat;

        // trust the dirfrag for now
        pi.inode.rstat = pf->rstat;

        ceph_assert(!"unmatched rstat rbytes" == g_conf()->mds_verify_scatter);
      }
    }

    parent->check_rstats();
    broadcast_quota_to_client(pin);
    // next parent!
    cur = pin;
    parent = parentdn->get_dir();
    linkunlink = 0;
    do_parent_mtime = false;
    primary_dn = true;
    first = false;
  }

  // now, stick it in the blob
  ceph_assert(parent);
  ceph_assert(parent->is_auth());
  blob->add_dir_context(parent);
  blob->add_dir(parent, true);
  for (const auto& in : lsi) {
    journal_dirty_inode(mut.get(), blob, in);
  }

}
2423
2424
2425
2426
2427
2428 // ===================================
2429 // slave requests
2430
2431
2432 /*
2433 * some handlers for master requests with slaves. we need to make
2434 * sure slaves journal commits before we forget we mastered them and
2435 * remove them from the uncommitted_masters map (used during recovery
2436 * to commit|abort slaves).
2437 */
// Log-completion context: once the ECommitted event for a master request is
// durable, notify the cache so the request can leave uncommitted_masters.
struct C_MDC_CommittedMaster : public MDCacheLogContext {
  metareqid_t reqid;  // id of the master request that was committed
  C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {}
  void finish(int r) override {
    mdcache->_logged_master_commit(reqid);
  }
};
2445
2446 void MDCache::log_master_commit(metareqid_t reqid)
2447 {
2448 dout(10) << "log_master_commit " << reqid << dendl;
2449 uncommitted_masters[reqid].committing = true;
2450 mds->mdlog->start_submit_entry(new ECommitted(reqid),
2451 new C_MDC_CommittedMaster(this, reqid));
2452 }
2453
2454 void MDCache::_logged_master_commit(metareqid_t reqid)
2455 {
2456 dout(10) << "_logged_master_commit " << reqid << dendl;
2457 ceph_assert(uncommitted_masters.count(reqid));
2458 uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid);
2459 mds->queue_waiters(uncommitted_masters[reqid].waiters);
2460 uncommitted_masters.erase(reqid);
2461 }
2462
2463 // while active...
2464
2465 void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from)
2466 {
2467 dout(10) << "committed_master_slave mds." << from << " on " << r << dendl;
2468 ceph_assert(uncommitted_masters.count(r));
2469 uncommitted_masters[r].slaves.erase(from);
2470 if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty())
2471 log_master_commit(r);
2472 }
2473
2474 void MDCache::logged_master_update(metareqid_t reqid)
2475 {
2476 dout(10) << "logged_master_update " << reqid << dendl;
2477 ceph_assert(uncommitted_masters.count(reqid));
2478 uncommitted_masters[reqid].safe = true;
2479 auto p = pending_masters.find(reqid);
2480 if (p != pending_masters.end()) {
2481 pending_masters.erase(p);
2482 if (pending_masters.empty())
2483 process_delayed_resolve();
2484 }
2485 }
2486
2487 /*
2488 * Master may crash after receiving all slaves' commit acks, but before journalling
2489 * the final commit. Slaves may crash after journalling the slave commit, but before
2490 * sending commit ack to the master. Commit masters with no uncommitted slave when
2491 * resolve finishes.
2492 */
2493 void MDCache::finish_committed_masters()
2494 {
2495 for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
2496 p != uncommitted_masters.end();
2497 ++p) {
2498 p->second.recovering = false;
2499 if (!p->second.committing && p->second.slaves.empty()) {
2500 dout(10) << "finish_committed_masters " << p->first << dendl;
2501 log_master_commit(p->first);
2502 }
2503 }
2504 }
2505
2506 /*
2507 * at end of resolve... we must journal a commit|abort for all slave
2508 * updates, before moving on.
2509 *
2510 * this is so that the master can safely journal ECommitted on ops it
2511 * masters when it reaches up:active (all other recovering nodes must
2512 * complete resolve before that happens).
2513 */
// Log-completion context: once our slave commit is durable, ack it back to
// the master so it can stop tracking us as an uncommitted slave.
struct C_MDC_SlaveCommit : public MDCacheLogContext {
  mds_rank_t from;     // the master mds rank to ack
  metareqid_t reqid;   // the request being committed
  C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {}
  void finish(int r) override {
    mdcache->_logged_slave_commit(from, reqid);
  }
};
2522
2523 void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid)
2524 {
2525 dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl;
2526
2527 // send a message
2528 auto req = make_message<MMDSSlaveRequest>(reqid, 0, MMDSSlaveRequest::OP_COMMITTED);
2529 mds->send_message_mds(req, from);
2530 }
2531
2532
2533
2534
2535
2536
2537 // ====================================================================
2538 // import map, recovery
2539
2540 void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
2541 map<dirfrag_t,vector<dirfrag_t> >& subtrees)
2542 {
2543 if (subtrees.count(oldparent)) {
2544 vector<dirfrag_t>& v = subtrees[oldparent];
2545 dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl;
2546 for (vector<dirfrag_t>::iterator it = v.begin(); it != v.end(); ++it)
2547 if (*it == df) {
2548 v.erase(it);
2549 break;
2550 }
2551 }
2552 if (subtrees.count(newparent)) {
2553 vector<dirfrag_t>& v = subtrees[newparent];
2554 dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl;
2555 v.push_back(df);
2556 }
2557 }
2558
/*
 * Build an ESubtreeMap journal event describing our auth subtrees, their
 * bounds, ambiguous (mid-migration) imports, and a spanning tree of dirs
 * tying it all to the root.  Projected renames are folded in, and the map
 * is simplified (non-ambiguous subtrees swallowed into their parents) so
 * that replay sees a canonical form.  Caller owns the returned event.
 */
ESubtreeMap *MDCache::create_subtree_map()
{
  dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, "
           << num_subtrees_fullauth() << " fullauth"
           << dendl;

  show_subtrees();

  ESubtreeMap *le = new ESubtreeMap();
  mds->mdlog->_start_entry(le);

  // every dir that must appear in the metablob's spanning tree
  map<dirfrag_t, CDir*> dirs_to_add;

  if (myin) {
    CDir* mydir = myin->get_dirfrag(frag_t());
    dirs_to_add[mydir->dirfrag()] = mydir;
  }

  // include all auth subtrees, and their bounds.
  // and a spanning tree to tie it to the root.
  for (auto& [dir, bounds] : subtrees) {
    // journal subtree as "ours" if we are
    // me, -2
    // me, me
    // me, !me (may be importing and ambiguous!)

    // so not
    // !me, *
    if (dir->get_dir_auth().first != mds->get_nodeid())
      continue;

    if (migrator->is_ambiguous_import(dir->dirfrag()) ||
        my_ambiguous_imports.count(dir->dirfrag())) {
      dout(15) << " ambig subtree " << *dir << dendl;
      le->ambiguous_subtrees.insert(dir->dirfrag());
    } else {
      dout(15) << " auth subtree " << *dir << dendl;
    }

    dirs_to_add[dir->dirfrag()] = dir;
    le->subtrees[dir->dirfrag()].clear();

    // bounds
    size_t nbounds = bounds.size();
    if (nbounds > 3) {
      dout(15) << " subtree has " << nbounds << " bounds" << dendl;
    }
    for (auto& bound : bounds) {
      if (nbounds <= 3) {
        dout(15) << " subtree bound " << *bound << dendl;
      }
      dirs_to_add[bound->dirfrag()] = bound;
      le->subtrees[dir->dirfrag()].push_back(bound->dirfrag());
    }
  }

  // apply projected renames
  for (const auto& [diri, renames] : projected_subtree_renames) {
    for (const auto& [olddir, newdir] : renames) {
      dout(15) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl;

      auto&& dfls = diri->get_dirfrags();
      for (const auto& dir : dfls) {
        dout(15) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl;
        CDir *oldparent = get_projected_subtree_root(olddir);
        dout(15) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl;
        CDir *newparent = get_projected_subtree_root(newdir);
        dout(15) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl;

        if (oldparent == newparent) {
          dout(15) << "parent unchanged for " << dir->dirfrag() << " at "
                   << oldparent->dirfrag() << dendl;
          continue;
        }

        if (dir->is_subtree_root()) {
          if (le->subtrees.count(newparent->dirfrag()) &&
              oldparent->get_dir_auth() != newparent->get_dir_auth())
            dirs_to_add[dir->dirfrag()] = dir;
          // children are fine. change parent.
          _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
                                  le->subtrees);
        } else {
          // mid-subtree.

          if (oldparent->get_dir_auth() != newparent->get_dir_auth()) {
            // auth changes across the rename: this dirfrag becomes a
            // subtree boundary in the journaled map
            dout(10) << " creating subtree for " << dir->dirfrag() << dendl;
            // if oldparent is auth, subtree is mine; include it.
            if (le->subtrees.count(oldparent->dirfrag())) {
              dirs_to_add[dir->dirfrag()] = dir;
              le->subtrees[dir->dirfrag()].clear();
            }
            // if newparent is auth, subtree is a new bound
            if (le->subtrees.count(newparent->dirfrag())) {
              dirs_to_add[dir->dirfrag()] = dir;
              le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound
            }
            newparent = dir;
          }

          // see if any old bounds move to the new parent.
          for (auto& bound : subtrees.at(oldparent)) {
            if (dir->contains(bound->get_parent_dir()))
              _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(),
                                      le->subtrees);
          }
        }
      }
    }
  }

  // simplify the journaled map. our in memory map may have more
  // subtrees than needed due to migrations that are just getting
  // started or just completing. but on replay, the "live" map will
  // be simple and we can do a straight comparison.
  for (auto& [frag, bfrags] : le->subtrees) {
    if (le->ambiguous_subtrees.count(frag))
      continue;
    unsigned i = 0;
    while (i < bfrags.size()) {
      dirfrag_t b = bfrags[i];
      if (le->subtrees.count(b) &&
          le->ambiguous_subtrees.count(b) == 0) {
        // b is itself a (non-ambiguous) subtree: merge its bounds into
        // ours and drop it from the map
        auto& bb = le->subtrees.at(b);
        dout(10) << "simplify: " << frag << " swallowing " << b << " with bounds " << bb << dendl;
        for (auto& r : bb) {
          bfrags.push_back(r);
        }
        dirs_to_add.erase(b);
        le->subtrees.erase(b);
        bfrags.erase(bfrags.begin() + i);
      } else {
        ++i;
      }
    }
  }

  for (auto &p : dirs_to_add) {
    CDir *dir = p.second;
    le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
    le->metablob.add_dir(dir, false);
  }

  dout(15) << " subtrees " << le->subtrees << dendl;
  dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;

  //le->metablob.print(cout);
  le->expire_pos = mds->mdlog->journaler->get_expire_pos();
  return le;
}
2709
2710 void MDCache::dump_resolve_status(Formatter *f) const
2711 {
2712 f->open_object_section("resolve_status");
2713 f->dump_stream("resolve_gather") << resolve_gather;
2714 f->dump_stream("resolve_ack_gather") << resolve_gather;
2715 f->close_section();
2716 }
2717
/*
 * Enter the resolve phase.  Stores the completion context, downgrades our
 * view of the root subtree's authority when we are not the root mds, and
 * snapshots the recovery set and journaled snap-table tids to gather against.
 *
 * @param resolve_done_ context fired when resolve completes (ownership taken)
 */
void MDCache::resolve_start(MDSContext *resolve_done_)
{
  dout(10) << "resolve_start" << dendl;
  ceph_assert(!resolve_done);
  resolve_done.reset(resolve_done_);

  if (mds->mdsmap->get_root() != mds->get_nodeid()) {
    // if we don't have the root dir, adjust it to UNKNOWN. during
    // resolve we want mds0 to explicitly claim the portion of it that
    // it owns, so that anything beyond its bounds gets left as
    // unknown.
    // NOTE(review): assumes the root inode ('root') is non-null here —
    // confirm against the resolve entry path.
    CDir *rootdir = root->get_dirfrag(frag_t());
    if (rootdir)
      adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
  }
  // must hear a resolve from every recovering peer
  resolve_gather = recovery_set;

  resolve_snapclient_commits = mds->snapclient->get_journaled_tids();
}
2737
/*
 * Kick off resolve messaging: always send slave resolves first, then send
 * subtree resolves only once (a) the snap cache is synced (survivor case),
 * (b) all slave-resolve acks are in, and (c) all rollbacks have committed.
 * Each early return leaves a callback/ack path that re-enters via
 * maybe_finish_slave_resolve() -> send_subtree_resolves().
 */
void MDCache::send_resolves()
{
  send_slave_resolves();

  if (!resolve_done) {
    // I'm survivor: refresh snap cache
    mds->snapclient->sync(
        new MDSInternalContextWrapper(mds,
          new LambdaContext([this](int r) {
            maybe_finish_slave_resolve();
            })
          )
        );
    dout(10) << "send_resolves waiting for snapclient cache to sync" << dendl;
    return;
  }
  if (!resolve_ack_gather.empty()) {
    dout(10) << "send_resolves still waiting for resolve ack from ("
             << resolve_ack_gather << ")" << dendl;
    return;
  }
  if (!resolve_need_rollback.empty()) {
    dout(10) << "send_resolves still waiting for rollback to commit on ("
             << resolve_need_rollback << ")" << dendl;
    return;
  }

  send_subtree_resolves();
}
2767
/*
 * Tell each master about our uncommitted slave requests.  If we are in
 * resolve ourselves, that list comes from the journal (uncommitted_slaves);
 * otherwise we scan active requests for prepared/committing slave ops whose
 * master is resolving (or whose update is ambiguous).  Every rank we message
 * is added to resolve_ack_gather.
 */
void MDCache::send_slave_resolves()
{
  dout(10) << "send_slave_resolves" << dendl;

  map<mds_rank_t, ref_t<MMDSResolve>> resolves;

  if (mds->is_resolve()) {
    // recovering: report every journaled-but-uncommitted slave update
    for (map<metareqid_t, uslave>::iterator p = uncommitted_slaves.begin();
         p != uncommitted_slaves.end();
         ++p) {
      mds_rank_t master = p->second.master;
      auto &m = resolves[master];
      if (!m) m = make_message<MMDSResolve>();
      m->add_slave_request(p->first, false);
    }
  } else {
    // survivor: only report slave ops whose master is resolving
    set<mds_rank_t> resolve_set;
    mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
         p != active_requests.end();
         ++p) {
      MDRequestRef& mdr = p->second;
      if (!mdr->is_slave())
        continue;
      if (!mdr->slave_did_prepare() && !mdr->committing) {
        continue;
      }
      mds_rank_t master = mdr->slave_to_mds;
      if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
        dout(10) << " including uncommitted " << *mdr << dendl;
        if (!resolves.count(master))
          resolves[master] = make_message<MMDSResolve>();
        if (!mdr->committing &&
            mdr->has_more() && mdr->more()->is_inode_exporter) {
          // re-send cap exports
          CInode *in = mdr->more()->rename_inode;
          map<client_t, Capability::Export> cap_map;
          in->export_client_caps(cap_map);
          bufferlist bl;
          MMDSResolve::slave_inode_cap inode_caps(in->ino(), cap_map);
          encode(inode_caps, bl);
          resolves[master]->add_slave_request(p->first, bl);
        } else {
          resolves[master]->add_slave_request(p->first, mdr->committing);
        }
      }
    }
  }

  for (auto &p : resolves) {
    dout(10) << "sending slave resolve to mds." << p.first << dendl;
    mds->send_message_mds(p.second, p.first);
    // expect a resolve-ack back from this master
    resolve_ack_gather.insert(p.first);
  }
}
2823
/*
 * Send our subtree claims (and ambiguous imports) to every recovering peer.
 * Deferred while migrations are in flight (resolves_pending is set and the
 * migrator will retrigger us).  Claims are simplified so nested claimed
 * subtrees are swallowed by their parents before sending.
 */
void MDCache::send_subtree_resolves()
{
  dout(10) << "send_subtree_resolves" << dendl;

  if (migrator->is_exporting() || migrator->is_importing()) {
    dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
    migrator->show_importing();
    migrator->show_exporting();
    resolves_pending = true;
    return; // not now
  }

  map<mds_rank_t, ref_t<MMDSResolve>> resolves;
  for (set<mds_rank_t>::iterator p = recovery_set.begin();
       p != recovery_set.end();
       ++p) {
    if (*p == mds->get_nodeid())
      continue;
    if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
      resolves[*p] = make_message<MMDSResolve>();
  }

  map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
  map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;

  // known
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;

    // only our subtrees
    if (dir->authority().first != mds->get_nodeid())
      continue;

    if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
      continue; // we'll add it below

    if (migrator->is_ambiguous_import(dir->dirfrag())) {
      // ambiguous (mid-import)
      set<CDir*> bounds;
      get_subtree_bounds(dir, bounds);
      vector<dirfrag_t> dfls;
      for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
        dfls.push_back((*q)->dirfrag());

      my_ambig_imports[dir->dirfrag()] = dfls;
      dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
    } else {
      // not ambiguous.
      for (auto &q : resolves) {
        resolves[q.first]->add_subtree(dir->dirfrag());
      }
      // bounds too
      vector<dirfrag_t> dfls;
      for (set<CDir*>::iterator q = subtrees[dir].begin();
           q != subtrees[dir].end();
           ++q) {
        CDir *bound = *q;
        dfls.push_back(bound->dirfrag());
      }

      my_subtrees[dir->dirfrag()] = dfls;
      dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
    }
  }

  // ambiguous
  for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
       p != my_ambiguous_imports.end();
       ++p) {
    my_ambig_imports[p->first] = p->second;
    dout(10) << " ambig " << p->first << " " << p->second << dendl;
  }

  // simplify the claimed subtree.
  // (a claimed subtree that bounds another of our claims absorbs it)
  for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
    unsigned i = 0;
    while (i < p->second.size()) {
      dirfrag_t b = p->second[i];
      if (my_subtrees.count(b)) {
        vector<dirfrag_t>& bb = my_subtrees[b];
        dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
        for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
          p->second.push_back(*r);
        my_subtrees.erase(b);
        p->second.erase(p->second.begin() + i);
      } else {
        ++i;
      }
    }
  }

  // send
  for (auto &p : resolves) {
    const ref_t<MMDSResolve> &m = p.second;
    if (mds->is_resolve()) {
      m->add_table_commits(TABLE_SNAP, resolve_snapclient_commits);
    } else {
      m->add_table_commits(TABLE_SNAP, mds->snapclient->get_journaled_tids());
    }
    m->subtrees = my_subtrees;
    m->ambiguous_imports = my_ambig_imports;
    // NOTE(review): "subtee" typo below is in the emitted log text; left
    // untouched here since fixing it would change log output.
    dout(10) << "sending subtee resolve to mds." << p.first << dendl;
    mds->send_message_mds(m, p.first);
  }
  resolves_pending = false;
}
2932
2933 void MDCache::maybe_finish_slave_resolve() {
2934 if (resolve_ack_gather.empty() && resolve_need_rollback.empty()) {
2935 // snap cache get synced or I'm in resolve state
2936 if (mds->snapclient->is_synced() || resolve_done)
2937 send_subtree_resolves();
2938 process_delayed_resolve();
2939 }
2940 }
2941
// React to the failure of rank `who`: re-arm the resolve/rejoin gathers for
// it, drop per-peer state, clean up master/slave request state tied to the
// failed rank, and cancel or complete any dirfrag fragmentation it was
// involved in.  Statement order matters; the request scan below both mutates
// request state and collects requests to finish afterwards.
void MDCache::handle_mds_failure(mds_rank_t who)
{
  dout(7) << "handle_mds_failure mds." << who << dendl;

  dout(1) << "handle_mds_failure mds." << who << " : recovery peers are " << recovery_set << dendl;

  // we must gather a fresh resolve from the failed rank once it restarts
  resolve_gather.insert(who);
  discard_delayed_resolve(who);
  ambiguous_slave_updates.erase(who);

  rejoin_gather.insert(who);
  rejoin_sent.erase(who);        // i need to send another
  rejoin_ack_sent.erase(who);    // i need to send another
  rejoin_ack_gather.erase(who);  // i'll need/get another.

  dout(10) << " resolve_gather " << resolve_gather << dendl;
  dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl;
  dout(10) << " rejoin_sent " << rejoin_sent << dendl;
  dout(10) << " rejoin_gather " << rejoin_gather << dendl;
  dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl;


  // tell the migrator too.
  migrator->handle_mds_failure_or_stop(who);

  // tell the balancer too.
  mds->balancer->handle_mds_failure(who);

  // clean up any requests slave to/from this node
  list<MDRequestRef> finish;
  for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
       p != active_requests.end();
       ++p) {
    MDRequestRef& mdr = p->second;
    // slave to the failed node?
    if (mdr->slave_to_mds == who) {
      if (mdr->slave_did_prepare()) {
        // prepare was journaled; the resolve protocol will decide commit/abort
        dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
        if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds))
          remove_ambiguous_slave_update(p->first, mdr->slave_to_mds);

        if (!mdr->more()->waiting_on_slave.empty()) {
          ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
          // will rollback, no need to wait
          mdr->reset_slave_request();
          mdr->more()->waiting_on_slave.clear();
        }
      } else if (!mdr->committing) {
        dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl;
        if (mdr->slave_request || mdr->slave_rolling_back())
          mdr->aborted = true;        // can't finish now; abort when safe
        else
          finish.push_back(mdr);      // finish after the scan completes
      }
    }

    if (mdr->is_slave() && mdr->slave_did_prepare()) {
      if (mdr->more()->waiting_on_slave.count(who)) {
        ceph_assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid());
        dout(10) << " slave request " << *mdr << " no longer need rename notity ack from mds."
                 << who << dendl;
        mdr->more()->waiting_on_slave.erase(who);
        if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request)
          mds->queue_waiter(new C_MDS_RetryRequest(this, mdr));
      }

      if (mdr->more()->srcdn_auth_mds == who &&
          mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) {
        // rename srcdn's auth mds failed, resolve even I'm a survivor.
        dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl;
        add_ambiguous_slave_update(p->first, mdr->slave_to_mds);
      }
    } else if (mdr->slave_request) {
      const cref_t<MMDSSlaveRequest> &slave_req = mdr->slave_request;
      // FIXME: Slave rename request can arrive after we notice mds failure.
      // This can cause mds to crash (does not affect integrity of FS).
      if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP &&
          slave_req->srcdn_auth == who)
        slave_req->mark_interrupted();
    }

    // failed node is slave?
    if (mdr->is_master() && !mdr->committing) {
      if (mdr->more()->srcdn_auth_mds == who) {
        dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
                 << who << " to recover" << dendl;
        ceph_assert(mdr->more()->witnessed.count(who) == 0);
        if (mdr->more()->is_ambiguous_auth)
          mdr->clear_ambiguous_auth();
        // rename srcdn's auth mds failed, all witnesses will rollback
        mdr->more()->witnessed.clear();
        pending_masters.erase(p->first);
      }

      if (mdr->more()->witnessed.count(who)) {
        mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds;
        if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) {
          dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds."
                   << mdr->more()->srcdn_auth_mds << " to reply" << dendl;
          // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack
          // until either the request is committing or the slave also fails.
          ceph_assert(mdr->more()->waiting_on_slave.size() == 1);
          pending_masters.insert(p->first);
        } else {
          dout(10) << " master request " << *mdr << " no longer witnessed by slave mds."
                   << who << " to recover" << dendl;
          if (srcdn_auth >= 0)
            ceph_assert(mdr->more()->witnessed.count(srcdn_auth) == 0);

          // discard this peer's prepare (if any)
          mdr->more()->witnessed.erase(who);
        }
      }

      if (mdr->more()->waiting_on_slave.count(who)) {
        dout(10) << " master request " << *mdr << " waiting for slave mds." << who
                 << " to recover" << dendl;
        // retry request when peer recovers
        mdr->more()->waiting_on_slave.erase(who);
        if (mdr->more()->waiting_on_slave.empty())
          mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr));
      }

      if (mdr->locking && mdr->locking_target_mds == who)
        mdr->finish_locking(mdr->locking);
    }
  }

  for (map<metareqid_t, umaster>::iterator p = uncommitted_masters.begin();
       p != uncommitted_masters.end();
       ++p) {
    // The failed MDS may have already committed the slave update
    if (p->second.slaves.count(who)) {
      p->second.recovering = true;
      p->second.slaves.erase(who);
    }
  }

  // finish requests collected above (outside the active_requests iteration,
  // since request_finish() mutates that map)
  while (!finish.empty()) {
    dout(10) << "cleaning up slave request " << *finish.front() << dendl;
    request_finish(finish.front());
    finish.pop_front();
  }

  kick_find_ino_peers(who);
  kick_open_ino_peers(who);

  // in-flight fragment operations: either drop `who` from the ack waiters,
  // or (if not yet fragmenting) cancel the operation entirely
  for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
       p != fragments.end(); ) {
    dirfrag_t df = p->first;
    fragment_info_t& info = p->second;

    if (info.is_fragmenting()) {
      if (info.notify_ack_waiting.erase(who) &&
          info.notify_ack_waiting.empty()) {
        fragment_drop_locks(info);
        fragment_maybe_finish(p++);  // may erase *p; advance first
      } else {
        ++p;
      }
      continue;
    }

    // advance before fragments.erase(df) invalidates p
    ++p;
    dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl;
    std::vector<CDir*> dirs;
    info.dirs.swap(dirs);
    fragments.erase(df);
    fragment_unmark_unfreeze_dirs(dirs);
  }

  // MDCache::shutdown_export_strays() always exports strays to mds.0
  if (who == mds_rank_t(0))
    shutdown_exporting_strays.clear();

  show_subtrees();
}
3119
3120 /*
3121 * handle_mds_recovery - called on another node's transition
3122 * from resolve -> active.
3123 */
3124 void MDCache::handle_mds_recovery(mds_rank_t who)
3125 {
3126 dout(7) << "handle_mds_recovery mds." << who << dendl;
3127
3128 // exclude all discover waiters. kick_discovers() will do the job
3129 static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
3130 static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
3131
3132 MDSContext::vec waiters;
3133
3134 // wake up any waiters in their subtrees
3135 for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
3136 p != subtrees.end();
3137 ++p) {
3138 CDir *dir = p->first;
3139
3140 if (dir->authority().first != who ||
3141 dir->authority().second == mds->get_nodeid())
3142 continue;
3143 ceph_assert(!dir->is_auth());
3144
3145 // wake any waiters
3146 std::queue<CDir*> q;
3147 q.push(dir);
3148
3149 while (!q.empty()) {
3150 CDir *d = q.front();
3151 q.pop();
3152 d->take_waiting(d_mask, waiters);
3153
3154 // inode waiters too
3155 for (auto &p : d->items) {
3156 CDentry *dn = p.second;
3157 CDentry::linkage_t *dnl = dn->get_linkage();
3158 if (dnl->is_primary()) {
3159 dnl->get_inode()->take_waiting(i_mask, waiters);
3160
3161 // recurse?
3162 auto&& ls = dnl->get_inode()->get_dirfrags();
3163 for (const auto& subdir : ls) {
3164 if (!subdir->is_subtree_root())
3165 q.push(subdir);
3166 }
3167 }
3168 }
3169 }
3170 }
3171
3172 kick_open_ino_peers(who);
3173 kick_find_ino_peers(who);
3174
3175 // queue them up.
3176 mds->queue_waiters(waiters);
3177 }
3178
3179 void MDCache::set_recovery_set(set<mds_rank_t>& s)
3180 {
3181 dout(7) << "set_recovery_set " << s << dendl;
3182 recovery_set = s;
3183 }
3184
3185
/*
 * During resolve state, we share resolves to determine who
 * is authoritative for which trees.  We expect to get a resolve
 * from _everyone_ in the recovery_set (the mds cluster at the time of
 * the first failure).
 *
 * This function may stash the passed message (in delayed_resolve) for
 * later processing before returning.
 */
// Process an MMDSResolve from peer `from`.  Three distinct message flavors
// are handled here, in order:
//   1. slave-request resolves: answer with an MMDSResolveAck (commit/abort)
//      and return;
//   2. subtree resolves while we still owe resolve acks / rollbacks: stash
//      the message in delayed_resolve and return;
//   3. subtree resolves proper: settle our own ambiguous imports against the
//      sender's claims, adopt the sender's subtree auth, and note its
//      ambiguous imports and pending table commits.
void MDCache::handle_resolve(const cref_t<MMDSResolve> &m)
{
  dout(7) << "handle_resolve from " << m->get_source() << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (mds->get_state() < MDSMap::STATE_RESOLVE) {
    if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
      // we will reach resolve shortly; retry the message then
      mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
      return;
    }
    // wait until we reach the resolve stage!
    return;
  }

  // a fresh resolve from `from` supersedes any stashed one
  discard_delayed_resolve(from);

  // ambiguous slave requests?
  if (!m->slave_requests.empty()) {
    if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
      // survivor: any still-unsafe uncommitted master update blocks us from
      // answering; stash the message until those updates are safe
      for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) {
        if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) {
          ceph_assert(!p->second.committing);
          pending_masters.insert(p->first);
        }
      }

      if (!pending_masters.empty()) {
        dout(10) << " still have pending updates, delay processing slave resolve" << dendl;
        delayed_resolve[from] = m;
        return;
      }
    }

    // tell the sender which of its prepared slave updates to commit or abort
    auto ack = make_message<MMDSResolveAck>();
    for (const auto &p : m->slave_requests) {
      if (uncommitted_masters.count(p.first)) {  //mds->sessionmap.have_completed_request(p.first)) {
        // COMMIT
        if (p.second.committing) {
          // already committing, waiting for the OP_COMMITTED slave reply
          dout(10) << " already committing slave request " << p << " noop "<< dendl;
        } else {
          dout(10) << " ambiguous slave request " << p << " will COMMIT" << dendl;
          ack->add_commit(p.first);
        }
        uncommitted_masters[p.first].slaves.insert(from);   // wait for slave OP_COMMITTED before we log ECommitted

        if (p.second.inode_caps.length() > 0) {
          // slave wants to export caps (rename)
          ceph_assert(mds->is_resolve());
          MMDSResolve::slave_inode_cap inode_caps;
          auto q = p.second.inode_caps.cbegin();
          decode(inode_caps, q);
          inodeno_t ino = inode_caps.ino;
          map<client_t,Capability::Export> cap_exports = inode_caps.cap_exports;
          ceph_assert(get_inode(ino));

          // record each client's cap as imported, assigning fresh cap ids
          // (note: this inner `q` shadows the bufferlist iterator above)
          for (map<client_t,Capability::Export>::iterator q = cap_exports.begin();
               q != cap_exports.end();
               ++q) {
            Capability::Import& im = rejoin_imported_caps[from][ino][q->first];
            im.cap_id = ++last_cap_id; // assign a new cap ID
            im.issue_seq = 1;
            im.mseq = q->second.mseq;

            Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
            if (session)
              rejoin_client_map.emplace(q->first, session->info.inst);
          }

          // will process these caps in rejoin stage
          rejoin_slave_exports[ino].first = from;
          rejoin_slave_exports[ino].second.swap(cap_exports);

          // send information of imported caps back to slave
          encode(rejoin_imported_caps[from][ino], ack->commit[p.first]);
        }
      } else {
        // ABORT
        dout(10) << " ambiguous slave request " << p << " will ABORT" << dendl;
        ceph_assert(!p.second.committing);
        ack->add_abort(p.first);
      }
    }
    mds->send_message(ack, m->get_connection());
    return;
  }

  // subtree resolves must wait until our own ack/rollback work is done
  if (!resolve_ack_gather.empty() || !resolve_need_rollback.empty()) {
    dout(10) << "delay processing subtree resolve" << dendl;
    delayed_resolve[from] = m;
    return;
  }

  bool survivor = false;
  // am i a surviving ambiguous importer?
  if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
    survivor = true;
    // check for any import success/failure (from this node)
    map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
    while (p != my_ambiguous_imports.end()) {
      // save the successor: the body may erase *p
      map<dirfrag_t, vector<dirfrag_t> >::iterator next = p;
      ++next;
      CDir *dir = get_dirfrag(p->first);
      ceph_assert(dir);
      dout(10) << "checking ambiguous import " << *dir << dendl;
      if (migrator->is_importing(dir->dirfrag()) &&
          migrator->get_import_peer(dir->dirfrag()) == from) {
        ceph_assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING);

        // check if sender claims the subtree
        bool claimed_by_sender = false;
        for (const auto &q : m->subtrees) {
          // an ambiguous import won't race with a refragmentation; it's appropriate to force here.
          CDir *base = get_force_dirfrag(q.first, false);
          if (!base || !base->contains(dir))
            continue;  // base not dir or an ancestor of dir, clearly doesn't claim dir.

          bool inside = true;
          set<CDir*> bounds;
          get_force_dirfrag_bound_set(q.second, bounds);
          // (this inner `p` shadows the outer ambiguous-import iterator)
          for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
            CDir *bound = *p;
            if (bound->contains(dir)) {
              inside = false;  // nope, bound is dir or parent of dir, not inside.
              break;
            }
          }
          if (inside)
            claimed_by_sender = true;
        }

        my_ambiguous_imports.erase(p);  // no longer ambiguous.
        if (claimed_by_sender) {
          // sender kept the subtree, so our mid-flight import failed
          dout(7) << "ambiguous import failed on " << *dir << dendl;
          migrator->import_reverse(dir);
        } else {
          dout(7) << "ambiguous import succeeded on " << *dir << dendl;
          migrator->import_finish(dir, true);
        }
      }
      p = next;
    }
  }

  // update my dir_auth values
  //   need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
  //   migrations between other nodes)
  for (const auto& p : m->subtrees) {
    dout(10) << "peer claims " << p.first << " bounds " << p.second << dendl;
    // only fetch/force missing dirfrags while we are not yet a survivor
    CDir *dir = get_force_dirfrag(p.first, !survivor);
    if (!dir)
      continue;
    adjust_bounded_subtree_auth(dir, p.second, from);
    try_subtree_merge(dir);
  }

  show_subtrees();

  // note ambiguous imports too
  for (const auto& p : m->ambiguous_imports) {
    dout(10) << "noting ambiguous import on " << p.first << " bounds " << p.second << dendl;
    other_ambiguous_imports[from][p.first] = p.second;
  }

  // learn other mds' pending snaptable commits. later when resolve finishes, we will reload
  // snaptable cache from snapserver. By this way, snaptable cache get synced among all mds
  for (const auto& p : m->table_clients) {
    dout(10) << " noting " << get_mdstable_name(p.type)
             << " pending_commits " << p.pending_commits << dendl;
    MDSTableClient *client = mds->get_table_client(p.type);
    for (const auto& q : p.pending_commits)
      client->notify_commit(q);
  }

  // did i get them all?
  resolve_gather.erase(from);

  maybe_resolve_finish();
}
3373
3374 void MDCache::process_delayed_resolve()
3375 {
3376 dout(10) << "process_delayed_resolve" << dendl;
3377 map<mds_rank_t, cref_t<MMDSResolve>> tmp;
3378 tmp.swap(delayed_resolve);
3379 for (auto &p : tmp) {
3380 handle_resolve(p.second);
3381 }
3382 }
3383
3384 void MDCache::discard_delayed_resolve(mds_rank_t who)
3385 {
3386 delayed_resolve.erase(who);
3387 }
3388
// If resolves have been gathered from every peer in resolve_gather, finish
// the resolve stage: disambiguate imports, finish committed masters, and
// either complete the resolve_done context (recovering rank) or kick the
// pending rejoins (survivor).
void MDCache::maybe_resolve_finish()
{
  // callers guarantee no acks or rollbacks remain outstanding
  ceph_assert(resolve_ack_gather.empty());
  ceph_assert(resolve_need_rollback.empty());

  if (!resolve_gather.empty()) {
    dout(10) << "maybe_resolve_finish still waiting for resolves ("
             << resolve_gather << ")" << dendl;
    return;
  }

  dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done." << dendl;
  disambiguate_my_imports();
  finish_committed_masters();

  if (resolve_done) {
    ceph_assert(mds->is_resolve());
    trim_unlinked_inodes();
    recalc_auth_bits(false);
    // release() hands ownership of the context to complete(), which is
    // expected to delete it; resolve_done is left empty afterwards
    resolve_done.release()->complete(0);
  } else {
    // I am survivor.
    maybe_send_pending_rejoins();
  }
}
3414
// Process the MMDSResolveAck answering our slave-request resolve: journal a
// commit (or perform a rollback) for each prepared slave update, depending
// on the master's verdict.  Only counts the ack as gathered once no
// ambiguous slave updates remain for the sender.
void MDCache::handle_resolve_ack(const cref_t<MMDSResolveAck> &ack)
{
  dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // ignore stale acks: sender not in our gather set, or it failed again
  if (!resolve_ack_gather.count(from) ||
      mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
    return;
  }

  if (ambiguous_slave_updates.count(from)) {
    // ambiguous updates only exist between two surviving (non-recovering) ranks
    ceph_assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
    ceph_assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
  }

  for (const auto &p : ack->commit) {
    dout(10) << " commit on slave " << p.first << dendl;

    if (ambiguous_slave_updates.count(from)) {
      // survivor path: just clear the ambiguity marker
      remove_ambiguous_slave_update(p.first, from);
      continue;
    }

    if (mds->is_resolve()) {
      // replay
      MDSlaveUpdate *su = get_uncommitted_slave(p.first, from);
      ceph_assert(su);

      // log commit
      mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p.first, from,
                                                      ESlaveUpdate::OP_COMMIT, su->origop),
                                     new C_MDC_SlaveCommit(this, from, p.first));
      mds->mdlog->flush();

      finish_uncommitted_slave(p.first);
    } else {
      MDRequestRef mdr = request_get(p.first);
      // information about master imported caps
      if (p.second.length() > 0)
        mdr->more()->inode_import.share(p.second);

      ceph_assert(mdr->slave_request == 0);  // shouldn't be doing anything!
      request_finish(mdr);
    }
  }

  for (const auto &metareq : ack->abort) {
    dout(10) << " abort on slave " << metareq << dendl;

    if (mds->is_resolve()) {
      MDSlaveUpdate *su = get_uncommitted_slave(metareq, from);
      ceph_assert(su);

      // perform rollback (and journal a rollback entry)
      // note: this will hold up the resolve a bit, until the rollback entries journal.
      MDRequestRef null_ref;
      switch (su->origop) {
      case ESlaveUpdate::LINK:
        mds->server->do_link_rollback(su->rollback, from, null_ref);
        break;
      case ESlaveUpdate::RENAME:
        mds->server->do_rename_rollback(su->rollback, from, null_ref);
        break;
      case ESlaveUpdate::RMDIR:
        mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
        break;
      default:
        ceph_abort();
      }
    } else {
      MDRequestRef mdr = request_get(metareq);
      mdr->aborted = true;
      if (mdr->slave_request) {
        if (mdr->slave_did_prepare())  // journaling slave prepare ?
          add_rollback(metareq, from);
      } else {
        request_finish(mdr);
      }
    }
  }

  // only consider this peer fully acked once nothing ambiguous remains
  if (!ambiguous_slave_updates.count(from)) {
    resolve_ack_gather.erase(from);
    maybe_finish_slave_resolve();
  }
}
3501
3502 void MDCache::add_uncommitted_slave(metareqid_t reqid, LogSegment *ls, mds_rank_t master, MDSlaveUpdate *su)
3503 {
3504 auto const &ret = uncommitted_slaves.emplace(std::piecewise_construct,
3505 std::forward_as_tuple(reqid),
3506 std::forward_as_tuple());
3507 ceph_assert(ret.second);
3508 ls->uncommitted_slaves.insert(reqid);
3509 uslave &u = ret.first->second;
3510 u.master = master;
3511 u.ls = ls;
3512 u.su = su;
3513 if (su == nullptr) {
3514 return;
3515 }
3516 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p)
3517 uncommitted_slave_rename_olddir[*p]++;
3518 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p)
3519 uncommitted_slave_unlink[*p]++;
3520 }
3521
3522 void MDCache::finish_uncommitted_slave(metareqid_t reqid, bool assert_exist)
3523 {
3524 auto it = uncommitted_slaves.find(reqid);
3525 if (it == uncommitted_slaves.end()) {
3526 ceph_assert(!assert_exist);
3527 return;
3528 }
3529 uslave &u = it->second;
3530 MDSlaveUpdate* su = u.su;
3531
3532 if (!u.waiters.empty()) {
3533 mds->queue_waiters(u.waiters);
3534 }
3535 u.ls->uncommitted_slaves.erase(reqid);
3536 uncommitted_slaves.erase(it);
3537
3538 if (su == nullptr) {
3539 return;
3540 }
3541 // discard the non-auth subtree we renamed out of
3542 for(set<CInode*>::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) {
3543 CInode *diri = *p;
3544 map<CInode*, int>::iterator it = uncommitted_slave_rename_olddir.find(diri);
3545 ceph_assert(it != uncommitted_slave_rename_olddir.end());
3546 it->second--;
3547 if (it->second == 0) {
3548 uncommitted_slave_rename_olddir.erase(it);
3549 auto&& ls = diri->get_dirfrags();
3550 for (const auto& dir : ls) {
3551 CDir *root = get_subtree_root(dir);
3552 if (root->get_dir_auth() == CDIR_AUTH_UNDEF) {
3553 try_trim_non_auth_subtree(root);
3554 if (dir != root)
3555 break;
3556 }
3557 }
3558 } else
3559 ceph_assert(it->second > 0);
3560 }
3561 // removed the inodes that were unlinked by slave update
3562 for(set<CInode*>::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) {
3563 CInode *in = *p;
3564 map<CInode*, int>::iterator it = uncommitted_slave_unlink.find(in);
3565 ceph_assert(it != uncommitted_slave_unlink.end());
3566 it->second--;
3567 if (it->second == 0) {
3568 uncommitted_slave_unlink.erase(it);
3569 if (!in->get_projected_parent_dn())
3570 mds->mdcache->remove_inode_recursive(in);
3571 } else
3572 ceph_assert(it->second > 0);
3573 }
3574 delete su;
3575 }
3576
3577 MDSlaveUpdate* MDCache::get_uncommitted_slave(metareqid_t reqid, mds_rank_t master)
3578 {
3579
3580 MDSlaveUpdate* su = nullptr;
3581 auto it = uncommitted_slaves.find(reqid);
3582 if (it != uncommitted_slaves.end() &&
3583 it->second.master == master) {
3584 su = it->second.su;
3585 }
3586 return su;
3587 }
3588
3589 void MDCache::finish_rollback(metareqid_t reqid, MDRequestRef& mdr) {
3590 auto p = resolve_need_rollback.find(mdr->reqid);
3591 ceph_assert(p != resolve_need_rollback.end());
3592 if (mds->is_resolve()) {
3593 finish_uncommitted_slave(reqid, false);
3594 } else if (mdr) {
3595 finish_uncommitted_slave(mdr->reqid, mdr->more()->slave_update_journaled);
3596 }
3597 resolve_need_rollback.erase(p);
3598 maybe_finish_slave_resolve();
3599 }
3600
3601 void MDCache::disambiguate_other_imports()
3602 {
3603 dout(10) << "disambiguate_other_imports" << dendl;
3604
3605 bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
3606 // other nodes' ambiguous imports
3607 for (map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > >::iterator p = other_ambiguous_imports.begin();
3608 p != other_ambiguous_imports.end();
3609 ++p) {
3610 mds_rank_t who = p->first;
3611 dout(10) << "ambiguous imports for mds." << who << dendl;
3612
3613 for (map<dirfrag_t, vector<dirfrag_t> >::iterator q = p->second.begin();
3614 q != p->second.end();
3615 ++q) {
3616 dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl;
3617 // an ambiguous import will not race with a refragmentation; it's appropriate to force here.
3618 CDir *dir = get_force_dirfrag(q->first, recovering);
3619 if (!dir) continue;
3620
3621 if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander
3622 dir->authority() == CDIR_AUTH_UNDEF) { // resolving
3623 dout(10) << " mds." << who << " did import " << *dir << dendl;
3624 adjust_bounded_subtree_auth(dir, q->second, who);
3625 try_subtree_merge(dir);
3626 } else {
3627 dout(10) << " mds." << who << " did not import " << *dir << dendl;
3628 }
3629 }
3630 }
3631 other_ambiguous_imports.clear();
3632 }
3633
// During resolve, settle every import of ours that was mid-flight when the
// failure happened: if the subtree's auth is already known (another rank
// claimed it), cancel our import; otherwise it is ours, so finish it.  Each
// outcome is journaled with an EImportFinish entry.
void MDCache::disambiguate_my_imports()
{
  dout(10) << "disambiguate_my_imports" << dendl;

  if (!mds->is_resolve()) {
    // survivors settle their ambiguous imports in handle_resolve()
    ceph_assert(my_ambiguous_imports.empty());
    return;
  }

  disambiguate_other_imports();

  // my ambiguous imports
  mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid());
  // cancel/finish below erase entries, so re-fetch begin() each round
  while (!my_ambiguous_imports.empty()) {
    map<dirfrag_t, vector<dirfrag_t> >::iterator q = my_ambiguous_imports.begin();

    CDir *dir = get_dirfrag(q->first);
    ceph_assert(dir);

    if (dir->authority() != me_ambig) {
      dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl;
      cancel_ambiguous_import(dir);

      mds->mdlog->start_submit_entry(new EImportFinish(dir, false));

      // subtree may have been swallowed by another node claiming dir
      // as their own.
      CDir *root = get_subtree_root(dir);
      if (root != dir)
        dout(10) << " subtree root is " << *root << dendl;
      ceph_assert(root->dir_auth.first != mds->get_nodeid()); // no us!
      try_trim_non_auth_subtree(root);
    } else {
      dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl;
      finish_ambiguous_import(q->first);
      mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
    }
  }
  ceph_assert(my_ambiguous_imports.empty());
  mds->mdlog->flush();

  // verify all my subtrees are unambiguous!
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    if (dir->is_ambiguous_dir_auth()) {
      dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl;
    }
    ceph_assert(!dir->is_ambiguous_dir_auth());
  }

  show_subtrees();
}
3688
3689
3690 void MDCache::add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds)
3691 {
3692 ceph_assert(my_ambiguous_imports.count(base) == 0);
3693 my_ambiguous_imports[base] = bounds;
3694 }
3695
3696
3697 void MDCache::add_ambiguous_import(CDir *base, const set<CDir*>& bounds)
3698 {
3699 // make a list
3700 vector<dirfrag_t> binos;
3701 for (set<CDir*>::iterator p = bounds.begin();
3702 p != bounds.end();
3703 ++p)
3704 binos.push_back((*p)->dirfrag());
3705
3706 // note: this can get called twice if the exporter fails during recovery
3707 if (my_ambiguous_imports.count(base->dirfrag()))
3708 my_ambiguous_imports.erase(base->dirfrag());
3709
3710 add_ambiguous_import(base->dirfrag(), binos);
3711 }
3712
3713 void MDCache::cancel_ambiguous_import(CDir *dir)
3714 {
3715 dirfrag_t df = dir->dirfrag();
3716 ceph_assert(my_ambiguous_imports.count(df));
3717 dout(10) << "cancel_ambiguous_import " << df
3718 << " bounds " << my_ambiguous_imports[df]
3719 << " " << *dir
3720 << dendl;
3721 my_ambiguous_imports.erase(df);
3722 }
3723
3724 void MDCache::finish_ambiguous_import(dirfrag_t df)
3725 {
3726 ceph_assert(my_ambiguous_imports.count(df));
3727 vector<dirfrag_t> bounds;
3728 bounds.swap(my_ambiguous_imports[df]);
3729 my_ambiguous_imports.erase(df);
3730
3731 dout(10) << "finish_ambiguous_import " << df
3732 << " bounds " << bounds
3733 << dendl;
3734 CDir *dir = get_dirfrag(df);
3735 ceph_assert(dir);
3736
3737 // adjust dir_auth, import maps
3738 adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid());
3739 try_subtree_merge(dir);
3740 }
3741
// Remove `in` and everything reachable beneath it from the cache: every
// dirfrag, every dentry, and (recursively) every primary-linked inode.
// Subtree roots encountered on the way are dropped from the subtree map.
void MDCache::remove_inode_recursive(CInode *in)
{
  dout(10) << "remove_inode_recursive " << *in << dendl;
  auto&& ls = in->get_dirfrags();
  for (const auto& subdir : ls) {
    dout(10) << " removing dirfrag " << *subdir << dendl;
    auto it = subdir->items.begin();
    while (it != subdir->items.end()) {
      CDentry *dn = it->second;
      // advance before remove_dentry() invalidates the current entry
      ++it;
      CDentry::linkage_t *dnl = dn->get_linkage();
      if (dnl->is_primary()) {
        CInode *tin = dnl->get_inode();
        // detach the child inode first, then recurse into it
        subdir->unlink_inode(dn, false);
        remove_inode_recursive(tin);
      }
      subdir->remove_dentry(dn);
    }

    if (subdir->is_subtree_root())
      remove_subtree(subdir);
    in->close_dirfrag(subdir->dirfrag().frag);
  }
  remove_inode(in);
}
3767
// Try to expire the non-auth inode `in` and everything beneath it, adding
// expireable dentries to `expiremap` via trim_dentry().  Returns true to
// abort the expiry (something under `in` is still needed: a subtree root, a
// live hardlink target, or a non-expireable dentry); false means everything
// visited was trimmed.  Note a true return leaves already-trimmed entries in
// expiremap.
bool MDCache::expire_recursive(CInode *in, expiremap &expiremap)
{
  ceph_assert(!in->is_auth());

  dout(10) << __func__ << ":" << *in << dendl;

  // Recurse into any dirfrags beneath this inode
  auto&& ls = in->get_dirfrags();
  for (const auto& subdir : ls) {
    // a subtree root under a (non-mdsdir) stray means it is still in use
    if (!in->is_mdsdir() && subdir->is_subtree_root()) {
      dout(10) << __func__ << ": stray still has subtree " << *in << dendl;
      return true;
    }

    for (auto &it : subdir->items) {
      CDentry *dn = it.second;
      CDentry::linkage_t *dnl = dn->get_linkage();
      if (dnl->is_primary()) {
        CInode *tin = dnl->get_inode();

        /* Remote strays with linkage (i.e. hardlinks) should not be
         * expired, because they may be the target of
         * a rename() as the owning MDS shuts down */
        if (!tin->is_stray() && tin->inode.nlink) {
          dout(10) << __func__ << ": stray still has linkage " << *tin << dendl;
          return true;
        }

        // depth-first: children must be expireable before their parent
        const bool abort = expire_recursive(tin, expiremap);
        if (abort) {
          return true;
        }
      }
      if (dn->lru_is_expireable()) {
        trim_dentry(dn, expiremap);
      } else {
        dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl;
        return true;
      }
    }
  }

  return false;
}
3812
3813 void MDCache::trim_unlinked_inodes()
3814 {
3815 dout(7) << "trim_unlinked_inodes" << dendl;
3816 int count = 0;
3817 vector<CInode*> q;
3818 for (auto &p : inode_map) {
3819 CInode *in = p.second;
3820 if (in->get_parent_dn() == NULL && !in->is_base()) {
3821 dout(7) << " will trim from " << *in << dendl;
3822 q.push_back(in);
3823 }
3824
3825 if (!(++count % 1000))
3826 mds->heartbeat_reset();
3827 }
3828 for (auto& in : q) {
3829 remove_inode_recursive(in);
3830
3831 if (!(++count % 1000))
3832 mds->heartbeat_reset();
3833 }
3834 }
3835
3836 /** recalc_auth_bits()
3837 * once subtree auth is disambiguated, we need to adjust all the
3838 * auth and dirty bits in our cache before moving on.
3839 */
void MDCache::recalc_auth_bits(bool replay)
{
  // Walk every subtree and (re)set the AUTH state bit on each dirfrag,
  // dentry and inode to match the now-disambiguated subtree authority.
  // Non-auth objects additionally get REJOINING set and their dirty
  // state cleared, unless we are replaying the journal (replay == true),
  // in which case dirty state must be preserved.
  dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl;

  // root inode: authority comes straight from the mdsmap
  if (root) {
    root->inode_auth.first = mds->mdsmap->get_root();
    bool auth = mds->get_nodeid() == root->inode_auth.first;
    if (auth) {
      root->state_set(CInode::STATE_AUTH);
    } else {
      root->state_clear(CInode::STATE_AUTH);
      if (!replay)
	root->state_set(CInode::STATE_REJOINING);
    }
  }

  // remember which inodes are roots of subtrees we are auth for; their
  // scatterlock dirty state must not be touched below.
  set<CInode*> subtree_inodes;
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    if (p->first->dir_auth.first == mds->get_nodeid())
      subtree_inodes.insert(p->first->inode);
  }

  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    // an mdsdir inode is auth iff it is *our* mdsdir
    if (p->first->inode->is_mdsdir()) {
      CInode *in = p->first->inode;
      bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid());
      if (auth) {
	in->state_set(CInode::STATE_AUTH);
      } else {
	in->state_clear(CInode::STATE_AUTH);
	if (!replay)
	  in->state_set(CInode::STATE_REJOINING);
      }
    }

    // breadth-first walk of this subtree's dirfrags (stops at nested
    // subtree bounds, since only nested dirfrags of primary inodes are
    // pushed onto the queue)
    std::queue<CDir*> dfq;  // dirfrag queue
    dfq.push(p->first);

    bool auth = p->first->authority().first == mds->get_nodeid();
    dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl;

    while (!dfq.empty()) {
      CDir *dir = dfq.front();
      dfq.pop();

      // dir
      if (auth) {
	dir->state_set(CDir::STATE_AUTH);
      } else {
	dir->state_clear(CDir::STATE_AUTH);
	if (!replay) {
	  // close empty non-auth dirfrag
	  if (!dir->is_subtree_root() && dir->get_num_any() == 0) {
	    dir->inode->close_dirfrag(dir->get_frag());
	    continue;  // nothing left to process under a closed frag
	  }
	  dir->state_set(CDir::STATE_REJOINING);
	  dir->state_clear(CDir::STATE_COMPLETE);
	  if (dir->is_dirty())
	    dir->mark_clean();
	}
      }

      // dentries in this dir
      for (auto &p : dir->items) {
	// dn
	CDentry *dn = p.second;
	CDentry::linkage_t *dnl = dn->get_linkage();
	if (auth) {
	  dn->state_set(CDentry::STATE_AUTH);
	} else {
	  dn->state_clear(CDentry::STATE_AUTH);
	  if (!replay) {
	    dn->state_set(CDentry::STATE_REJOINING);
	    if (dn->is_dirty())
	      dn->mark_clean();
	  }
	}

	if (dnl->is_primary()) {
	  // inode linked at this dentry inherits the same auth verdict
	  CInode *in = dnl->get_inode();
	  if (auth) {
	    in->state_set(CInode::STATE_AUTH);
	  } else {
	    in->state_clear(CInode::STATE_AUTH);
	    if (!replay) {
	      in->state_set(CInode::STATE_REJOINING);
	      if (in->is_dirty())
		in->mark_clean();
	      if (in->is_dirty_parent())
		in->clear_dirty_parent();
	      // avoid touching scatterlocks for our subtree roots!
	      if (subtree_inodes.count(in) == 0)
		in->clear_scatter_dirty();
	    }
	  }
	  // recurse?
	  if (in->is_dir()) {
	    auto&& dfv = in->get_nested_dirfrags();
	    for (const auto& dir : dfv) {
	      dfq.push(dir);
	    }
	  }
	}
      }
    }
  }

  show_subtrees();
  show_cache();
}
3956
3957
3958
3959 // ===========================================================================
3960 // REJOIN
3961
3962 /*
3963 * notes on scatterlock recovery:
3964 *
3965 * - recovering inode replica sends scatterlock data for any subtree
3966 * roots (the only ones that are possibly dirty).
3967 *
3968 * - surviving auth incorporates any provided scatterlock data. any
3969 * pending gathers are then finished, as with the other lock types.
3970 *
3971 * that takes care of surviving auth + (recovering replica)*.
3972 *
3973 * - surviving replica sends strong_inode, which includes current
3974 * scatterlock state, AND any dirty scatterlock data. this
3975 * provides the recovering auth with everything it might need.
3976 *
3977 * - recovering auth must pick initial scatterlock state based on
3978 * (weak|strong) rejoins.
3979 * - always assimilate scatterlock data (it can't hurt)
3980 * - any surviving replica in SCATTER state -> SCATTER. otherwise, SYNC.
3981 * - include base inode in ack for all inodes that saw scatterlock content
3982 *
3983 * also, for scatter gather,
3984 *
3985 * - auth increments {frag,r}stat.version on completion of any gather.
3986 *
3987 * - auth incorporates changes in a gather _only_ if the version
3988 * matches.
3989 *
3990 * - replica discards changes any time the scatterlock syncs, and
3991 * after recovery.
3992 */
3993
void MDCache::dump_rejoin_status(Formatter *f) const
{
  // Emit rejoin progress for admin-socket consumers: the ranks we are
  // still waiting on for rejoin and rejoin-ack, plus the number of cap
  // inodes still being opened before rejoins can be sent.
  f->open_object_section("rejoin_status");
  f->dump_stream("rejoin_gather") << rejoin_gather;
  f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
  f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
  f->close_section();
}
4002
void MDCache::rejoin_start(MDSContext *rejoin_done_)
{
  // Kick off the rejoin phase.  Takes ownership of rejoin_done_, which
  // is completed once the whole rejoin process finishes.
  dout(10) << "rejoin_start" << dendl;
  ceph_assert(!rejoin_done);  // must not already be mid-rejoin
  rejoin_done.reset(rejoin_done_);

  rejoin_gather = recovery_set;
  // need finish opening cap inodes before sending cache rejoins
  rejoin_gather.insert(mds->get_nodeid());
  process_imported_caps();
}
4014
4015 /*
4016 * rejoin phase!
4017 *
4018 * this initiates rejoin. it should be called before we get any
4019 * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
4020 *
4021 * we start out by sending rejoins to everyone in the recovery set.
4022 *
4023 * if we are rejoin, send for all regions in our cache.
4024 * if we are active|stopping, send only to nodes that are rejoining.
4025 */
void MDCache::rejoin_send_rejoins()
{
  // Build and send one MMDSCacheRejoin per peer in the recovery set:
  // OP_WEAK if we are ourselves rejoining, OP_STRONG (as a survivor) to
  // peers that are rejoining.  Bails out (setting rejoins_pending) if
  // cap-import processing or resolves are still outstanding.
  dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;

  if (rejoin_gather.count(mds->get_nodeid())) {
    dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
    rejoins_pending = true;
    return;
  }
  if (!resolve_gather.empty()) {
    dout(7) << "rejoin_send_rejoins still waiting for resolves ("
	    << resolve_gather << ")" << dendl;
    rejoins_pending = true;
    return;
  }

  // migrations must be fully resolved before rejoin messages go out
  ceph_assert(!migrator->is_importing());
  ceph_assert(!migrator->is_exporting());

  if (!mds->is_rejoin()) {
    disambiguate_other_imports();
  }

  map<mds_rank_t, ref_t<MMDSCacheRejoin>> rejoins;


  // if i am rejoining, send a rejoin to everyone.
  // otherwise, just send to others who are rejoining.
  for (const auto& rank : recovery_set) {
    if (rank == mds->get_nodeid())  continue;  // nothing to myself!
    if (rejoin_sent.count(rank)) continue;     // already sent a rejoin to this node!
    if (mds->is_rejoin())
      rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_WEAK);
    else if (mds->mdsmap->is_rejoin(rank))
      rejoins[rank] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_STRONG);
  }

  if (mds->is_rejoin()) {
    // weak rejoin: attach our exported caps, pruning reconnects whose
    // client session no longer exists.  client_exports caches the
    // session lookup per client across all targets.
    map<client_t, pair<Session*, set<mds_rank_t> > > client_exports;
    for (auto& p : cap_exports) {
      mds_rank_t target = p.second.first;
      if (rejoins.count(target) == 0)
	continue;
      for (auto q = p.second.second.begin(); q != p.second.second.end(); ) {
	Session *session = nullptr;
	auto it = client_exports.find(q->first);
	if (it != client_exports.end()) {
	  session = it->second.first;
	  if (session)
	    it->second.second.insert(target);
	} else {
	  session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
	  auto& r = client_exports[q->first];
	  r.first = session;
	  if (session)
	    r.second.insert(target);
	}
	if (session) {
	  ++q;
	} else {
	  // remove reconnect with no session
	  p.second.second.erase(q++);
	}
      }
      rejoins[target]->cap_exports[p.first] = p.second.second;
    }
    // attach client identity/metadata for every client whose caps we
    // just advertised to each target
    for (auto& p : client_exports) {
      Session *session = p.second.first;
      for (auto& q : p.second.second) {
	auto rejoin = rejoins[q];
	rejoin->client_map[p.first] = session->info.inst;
	rejoin->client_metadata_map[p.first] = session->info.client_metadata;
      }
    }
  }


  // check all subtrees
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    ceph_assert(dir->is_subtree_root());
    if (dir->is_ambiguous_dir_auth()) {
      // exporter is recovering, importer is survivor.
      ceph_assert(rejoins.count(dir->authority().first));
      ceph_assert(!rejoins.count(dir->authority().second));
      continue;
    }

    // my subtree?
    if (dir->is_auth())
      continue;  // skip my own regions!

    mds_rank_t auth = dir->get_dir_auth().first;
    ceph_assert(auth >= 0);
    if (rejoins.count(auth) == 0)
      continue;   // don't care about this node's subtrees

    rejoin_walk(dir, rejoins[auth]);
  }

  // rejoin root inodes, too
  for (auto &p : rejoins) {
    if (mds->is_rejoin()) {
      // weak
      if (p.first == 0 && root) {
	p.second->add_weak_inode(root->vino());
	if (root->is_dirty_scattered()) {
	  dout(10) << " sending scatterlock state on root " << *root << dendl;
	  p.second->add_scatterlock_state(root);
	}
      }
      if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
	// NOTE(review): this inner null check is redundant — the
	// if-initializer above already guarantees `in` is non-null.
	if (in)
	  p.second->add_weak_inode(in->vino());
      }
    } else {
      // strong
      if (p.first == 0 && root) {
	p.second->add_strong_inode(root->vino(),
				   root->get_replica_nonce(),
				   root->get_caps_wanted(),
				   root->filelock.get_state(),
				   root->nestlock.get_state(),
				   root->dirfragtreelock.get_state());
	root->state_set(CInode::STATE_REJOINING);
	if (root->is_dirty_scattered()) {
	  dout(10) << " sending scatterlock state on root " << *root << dendl;
	  p.second->add_scatterlock_state(root);
	}
      }

      if (CInode *in = get_inode(MDS_INO_MDSDIR(p.first))) {
	p.second->add_strong_inode(in->vino(),
				   in->get_replica_nonce(),
				   in->get_caps_wanted(),
				   in->filelock.get_state(),
				   in->nestlock.get_state(),
				   in->dirfragtreelock.get_state());
	in->state_set(CInode::STATE_REJOINING);
      }
    }
  }

  if (!mds->is_rejoin()) {
    // i am survivor.  send strong rejoin.
    // note request remote_auth_pins, xlocks
    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
	 p != active_requests.end();
	 ++p) {
      MDRequestRef& mdr = p->second;
      if (mdr->is_slave())
	continue;
      // auth pins
      for (const auto& q : mdr->object_states) {
	if (q.second.remote_auth_pinned == MDS_RANK_NONE)
	  continue;
	if (!q.first->is_auth()) {
	  mds_rank_t target = q.second.remote_auth_pinned;
	  ceph_assert(target == q.first->authority().first);
	  if (rejoins.count(target) == 0) continue;
	  const auto& rejoin = rejoins[target];

	  dout(15) << " " << *mdr << " authpin on " << *q.first << dendl;
	  MDSCacheObjectInfo i;
	  q.first->set_object_info(i);
	  // i.ino set => the pinned object is an inode; else a dentry
	  if (i.ino)
	    rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt);
	  else
	    rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt);

	  if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin &&
	      mdr->more()->rename_inode == q.first)
	    rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid),
					     mdr->reqid, mdr->attempt);
	}
      }
      // xlocks
      for (const auto& q : mdr->locks) {
	auto lock = q.lock;
	auto obj = lock->get_parent();
	if (q.is_xlock() && !obj->is_auth()) {
	  mds_rank_t who = obj->authority().first;
	  if (rejoins.count(who) == 0) continue;
	  const auto& rejoin = rejoins[who];

	  dout(15) << " " << *mdr << " xlock on " << *lock << " " << *obj << dendl;
	  MDSCacheObjectInfo i;
	  obj->set_object_info(i);
	  if (i.ino)
	    rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
				    mdr->reqid, mdr->attempt);
	  else
	    rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid,
				     mdr->reqid, mdr->attempt);
	} else if (q.is_remote_wrlock()) {
	  mds_rank_t who = q.wrlock_target;
	  if (rejoins.count(who) == 0) continue;
	  const auto& rejoin = rejoins[who];

	  dout(15) << " " << *mdr << " wrlock on " << *lock << " " << *obj << dendl;
	  MDSCacheObjectInfo i;
	  obj->set_object_info(i);
	  ceph_assert(i.ino);  // remote wrlocks are only taken on inodes
	  rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), lock->get_type(),
				   mdr->reqid, mdr->attempt);
	}
      }
    }
  }

  // send the messages
  for (auto &p : rejoins) {
    ceph_assert(rejoin_sent.count(p.first) == 0);
    ceph_assert(rejoin_ack_gather.count(p.first) == 0);
    rejoin_sent.insert(p.first);
    rejoin_ack_gather.insert(p.first);
    mds->send_message_mds(p.second, p.first);
  }
  rejoin_ack_gather.insert(mds->get_nodeid());   // we need to complete rejoin_gather_finish, too
  rejoins_pending = false;

  // nothing?
  if (mds->is_rejoin() && rejoin_gather.empty()) {
    dout(10) << "nothing to rejoin" << dendl;
    rejoin_gather_finish();
  }
}
4255
4256
4257 /**
4258 * rejoin_walk - build rejoin declarations for a subtree
4259 *
4260 * @param dir subtree root
4261 * @param rejoin rejoin message
4262 *
4263 * from a rejoining node:
4264 * weak dirfrag
4265 * weak dentries (w/ connectivity)
4266 *
4267 * from a surviving node:
4268 * strong dirfrag
4269 * strong dentries (no connectivity!)
4270 * strong inodes
4271 */
void MDCache::rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin)
{
  // Recursively add declarations for this subtree to the rejoin
  // message: weak declarations if we are rejoining, strong declarations
  // (with lock/nonce state) if we are a survivor.  See the block
  // comment above for the weak/strong distinction.
  dout(10) << "rejoin_walk " << *dir << dendl;

  std::vector<CDir*> nested;  // finish this dir, then do nested items

  if (mds->is_rejoin()) {
    // WEAK
    rejoin->add_weak_dirfrag(dir->dirfrag());
    for (auto &p : dir->items) {
      CDentry *dn = p.second;
      // weak rejoin only ever carries head (CEPH_NOSNAP) primary
      // dentries of directories — snap/null/remote dentries were
      // already trimmed by the rejoining side
      ceph_assert(dn->last == CEPH_NOSNAP);
      CDentry::linkage_t *dnl = dn->get_linkage();
      dout(15) << " add_weak_primary_dentry " << *dn << dendl;
      ceph_assert(dnl->is_primary());
      CInode *in = dnl->get_inode();
      ceph_assert(dnl->get_inode()->is_dir());
      rejoin->add_weak_primary_dentry(dir->ino(), dn->get_name(), dn->first, dn->last, in->ino());
      {
	auto&& dirs = in->get_nested_dirfrags();
	nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
      }
      if (in->is_dirty_scattered()) {
	dout(10) << " sending scatterlock state on " << *in << dendl;
	rejoin->add_scatterlock_state(in);
      }
    }
  } else {
    // STRONG
    dout(15) << " add_strong_dirfrag " << *dir << dendl;
    rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep());
    dir->state_set(CDir::STATE_REJOINING);

    // iterator is advanced before any possible removal of the current
    // dentry below
    for (auto it = dir->items.begin(); it != dir->items.end(); ) {
      CDentry *dn = it->second;
      ++it;
      dn->state_set(CDentry::STATE_REJOINING);
      CDentry::linkage_t *dnl = dn->get_linkage();
      CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;

      // trim snap dentries. because they may have been pruned by
      // their auth mds (snap deleted)
      if (dn->last != CEPH_NOSNAP) {
	if (in && !in->remote_parents.empty()) {
	  // unlink any stale remote snap dentry.
	  for (auto it2 = in->remote_parents.begin(); it2 != in->remote_parents.end(); ) {
	    CDentry *remote_dn = *it2;
	    ++it2;
	    ceph_assert(remote_dn->last != CEPH_NOSNAP);
	    remote_dn->unlink_remote(remote_dn->get_linkage());
	  }
	}
	if (dn->lru_is_expireable()) {
	  if (!dnl->is_null())
	    dir->unlink_inode(dn, false);
	  if (in)
	    remove_inode(in);
	  dir->remove_dentry(dn);
	  continue;  // trimmed; nothing to declare for this dentry
	} else {
	  // Inventing null/remote dentry shouldn't cause problem
	  ceph_assert(!dnl->is_primary());
	}
      }

      dout(15) << " add_strong_dentry " << *dn << dendl;
      rejoin->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
				dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
				dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
				dnl->is_remote() ? dnl->get_remote_d_type():0,
				dn->get_replica_nonce(),
				dn->lock.get_state());
      dn->state_set(CDentry::STATE_REJOINING);
      if (dnl->is_primary()) {
	CInode *in = dnl->get_inode();
	dout(15) << " add_strong_inode " << *in << dendl;
	rejoin->add_strong_inode(in->vino(),
				 in->get_replica_nonce(),
				 in->get_caps_wanted(),
				 in->filelock.get_state(),
				 in->nestlock.get_state(),
				 in->dirfragtreelock.get_state());
	in->state_set(CInode::STATE_REJOINING);
	{
	  auto&& dirs = in->get_nested_dirfrags();
	  nested.insert(std::end(nested), std::begin(dirs), std::end(dirs));
	}
	if (in->is_dirty_scattered()) {
	  dout(10) << " sending scatterlock state on " << *in << dendl;
	  rejoin->add_scatterlock_state(in);
	}
      }
    }
  }

  // recurse into nested dirs
  for (const auto& dir : nested) {
    rejoin_walk(dir, rejoin);
  }
}
4372
4373
4374 /*
4375 * i got a rejoin.
4376 * - reply with the lockstate
4377 *
4378 * if i am active|stopping,
4379 * - remove source from replica list for everything not referenced here.
4380 */
4381 void MDCache::handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m)
4382 {
4383 dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source()
4384 << " (" << m->get_payload().length() << " bytes)"
4385 << dendl;
4386
4387 switch (m->op) {
4388 case MMDSCacheRejoin::OP_WEAK:
4389 handle_cache_rejoin_weak(m);
4390 break;
4391 case MMDSCacheRejoin::OP_STRONG:
4392 handle_cache_rejoin_strong(m);
4393 break;
4394 case MMDSCacheRejoin::OP_ACK:
4395 handle_cache_rejoin_ack(m);
4396 break;
4397
4398 default:
4399 ceph_abort();
4400 }
4401 }
4402
4403
4404 /*
4405 * handle_cache_rejoin_weak
4406 *
4407 * the sender
4408 * - is recovering from their journal.
4409 * - may have incorrect (out of date) inode contents
4410 * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient
4411 *
4412 * if the sender didn't trim_non_auth(), they
4413 * - may have incorrect (out of date) dentry/inode linkage
4414 * - may have deleted/purged inodes
4415 * and i may have to go to disk to get accurate inode contents. yuck.
4416 */
void MDCache::handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &weak)
{
  // Process a weak rejoin from a recovering peer.  If we are a survivor
  // (clientreplay/active/stopping) we claim the advertised caps, note
  // replicas, and reply immediately with an OP_ACK; if we are ourselves
  // rejoining we just record cap imports and replica state for later.
  mds_rank_t from = mds_rank_t(weak->get_source().num());

  // possible response(s)
  ref_t<MMDSCacheRejoin> ack;      // if survivor
  set<vinodeno_t> acked_inodes;    // if survivor
  set<SimpleLock *> gather_locks;  // if survivor
  bool survivor = false;  // am i a survivor?

  if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
    survivor = true;
    // NOTE(review): "surivivor" is a typo in this log message; left
    // unchanged here since this is a documentation-only pass.
    dout(10) << "i am a surivivor, and will ack immediately" << dendl;
    ack = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);

    map<inodeno_t,map<client_t,Capability::Import> > imported_caps;

    // check cap exports
    for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
      CInode *in = get_inode(p->first);
      ceph_assert(!in || in->is_auth());
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
	dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
	Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
	Capability::Import& im = imported_caps[p->first][q->first];
	if (cap) {
	  im.cap_id = cap->get_cap_id();
	  im.issue_seq = cap->get_last_seq();
	  im.mseq = cap->get_mseq();
	} else {
	  // all are zero
	}
      }
      mds->locker->eval(in, CEPH_CAP_LOCKS, true);
    }

    encode(imported_caps, ack->imported_caps);
  } else {
    ceph_assert(mds->is_rejoin());

    // we may have already received a strong rejoin from the sender.
    rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
    ceph_assert(gather_locks.empty());

    // check cap exports.
    rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
    rejoin_client_metadata_map.insert(weak->client_metadata_map.begin(),
				      weak->client_metadata_map.end());

    for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
      CInode *in = get_inode(p->first);
      ceph_assert(!in || in->is_auth());
      // note
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
	dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
	cap_imports[p->first][q->first][from] = q->second;
      }
    }
  }

  // assimilate any potentially dirty scatterlock state
  for (const auto &p : weak->inode_scatterlocks) {
    CInode *in = get_inode(p.first);
    ceph_assert(in);
    in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
    in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
    in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
    if (!survivor)
      rejoin_potential_updated_scatterlocks.insert(in);
  }

  // recovering peer may send incorrect dirfrags here.  we need to
  // infer which dirfrag they meant.  the ack will include a
  // strong_dirfrag that will set them straight on the fragmentation.

  // walk weak map
  set<CDir*> dirs_to_share;
  for (const auto &p : weak->weak_dirfrags) {
    CInode *diri = get_inode(p.ino);
    if (!diri)
      dout(0) << " missing dir ino " << p.ino << dendl;
    ceph_assert(diri);

    // map the peer's (possibly stale) frag onto our current leaves
    frag_vec_t leaves;
    if (diri->dirfragtree.is_leaf(p.frag)) {
      leaves.push_back(p.frag);
    } else {
      diri->dirfragtree.get_leaves_under(p.frag, leaves);
      if (leaves.empty())
	leaves.push_back(diri->dirfragtree[p.frag.value()]);
    }
    for (const auto& leaf : leaves) {
      CDir *dir = diri->get_dirfrag(leaf);
      if (!dir) {
	dout(0) << " missing dir for " << p.frag << " (which maps to " << leaf << ") on " << *diri << dendl;
	continue;
      }
      ceph_assert(dir);
      if (dirs_to_share.count(dir)) {
	dout(10) << " already have " << p.frag << " -> " << leaf << " " << *dir << dendl;
      } else {
	dirs_to_share.insert(dir);
	unsigned nonce = dir->add_replica(from);
	dout(10) << " have " << p.frag << " -> " << leaf << " " << *dir << dendl;
	if (ack) {
	  ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep);
	  ack->add_dirfrag_base(dir);
	}
      }
    }
  }

  // weak dentries/inodes, keyed by directory ino
  for (const auto &p : weak->weak) {
    CInode *diri = get_inode(p.first);
    if (!diri)
      dout(0) << " missing dir ino " << p.first << dendl;
    ceph_assert(diri);

    // weak dentries
    CDir *dir = 0;
    for (const auto &q : p.second) {
      // locate proper dirfrag.
      //  optimize for common case (one dirfrag) to avoid dirs_to_share set check
      frag_t fg = diri->pick_dirfrag(q.first.name);
      if (!dir || dir->get_frag() != fg) {
	dir = diri->get_dirfrag(fg);
	if (!dir)
	  dout(0) << " missing dir frag " << fg << " on " << *diri << dendl;
	ceph_assert(dir);
	ceph_assert(dirs_to_share.count(dir));
      }

      // and dentry
      CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
      ceph_assert(dn);
      CDentry::linkage_t *dnl = dn->get_linkage();
      ceph_assert(dnl->is_primary());

      // re-adding the replica below resets its nonce; drop the old
      // replica record first so lock gathers are re-evaluated
      if (survivor && dn->is_replica(from))
	dentry_remove_replica(dn, from, gather_locks);
      unsigned dnonce = dn->add_replica(from);
      dout(10) << " have " << *dn << dendl;
      if (ack)
	ack->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
			       dnl->get_inode()->ino(), inodeno_t(0), 0,
			       dnonce, dn->lock.get_replica_state());

      // inode
      CInode *in = dnl->get_inode();
      ceph_assert(in);

      if (survivor && in->is_replica(from))
	inode_remove_replica(in, from, true, gather_locks);
      unsigned inonce = in->add_replica(from);
      dout(10) << " have " << *in << dendl;

      // scatter the dirlock, just in case?
      if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag())
	in->filelock.set_state(LOCK_MIX);

      if (ack) {
	acked_inodes.insert(in->vino());
	ack->add_inode_base(in, mds->mdsmap->get_up_features());
	bufferlist bl;
	in->_encode_locks_state_for_rejoin(bl, from);
	ack->add_inode_locks(in, inonce, bl);
      }
    }
  }

  // weak base inodes?  (root, stray, etc.)
  for (set<vinodeno_t>::iterator p = weak->weak_inodes.begin();
       p != weak->weak_inodes.end();
       ++p) {
    CInode *in = get_inode(*p);
    ceph_assert(in);   // hmm fixme wrt stray?
    if (survivor && in->is_replica(from))
      inode_remove_replica(in, from, true, gather_locks);
    unsigned inonce = in->add_replica(from);
    dout(10) << " have base " << *in << dendl;

    if (ack) {
      acked_inodes.insert(in->vino());
      ack->add_inode_base(in, mds->mdsmap->get_up_features());
      bufferlist bl;
      in->_encode_locks_state_for_rejoin(bl, from);
      ack->add_inode_locks(in, inonce, bl);
    }
  }

  ceph_assert(rejoin_gather.count(from));
  rejoin_gather.erase(from);
  if (survivor) {
    // survivor.  do everything now.
    for (const auto &p : weak->inode_scatterlocks) {
      CInode *in = get_inode(p.first);
      ceph_assert(in);
      dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl;
      acked_inodes.insert(in->vino());
      ack->add_inode_base(in, mds->mdsmap->get_up_features());
    }

    // drop replica records for anything the peer did not mention
    rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks);
    mds->send_message(ack, weak->get_connection());

    // re-evaluate locks whose gather set may have shrunk above
    for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
      if (!(*p)->is_stable())
	mds->locker->eval_gather(*p);
    }
  } else {
    // done?
    if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
      rejoin_gather_finish();
    } else {
      dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
    }
  }
}
4635
4636 /*
4637 * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects
4638 *
4639 * all validated replicas are acked with a strong nonce, etc. if that isn't in the
4640 * ack, the replica dne, and we can remove it from our replica maps.
4641 */
4642 void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack,
4643 set<vinodeno_t>& acked_inodes,
4644 set<SimpleLock *>& gather_locks)
4645 {
4646 dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl;
4647
4648 auto scour_func = [this, from, ack, &acked_inodes, &gather_locks] (CInode *in) {
4649 // inode?
4650 if (in->is_auth() &&
4651 in->is_replica(from) &&
4652 (ack == NULL || acked_inodes.count(in->vino()) == 0)) {
4653 inode_remove_replica(in, from, false, gather_locks);
4654 dout(10) << " rem " << *in << dendl;
4655 }
4656
4657 if (!in->is_dir())
4658 return;
4659
4660 const auto&& dfs = in->get_dirfrags();
4661 for (const auto& dir : dfs) {
4662 if (!dir->is_auth())
4663 continue;
4664
4665 if (dir->is_replica(from) &&
4666 (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) {
4667 dir->remove_replica(from);
4668 dout(10) << " rem " << *dir << dendl;
4669 }
4670
4671 // dentries
4672 for (auto &p : dir->items) {
4673 CDentry *dn = p.second;
4674
4675 if (dn->is_replica(from)) {
4676 if (ack) {
4677 const auto it = ack->strong_dentries.find(dir->dirfrag());
4678 if (it != ack->strong_dentries.end() && it->second.count(string_snap_t(dn->get_name(), dn->last)) > 0) {
4679 continue;
4680 }
4681 }
4682 dentry_remove_replica(dn, from, gather_locks);
4683 dout(10) << " rem " << *dn << dendl;
4684 }
4685 }
4686 }
4687 };
4688
4689 for (auto &p : inode_map)
4690 scour_func(p.second);
4691 for (auto &p : snap_inode_map)
4692 scour_func(p.second);
4693 }
4694
4695
4696 CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last)
4697 {
4698 CInode *in = new CInode(this, true, 1, last);
4699 in->inode.ino = ino;
4700 in->state_set(CInode::STATE_REJOINUNDEF);
4701 add_inode(in);
4702 rejoin_undef_inodes.insert(in);
4703 dout(10) << " invented " << *in << dendl;
4704 return in;
4705 }
4706
4707 CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df)
4708 {
4709 CInode *in = get_inode(df.ino);
4710 if (!in)
4711 in = rejoin_invent_inode(df.ino, CEPH_NOSNAP);
4712 if (!in->is_dir()) {
4713 ceph_assert(in->state_test(CInode::STATE_REJOINUNDEF));
4714 in->inode.mode = S_IFDIR;
4715 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
4716 }
4717 CDir *dir = in->get_or_open_dirfrag(this, df.frag);
4718 dir->state_set(CDir::STATE_REJOINUNDEF);
4719 rejoin_undef_dirfrags.insert(dir);
4720 dout(10) << " invented " << *dir << dendl;
4721 return dir;
4722 }
4723
4724 void MDCache::handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &strong)
4725 {
4726 mds_rank_t from = mds_rank_t(strong->get_source().num());
4727
4728 // only a recovering node will get a strong rejoin.
4729 if (!mds->is_rejoin()) {
4730 if (mds->get_want_state() == MDSMap::STATE_REJOIN) {
4731 mds->wait_for_rejoin(new C_MDS_RetryMessage(mds, strong));
4732 return;
4733 }
4734 ceph_abort_msg("got unexpected rejoin message during recovery");
4735 }
4736
4737 // assimilate any potentially dirty scatterlock state
4738 for (const auto &p : strong->inode_scatterlocks) {
4739 CInode *in = get_inode(p.first);
4740 ceph_assert(in);
4741 in->decode_lock_state(CEPH_LOCK_IFILE, p.second.file);
4742 in->decode_lock_state(CEPH_LOCK_INEST, p.second.nest);
4743 in->decode_lock_state(CEPH_LOCK_IDFT, p.second.dft);
4744 rejoin_potential_updated_scatterlocks.insert(in);
4745 }
4746
4747 rejoin_unlinked_inodes[from].clear();
4748
4749 // surviving peer may send incorrect dirfrag here (maybe they didn't
4750 // get the fragment notify, or maybe we rolled back?). we need to
4751 // infer the right frag and get them with the program. somehow.
4752 // we don't normally send ACK.. so we'll need to bundle this with
4753 // MISSING or something.
4754
4755 // strong dirfrags/dentries.
4756 // also process auth_pins, xlocks.
4757 for (const auto &p : strong->strong_dirfrags) {
4758 auto& dirfrag = p.first;
4759 CInode *diri = get_inode(dirfrag.ino);
4760 if (!diri)
4761 diri = rejoin_invent_inode(dirfrag.ino, CEPH_NOSNAP);
4762 CDir *dir = diri->get_dirfrag(dirfrag.frag);
4763 bool refragged = false;
4764 if (dir) {
4765 dout(10) << " have " << *dir << dendl;
4766 } else {
4767 if (diri->state_test(CInode::STATE_REJOINUNDEF))
4768 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t()));
4769 else if (diri->dirfragtree.is_leaf(dirfrag.frag))
4770 dir = rejoin_invent_dirfrag(dirfrag);
4771 }
4772 if (dir) {
4773 dir->add_replica(from, p.second.nonce);
4774 dir->dir_rep = p.second.dir_rep;
4775 } else {
4776 dout(10) << " frag " << dirfrag << " doesn't match dirfragtree " << *diri << dendl;
4777 frag_vec_t leaves;
4778 diri->dirfragtree.get_leaves_under(dirfrag.frag, leaves);
4779 if (leaves.empty())
4780 leaves.push_back(diri->dirfragtree[dirfrag.frag.value()]);
4781 dout(10) << " maps to frag(s) " << leaves << dendl;
4782 for (const auto& leaf : leaves) {
4783 CDir *dir = diri->get_dirfrag(leaf);
4784 if (!dir)
4785 dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), leaf));
4786 else
4787 dout(10) << " have(approx) " << *dir << dendl;
4788 dir->add_replica(from, p.second.nonce);
4789 dir->dir_rep = p.second.dir_rep;
4790 }
4791 refragged = true;
4792 }
4793
4794 const auto it = strong->strong_dentries.find(dirfrag);
4795 if (it != strong->strong_dentries.end()) {
4796 const auto& dmap = it->second;
4797 for (const auto &q : dmap) {
4798 const string_snap_t& ss = q.first;
4799 const MMDSCacheRejoin::dn_strong& d = q.second;
4800 CDentry *dn;
4801 if (!refragged)
4802 dn = dir->lookup(ss.name, ss.snapid);
4803 else {
4804 frag_t fg = diri->pick_dirfrag(ss.name);
4805 dir = diri->get_dirfrag(fg);
4806 ceph_assert(dir);
4807 dn = dir->lookup(ss.name, ss.snapid);
4808 }
4809 if (!dn) {
4810 if (d.is_remote()) {
4811 dn = dir->add_remote_dentry(ss.name, d.remote_ino, d.remote_d_type, d.first, ss.snapid);
4812 } else if (d.is_null()) {
4813 dn = dir->add_null_dentry(ss.name, d.first, ss.snapid);
4814 } else {
4815 CInode *in = get_inode(d.ino, ss.snapid);
4816 if (!in) in = rejoin_invent_inode(d.ino, ss.snapid);
4817 dn = dir->add_primary_dentry(ss.name, in, d.first, ss.snapid);
4818 }
4819 dout(10) << " invented " << *dn << dendl;
4820 }
4821 CDentry::linkage_t *dnl = dn->get_linkage();
4822
4823 // dn auth_pin?
4824 const auto pinned_it = strong->authpinned_dentries.find(dirfrag);
4825 if (pinned_it != strong->authpinned_dentries.end()) {
4826 const auto slave_reqid_it = pinned_it->second.find(ss);
4827 if (slave_reqid_it != pinned_it->second.end()) {
4828 for (const auto &r : slave_reqid_it->second) {
4829 dout(10) << " dn authpin by " << r << " on " << *dn << dendl;
4830
4831 // get/create slave mdrequest
4832 MDRequestRef mdr;
4833 if (have_request(r.reqid))
4834 mdr = request_get(r.reqid);
4835 else
4836 mdr = request_start_slave(r.reqid, r.attempt, strong);
4837 mdr->auth_pin(dn);
4838 }
4839 }
4840 }
4841
4842 // dn xlock?
4843 const auto xlocked_it = strong->xlocked_dentries.find(dirfrag);
4844 if (xlocked_it != strong->xlocked_dentries.end()) {
4845 const auto ss_req_it = xlocked_it->second.find(ss);
4846 if (ss_req_it != xlocked_it->second.end()) {
4847 const MMDSCacheRejoin::slave_reqid& r = ss_req_it->second;
4848 dout(10) << " dn xlock by " << r << " on " << *dn << dendl;
4849 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4850 ceph_assert(mdr->is_auth_pinned(dn));
4851 if (!mdr->is_xlocked(&dn->versionlock)) {
4852 ceph_assert(dn->versionlock.can_xlock_local());
4853 dn->versionlock.get_xlock(mdr, mdr->get_client());
4854 mdr->emplace_lock(&dn->versionlock, MutationImpl::LockOp::XLOCK);
4855 }
4856 if (dn->lock.is_stable())
4857 dn->auth_pin(&dn->lock);
4858 dn->lock.set_state(LOCK_XLOCK);
4859 dn->lock.get_xlock(mdr, mdr->get_client());
4860 mdr->emplace_lock(&dn->lock, MutationImpl::LockOp::XLOCK);
4861 }
4862 }
4863
4864 dn->add_replica(from, d.nonce);
4865 dout(10) << " have " << *dn << dendl;
4866
4867 if (dnl->is_primary()) {
4868 if (d.is_primary()) {
4869 if (vinodeno_t(d.ino, ss.snapid) != dnl->get_inode()->vino()) {
4870 // the survivor missed MDentryUnlink+MDentryLink messages ?
4871 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4872 CInode *in = get_inode(d.ino, ss.snapid);
4873 ceph_assert(in);
4874 ceph_assert(in->get_parent_dn());
4875 rejoin_unlinked_inodes[from].insert(in);
4876 dout(7) << " sender has primary dentry but wrong inode" << dendl;
4877 }
4878 } else {
4879 // the survivor missed MDentryLink message ?
4880 ceph_assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
4881 dout(7) << " sender doesn't have primay dentry" << dendl;
4882 }
4883 } else {
4884 if (d.is_primary()) {
4885 // the survivor missed MDentryUnlink message ?
4886 CInode *in = get_inode(d.ino, ss.snapid);
4887 ceph_assert(in);
4888 ceph_assert(in->get_parent_dn());
4889 rejoin_unlinked_inodes[from].insert(in);
4890 dout(7) << " sender has primary dentry but we don't" << dendl;
4891 }
4892 }
4893 }
4894 }
4895 }
4896
4897 for (const auto &p : strong->strong_inodes) {
4898 CInode *in = get_inode(p.first);
4899 ceph_assert(in);
4900 in->add_replica(from, p.second.nonce);
4901 dout(10) << " have " << *in << dendl;
4902
4903 const MMDSCacheRejoin::inode_strong& is = p.second;
4904
4905 // caps_wanted
4906 if (is.caps_wanted) {
4907 in->set_mds_caps_wanted(from, is.caps_wanted);
4908 dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
4909 << " on " << *in << dendl;
4910 }
4911
4912 // scatterlocks?
4913 // infer state from replica state:
4914 // * go to MIX if they might have wrlocks
4915 // * go to LOCK if they are LOCK (just bc identify_files_to_recover might start twiddling filelock)
4916 in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
4917 in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
4918 in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
4919
4920 // auth pin?
4921 const auto authpinned_inodes_it = strong->authpinned_inodes.find(in->vino());
4922 if (authpinned_inodes_it != strong->authpinned_inodes.end()) {
4923 for (const auto& r : authpinned_inodes_it->second) {
4924 dout(10) << " inode authpin by " << r << " on " << *in << dendl;
4925
4926 // get/create slave mdrequest
4927 MDRequestRef mdr;
4928 if (have_request(r.reqid))
4929 mdr = request_get(r.reqid);
4930 else
4931 mdr = request_start_slave(r.reqid, r.attempt, strong);
4932 if (strong->frozen_authpin_inodes.count(in->vino())) {
4933 ceph_assert(!in->get_num_auth_pins());
4934 mdr->freeze_auth_pin(in);
4935 } else {
4936 ceph_assert(!in->is_frozen_auth_pin());
4937 }
4938 mdr->auth_pin(in);
4939 }
4940 }
4941 // xlock(s)?
4942 const auto xlocked_inodes_it = strong->xlocked_inodes.find(in->vino());
4943 if (xlocked_inodes_it != strong->xlocked_inodes.end()) {
4944 for (const auto &q : xlocked_inodes_it->second) {
4945 SimpleLock *lock = in->get_lock(q.first);
4946 dout(10) << " inode xlock by " << q.second << " on " << *lock << " on " << *in << dendl;
4947 MDRequestRef mdr = request_get(q.second.reqid); // should have this from auth_pin above.
4948 ceph_assert(mdr->is_auth_pinned(in));
4949 if (!mdr->is_xlocked(&in->versionlock)) {
4950 ceph_assert(in->versionlock.can_xlock_local());
4951 in->versionlock.get_xlock(mdr, mdr->get_client());
4952 mdr->emplace_lock(&in->versionlock, MutationImpl::LockOp::XLOCK);
4953 }
4954 if (lock->is_stable())
4955 in->auth_pin(lock);
4956 lock->set_state(LOCK_XLOCK);
4957 if (lock == &in->filelock)
4958 in->loner_cap = -1;
4959 lock->get_xlock(mdr, mdr->get_client());
4960 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
4961 }
4962 }
4963 }
4964 // wrlock(s)?
4965 for (const auto &p : strong->wrlocked_inodes) {
4966 CInode *in = get_inode(p.first);
4967 for (const auto &q : p.second) {
4968 SimpleLock *lock = in->get_lock(q.first);
4969 for (const auto &r : q.second) {
4970 dout(10) << " inode wrlock by " << r << " on " << *lock << " on " << *in << dendl;
4971 MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above.
4972 if (in->is_auth())
4973 ceph_assert(mdr->is_auth_pinned(in));
4974 lock->set_state(LOCK_MIX);
4975 if (lock == &in->filelock)
4976 in->loner_cap = -1;
4977 lock->get_wrlock(true);
4978 mdr->emplace_lock(lock, MutationImpl::LockOp::WRLOCK);
4979 }
4980 }
4981 }
4982
4983 // done?
4984 ceph_assert(rejoin_gather.count(from));
4985 rejoin_gather.erase(from);
4986 if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) {
4987 rejoin_gather_finish();
4988 } else {
4989 dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl;
4990 }
4991 }
4992
/*
 * Handle an OP_ACK cache-rejoin message from another MDS.
 *
 * The ack tells us, for each dirfrag/dentry/inode we claimed replicas of,
 * the authoritative replica nonces and lock states.  We reconcile our cache
 * with the auth's view: adopt its fragmentation, repair stale dentry
 * linkage, decode full dirfrag/inode bases and lock states, and finish the
 * cap-export handshake for caps the auth imported from us.
 *
 * Runs both on recovering ranks (!survivor) and on survivors; survivors
 * only queue rejoin_waiters at the end instead of opening snaprealms.
 */
void MDCache::handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &ack)
{
  dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  ceph_assert(mds->get_state() >= MDSMap::STATE_REJOIN);
  bool survivor = !mds->is_rejoin();

  // for sending cache expire message
  set<CInode*> isolated_inodes;   // invented inodes not yet connected to the hierarchy
  set<CInode*> refragged_inodes;  // inodes whose dirfrags we force-matched to the auth's
  list<pair<CInode*,int> > updated_realms;  // (inode, CEPH_SNAP_OP_*) whose snaprealm seq changed

  // dirs
  for (const auto &p : ack->strong_dirfrags) {
    // we may have had incorrect dir fragmentation; refragment based
    // on what they auth tells us.
    CDir *dir = get_dirfrag(p.first);
    if (!dir) {
      // force-split/merge our frags to match the auth's frag
      dir = get_force_dirfrag(p.first, false);
      if (dir)
	refragged_inodes.insert(dir->get_inode());
    }
    if (!dir) {
      CInode *diri = get_inode(p.first.ino);
      if (!diri) {
	// barebones inode; the full inode loop below will clean up.
	diri = new CInode(this, false);
	diri->inode.ino = p.first.ino;
	diri->inode.mode = S_IFDIR;
	diri->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
	add_inode(diri);
	if (MDS_INO_MDSDIR(from) == p.first.ino) {
	  // sender's own mdsdir: we know its auth
	  diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN);
	  dout(10) << " add inode " << *diri << dendl;
	} else {
	  // not yet attached anywhere; must be linked by the dentry loop
	  // below, or the assert on isolated_inodes fires
	  diri->inode_auth = CDIR_AUTH_DEFAULT;
	  isolated_inodes.insert(diri);
	  dout(10) << " unconnected dirfrag " << p.first << dendl;
	}
      }
      // barebones dirfrag; the full dirfrag loop below will clean up.
      dir = diri->add_dirfrag(new CDir(diri, p.first.frag, this, false));
      if (MDS_INO_MDSDIR(from) == p.first.ino ||
	  (dir->authority() != CDIR_AUTH_UNDEF &&
	   dir->authority().first != from))
	adjust_subtree_auth(dir, from);
      dout(10) << " add dirfrag " << *dir << dendl;
    }

    dir->set_replica_nonce(p.second.nonce);
    dir->state_clear(CDir::STATE_REJOINING);
    dout(10) << " got " << *dir << dendl;

    // dentries
    auto it = ack->strong_dentries.find(p.first);
    if (it != ack->strong_dentries.end()) {
      for (const auto &q : it->second) {
	CDentry *dn = dir->lookup(q.first.name, q.first.snapid);
	if(!dn)
	  dn = dir->add_null_dentry(q.first.name, q.second.first, q.first.snapid);

	CDentry::linkage_t *dnl = dn->get_linkage();

	ceph_assert(dn->last == q.first.snapid);
	if (dn->first != q.second.first) {
	  dout(10) << " adjust dn.first " << dn->first << " -> " << q.second.first << " on " << *dn << dendl;
	  dn->first = q.second.first;
	}

	// may have bad linkage if we missed dentry link/unlink messages
	if (dnl->is_primary()) {
	  CInode *in = dnl->get_inode();
	  if (!q.second.is_primary() ||
	      vinodeno_t(q.second.ino, q.first.snapid) != in->vino()) {
	    dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl;
	    dir->unlink_inode(dn);
	  }
	} else if (dnl->is_remote()) {
	  if (!q.second.is_remote() ||
	      q.second.remote_ino != dnl->get_remote_ino() ||
	      q.second.remote_d_type != dnl->get_remote_d_type()) {
	    dout(10) << " had bad linkage for " << *dn << dendl;
	    dir->unlink_inode(dn);
	  }
	} else {
	  if (!q.second.is_null())
	    dout(10) << " had bad linkage for " << *dn << dendl;
	}

	// hmm, did we have the proper linkage here?
	if (dnl->is_null() && !q.second.is_null()) {
	  if (q.second.is_remote()) {
	    dn->dir->link_remote_inode(dn, q.second.remote_ino, q.second.remote_d_type);
	  } else {
	    CInode *in = get_inode(q.second.ino, q.first.snapid);
	    if (!in) {
	      // barebones inode; assume it's dir, the full inode loop below will clean up.
	      in = new CInode(this, false, q.second.first, q.first.snapid);
	      in->inode.ino = q.second.ino;
	      in->inode.mode = S_IFDIR;
	      in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
	      add_inode(in);
	      dout(10) << " add inode " << *in << dendl;
	    } else if (in->get_parent_dn()) {
	      // inode is linked somewhere else in our cache; trust the auth
	      // and relink it here
	      dout(10) << " had bad linkage for " << *(in->get_parent_dn())
		       << ", unlinking " << *in << dendl;
	      in->get_parent_dir()->unlink_inode(in->get_parent_dn());
	    }
	    dn->dir->link_primary_inode(dn, in);
	    isolated_inodes.erase(in);  // now connected to the hierarchy
	  }
	}

	dn->set_replica_nonce(q.second.nonce);
	dn->lock.set_state_rejoin(q.second.lock, rejoin_waiters, survivor);
	dn->state_clear(CDentry::STATE_REJOINING);
	dout(10) << " got " << *dn << dendl;
      }
    }
  }

  // close now-empty frags left over from forced refragmentation above
  for (const auto& in : refragged_inodes) {
    auto&& ls = in->get_nested_dirfrags();
    for (const auto& dir : ls) {
      if (dir->is_auth() || ack->strong_dirfrags.count(dir->dirfrag()))
	continue;
      ceph_assert(dir->get_num_any() == 0);
      in->close_dirfrag(dir->get_frag());
    }
  }

  // full dirfrags
  for (const auto &p : ack->dirfrag_bases) {
    CDir *dir = get_dirfrag(p.first);
    ceph_assert(dir);
    auto q = p.second.cbegin();
    dir->_decode_base(q);
    dout(10) << " got dir replica " << *dir << dendl;
  }

  // full inodes
  auto p = ack->inode_base.cbegin();
  while (!p.end()) {
    inodeno_t ino;
    snapid_t last;
    bufferlist basebl;
    decode(ino, p);
    decode(last, p);
    decode(basebl, p);
    CInode *in = get_inode(ino, last);
    ceph_assert(in);
    auto q = basebl.cbegin();
    // remember the snaprealm seq so we can detect that the decode changed it
    snapid_t sseq = 0;
    if (in->snaprealm)
      sseq = in->snaprealm->srnode.seq;
    in->_decode_base(q);
    if (in->snaprealm && in->snaprealm->srnode.seq != sseq) {
      // realm existed before -> update; realm is new to us -> split
      int snap_op = sseq > 0 ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT;
      updated_realms.push_back(pair<CInode*,int>(in, snap_op));
    }
    dout(10) << " got inode base " << *in << dendl;
  }

  // inodes
  p = ack->inode_locks.cbegin();
  //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl;
  while (!p.end()) {
    inodeno_t ino;
    snapid_t last;
    __u32 nonce;
    bufferlist lockbl;
    decode(ino, p);
    decode(last, p);
    decode(nonce, p);
    decode(lockbl, p);

    CInode *in = get_inode(ino, last);
    ceph_assert(in);
    in->set_replica_nonce(nonce);
    auto q = lockbl.cbegin();
    in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks, survivor);
    in->state_clear(CInode::STATE_REJOINING);
    dout(10) << " got inode locks " << *in << dendl;
  }

  // FIXME: This can happen if entire subtree, together with the inode subtree root
  // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack.
  ceph_assert(isolated_inodes.empty());

  // caps the auth imported from us: notify clients that their caps moved
  map<inodeno_t,map<client_t,Capability::Import> > peer_imported;
  auto bp = ack->imported_caps.cbegin();
  decode(peer_imported, bp);

  for (map<inodeno_t,map<client_t,Capability::Import> >::iterator p = peer_imported.begin();
       p != peer_imported.end();
       ++p) {
    auto& ex = cap_exports.at(p->first);
    ceph_assert(ex.first == from);
    for (map<client_t,Capability::Import>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      auto r = ex.second.find(q->first);
      ceph_assert(r != ex.second.end());

      dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl;
      Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
      if (!session) {
	dout(10) << " no session for client." << p->first << dendl;
	ex.second.erase(r);
	continue;
      }

      // mark client caps stale.
      auto m = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first, 0,
					 r->second.capinfo.cap_id, 0,
					 mds->get_osd_epoch_barrier());
      m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq,
		      (q->second.cap_id > 0 ? from : -1), 0);
      mds->send_message_client_counted(m, session);

      ex.second.erase(r);
    }
    ceph_assert(ex.second.empty());
  }

  // propagate snaprealm changes detected while decoding inode bases
  for (auto p : updated_realms) {
    CInode *in = p.first;
    bool notify_clients;
    if (mds->is_rejoin()) {
      // recovering: defer to open_snaprealms(); just pin the inode
      if (!rejoin_pending_snaprealms.count(in)) {
	in->get(CInode::PIN_OPENINGSNAPPARENTS);
	rejoin_pending_snaprealms.insert(in);
      }
      notify_clients = false;
    } else {
      // notify clients if I'm survivor
      notify_clients = true;
    }
    do_realm_invalidate_and_update_notify(in, p.second, notify_clients);
  }

  // done?
  ceph_assert(rejoin_ack_gather.count(from));
  rejoin_ack_gather.erase(from);
  if (!survivor) {
    if (rejoin_gather.empty()) {
      // eval unstable scatter locks after all wrlocks are rejoined.
      while (!rejoin_eval_locks.empty()) {
	SimpleLock *lock = rejoin_eval_locks.front();
	rejoin_eval_locks.pop_front();
	if (!lock->is_stable())
	  mds->locker->eval_gather(lock);
      }
    }

    if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too.
	rejoin_ack_gather.empty()) {
      // finally, kickstart past snap parent opens
      open_snaprealms();
    } else {
      dout(7) << "still need rejoin from (" << rejoin_gather << ")"
	      << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl;
    }
  } else {
    // survivor.
    mds->queue_waiters(rejoin_waiters);
  }
}
5262
5263 /**
5264 * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes
5265 *
5266 * FIXME: wait, can this actually happen? a survivor should generate cache trim
5267 * messages that clean these guys up...
5268 */
5269 void MDCache::rejoin_trim_undef_inodes()
5270 {
5271 dout(10) << "rejoin_trim_undef_inodes" << dendl;
5272
5273 while (!rejoin_undef_inodes.empty()) {
5274 set<CInode*>::iterator p = rejoin_undef_inodes.begin();
5275 CInode *in = *p;
5276 rejoin_undef_inodes.erase(p);
5277
5278 in->clear_replica_map();
5279
5280 // close out dirfrags
5281 if (in->is_dir()) {
5282 const auto&& dfls = in->get_dirfrags();
5283 for (const auto& dir : dfls) {
5284 dir->clear_replica_map();
5285
5286 for (auto &p : dir->items) {
5287 CDentry *dn = p.second;
5288 dn->clear_replica_map();
5289
5290 dout(10) << " trimming " << *dn << dendl;
5291 dir->remove_dentry(dn);
5292 }
5293
5294 dout(10) << " trimming " << *dir << dendl;
5295 in->close_dirfrag(dir->dirfrag().frag);
5296 }
5297 }
5298
5299 CDentry *dn = in->get_parent_dn();
5300 if (dn) {
5301 dn->clear_replica_map();
5302 dout(10) << " trimming " << *dn << dendl;
5303 dn->dir->remove_dentry(dn);
5304 } else {
5305 dout(10) << " trimming " << *in << dendl;
5306 remove_inode(in);
5307 }
5308 }
5309
5310 ceph_assert(rejoin_undef_inodes.empty());
5311 }
5312
5313 void MDCache::rejoin_gather_finish()
5314 {
5315 dout(10) << "rejoin_gather_finish" << dendl;
5316 ceph_assert(mds->is_rejoin());
5317 ceph_assert(rejoin_ack_gather.count(mds->get_nodeid()));
5318
5319 if (open_undef_inodes_dirfrags())
5320 return;
5321
5322 if (process_imported_caps())
5323 return;
5324
5325 choose_lock_states_and_reconnect_caps();
5326
5327 identify_files_to_recover();
5328 rejoin_send_acks();
5329
5330 // signal completion of fetches, rejoin_gather_finish, etc.
5331 rejoin_ack_gather.erase(mds->get_nodeid());
5332
5333 // did we already get our acks too?
5334 if (rejoin_ack_gather.empty()) {
5335 // finally, open snaprealms
5336 open_snaprealms();
5337 }
5338 }
5339
5340 class C_MDC_RejoinOpenInoFinish: public MDCacheContext {
5341 inodeno_t ino;
5342 public:
5343 C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
5344 void finish(int r) override {
5345 mdcache->rejoin_open_ino_finish(ino, r);
5346 }
5347 };
5348
5349 void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret)
5350 {
5351 dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl;
5352
5353 if (ret < 0) {
5354 cap_imports_missing.insert(ino);
5355 } else if (ret == mds->get_nodeid()) {
5356 ceph_assert(get_inode(ino));
5357 } else {
5358 auto p = cap_imports.find(ino);
5359 ceph_assert(p != cap_imports.end());
5360 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5361 ceph_assert(q->second.count(MDS_RANK_NONE));
5362 ceph_assert(q->second.size() == 1);
5363 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5364 }
5365 cap_imports.erase(p);
5366 }
5367
5368 ceph_assert(cap_imports_num_opening > 0);
5369 cap_imports_num_opening--;
5370
5371 if (cap_imports_num_opening == 0) {
5372 if (rejoin_gather.empty())
5373 rejoin_gather_finish();
5374 else if (rejoin_gather.count(mds->get_nodeid()))
5375 process_imported_caps();
5376 }
5377 }
5378
// Log-flush callback used by process_imported_caps() when force-opening
// sessions for reconnecting clients.  The caller fills in session_map
// (via prepare_force_open_sessions) before the ESessions entry is
// submitted; once the entry is durable we hand the map to
// rejoin_open_sessions_finish().
class C_MDC_RejoinSessionsOpened : public MDCacheLogContext {
public:
  // client -> (session, connection sequence); populated by the caller
  map<client_t,pair<Session*,uint64_t> > session_map;
  C_MDC_RejoinSessionsOpened(MDCache *c) : MDCacheLogContext(c) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    mdcache->rejoin_open_sessions_finish(session_map);
  }
};
5388
5389 void MDCache::rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map)
5390 {
5391 dout(10) << "rejoin_open_sessions_finish" << dendl;
5392 mds->server->finish_force_open_sessions(session_map);
5393 rejoin_session_map.swap(session_map);
5394 if (rejoin_gather.empty())
5395 rejoin_gather_finish();
5396 }
5397
5398 void MDCache::rejoin_prefetch_ino_finish(inodeno_t ino, int ret)
5399 {
5400 auto p = cap_imports.find(ino);
5401 if (p != cap_imports.end()) {
5402 dout(10) << __func__ << " ino " << ino << " ret " << ret << dendl;
5403 if (ret < 0) {
5404 cap_imports_missing.insert(ino);
5405 } else if (ret != mds->get_nodeid()) {
5406 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5407 ceph_assert(q->second.count(MDS_RANK_NONE));
5408 ceph_assert(q->second.size() == 1);
5409 rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret);
5410 }
5411 cap_imports.erase(p);
5412 }
5413 }
5414 }
5415
/*
 * Process client caps gathered during reconnect/rejoin.
 *
 * Returns true when more asynchronous work was scheduled (open file table
 * prefetch, open_ino() lookups, or session journaling) and this function
 * will be re-invoked later; returns false when cap processing is done as
 * far as currently possible.
 */
bool MDCache::process_imported_caps()
{
  dout(10) << "process_imported_caps" << dendl;

  // first wait for the open file table prefetch to pull known inodes in
  if (!open_file_table.is_prefetched() &&
      open_file_table.prefetch_inodes()) {
    open_file_table.wait_for_prefetch(
	new MDSInternalContextWrapper(mds,
	  new LambdaContext([this](int r) {
	    ceph_assert(rejoin_gather.count(mds->get_nodeid()));
	    process_imported_caps();
	    })
	  )
	);
    return true;
  }

  // kick off open_ino() for cap inodes that still aren't in cache
  for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
    CInode *in = get_inode(p->first);
    if (in) {
      ceph_assert(in->is_auth());
      cap_imports_missing.erase(p->first);
      continue;
    }
    if (cap_imports_missing.count(p->first) > 0)
      continue;  // already known to be missing; don't retry

    cap_imports_num_opening++;
    dout(10) << " opening missing ino " << p->first << dendl;
    open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false);
    if (!(cap_imports_num_opening % 1000))
      mds->heartbeat_reset();
  }

  if (cap_imports_num_opening > 0)
    return true;  // rejoin_open_ino_finish() will call us back

  // called by rejoin_gather_finish() ?
  if (rejoin_gather.count(mds->get_nodeid()) == 0) {
    // sessions for reconnecting clients must be opened (and journaled)
    // before any caps can be attached to them
    if (!rejoin_client_map.empty() &&
	rejoin_session_map.empty()) {
      C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this);
      version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map,
							      rejoin_client_metadata_map,
							      finish->session_map);
      ESessions *le = new ESessions(pv, std::move(rejoin_client_map),
				    std::move(rejoin_client_metadata_map));
      mds->mdlog->start_submit_entry(le, finish);
      mds->mdlog->flush();
      // maps were moved into the log entry above; clear the husks
      rejoin_client_map.clear();
      rejoin_client_metadata_map.clear();
      return true;
    }

    // process caps that were exported by slave rename
    for (map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > >::iterator p = rejoin_slave_exports.begin();
	 p != rejoin_slave_exports.end();
	 ++p) {
      CInode *in = get_inode(p->first);
      ceph_assert(in);
      for (map<client_t,Capability::Export>::iterator q = p->second.second.begin();
	   q != p->second.second.end();
	   ++q) {
	auto r = rejoin_session_map.find(q->first);
	if (r == rejoin_session_map.end())
	  continue;  // client did not reconnect

	Session *session = r->second.first;
	Capability *cap = in->get_client_cap(q->first);
	if (!cap) {
	  cap = in->add_client_cap(q->first, session);
	  // add empty item to reconnected_caps
	  (void)reconnected_caps[p->first][q->first];
	}
	cap->merge(q->second, true);

	Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first];
	ceph_assert(cap->get_last_seq() == im.issue_seq);
	ceph_assert(cap->get_mseq() == im.mseq);
	cap->set_cap_id(im.cap_id);
	// send cap import because we assigned a new cap ID
	do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1,
		      p->second.first, CEPH_CAP_FLAG_AUTH);
      }
    }
    rejoin_slave_exports.clear();
    rejoin_imported_caps.clear();

    // process cap imports
    //  ino -> client -> frommds -> capex
    for (auto p = cap_imports.begin(); p != cap_imports.end(); ) {
      CInode *in = get_inode(p->first);
      if (!in) {
	// inode may appear later once client requests are replayed;
	// keep this entry and move on
	dout(10) << " still missing ino " << p->first
	         << ", will try again after replayed client requests" << dendl;
	++p;
	continue;
      }
      ceph_assert(in->is_auth());
      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
	Session *session;
	{
	  auto r = rejoin_session_map.find(q->first);
	  session = (r != rejoin_session_map.end() ? r->second.first : nullptr);
	}

	for (auto r = q->second.begin(); r != q->second.end(); ++r) {
	  if (!session) {
	    // no reconnected session: still record a (zeroed) import entry
	    // for exporting ranks so they learn the import "happened"
	    if (r->first >= 0)
	      (void)rejoin_imported_caps[r->first][p->first][q->first]; // all are zero
	    continue;
	  }

	  Capability *cap = in->reconnect_cap(q->first, r->second, session);
	  add_reconnected_cap(q->first, in->ino(), r->second);
	  if (r->first >= 0) {
	    if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
	      cap->inc_mseq();
	    do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0);

	    // record what we issued so the exporting rank can be told
	    Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first];
	    im.cap_id = cap->get_cap_id();
	    im.issue_seq = cap->get_last_seq();
	    im.mseq = cap->get_mseq();
	  }
	}
      }
      cap_imports.erase(p++);  // remove and move on
    }
  } else {
    // called directly (not from rejoin_gather_finish): we were waiting on
    // cap inodes before we could trim and send our pending rejoins
    trim_non_auth();

    ceph_assert(rejoin_gather.count(mds->get_nodeid()));
    rejoin_gather.erase(mds->get_nodeid());
    ceph_assert(!rejoin_ack_gather.count(mds->get_nodeid()));
    maybe_send_pending_rejoins();
  }
  return false;
}
5555
/*
 * Rebuild the "need snapflush" bookkeeping for a client cap reconnected
 * with a snap_follows value: for every snapshot between snap_follows and
 * the head inode's first snapid, the client still owes us a snapflush.
 *
 * Walks the older snapped CInodes of head_in and registers each covered
 * snapid with add_need_snapflush(); inodes that gain a pending flush get
 * their locks put into LOCK_SNAP_SYNC with a wrlock held until the
 * flush arrives (see Locker::mark_need_snapflush_inode).
 */
void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm,
				     client_t client, snapid_t snap_follows)
{
  dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl;

  // nothing owed if no snapshots exist in (snap_follows, head_in->first)
  if (!realm->has_snaps_in_range(snap_follows + 1, head_in->first - 1))
    return;

  const set<snapid_t>& snaps = realm->get_snaps();
  snapid_t follows = snap_follows;

  // walk snapped inodes from oldest relevant towards the head
  while (true) {
    CInode *in = pick_inode_snap(head_in, follows);
    if (in == head_in)
      break;  // reached the head; done

    // register every realm snapid that falls inside this inode's
    // [first, last] range (and past what the client already flushed)
    bool need_snapflush = false;
    for (auto p = snaps.lower_bound(std::max<snapid_t>(in->first, (follows + 1)));
	 p != snaps.end() && *p <= in->last;
	 ++p) {
      head_in->add_need_snapflush(in, *p, client);
      need_snapflush = true;
    }
    follows = in->last;
    if (!need_snapflush)
      continue;

    dout(10) << " need snapflush from client." << client << " on " << *in << dendl;

    // first client to owe a flush on this inode: freeze the relevant
    // locks in LOCK_SNAP_SYNC with a wrlock until the flush arrives
    if (in->client_snap_caps.empty()) {
      for (int i = 0; i < num_cinode_locks; i++) {
	int lockid = cinode_lock_info[i].lock;
	SimpleLock *lock = in->get_lock(lockid);
	ceph_assert(lock);
	in->auth_pin(lock);
	lock->set_state(LOCK_SNAP_SYNC);
	lock->get_wrlock(true);
      }
    }
    in->client_snap_caps.insert(client);
    mds->locker->mark_need_snapflush_inode(in);
  }
}
5599
5600 /*
5601 * choose lock states based on reconnected caps
5602 */
5603 void MDCache::choose_lock_states_and_reconnect_caps()
5604 {
5605 dout(10) << "choose_lock_states_and_reconnect_caps" << dendl;
5606
5607 int count = 0;
5608 for (auto p : inode_map) {
5609 CInode *in = p.second;
5610 if (in->last != CEPH_NOSNAP)
5611 continue;
5612
5613 if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat())
5614 in->mark_dirty_rstat();
5615
5616 int dirty_caps = 0;
5617 auto q = reconnected_caps.find(in->ino());
5618 if (q != reconnected_caps.end()) {
5619 for (const auto &it : q->second)
5620 dirty_caps |= it.second.dirty_caps;
5621 }
5622 in->choose_lock_states(dirty_caps);
5623 dout(15) << " chose lock states on " << *in << dendl;
5624
5625 if (in->snaprealm && !rejoin_pending_snaprealms.count(in)) {
5626 in->get(CInode::PIN_OPENINGSNAPPARENTS);
5627 rejoin_pending_snaprealms.insert(in);
5628 }
5629
5630 if (!(++count % 1000))
5631 mds->heartbeat_reset();
5632 }
5633 }
5634
5635 void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
5636 map<client_t,ref_t<MClientSnap>>& splits)
5637 {
5638 ref_t<MClientSnap> snap;
5639 auto it = splits.find(client);
5640 if (it != splits.end()) {
5641 snap = it->second;
5642 snap->head.op = CEPH_SNAP_OP_SPLIT;
5643 } else {
5644 snap = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
5645 splits.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
5646 snap->head.split = realm->inode->ino();
5647 snap->bl = realm->get_snap_trace();
5648
5649 for (const auto& child : realm->open_children)
5650 snap->split_realms.push_back(child->inode->ino());
5651 }
5652 snap->split_inos.push_back(ino);
5653 }
5654
5655 void MDCache::prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm,
5656 map<client_t,ref_t<MClientSnap>>& splits)
5657 {
5658 ceph_assert(parent_realm);
5659
5660 vector<inodeno_t> split_inos;
5661 vector<inodeno_t> split_realms;
5662
5663 for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
5664 !p.end();
5665 ++p)
5666 split_inos.push_back((*p)->ino());
5667 for (set<SnapRealm*>::iterator p = realm->open_children.begin();
5668 p != realm->open_children.end();
5669 ++p)
5670 split_realms.push_back((*p)->inode->ino());
5671
5672 for (const auto& p : realm->client_caps) {
5673 ceph_assert(!p.second->empty());
5674 auto em = splits.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple());
5675 if (em.second) {
5676 auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
5677 update->head.split = parent_realm->inode->ino();
5678 update->split_inos = split_inos;
5679 update->split_realms = split_realms;
5680 update->bl = parent_realm->get_snap_trace();
5681 em.first->second = std::move(update);
5682 }
5683 }
5684 }
5685
5686 void MDCache::send_snaps(map<client_t,ref_t<MClientSnap>>& splits)
5687 {
5688 dout(10) << "send_snaps" << dendl;
5689
5690 for (auto &p : splits) {
5691 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p.first.v));
5692 if (session) {
5693 dout(10) << " client." << p.first
5694 << " split " << p.second->head.split
5695 << " inos " << p.second->split_inos
5696 << dendl;
5697 mds->send_message_client_counted(p.second, session);
5698 } else {
5699 dout(10) << " no session for client." << p.first << dendl;
5700 }
5701 }
5702 splits.clear();
5703 }
5704
5705
5706 /*
5707 * remove any items from logsegment open_file lists that don't have
5708 * any caps
5709 */
5710 void MDCache::clean_open_file_lists()
5711 {
5712 dout(10) << "clean_open_file_lists" << dendl;
5713
5714 for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
5715 p != mds->mdlog->segments.end();
5716 ++p) {
5717 LogSegment *ls = p->second;
5718
5719 elist<CInode*>::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file));
5720 while (!q.end()) {
5721 CInode *in = *q;
5722 ++q;
5723 if (in->last == CEPH_NOSNAP) {
5724 dout(10) << " unlisting unwanted/capless inode " << *in << dendl;
5725 in->item_open_file.remove_myself();
5726 } else {
5727 if (in->client_snap_caps.empty()) {
5728 dout(10) << " unlisting flushed snap inode " << *in << dendl;
5729 in->item_open_file.remove_myself();
5730 }
5731 }
5732 }
5733 }
5734 }
5735
5736 void MDCache::dump_openfiles(Formatter *f)
5737 {
5738 f->open_array_section("openfiles");
5739 for (auto p = mds->mdlog->segments.begin();
5740 p != mds->mdlog->segments.end();
5741 ++p) {
5742 LogSegment *ls = p->second;
5743
5744 auto q = ls->open_files.begin(member_offset(CInode, item_open_file));
5745 while (!q.end()) {
5746 CInode *in = *q;
5747 ++q;
5748 if ((in->last == CEPH_NOSNAP && !in->is_any_caps_wanted())
5749 || (in->last != CEPH_NOSNAP && in->client_snap_caps.empty()))
5750 continue;
5751 f->open_object_section("file");
5752 in->dump(f, CInode::DUMP_PATH | CInode::DUMP_INODE_STORE_BASE | CInode::DUMP_CAPS);
5753 f->close_section();
5754 }
5755 }
5756 f->close_section();
5757 }
5758
// Re-create a client capability on this MDS during rejoin, from a
// cap_reconnect_t record. If frommds >= 0 the cap is arriving from another
// rank and the client is told about the import via do_cap_import().
// Returns the (possibly pre-existing) Capability, or NULL if the client no
// longer has a session.
Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds)
{
  dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds
	   << " on " << *in << dendl;
  // Without a live session there is nobody to import the cap for.
  Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
  if (!session) {
    dout(10) << " no session for client." << client << dendl;
    return NULL;
  }

  Capability *cap = in->reconnect_cap(client, icr, session);

  if (frommds >= 0) {
    if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists
      cap->inc_mseq();
    // Tell the client the cap moved here from rank `frommds`.
    do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0);
  }

  return cap;
}
5779
5780 void MDCache::export_remaining_imported_caps()
5781 {
5782 dout(10) << "export_remaining_imported_caps" << dendl;
5783
5784 stringstream warn_str;
5785
5786 int count = 0;
5787 for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) {
5788 warn_str << " ino " << p->first << "\n";
5789 for (auto q = p->second.begin(); q != p->second.end(); ++q) {
5790 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
5791 if (session) {
5792 // mark client caps stale.
5793 auto stale = make_message<MClientCaps>(CEPH_CAP_OP_EXPORT, p->first,
5794 0, 0, 0,
5795 mds->get_osd_epoch_barrier());
5796 stale->set_cap_peer(0, 0, 0, -1, 0);
5797 mds->send_message_client_counted(stale, q->first);
5798 }
5799 }
5800
5801 if (!(++count % 1000))
5802 mds->heartbeat_reset();
5803 }
5804
5805 for (map<inodeno_t, MDSContext::vec >::iterator p = cap_reconnect_waiters.begin();
5806 p != cap_reconnect_waiters.end();
5807 ++p)
5808 mds->queue_waiters(p->second);
5809
5810 cap_imports.clear();
5811 cap_reconnect_waiters.clear();
5812
5813 if (warn_str.peek() != EOF) {
5814 mds->clog->warn() << "failed to reconnect caps for missing inodes:";
5815 mds->clog->warn(warn_str);
5816 }
5817 }
5818
// If this session reported a cap for `in` during client reconnect, re-create
// the cap now, pick appropriate lock states, and wake any waiters blocked on
// the reconnect. Returns the new cap, or nullptr if no reconnect record
// exists for this (inode, client).
Capability* MDCache::try_reconnect_cap(CInode *in, Session *session)
{
  client_t client = session->info.get_client();
  Capability *cap = nullptr;
  const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client);
  if (rc) {
    cap = in->reconnect_cap(client, *rc, session);
    dout(10) << "try_reconnect_cap client." << client
	     << " reconnect wanted " << ccap_string(rc->capinfo.wanted)
	     << " issue " << ccap_string(rc->capinfo.issued)
	     << " on " << *in << dendl;
    // consume the record so it is only applied once
    remove_replay_cap_reconnect(in->ino(), client);

    if (in->is_replicated()) {
      // replicated inode: let normal lock evaluation sort out the states
      mds->locker->try_eval(in, CEPH_CAP_LOCKS);
    } else {
      // unreplicated: choose lock states directly, biased by whatever caps
      // the client reported dirty at reconnect (if any were recorded).
      int dirty_caps = 0;
      auto p = reconnected_caps.find(in->ino());
      if (p != reconnected_caps.end()) {
	auto q = p->second.find(client);
	if (q != p->second.end())
	  dirty_caps = q->second.dirty_caps;
      }
      in->choose_lock_states(dirty_caps);
      dout(15) << " chose lock states on " << *in << dendl;
    }

    // wake anyone who was waiting for this inode's cap to be reconnected
    map<inodeno_t, MDSContext::vec >::iterator it =
      cap_reconnect_waiters.find(in->ino());
    if (it != cap_reconnect_waiters.end()) {
      mds->queue_waiters(it->second);
      cap_reconnect_waiters.erase(it);
    }
  }
  return cap;
}
5855
5856
5857
5858 // -------
5859 // cap imports and delayed snap parent opens
5860
// Send a CEPH_CAP_OP_IMPORT message telling the client that `cap` on `in`
// now lives on this MDS. The p_* arguments describe the cap on the peer
// (exporting) MDS so the client can match import against export; `peer` is
// the exporting rank (or -1/none for rejoin-created caps).
void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap,
			    uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
			    int peer, int p_flags)
{
  SnapRealm *realm = in->find_snaprealm();
  if (realm->have_past_parents_open()) {
    dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl;
    if (cap->get_last_seq() == 0) // reconnected cap
      cap->inc_last_seq();
    cap->set_last_issue();
    cap->set_last_issue_stamp(ceph_clock_now());
    cap->clear_new();
    auto reap = make_message<MClientCaps>(
      CEPH_CAP_OP_IMPORT, in->ino(), realm->inode->ino(), cap->get_cap_id(),
      cap->get_last_seq(), cap->pending(), cap->wanted(), 0, cap->get_mseq(),
      mds->get_osd_epoch_barrier());
    in->encode_cap_message(reap, cap);
    // include the snap trace so the client's view of the realm is current
    reap->snapbl = realm->get_snap_trace();
    reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags);
    mds->send_message_client_counted(reap, session);
  } else {
    // callers are expected to only reach here with past parents open;
    // a closed realm here is a logic error, so die loudly.
    ceph_abort();
  }
}
5885
// Process delayed cap imports. Nothing is ever queued on
// delayed_imported_caps any more, so this only asserts the map is empty.
void MDCache::do_delayed_cap_imports()
{
  dout(10) << "do_delayed_cap_imports" << dendl;

  ceph_assert(delayed_imported_caps.empty());
}
5892
// Gather finisher used by open_snaprealms(): once all pending past-parent
// opens complete, re-run open_snaprealms() to make another pass.
struct C_MDC_OpenSnapRealms : public MDCacheContext {
  explicit C_MDC_OpenSnapRealms(MDCache *c) : MDCacheContext(c) {}
  void finish(int r) override {
    mdcache->open_snaprealms();
  }
};
5899
5900 void MDCache::open_snaprealms()
5901 {
5902 dout(10) << "open_snaprealms" << dendl;
5903
5904 MDSGatherBuilder gather(g_ceph_context);
5905
5906 auto it = rejoin_pending_snaprealms.begin();
5907 while (it != rejoin_pending_snaprealms.end()) {
5908 CInode *in = *it;
5909 SnapRealm *realm = in->snaprealm;
5910 ceph_assert(realm);
5911 if (realm->have_past_parents_open() ||
5912 realm->open_parents(gather.new_sub())) {
5913 dout(10) << " past parents now open on " << *in << dendl;
5914
5915 map<client_t,ref_t<MClientSnap>> splits;
5916 // finish off client snaprealm reconnects?
5917 map<inodeno_t,map<client_t,snapid_t> >::iterator q = reconnected_snaprealms.find(in->ino());
5918 if (q != reconnected_snaprealms.end()) {
5919 for (const auto& r : q->second)
5920 finish_snaprealm_reconnect(r.first, realm, r.second, splits);
5921 reconnected_snaprealms.erase(q);
5922 }
5923
5924 for (elist<CInode*>::iterator p = realm->inodes_with_caps.begin(member_offset(CInode, item_caps));
5925 !p.end(); ++p) {
5926 CInode *child = *p;
5927 auto q = reconnected_caps.find(child->ino());
5928 ceph_assert(q != reconnected_caps.end());
5929 for (auto r = q->second.begin(); r != q->second.end(); ++r) {
5930 Capability *cap = child->get_client_cap(r->first);
5931 if (!cap)
5932 continue;
5933 if (r->second.snap_follows > 0) {
5934 if (r->second.snap_follows < child->first - 1) {
5935 rebuild_need_snapflush(child, realm, r->first, r->second.snap_follows);
5936 } else if (r->second.snapflush) {
5937 // When processing a cap flush message that is re-sent, it's possble
5938 // that the sender has already released all WR caps. So we should
5939 // force MDCache::cow_inode() to setup CInode::client_need_snapflush.
5940 cap->mark_needsnapflush();
5941 }
5942 }
5943 // make sure client's cap is in the correct snaprealm.
5944 if (r->second.realm_ino != in->ino()) {
5945 prepare_realm_split(realm, r->first, child->ino(), splits);
5946 }
5947 }
5948 }
5949
5950 rejoin_pending_snaprealms.erase(it++);
5951 in->put(CInode::PIN_OPENINGSNAPPARENTS);
5952
5953 send_snaps(splits);
5954 } else {
5955 dout(10) << " opening past parents on " << *in << dendl;
5956 ++it;
5957 }
5958 }
5959
5960 if (gather.has_subs()) {
5961 if (gather.num_subs_remaining() == 0) {
5962 // cleanup gather
5963 gather.set_finisher(new C_MDSInternalNoop);
5964 gather.activate();
5965 } else {
5966 // for multimds, must succeed the first time
5967 ceph_assert(recovery_set.empty());
5968
5969 dout(10) << "open_snaprealms - waiting for "
5970 << gather.num_subs_remaining() << dendl;
5971 gather.set_finisher(new C_MDC_OpenSnapRealms(this));
5972 gather.activate();
5973 return;
5974 }
5975 }
5976
5977 notify_global_snaprealm_update(CEPH_SNAP_OP_UPDATE);
5978
5979 if (!reconnected_snaprealms.empty()) {
5980 dout(5) << "open_snaprealms has unconnected snaprealm:" << dendl;
5981 for (auto& p : reconnected_snaprealms) {
5982 stringstream warn_str;
5983 warn_str << " " << p.first << " {";
5984 bool first = true;
5985 for (auto& q : p.second) {
5986 if (!first)
5987 warn_str << ", ";
5988 warn_str << "client." << q.first << "/" << q.second;
5989 }
5990 warn_str << "}";
5991 dout(5) << warn_str.str() << dendl;
5992 }
5993 }
5994 ceph_assert(rejoin_waiters.empty());
5995 ceph_assert(rejoin_pending_snaprealms.empty());
5996 dout(10) << "open_snaprealms - all open" << dendl;
5997 do_delayed_cap_imports();
5998
5999 ceph_assert(rejoin_done);
6000 rejoin_done.release()->complete(0);
6001 reconnected_caps.clear();
6002 }
6003
6004 bool MDCache::open_undef_inodes_dirfrags()
6005 {
6006 dout(10) << "open_undef_inodes_dirfrags "
6007 << rejoin_undef_inodes.size() << " inodes "
6008 << rejoin_undef_dirfrags.size() << " dirfrags" << dendl;
6009
6010 set<CDir*> fetch_queue = rejoin_undef_dirfrags;
6011
6012 for (set<CInode*>::iterator p = rejoin_undef_inodes.begin();
6013 p != rejoin_undef_inodes.end();
6014 ++p) {
6015 CInode *in = *p;
6016 ceph_assert(!in->is_base());
6017 fetch_queue.insert(in->get_parent_dir());
6018 }
6019
6020 if (fetch_queue.empty())
6021 return false;
6022
6023 MDSGatherBuilder gather(g_ceph_context,
6024 new MDSInternalContextWrapper(mds,
6025 new LambdaContext([this](int r) {
6026 if (rejoin_gather.empty())
6027 rejoin_gather_finish();
6028 })
6029 )
6030 );
6031
6032 for (set<CDir*>::iterator p = fetch_queue.begin();
6033 p != fetch_queue.end();
6034 ++p) {
6035 CDir *dir = *p;
6036 CInode *diri = dir->get_inode();
6037 if (diri->state_test(CInode::STATE_REJOINUNDEF))
6038 continue;
6039 if (dir->state_test(CDir::STATE_REJOINUNDEF))
6040 ceph_assert(diri->dirfragtree.is_leaf(dir->get_frag()));
6041 dir->fetch(gather.new_sub());
6042 }
6043 ceph_assert(gather.has_subs());
6044 gather.activate();
6045 return true;
6046 }
6047
// Called when a previously-undefined inode has been fetched. For a
// directory inode whose real fragtree disagrees with the single frag_t()
// dirfrag we had provisionally opened, re-split the dirfrags and re-mark
// the resulting frags as undefined so they get fetched too.
void MDCache::opened_undef_inode(CInode *in) {
  dout(10) << "opened_undef_inode " << *in << dendl;
  rejoin_undef_inodes.erase(in);
  if (in->is_dir()) {
    // FIXME: re-hash dentries if necessary
    ceph_assert(in->inode.dir_layout.dl_dir_hash == g_conf()->mds_default_dir_hash);
    if (in->get_num_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) {
      // we had opened the whole-tree frag, but the tree is actually split
      CDir *dir = in->get_dirfrag(frag_t());
      ceph_assert(dir);
      rejoin_undef_dirfrags.erase(dir);
      in->force_dirfrags();
      // all the new leaf frags now need fetching
      auto&& ls = in->get_dirfrags();
      for (const auto& dir : ls) {
	rejoin_undef_dirfrags.insert(dir);
      }
    }
  }
}
6066
6067 void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
6068 map<client_t,ref_t<MClientSnap>>& updates)
6069 {
6070 if (seq < realm->get_newest_seq()) {
6071 dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < "
6072 << realm->get_newest_seq() << " on " << *realm << dendl;
6073 auto snap = make_message<MClientSnap>(CEPH_SNAP_OP_UPDATE);
6074 snap->bl = realm->get_snap_trace();
6075 for (const auto& child : realm->open_children)
6076 snap->split_realms.push_back(child->inode->ino());
6077 updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple(snap));
6078 } else {
6079 dout(10) << "finish_snaprealm_reconnect client." << client << " up to date"
6080 << " on " << *realm << dendl;
6081 }
6082 }
6083
6084
6085
// Send MMDSCacheRejoin OP_ACK messages to every recovering peer that has
// not been acked yet. The acks carry strong dirfrag/dentry/inode state
// (with freshly bumped replica nonces — note the ++r.second side effects)
// for everything we are auth for in each subtree, plus base inodes,
// scatterlock updates, and the caps we imported on the peer's behalf.
void MDCache::rejoin_send_acks()
{
  dout(7) << "rejoin_send_acks" << dendl;

  // replicate stray
  // For inodes that were unlinked while a peer was down, walk up from the
  // inode to its (stray) ancestry and make sure the peer replicates the
  // whole path, stopping as soon as we hit something it already replicates.
  for (map<mds_rank_t, set<CInode*> >::iterator p = rejoin_unlinked_inodes.begin();
       p != rejoin_unlinked_inodes.end();
       ++p) {
    for (set<CInode*>::iterator q = p->second.begin();
	 q != p->second.end();
	 ++q) {
      CInode *in = *q;
      dout(7) << " unlinked inode " << *in << dendl;
      // inode expired
      if (!in->is_replica(p->first))
	continue;
      while (1) {
	CDentry *dn = in->get_parent_dn();
	if (dn->is_replica(p->first))
	  break;
	dn->add_replica(p->first);
	CDir *dir = dn->get_dir();
	if (dir->is_replica(p->first))
	  break;
	dir->add_replica(p->first);
	in = dir->get_inode();
	if (in->is_replica(p->first))
	  break;
	in->add_replica(p->first);
	if (in->is_base())
	  break;
      }
    }
  }
  rejoin_unlinked_inodes.clear();

  // send acks to everyone in the recovery set
  map<mds_rank_t,ref_t<MMDSCacheRejoin>> acks;
  for (set<mds_rank_t>::iterator p = recovery_set.begin();
       p != recovery_set.end();
       ++p) {
    if (rejoin_ack_sent.count(*p))
      continue;
    acks[*p] = make_message<MMDSCacheRejoin>(MMDSCacheRejoin::OP_ACK);
  }

  rejoin_ack_sent = recovery_set;

  // walk subtrees
  for (map<CDir*,set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    CDir *dir = p->first;
    if (!dir->is_auth())
      continue;
    dout(10) << "subtree " << *dir << dendl;

    // auth items in this subtree
    // breadth-first walk of all dirfrags under this subtree root
    std::queue<CDir*> dq;
    dq.push(dir);

    while (!dq.empty()) {
      CDir *dir = dq.front();
      dq.pop();

      // dir
      for (auto &r : dir->get_replicas()) {
	auto it = acks.find(r.first);
	if (it == acks.end())
	  continue;
	// ++r.second bumps the replica nonce we track for this peer
	it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep);
	it->second->add_dirfrag_base(dir);
      }

      for (auto &p : dir->items) {
	CDentry *dn = p.second;
	CDentry::linkage_t *dnl = dn->get_linkage();

	// inode
	CInode *in = NULL;
	if (dnl->is_primary())
	  in = dnl->get_inode();

	// dentry
	for (auto &r : dn->get_replicas()) {
	  auto it = acks.find(r.first);
	  if (it == acks.end())
	    continue;
	  it->second->add_strong_dentry(dir->dirfrag(), dn->get_name(), dn->first, dn->last,
					dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0),
					dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0),
					dnl->is_remote() ? dnl->get_remote_d_type():0,
					++r.second,
					dn->lock.get_replica_state());
	  // peer missed MDentrylink message ?
	  if (in && !in->is_replica(r.first))
	    in->add_replica(r.first);
	}

	if (!in)
	  continue;

	for (auto &r : in->get_replicas()) {
	  auto it = acks.find(r.first);
	  if (it == acks.end())
	    continue;
	  it->second->add_inode_base(in, mds->mdsmap->get_up_features());
	  bufferlist bl;
	  in->_encode_locks_state_for_rejoin(bl, r.first);
	  it->second->add_inode_locks(in, ++r.second, bl);
	}

	// subdirs in this subtree?
	{
	  auto&& dirs = in->get_nested_dirfrags();
	  for (const auto& dir : dirs) {
	    dq.push(dir);
	  }
	}
      }
    }
  }

  // base inodes too
  if (root && root->is_auth())
    for (auto &r : root->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(root, mds->mdsmap->get_up_features());
      bufferlist bl;
      root->_encode_locks_state_for_rejoin(bl, r.first);
      it->second->add_inode_locks(root, ++r.second, bl);
    }
  if (myin)
    for (auto &r : myin->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(myin, mds->mdsmap->get_up_features());
      bufferlist bl;
      myin->_encode_locks_state_for_rejoin(bl, r.first);
      it->second->add_inode_locks(myin, ++r.second, bl);
    }

  // include inode base for any inodes whose scatterlocks may have updated
  for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin();
       p != rejoin_potential_updated_scatterlocks.end();
       ++p) {
    CInode *in = *p;
    for (const auto &r : in->get_replicas()) {
      auto it = acks.find(r.first);
      if (it == acks.end())
	continue;
      it->second->add_inode_base(in, mds->mdsmap->get_up_features());
    }
  }

  // send acks
  for (auto p = acks.begin(); p != acks.end(); ++p) {
    // piggyback the caps we imported on this peer's behalf
    encode(rejoin_imported_caps[p->first], p->second->imported_caps);
    mds->send_message_mds(p->second, p->first);
  }

  rejoin_imported_caps.clear();
}
6252
// Waiter used by reissue_all_caps() for inodes that are frozen at the time
// of the scan: once the inode unfreezes, re-evaluate its locks and issue
// caps. Holds a PIN_PTRWAITER ref on the inode for the lifetime of the
// context.
class C_MDC_ReIssueCaps : public MDCacheContext {
  CInode *in;
public:
  C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) :
    MDCacheContext(mdc), in(i)
  {
    in->get(CInode::PIN_PTRWAITER);
  }
  void finish(int r) override {
    // eval() may itself issue caps; only call issue_caps() if it didn't
    if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS))
      mdcache->mds->locker->issue_caps(in);
    in->put(CInode::PIN_PTRWAITER);
  }
};
6267
// Walk the whole inode map and (re)issue caps on every head inode that has
// any. Frozen inodes get a C_MDC_ReIssueCaps waiter instead. Resets the
// MDS heartbeat roughly every 1000 units of work so a huge cache doesn't
// trip the internal heartbeat timeout.
void MDCache::reissue_all_caps()
{
  dout(10) << "reissue_all_caps" << dendl;

  int count = 0;
  for (auto &p : inode_map) {
    int n = 1;  // each inode counts as at least one unit of work
    CInode *in = p.second;
    if (in->is_head() && in->is_any_caps()) {
      // called by MDSRank::active_start(). There shouldn't be any frozen subtree.
      if (in->is_frozen_inode()) {
	in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in));
	continue;
      }
      if (!mds->locker->eval(in, CEPH_CAP_LOCKS))
	n += mds->locker->issue_caps(in);
    }

    // reset the heartbeat whenever this batch's work crosses a 1000 boundary
    if ((count % 1000) + n >= 1000)
      mds->heartbeat_reset();
    count += n;
  }
}
6291
6292
6293 // ===============================================================================
6294
// Journal-completion context for the (currently disabled) copy-on-write
// path of queue_file_recover(): applies the queued mutation once the
// EUpdate is logged.
struct C_MDC_QueuedCow : public MDCacheContext {
  CInode *in;
  MutationRef mut;
  C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) :
    MDCacheContext(mdc), in(i), mut(m) {}
  void finish(int r) override {
    mdcache->_queued_file_recover_cow(in, mut);
  }
};
6304
6305
// Queue an auth inode for file-size/mtime recovery. The commented-out
// block below was a copy-on-write path that journaled cow inodes for each
// covered snap before enqueueing; it is disabled, so today this simply
// enqueues the inode on the recovery queue.
void MDCache::queue_file_recover(CInode *in)
{
  dout(10) << "queue_file_recover " << *in << dendl;
  ceph_assert(in->is_auth());

  // cow?
  /*
  SnapRealm *realm = in->find_snaprealm();
  set<snapid_t> s = realm->get_snaps();
  while (!s.empty() && *s.begin() < in->first)
    s.erase(s.begin());
  while (!s.empty() && *s.rbegin() > in->last)
    s.erase(*s.rbegin());
  dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl;
  if (s.size() > 1) {
    CInode::mempool_inode pi = in->project_inode();
    pi->version = in->pre_dirty();

    auto mut(std::make_shared<MutationImpl>());
    mut->ls = mds->mdlog->get_current_segment();
    EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow");
    mds->mdlog->start_entry(le);
    predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY);

    s.erase(*s.begin());
    while (!s.empty()) {
      snapid_t snapid = *s.begin();
      CInode *cow_inode = 0;
      journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode);
      ceph_assert(cow_inode);
      recovery_queue.enqueue(cow_inode);
      s.erase(*s.begin());
    }

    in->parent->first = in->first;
    le->metablob.add_primary_dentry(in->parent, in, true);
    mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut));
    mds->mdlog->flush();
  }
  */

  recovery_queue.enqueue(in);
}
6349
// Completion for C_MDC_QueuedCow: commit the projected inode, apply the
// mutation, then drop its locks and clean it up.
void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut)
{
  in->pop_and_dirty_projected_inode(mut->ls);
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();
}
6357
6358
6359 /*
6360 * called after recovery to recover file sizes for previously opened (for write)
6361 * files. that is, those where max_size > size.
6362 */
6363 void MDCache::identify_files_to_recover()
6364 {
6365 dout(10) << "identify_files_to_recover" << dendl;
6366 int count = 0;
6367 for (auto &p : inode_map) {
6368 CInode *in = p.second;
6369 if (!in->is_auth())
6370 continue;
6371
6372 if (in->last != CEPH_NOSNAP)
6373 continue;
6374
6375 // Only normal files need file size recovery
6376 if (!in->is_file()) {
6377 continue;
6378 }
6379
6380 bool recover = false;
6381 for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin();
6382 p != in->inode.client_ranges.end();
6383 ++p) {
6384 Capability *cap = in->get_client_cap(p->first);
6385 if (cap) {
6386 cap->mark_clientwriteable();
6387 } else {
6388 dout(10) << " client." << p->first << " has range " << p->second << " but no cap on " << *in << dendl;
6389 recover = true;
6390 break;
6391 }
6392 }
6393
6394 if (recover) {
6395 if (in->filelock.is_stable()) {
6396 in->auth_pin(&in->filelock);
6397 } else {
6398 ceph_assert(in->filelock.get_state() == LOCK_XLOCKSNAP);
6399 }
6400 in->filelock.set_state(LOCK_PRE_SCAN);
6401 rejoin_recover_q.push_back(in);
6402 } else {
6403 rejoin_check_q.push_back(in);
6404 }
6405
6406 if (!(++count % 1000))
6407 mds->heartbeat_reset();
6408 }
6409 }
6410
// Kick off the work queued by identify_files_to_recover(): re-check
// max_size on the "check" queue, and start filelock recovery (then the
// recovery queue) for the "recover" queue.
void MDCache::start_files_to_recover()
{
  for (CInode *in : rejoin_check_q) {
    // XLOCKSNAP inodes need their caps re-issued before the size check
    if (in->filelock.get_state() == LOCK_XLOCKSNAP)
      mds->locker->issue_caps(in);
    mds->locker->check_inode_max_size(in);
  }
  rejoin_check_q.clear();
  for (CInode *in : rejoin_recover_q) {
    mds->locker->file_recover(&in->filelock);
  }
  if (!rejoin_recover_q.empty()) {
    rejoin_recover_q.clear();
    do_file_recover();
  }
}
6427
// Advance the file recovery queue (process pending size/mtime probes).
void MDCache::do_file_recover()
{
  recovery_queue.advance();
}
6432
6433 // ===============================================================================
6434
6435
6436 // ----------------------------
6437 // truncate
6438
// Waiter that retries _truncate_inode() once the inode's pending snapflush
// work (which blocks the truncate) has completed.
class C_MDC_RetryTruncate : public MDCacheContext {
  CInode *in;
  LogSegment *ls;   // segment the truncate is recorded in
public:
  C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheContext(c), in(i), ls(l) {}
  void finish(int r) override {
    mdcache->_truncate_inode(in, ls);
  }
};
6449
// Begin truncating an inode: record it on the log segment, pin it, and
// either start the OSD truncate now or, if clients still owe snapflushes
// for buffered data, defer via set_xlock_snap_sync() until they arrive.
void MDCache::truncate_inode(CInode *in, LogSegment *ls)
{
  auto pi = in->get_projected_inode();
  dout(10) << "truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " on " << *in
	   << dendl;

  // track the in-flight truncate against this segment; both refs are
  // dropped in truncate_inode_logged()/truncate_inode_finish()
  ls->truncating_inodes.insert(in);
  in->get(CInode::PIN_TRUNCATING);
  in->auth_pin(this);

  if (!in->client_need_snapflush.empty() &&
      (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
    // buffered writes may still be flushed to snaps; wait for snapflush
    ceph_assert(in->filelock.is_xlocked());
    in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
    mds->locker->issue_caps(in);
    return;
  }

  _truncate_inode(in, ls);
}
6472
// IO completion for the Filer truncate issued by _truncate_inode():
// tolerates -ENOENT (objects already gone) and then journals the finish.
struct C_IO_MDC_TruncateFinish : public MDCacheIOContext {
  CInode *in;
  LogSegment *ls;
  C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) :
    MDCacheIOContext(c, false), in(i), ls(l) {
  }
  void finish(int r) override {
    ceph_assert(r == 0 || r == -ENOENT);
    mdcache->truncate_inode_finish(in, ls);
  }
  void print(ostream& out) const override {
    out << "file_truncate(" << in->ino() << ")";
  }
};
6487
// Issue the actual OSD-side truncate for an inode already registered by
// truncate_inode(): purge the byte range [truncate_size, truncate_from)
// under the inode's snap context, completing via C_IO_MDC_TruncateFinish.
void MDCache::_truncate_inode(CInode *in, LogSegment *ls)
{
  auto pi = &in->inode;
  dout(10) << "_truncate_inode "
	   << pi->truncate_from << " -> " << pi->truncate_size
	   << " on " << *in << dendl;

  // sanity: a truncate must be pending and the range must be well-formed
  ceph_assert(pi->is_truncating());
  ceph_assert(pi->truncate_size < (1ULL << 63));
  ceph_assert(pi->truncate_from < (1ULL << 63));
  ceph_assert(pi->truncate_size < pi->truncate_from);


  SnapRealm *realm = in->find_snaprealm();
  SnapContext nullsnap;
  const SnapContext *snapc;
  if (realm) {
    dout(10) << " realm " << *realm << dendl;
    snapc = &realm->get_snap_context();
  } else {
    // no realm should only happen for head inodes
    dout(10) << " NO realm, using null context" << dendl;
    snapc = &nullsnap;
    ceph_assert(in->last == CEPH_NOSNAP);
  }
  dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl;
  filer.truncate(in->inode.ino, &in->inode.layout, *snapc,
		 pi->truncate_size, pi->truncate_from-pi->truncate_size,
		 pi->truncate_seq, ceph::real_time::min(), 0,
		 new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls),
				  mds->finisher));
}
6519
// Journal-completion context for the "truncate finish" EUpdate: applies the
// mutation and releases the truncate pins once the event is durable.
struct C_MDC_TruncateLogged : public MDCacheLogContext {
  CInode *in;
  MutationRef mut;
  C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) :
    MDCacheLogContext(m), in(i), mut(mu) {}
  void finish(int r) override {
    mdcache->truncate_inode_logged(in, mut);
  }
};
6529
// Called after the OSD-side truncate completed: drop the inode from the
// segment's truncating set, project the cleared truncate state, and journal
// a "truncate finish" EUpdate whose completion (C_MDC_TruncateLogged)
// releases the remaining pins.
void MDCache::truncate_inode_finish(CInode *in, LogSegment *ls)
{
  dout(10) << "truncate_inode_finish " << *in << dendl;

  set<CInode*>::iterator p = ls->truncating_inodes.find(in);
  ceph_assert(p != ls->truncating_inodes.end());
  ls->truncating_inodes.erase(p);

  // update
  auto &pi = in->project_inode();
  pi.inode.version = in->pre_dirty();
  pi.inode.truncate_from = 0;
  pi.inode.truncate_pending--;

  MutationRef mut(new MutationImpl());
  mut->ls = mds->mdlog->get_current_segment();
  mut->add_projected_inode(in);

  EUpdate *le = new EUpdate(mds->mdlog, "truncate finish");
  mds->mdlog->start_entry(le);
  CDentry *dn = in->get_projected_parent_dn();
  le->metablob.add_dir_context(dn->get_dir());
  le->metablob.add_primary_dentry(dn, in, true);
  // record against the segment the truncate *started* in (ls->seq)
  le->metablob.add_truncate_finish(in->ino(), ls->seq);

  journal_dirty_inode(mut.get(), &le->metablob, in);
  mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut));

  // flush immediately if there are readers/writers waiting
  if (in->is_waiter_for(CInode::WAIT_TRUNC) ||
      (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
    mds->mdlog->flush();
}
6563
// Final step of a truncate: the finish event is durable, so apply the
// mutation, drop locks, release the pins taken in truncate_inode(), and
// wake anything waiting on WAIT_TRUNC.
void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut)
{
  dout(10) << "truncate_inode_logged " << *in << dendl;
  mut->apply();
  mds->locker->drop_locks(mut.get());
  mut->cleanup();

  // paired with the get()/auth_pin() in truncate_inode()
  in->put(CInode::PIN_TRUNCATING);
  in->auth_unpin(this);

  MDSContext::vec waiters;
  in->take_waiting(CInode::WAIT_TRUNC, waiters);
  mds->queue_waiters(waiters);
}
6578
6579
// Replay bookkeeping: register an in-flight truncate discovered during
// journal replay on its log segment and pin the inode.
void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls)
{
  dout(20) << "add_recovered_truncate " << *in << " in log segment "
	   << ls->seq << "/" << ls->offset << dendl;
  ls->truncating_inodes.insert(in);
  in->get(CInode::PIN_TRUNCATING);
}
6587
6588 void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls)
6589 {
6590 dout(20) << "remove_recovered_truncate " << *in << " in log segment "
6591 << ls->seq << "/" << ls->offset << dendl;
6592 // if we have the logseg the truncate started in, it must be in our list.
6593 set<CInode*>::iterator p = ls->truncating_inodes.find(in);
6594 ceph_assert(p != ls->truncating_inodes.end());
6595 ls->truncating_inodes.erase(p);
6596 in->put(CInode::PIN_TRUNCATING);
6597 }
6598
// After recovery, restart every truncate that was in flight when we went
// down (as recorded per log segment). Truncates blocked on outstanding
// client snapflushes are deferred via set_xlock_snap_sync(), mirroring
// truncate_inode().
void MDCache::start_recovered_truncates()
{
  dout(10) << "start_recovered_truncates" << dendl;
  for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin();
       p != mds->mdlog->segments.end();
       ++p) {
    LogSegment *ls = p->second;
    for (set<CInode*>::iterator q = ls->truncating_inodes.begin();
	 q != ls->truncating_inodes.end();
	 ++q) {
      CInode *in = *q;
      in->auth_pin(this);

      if (!in->client_need_snapflush.empty() &&
	  (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) {
	// buffered data may still be flushed to a snap; wait for snapflush
	ceph_assert(in->filelock.is_stable());
	in->filelock.set_state(LOCK_XLOCKDONE);
	in->auth_pin(&in->filelock);
	in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls));
	// start_files_to_recover will revoke caps
	continue;
      }
      _truncate_inode(in, ls);
    }
  }
}
6625
6626
6627 class C_MDS_purge_completed_finish : public MDCacheLogContext {
6628 interval_set<inodeno_t> inos;
6629 version_t inotablev;
6630 LogSegment *ls;
6631 public:
6632 C_MDS_purge_completed_finish(MDCache *m,
6633 interval_set<inodeno_t> i,
6634 version_t iv,
6635 LogSegment *_ls)
6636 : MDCacheLogContext(m),
6637 inos(std::move(i)),
6638 inotablev(iv),
6639 ls(_ls) {}
6640 void finish(int r) override {
6641 assert(r == 0);
6642 if (inotablev) {
6643 ls->purge_inodes_finish(inos);
6644 mdcache->mds->inotable->apply_release_ids(inos);
6645 assert(mdcache->mds->inotable->get_version() == inotablev);
6646 }
6647 }
6648 };
6649
6650 void MDCache::start_purge_inodes(){
6651 dout(10) << "start_purge_inodes" << dendl;
6652 for (auto& p : mds->mdlog->segments){
6653 LogSegment *ls = p.second;
6654 if (ls->purge_inodes.size()){
6655 purge_inodes(ls->purge_inodes, ls);
6656 }
6657 }
6658 }
6659
6660 void MDCache::purge_inodes(const interval_set<inodeno_t>& inos, LogSegment *ls)
6661 {
6662 auto cb = new LambdaContext([this, inos, ls](int r){
6663 assert(r == 0 || r == -2);
6664 mds->inotable->project_release_ids(inos);
6665 version_t piv = mds->inotable->get_projected_version();
6666 assert(piv != 0);
6667 mds->mdlog->start_submit_entry(new EPurged(inos, piv, ls->seq),
6668 new C_MDS_purge_completed_finish(this, inos, piv, ls));
6669 mds->mdlog->flush();
6670 });
6671
6672 dout(10) << __func__ << " start purge data : " << inos << dendl;
6673 C_GatherBuilder gather(g_ceph_context,
6674 new C_OnFinisher( new MDSIOContextWrapper(mds, cb), mds->finisher));
6675 SnapContext nullsnapc;
6676 uint64_t num = Striper::get_num_objects(default_file_layout, default_file_layout.get_period());
6677 for (auto p = inos.begin();
6678 p != inos.end();
6679 ++p){
6680 dout(10) << __func__
6681 << " prealloc_inos : " << inos.size()
6682 << " start : " << p.get_start().val
6683 << " length : " << p.get_len() << " "
6684 << " seq : " << ls->seq << dendl;
6685
6686 for (_inodeno_t i = 0; i < p.get_len(); i++){
6687 dout(20) << __func__ << " : " << p.get_start() + i << dendl;
6688 filer.purge_range(p.get_start() + i,
6689 &default_file_layout,
6690 nullsnapc,
6691 0, num,
6692 ceph::real_clock::now(),
6693 0, gather.new_sub());
6694 }
6695 }
6696 gather.activate();
6697 }
6698
6699 // ================================================================================
6700 // cache trimming
6701
// Expire up to `count` dentries from the two cache LRUs (bottom_lru first,
// then the main lru), collecting remote-expire messages in `expiremap`.
// Trimming is rate-limited by mds_cache_trim_threshold via trim_counter.
// Returns {throttled, number trimmed}.
std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
{
  bool is_standby_replay = mds->is_standby_replay();
  std::vector<CDentry *> unexpirables;   // dentries trim_dentry() refused to drop
  uint64_t trimmed = 0;

  auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");

  dout(7) << "trim_lru trimming " << count
	  << " items from LRU"
	  << " size=" << lru.lru_get_size()
	  << " mid=" << lru.lru_get_top()
	  << " pintail=" << lru.lru_get_pintail()
	  << " pinned=" << lru.lru_get_num_pinned()
	  << dendl;

  const uint64_t trim_counter_start = trim_counter.get();
  bool throttled = false;
  // pass 1: drain bottom_lru entirely (or until throttled)
  while (1) {
    throttled |= trim_counter_start+trimmed >= trim_threshold;
    if (throttled) break;
    CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
    if (!dn)
      break;
    if (trim_dentry(dn, expiremap)) {
      unexpirables.push_back(dn);
    } else {
      trimmed++;
    }
  }

  // put back anything that couldn't be expired
  for (auto &dn : unexpirables) {
    bottom_lru.lru_insert_mid(dn);
  }
  unexpirables.clear();

  // trim dentries from the LRU until count is reached
  // if mds is in standbyreplay and will trim all inodes which aren't in segments
  while (!throttled && (cache_toofull() || count > 0 || is_standby_replay)) {
    throttled |= trim_counter_start+trimmed >= trim_threshold;
    if (throttled) break;
    CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
    if (!dn) {
      break;
    }
    if ((is_standby_replay && dn->get_linkage()->inode &&
	 dn->get_linkage()->inode->item_open_file.is_on_list())) {
      // we move the inodes that need to be trimmed to the end of the lru queue.
      // refer to MDCache::standby_trim_segment
      lru.lru_insert_bot(dn);
      break;
    } else if (trim_dentry(dn, expiremap)) {
      unexpirables.push_back(dn);
    } else {
      trimmed++;
      if (count > 0) count--;
    }
  }
  // record work done against the throttle window
  trim_counter.hit(trimmed);

  for (auto &dn : unexpirables) {
    lru.lru_insert_mid(dn);
  }
  unexpirables.clear();

  dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
  return std::pair<bool, uint64_t>(throttled, trimmed);
}
6770
6771 /*
6772 * note: only called while MDS is active or stopping... NOT during recovery.
6773 * however, we may expire a replica whose authority is recovering.
6774 *
6775 * @param count is number of dentries to try to expire
6776 */
6777 std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
6778 {
6779 uint64_t used = cache_size();
6780 uint64_t limit = cache_memory_limit;
6781 expiremap expiremap;
6782
6783 dout(7) << "trim bytes_used=" << bytes2str(used)
6784 << " limit=" << bytes2str(limit)
6785 << " reservation=" << cache_reservation
6786 << "% count=" << count << dendl;
6787
6788 // process delayed eval_stray()
6789 stray_manager.advance_delayed();
6790
6791 auto result = trim_lru(count, expiremap);
6792 auto& trimmed = result.second;
6793
6794 // trim non-auth, non-bound subtrees
6795 for (auto p = subtrees.begin(); p != subtrees.end();) {
6796 CDir *dir = p->first;
6797 ++p;
6798 CInode *diri = dir->get_inode();
6799 if (dir->is_auth()) {
6800 if (diri->is_auth() && !diri->is_base()) {
6801 /* this situation should correspond to an export pin */
6802 if (dir->get_num_head_items() == 0 && dir->get_num_ref() == 1) {
6803 /* pinned empty subtree, try to drop */
6804 if (dir->state_test(CDir::STATE_AUXSUBTREE)) {
6805 dout(20) << "trimming empty pinned subtree " << *dir << dendl;
6806 dir->state_clear(CDir::STATE_AUXSUBTREE);
6807 remove_subtree(dir);
6808 diri->close_dirfrag(dir->dirfrag().frag);
6809 }
6810 }
6811 } else if (!diri->is_auth() && !diri->is_base() && dir->get_num_head_items() == 0) {
6812 if (dir->state_test(CDir::STATE_EXPORTING) ||
6813 !(mds->is_active() || mds->is_stopping()) ||
6814 dir->is_freezing() || dir->is_frozen())
6815 continue;
6816
6817 migrator->export_empty_import(dir);
6818 ++trimmed;
6819 }
6820 } else if (!diri->is_auth() && dir->get_num_ref() <= 1) {
6821 // only subtree pin
6822 auto&& ls = diri->get_subtree_dirfrags();
6823 if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees
6824 continue;
6825
6826 // don't trim subtree root if its auth MDS is recovering.
6827 // This simplify the cache rejoin code.
6828 if (dir->is_subtree_root() && rejoin_ack_gather.count(dir->get_dir_auth().first))
6829 continue;
6830 trim_dirfrag(dir, 0, expiremap);
6831 ++trimmed;
6832 }
6833 }
6834
6835 // trim root?
6836 if (mds->is_stopping() && root) {
6837 auto&& ls = root->get_dirfrags();
6838 for (const auto& dir : ls) {
6839 if (dir->get_num_ref() == 1) { // subtree pin
6840 trim_dirfrag(dir, 0, expiremap);
6841 ++trimmed;
6842 }
6843 }
6844 if (root->get_num_ref() == 0) {
6845 trim_inode(0, root, 0, expiremap);
6846 ++trimmed;
6847 }
6848 }
6849
6850 std::set<mds_rank_t> stopping;
6851 mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING);
6852 stopping.erase(mds->get_nodeid());
6853 for (auto rank : stopping) {
6854 CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank));
6855 if (!mdsdir_in)
6856 continue;
6857
6858 auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(rank), std::forward_as_tuple());
6859 if (em.second) {
6860 em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
6861 }
6862
6863 dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds." << mds << dendl;
6864
6865 const bool aborted = expire_recursive(mdsdir_in, expiremap);
6866 if (!aborted) {
6867 dout(20) << __func__ << ": successfully expired mdsdir" << dendl;
6868 auto&& ls = mdsdir_in->get_dirfrags();
6869 for (auto dir : ls) {
6870 if (dir->get_num_ref() == 1) { // subtree pin
6871 trim_dirfrag(dir, dir, expiremap);
6872 ++trimmed;
6873 }
6874 }
6875 if (mdsdir_in->get_num_ref() == 0) {
6876 trim_inode(NULL, mdsdir_in, NULL, expiremap);
6877 ++trimmed;
6878 }
6879 } else {
6880 dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
6881 }
6882 }
6883
6884 // Other rank's base inodes (when I'm stopping)
6885 if (mds->is_stopping()) {
6886 for (set<CInode*>::iterator p = base_inodes.begin();
6887 p != base_inodes.end();) {
6888 CInode *base_in = *p;
6889 ++p;
6890 if (MDS_INO_IS_MDSDIR(base_in->ino()) &&
6891 MDS_INO_MDSDIR_OWNER(base_in->ino()) != mds->get_nodeid()) {
6892 dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
6893 if (base_in->get_num_ref() == 0) {
6894 trim_inode(NULL, base_in, NULL, expiremap);
6895 ++trimmed;
6896 }
6897 }
6898 }
6899 }
6900
6901 // send any expire messages
6902 send_expire_messages(expiremap);
6903
6904 return result;
6905 }
6906
6907 void MDCache::send_expire_messages(expiremap& expiremap)
6908 {
6909 // send expires
6910 for (const auto &p : expiremap) {
6911 if (mds->is_cluster_degraded() &&
6912 (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
6913 (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
6914 rejoin_sent.count(p.first) == 0))) {
6915 continue;
6916 }
6917 dout(7) << "sending cache_expire to " << p.first << dendl;
6918 mds->send_message_mds(p.second, p.first);
6919 }
6920 expiremap.clear();
6921 }
6922
6923
/*
 * Expire a single dentry from the cache: unlink its inode (expiring the
 * inode too for a primary link), notify the dentry's authority if we only
 * hold a replica, and remove the dentry from its dirfrag.
 *
 * @param dn dentry to trim
 * @param expiremap accumulates per-rank MCacheExpire messages
 * @return true if the dentry could NOT be removed and remains in cache
 *         (unreadable replica, or a stray being purged instead);
 *         false if it was removed.
 */
bool MDCache::trim_dentry(CDentry *dn, expiremap& expiremap)
{
  dout(12) << "trim_dentry " << *dn << dendl;

  CDentry::linkage_t *dnl = dn->get_linkage();

  CDir *dir = dn->get_dir();
  ceph_assert(dir);

  // subtree root containing this dentry; expire messages are grouped by it
  CDir *con = get_subtree_root(dir);
  if (con)
    dout(12) << " in container " << *con << dendl;
  else {
    dout(12) << " no container; under a not-yet-linked dir" << dendl;
    ceph_assert(dn->is_auth());
  }

  // If replica dentry is not readable, it's likely we will receive
  // MDentryLink/MDentryUnlink message soon (It's possible we first
  // receive a MDentryUnlink message, then MDentryLink message)
  // MDentryLink message only replicates an inode, so we should
  // avoid trimming the inode's parent dentry. This is because that
  // unconnected replicas are problematic for subtree migration.
  if (!dn->is_auth() && !dn->lock.can_read(-1) &&
      !dn->get_dir()->get_inode()->is_stray())
    return true;

  // adjust the dir state
  // NOTE: we can safely remove a clean, null dentry without effecting
  //       directory completeness.
  // (check this _before_ we unlink the inode, below!)
  bool clear_complete = false;
  if (!(dnl->is_null() && dn->is_clean()))
    clear_complete = true;

  // unlink the dentry
  if (dnl->is_remote()) {
    // just unlink.
    dir->unlink_inode(dn, false);
  } else if (dnl->is_primary()) {
    // expire the inode, too.
    CInode *in = dnl->get_inode();
    ceph_assert(in);
    if (trim_inode(dn, in, con, expiremap))
      return true; // purging stray instead of trimming
  } else {
    ceph_assert(dnl->is_null());
  }

  if (!dn->is_auth()) {
    // notify dentry authority.
    mds_authority_t auth = dn->authority();

    // loop over both halves of the authority pair (second may be unset)
    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (mds->get_nodeid() == auth.second &&
          con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue; // on export, ignore myself.

      dout(12) << " sending expire to mds." << a << " on " << *dn << dendl;
      ceph_assert(a != mds->get_nodeid());
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
        em.first->second = make_message<MCacheExpire>(mds->get_nodeid());
      em.first->second->add_dentry(con->dirfrag(), dir->dirfrag(), dn->get_name(), dn->last, dn->get_replica_nonce());
    }
  }

  // remove dentry
  if (dn->last == CEPH_NOSNAP && dir->is_auth())
    dir->add_to_bloom(dn);  // remember the name so lookups can miss cheaply
  dir->remove_dentry(dn);

  if (clear_complete)
    dir->state_clear(CDir::STATE_COMPLETE);

  if (mds->logger) mds->logger->inc(l_mds_inodes_expired);
  return false;
}
7005
7006
/*
 * Expire a single dirfrag: remove it from the subtree map if it is a
 * subtree root, notify the authority if we only hold a replica, and close
 * the dirfrag on its inode.  All references must already be dropped.
 *
 * @param dir dirfrag to trim (num_ref must be 0)
 * @param con subtree container used to group the expire message
 * @param expiremap accumulates per-rank MCacheExpire messages
 */
void MDCache::trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap)
{
  dout(15) << "trim_dirfrag " << *dir << dendl;

  if (dir->is_subtree_root()) {
    ceph_assert(!dir->is_auth() ||
                (!dir->is_replicated() && dir->inode->is_base()));
    remove_subtree(dir);	// remove from subtree map
  }
  ceph_assert(dir->get_num_ref() == 0);

  CInode *in = dir->get_inode();

  if (!dir->is_auth()) {
    mds_authority_t auth = dir->authority();

    // was this an auth delegation?  (if so, slightly modified container)
    // NOTE(review): remove_subtree() above appears to clear subtree-root
    // status, which would make this branch unreachable — confirm before
    // relying on it.
    dirfrag_t condf;
    if (dir->is_subtree_root()) {
      dout(12) << " subtree root, container is " << *dir << dendl;
      con = dir;
      condf = dir->dirfrag();
    } else {
      condf = con->dirfrag();
    }

    // loop over both halves of the authority pair (second may be unset)
    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (mds->get_nodeid() == auth.second &&
          con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue; // on export, ignore myself.

      dout(12) << " sending expire to mds." << a << " on " << *dir << dendl;
      ceph_assert(a != mds->get_nodeid());
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
        em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
      em.first->second->add_dir(condf, dir->dirfrag(), dir->replica_nonce);
    }
  }

  in->close_dirfrag(dir->dirfrag().frag);
}
7052
/**
 * Try trimming an inode from the cache
 *
 * Closes the inode's dirfrags (if any), evaluates stray handling for auth
 * inodes, and sends expire notifications for replicas before removing the
 * inode and unlinking it from its parent dentry.
 *
 * @param dn parent dentry (may be NULL for base inodes)
 * @param in inode to trim (num_ref must be 0)
 * @param con subtree container for grouping the expire message (may be NULL
 *        for root/stray inodes)
 * @param expiremap accumulates per-rank MCacheExpire messages
 * @return true if the inode is still in cache, else false if it was trimmed
 */
bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap& expiremap)
{
  dout(15) << "trim_inode " << *in << dendl;
  ceph_assert(in->get_num_ref() == 0);

  if (in->is_dir()) {
    // If replica inode's dirfragtreelock is not readable, it's likely
    // some dirfrags of the inode are being fragmented and we will receive
    // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new
    // dirfrags, so we should avoid trimming these dirfrags' parent inode.
    // This is because that unconnected replicas are problematic for
    // subtree migration.
    //
    if (!in->is_auth() && !mds->locker->rdlock_try(&in->dirfragtreelock, -1)) {
      return true;
    }

    // DIR
    auto&& dfls = in->get_dirfrags();
    for (const auto& dir : dfls) {
      ceph_assert(!dir->is_subtree_root());
      trim_dirfrag(dir, con ? con:dir, expiremap);  // if no container (e.g. root dirfrag), use *p
    }
  }

  // INODE
  if (in->is_auth()) {
    // eval stray after closing dirfrags
    if (dn && !dn->state_test(CDentry::STATE_PURGING)) {
      maybe_eval_stray(in);
      // stray eval may have started a purge or re-pinned the dentry;
      // in that case keep the inode in cache.
      if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0)
        return true;
    }
  } else {
    mds_authority_t auth = in->authority();

    dirfrag_t df;
    if (con)
      df = con->dirfrag();
    else
      df = dirfrag_t(0,frag_t());   // must be a root or stray inode.

    // loop over both halves of the authority pair (second may be unset)
    for (int p=0; p<2; p++) {
      mds_rank_t a = auth.first;
      if (p) a = auth.second;
      if (a < 0 || (p == 1 && auth.second == auth.first)) break;
      if (con && mds->get_nodeid() == auth.second &&
          con->is_importing()) break; // don't send any expire while importing.
      if (a == mds->get_nodeid()) continue; // on export, ignore myself.

      dout(12) << " sending expire to mds." << a << " on " << *in << dendl;
      ceph_assert(a != mds->get_nodeid());
      auto em = expiremap.emplace(std::piecewise_construct, std::forward_as_tuple(a), std::forward_as_tuple());
      if (em.second)
        em.first->second = make_message<MCacheExpire>(mds->get_nodeid()); /* new */
      em.first->second->add_inode(df, in->vino(), in->get_replica_nonce());
    }
  }

  /*
  if (in->is_auth()) {
    if (in->hack_accessed)
      mds->logger->inc("outt");
    else {
      mds->logger->inc("outut");
      mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp);
    }
  }
  */

  // unlink
  if (dn)
    dn->get_dir()->unlink_inode(dn, false);
  remove_inode(in);
  return false;
}
7134
7135
/**
 * trim_non_auth - remove any non-auth items from our cache
 *
 * this reduces the amount of non-auth metadata in our cache, reducing the
 * load incurred by the rejoin phase.
 *
 * the only non-auth items that remain are those that are needed to
 * attach our own subtrees to the root.
 *
 * when we are done, all dentries will be in the top bit of the lru.
 *
 * why we have to do this:
 * we may not have accurate linkage for non-auth items.  which means we
 * will not reliably know which subtree it falls into, and can not be sure
 * to declare it to the correct authority.
 */
void MDCache::trim_non_auth()
{
  dout(7) << "trim_non_auth" << dendl;

  // temporarily pin all subtree roots so draining the LRUs below cannot
  // expire them
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p)
    p->first->get(CDir::PIN_SUBTREETEMP);

  list<CDentry*> auth_list;

  // trim non-auth items from the lru
  for (;;) {
    CDentry *dn = NULL;
    // drain bottom_lru first, then the main lru
    if (bottom_lru.lru_get_size() > 0)
      dn = static_cast<CDentry*>(bottom_lru.lru_expire());
    if (!dn && lru.lru_get_size() > 0)
      dn = static_cast<CDentry*>(lru.lru_expire());
    if (!dn)
      break;

    CDentry::linkage_t *dnl = dn->get_linkage();

    if (dn->is_auth()) {
      // add back into lru (at the top)
      auth_list.push_back(dn);

      // drop remote links to inodes we are not auth for
      if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth())
        dn->unlink_remote(dnl);
    } else {
      // non-auth.  expire.
      CDir *dir = dn->get_dir();
      ceph_assert(dir);

      // unlink the dentry
      dout(10) << " removing " << *dn << dendl;
      if (dnl->is_remote()) {
        dir->unlink_inode(dn, false);
      }
      else if (dnl->is_primary()) {
        CInode *in = dnl->get_inode();
        dout(10) << " removing " << *in << dendl;
        // close all of the inode's dirfrags before removing it
        auto&& ls = in->get_dirfrags();
        for (const auto& subdir : ls) {
          ceph_assert(!subdir->is_subtree_root());
          in->close_dirfrag(subdir->dirfrag().frag);
        }
        dir->unlink_inode(dn, false);
        remove_inode(in);
      }
      else {
        ceph_assert(dnl->is_null());
      }

      ceph_assert(!dir->has_bloom());
      dir->remove_dentry(dn);
      // adjust the dir state
      dir->state_clear(CDir::STATE_COMPLETE);  // dir incomplete!
      // close empty non-auth dirfrag
      if (!dir->is_subtree_root() && dir->get_num_any() == 0)
        dir->inode->close_dirfrag(dir->get_frag());
    }
  }

  // re-insert the surviving auth dentries into their proper LRU
  for (const auto& dn : auth_list) {
    if (dn->state_test(CDentry::STATE_BOTTOMLRU))
      bottom_lru.lru_insert_mid(dn);
    else
      lru.lru_insert_top(dn);
  }

  // move everything in the pintail to the top bit of the lru.
  lru.lru_touch_entire_pintail();

  // unpin all subtrees
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p)
    p->first->put(CDir::PIN_SUBTREETEMP);

  if (lru.lru_get_size() == 0 &&
      bottom_lru.lru_get_size() == 0) {
    // root, stray, etc.?
    // with the LRUs empty, any remaining non-auth inodes are base inodes
    // (they have no parent dentry); remove them directly.
    auto p = inode_map.begin();
    while (p != inode_map.end()) {
      CInode *in = p->second;
      ++p;  // advance before remove_inode() invalidates the iterator
      if (!in->is_auth()) {
        auto&& ls = in->get_dirfrags();
        for (const auto& dir : ls) {
          dout(10) << " removing " << *dir << dendl;
          ceph_assert(dir->get_num_ref() == 1);  // SUBTREE
          remove_subtree(dir);
          in->close_dirfrag(dir->dirfrag().frag);
        }
        dout(10) << " removing " << *in << dendl;
        ceph_assert(!in->get_parent_dn());
        ceph_assert(in->get_num_ref() == 0);
        remove_inode(in);
      }
    }
  }

  show_subtrees();
}
7258
/**
 * Recursively trim the subtree rooted at directory to remove all
 * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors
 * of those links. This is used to clear invalid data out of the cache.
 * Note that it doesn't clear the passed-in directory, since that's not
 * always safe.
 *
 * @param dir root of the subtree to trim
 * @return true if 'dir' must be kept (it is needed for slave rollback or
 *         still contains entries), false if the caller may discard it
 */
bool MDCache::trim_non_auth_subtree(CDir *dir)
{
  dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl;

  // a dirfrag involved in an uncommitted slave operation must be kept
  bool keep_dir = !can_trim_non_auth_dirfrag(dir);

  auto j = dir->begin();
  auto i = j;
  while (j != dir->end()) {
    i = j++;  // advance j first; removing i->second must not invalidate j
    CDentry *dn = i->second;
    dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl;
    CDentry::linkage_t *dnl = dn->get_linkage();
    if (dnl->is_primary()) { // check for subdirectories, etc
      CInode *in = dnl->get_inode();
      bool keep_inode = false;
      if (in->is_dir()) {
        auto&& subdirs = in->get_dirfrags();
        for (const auto& subdir : subdirs) {
          if (subdir->is_subtree_root()) {
            keep_inode = true;
            dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << *subdir << dendl;
          } else {
            if (trim_non_auth_subtree(subdir))
              keep_inode = true;
            else {
              in->close_dirfrag(subdir->get_frag());
              // NOTE(review): this clears completeness on the *outer* dir
              // rather than on a dirfrag of 'in' — looks intentional only
              // because the dentry below may be removed; confirm.
              dir->state_clear(CDir::STATE_COMPLETE); // now incomplete!
            }
          }
        }

      }
      if (!keep_inode) { // remove it!
        dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry" << dn << dendl;
        dir->unlink_inode(dn, false);
        remove_inode(in);
        ceph_assert(!dir->has_bloom());
        dir->remove_dentry(dn);
      } else {
        dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn <<dendl;
        // the entry survives but we are no longer its authority
        dn->state_clear(CDentry::STATE_AUTH);
        in->state_clear(CInode::STATE_AUTH);
      }
    } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback
      dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn <<dendl;
    } else { // just remove it
      dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl;
      if (dnl->is_remote())
        dir->unlink_inode(dn, false);
      dir->remove_dentry(dn);
    }
  }
  dir->state_clear(CDir::STATE_AUTH);
  /**
   * We've now checked all our children and deleted those that need it.
   * Now return to caller, and tell them if *we're* a keeper.
   */
  return keep_dir || dir->get_num_any();
}
7326
/*
 * during replay, when we determine a subtree is no longer ours, we
 * try to trim it from our cache.  because subtrees must be connected
 * to the root, the fact that we can trim this tree may mean that our
 * children or parents can also be trimmed.
 *
 * @param dir root of the subtree we no longer own
 */
void MDCache::try_trim_non_auth_subtree(CDir *dir)
{
  dout(10) << "try_trim_nonauth_subtree " << *dir << dendl;

  // can we now trim child subtrees?
  set<CDir*> bounds;
  get_subtree_bounds(dir, bounds);
  for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
    CDir *bd = *p;
    if (bd->get_dir_auth().first != mds->get_nodeid() &&  // we are not auth
        bd->get_num_any() == 0 && // and empty
        can_trim_non_auth_dirfrag(bd)) {
      CInode *bi = bd->get_inode();
      dout(10) << " closing empty non-auth child subtree " << *bd << dendl;
      remove_subtree(bd);
      bd->mark_clean();
      bi->close_dirfrag(bd->get_frag());
    }
  }

  if (trim_non_auth_subtree(dir)) {
    // keep
    try_subtree_merge(dir);
  } else {
    // can we trim this subtree (and possibly our ancestors) too?
    // walk upward, closing each empty non-auth subtree until we hit an
    // auth parent or a base inode.
    while (true) {
      CInode *diri = dir->get_inode();
      if (diri->is_base()) {
        if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) {
          dout(10) << " closing empty non-auth subtree " << *dir << dendl;
          remove_subtree(dir);
          dir->mark_clean();
          diri->close_dirfrag(dir->get_frag());

          dout(10) << " removing " << *diri << dendl;
          ceph_assert(!diri->get_parent_dn());
          ceph_assert(diri->get_num_ref() == 0);
          remove_inode(diri);
        }
        break;
      }

      CDir *psub = get_subtree_root(diri->get_parent_dir());
      dout(10) << " parent subtree is " << *psub << dendl;
      if (psub->get_dir_auth().first == mds->get_nodeid())
        break;  // we are auth, keep.

      dout(10) << " closing empty non-auth subtree " << *dir << dendl;
      remove_subtree(dir);
      dir->mark_clean();
      diri->close_dirfrag(dir->get_frag());

      dout(10) << " parent subtree also non-auth: " << *psub << dendl;
      if (trim_non_auth_subtree(psub))
        break;
      dir = psub;
    }
  }

  show_subtrees();
}
7394
/*
 * Standby-replay: release everything pinned by an expired log segment so
 * the corresponding dentries become trimmable.  Each dirty item is marked
 * clean (which removes it from the segment's list — that is what makes
 * the while-loops below terminate), and candidate dentries are moved to
 * the bottom of the LRU so trim_lru() will expire them first.
 *
 * @param ls the log segment being trimmed
 */
void MDCache::standby_trim_segment(LogSegment *ls)
{
  // Move the parent dentry of a now-unpinned inode to the LRU bottom,
  // but only if nothing else still references the inode or dentry and
  // the inode is not held open (see MCache::trim_lru).
  auto try_trim_inode = [this](CInode *in) {
    if (in->get_num_ref() == 0 &&
        !in->item_open_file.is_on_list() &&
        in->parent != NULL &&
        in->parent->get_num_ref() == 0){
      touch_dentry_bottom(in->parent);
    }
  };

  // Same idea for a dentry that was dirty itself.
  auto try_trim_dentry = [this](CDentry *dn) {
    if (dn->get_num_ref() > 0)
      return;
    auto in = dn->get_linkage()->inode;
    if(in && in->item_open_file.is_on_list())
      return;
    touch_dentry_bottom(dn);
  };

  ls->new_dirfrags.clear_list();
  ls->open_files.clear_list();

  // each mark_clean()/clear_dirty()/remove_dirty() below also unlinks the
  // item from the segment list, so front() always advances
  while (!ls->dirty_dirfrags.empty()) {
    CDir *dir = ls->dirty_dirfrags.front();
    dir->mark_clean();
    if (dir->inode)
      try_trim_inode(dir->inode);
  }
  while (!ls->dirty_inodes.empty()) {
    CInode *in = ls->dirty_inodes.front();
    in->mark_clean();
    try_trim_inode(in);
  }
  while (!ls->dirty_dentries.empty()) {
    CDentry *dn = ls->dirty_dentries.front();
    dn->mark_clean();
    try_trim_dentry(dn);
  }
  while (!ls->dirty_parent_inodes.empty()) {
    CInode *in = ls->dirty_parent_inodes.front();
    in->clear_dirty_parent();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_dir.empty()) {
    CInode *in = ls->dirty_dirfrag_dir.front();
    in->filelock.remove_dirty();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_nest.empty()) {
    CInode *in = ls->dirty_dirfrag_nest.front();
    in->nestlock.remove_dirty();
    try_trim_inode(in);
  }
  while (!ls->dirty_dirfrag_dirfragtree.empty()) {
    CInode *in = ls->dirty_dirfrag_dirfragtree.front();
    in->dirfragtreelock.remove_dirty();
    try_trim_inode(in);
  }
  // truncating inodes are pinned; drop the pin explicitly
  while (!ls->truncating_inodes.empty()) {
    auto it = ls->truncating_inodes.begin();
    CInode *in = *it;
    ls->truncating_inodes.erase(it);
    in->put(CInode::PIN_TRUNCATING);
    try_trim_inode(in);
  }
}
7462
/*
 * Handle an MCacheExpire from a peer: the peer has dropped its replicas of
 * the listed inodes/dirfrags/dentries, so remove it from our replica maps
 * and re-evaluate any locks that were gathering on it.  Expires are grouped
 * by "realm" (the subtree container they were cached under); an expire for
 * a container we are exporting may be delayed and replayed later by
 * process_delayed_expire().
 */
void MDCache::handle_cache_expire(const cref_t<MCacheExpire> &m)
{
  mds_rank_t from = mds_rank_t(m->get_from());

  dout(7) << "cache_expire from mds." << from << dendl;

  if (mds->get_state() < MDSMap::STATE_REJOIN) {
    return;
  }

  // locks whose gather set may have been completed by these expires
  set<SimpleLock *> gather_locks;
  // loop over realms
  for (const auto &p : m->realms) {
    // check container?
    if (p.first.ino > 0) {
      CInode *expired_inode = get_inode(p.first.ino);
      ceph_assert(expired_inode);  // we had better have this.
      CDir *parent_dir = expired_inode->get_approx_dirfrag(p.first.frag);
      ceph_assert(parent_dir);

      int export_state = -1;
      if (parent_dir->is_auth() && parent_dir->is_exporting()) {
        export_state = migrator->get_export_state(parent_dir);
        ceph_assert(export_state >= 0);
      }

      // defer the whole realm if we are not (or are about to stop being)
      // the authority for this container
      if (!parent_dir->is_auth() ||
          (export_state != -1 &&
           ((export_state == Migrator::EXPORT_WARNING &&
             migrator->export_has_warned(parent_dir,from)) ||
            export_state == Migrator::EXPORT_EXPORTING ||
            export_state == Migrator::EXPORT_LOGGINGFINISH ||
            (export_state == Migrator::EXPORT_NOTIFYING &&
             !migrator->export_has_notified(parent_dir,from))))) {

        // not auth.
        dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl;
        ceph_assert(parent_dir->is_frozen_tree_root());

        // make a message container

        auto em = delayed_expire[parent_dir].emplace(std::piecewise_construct, std::forward_as_tuple(from), std::forward_as_tuple());
        if (em.second)
          em.first->second = make_message<MCacheExpire>(from); /* new */

        // merge these expires into it
        em.first->second->add_realm(p.first, p.second);
        continue;
      }
      ceph_assert(export_state <= Migrator::EXPORT_PREPPING ||
                  (export_state == Migrator::EXPORT_WARNING &&
                   !migrator->export_has_warned(parent_dir, from)));

      dout(7) << "expires for " << *parent_dir << dendl;
    } else {
      dout(7) << "containerless expires (root, stray inodes)" << dendl;
    }

    // INODES
    for (const auto &q : p.second.inodes) {
      CInode *in = get_inode(q.first);
      unsigned nonce = q.second;

      if (!in) {
        dout(0) << " inode expire on " << q.first << " from " << from
                << ", don't have it" << dendl;
        ceph_assert(in);
      }
      ceph_assert(in->is_auth());
      dout(20) << __func__ << ": expiring inode " << *in << dendl;

      // check nonce: a mismatched nonce means we re-replicated to the peer
      // after it sent this expire, so the replica is still live
      if (nonce == in->get_replica_nonce(from)) {
        // remove from our cached_by
        dout(7) << " inode expire on " << *in << " from mds." << from
                << " cached_by was " << in->get_replicas() << dendl;
        inode_remove_replica(in, from, false, gather_locks);
      }
      else {
        // this is an old nonce, ignore expire.
        dout(7) << " inode expire on " << *in << " from mds." << from
                << " with old nonce " << nonce
                << " (current " << in->get_replica_nonce(from) << "), dropping"
                << dendl;
      }
    }

    // DIRS
    for (const auto &q : p.second.dirs) {
      CDir *dir = get_dirfrag(q.first);
      unsigned nonce = q.second;

      if (!dir) {
        CInode *diri = get_inode(q.first.ino);
        if (diri) {
          if (mds->is_rejoin() &&
              rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet
              !diri->is_replica(from)) {
            // during rejoin, fragmentation may have replaced the exact
            // frag; drop the peer from any of our nested dirfrags instead
            auto&& ls = diri->get_nested_dirfrags();
            dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
                    << " while rejoining, inode isn't replicated" << dendl;
            for (const auto& d : ls) {
              dir = d;
              if (dir->is_replica(from)) {
                dout(7) << " dir expire on " << *dir << " from mds." << from << dendl;
                dir->remove_replica(from);
              }
            }
            continue;
          }
          CDir *other = diri->get_approx_dirfrag(q.first.frag);
          if (other) {
            dout(7) << " dir expire on dirfrag " << q.first << " from mds." << from
                    << " have " << *other << ", mismatched frags, dropping" << dendl;
            continue;
          }
        }
        dout(0) << " dir expire on " << q.first << " from " << from
                << ", don't have it" << dendl;
        ceph_assert(dir);
      }
      dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl;

      ceph_assert(dir->is_auth());

      // check nonce
      if (nonce == dir->get_replica_nonce(from)) {
        // remove from our cached_by
        dout(7) << " dir expire on " << *dir << " from mds." << from
                << " replicas was " << dir->get_replicas() << dendl;
        dir->remove_replica(from);
      }
      else {
        // this is an old nonce, ignore expire.
        dout(7) << " dir expire on " << *dir << " from mds." << from
                << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from)
                << "), dropping" << dendl;
      }
    }

    // DENTRIES
    for (const auto &pd : p.second.dentries) {
      dout(10) << " dn expires in dir " << pd.first << dendl;
      CInode *diri = get_inode(pd.first.ino);
      ceph_assert(diri);
      CDir *dir = diri->get_dirfrag(pd.first.frag);

      if (!dir) {
        dout(0) << " dn expires on " << pd.first << " from " << from
                << ", must have refragmented" << dendl;
      } else {
        ceph_assert(dir->is_auth());
      }

      for (const auto &p : pd.second) {
        unsigned nonce = p.second;
        CDentry *dn;

        if (dir) {
          dn = dir->lookup(p.first.first, p.first.second);
        } else {
          // which dirfrag for this dentry?
          // (this inner 'dir' deliberately shadows the outer, null one)
          CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p.first.first));
          ceph_assert(dir);
          ceph_assert(dir->is_auth());
          dn = dir->lookup(p.first.first, p.first.second);
        }

        if (!dn) {
          if (dir)
            dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << " in " << *dir << dendl;
          else
            dout(0) << " missing dentry for " << p.first.first << " snap " << p.first.second << dendl;
        }
        ceph_assert(dn);

        if (nonce == dn->get_replica_nonce(from)) {
          dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl;
          dentry_remove_replica(dn, from, gather_locks);
        }
        else {
          dout(7) << " dentry_expire on " << *dn << " from mds." << from
                  << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from)
                  << "), dropping" << dendl;
        }
      }
    }
  }

  // re-evaluate any locks whose gather set may now be complete
  for (set<SimpleLock*>::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) {
    if (!(*p)->is_stable())
      mds->locker->eval_gather(*p);
  }
}
7657
7658 void MDCache::process_delayed_expire(CDir *dir)
7659 {
7660 dout(7) << "process_delayed_expire on " << *dir << dendl;
7661 for (const auto &p : delayed_expire[dir]) {
7662 handle_cache_expire(p.second);
7663 }
7664 delayed_expire.erase(dir);
7665 }
7666
7667 void MDCache::discard_delayed_expire(CDir *dir)
7668 {
7669 dout(7) << "discard_delayed_expire on " << *dir << dendl;
7670 delayed_expire.erase(dir);
7671 }
7672
7673 void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin,
7674 set<SimpleLock *>& gather_locks)
7675 {
7676 in->remove_replica(from);
7677 in->set_mds_caps_wanted(from, 0);
7678
7679 // note: this code calls _eval more often than it needs to!
7680 // fix lock
7681 if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock);
7682 if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock);
7683 if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock);
7684 if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock);
7685 if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock);
7686 if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock);
7687
7688 // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state.
7689 // Don't remove the recovering mds from lock's gathering list because
7690 // it may hold rejoined wrlocks.
7691 if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock);
7692 if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock);
7693 if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock);
7694 }
7695
7696 void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks)
7697 {
7698 dn->remove_replica(from);
7699
7700 // fix lock
7701 if (dn->lock.remove_replica(from))
7702 gather_locks.insert(&dn->lock);
7703
7704 // Replicated strays might now be elegible for purge
7705 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7706 if (dnl->is_primary()) {
7707 maybe_eval_stray(dnl->get_inode());
7708 }
7709 }
7710
7711 void MDCache::trim_client_leases()
7712 {
7713 utime_t now = ceph_clock_now();
7714
7715 dout(10) << "trim_client_leases" << dendl;
7716
7717 std::size_t pool = 0;
7718 for (const auto& list : client_leases) {
7719 pool += 1;
7720 if (list.empty())
7721 continue;
7722
7723 auto before = list.size();
7724 while (!list.empty()) {
7725 ClientLease *r = list.front();
7726 if (r->ttl > now) break;
7727 CDentry *dn = static_cast<CDentry*>(r->parent);
7728 dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl;
7729 dn->remove_client_lease(r, mds->locker);
7730 }
7731 auto after = list.size();
7732 dout(10) << "trim_client_leases pool " << pool << " trimmed "
7733 << (before-after) << " leases, " << after << " left" << dendl;
7734 }
7735 }
7736
7737
/*
 * Periodic memory-usage check: sample process memory, log cache statistics,
 * ask clients to release caps when the cache is too full, and release
 * freed heap pages back to the OS once usage drops back within bounds.
 */
void MDCache::check_memory_usage()
{
  static MemoryModel mm(g_ceph_context);
  static MemoryModel::snap last;
  mm.sample(&last);
  // note: initialized exactly once, from the *first* sample taken above
  static MemoryModel::snap baseline = last;

  // check client caps
  ceph_assert(CInode::count() == inode_map.size() + snap_inode_map.size() + num_shadow_inodes);
  double caps_per_inode = 0.0;
  if (CInode::count())
    caps_per_inode = (double)Capability::count() / (double)CInode::count();

  dout(2) << "Memory usage: "
          << " total " << last.get_total()
          << ", rss " << last.get_rss()
          << ", heap " << last.get_heap()
          << ", baseline " << baseline.get_heap()
          << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps"
          << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode"
          << dendl;

  mds->update_mlogger();
  mds->mlogger->set(l_mdm_rss, last.get_rss());
  mds->mlogger->set(l_mdm_heap, last.get_heap());

  // cache over limit: ask clients to release caps so entries become trimmable
  if (cache_toofull()) {
    mds->server->recall_client_state(nullptr, Server::RecallFlags::TRIM);
  }

  // If the cache size had exceeded its limit, but we're back in bounds
  // now, free any unused pool memory so that our memory usage isn't
  // permanently bloated.
  if (exceeded_size_limit && !cache_toofull()) {
    // Only do this once we are back in bounds: otherwise the releases would
    // slow down whatever process caused us to exceed bounds to begin with
    if (ceph_using_tcmalloc()) {
      dout(5) << "check_memory_usage: releasing unused space from tcmalloc"
              << dendl;
      ceph_heap_release_free_memory();
    }
    exceeded_size_limit = false;
  }
}
7782
7783
7784
7785 // =========================================================================================
7786 // shutdown
7787
// Timer callback that periodically re-runs MDCache::shutdown_check()
// (which re-arms itself) while the MDS is shutting down.
class C_MDC_ShutdownCheck : public MDCacheContext {
public:
  explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {}
  void finish(int) override {
    mdcache->shutdown_check();
  }
};
7795
// Diagnostic hook run on a timer during shutdown: temporarily raises
// debug_mds to 10 to dump the cache, restores the previous level, reports
// LRU/log sizes, and dumps any still-active objecter ops.
void MDCache::shutdown_check()
{
  dout(0) << "shutdown_check at " << ceph_clock_now() << dendl;

  // cache: save the current debug level, bump it, dump, then restore.
  char old_val[32] = { 0 };
  char *o = old_val;
  g_conf().get_val("debug_mds", &o, sizeof(old_val));
  g_conf().set_val("debug_mds", "10");
  g_conf().apply_changes(nullptr);
  show_cache();
  g_conf().set_val("debug_mds", old_val);
  g_conf().apply_changes(nullptr);
  // re-arm ourselves for the next check interval
  mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));

  // this
  dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
  dout(0) << "log len " << mds->mdlog->get_num_events() << dendl;


  if (mds->objecter->is_active()) {
    dout(0) << "objecter still active" << dendl;
    mds->objecter->dump_active();
  }
}
7821
7822
7823 void MDCache::shutdown_start()
7824 {
7825 dout(5) << "shutdown_start" << dendl;
7826
7827 if (g_conf()->mds_shutdown_check)
7828 mds->timer.add_event_after(g_conf()->mds_shutdown_check, new C_MDC_ShutdownCheck(this));
7829
7830 // g_conf()->debug_mds = 10;
7831 }
7832
7833
7834
// One pass of the shutdown state machine.  Called repeatedly; each call
// pushes shutdown forward as far as it can and returns true only once the
// cache is completely empty and the journal is capped and flushed.  The
// ordering of the stages below matters: strays and subtrees must be
// exported before the log can be fully trimmed, the log must be empty
// before it is capped, etc.
bool MDCache::shutdown_pass()
{
  dout(7) << "shutdown_pass" << dendl;

  if (mds->is_stopped()) {
    dout(7) << " already shut down" << dendl;
    show_cache();
    show_subtrees();
    return true;
  }

  // empty stray dir: push our strays to rank 0 (or purge them)
  bool strays_all_exported = shutdown_export_strays();

  // trim cache as hard as possible
  trim(UINT64_MAX);
  dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;


  {
    dout(10) << "Migrating any ephemerally pinned inodes" << dendl;
    /* copy to vector to avoid removals during iteration */
    std::vector<CInode*> migrate;
    migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end());
    for (auto& in : migrate) {
      in->maybe_ephemeral_rand();
    }
    migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end());
    for (auto& in : migrate) {
      in->maybe_ephemeral_dist();
    }
    mds->balancer->handle_export_pins();
  }

  // Export all subtrees to another active (usually rank 0) if not rank 0
  int num_auth_subtree = 0;
  if (!subtrees.empty() && mds->get_nodeid() != 0) {
    dout(7) << "looking for subtrees to export" << dendl;
    std::vector<CDir*> ls;
    for (auto& [dir, bounds] : subtrees) {
      dout(10) << "  examining " << *dir << " bounds " << bounds << dendl;
      if (dir->get_inode()->is_mdsdir() || !dir->is_auth())
	continue;
      num_auth_subtree++;
      // skip subtrees that cannot be exported right now; we'll retry on a
      // later pass
      if (dir->is_frozen() ||
	  dir->is_freezing() ||
	  dir->is_ambiguous_dir_auth() ||
	  dir->state_test(CDir::STATE_EXPORTING) ||
	  dir->get_inode()->is_ephemerally_pinned()) {
	continue;
      }
      ls.push_back(dir);
    }

    migrator->clear_export_queue();

    for (const auto& dir : ls) {
      mds_rank_t dest = dir->get_inode()->authority().first;
      // fall back to rank 0 if the preferred destination isn't active
      if (dest > 0 && !mds->mdsmap->is_active(dest))
	dest = 0;
      dout(7) << "sending " << *dir << " back to mds." << dest << dendl;
      migrator->export_dir_nicely(dir, dest);
    }
  }

  if (!strays_all_exported) {
    dout(7) << "waiting for strays to migrate" << dendl;
    return false;
  }

  if (num_auth_subtree > 0) {
    ceph_assert(mds->get_nodeid() > 0);
    dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl;
    show_subtrees();
    return false;
  }

  // close out any sessions (and open files!) before we try to trim the log, etc.
  if (mds->sessionmap.have_unclosed_sessions()) {
    if (!mds->server->terminating_sessions)
      mds->server->terminate_sessions();
    return false;
  }

  // Fully trim the log so that all objects in cache are clean and may be
  // trimmed by a future MDCache::trim. Note that MDSRank::tick does not
  // trim the log such that the cache eventually becomes clean.
  if (mds->mdlog->get_num_segments() > 0) {
    auto ls = mds->mdlog->get_current_segment();
    if (ls->num_events > 1 || !ls->dirty_dirfrags.empty()) {
      // Current segment contains events other than subtreemap or
      // there are dirty dirfrags (see CDir::log_mark_dirty())
      mds->mdlog->start_new_segment();
      mds->mdlog->flush();
    }
  }
  mds->mdlog->trim_all();
  if (mds->mdlog->get_num_segments() > 1) {
    dout(7) << "still >1 segments, waiting for log to trim" << dendl;
    return false;
  }

  // drop our reference to our stray dir inode
  for (int i = 0; i < NUM_STRAY; ++i) {
    if (strays[i] &&
	strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
      strays[i]->state_clear(CInode::STATE_STRAYPINNED);
      strays[i]->put(CInode::PIN_STRAY);
      strays[i]->put_stickydirs();
    }
  }

  CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL;
  if (mydir && !mydir->is_subtree_root())
    mydir = NULL;

  // subtrees map not empty yet?  (mydir is allowed to remain until the end)
  if (subtrees.size() > (mydir ? 1 : 0)) {
    dout(7) << "still have " << num_subtrees() << " subtrees" << dendl;
    show_subtrees();
    migrator->show_importing();
    migrator->show_exporting();
    if (!migrator->is_importing() && !migrator->is_exporting())
      show_cache();
    return false;
  }
  ceph_assert(!migrator->is_exporting());
  ceph_assert(!migrator->is_importing());

  // replicas may dirty scatter locks
  if (myin && myin->is_replicated()) {
    dout(7) << "still have replicated objects" << dendl;
    return false;
  }

  if ((myin && myin->get_num_auth_pins()) ||
      (mydir && (mydir->get_auth_pins() || mydir->get_dir_auth_pins()))) {
    dout(7) << "still have auth pinned objects" << dendl;
    return false;
  }

  // (only do this once!)
  if (!mds->mdlog->is_capped()) {
    dout(7) << "capping the log" << dendl;
    mds->mdlog->cap();
  }

  if (!mds->mdlog->empty())
    mds->mdlog->trim(0);

  if (!mds->mdlog->empty()) {
    dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events()
	    << " in " << mds->mdlog->get_num_segments() << " segments" << dendl;
    return false;
  }

  if (!did_shutdown_log_cap) {
    // flush journal header
    dout(7) << "writing header for (now-empty) journal" << dendl;
    ceph_assert(mds->mdlog->empty());
    mds->mdlog->write_head(0);
    // NOTE: filer active checker below will block us until this completes.
    did_shutdown_log_cap = true;
    return false;
  }

  // filer active?
  if (mds->objecter->is_active()) {
    dout(7) << "objecter still active" << dendl;
    mds->objecter->dump_active();
    return false;
  }

  // trim what we can from the cache
  if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) {
    dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl;
    show_cache();
    //dump();
    return false;
  }

  // make mydir subtree go away
  if (mydir) {
    if (mydir->get_num_ref() > 1) { // subtree pin
      dout(7) << "there's still reference to mydir " << *mydir << dendl;
      show_cache();
      return false;
    }

    remove_subtree(mydir);
    myin->close_dirfrag(mydir->get_frag());
  }
  ceph_assert(subtrees.empty());

  if (myin) {
    remove_inode(myin);
    ceph_assert(!myin);
  }

  if (global_snaprealm) {
    remove_inode(global_snaprealm->inode);
    global_snaprealm = nullptr;
  }

  // done!
  dout(5) << "shutdown done." << dendl;
  return true;
}
8043
// Incrementally export this rank's stray dentries to rank 0 during
// shutdown.  The scan is resumable: shutdown_export_next records the
// (dirfrag, dentry name) position to continue from on the next call, and
// shutdown_exporting_strays tracks inodes whose export/purge is in flight
// so we bound the amount of concurrent work.  Returns true once a full
// scan found nothing left to export.
bool MDCache::shutdown_export_strays()
{
  static const unsigned MAX_EXPORTING = 100;

  // rank 0 keeps its strays; nothing to export
  if (mds->get_nodeid() == 0)
    return true;

  // throttle: don't start a new scan while we're still ~2/3 full
  if (shutdown_exporting_strays.size() * 3 >= MAX_EXPORTING * 2)
    return false;

  dout(10) << "shutdown_export_strays " << shutdown_export_next.first
	   << " '" << shutdown_export_next.second << "'" << dendl;

  bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0));
  bool all_exported = false;

again:
  auto next = shutdown_export_next;

  for (int i = 0; i < NUM_STRAY; ++i) {
    CInode *strayi = strays[i];
    if (!strayi ||
	!strayi->state_test(CInode::STATE_STRAYPINNED))
      continue;
    // skip stray dirs we already finished in an earlier call
    if (strayi->ino() < next.first.ino)
      continue;

    deque<CDir*> dfls;
    strayi->get_dirfrags(dfls);

    while (!dfls.empty()) {
      CDir *dir = dfls.front();
      dfls.pop_front();

      if (dir->dirfrag() < next.first)
	continue;
      if (next.first < dir->dirfrag()) {
	// entering a new dirfrag: restart the name cursor
	next.first = dir->dirfrag();
	next.second.clear();
      }

      if (!dir->is_complete()) {
	// need the full dentry list; fetch and resume from the callback
	// (only arm the callback when no exports are in flight, otherwise
	// their completions will re-drive us)
	MDSContext *fin = nullptr;
	if (shutdown_exporting_strays.empty()) {
	  fin = new MDSInternalContextWrapper(mds,
		  new LambdaContext([this](int r) {
		    shutdown_export_strays();
		  })
		);
	}
	dir->fetch(fin);
	goto done;
      }

      // resume at the recorded dentry name within this dirfrag
      CDir::dentry_key_map::iterator it;
      if (next.second.empty()) {
	it = dir->begin();
      } else {
	auto hash = ceph_frag_value(strayi->hash_dentry_name(next.second));
	it = dir->lower_bound(dentry_key_t(0, next.second, hash));
      }

      for (; it != dir->end(); ++it) {
	CDentry *dn = it->second;
	CDentry::linkage_t *dnl = dn->get_projected_linkage();
	if (dnl->is_null())
	  continue;

	// can't migrate while rank 0 is unavailable; park the cursor here
	if (!mds0_active && !dn->state_test(CDentry::STATE_PURGING)) {
	  next.second = it->first.name;
	  goto done;
	}

	auto ret = shutdown_exporting_strays.insert(dnl->get_inode()->ino());
	if (!ret.second) {
	  dout(10) << "already exporting/purging " << *dn << dendl;
	  continue;
	}

	// Don't try to migrate anything that is actually
	// being purged right now
	if (!dn->state_test(CDentry::STATE_PURGING))
	  stray_manager.migrate_stray(dn, mds_rank_t(0));  // send to root!

	// hit the in-flight cap: record where to resume and bail out
	if (shutdown_exporting_strays.size() >= MAX_EXPORTING) {
	  ++it;
	  if (it != dir->end()) {
	    next.second = it->first.name;
	  } else {
	    if (dfls.empty())
	      next.first.ino.val++;
	    else
	      next.first = dfls.front()->dirfrag();
	    next.second.clear();
	  }
	  goto done;
	}
      }
    }
  }

  if (shutdown_exporting_strays.empty()) {
    // scan finished with nothing in flight; if we started mid-way, rewind
    // to the first stray dirfrag and do one clean full pass
    dirfrag_t first_df(MDS_INO_STRAY(mds->get_nodeid(), 0), 0);
    if (first_df < shutdown_export_next.first ||
	!shutdown_export_next.second.empty()) {
      shutdown_export_next.first = first_df;
      shutdown_export_next.second.clear();
      goto again;
    }
    all_exported = true;
  }

done:
  shutdown_export_next = next;
  return all_exported;
}
8160
8161 // ========= messaging ==============
8162
// Top-level message dispatcher for inter-MDS cache traffic: routes each
// message type to its handler.  Unknown types are a fatal programming
// error (ceph_abort_msg).
void MDCache::dispatch(const cref_t<Message> &m)
{
  switch (m->get_type()) {

    // RESOLVE
  case MSG_MDS_RESOLVE:
    handle_resolve(ref_cast<MMDSResolve>(m));
    break;
  case MSG_MDS_RESOLVEACK:
    handle_resolve_ack(ref_cast<MMDSResolveAck>(m));
    break;

    // REJOIN
  case MSG_MDS_CACHEREJOIN:
    handle_cache_rejoin(ref_cast<MMDSCacheRejoin>(m));
    break;

  case MSG_MDS_DISCOVER:
    handle_discover(ref_cast<MDiscover>(m));
    break;
  case MSG_MDS_DISCOVERREPLY:
    handle_discover_reply(ref_cast<MDiscoverReply>(m));
    break;

  case MSG_MDS_DIRUPDATE:
    handle_dir_update(ref_cast<MDirUpdate>(m));
    break;

  case MSG_MDS_CACHEEXPIRE:
    handle_cache_expire(ref_cast<MCacheExpire>(m));
    break;

  case MSG_MDS_DENTRYLINK:
    handle_dentry_link(ref_cast<MDentryLink>(m));
    break;
  case MSG_MDS_DENTRYUNLINK:
    handle_dentry_unlink(ref_cast<MDentryUnlink>(m));
    break;

  case MSG_MDS_FRAGMENTNOTIFY:
    handle_fragment_notify(ref_cast<MMDSFragmentNotify>(m));
    break;
  case MSG_MDS_FRAGMENTNOTIFYACK:
    handle_fragment_notify_ack(ref_cast<MMDSFragmentNotifyAck>(m));
    break;

  case MSG_MDS_FINDINO:
    handle_find_ino(ref_cast<MMDSFindIno>(m));
    break;
  case MSG_MDS_FINDINOREPLY:
    handle_find_ino_reply(ref_cast<MMDSFindInoReply>(m));
    break;

  case MSG_MDS_OPENINO:
    handle_open_ino(ref_cast<MMDSOpenIno>(m));
    break;
  case MSG_MDS_OPENINOREPLY:
    handle_open_ino_reply(ref_cast<MMDSOpenInoReply>(m));
    break;

  case MSG_MDS_SNAPUPDATE:
    handle_snap_update(ref_cast<MMDSSnapUpdate>(m));
    break;

  default:
    derr << "cache unknown message " << m->get_type() << dendl;
    ceph_abort_msg("cache unknown message");
  }
}
8232
// Walk `path` component by component starting from its base ino, taking
// locks / discovering / forwarding as directed by `flags`.
//
// Returns:
//   0  - success; *pin is the final inode and pdnvec the dentry trace
//   1  - in progress; a waiter/callback built from `cf` has been queued
//        and the caller will be retried
//   2  - the request was forwarded to another MDS rank
//  <0  - error (-ENOENT, -ENOTDIR, -EIO, -ESTALE, -EINVAL)
//
// mdr may be null for internal (non-request) traversals; anything that
// needs request state (forwarding, snapdir resolution, lock acquisition)
// requires it.
int MDCache::path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
			   const filepath& path, int flags,
			   vector<CDentry*> *pdnvec, CInode **pin)
{
  // decode behaviour flags
  bool discover = (flags & MDS_TRAVERSE_DISCOVER);
  bool forward = !discover;
  bool path_locked = (flags & MDS_TRAVERSE_PATH_LOCKED);
  bool want_dentry = (flags & MDS_TRAVERSE_WANT_DENTRY);
  bool want_auth = (flags & MDS_TRAVERSE_WANT_AUTH);
  bool rdlock_snap = (flags & (MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_SNAP2));
  bool rdlock_path = (flags & MDS_TRAVERSE_RDLOCK_PATH);
  bool xlock_dentry = (flags & MDS_TRAVERSE_XLOCK_DENTRY);
  bool rdlock_authlock = (flags & MDS_TRAVERSE_RDLOCK_AUTHLOCK);

  if (forward)
    ceph_assert(mdr);  // forward requires a request

  snapid_t snapid = CEPH_NOSNAP;
  if (mdr)
    mdr->snapid = snapid;

  client_t client = (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1;

  if (mds->logger) mds->logger->inc(l_mds_traverse);

  dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl;
  CInode *cur = get_inode(path.get_ino());
  if (!cur) {
    // base inode not in cache; special-case mdsdir and stray inos,
    // otherwise the caller's base is stale
    if (MDS_INO_IS_MDSDIR(path.get_ino())) {
      open_foreign_mdsdir(path.get_ino(), cf.build());
      return 1;
    }
    if (MDS_INO_IS_STRAY(path.get_ino())) {
      // re-resolve the stray dir via its owner's mdsdir
      mds_rank_t rank = MDS_INO_STRAY_OWNER(path.get_ino());
      unsigned idx = MDS_INO_STRAY_INDEX(path.get_ino());
      filepath path(strays[idx]->get_parent_dn()->get_name(),
		    MDS_INO_MDSDIR(rank));
      MDRequestRef null_ref;
      return path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, nullptr);
    }
    return -ESTALE;
  }
  if (cur->state_test(CInode::STATE_PURGING))
    return -ESTALE;

  // make sure snaprealm are open...
  if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
      !cur->snaprealm->open_parents(cf.build())) {
    return 1;
  }

  if (flags & MDS_TRAVERSE_CHECK_LOCKCACHE)
    mds->locker->find_and_attach_lock_cache(mdr, cur);

  if (mdr && mdr->lock_cache) {
    if (flags & MDS_TRAVERSE_WANT_DIRLAYOUT)
      mdr->dir_layout = mdr->lock_cache->get_dir_layout();
  } else if (rdlock_snap) {
    // take the base inode's snap (and optionally policy) rdlock up front
    int n = (flags & MDS_TRAVERSE_RDLOCK_SNAP2) ? 1 : 0;
    if ((n == 0 && !(mdr->locking_state & MutationImpl::SNAP_LOCKED)) ||
	(n == 1 && !(mdr->locking_state & MutationImpl::SNAP2_LOCKED))) {
      bool want_layout = (flags & MDS_TRAVERSE_WANT_DIRLAYOUT);
      if (!mds->locker->try_rdlock_snap_layout(cur, mdr, n, want_layout))
	return 1;
    }
  }

  // start trace
  if (pdnvec)
    pdnvec->clear();
  if (pin)
    *pin = cur;

  MutationImpl::LockOpVec lov;

  for (unsigned depth = 0; depth < path.depth(); ) {
    dout(12) << "traverse: path seg depth " << depth << " '" << path[depth]
	     << "' snapid " << snapid << dendl;

    if (!cur->is_dir()) {
      dout(7) << "traverse: " << *cur << " not a dir " << dendl;
      return -ENOTDIR;
    }

    // walk into snapdir?  (an empty component names the .snap dir)
    if (path[depth].length() == 0) {
      dout(10) << "traverse: snapdir" << dendl;
      if (!mdr || depth > 0)  // snapdir must be the first component
	return -EINVAL;
      snapid = CEPH_SNAPDIR;
      mdr->snapid = snapid;
      depth++;
      continue;
    }
    // walk thru snapdir?  (resolve a snapshot name to its snapid)
    if (snapid == CEPH_SNAPDIR) {
      if (!mdr)
	return -EINVAL;
      SnapRealm *realm = cur->find_snaprealm();
      snapid = realm->resolve_snapname(path[depth], cur->ino());
      dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl;
      if (!snapid) {
	if (pdnvec)
	  pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
	return -ENOENT;
      }
      mdr->snapid = snapid;
      depth++;
      continue;
    }

    // open dir
    frag_t fg = cur->pick_dirfrag(path[depth]);
    CDir *curdir = cur->get_dirfrag(fg);
    if (!curdir) {
      if (cur->is_auth()) {
        // parent dir frozen_dir?
        if (cur->is_frozen()) {
          dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl;
          cur->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
          return 1;
        }
        curdir = cur->get_or_open_dirfrag(this, fg);
      } else {
        // discover?
	dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl;
	discover_path(cur, snapid, path.postfixpath(depth), cf.build(),
		      path_locked);
	if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
        return 1;
      }
    }
    ceph_assert(curdir);

#ifdef MDS_VERIFY_FRAGSTAT
    if (curdir->is_complete())
      curdir->verify_fragstat();
#endif

    // frozen?
    /*
    if (curdir->is_frozen()) {
    // doh!
      // FIXME: traverse is allowed?
      dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
      curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin));
      if (onfinish) delete onfinish;
      return 1;
    }
    */

    // if the caller wants the auth of the final dentry, settle auth of
    // its containing dirfrag before looking up the name
    if (want_auth && want_dentry && depth == path.depth() - 1) {
      if (curdir->is_ambiguous_auth()) {
	dout(10) << "waiting for single auth on " << *curdir << dendl;
	curdir->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
	return 1;
      }
      if (!curdir->is_auth()) {
	dout(10) << "fw to auth for " << *curdir << dendl;
	request_forward(mdr, curdir->authority().first);
	return 2;
      }
    }

    // Before doing dirfrag->dn lookup, compare with DamageTable's
    // record of which dentries were unreadable
    if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) {
      dout(4) << "traverse: stopped lookup at damaged dentry "
              << *curdir << "/" << path[depth] << " snap=" << snapid << dendl;
      return -EIO;
    }

    // dentry
    CDentry *dn = curdir->lookup(path[depth], snapid);
    if (dn) {
      if (dn->state_test(CDentry::STATE_PURGING))
	return -ENOENT;

      if (rdlock_path) {
	lov.clear();
	// xlock the final dentry if requested; earlier components only
	// need a rdlock
	if (xlock_dentry && depth == path.depth() - 1) {
	  if (depth > 0 || !mdr->lock_cache) {
	    lov.add_wrlock(&cur->filelock);
	    lov.add_wrlock(&cur->nestlock);
	    if (rdlock_authlock)
	      lov.add_rdlock(&cur->authlock);
	  }
	  lov.add_xlock(&dn->lock);
	} else {
	  // force client to flush async dir operation if necessary
	  if (cur->filelock.is_cached())
	    lov.add_wrlock(&cur->filelock);
	  lov.add_rdlock(&dn->lock);
	}
	if (!mds->locker->acquire_locks(mdr, lov)) {
	  dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
	  return 1;
	}
      } else if (!path_locked &&
		 !dn->lock.can_read(client) &&
		 !(dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) {
	dout(10) << "traverse: non-readable dentry at " << *dn << dendl;
	dn->lock.add_waiter(SimpleLock::WAIT_RD, cf.build());
	if (mds->logger)
	  mds->logger->inc(l_mds_traverse_lock);
	if (dn->is_auth() && dn->lock.is_unstable_and_locked())
	  mds->mdlog->flush();
	return 1;
      }

      if (pdnvec)
	pdnvec->push_back(dn);

      CDentry::linkage_t *dnl = dn->get_projected_linkage();
      // can we conclude ENOENT?
      if (dnl->is_null()) {
	dout(10) << "traverse: null+readable dentry at " << *dn << dendl;
	if (depth == path.depth() - 1) {
	  if (want_dentry)
	    break;
	} else {
	  if (pdnvec)
	    pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
	}
	return -ENOENT;
      }

      // do we have inode?
      CInode *in = dnl->get_inode();
      if (!in) {
        ceph_assert(dnl->is_remote());
        // do i have it?
        in = get_inode(dnl->get_remote_ino());
        if (in) {
	  dout(7) << "linking in remote in " << *in << dendl;
	  dn->link_remote(dnl, in);
	} else {
          dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl;
	  ceph_assert(mdr);  // we shouldn't hit non-primary dentries doing a non-mdr traversal!
          if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) {
            dout(4) << "traverse: remote dentry points to damaged ino "
                    << *dn << dendl;
            return -EIO;
          }
          open_remote_dentry(dn, true, cf.build(),
			     (path_locked && depth == path.depth() - 1));
	  if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino);
          return 1;
        }
      }

      cur = in;
      // make sure snaprealm are open...
      if (mdr && cur->snaprealm && !cur->snaprealm->have_past_parents_open() &&
	  !cur->snaprealm->open_parents(cf.build())) {
	return 1;
      }

      if (rdlock_snap && !(want_dentry && depth == path.depth() - 1)) {
	lov.clear();
	lov.add_rdlock(&cur->snaplock);
	if (!mds->locker->acquire_locks(mdr, lov)) {
	  dout(10) << "traverse: failed to rdlock " << cur->snaplock << " " << *cur << dendl;
	  return 1;
	}
      }

      // add to trace, continue.
      touch_inode(cur);
      if (pin)
	*pin = cur;
      depth++;
      continue;
    }

    ceph_assert(!dn);

    // MISS.  dentry doesn't exist.
    dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl;

    if (curdir->is_auth()) {
      // dentry is mine.
      if (curdir->is_complete() ||
	  (snapid == CEPH_NOSNAP &&
	   curdir->has_bloom() &&
	   !curdir->is_in_bloom(path[depth]))) {
        // file not found
	if (pdnvec) {
	  // instantiate a null dn?
	  if (depth < path.depth() - 1) {
	    dout(20) << " didn't traverse full path; not returning pdnvec" << dendl;
	  } else if (snapid < CEPH_MAXSNAP) {
	    dout(20) << " not adding null for snapid " << snapid << dendl;
	  } else if (curdir->is_frozen()) {
	    dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl;
	    curdir->add_waiter(CDir::WAIT_UNFREEZE, cf.build());
	    return 1;
	  } else {
	    // create a null dentry so the caller can link a new inode
	    dn = curdir->add_null_dentry(path[depth]);
	    dout(20) << " added null " << *dn << dendl;

	    if (rdlock_path) {
	      lov.clear();
	      if (xlock_dentry) {
		if (depth > 0 || !mdr->lock_cache) {
		  lov.add_wrlock(&cur->filelock);
		  lov.add_wrlock(&cur->nestlock);
		  if (rdlock_authlock)
		    lov.add_rdlock(&cur->authlock);
		}
		lov.add_xlock(&dn->lock);
	      } else {
		// force client to flush async dir operation if necessary
		if (cur->filelock.is_cached())
		  lov.add_wrlock(&cur->filelock);
		lov.add_rdlock(&dn->lock);
	      }
	      if (!mds->locker->acquire_locks(mdr, lov)) {
		dout(10) << "traverse: failed to rdlock " << dn->lock << " " << *dn << dendl;
		return 1;
	      }
	    }
	  }
	  if (dn) {
	    pdnvec->push_back(dn);
	    if (want_dentry)
	      break;
	  } else {
	    pdnvec->clear();   // do not confuse likes of rdlock_path_pin_ref();
	  }
	}
        return -ENOENT;
      } else {

        // Check DamageTable for missing fragments before trying to fetch
        // this
        if (mds->damage_table.is_dirfrag_damaged(curdir)) {
          dout(4) << "traverse: damaged dirfrag " << *curdir
                  << ", blocking fetch" << dendl;
          return -EIO;
        }

	// directory isn't complete; reload
        dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl;
        touch_inode(cur);
        curdir->fetch(cf.build(), path[depth]);
	if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch);
        return 1;
      }
    } else {
      // dirfrag/dentry is not mine.
      mds_authority_t dauth = curdir->authority();

      // prefer discover over forward if the request has already bounced
      // between ranks `depth` times (avoids forwarding ping-pong)
      if (!forward_all_requests_to_auth &&
	  forward &&
	  mdr && mdr->client_request &&
	  (int)depth < mdr->client_request->get_num_fwd()){
	dout(7) << "traverse: snap " << snapid << " and depth " << depth
		<< " < fwd " << mdr->client_request->get_num_fwd()
		<< ", discovering instead of forwarding" << dendl;
	discover = true;
      }

      if ((discover)) {
	dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl;
	discover_path(curdir, snapid, path.postfixpath(depth), cf.build(),
		      path_locked);
	if (mds->logger) mds->logger->inc(l_mds_traverse_discover);
        return 1;
      }
      if (forward) {
        // forward
        dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl;

	if (curdir->is_ambiguous_auth()) {
	  // wait
	  dout(7) << "traverse: waiting for single auth in " << *curdir << dendl;
	  curdir->add_waiter(CDir::WAIT_SINGLEAUTH, cf.build());
	  return 1;
	}

	dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl;

	request_forward(mdr, dauth.first);

	if (mds->logger) mds->logger->inc(l_mds_traverse_forward);
	return 2;
      }
    }

    ceph_abort();  // i shouldn't get here
  }

  if (want_auth && !want_dentry) {
    if (cur->is_ambiguous_auth()) {
      dout(10) << "waiting for single auth on " << *cur << dendl;
      cur->add_waiter(CInode::WAIT_SINGLEAUTH, cf.build());
      return 1;
    }
    if (!cur->is_auth()) {
      dout(10) << "fw to auth for " << *cur << dendl;
      request_forward(mdr, cur->authority().first);
      return 2;
    }
  }

  // success.
  if (mds->logger) mds->logger->inc(l_mds_traverse_hit);
  dout(10) << "path_traverse finish on snapid " << snapid << dendl;
  if (mdr)
    ceph_assert(mdr->snapid == snapid);

  // record which locks this traversal took on behalf of the request
  if (flags & MDS_TRAVERSE_RDLOCK_SNAP)
    mdr->locking_state |= MutationImpl::SNAP_LOCKED;
  else if (flags & MDS_TRAVERSE_RDLOCK_SNAP2)
    mdr->locking_state |= MutationImpl::SNAP2_LOCKED;

  if (rdlock_path)
    mdr->locking_state |= MutationImpl::PATH_LOCKED;

  return 0;
}
8656
8657 CInode *MDCache::cache_traverse(const filepath& fp)
8658 {
8659 dout(10) << "cache_traverse " << fp << dendl;
8660
8661 CInode *in;
8662 if (fp.get_ino())
8663 in = get_inode(fp.get_ino());
8664 else
8665 in = root;
8666 if (!in)
8667 return NULL;
8668
8669 for (unsigned i = 0; i < fp.depth(); i++) {
8670 std::string_view dname = fp[i];
8671 frag_t fg = in->pick_dirfrag(dname);
8672 dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl;
8673 CDir *curdir = in->get_dirfrag(fg);
8674 if (!curdir)
8675 return NULL;
8676 CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP);
8677 if (!dn)
8678 return NULL;
8679 in = dn->get_linkage()->get_inode();
8680 if (!in)
8681 return NULL;
8682 }
8683 dout(10) << " got " << *in << dendl;
8684 return in;
8685 }
8686
8687
8688 /**
8689 * open_remote_dir -- open up a remote dirfrag
8690 *
8691 * @param diri base inode
8692 * @param approxfg approximate fragment.
8693 * @param fin completion callback
8694 */
void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSContext *fin)
{
  dout(10) << "open_remote_dir on " << *diri << dendl;
  // Preconditions: the inode is a directory we do NOT have auth for, and
  // the requested dirfrag isn't already open locally.
  ceph_assert(diri->is_dir());
  ceph_assert(!diri->is_auth());
  ceph_assert(diri->get_dirfrag(approxfg) == 0);

  // delegate to discovery; fin fires when the dirfrag arrives
  discover_dir_frag(diri, approxfg, fin);
}
8704
8705
8706 /**
8707 * get_dentry_inode - get or open inode
8708 *
8709 * @param dn the dentry
8710 * @param mdr current request
8711 *
8712 * will return inode for primary, or link up/open up remote link's inode as necessary.
8713 * If it's not available right now, puts mdr on wait list and returns null.
8714 */
8715 CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected)
8716 {
8717 CDentry::linkage_t *dnl;
8718 if (projected)
8719 dnl = dn->get_projected_linkage();
8720 else
8721 dnl = dn->get_linkage();
8722
8723 ceph_assert(!dnl->is_null());
8724
8725 if (dnl->is_primary())
8726 return dnl->inode;
8727
8728 ceph_assert(dnl->is_remote());
8729 CInode *in = get_inode(dnl->get_remote_ino());
8730 if (in) {
8731 dout(7) << "get_dentry_inode linking in remote in " << *in << dendl;
8732 dn->link_remote(dnl, in);
8733 return in;
8734 } else {
8735 dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl;
8736 open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr));
8737 return 0;
8738 }
8739 }
8740
// Completion for open_ino() issued on behalf of a remote dentry: keeps the
// dentry pinned (PIN_PTRWAITER) for the duration of the open, then hands
// the result to MDCache::_open_remote_dentry_finish().
struct C_MDC_OpenRemoteDentry : public MDCacheContext {
  CDentry *dn;          // the remote dentry being resolved
  inodeno_t ino;        // remote ino recorded at dispatch time
  MDSContext *onfinish; // caller's continuation
  bool want_xlocked;
  C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSContext *f, bool wx) :
    MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) {
    dn->get(MDSCacheObject::PIN_PTRWAITER);
  }
  void finish(int r) override {
    mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r);
    dn->put(MDSCacheObject::PIN_PTRWAITER);
  }
};
8755
// Open the inode a remote dentry points at, using its stored d_type to
// pick the pool for the backtrace lookup (metadata pool for directories,
// default data pool otherwise).  `fin` fires via C_MDC_OpenRemoteDentry.
void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, bool want_xlocked)
{
  dout(10) << "open_remote_dentry " << *dn << dendl;
  CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage();
  inodeno_t ino = dnl->get_remote_ino();
  int64_t pool = dnl->get_remote_d_type() == DT_DIR ? mds->mdsmap->get_metadata_pool() : -1;
  open_ino(ino, pool,
	   new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace
}
8765
// Completion of open_remote_dentry().  On failure, if the dentry still
// points at the same remote ino, mark it bad and notify the DamageTable
// (which may declare the rank damaged); if the linkage changed while the
// open was in flight, the error is stale and is ignored (r reset to 0).
void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
					 bool want_xlocked, int r)
{
  if (r < 0) {
    CDentry::linkage_t *dnl = dn->get_projected_linkage();
    if (dnl->is_remote() && dnl->get_remote_ino() == ino) {
      dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl;
      dn->state_set(CDentry::STATE_BADREMOTEINO);

      // build a human-readable path for the damage report
      std::string path;
      CDir *dir = dn->get_dir();
      if (dir) {
	dir->get_inode()->make_path_string(path);
	path += "/";
        path += dn->get_name();
      }

      bool fatal = mds->damage_table.notify_remote_damaged(ino, path);
      if (fatal) {
	mds->damaged();
	ceph_abort();  // unreachable, damaged() respawns us
      }
    } else {
      // linkage changed underneath us; the failure no longer applies
      r = 0;
    }
  }
  fin->complete(r < 0 ? r : 0);
}
8794
8795
8796 void MDCache::make_trace(vector<CDentry*>& trace, CInode *in)
8797 {
8798 // empty trace if we're a base inode
8799 if (in->is_base())
8800 return;
8801
8802 CInode *parent = in->get_parent_inode();
8803 ceph_assert(parent);
8804 make_trace(trace, parent);
8805
8806 CDentry *dn = in->get_parent_dn();
8807 dout(15) << "make_trace adding " << *dn << dendl;
8808 trace.push_back(dn);
8809 }
8810
8811
8812 // -------------------------------------------------------------------------------
8813 // Open inode by inode number
8814
8815 class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext {
8816 inodeno_t ino;
8817 public:
8818 bufferlist bl;
8819 C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) :
8820 MDCacheIOContext(c), ino(i) {}
8821 void finish(int r) override {
8822 mdcache->_open_ino_backtrace_fetched(ino, bl, r);
8823 }
8824 void print(ostream& out) const override {
8825 out << "openino_backtrace_fetch" << ino << ")";
8826 }
8827 };
8828
// Retry context for the open-by-ino ancestry traversal: after a
// fetch/discover/waiter completes, re-enter either handle_open_ino()
// (peer-driven lookup, when 'msg' is set) or _open_ino_traverse_dir()
// (local lookup).
struct C_MDC_OpenInoTraverseDir : public MDCacheContext {
  inodeno_t ino;            // inode we are trying to open
  cref_t<MMDSOpenIno> msg;  // originating peer request, or null for a local open
  bool parent;              // true if this wait was on the immediate parent (ancestors[0])
  public:
  C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, const cref_t<MMDSOpenIno> &m, bool p) :
    MDCacheContext(c), ino(i), msg(m), parent(p) {}
  void finish(int r) override {
    // a failure on a non-parent ancestor is retryable, not conclusive
    if (r < 0 && !parent)
      r = -EAGAIN;
    if (msg) {
      mdcache->handle_open_ino(msg, r);
      return;
    }
    auto& info = mdcache->opening_inodes.at(ino);
    mdcache->_open_ino_traverse_dir(ino, info, r);
  }
};
8847
// Completion for opening the parent directory of 'ino' during
// open-by-ino; the result (auth mds rank or a negative error) goes to
// _open_ino_parent_opened().
struct C_MDC_OpenInoParentOpened : public MDCacheContext {
  inodeno_t ino;  // the inode whose parent was being opened
  public:
  C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {}
  void finish(int r) override {
    mdcache->_open_ino_parent_opened(ino, r);
  }
};
8856
8857 void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err)
8858 {
8859 dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl;
8860
8861 open_ino_info_t& info = opening_inodes.at(ino);
8862
8863 CInode *in = get_inode(ino);
8864 if (in) {
8865 dout(10) << " found cached " << *in << dendl;
8866 open_ino_finish(ino, info, in->authority().first);
8867 return;
8868 }
8869
8870 inode_backtrace_t backtrace;
8871 if (err == 0) {
8872 try {
8873 decode(backtrace, bl);
8874 } catch (const buffer::error &decode_exc) {
8875 derr << "corrupt backtrace on ino x0" << std::hex << ino
8876 << std::dec << ": " << decode_exc << dendl;
8877 open_ino_finish(ino, info, -EIO);
8878 return;
8879 }
8880 if (backtrace.pool != info.pool && backtrace.pool != -1) {
8881 dout(10) << " old object in pool " << info.pool
8882 << ", retrying pool " << backtrace.pool << dendl;
8883 info.pool = backtrace.pool;
8884 C_IO_MDC_OpenInoBacktraceFetched *fin =
8885 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8886 fetch_backtrace(ino, info.pool, fin->bl,
8887 new C_OnFinisher(fin, mds->finisher));
8888 return;
8889 }
8890 } else if (err == -ENOENT) {
8891 int64_t meta_pool = mds->mdsmap->get_metadata_pool();
8892 if (info.pool != meta_pool) {
8893 dout(10) << " no object in pool " << info.pool
8894 << ", retrying pool " << meta_pool << dendl;
8895 info.pool = meta_pool;
8896 C_IO_MDC_OpenInoBacktraceFetched *fin =
8897 new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
8898 fetch_backtrace(ino, info.pool, fin->bl,
8899 new C_OnFinisher(fin, mds->finisher));
8900 return;
8901 }
8902 err = 0; // backtrace.ancestors.empty() is checked below
8903 }
8904
8905 if (err == 0) {
8906 if (backtrace.ancestors.empty()) {
8907 dout(10) << " got empty backtrace " << dendl;
8908 err = -ESTALE;
8909 } else if (!info.ancestors.empty()) {
8910 if (info.ancestors[0] == backtrace.ancestors[0]) {
8911 dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl;
8912 err = -EINVAL;
8913 } else {
8914 info.last_err = 0;
8915 }
8916 }
8917 }
8918 if (err) {
8919 dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl;
8920 if (info.last_err)
8921 err = info.last_err;
8922 open_ino_finish(ino, info, err);
8923 return;
8924 }
8925
8926 dout(10) << " got backtrace " << backtrace << dendl;
8927 info.ancestors = backtrace.ancestors;
8928
8929 _open_ino_traverse_dir(ino, info, 0);
8930 }
8931
8932 void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret)
8933 {
8934 dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl;
8935
8936 open_ino_info_t& info = opening_inodes.at(ino);
8937
8938 CInode *in = get_inode(ino);
8939 if (in) {
8940 dout(10) << " found cached " << *in << dendl;
8941 open_ino_finish(ino, info, in->authority().first);
8942 return;
8943 }
8944
8945 if (ret == mds->get_nodeid()) {
8946 _open_ino_traverse_dir(ino, info, 0);
8947 } else {
8948 if (ret >= 0) {
8949 mds_rank_t checked_rank = mds_rank_t(ret);
8950 info.check_peers = true;
8951 info.auth_hint = checked_rank;
8952 info.checked.erase(checked_rank);
8953 }
8954 do_open_ino(ino, info, ret);
8955 }
8956 }
8957
8958 void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret)
8959 {
8960 dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl;
8961
8962 CInode *in = get_inode(ino);
8963 if (in) {
8964 dout(10) << " found cached " << *in << dendl;
8965 open_ino_finish(ino, info, in->authority().first);
8966 return;
8967 }
8968
8969 if (ret) {
8970 do_open_ino(ino, info, ret);
8971 return;
8972 }
8973
8974 mds_rank_t hint = info.auth_hint;
8975 ret = open_ino_traverse_dir(ino, NULL, info.ancestors,
8976 info.discover, info.want_xlocked, &hint);
8977 if (ret > 0)
8978 return;
8979 if (hint != mds->get_nodeid())
8980 info.auth_hint = hint;
8981 do_open_ino(ino, info, ret);
8982 }
8983
8984 void MDCache::_open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, CDir *dir, bool parent)
8985 {
8986 if (dir->state_test(CDir::STATE_REJOINUNDEF))
8987 ceph_assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()));
8988 dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent));
8989 if (mds->logger)
8990 mds->logger->inc(l_mds_openino_dir_fetch);
8991 }
8992
/*
 * Try to walk down to 'ino' using the backtrace ancestry 'ancestors'
 * (ancestors[0] is the immediate parent dir, ancestors[1] its parent,
 * and so on).
 *
 * Returns:
 *   > 0  blocked on asynchronous work (fetch/discover/waiter); a
 *        C_MDC_OpenInoTraverseDir context will re-enter the traversal.
 *   0    nothing conclusive learned (e.g. no ancestor was in cache).
 *   < 0  definite failure at the immediate parent (-ENOENT/-ENOTDIR).
 *
 * If 'hint' is non-null and the immediate parent dir was examined,
 * *hint is set to the rank that is auth for it.
 */
int MDCache::open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m,
				   const vector<inode_backpointer_t>& ancestors,
				   bool discover, bool want_xlocked, mds_rank_t *hint)
{
  dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl;
  int err = 0;
  // scan up the ancestry for the first ancestor present in cache; all
  // the work happens on that one (the loop only continues while the
  // ancestor inode is absent)
  for (unsigned i = 0; i < ancestors.size(); i++) {
    const auto& ancestor = ancestors.at(i);
    CInode *diri = get_inode(ancestor.dirino);

    if (!diri) {
      if (discover && MDS_INO_IS_MDSDIR(ancestor.dirino)) {
	open_foreign_mdsdir(ancestor.dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
	return 1;
      }
      continue;
    }

    if (diri->state_test(CInode::STATE_REJOINUNDEF)) {
      // undefined rejoin inode: fetch the nearest defined ancestor dirfrag
      CDir *dir = diri->get_parent_dir();
      while (dir->state_test(CDir::STATE_REJOINUNDEF) &&
	     dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF))
	dir = dir->get_inode()->get_parent_dir();
      _open_ino_fetch_dir(ino, m, dir, i == 0);
      return 1;
    }

    if (!diri->is_dir()) {
      dout(10) << " " << *diri << " is not dir" << dendl;
      if (i == 0)
	err = -ENOTDIR;
      break;
    }

    const string& name = ancestor.dname;
    frag_t fg = diri->pick_dirfrag(name);
    CDir *dir = diri->get_dirfrag(fg);
    if (!dir) {
      if (diri->is_auth()) {
	if (diri->is_frozen()) {
	  dout(10) << " " << *diri << " is frozen, waiting " << dendl;
	  diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
	  return 1;
	}
	dir = diri->get_or_open_dirfrag(this, fg);
      } else if (discover) {
	open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
	return 1;
      }
    }
    if (dir) {
      // the name should link the next-deeper ancestor, or 'ino' itself
      // when we are at the immediate parent (i == 0)
      inodeno_t next_ino = i > 0 ? ancestors.at(i-1).dirino : ino;
      CDentry *dn = dir->lookup(name);
      CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL;
      if (dir->is_auth()) {
	if (dnl && dnl->is_primary() &&
	    dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) {
	  dout(10) << " fetching undef " << *dnl->get_inode() << dendl;
	  _open_ino_fetch_dir(ino, m, dir, i == 0);
	  return 1;
	}

	// a missing dentry may just mean the dirfrag isn't fully loaded
	if (!dnl && !dir->is_complete() &&
	    (!dir->has_bloom() || dir->is_in_bloom(name))) {
	  dout(10) << " fetching incomplete " << *dir << dendl;
	  _open_ino_fetch_dir(ino, m, dir, i == 0);
	  return 1;
	}

	dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
	if (i == 0)
	  err = -ENOENT;
      } else if (discover) {
	if (!dnl) {
	  filepath path(name, 0);
	  discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0),
			(i == 0 && want_xlocked));
	  return 1;
	}
	if (dnl->is_null() && !dn->lock.can_read(-1)) {
	  dout(10) << " null " << *dn << " is not readable, waiting" << dendl;
	  dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0));
	  return 1;
	}
	dout(10) << " no ino " << next_ino << " in " << *dir << dendl;
	if (i == 0)
	  err = -ENOENT;
      }
    }
    if (hint && i == 0)
      *hint = dir ? dir->authority().first : diri->authority().first;
    break;
  }
  return err;
}
9088
9089 void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret)
9090 {
9091 dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl;
9092
9093 MDSContext::vec waiters;
9094 waiters.swap(info.waiters);
9095 opening_inodes.erase(ino);
9096 finish_contexts(g_ceph_context, waiters, ret);
9097 }
9098
/*
 * Decide the next step for opening 'ino': query peers, (re)fetch the
 * backtrace, or open the parent directory from the known ancestry.
 * 'err' is the outcome of the previous step (0 on the first call).
 */
void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err)
{
  if (err < 0 && err != -EAGAIN) {
    // hard failure: restart from scratch (re-check peers and re-fetch
    // the backtrace), discarding any peer-provided ancestry
    info.checked.clear();
    info.checking = MDS_RANK_NONE;
    info.check_peers = true;
    info.fetch_backtrace = true;
    if (info.discover) {
      info.discover = false;
      info.ancestors.clear();
    }
    // -ENOENT/-ENOTDIR are expected transient outcomes; remember any
    // other error so the final failure reports it (see
    // _open_ino_backtrace_fetched)
    if (err != -ENOENT && err != -ENOTDIR)
      info.last_err = err;
  }

  if (info.check_peers || info.discover) {
    if (info.discover) {
      // got backtrace from peer, but failed to find inode. re-check peers
      info.discover = false;
      info.ancestors.clear();
      info.checked.clear();
    }
    info.check_peers = false;
    info.checking = MDS_RANK_NONE;
    do_open_ino_peer(ino, info);
  } else if (info.fetch_backtrace) {
    // read the backtrace object; completion re-enters via
    // _open_ino_backtrace_fetched()
    info.check_peers = true;
    info.fetch_backtrace = false;
    info.checking = mds->get_nodeid();
    info.checked.clear();
    C_IO_MDC_OpenInoBacktraceFetched *fin =
      new C_IO_MDC_OpenInoBacktraceFetched(this, ino);
    fetch_backtrace(ino, info.pool, fin->bl,
		    new C_OnFinisher(fin, mds->finisher));
  } else {
    // we have ancestry but the parent is not usable yet; open it
    ceph_assert(!info.ancestors.empty());
    info.checking = mds->get_nodeid();
    open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(),
	     new C_MDC_OpenInoParentOpened(this, ino), info.want_replica);
  }
}
9140
9141 void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info)
9142 {
9143 set<mds_rank_t> all, active;
9144 mds->mdsmap->get_mds_set(all);
9145 if (mds->get_state() == MDSMap::STATE_REJOIN)
9146 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_REJOIN);
9147 else
9148 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
9149
9150 dout(10) << "do_open_ino_peer " << ino << " active " << active
9151 << " all " << all << " checked " << info.checked << dendl;
9152
9153 mds_rank_t whoami = mds->get_nodeid();
9154 mds_rank_t peer = MDS_RANK_NONE;
9155 if (info.auth_hint >= 0 && info.auth_hint != whoami) {
9156 if (active.count(info.auth_hint)) {
9157 peer = info.auth_hint;
9158 info.auth_hint = MDS_RANK_NONE;
9159 }
9160 } else {
9161 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
9162 if (*p != whoami && info.checked.count(*p) == 0) {
9163 peer = *p;
9164 break;
9165 }
9166 }
9167 if (peer < 0) {
9168 all.erase(whoami);
9169 if (all != info.checked) {
9170 dout(10) << " waiting for more peers to be active" << dendl;
9171 } else {
9172 dout(10) << " all MDS peers have been checked " << dendl;
9173 do_open_ino(ino, info, 0);
9174 }
9175 } else {
9176 info.checking = peer;
9177 vector<inode_backpointer_t> *pa = NULL;
9178 // got backtrace from peer or backtrace just fetched
9179 if (info.discover || !info.fetch_backtrace)
9180 pa = &info.ancestors;
9181 mds->send_message_mds(make_message<MMDSOpenIno>(info.tid, ino, pa), peer);
9182 if (mds->logger)
9183 mds->logger->inc(l_mds_openino_peer_discover);
9184 }
9185 }
9186
9187 void MDCache::handle_open_ino(const cref_t<MMDSOpenIno> &m, int err)
9188 {
9189 if (mds->get_state() < MDSMap::STATE_REJOIN &&
9190 mds->get_want_state() != CEPH_MDS_STATE_REJOIN) {
9191 return;
9192 }
9193
9194 dout(10) << "handle_open_ino " << *m << " err " << err << dendl;
9195
9196 auto from = mds_rank_t(m->get_source().num());
9197 inodeno_t ino = m->ino;
9198 ref_t<MMDSOpenInoReply> reply;
9199 CInode *in = get_inode(ino);
9200 if (in) {
9201 dout(10) << " have " << *in << dendl;
9202 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, mds_rank_t(0));
9203 if (in->is_auth()) {
9204 touch_inode(in);
9205 while (1) {
9206 CDentry *pdn = in->get_parent_dn();
9207 if (!pdn)
9208 break;
9209 CInode *diri = pdn->get_dir()->get_inode();
9210 reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->get_name(),
9211 in->inode.version));
9212 in = diri;
9213 }
9214 } else {
9215 reply->hint = in->authority().first;
9216 }
9217 } else if (err < 0) {
9218 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, MDS_RANK_NONE, err);
9219 } else {
9220 mds_rank_t hint = MDS_RANK_NONE;
9221 int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint);
9222 if (ret > 0)
9223 return;
9224 reply = make_message<MMDSOpenInoReply>(m->get_tid(), ino, hint, ret);
9225 }
9226 mds->send_message_mds(reply, from);
9227 }
9228
9229 void MDCache::handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m)
9230 {
9231 dout(10) << "handle_open_ino_reply " << *m << dendl;
9232
9233 inodeno_t ino = m->ino;
9234 mds_rank_t from = mds_rank_t(m->get_source().num());
9235 auto it = opening_inodes.find(ino);
9236 if (it != opening_inodes.end() && it->second.checking == from) {
9237 open_ino_info_t& info = it->second;
9238 info.checking = MDS_RANK_NONE;
9239 info.checked.insert(from);
9240
9241 CInode *in = get_inode(ino);
9242 if (in) {
9243 dout(10) << " found cached " << *in << dendl;
9244 open_ino_finish(ino, info, in->authority().first);
9245 } else if (!m->ancestors.empty()) {
9246 dout(10) << " found ino " << ino << " on mds." << from << dendl;
9247 if (!info.want_replica) {
9248 open_ino_finish(ino, info, from);
9249 return;
9250 }
9251
9252 info.ancestors = m->ancestors;
9253 info.auth_hint = from;
9254 info.checking = mds->get_nodeid();
9255 info.discover = true;
9256 _open_ino_traverse_dir(ino, info, 0);
9257 } else if (m->error) {
9258 dout(10) << " error " << m->error << " from mds." << from << dendl;
9259 do_open_ino(ino, info, m->error);
9260 } else {
9261 if (m->hint >= 0 && m->hint != mds->get_nodeid()) {
9262 info.auth_hint = m->hint;
9263 info.checked.erase(m->hint);
9264 }
9265 do_open_ino_peer(ino, info);
9266 }
9267 }
9268 }
9269
9270 void MDCache::kick_open_ino_peers(mds_rank_t who)
9271 {
9272 dout(10) << "kick_open_ino_peers mds." << who << dendl;
9273
9274 for (map<inodeno_t, open_ino_info_t>::iterator p = opening_inodes.begin();
9275 p != opening_inodes.end();
9276 ++p) {
9277 open_ino_info_t& info = p->second;
9278 if (info.checking == who) {
9279 dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl;
9280 info.checking = MDS_RANK_NONE;
9281 do_open_ino_peer(p->first, info);
9282 } else if (info.checking == MDS_RANK_NONE) {
9283 dout(10) << " kicking ino " << p->first << " who was waiting" << dendl;
9284 do_open_ino_peer(p->first, info);
9285 }
9286 }
9287 }
9288
/*
 * Open inode 'ino' by number, completing 'fin' once it is in cache (or
 * with a negative error).
 *
 * pool:         pool holding the backtrace object; < 0 falls back to the
 *               default file data pool.
 * want_replica: replicate the inode locally rather than just learning
 *               which mds is auth.
 * want_xlocked: also discover the parent dentry even if it is xlocked.
 *
 * Concurrent opens of the same ino share one open_ino_info_t entry.
 */
void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSContext* fin,
		       bool want_replica, bool want_xlocked)
{
  dout(10) << "open_ino " << ino << " pool " << pool << " want_replica "
	   << want_replica << dendl;

  auto it = opening_inodes.find(ino);
  if (it != opening_inodes.end()) {
    // an open is already in flight; piggy-back on it
    open_ino_info_t& info = it->second;
    if (want_replica) {
      info.want_replica = true;
      if (want_xlocked && !info.want_xlocked) {
	// the in-flight open did not ask for the xlocked discover;
	// re-issue the parent-path discover with want_xlocked set
	if (!info.ancestors.empty()) {
	  CInode *diri = get_inode(info.ancestors[0].dirino);
	  if (diri) {
	    frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname);
	    CDir *dir = diri->get_dirfrag(fg);
	    if (dir && !dir->is_auth()) {
	      filepath path(info.ancestors[0].dname, 0);
	      discover_path(dir, CEPH_NOSNAP, path, NULL, true);
	    }
	  }
	}
	info.want_xlocked = true;
      }
    }
    info.waiters.push_back(fin);
  } else {
    // start a fresh open
    open_ino_info_t& info = opening_inodes[ino];
    info.want_replica = want_replica;
    info.want_xlocked = want_xlocked;
    info.tid = ++open_ino_last_tid;
    info.pool = pool >= 0 ? pool : default_file_layout.pool_id;
    info.waiters.push_back(fin);
    if (mds->is_rejoin() &&
	open_file_table.get_ancestors(ino, info.ancestors, info.auth_hint)) {
      // during rejoin the open file table may already know the
      // ancestry; skip the backtrace fetch
      info.fetch_backtrace = false;
      info.checking = mds->get_nodeid();
      _open_ino_traverse_dir(ino, info, 0);
    } else {
      do_open_ino(ino, info, 0);
    }
  }
}
9333
9334 /* ---------------------------- */
9335
9336 /*
9337 * search for a given inode on MDS peers. optionally start with the given node.
9338
9339
9340 TODO
9341 - recover from mds node failure, recovery
9342 - traverse path
9343
9344 */
9345 void MDCache::find_ino_peers(inodeno_t ino, MDSContext *c,
9346 mds_rank_t hint, bool path_locked)
9347 {
9348 dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl;
9349 CInode *in = get_inode(ino);
9350 if (in && in->state_test(CInode::STATE_PURGING)) {
9351 c->complete(-ESTALE);
9352 return;
9353 }
9354 ceph_assert(!in);
9355
9356 ceph_tid_t tid = ++find_ino_peer_last_tid;
9357 find_ino_peer_info_t& fip = find_ino_peer[tid];
9358 fip.ino = ino;
9359 fip.tid = tid;
9360 fip.fin = c;
9361 fip.path_locked = path_locked;
9362 fip.hint = hint;
9363 _do_find_ino_peer(fip);
9364 }
9365
9366 void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip)
9367 {
9368 set<mds_rank_t> all, active;
9369 mds->mdsmap->get_mds_set(all);
9370 mds->mdsmap->get_mds_set_lower_bound(active, MDSMap::STATE_CLIENTREPLAY);
9371
9372 dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino
9373 << " active " << active << " all " << all
9374 << " checked " << fip.checked
9375 << dendl;
9376
9377 mds_rank_t m = MDS_RANK_NONE;
9378 if (fip.hint >= 0) {
9379 m = fip.hint;
9380 fip.hint = MDS_RANK_NONE;
9381 } else {
9382 for (set<mds_rank_t>::iterator p = active.begin(); p != active.end(); ++p)
9383 if (*p != mds->get_nodeid() &&
9384 fip.checked.count(*p) == 0) {
9385 m = *p;
9386 break;
9387 }
9388 }
9389 if (m == MDS_RANK_NONE) {
9390 all.erase(mds->get_nodeid());
9391 if (all != fip.checked) {
9392 dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl;
9393 } else {
9394 dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl;
9395 fip.fin->complete(-ESTALE);
9396 find_ino_peer.erase(fip.tid);
9397 }
9398 } else {
9399 fip.checking = m;
9400 mds->send_message_mds(make_message<MMDSFindIno>(fip.tid, fip.ino), m);
9401 }
9402 }
9403
9404 void MDCache::handle_find_ino(const cref_t<MMDSFindIno> &m)
9405 {
9406 if (mds->get_state() < MDSMap::STATE_REJOIN) {
9407 return;
9408 }
9409
9410 dout(10) << "handle_find_ino " << *m << dendl;
9411 auto r = make_message<MMDSFindInoReply>(m->tid);
9412 CInode *in = get_inode(m->ino);
9413 if (in) {
9414 in->make_path(r->path);
9415 dout(10) << " have " << r->path << " " << *in << dendl;
9416 }
9417 mds->send_message_mds(r, mds_rank_t(m->get_source().num()));
9418 }
9419
9420
9421 void MDCache::handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m)
9422 {
9423 auto p = find_ino_peer.find(m->tid);
9424 if (p != find_ino_peer.end()) {
9425 dout(10) << "handle_find_ino_reply " << *m << dendl;
9426 find_ino_peer_info_t& fip = p->second;
9427
9428 // success?
9429 if (get_inode(fip.ino)) {
9430 dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl;
9431 mds->queue_waiter(fip.fin);
9432 find_ino_peer.erase(p);
9433 return;
9434 }
9435
9436 mds_rank_t from = mds_rank_t(m->get_source().num());
9437 if (fip.checking == from)
9438 fip.checking = MDS_RANK_NONE;
9439 fip.checked.insert(from);
9440
9441 if (!m->path.empty()) {
9442 // we got a path!
9443 vector<CDentry*> trace;
9444 CF_MDS_RetryMessageFactory cf(mds, m);
9445 MDRequestRef null_ref;
9446 int flags = MDS_TRAVERSE_DISCOVER;
9447 if (fip.path_locked)
9448 flags |= MDS_TRAVERSE_PATH_LOCKED;
9449 int r = path_traverse(null_ref, cf, m->path, flags, &trace);
9450 if (r > 0)
9451 return;
9452 dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path
9453 << ", retrying" << dendl;
9454 fip.checked.clear();
9455 _do_find_ino_peer(fip);
9456 } else {
9457 // nope, continue.
9458 _do_find_ino_peer(fip);
9459 }
9460 } else {
9461 dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl;
9462 }
9463 }
9464
9465 void MDCache::kick_find_ino_peers(mds_rank_t who)
9466 {
9467 // find_ino_peers requests we should move on from
9468 for (map<ceph_tid_t,find_ino_peer_info_t>::iterator p = find_ino_peer.begin();
9469 p != find_ino_peer.end();
9470 ++p) {
9471 find_ino_peer_info_t& fip = p->second;
9472 if (fip.checking == who) {
9473 dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." << who << dendl;
9474 fip.checking = MDS_RANK_NONE;
9475 _do_find_ino_peer(fip);
9476 } else if (fip.checking == MDS_RANK_NONE) {
9477 dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl;
9478 _do_find_ino_peer(fip);
9479 }
9480 }
9481 }
9482
9483 /* ---------------------------- */
9484
9485 int MDCache::get_num_client_requests()
9486 {
9487 int count = 0;
9488 for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
9489 p != active_requests.end();
9490 ++p) {
9491 MDRequestRef& mdr = p->second;
9492 if (mdr->reqid.name.is_client() && !mdr->is_slave())
9493 count++;
9494 }
9495 return count;
9496 }
9497
9498 MDRequestRef MDCache::request_start(const cref_t<MClientRequest>& req)
9499 {
9500 // did we win a forward race against a slave?
9501 if (active_requests.count(req->get_reqid())) {
9502 MDRequestRef& mdr = active_requests[req->get_reqid()];
9503 ceph_assert(mdr);
9504 if (mdr->is_slave()) {
9505 dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl;
9506 mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req));
9507 } else {
9508 dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl;
9509 }
9510 return MDRequestRef();
9511 }
9512
9513 // register new client request
9514 MDRequestImpl::Params params;
9515 params.reqid = req->get_reqid();
9516 params.attempt = req->get_num_fwd();
9517 params.client_req = req;
9518 params.initiated = req->get_recv_stamp();
9519 params.throttled = req->get_throttle_stamp();
9520 params.all_read = req->get_recv_complete_stamp();
9521 params.dispatched = req->get_dispatch_stamp();
9522
9523 MDRequestRef mdr =
9524 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
9525 active_requests[params.reqid] = mdr;
9526 mdr->set_op_stamp(req->get_stamp());
9527 dout(7) << "request_start " << *mdr << dendl;
9528 return mdr;
9529 }
9530
9531 MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, const cref_t<Message> &m)
9532 {
9533 int by = m->get_source().num();
9534 MDRequestImpl::Params params;
9535 params.reqid = ri;
9536 params.attempt = attempt;
9537 params.triggering_slave_req = m;
9538 params.slave_to = by;
9539 params.initiated = m->get_recv_stamp();
9540 params.throttled = m->get_throttle_stamp();
9541 params.all_read = m->get_recv_complete_stamp();
9542 params.dispatched = m->get_dispatch_stamp();
9543 MDRequestRef mdr =
9544 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
9545 ceph_assert(active_requests.count(mdr->reqid) == 0);
9546 active_requests[mdr->reqid] = mdr;
9547 dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl;
9548 return mdr;
9549 }
9550
9551 MDRequestRef MDCache::request_start_internal(int op)
9552 {
9553 utime_t now = ceph_clock_now();
9554 MDRequestImpl::Params params;
9555 params.reqid.name = entity_name_t::MDS(mds->get_nodeid());
9556 params.reqid.tid = mds->issue_tid();
9557 params.initiated = now;
9558 params.throttled = now;
9559 params.all_read = now;
9560 params.dispatched = now;
9561 params.internal_op = op;
9562 MDRequestRef mdr =
9563 mds->op_tracker.create_request<MDRequestImpl,MDRequestImpl::Params*>(&params);
9564
9565 ceph_assert(active_requests.count(mdr->reqid) == 0);
9566 active_requests[mdr->reqid] = mdr;
9567 dout(7) << "request_start_internal " << *mdr << " op " << op << dendl;
9568 return mdr;
9569 }
9570
9571 MDRequestRef MDCache::request_get(metareqid_t rid)
9572 {
9573 ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.find(rid);
9574 ceph_assert(p != active_requests.end());
9575 dout(7) << "request_get " << rid << " " << *p->second << dendl;
9576 return p->second;
9577 }
9578
9579 void MDCache::request_finish(MDRequestRef& mdr)
9580 {
9581 dout(7) << "request_finish " << *mdr << dendl;
9582 mdr->mark_event("finishing request");
9583
9584 // slave finisher?
9585 if (mdr->has_more() && mdr->more()->slave_commit) {
9586 Context *fin = mdr->more()->slave_commit;
9587 mdr->more()->slave_commit = 0;
9588 int ret;
9589 if (mdr->aborted) {
9590 mdr->aborted = false;
9591 ret = -1;
9592 mdr->more()->slave_rolling_back = true;
9593 } else {
9594 ret = 0;
9595 mdr->committing = true;
9596 }
9597 fin->complete(ret); // this must re-call request_finish.
9598 return;
9599 }
9600
9601 switch(mdr->internal_op) {
9602 case CEPH_MDS_OP_FRAGMENTDIR:
9603 logger->inc(l_mdss_ireq_fragmentdir);
9604 break;
9605 case CEPH_MDS_OP_EXPORTDIR:
9606 logger->inc(l_mdss_ireq_exportdir);
9607 break;
9608 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9609 logger->inc(l_mdss_ireq_enqueue_scrub);
9610 break;
9611 case CEPH_MDS_OP_FLUSH:
9612 logger->inc(l_mdss_ireq_flush);
9613 break;
9614 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9615 logger->inc(l_mdss_ireq_fragstats);
9616 break;
9617 case CEPH_MDS_OP_REPAIR_INODESTATS:
9618 logger->inc(l_mdss_ireq_inodestats);
9619 break;
9620 }
9621
9622 request_cleanup(mdr);
9623 }
9624
9625
/*
 * Forward a request to mds 'who'.  Client requests are re-sent there
 * (after forwarding and dissolving any batch this request heads);
 * internal ops are cancelled with -EXDEV; slave-originated requests are
 * simply dropped.  In all cases the local request state is cleaned up.
 */
void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port)
{
  mdr->mark_event("forwarding request");
  if (mdr->client_request && mdr->client_request->get_source().is_client()) {
    dout(7) << "request_forward " << *mdr << " to mds." << who << " req "
            << *mdr->client_request << dendl;
    if (mdr->is_batch_head) {
      // this request heads a batch of identical ops; forward the whole
      // batch object and unregister it from the inode/dentry
      int mask = mdr->client_request->head.args.getattr.mask;

      switch (mdr->client_request->get_op()) {
      case CEPH_MDS_OP_GETATTR:
	{
	  CInode* in = mdr->in[0];
	  if (in) {
	    auto it = in->batch_ops.find(mask);
	    if (it != in->batch_ops.end()) {
	      it->second->forward(who);
	      in->batch_ops.erase(it);
	    }
	  }
	  break;
	}
      case CEPH_MDS_OP_LOOKUP:
	{
	  if (mdr->dn[0].size()) {
	    CDentry* dn = mdr->dn[0].back();
	    auto it = dn->batch_ops.find(mask);
	    if (it != dn->batch_ops.end()) {
	      it->second->forward(who);
	      dn->batch_ops.erase(it);
	    }
	  }
	  break;
	}
      default:
	// only getattr/lookup requests can be batch heads
	ceph_abort();
      }
    } else {
      mds->forward_message_mds(mdr->release_client_request(), who);
    }
    if (mds->logger) mds->logger->inc(l_mds_forward);
  } else if (mdr->internal_op >= 0) {
    dout(10) << "request_forward on internal op; cancelling" << dendl;
    // NOTE(review): internal_op_finish is dereferenced unconditionally
    // here -- presumably every forwardable internal op sets it; confirm.
    mdr->internal_op_finish->complete(-EXDEV);
  } else {
    dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request
            << " was from mds" << dendl;
  }
  request_cleanup(mdr);
}
9676
9677
9678 void MDCache::dispatch_request(MDRequestRef& mdr)
9679 {
9680 if (mdr->client_request) {
9681 mds->server->dispatch_client_request(mdr);
9682 } else if (mdr->slave_request) {
9683 mds->server->dispatch_slave_request(mdr);
9684 } else {
9685 switch (mdr->internal_op) {
9686 case CEPH_MDS_OP_FRAGMENTDIR:
9687 dispatch_fragment_dir(mdr);
9688 break;
9689 case CEPH_MDS_OP_EXPORTDIR:
9690 migrator->dispatch_export_dir(mdr, 0);
9691 break;
9692 case CEPH_MDS_OP_ENQUEUE_SCRUB:
9693 enqueue_scrub_work(mdr);
9694 break;
9695 case CEPH_MDS_OP_FLUSH:
9696 flush_dentry_work(mdr);
9697 break;
9698 case CEPH_MDS_OP_REPAIR_FRAGSTATS:
9699 repair_dirfrag_stats_work(mdr);
9700 break;
9701 case CEPH_MDS_OP_REPAIR_INODESTATS:
9702 repair_inode_stats_work(mdr);
9703 break;
9704 case CEPH_MDS_OP_UPGRADE_SNAPREALM:
9705 upgrade_inode_snaprealm_work(mdr);
9706 break;
9707 default:
9708 ceph_abort();
9709 }
9710 }
9711 }
9712
9713
/*
 * Tell every slave of this request that it is finishing (OP_FINISH
 * implicitly drops the pins/locks they hold for us) and forget our
 * local records of foreign xlocks and remote wrlocks.
 */
void MDCache::request_drop_foreign_locks(MDRequestRef& mdr)
{
  if (!mdr->has_more())
    return;

  // clean up slaves
  // (will implicitly drop remote dn pins)
  for (set<mds_rank_t>::iterator p = mdr->more()->slaves.begin();
       p != mdr->more()->slaves.end();
       ++p) {
    auto r = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt,
					    MMDSSlaveRequest::OP_FINISH);

    if (mdr->killed && !mdr->committing) {
      // request was killed before committing; tell the slave to abort
      r->mark_abort();
    } else if (mdr->more()->srcdn_auth_mds == *p &&
	       mdr->more()->inode_import.length() > 0) {
      // information about rename imported caps
      r->inode_export.claim(mdr->more()->inode_import);
    }

    mds->send_message_mds(r, *p);
  }

  /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them
   * implicitly. Note that we don't call the finishers -- there shouldn't
   * be any on a remote lock and the request finish wakes up all
   * the waiters anyway! */

  // erase-while-iterating: erase(it++) advances before invalidation
  for (auto it = mdr->locks.begin(); it != mdr->locks.end(); ) {
    SimpleLock *lock = it->lock;
    if (it->is_xlock() && !lock->get_parent()->is_auth()) {
      dout(10) << "request_drop_foreign_locks forgetting lock " << *lock
	       << " on " << lock->get_parent() << dendl;
      lock->put_xlock();
      mdr->locks.erase(it++);
    } else if (it->is_remote_wrlock()) {
      dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *lock
	       << " on mds." << it->wrlock_target << " on " << *lock->get_parent() << dendl;
      if (it->is_wrlock()) {
	// keep the local wrlock half; just forget the remote one
	it->clear_remote_wrlock();
	++it;
      } else {
	mdr->locks.erase(it++);
      }
    } else {
      ++it;
    }
  }

  mdr->more()->slaves.clear(); /* we no longer have requests out to them, and
                                * leaving them in can cause double-notifies as
                                * this function can get called more than once */
}
9768
// Drop the request's foreign locks, then all locally-held locks except
// rdlocks.
void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr)
{
  request_drop_foreign_locks(mdr);
  mds->locker->drop_non_rdlocks(mdr.get());
}
9774
// Drop the request's foreign locks, then every locally-held lock.
void MDCache::request_drop_locks(MDRequestRef& mdr)
{
  request_drop_foreign_locks(mdr);
  mds->locker->drop_locks(mdr.get());
}
9780
// Tear down all per-request state: waiters, locks, pins, session linkage
// and the active_requests entry.  Called on normal completion and from
// request_kill().  The ordering matters: waiters are requeued and locks
// dropped before pins, so anything we were blocking can make progress.
void MDCache::request_cleanup(MDRequestRef& mdr)
{
  dout(15) << "request_cleanup " << *mdr << dendl;

  if (mdr->has_more()) {
    // clear any ambiguous-auth freeze this request installed
    if (mdr->more()->is_ambiguous_auth)
      mdr->clear_ambiguous_auth();
    // wake anyone waiting for this request to finish
    if (!mdr->more()->waiting_for_finish.empty())
      mds->queue_waiters(mdr->more()->waiting_for_finish);
  }

  request_drop_locks(mdr);

  // drop (local) auth pins
  mdr->drop_local_auth_pins();

  // drop stickydirs
  mdr->put_stickydirs();

  // let any client cap releases that were deferred behind this request proceed
  mds->locker->kick_cap_releases(mdr);

  // drop cache pins
  mdr->drop_pins();

  // remove from session
  mdr->item_session_request.remove_myself();

  // remove from map
  active_requests.erase(mdr->reqid);

  if (mds->logger)
    log_stat();

  mdr->mark_event("cleaned up request");
}
9816
// Abort a request (e.g. the client died or cancelled).  If slave MDSs are
// already involved we cannot simply roll back, so the request is either
// flagged 'aborted' (if it has not fully locked yet) or allowed to run to
// completion; otherwise it is killed and cleaned up immediately.
void MDCache::request_kill(MDRequestRef& mdr)
{
  // rollback slave requests is tricky. just let the request proceed.
  if (mdr->has_more() &&
      (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) {
    if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
      // not fully locked yet: mark aborted so the dispatcher stops it later
      ceph_assert(mdr->more()->witnessed.empty());
      mdr->aborted = true;
      dout(10) << "request_kill " << *mdr << " -- waiting for slave reply, delaying" << dendl;
    } else {
      dout(10) << "request_kill " << *mdr << " -- already started slave prep, no-op" << dendl;
    }

    // a request in this state must not own preallocated inos
    ceph_assert(mdr->used_prealloc_ino == 0);
    ceph_assert(mdr->prealloc_inos.empty());

    // detach from the (dying) session; the request itself lives on
    mdr->session = NULL;
    mdr->item_session_request.remove_myself();
    return;
  }

  mdr->killed = true;
  mdr->mark_event("killing request");

  if (mdr->committing) {
    // the journal entry is in flight; cleanup happens when commit finishes
    dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl;
  } else {
    dout(10) << "request_kill " << *mdr << dendl;
    request_cleanup(mdr);
  }
}
9848
9849 // -------------------------------------------------------------------------------
9850 // SNAPREALMS
9851
// Create the in-memory system inode (MDS_INO_GLOBAL_SNAPREALM) whose only
// purpose is to anchor the global snaprealm; the inode itself is never
// linked into the namespace.
void MDCache::create_global_snaprealm()
{
  CInode *in = new CInode(this); // dummy inode
  create_unlinked_system_inode(in, MDS_INO_GLOBAL_SNAPREALM, S_IFDIR|0755);
  add_inode(in);
  global_snaprealm = in->snaprealm;
}
9859
// Propagate a snaprealm change rooted at 'in' through the realm hierarchy:
//  - invalidate cached snap sets on the realm and all open descendants;
//  - optionally build and send one MClientSnap per client with caps in
//    any affected realm (split_inos/split_realms describe a SPLIT);
//  - for UPDATE/DESTROY, also walk open *past* children (realms that only
//    refer to us via past parents) and invalidate them too;
//  - on DESTROY, re-evaluate strays whose past ancestor lost a snapshot.
void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients)
{
  dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl;

  vector<inodeno_t> split_inos;
  vector<inodeno_t> split_realms;

  if (notify_clients) {
    ceph_assert(in->snaprealm->have_past_parents_open());
    if (snapop == CEPH_SNAP_OP_SPLIT) {
      // notify clients of update|split
      for (elist<CInode*>::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps));
           !p.end(); ++p)
        split_inos.push_back((*p)->ino());

      for (set<SnapRealm*>::iterator p = in->snaprealm->open_children.begin();
           p != in->snaprealm->open_children.end();
           ++p)
        split_realms.push_back((*p)->inode->ino());
    }
  }

  set<SnapRealm*> past_children;
  map<client_t, ref_t<MClientSnap>> updates;
  list<SnapRealm*> q;
  q.push_back(in->snaprealm);
  // phase 1: breadth-first walk of the open-children tree
  while (!q.empty()) {
    SnapRealm *realm = q.front();
    q.pop_front();

    dout(10) << " realm " << *realm << " on " << *realm->inode << dendl;
    realm->invalidate_cached_snaps();

    if (notify_clients) {
      // one MClientSnap per client, created lazily the first time we see
      // that client hold caps in an affected realm
      for (const auto& p : realm->client_caps) {
        const auto& client = p.first;
        const auto& caps = p.second;
        ceph_assert(!caps->empty());

        auto em = updates.emplace(std::piecewise_construct, std::forward_as_tuple(client), std::forward_as_tuple());
        if (em.second) {
          auto update = make_message<MClientSnap>(CEPH_SNAP_OP_SPLIT);
          update->head.split = in->ino();
          update->split_inos = split_inos;
          update->split_realms = split_realms;
          update->bl = in->snaprealm->get_snap_trace();
          em.first->second = std::move(update);
        }
      }
    }

    if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) {
      // remember past children for the second walk below
      for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
           p != realm->open_past_children.end();
           ++p)
        past_children.insert(*p);
    }

    // notify for active children, too.
    dout(10) << " " << realm << " open_children are " << realm->open_children << dendl;
    for (set<SnapRealm*>::iterator p = realm->open_children.begin();
         p != realm->open_children.end();
         ++p)
      q.push_back(*p);
  }

  if (notify_clients)
    send_snaps(updates);

  // notify past children and their descendants if we update/delete old snapshots
  for (set<SnapRealm*>::iterator p = past_children.begin();
       p != past_children.end();
       ++p)
    q.push_back(*p);

  // phase 2: invalidate the past-children subtrees, skipping realms the
  // first walk (or this one) already covered
  while (!q.empty()) {
    SnapRealm *realm = q.front();
    q.pop_front();

    realm->invalidate_cached_snaps();

    for (set<SnapRealm*>::iterator p = realm->open_children.begin();
         p != realm->open_children.end();
         ++p) {
      if (past_children.count(*p) == 0)
        q.push_back(*p);
    }

    for (set<SnapRealm*>::iterator p = realm->open_past_children.begin();
         p != realm->open_past_children.end();
         ++p) {
      if (past_children.count(*p) == 0) {
        q.push_back(*p);
        past_children.insert(*p);
      }
    }
  }

  if (snapop == CEPH_SNAP_OP_DESTROY) {
    // eval stray inodes if we delete snapshot from their past ancestor snaprealm
    for (set<SnapRealm*>::iterator p = past_children.begin();
         p != past_children.end();
         ++p)
      maybe_eval_stray((*p)->inode, true);
  }
}
9966
// Broadcast a snaprealm change for auth inode 'in' to other MDS ranks.
// stid > 0 means the change came from a snaptable transaction, so it goes
// to every rank in state >= RESOLVE (and we also notify clients about the
// global snaprealm); stid == 0 just refreshes the inode's replicas.
void MDCache::send_snap_update(CInode *in, version_t stid, int snap_op)
{
  dout(10) << __func__ << " " << *in << " stid " << stid << dendl;
  ceph_assert(in->is_auth());

  // pick the audience
  set<mds_rank_t> mds_set;
  if (stid > 0) {
    mds->mdsmap->get_mds_set_lower_bound(mds_set, MDSMap::STATE_RESOLVE);
    mds_set.erase(mds->get_nodeid());  // not ourselves
  } else {
    in->list_replicas(mds_set);
  }

  if (!mds_set.empty()) {
    // encode the snap state once, share it across all messages
    bufferlist snap_blob;
    in->encode_snap(snap_blob);

    for (auto p : mds_set) {
      auto m = make_message<MMDSSnapUpdate>(in->ino(), stid, snap_op);
      m->snap_blob = snap_blob;
      mds->send_message_mds(m, p);
    }
  }

  if (stid > 0)
    notify_global_snaprealm_update(snap_op);
}
9994
// Handle a snaprealm change broadcast by an inode's auth MDS
// (counterpart of send_snap_update()).
void MDCache::handle_snap_update(const cref_t<MMDSSnapUpdate> &m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());
  dout(10) << __func__ << " " << *m << " from mds." << from << dendl;

  // too early in startup to process; the sender will learn our state from
  // the mdsmap and resend as needed
  if (mds->get_state() < MDSMap::STATE_RESOLVE &&
      mds->get_want_state() != CEPH_MDS_STATE_RESOLVE) {
    return;
  }

  // null rejoin_done means open_snaprealms() has already been called
  bool notify_clients = mds->get_state() > MDSMap::STATE_REJOIN ||
                        (mds->is_rejoin() && !rejoin_done);

  if (m->get_tid() > 0) {
    // acknowledge the snaptable transaction this update belongs to
    mds->snapclient->notify_commit(m->get_tid());
    if (notify_clients)
      notify_global_snaprealm_update(m->get_snap_op());
  }

  CInode *in = get_inode(m->get_ino());
  if (in) {
    ceph_assert(!in->is_auth());
    if (mds->get_state() > MDSMap::STATE_REJOIN ||
        (mds->is_rejoin() && !in->is_rejoining())) {
      auto p = m->snap_blob.cbegin();
      in->decode_snap(p);

      if (!notify_clients) {
        // defer client notification until open_snaprealms() runs; pin the
        // inode so it survives until then
        if (!rejoin_pending_snaprealms.count(in)) {
          in->get(CInode::PIN_OPENINGSNAPPARENTS);
          rejoin_pending_snaprealms.insert(in);
        }
      }
      do_realm_invalidate_and_update_notify(in, m->get_snap_op(), notify_clients);
    }
  }
}
10033
10034 void MDCache::notify_global_snaprealm_update(int snap_op)
10035 {
10036 if (snap_op != CEPH_SNAP_OP_DESTROY)
10037 snap_op = CEPH_SNAP_OP_UPDATE;
10038 set<Session*> sessions;
10039 mds->sessionmap.get_client_session_set(sessions);
10040 for (auto &session : sessions) {
10041 if (!session->is_open() && !session->is_stale())
10042 continue;
10043 auto update = make_message<MClientSnap>(snap_op);
10044 update->head.split = global_snaprealm->inode->ino();
10045 update->bl = global_snaprealm->get_snap_trace();
10046 mds->send_message_client_counted(update, session);
10047 }
10048 }
10049
10050 // -------------------------------------------------------------------------------
10051 // STRAYS
10052
// Completion that resumes scan_stray_dir() at dirfrag 'next' once a
// pending dirfrag fetch finishes.
struct C_MDC_RetryScanStray : public MDCacheContext {
  dirfrag_t next;  // dirfrag to resume the scan from
  C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { }
  void finish(int r) override {
    mdcache->scan_stray_dir(next);
  }
};
10060
// Walk this rank's stray directories starting at dirfrag 'next'.  If a
// dirfrag is not fully in cache, fetch it and re-enter via
// C_MDC_RetryScanStray.  Every dentry is tagged STATE_STRAY; primary
// inodes with nlink == 0 are marked ORPHAN and handed to
// maybe_eval_stray() for possible purging/reintegration.
void MDCache::scan_stray_dir(dirfrag_t next)
{
  dout(10) << "scan_stray_dir " << next << dendl;

  // collect dirfrags of all stray dirs at or beyond the resume point
  std::vector<CDir*> ls;
  for (int i = 0; i < NUM_STRAY; ++i) {
    if (strays[i]->ino() < next.ino)
      continue;
    strays[i]->get_dirfrags(ls);
  }

  for (const auto& dir : ls) {
    if (dir->dirfrag() < next)
      continue;
    if (!dir->is_complete()) {
      // fetch the rest of this dirfrag, then restart from it
      dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag()));
      return;
    }
    for (auto &p : dir->items) {
      CDentry *dn = p.second;
      dn->state_set(CDentry::STATE_STRAY);
      CDentry::linkage_t *dnl = dn->get_projected_linkage();
      if (dnl->is_primary()) {
        CInode *in = dnl->get_inode();
        if (in->inode.nlink == 0)
          in->state_set(CInode::STATE_ORPHAN);
        maybe_eval_stray(in);
      }
    }
  }
}
10092
// Asynchronously read inode 'ino's backtrace (the "parent" xattr of its
// first object in 'pool'); the raw xattr lands in 'bl' and 'fin' is
// called when the OSD read completes.
void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin)
{
  object_t oid = CInode::get_object_name(ino, frag_t(), "");
  mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin);
  if (mds->logger)
    mds->logger->inc(l_mds_openino_backtrace_fetch);
}
10100
10101
10102
10103
10104
10105 // ========================================================================================
10106 // DISCOVER
10107 /*
10108
10109 - for all discovers (except base_inos, e.g. root, stray), waiters are attached
10110 to the parent metadata object in the cache (pinning it).
10111
10112 - all discovers are tracked by tid, so that we can ignore potentially dup replies.
10113
10114 */
10115
// (Re)send the MDiscover described by 'd' to its target rank.  The tid
// lets us match the eventual reply and ignore duplicates.
void MDCache::_send_discover(discover_info_t& d)
{
  auto dis = make_message<MDiscover>(d.ino, d.frag, d.snap, d.want_path,
                                     d.want_base_dir, d.path_locked);
  dis->set_tid(d.tid);
  mds->send_message_mds(dis, d.mds);
}
10123
// Discover a base inode (e.g. root or another rank's stray dir) from
// 'from'.  Only the first caller for a given (rank, ino) actually sends
// a discover; later callers just join the waiter list.
void MDCache::discover_base_ino(inodeno_t want_ino,
                                MDSContext *onfinish,
                                mds_rank_t from)
{
  dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl;
  if (waiting_for_base_ino[from].count(want_ino) == 0) {
    discover_info_t& d = _create_discover(from);
    d.ino = want_ino;
    _send_discover(d);
  }
  waiting_for_base_ino[from][want_ino].push_back(onfinish);
}
10136
10137
// Discover the dirfrag 'approx_fg' of 'base' from 'from' (defaults to the
// inode's auth).  A new discover is sent only if nobody is already waiting
// on that frag (or if there's no waiter context to attach); the waiter, if
// any, is parked on the inode either way.
void MDCache::discover_dir_frag(CInode *base,
                                frag_t approx_fg,
                                MDSContext *onfinish,
                                mds_rank_t from)
{
  if (from < 0)
    from = base->authority().first;

  dirfrag_t df(base->ino(), approx_fg);
  dout(7) << "discover_dir_frag " << df
          << " from mds." << from << dendl;

  // send unless an equivalent discover is already in flight with a waiter
  if (!base->is_waiting_for_dir(approx_fg) || !onfinish) {
    discover_info_t& d = _create_discover(from);
    d.pin_base(base);
    d.ino = base->ino();
    d.frag = approx_fg;
    d.want_base_dir = true;
    _send_discover(d);
  }

  if (onfinish)
    base->add_dir_waiter(approx_fg, onfinish);
}
10162
10163 struct C_MDC_RetryDiscoverPath : public MDCacheContext {
10164 CInode *base;
10165 snapid_t snapid;
10166 filepath path;
10167 mds_rank_t from;
10168 C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) :
10169 MDCacheContext(c), base(b), snapid(s), path(p), from(f) {}
10170 void finish(int r) override {
10171 mdcache->discover_path(base, snapid, path, 0, from);
10172 }
10173 };
10174
// Discover a path rooted at inode 'base' from rank 'from' (defaults to the
// inode's auth).  If the base has ambiguous auth, park a retry waiter.  If
// we ARE the auth, just requeue WAIT_DIR waiters (nothing to discover).
// Otherwise send a discover for the first path component's dirfrag unless
// an equivalent one is already pending, and register the waiter on it.
// 'path_locked' permits discovery of an xlocked tail dentry.
void MDCache::discover_path(CInode *base,
                            snapid_t snap,
                            filepath want_path,
                            MDSContext *onfinish,
                            bool path_locked,
                            mds_rank_t from)
{
  if (from < 0)
    from = base->authority().first;

  dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from
          << (path_locked ? " path_locked":"")
          << dendl;

  if (base->is_ambiguous_auth()) {
    dout(10) << " waiting for single auth on " << *base << dendl;
    if (!onfinish)
      onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from);
    base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish);
    return;
  } else if (from == mds->get_nodeid()) {
    // we're the auth; wake anyone who was waiting for the dir
    MDSContext::vec finished;
    base->take_waiting(CInode::WAIT_DIR, finished);
    mds->queue_waiters(finished);
    return;
  }

  frag_t fg = base->pick_dirfrag(want_path[0]);
  // always send for a locked tail lookup; otherwise only if nobody is
  // already waiting on this frag (or there's no waiter to attach)
  if ((path_locked && want_path.depth() == 1) ||
      !base->is_waiting_for_dir(fg) || !onfinish) {
    discover_info_t& d = _create_discover(from);
    d.ino = base->ino();
    d.pin_base(base);
    d.frag = fg;
    d.snap = snap;
    d.want_path = want_path;
    d.want_base_dir = true;
    d.path_locked = path_locked;
    _send_discover(d);
  }

  // register + wait
  if (onfinish)
    base->add_dir_waiter(fg, onfinish);
}
10220
// Completion that re-issues a CDir-rooted discover_path() once the
// dirfrag's authority becomes unambiguous.
// NOTE(review): the original path_locked flag is not captured here, so the
// retried discover runs with path_locked defaulted to false — confirm an
// xlocked-tail discover retried through this path still makes progress.
struct C_MDC_RetryDiscoverPath2 : public MDCacheContext {
  CDir *base;        // dirfrag the path discover is rooted at
  snapid_t snapid;   // snapshot the lookup applies to
  filepath path;     // remaining path to discover
  C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) :
    MDCacheContext(c), base(b), snapid(s), path(p) {}
  void finish(int r) override {
    mdcache->discover_path(base, snapid, path, 0);
  }
};
10231
// Discover a path rooted at dirfrag 'base' from the dirfrag's auth rank.
// Mirrors the CInode variant: park a retry on ambiguous auth, short-circuit
// if we are the auth, otherwise send a dentry discover (deduplicated
// against existing waiters) and register the waiter on the first dentry.
void MDCache::discover_path(CDir *base,
                            snapid_t snap,
                            filepath want_path,
                            MDSContext *onfinish,
                            bool path_locked)
{
  mds_rank_t from = base->authority().first;

  dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from
          << (path_locked ? " path_locked":"")
          << dendl;

  if (base->is_ambiguous_auth()) {
    dout(7) << " waiting for single auth on " << *base << dendl;
    if (!onfinish)
      onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path);
    base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish);
    return;
  } else if (from == mds->get_nodeid()) {
    // we're the auth; wake everything parked under this dirfrag
    MDSContext::vec finished;
    base->take_sub_waiting(finished);
    mds->queue_waiters(finished);
    return;
  }

  // always send for a locked tail lookup; otherwise only if nobody is
  // already waiting on this dentry (or there's no waiter to attach)
  if ((path_locked && want_path.depth() == 1) ||
      !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) {
    discover_info_t& d = _create_discover(from);
    d.ino = base->ino();
    d.pin_base(base->inode);
    d.frag = base->get_frag();
    d.snap = snap;
    d.want_path = want_path;
    d.want_base_dir = false;
    d.path_locked = path_locked;
    _send_discover(d);
  }

  // register + wait
  if (onfinish)
    base->add_dentry_waiter(want_path[0], snap, onfinish);
}
10274
10275 void MDCache::kick_discovers(mds_rank_t who)
10276 {
10277 for (map<ceph_tid_t,discover_info_t>::iterator p = discovers.begin();
10278 p != discovers.end();
10279 ++p) {
10280 if (p->second.mds != who)
10281 continue;
10282 _send_discover(p->second);
10283 }
10284 }
10285
10286
// Service an MDiscover from another MDS: walk the requested path from the
// base inode, appending (dir, dentry, inode) triples to the reply trace.
// At each step we may instead (a) set an error/auth-hint flag and stop,
// (b) wait (frozen dir/inode, xlocked dentry, incomplete dirfrag) and
// retry the whole message later, or (c) stop early with a partial trace —
// the requester resumes from whatever it received.
void MDCache::handle_discover(const cref_t<MDiscover> &dis)
{
  mds_rank_t whoami = mds->get_nodeid();
  mds_rank_t from = mds_rank_t(dis->get_source().num());

  ceph_assert(from != whoami);

  if (mds->get_state() <= MDSMap::STATE_REJOIN) {
    // too early to answer anything
    if (mds->get_state() < MDSMap::STATE_REJOIN &&
        mds->get_want_state() < CEPH_MDS_STATE_REJOIN) {
      return;
    }

    // proceed if requester is in the REJOIN stage, the request is from parallel_fetch().
    // delay processing request from survivor because we may not yet choose lock states.
    if (!mds->mdsmap->is_rejoin(from)) {
      dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl;
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis));
      return;
    }
  }


  CInode *cur = 0;
  auto reply = make_message<MDiscoverReply>(*dis);

  snapid_t snapid = dis->get_snapid();

  // get started.
  if (MDS_INO_IS_BASE(dis->get_base_ino()) &&
      !dis->wants_base_dir() && dis->get_want().depth() == 0) {
    // wants root
    dout(7) << "handle_discover from mds." << from
            << " wants base + " << dis->get_want().get_path()
            << " snap " << snapid
            << dendl;

    cur = get_inode(dis->get_base_ino());
    ceph_assert(cur);

    // add root
    reply->starts_with = MDiscoverReply::INODE;
    encode_replica_inode(cur, from, reply->trace, mds->mdsmap->get_up_features());
    dout(10) << "added base " << *cur << dendl;
  }
  else {
    // there's a base inode
    cur = get_inode(dis->get_base_ino(), snapid);
    if (!cur && snapid != CEPH_NOSNAP) {
      // fall back to the head inode if it carries all snaps
      cur = get_inode(dis->get_base_ino());
      if (cur && !cur->is_multiversion())
        cur = NULL;  // nope!
    }

    if (!cur) {
      dout(7) << "handle_discover mds." << from
              << " don't have base ino " << dis->get_base_ino() << "." << snapid
              << dendl;
      if (!dis->wants_base_dir() && dis->get_want().depth() > 0)
        reply->set_error_dentry(dis->get_dentry(0));
      reply->set_flag_error_dir();
    } else if (dis->wants_base_dir()) {
      dout(7) << "handle_discover mds." << from
              << " wants basedir+" << dis->get_want().get_path()
              << " has " << *cur
              << dendl;
    } else {
      dout(7) << "handle_discover mds." << from
              << " wants " << dis->get_want().get_path()
              << " has " << *cur
              << dendl;
    }
  }

  ceph_assert(reply);

  // add content
  // do some fidgeting to include a dir if they asked for the base dir, or just root.
  for (unsigned i = 0;
       cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0);
       i++) {

    // -- figure out the dir

    // is *cur even a dir at all?
    if (!cur->is_dir()) {
      dout(7) << *cur << " not a dir" << dendl;
      reply->set_flag_error_dir();
      break;
    }

    // pick frag
    frag_t fg;
    if (dis->get_want().depth()) {
      // dentry specifies
      fg = cur->pick_dirfrag(dis->get_dentry(i));
    } else {
      // requester explicity specified the frag
      ceph_assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino()));
      fg = dis->get_base_dir_frag();
      if (!cur->dirfragtree.is_leaf(fg))
        fg = cur->dirfragtree[fg.value()];
    }
    CDir *curdir = cur->get_dirfrag(fg);

    if ((!curdir && !cur->is_auth()) ||
        (curdir && !curdir->is_auth())) {

      /* before:
       * ONLY set flag if empty!!
       * otherwise requester will wake up waiter(s) _and_ continue with discover,
       * resulting in duplicate discovers in flight,
       * which can wreak havoc when discovering rename srcdn (which may move)
       */

      if (reply->is_empty()) {
        // only hint if empty.
        //  someday this could be better, but right now the waiter logic isn't smart enough.

        // hint
        if (curdir) {
          dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl;
          reply->set_dir_auth_hint(curdir->authority().first);
        } else {
          dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for "
                  << *cur << dendl;
          reply->set_dir_auth_hint(cur->authority().first);
        }

        // note error dentry, if any
        //  NOTE: important, as it allows requester to issue an equivalent discover
        //        to whomever we hint at.
        if (dis->get_want().depth() > i)
          reply->set_error_dentry(dis->get_dentry(i));
      }

      break;
    }

    if (!curdir) { // open dir?
      if (cur->is_frozen()) {
        if (!reply->is_empty()) {
          dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl;
          break;
        }
        dout(7) << *cur << " is frozen, empty reply, waiting" << dendl;
        cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
        return;
      }
      curdir = cur->get_or_open_dirfrag(this, fg);
    } else if (curdir->is_frozen_tree() ||
               (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) {
      if (!reply->is_empty()) {
        dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl;
        break;
      }
      if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) {
        dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl;
        reply->set_flag_error_dir();
        break;
      }
      dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl;
      curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
      return;
    }

    // add dir
    if (curdir->get_version() == 0) {
      // fetch newly opened dir
    } else if (reply->is_empty() && !dis->wants_base_dir()) {
      dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl;
      // make sure the base frag is correct, though, if there was a refragment since the
      // original request was sent.
      reply->set_base_dir_frag(curdir->get_frag());
    } else {
      ceph_assert(!curdir->is_ambiguous_auth()); // would be frozen.
      if (!reply->trace.length())
        reply->starts_with = MDiscoverReply::DIR;
      encode_replica_dir(curdir, from, reply->trace);
      dout(7) << "handle_discover added dir " << *curdir << dendl;
    }

    // lookup
    CDentry *dn = 0;
    if (curdir->get_version() == 0) {
      // fetch newly opened dir
      ceph_assert(!curdir->has_bloom());
    } else if (dis->get_want().depth() > 0) {
      // lookup dentry
      dn = curdir->lookup(dis->get_dentry(i), snapid);
    } else
      break; // done!

    // incomplete dir?
    if (!dn) {
      // the bloom filter can prove a head dentry does not exist without a fetch
      if (!curdir->is_complete() &&
          !(snapid == CEPH_NOSNAP &&
            curdir->has_bloom() &&
            !curdir->is_in_bloom(dis->get_dentry(i)))) {
        // readdir
        dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl;
        if (reply->is_empty()) {
          // fetch and wait
          curdir->fetch(new C_MDS_RetryMessage(mds, dis),
                        dis->wants_base_dir() && curdir->get_version() == 0);
          return;
        } else {
          // initiate fetch, but send what we have so far
          curdir->fetch(0);
          break;
        }
      }

      if (snapid != CEPH_NOSNAP && !reply->is_empty()) {
        dout(7) << "dentry " << dis->get_dentry(i) << " snap " << snapid
                << " dne, non-empty reply, stopping" << dendl;
        break;
      }

      // send null dentry
      dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in "
              << *curdir << dendl;
      if (snapid == CEPH_NOSNAP)
        dn = curdir->add_null_dentry(dis->get_dentry(i));
      else
        dn = curdir->add_null_dentry(dis->get_dentry(i), snapid, snapid);
    }
    ceph_assert(dn);

    // don't add replica to purging dentry/inode
    if (dn->state_test(CDentry::STATE_PURGING)) {
      if (reply->is_empty())
        reply->set_flag_error_dn(dis->get_dentry(i));
      break;
    }

    CDentry::linkage_t *dnl = dn->get_linkage();

    // xlocked dentry?
    //  ...always block on non-tail items (they are unrelated)
    //  ...allow xlocked tail discovery _only_ if explicitly requested
    if (dn->lock.is_xlocked()) {
      // is this the last (tail) item in the discover traversal?
      if (dis->is_path_locked()) {
        dout(7) << "handle_discover allowing discovery of xlocked " << *dn << dendl;
      } else if (reply->is_empty()) {
        dout(7) << "handle_discover blocking on xlocked " << *dn << dendl;
        dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis));
        return;
      } else {
        dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl;
        break;
      }
    }

    // frozen inode?
    bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1);
    if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) {
      if (tailitem && dis->is_path_locked()) {
        dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl;
      } else if (reply->is_empty()) {
        dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl;
        dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis));
        return;
      } else {
        dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl;
        break;
      }
    }

    // add dentry
    if (!reply->trace.length())
      reply->starts_with = MDiscoverReply::DENTRY;
    encode_replica_dentry(dn, from, reply->trace);
    dout(7) << "handle_discover added dentry " << *dn << dendl;

    if (!dnl->is_primary()) break;  // stop on null or remote link.

    // add inode
    CInode *next = dnl->get_inode();
    ceph_assert(next->is_auth());

    encode_replica_inode(next, from, reply->trace, mds->mdsmap->get_up_features());
    dout(7) << "handle_discover added inode " << *next << dendl;

    // descend, keep going.
    cur = next;
    continue;
  }

  // how did we do?
  ceph_assert(!reply->is_empty());
  dout(7) << "handle_discover sending result back to asker mds." << from << dendl;
  mds->send_message(reply, dis->get_connection());
}
10582
// Process an MDiscoverReply: integrate the (dir, dentry, inode) trace into
// our cache, retire the matching tid, and wake waiters.  Error/auth-hint
// flags may trigger follow-up discovers addressed to the hinted rank.
void MDCache::handle_discover_reply(const cref_t<MDiscoverReply> &m)
{
  /*
  if (mds->get_state() < MDSMap::STATE_ACTIVE) {
    dout(0) << "discover_reply NOT ACTIVE YET" << dendl;
    return;
  }
  */
  dout(7) << "discover_reply " << *m << dendl;
  if (m->is_flag_error_dir())
    dout(7) << " flag error, dir" << dendl;
  if (m->is_flag_error_dn())
    dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl;

  MDSContext::vec finished, error;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  // starting point
  CInode *cur = get_inode(m->get_base_ino());
  auto p = m->trace.cbegin();

  int next = m->starts_with;

  // decrement discover counters
  if (m->get_tid()) {
    // note: this map iterator intentionally shadows the trace iterator 'p'
    map<ceph_tid_t,discover_info_t>::iterator p = discovers.find(m->get_tid());
    if (p != discovers.end()) {
      dout(10) << " found tid " << m->get_tid() << dendl;
      discovers.erase(p);
    } else {
      dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl;
    }
  }

  // discover may start with an inode
  if (!p.end() && next == MDiscoverReply::INODE) {
    decode_replica_inode(cur, p, NULL, finished);
    dout(7) << "discover_reply got base inode " << *cur << dendl;
    ceph_assert(cur->is_base());

    next = MDiscoverReply::DIR;

    // take waiters?
    if (cur->is_base() &&
        waiting_for_base_ino[from].count(cur->ino())) {
      finished.swap(waiting_for_base_ino[from][cur->ino()]);
      waiting_for_base_ino[from].erase(cur->ino());
    }
  }
  ceph_assert(cur);

  // loop over discover results.
  // indexes follow each ([[dir] dentry] inode)
  // can start, end with any type.
  while (!p.end()) {
    // dir
    frag_t fg;
    CDir *curdir = nullptr;
    if (next == MDiscoverReply::DIR) {
      decode_replica_dir(curdir, p, cur, mds_rank_t(m->get_source().num()), finished);
      if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) {
        // the sender refragmented; move waiters off the stale frag
        ceph_assert(m->get_wanted_base_dir());
        cur->take_dir_waiting(m->get_base_dir_frag(), finished);
      }
    } else {
      // note: this can only happen our first way around this loop.
      if (p.end() && m->is_flag_error_dn()) {
        fg = cur->pick_dirfrag(m->get_error_dentry());
        curdir = cur->get_dirfrag(fg);
      } else
        curdir = cur->get_dirfrag(m->get_base_dir_frag());
    }

    if (p.end())
      break;

    // dentry
    CDentry *dn = nullptr;
    decode_replica_dentry(dn, p, curdir, finished);

    if (p.end())
      break;

    // inode
    decode_replica_inode(cur, p, dn, finished);

    next = MDiscoverReply::DIR;
  }

  // dir error?
  // or dir_auth hint?
  if (m->is_flag_error_dir() && !cur->is_dir()) {
    // not a dir.
    cur->take_waiting(CInode::WAIT_DIR, error);
  } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) {
    mds_rank_t who = m->get_dir_auth_hint();
    if (who == mds->get_nodeid()) who = -1;
    if (who >= 0)
      dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl;


    if (m->get_wanted_base_dir()) {
      frag_t fg = m->get_base_dir_frag();
      CDir *dir = cur->get_dirfrag(fg);

      if (cur->is_waiting_for_dir(fg)) {
        if (cur->is_auth())
          cur->take_waiting(CInode::WAIT_DIR, finished);
        else if (dir || !cur->dirfragtree.is_leaf(fg))
          cur->take_dir_waiting(fg, finished);
        else
          // re-discover from the hinted auth
          discover_dir_frag(cur, fg, 0, who);
      } else
        dout(7) << " doing nothing, nobody is waiting for dir" << dendl;
    }

    // try again?
    if (m->get_error_dentry().length()) {
      frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
      CDir *dir = cur->get_dirfrag(fg);
      // wanted a dentry
      if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) {
        if (dir->is_auth() || dir->lookup(m->get_error_dentry())) {
          dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
                                   m->get_wanted_snapid(), finished);
        } else {
          filepath relpath(m->get_error_dentry(), 0);
          discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->is_path_locked());
        }
      } else
        dout(7) << " doing nothing, have dir but nobody is waiting on dentry "
                << m->get_error_dentry() << dendl;
    }
  } else if (m->is_flag_error_dn()) {
    // sender refused to replicate a purging dentry
    frag_t fg = cur->pick_dirfrag(m->get_error_dentry());
    CDir *dir = cur->get_dirfrag(fg);
    if (dir) {
      if (dir->is_auth()) {
        dir->take_sub_waiting(finished);
      } else {
        dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(),
                                 m->get_wanted_snapid(), error);
      }
    }
  }

  // waiters
  finish_contexts(g_ceph_context, error, -ENOENT);  // finish errors directly
  mds->queue_waiters(finished);
}
10733
10734
10735
10736 // ----------------------------
10737 // REPLICAS
10738
10739
// Serialize a replica of 'dir' for rank 'to': dirfrag id, a fresh replica
// nonce (this also registers 'to' as a replica holder), then the dir's
// base state.  Must stay in lockstep with decode_replica_dir().
void MDCache::encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl)
{
  ENCODE_START(1, 1, bl);
  dirfrag_t df = dir->dirfrag();
  encode(df, bl);
  __u32 nonce = dir->add_replica(to);
  encode(nonce, bl);
  dir->_encode_base(bl);
  ENCODE_FINISH(bl);
}
10750
// Serialize a replica of 'dn' for rank 'to': name + last (the key), a
// fresh replica nonce, then first, the remote linkage, the lock state and
// a recovery flag.  Must stay in lockstep with decode_replica_dentry().
void MDCache::encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl)
{
  ENCODE_START(1, 1, bl);
  encode(dn->get_name(), bl);
  encode(dn->last, bl);

  __u32 nonce = dn->add_replica(to);
  encode(nonce, bl);
  encode(dn->first, bl);
  encode(dn->linkage.remote_ino, bl);
  encode(dn->linkage.remote_d_type, bl);
  dn->lock.encode_state_for_replica(bl);
  // replica may need to run lock recovery if we are not yet active
  bool need_recover = mds->get_state() < MDSMap::STATE_ACTIVE;
  encode(need_recover, bl);
  ENCODE_FINISH(bl);
}
10767
// Serialize a replica of auth inode 'in' for rank 'to': ino + last (the
// key), a fresh replica nonce, base state, replica lock state and the
// state word.  Must stay in lockstep with the corresponding decode.
void MDCache::encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
                                   uint64_t features)
{
  ENCODE_START(2, 1, bl);
  ceph_assert(in->is_auth());
  encode(in->inode.ino, bl);  // bleh, minor assymetry here
  encode(in->last, bl);

  __u32 nonce = in->add_replica(to);
  encode(nonce, bl);

  in->_encode_base(bl, features);
  in->_encode_locks_state_for_replica(bl, mds->get_state() < MDSMap::STATE_ACTIVE);

  __u32 state = in->state;
  encode(state, bl);

  ENCODE_FINISH(bl);
}
10787
void MDCache::decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from,
			       MDSContext::vec& finished)
{
  // Decode a replicated dirfrag (counterpart of encode_replica_dir) under
  // inode 'diri', sent by rank 'from'.  On return 'dir' points at the
  // (possibly newly added) replica dirfrag; waiters on a newly created frag
  // are collected into 'finished'.
  DECODE_START(1, p);
  dirfrag_t df;
  decode(df, p);

  ceph_assert(diri->ino() == df.ino);

  // add it (_replica_)
  dir = diri->get_dirfrag(df.frag);

  if (dir) {
    // had replica. update w/ new nonce.
    __u32 nonce;
    decode(nonce, p);
    dir->set_replica_nonce(nonce);
    dir->_decode_base(p);
    dout(7) << __func__ << " had " << *dir << " nonce " << dir->replica_nonce << dendl;
  } else {
    // force frag to leaf in the diri tree
    // (our fragtree copy may be stale relative to the auth's frag)
    if (!diri->dirfragtree.is_leaf(df.frag)) {
      dout(7) << __func__ << " forcing frag " << df.frag << " to leaf in the fragtree "
	      << diri->dirfragtree << dendl;
      diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag);
    }
    // add replica.
    dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) );
    __u32 nonce;
    decode(nonce, p);
    dir->set_replica_nonce(nonce);
    dir->_decode_base(p);
    // is this a dir_auth delegation boundary?
    // if the sender is not the inode's auth (or auth is ambiguous/base),
    // the new frag starts a separate subtree owned by 'from'
    if (from != diri->authority().first ||
	diri->is_ambiguous_auth() ||
	diri->is_base())
      adjust_subtree_auth(dir, from);

    dout(7) << __func__ << " added " << *dir << " nonce " << dir->replica_nonce << dendl;
    // get waiters
    diri->take_dir_waiting(df.frag, finished);
  }
  DECODE_FINISH(p);
}
10832
void MDCache::decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished)
{
  // Decode a replicated dentry (counterpart of encode_replica_dentry) into
  // dirfrag 'dir'.  On return 'dn' points at the (possibly newly added)
  // replica dentry; dentry waiters are collected into 'finished'.
  DECODE_START(1, p);
  string name;
  snapid_t last;
  decode(name, p);
  decode(last, p);

  dn = dir->lookup(name, last);

  // have it?
  bool is_new = false;
  if (dn) {
    is_new = false;
    dout(7) << __func__ << " had " << *dn << dendl;
  } else {
    is_new = true;
    dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last);
    dout(7) << __func__ << " added " << *dn << dendl;
  }

  __u32 nonce;
  decode(nonce, p);
  dn->set_replica_nonce(nonce);
  decode(dn->first, p);  // fixes the placeholder 'first' used above

  inodeno_t rino;
  unsigned char rdtype;
  decode(rino, p);
  decode(rdtype, p);
  dn->lock.decode_state(p, is_new);

  bool need_recover;
  decode(need_recover, p);

  if (is_new) {
    // only wire up linkage/lock-recovery for a dentry we just created;
    // an existing replica keeps its current linkage
    if (rino)
      dir->link_remote_inode(dn, rino, rdtype);
    if (need_recover)
      dn->lock.mark_need_recover();
  }

  dir->take_dentry_waiting(name, dn->first, dn->last, finished);
  DECODE_FINISH(p);
}
10878
void MDCache::decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished)
{
  // Decode a replicated inode (counterpart of encode_replica_inode).  If the
  // inode is new it is added to the cache and, when 'dn' is given and null,
  // linked as that dentry's primary inode.
  DECODE_START(2, p);
  inodeno_t ino;
  snapid_t last;
  __u32 nonce;
  decode(ino, p);
  decode(last, p);
  decode(nonce, p);
  in = get_inode(ino, last);
  if (!in) {
    in = new CInode(this, false, 1, last);
    in->set_replica_nonce(nonce);
    in->_decode_base(p);
    in->_decode_locks_state_for_replica(p, true);
    add_inode(in);
    // root and per-mds dirs have well-known authorities
    if (in->ino() == MDS_INO_ROOT)
      in->inode_auth.first = 0;
    else if (in->is_mdsdir())
      in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET;
    dout(10) << __func__ << " added " << *in << dendl;
    if (dn) {
      ceph_assert(dn->get_linkage()->is_null());
      dn->dir->link_primary_inode(dn, in);
    }
  } else {
    in->set_replica_nonce(nonce);
    in->_decode_base(p);
    in->_decode_locks_state_for_replica(p, false);
    dout(10) << __func__ << " had " << *in << dendl;
  }

  if (dn) {
    // informational only: an existing inode may be linked elsewhere
    if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in)
      dout(10) << __func__ << " different linkage in dentry " << *dn << dendl;
  }

  if (struct_v >= 2) {
    // v2: replicated state bits (see encode_replica_inode)
    __u32 s;
    decode(s, p);
    s &= CInode::MASK_STATE_REPLICATED;
    if (s & CInode::STATE_RANDEPHEMERALPIN) {
      dout(10) << "replica inode is random ephemeral pinned" << dendl;
      in->set_ephemeral_rand(true);
    }
  }

  DECODE_FINISH(p);
}
10928
10929
void MDCache::encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl)
{
  // Replicate a stray dentry plus the whole chain of objects the peer needs
  // to instantiate it: mdsdir inode, mdsdir dirfrag, stray-dir dentry,
  // stray-dir inode, stray dirfrag, and finally the stray dentry itself.
  // Order must stay symmetric with decode_replica_stray().
  ENCODE_START(1, 1, bl);
  uint64_t features = mds->mdsmap->get_up_features();
  encode_replica_inode(get_myin(), who, bl, features);
  encode_replica_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl);
  encode_replica_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl);
  encode_replica_inode(straydn->get_dir()->inode, who, bl, features);
  encode_replica_dir(straydn->get_dir(), who, bl);
  encode_replica_dentry(straydn, who, bl);
  ENCODE_FINISH(bl);
}
10942
void MDCache::decode_replica_stray(CDentry *&straydn, const bufferlist &bl, mds_rank_t from)
{
  // Counterpart of encode_replica_stray(): rebuild the path of replicas from
  // the sender's mdsdir down to the stray dentry, then wake any waiters.
  MDSContext::vec finished;
  auto p = bl.cbegin();

  DECODE_START(1, p);
  CInode *mdsin = nullptr;
  decode_replica_inode(mdsin, p, NULL, finished);
  CDir *mdsdir = nullptr;
  decode_replica_dir(mdsdir, p, mdsin, from, finished);
  CDentry *straydirdn = nullptr;
  decode_replica_dentry(straydirdn, p, mdsdir, finished);
  CInode *strayin = nullptr;
  decode_replica_inode(strayin, p, straydirdn, finished);
  CDir *straydir = nullptr;
  decode_replica_dir(straydir, p, strayin, from, finished);

  decode_replica_dentry(straydn, p, straydir, finished);
  if (!finished.empty())
    mds->queue_waiters(finished);
  DECODE_FINISH(p);
}
10965
10966
10967 int MDCache::send_dir_updates(CDir *dir, bool bcast)
10968 {
10969 // this is an FYI, re: replication
10970
10971 set<mds_rank_t> who;
10972 if (bcast) {
10973 mds->get_mds_map()->get_active_mds_set(who);
10974 } else {
10975 for (const auto &p : dir->get_replicas()) {
10976 who.insert(p.first);
10977 }
10978 }
10979
10980 dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl;
10981
10982 filepath path;
10983 dir->inode->make_path(path);
10984
10985 mds_rank_t whoami = mds->get_nodeid();
10986 for (set<mds_rank_t>::iterator it = who.begin();
10987 it != who.end();
10988 ++it) {
10989 if (*it == whoami) continue;
10990 //if (*it == except) continue;
10991 dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl;
10992
10993 std::set<int32_t> s;
10994 for (const auto &r : dir->dir_rep_by) {
10995 s.insert(r);
10996 }
10997 mds->send_message_mds(make_message<MDirUpdate>(mds->get_nodeid(), dir->dirfrag(), dir->dir_rep, s, path, bcast), *it);
10998 }
10999
11000 return 0;
11001 }
11002
void MDCache::handle_dir_update(const cref_t<MDirUpdate> &m)
{
  // Handle an advisory dir_update from another MDS: refresh our copy of the
  // dirfrag's replication policy, optionally discovering the dirfrag first.
  dirfrag_t df = m->get_dirfrag();
  CDir *dir = get_dirfrag(df);
  if (!dir) {
    dout(5) << "dir_update on " << df << ", don't have it" << dendl;

    // discover it?
    if (m->should_discover()) {
      // only try once!
      // this is key to avoid a fragtree update race, among other things.
      m->inc_tried_discover();
      vector<CDentry*> trace;
      CInode *in;
      filepath path = m->get_path();
      dout(5) << "trying discover on dir_update for " << path << dendl;
      CF_MDS_RetryMessageFactory cf(mds, m);
      MDRequestRef null_ref;
      int r = path_traverse(null_ref, cf, path, MDS_TRAVERSE_DISCOVER, &trace, &in);
      if (r > 0)
        // traversal in progress; message will be retried by the factory
        return;
      if (r == 0 &&
	  in->ino() == df.ino &&
	  in->get_approx_dirfrag(df.frag) == NULL) {
	// have the inode but not the frag; open it and retry this message
	open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
	return;
      }
    }

    return;
  }

  if (!m->has_tried_discover()) {
    // Update if it already exists. Otherwise it got updated by discover reply.
    dout(5) << "dir_update on " << *dir << dendl;
    dir->dir_rep = m->get_dir_rep();
    dir->dir_rep_by.clear();
    for (const auto &e : m->get_dir_rep_by()) {
      dir->dir_rep_by.insert(e);
    }
  }
}
11045
11046
11047
11048
11049
11050 // LINK
11051
11052 void MDCache::encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl)
11053 {
11054 ENCODE_START(1, 1, bl);
11055 inodeno_t ino = dnl->get_remote_ino();
11056 encode(ino, bl);
11057 __u8 d_type = dnl->get_remote_d_type();
11058 encode(d_type, bl);
11059 ENCODE_FINISH(bl);
11060 }
11061
11062 void MDCache::decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p)
11063 {
11064 DECODE_START(1, p);
11065 inodeno_t ino;
11066 __u8 d_type;
11067 decode(ino, p);
11068 decode(d_type, p);
11069 dout(10) << __func__ << " remote " << ino << " " << d_type << dendl;
11070 dir->link_remote_inode(dn, ino, d_type);
11071 DECODE_FINISH(p);
11072 }
11073
void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
{
  // Notify replicas that dentry 'dn' has been linked (primary or remote).
  dout(7) << __func__ << " " << *dn << dendl;

  CDir *subtree = get_subtree_root(dn->get_dir());
  for (const auto &p : dn->get_replicas()) {
    // don't tell (rename) witnesses; they already know
    if (mdr.get() && mdr->more()->witnessed.count(p.first))
      continue;
    // skip ranks that cannot process the message yet (pre-rejoin, or
    // still being gathered during rejoin)
    if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
	(mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
	 rejoin_gather.count(p.first)))
      continue;
    CDentry::linkage_t *dnl = dn->get_linkage();
    auto m = make_message<MDentryLink>(subtree->dirfrag(), dn->get_dir()->dirfrag(), dn->get_name(), dnl->is_primary());
    if (dnl->is_primary()) {
      // primary link: ship the full replica inode
      dout(10) << __func__ << " primary " << *dnl->get_inode() << dendl;
      encode_replica_inode(dnl->get_inode(), p.first, m->bl,
		      mds->mdsmap->get_up_features());
    } else if (dnl->is_remote()) {
      // remote link: just ino + d_type
      encode_remote_dentry_link(dnl, m->bl);
    } else
      ceph_abort();   // aie, bad caller!
    mds->send_message_mds(m, p.first);
  }
}
11100
void MDCache::handle_dentry_link(const cref_t<MDentryLink> &m)
{
  // Apply a dentry-link notification from the auth MDS to our replica
  // dentry (see send_dentry_link).
  CDentry *dn = NULL;
  CDir *dir = get_dirfrag(m->get_dirfrag());
  if (!dir) {
    dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
  } else {
    dn = dir->lookup(m->get_dn());
    if (!dn) {
      dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
    } else {
      dout(7) << __func__ << " on " << *dn << dendl;
      CDentry::linkage_t *dnl = dn->get_linkage();

      // the auth only links a dentry we replicate as null
      ceph_assert(!dn->is_auth());
      ceph_assert(dnl->is_null());
    }
  }

  auto p = m->bl.cbegin();
  MDSContext::vec finished;
  if (dn) {
    if (m->get_is_primary()) {
      // primary link.
      CInode *in = nullptr;
      decode_replica_inode(in, p, dn, finished);
    } else {
      // remote link, easy enough.
      decode_remote_dentry_link(dir, dn, p);
    }
  } else {
    ceph_abort();
  }

  if (!finished.empty())
    mds->queue_waiters(finished);

  return;
}
11140
11141
11142 // UNLINK
11143
void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr)
{
  // Notify replicas that dentry 'dn' has been unlinked; when the inode moved
  // to a stray dentry ('straydn'), ship the stray replica + snap blob along.
  dout(10) << __func__ << " " << *dn << dendl;
  // share unlink news with replicas
  set<mds_rank_t> replicas;
  dn->list_replicas(replicas);
  bufferlist snapbl;
  if (straydn) {
    straydn->list_replicas(replicas);
    CInode *strayin = straydn->get_linkage()->get_inode();
    strayin->encode_snap_blob(snapbl);
  }
  for (set<mds_rank_t>::iterator it = replicas.begin();
       it != replicas.end();
       ++it) {
    // don't tell (rmdir) witnesses; they already know
    if (mdr.get() && mdr->more()->witnessed.count(*it))
      continue;

    // skip ranks that cannot process the message yet (pre-rejoin, or
    // still being gathered during rejoin)
    if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN ||
	(mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN &&
	 rejoin_gather.count(*it)))
      continue;

    auto unlink = make_message<MDentryUnlink>(dn->get_dir()->dirfrag(), dn->get_name());
    if (straydn) {
      encode_replica_stray(straydn, *it, unlink->straybl);
      unlink->snapbl = snapbl;
    }
    mds->send_message_mds(unlink, *it);
  }
}
11176
void MDCache::handle_dentry_unlink(const cref_t<MDentryUnlink> &m)
{
  // Apply a dentry-unlink notification from the auth MDS: move a primary
  // inode to the replicated stray dentry (or simply drop a remote link).

  // straydn: decode the stray replica chain first, if one was sent
  CDentry *straydn = nullptr;
  if (m->straybl.length())
    decode_replica_stray(straydn, m->straybl, mds_rank_t(m->get_source().num()));

  CDir *dir = get_dirfrag(m->get_dirfrag());
  if (!dir) {
    dout(7) << __func__ << " don't have dirfrag " << m->get_dirfrag() << dendl;
  } else {
    CDentry *dn = dir->lookup(m->get_dn());
    if (!dn) {
      dout(7) << __func__ << " don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
    } else {
      dout(7) << __func__ << " on " << *dn << dendl;
      CDentry::linkage_t *dnl = dn->get_linkage();

      // open inode?
      if (dnl->is_primary()) {
	// relink the inode under the stray dentry
	CInode *in = dnl->get_inode();
	dn->dir->unlink_inode(dn);
	ceph_assert(straydn);
	straydn->dir->link_primary_inode(straydn, in);

	// in->first is lazily updated on replica; drag it forward so
	// that we always keep it in sync with the dnq
	ceph_assert(straydn->first >= in->first);
	in->first = straydn->first;

	// update subtree map?
	if (in->is_dir())
	  adjust_subtree_after_rename(in, dir, false);

	if (m->snapbl.length()) {
	  bool hadrealm = (in->snaprealm ? true : false);
	  in->decode_snap_blob(m->snapbl);
	  ceph_assert(in->snaprealm);
	  ceph_assert(in->snaprealm->have_past_parents_open());
	  if (!hadrealm)
	    do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
	}

	// send caps to auth (if we're not already)
	if (in->is_any_caps() &&
	    !in->state_test(CInode::STATE_EXPORTINGCAPS))
	  migrator->export_caps(in);

	// stray dentry consumed; don't trim it below
	straydn = NULL;
      } else {
	// remote link: a stray replica should not have been sent
	ceph_assert(!straydn);
	ceph_assert(dnl->is_remote());
	dn->dir->unlink_inode(dn);
      }
      ceph_assert(dnl->is_null());
    }
  }

  // race with trim_dentry()
  // an unused stray replica would otherwise linger; trim it immediately
  if (straydn) {
    ceph_assert(straydn->get_num_ref() == 0);
    ceph_assert(straydn->get_linkage()->is_null());
    expiremap ex;
    trim_dentry(straydn, ex);
    send_expire_messages(ex);
  }
}
11244
11245
11246
11247
11248
11249
11250 // ===================================================================
11251
11252
11253
11254 // ===================================================================
11255 // FRAGMENT
11256
11257
11258 /**
11259 * adjust_dir_fragments -- adjust fragmentation for a directory
11260 *
11261 * @param diri directory inode
11262 * @param basefrag base fragment
11263 * @param bits bit adjustment. positive for split, negative for merge.
11264 */
11265 void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
11266 std::vector<CDir*>* resultfrags,
11267 MDSContext::vec& waiters,
11268 bool replay)
11269 {
11270 dout(10) << "adjust_dir_fragments " << basefrag << " " << bits
11271 << " on " << *diri << dendl;
11272
11273 auto&& p = diri->get_dirfrags_under(basefrag);
11274
11275 adjust_dir_fragments(diri, p.second, basefrag, bits, resultfrags, waiters, replay);
11276 }
11277
CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay)
{
  // Ensure a CDir exists for fragment 'fg' of 'diri', splitting an open
  // ancestor frag or merging open descendant frags as needed.  Returns the
  // dirfrag, or nullptr if no relevant frags are open at all.
  CDir *dir = diri->get_dirfrag(fg);
  if (dir)
    return dir;

  dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl;

  std::vector<CDir*> src, result;
  MDSContext::vec waiters;

  // split a parent?
  // walk up the fragtree looking for an open ancestor we can split down
  frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg);
  while (1) {
    CDir *pdir = diri->get_dirfrag(parent);
    if (pdir) {
      int split = fg.bits() - parent.bits();
      dout(10) << " splitting parent by " << split << " " << *pdir << dendl;
      src.push_back(pdir);
      adjust_dir_fragments(diri, src, parent, split, &result, waiters, replay);
      dir = diri->get_dirfrag(fg);
      if (dir) {
	dout(10) << "force_dir_fragment result " << *dir << dendl;
	break;
      }
    }
    if (parent == frag_t())
      break;  // reached the root frag; nothing more to split
    frag_t last = parent;
    parent = parent.parent();
    dout(10) << " " << last << " parent is " << parent << dendl;
  }

  if (!dir) {
    // hoover up things under fg?
    // no splittable ancestor; try merging the open frags below fg instead
    {
      auto&& p = diri->get_dirfrags_under(fg);
      src.insert(std::end(src), std::cbegin(p.second), std::cend(p.second));
    }
    if (src.empty()) {
      dout(10) << "force_dir_fragment no frags under " << fg << dendl;
    } else {
      dout(10) << " will combine frags under " << fg << ": " << src << dendl;
      adjust_dir_fragments(diri, src, fg, 0, &result, waiters, replay);
      dir = result.front();
      dout(10) << "force_dir_fragment result " << *dir << dendl;
    }
  }
  if (!replay)
    mds->queue_waiters(waiters);
  return dir;
}
11330
void MDCache::adjust_dir_fragments(CInode *diri,
				   const std::vector<CDir*>& srcfrags,
				   frag_t basefrag, int bits,
				   std::vector<CDir*>* resultfrags,
				   MDSContext::vec& waiters,
				   bool replay)
{
  // Main fragment-adjustment routine: split srcfrags (bits > 0) or merge
  // them into basefrag (bits <= 0), updating diri's fragtree and the
  // subtree map as necessary.  New frags are appended to *resultfrags.
  dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits
	   << " srcfrags " << srcfrags
	   << " on " << *diri << dendl;

  // adjust fragtree
  // yuck. we may have discovered the inode while it was being fragmented.
  if (!diri->dirfragtree.is_leaf(basefrag))
    diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag);

  if (bits > 0)
    diri->dirfragtree.split(basefrag, bits);
  dout(10) << " new fragtree is " << diri->dirfragtree << dendl;

  if (srcfrags.empty())
    return;

  // split
  CDir *parent_dir = diri->get_parent_dir();
  CDir *parent_subtree = 0;
  if (parent_dir)
    parent_subtree = get_subtree_root(parent_dir);

  ceph_assert(srcfrags.size() >= 1);
  if (bits > 0) {
    // SPLIT
    ceph_assert(srcfrags.size() == 1);
    CDir *dir = srcfrags.front();

    dir->split(bits, resultfrags, waiters, replay);

    // did i change the subtree map?
    if (dir->is_subtree_root()) {
      // new frags are now separate subtrees
      for (const auto& dir : *resultfrags) {
	subtrees[dir].clear();   // new frag is now its own subtree
      }
      
      // was i a bound?
      // replace the old frag with the new frags in the parent's bound set
      if (parent_subtree) {
	ceph_assert(subtrees[parent_subtree].count(dir));
	subtrees[parent_subtree].erase(dir);
	for (const auto& dir : *resultfrags) {
	  ceph_assert(dir->is_subtree_root());
	  subtrees[parent_subtree].insert(dir);
	}
      }
      
      // adjust my bounds.
      // redistribute the old frag's bounds to whichever new frag now
      // contains each bound's parent
      set<CDir*> bounds;
      bounds.swap(subtrees[dir]);
      subtrees.erase(dir);
      for (set<CDir*>::iterator p = bounds.begin();
	   p != bounds.end();
	   ++p) {
	CDir *frag = get_subtree_root((*p)->get_parent_dir());
	subtrees[frag].insert(*p);
      }

      show_subtrees(10);
    }
    
    diri->close_dirfrag(dir->get_frag());
    
  } else {
    // MERGE

    // are my constituent bits subtrees?  if so, i will be too.
    // (it's all or none, actually.)
    bool any_subtree = false, any_non_subtree = false;
    for (const auto& dir : srcfrags) {
      if (dir->is_subtree_root())
	any_subtree = true;
      else
	any_non_subtree = true;
    }
    ceph_assert(!any_subtree || !any_non_subtree);

    set<CDir*> new_bounds;
    if (any_subtree)  {
      for (const auto& dir : srcfrags) {
	// this simplifies the code that find subtrees underneath the dirfrag
	if (!dir->is_subtree_root()) {
	  dir->state_set(CDir::STATE_AUXSUBTREE);
	  adjust_subtree_auth(dir, mds->get_nodeid());
	}
      }

      // collect each srcfrag's bounds into new_bounds and remove the
      // srcfrag from the subtree map
      for (const auto& dir : srcfrags) {
	ceph_assert(dir->is_subtree_root());
	dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl;
	map<CDir*, set<CDir*> >::iterator q = subtrees.find(dir);
	set<CDir*>::iterator r = q->second.begin();
	while (r != subtrees[dir].end()) {
	  new_bounds.insert(*r);
	  subtrees[dir].erase(r++);
	}
	subtrees.erase(q);

	// remove myself as my parent's bound
	if (parent_subtree)
	  subtrees[parent_subtree].erase(dir);
      }
    }
    
    // merge
    CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth());
    f->merge(srcfrags, waiters, replay);

    if (any_subtree) {
      // the merged frag inherits the collected bounds and replaces the
      // srcfrags as the parent subtree's bound
      ceph_assert(f->is_subtree_root());
      subtrees[f].swap(new_bounds);
      if (parent_subtree)
	subtrees[parent_subtree].insert(f);
      
      show_subtrees(10);
    }

    resultfrags->push_back(f);
  }
}
11458
11459
11460 class C_MDC_FragmentFrozen : public MDSInternalContext {
11461 MDCache *mdcache;
11462 MDRequestRef mdr;
11463 public:
11464 C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) :
11465 MDSInternalContext(m->mds), mdcache(m), mdr(r) {}
11466 void finish(int r) override {
11467 mdcache->fragment_frozen(mdr, r);
11468 }
11469 };
11470
bool MDCache::can_fragment(CInode *diri, const std::vector<CDir*>& dirs)
{
  // Preconditions for starting a split/merge of 'dirs' under 'diri'.
  // Returns false (with a dout(7) explaining why) on the first failed check.
  if (is_readonly()) {
    dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl;
    return false;
  }
  if (mds->is_cluster_degraded()) {
    dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl;
    return false;
  }
  if (diri->get_parent_dir() &&
      diri->get_parent_dir()->get_inode()->is_stray()) {
    dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl;
    return false;
  }
  if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) {
    dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl;
    return false;
  }

  if (diri->scrub_is_in_progress()) {
    dout(7) << "can_fragment: scrub in progress" << dendl;
    return false;
  }

  // per-frag checks: must be auth, healthy, not already fragmenting,
  // and not frozen/freezing (e.g. by an in-flight export)
  for (const auto& dir : dirs) {
    if (dir->state_test(CDir::STATE_FRAGMENTING)) {
      dout(7) << "can_fragment: already fragmenting " << *dir << dendl;
      return false;
    }
    if (!dir->is_auth()) {
      dout(7) << "can_fragment: not auth on " << *dir << dendl;
      return false;
    }
    if (dir->is_bad()) {
      dout(7) << "can_fragment: bad dirfrag " << *dir << dendl;
      return false;
    }
    if (dir->is_frozen() ||
	dir->is_freezing()) {
      dout(7) << "can_fragment: can't merge, freezing|frozen.  wait for other exports to finish first." << dendl;
      return false;
    }
  }

  return true;
}
11518
void MDCache::split_dir(CDir *dir, int bits)
{
  // Kick off a split of 'dir' into 2^bits child frags via an internal
  // FRAGMENTDIR request.  Silently drops the attempt if preconditions fail.
  dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
  ceph_assert(dir->is_auth());
  CInode *diri = dir->inode;

  std::vector<CDir*> dirs;
  dirs.push_back(dir);

  if (!can_fragment(diri, dirs)) {
    dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
    return;
  }

  // cap total frag depth
  if (dir->frag.bits() + bits > 24) {
    dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
    return;
  }

  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
  mdr->more()->fragment_base = dir->dirfrag();

  // track the in-flight op; fragment_* continuations look it up by base frag
  ceph_assert(fragments.count(dir->dirfrag()) == 0);
  fragment_info_t& info = fragments[dir->dirfrag()];
  info.mdr = mdr;
  info.dirs.push_back(dir);
  info.bits = bits;
  info.last_cum_auth_pins_change = ceph_clock_now();

  fragment_freeze_dirs(dirs);
  // initial mark+complete pass
  fragment_mark_and_complete(mdr);
}
11552
void MDCache::merge_dir(CInode *diri, frag_t frag)
{
  // Kick off a merge of all open frags under 'frag' back into a single
  // dirfrag via an internal FRAGMENTDIR request.  Silently drops the attempt
  // if preconditions fail.
  dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;

  auto&& [all, dirs] = diri->get_dirfrags_under(frag);
  if (!all) {
    dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
    return;
  }

  if (diri->dirfragtree.is_leaf(frag)) {
    dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
    return;
  }

  if (!can_fragment(diri, dirs))
    return;

  // bits is derived from how much deeper the first child frag is
  CDir *first = dirs.front();
  int bits = first->get_frag().bits() - frag.bits();
  dout(10) << " we are merging by " << bits << " bits" << dendl;

  dirfrag_t basedirfrag(diri->ino(), frag);
  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
  mdr->more()->fragment_base = basedirfrag;

  // track the in-flight op; negative bits means "merge"
  ceph_assert(fragments.count(basedirfrag) == 0);
  fragment_info_t& info = fragments[basedirfrag];
  info.mdr = mdr;
  info.dirs = dirs;
  info.bits = -bits;
  info.last_cum_auth_pins_change = ceph_clock_now();

  fragment_freeze_dirs(dirs);
  // initial mark+complete pass
  fragment_mark_and_complete(mdr);
}
11590
void MDCache::fragment_freeze_dirs(const std::vector<CDir*>& dirs)
{
  // Begin freezing each dirfrag involved in a fragment op, holding an auth
  // pin until the mark+complete pass, and normalize the subtree status so
  // that either all frags are subtree roots or none are.
  bool any_subtree = false, any_non_subtree = false;
  for (const auto& dir : dirs) {
    dir->auth_pin(dir);  // until we mark and complete them
    dir->state_set(CDir::STATE_FRAGMENTING);
    dir->freeze_dir();
    ceph_assert(dir->is_freezing_dir());

    if (dir->is_subtree_root())
      any_subtree = true;
    else
      any_non_subtree = true;
  }

  if (any_subtree && any_non_subtree) {
    // either all dirfrags are subtree roots or all are not.
    // promote the non-roots to auxiliary subtrees owned by us.
    for (const auto& dir : dirs) {
      if (dir->is_subtree_root()) {
	ceph_assert(dir->state_test(CDir::STATE_AUXSUBTREE));
      } else {
	dir->state_set(CDir::STATE_AUXSUBTREE);
	adjust_subtree_auth(dir, mds->get_nodeid());
      }
    }
  }
}
11618
11619 class C_MDC_FragmentMarking : public MDCacheContext {
11620 MDRequestRef mdr;
11621 public:
11622 C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
11623 void finish(int r) override {
11624 mdcache->fragment_mark_and_complete(mdr);
11625 }
11626 };
11627
void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
{
  // Second phase of a fragment op: make every involved dirfrag complete in
  // memory, pin all its dentries with PIN_FRAGMENTING, then wait for the
  // freeze to finish before calling fragment_frozen().  Re-entered via
  // C_MDC_FragmentMarking whenever a fetch/commit had to be waited on.
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  if (it == fragments.end() || it->second.mdr != mdr) {
    // op was cancelled (e.g. by find_stale_fragment_freeze) while we waited
    dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
    request_finish(mdr);
    return;
  }

  fragment_info_t& info = it->second;
  CInode *diri = info.dirs.front()->get_inode();
  dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;

  MDSGatherBuilder gather(g_ceph_context);
  
  for (const auto& dir : info.dirs) {
    bool ready = true;
    if (!dir->is_complete()) {
      dout(15) << " fetching incomplete " << *dir << dendl;
      dir->fetch(gather.new_sub(), true);  // ignore authpinnability
      ready = false;
    } else if (dir->get_frag() == frag_t()) {
      // The COMPLETE flag gets lost if we fragment a new dirfrag, then rollback
      // the operation. To avoid CDir::fetch() complaining about missing object,
      // we commit new dirfrag first.
      if (dir->state_test(CDir::STATE_CREATING)) {
	dout(15) << " waiting until new dir gets journaled " << *dir << dendl;
	dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub());
	ready = false;
      } else if (dir->is_new()) {
	dout(15) << " committing new " << *dir << dendl;
	ceph_assert(dir->is_dirty());
	dir->commit(0, gather.new_sub(), true);
	ready = false;
      }
    }
    if (!ready)
      continue;

    if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
      // pin every dentry so it cannot be trimmed mid-fragmentation,
      // then release the auth pin taken by fragment_freeze_dirs()
      dout(15) << " marking " << *dir << dendl;
      for (auto &p : dir->items) {
	CDentry *dn = p.second;
	dn->get(CDentry::PIN_FRAGMENTING);
	ceph_assert(!dn->state_test(CDentry::STATE_FRAGMENTING));
	dn->state_set(CDentry::STATE_FRAGMENTING);
      }
      dir->state_set(CDir::STATE_DNPINNEDFRAG);
      dir->auth_unpin(dir);
    } else {
      dout(15) << " already marked " << *dir << dendl;
    }
  }
  if (gather.has_subs()) {
    // still fetching/committing; retry this phase when done
    gather.set_finisher(new C_MDC_FragmentMarking(this, mdr));
    gather.activate();
    return;
  }

  // all marked; now wait for the freezes started earlier to complete
  for (const auto& dir : info.dirs) {
    if (!dir->is_frozen_dir()) {
      ceph_assert(dir->is_freezing_dir());
      dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub());
    }
  }
  if (gather.has_subs()) {
    gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr));
    gather.activate();
    // flush log so that request auth_pins are retired
    mds->mdlog->flush();
    return;
  }

  fragment_frozen(mdr, 0);
}
11704
11705 void MDCache::fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs)
11706 {
11707 dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl;
11708 for (const auto& dir : dirs) {
11709 dout(10) << " frag " << *dir << dendl;
11710
11711 ceph_assert(dir->state_test(CDir::STATE_FRAGMENTING));
11712 dir->state_clear(CDir::STATE_FRAGMENTING);
11713
11714 if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
11715 dir->state_clear(CDir::STATE_DNPINNEDFRAG);
11716
11717 for (auto &p : dir->items) {
11718 CDentry *dn = p.second;
11719 ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
11720 dn->state_clear(CDentry::STATE_FRAGMENTING);
11721 dn->put(CDentry::PIN_FRAGMENTING);
11722 }
11723 } else {
11724 dir->auth_unpin(dir);
11725 }
11726
11727 dir->unfreeze_dir();
11728 }
11729 }
11730
11731 bool MDCache::fragment_are_all_frozen(CDir *dir)
11732 {
11733 ceph_assert(dir->is_frozen_dir());
11734 map<dirfrag_t,fragment_info_t>::iterator p;
11735 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11736 p != fragments.end() && p->first.ino == dir->ino();
11737 ++p) {
11738 if (p->first.frag.contains(dir->get_frag()))
11739 return p->second.all_frozen;
11740 }
11741 ceph_abort();
11742 return false;
11743 }
11744
11745 void MDCache::fragment_freeze_inc_num_waiters(CDir *dir)
11746 {
11747 map<dirfrag_t,fragment_info_t>::iterator p;
11748 for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0));
11749 p != fragments.end() && p->first.ino == dir->ino();
11750 ++p) {
11751 if (p->first.frag.contains(dir->get_frag())) {
11752 p->second.num_remote_waiters++;
11753 return;
11754 }
11755 }
11756 ceph_abort();
11757 }
11758
void MDCache::find_stale_fragment_freeze()
{
  // Periodic scan for fragment ops whose freeze has stalled (auth-pin count
  // unchanged past the timeout); cancel them when something else is blocked
  // on us (remote waiters, or a freezing parent).
  dout(10) << "find_stale_fragment_freeze" << dendl;
  // see comment in Migrator::find_stale_export_freeze()
  utime_t now = ceph_clock_now();
  utime_t cutoff = now;
  cutoff -= g_conf()->mds_freeze_tree_timeout;

  for (map<dirfrag_t,fragment_info_t>::iterator p = fragments.begin();
       p != fragments.end(); ) {
    dirfrag_t df = p->first;
    fragment_info_t& info = p->second;
    ++p;  // advance now; we may erase 'df' below
    if (info.all_frozen)
      continue;
    CDir *dir;
    // sum auth pins across frags; -1 means a frag is not yet marked,
    // so the op is still in mark+complete and can't be considered stale
    int total_auth_pins = 0;
    for (const auto& d : info.dirs) {
      dir = d;
      if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) {
	total_auth_pins = -1;
	break;
      }
      if (dir->is_frozen_dir())
	continue;
      total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins();
    }
    if (total_auth_pins < 0)
      continue;
    if (info.last_cum_auth_pins != total_auth_pins) {
      // pin count moved; reset the staleness clock
      info.last_cum_auth_pins = total_auth_pins;
      info.last_cum_auth_pins_change = now;
      continue;
    }
    if (info.last_cum_auth_pins_change >= cutoff)
      continue;
    dir = info.dirs.front();
    if (info.num_remote_waiters > 0 ||
	(!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
      dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl;
      std::vector<CDir*> dirs;
      info.dirs.swap(dirs);
      fragments.erase(df);
      fragment_unmark_unfreeze_dirs(dirs);
    }
  }
}
11806
// Journal-commit callback: once the EFragment OP_PREPARE event is
// durable, continue the fragment operation in _fragment_logged().
class C_MDC_FragmentPrep : public MDCacheLogContext {
  MDRequestRef mdr;
public:
  C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {}
  void finish(int r) override {
    mdcache->_fragment_logged(mdr);
  }
};
11815
// Gather callback: fired when all resulting dirfrags have been
// committed to the backing store; continues in _fragment_stored().
class C_MDC_FragmentStore : public MDCacheContext {
  MDRequestRef mdr;
public:
  C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
  void finish(int r) override {
    mdcache->_fragment_stored(mdr);
  }
};
11824
// Journal-commit callback for the EFragment OP_COMMIT event; continues
// in _fragment_committed(), which purges the now-obsolete old frags.
class C_MDC_FragmentCommit : public MDCacheLogContext {
  dirfrag_t basedirfrag;
  MDRequestRef mdr;
public:
  C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, const MDRequestRef& r) :
    MDCacheLogContext(m), basedirfrag(df), mdr(r) {}
  void finish(int r) override {
    mdcache->_fragment_committed(basedirfrag, mdr);
  }
};
11835
// RADOS-IO callback: fired when the objects backing the pre-fragment
// dirfrags have been removed/truncated. -ENOENT is tolerated because a
// replayed purge may find the objects already gone.
class C_IO_MDC_FragmentPurgeOld : public MDCacheIOContext {
  dirfrag_t basedirfrag;
  int bits;
  MDRequestRef mdr;
public:
  C_IO_MDC_FragmentPurgeOld(MDCache *m, dirfrag_t f, int b,
                            const MDRequestRef& r) :
    MDCacheIOContext(m), basedirfrag(f), bits(b), mdr(r) {}
  void finish(int r) override {
    ceph_assert(r == 0 || r == -ENOENT);
    mdcache->_fragment_old_purged(basedirfrag, bits, mdr);
  }
  void print(ostream& out) const override {
    out << "fragment_purge_old(" << basedirfrag << ")";
  }
};
11852
/**
 * Callback invoked once every dirfrag participating in the fragment
 * operation has been frozen; marks the op all_frozen and dispatches it.
 * The op may have been cancelled in the meantime (stale-freeze scan),
 * in which case the request is simply finished.
 */
void MDCache::fragment_frozen(MDRequestRef& mdr, int r)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  if (it == fragments.end() || it->second.mdr != mdr) {
    dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl;
    request_finish(mdr);
    return;
  }

  ceph_assert(r == 0);
  fragment_info_t& info = it->second;
  dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits
           << " on " << info.dirs.front()->get_inode() << dendl;

  info.all_frozen = true;
  dispatch_fragment_dir(mdr);
}
11871
/**
 * Core of the fragment operation, run once all source dirfrags are
 * frozen: take the scatter locks, journal an EFragment OP_PREPARE,
 * and perform the in-memory refragmentation. On abort (slave error or
 * failed lock acquisition), the dirs are unfrozen and the split/merge
 * is re-queued with the balancer.
 */
void MDCache::dispatch_fragment_dir(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
  // the operation may have been cancelled while we were freezing
  if (it == fragments.end() || it->second.mdr != mdr) {
    dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl;
    request_finish(mdr);
    return;
  }

  fragment_info_t& info = it->second;
  CInode *diri = info.dirs.front()->get_inode();

  dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits
           << " on " << *diri << dendl;

  if (mdr->more()->slave_error)
    mdr->aborted = true;

  if (!mdr->aborted) {
    MutationImpl::LockOpVec lov;
    lov.add_wrlock(&diri->dirfragtreelock);
    // prevent a racing gather on any other scatterlocks too
    lov.lock_scatter_gather(&diri->nestlock);
    lov.lock_scatter_gather(&diri->filelock);
    if (!mds->locker->acquire_locks(mdr, lov, NULL, true)) {
      // acquire_locks may itself set mdr->aborted; otherwise we retry later
      if (!mdr->aborted)
        return;
    }
  }

  if (mdr->aborted) {
    dout(10) << " can't auth_pin " << *diri << ", requeuing dir "
             << info.dirs.front()->dirfrag() << dendl;
    // re-queue so the balancer retries the split/merge later
    if (info.bits > 0)
      mds->balancer->queue_split(info.dirs.front(), false);
    else
      mds->balancer->queue_merge(info.dirs.front());
    fragment_unmark_unfreeze_dirs(info.dirs);
    fragments.erase(it);
    request_finish(mdr);
    return;
  }

  mdr->ls = mds->mdlog->get_current_segment();
  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits);
  mds->mdlog->start_entry(le);

  // record original fnodes so the operation can be rolled back
  for (const auto& dir : info.dirs) {
    dirfrag_rollback rollback;
    rollback.fnode = dir->fnode;
    le->add_orig_frag(dir->get_frag(), &rollback);
  }

  // refragment
  MDSContext::vec waiters;
  adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits,
                       &info.resultfrags, waiters, false);
  if (g_conf()->mds_debug_frag)
    diri->verify_dirfrags();
  mds->queue_waiters(waiters);

  for (const auto& fg : le->orig_frags)
    ceph_assert(!diri->dirfragtree.is_leaf(fg));

  le->metablob.add_dir_context(info.resultfrags.front());
  for (const auto& dir : info.resultfrags) {
    if (diri->is_auth()) {
      le->metablob.add_fragmented_dir(dir, false, false);
    } else {
      // non-auth: mark dirty dirfragtree state so it is journaled
      dir->state_set(CDir::STATE_DIRTYDFT);
      le->metablob.add_fragmented_dir(dir, false, true);
    }
  }

  // dft lock
  if (diri->is_auth()) {
    // journal dirfragtree
    auto &pi = diri->project_inode();
    pi.inode.version = diri->pre_dirty();
    journal_dirty_inode(mdr.get(), &le->metablob, diri);
  } else {
    mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
    mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
    mdr->add_updated_lock(&diri->dirfragtreelock);
  }

  /*
  // filelock
  mds->locker->mark_updated_scatterlock(&diri->filelock);
  mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
  mut->add_updated_lock(&diri->filelock);

  // dirlock
  mds->locker->mark_updated_scatterlock(&diri->nestlock);
  mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
  mut->add_updated_lock(&diri->nestlock);
  */

  add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls);
  mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr),
                                  mdr, __func__);
  mds->mdlog->flush();
}
11976
/**
 * Continuation after the EFragment OP_PREPARE event hit the journal:
 * apply the projected inode / scatterlock state, then commit each
 * resulting dirfrag to the backing store. When all commits complete,
 * C_MDC_FragmentStore fires _fragment_stored().
 */
void MDCache::_fragment_logged(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  auto& info = fragments.at(basedirfrag);
  CInode *diri = info.resultfrags.front()->get_inode();

  dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits
           << " on " << *diri << dendl;
  mdr->mark_event("prepare logged");

  if (diri->is_auth())
    diri->pop_and_dirty_projected_inode(mdr->ls);

  mdr->apply();  // mark scatterlock

  // store resulting frags
  MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr));

  for (const auto& dir : info.resultfrags) {
    dout(10) << " storing result frag " << *dir << dendl;

    // freeze and store them too
    dir->auth_pin(this);
    dir->state_set(CDir::STATE_FRAGMENTING);
    dir->commit(0, gather.new_sub(), true);  // ignore authpinnability
  }

  gather.activate();
}
12006
/**
 * Continuation after the resulting dirfrags are stored: notify
 * replicas, journal EFragment OP_COMMIT, then unfreeze the new frags.
 * Locks are dropped immediately unless we must wait for notify acks
 * from replicas (subtree-root case, see comment below).
 */
void MDCache::_fragment_stored(MDRequestRef& mdr)
{
  dirfrag_t basedirfrag = mdr->more()->fragment_base;
  fragment_info_t &info = fragments.at(basedirfrag);
  CDir *first = info.resultfrags.front();
  CInode *diri = first->get_inode();

  dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits
           << " on " << *diri << dendl;
  mdr->mark_event("new frags stored");

  // tell peers
  mds_rank_t diri_auth = (first->is_subtree_root() && !diri->is_auth()) ?
                          diri->authority().first : CDIR_AUTH_UNKNOWN;
  for (const auto &p : first->get_replicas()) {
    // skip peers that are too early in rejoin to process the notify
    if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
        (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
         rejoin_gather.count(p.first)))
      continue;

    auto notify = make_message<MMDSFragmentNotify>(basedirfrag, info.bits, mdr->reqid.tid);
    if (diri_auth != CDIR_AUTH_UNKNOWN && // subtree root
        diri_auth != p.first) { // not auth mds of diri
      /*
       * In the normal case, mds does not trim dir inode whose child dirfrags
       * are likely being fragmented (see trim_inode()). But when fragmenting
       * subtree roots, following race can happen:
       *
       * - mds.a (auth mds of dirfrag) sends fragment_notify message to
       *   mds.c and drops wrlock on dirfragtreelock.
       * - mds.b (auth mds of dir inode) changes dirfragtreelock state to
       *   SYNC and send lock message mds.c
       * - mds.c receives the lock message and changes dirfragtreelock state
       *   to SYNC
       * - mds.c trim dirfrag and dir inode from its cache
       * - mds.c receives the fragment_notify message
       *
       * So we need to ensure replicas have received the notify, then unlock
       * the dirfragtreelock.
       */
      notify->mark_ack_wanted();
      info.notify_ack_waiting.insert(p.first);
    }

    // freshly replicate new dirs to peers
    for (const auto& dir : info.resultfrags) {
      encode_replica_dir(dir, p.first, notify->basebl);
    }

    mds->send_message_mds(notify, p.first);
  }

  // journal commit
  EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits);
  mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, mdr));


  // unfreeze resulting frags
  for (const auto& dir : info.resultfrags) {
    dout(10) << " result frag " << *dir << dendl;

    // drop the per-dentry fragmenting pins taken when freezing began
    for (auto &p : dir->items) {
      CDentry *dn = p.second;
      ceph_assert(dn->state_test(CDentry::STATE_FRAGMENTING));
      dn->state_clear(CDentry::STATE_FRAGMENTING);
      dn->put(CDentry::PIN_FRAGMENTING);
    }

    // unfreeze
    dir->unfreeze_dir();
  }

  if (info.notify_ack_waiting.empty()) {
    fragment_drop_locks(info);
  } else {
    // keep the dirfragtreelock until replicas ack (see race above)
    mds->locker->drop_locks_for_fragment_unfreeze(mdr.get());
  }
}
12085
/**
 * After EFragment OP_COMMIT is durable, delete the RADOS objects of the
 * pre-fragment dirfrags. Also reached (with a null mdr) when replaying
 * committed-but-unfinished fragments during rollback. When all mutations
 * complete, C_IO_MDC_FragmentPurgeOld fires _fragment_old_purged().
 */
void MDCache::_fragment_committed(dirfrag_t basedirfrag, const MDRequestRef& mdr)
{
  dout(10) << "fragment_committed " << basedirfrag << dendl;
  if (mdr)
    mdr->mark_event("commit logged");

  ufragment &uf = uncommitted_fragments.at(basedirfrag);

  // remove old frags
  C_GatherBuilder gather(
    g_ceph_context,
    new C_OnFinisher(
      new C_IO_MDC_FragmentPurgeOld(this, basedirfrag, uf.bits, mdr),
      mds->finisher));

  SnapContext nullsnapc;
  object_locator_t oloc(mds->mdsmap->get_metadata_pool());
  for (const auto& fg : uf.old_frags) {
    object_t oid = CInode::get_object_name(basedirfrag.ino, fg, "");
    ObjectOperation op;
    if (fg == frag_t()) {
      // backtrace object: keep it but empty it out
      dout(10) << " truncate orphan dirfrag " << oid << dendl;
      op.truncate(0);
      op.omap_clear();
    } else {
      dout(10) << " removing orphan dirfrag " << oid << dendl;
      op.remove();
    }
    mds->objecter->mutate(oid, oloc, op, nullsnapc,
                          ceph::real_clock::now(),
                          0, gather.new_sub());
  }

  ceph_assert(gather.has_subs());
  gather.activate();
}
12123
12124 void MDCache::_fragment_old_purged(dirfrag_t basedirfrag, int bits, const MDRequestRef& mdr)
12125 {
12126 dout(10) << "fragment_old_purged " << basedirfrag << dendl;
12127 if (mdr)
12128 mdr->mark_event("old frags purged");
12129
12130 EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, bits);
12131 mds->mdlog->start_submit_entry(le);
12132
12133 finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH);
12134
12135 if (mds->logger) {
12136 if (bits > 0) {
12137 mds->logger->inc(l_mds_dir_split);
12138 } else {
12139 mds->logger->inc(l_mds_dir_merge);
12140 }
12141 }
12142
12143 if (mdr) {
12144 auto it = fragments.find(basedirfrag);
12145 ceph_assert(it != fragments.end());
12146 it->second.finishing = true;
12147 if (it->second.notify_ack_waiting.empty())
12148 fragment_maybe_finish(it);
12149 else
12150 mdr->mark_event("wating for notify acks");
12151 }
12152 }
12153
/**
 * Drop all locks held by the fragment operation's request and finish it.
 * Called once no replica notify acks are (or remain) outstanding.
 */
void MDCache::fragment_drop_locks(fragment_info_t& info)
{
  mds->locker->drop_locks(info.mdr.get());
  request_finish(info.mdr);
  //info.mdr.reset();
}
12160
12161 void MDCache::fragment_maybe_finish(const fragment_info_iterator& it)
12162 {
12163 if (!it->second.finishing)
12164 return;
12165
12166 // unmark & auth_unpin
12167 for (const auto &dir : it->second.resultfrags) {
12168 dir->state_clear(CDir::STATE_FRAGMENTING);
12169 dir->auth_unpin(this);
12170
12171 // In case the resulting fragments are beyond the split size,
12172 // we might need to split them again right away (they could
12173 // have been taking inserts between unfreezing and getting
12174 // here)
12175 mds->balancer->maybe_fragment(dir, false);
12176 }
12177
12178 fragments.erase(it);
12179 }
12180
12181
/**
 * Handle a replica's acknowledgement of a fragment notify. When the
 * last expected ack arrives, drop the operation's locks and attempt to
 * finish it. Stale/obsolete acks (op gone, tid mismatch) are dropped.
 */
void MDCache::handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &ack)
{
  dout(10) << "handle_fragment_notify_ack " << *ack << " from " << ack->get_source() << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  if (mds->get_state() < MDSMap::STATE_ACTIVE) {
    return;
  }

  auto it = fragments.find(ack->get_base_dirfrag());
  if (it == fragments.end() ||
      it->second.get_tid() != ack->get_tid()) {
    dout(10) << "handle_fragment_notify_ack obsolete message, dropping" << dendl;
    return;
  }

  // erase() returns the number removed; only act when this was the last waiter
  if (it->second.notify_ack_waiting.erase(from) &&
      it->second.notify_ack_waiting.empty()) {
    fragment_drop_locks(it->second);
    fragment_maybe_finish(it);
  }
}
12204
/**
 * Replica-side handler for a fragment notify from the dirfrag's auth
 * mds: mirror the refragmentation locally, decode the freshly
 * replicated dirfrags, wake waiters, and (if requested) ack back so
 * the auth mds can release its dirfragtreelock.
 */
void MDCache::handle_fragment_notify(const cref_t<MMDSFragmentNotify> &notify)
{
  dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl;
  mds_rank_t from = mds_rank_t(notify->get_source().num());

  if (mds->get_state() < MDSMap::STATE_REJOIN) {
    return;
  }

  CInode *diri = get_inode(notify->get_ino());
  if (diri) {
    frag_t base = notify->get_basefrag();
    int bits = notify->get_bits();

    /*
    if ((bits < 0 && diri->dirfragtree.is_leaf(base)) ||
	(bits > 0 && !diri->dirfragtree.is_leaf(base))) {
      dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits
	       << ", must have found out during resolve/rejoin?  ignoring. " << *diri << dendl;
      return;
    }
    */

    // refragment
    MDSContext::vec waiters;
    std::vector<CDir*> resultfrags;
    adjust_dir_fragments(diri, base, bits, &resultfrags, waiters, false);
    if (g_conf()->mds_debug_frag)
      diri->verify_dirfrags();

    for (const auto& dir : resultfrags) {
      diri->take_dir_waiting(dir->get_frag(), waiters);
    }

    // add new replica dirs values
    auto p = notify->basebl.cbegin();
    while (!p.end()) {
      CDir *tmp_dir = nullptr;
      decode_replica_dir(tmp_dir, p, diri, from, waiters);
    }

    mds->queue_waiters(waiters);
  } else {
    // the notify protocol guarantees the inode is still in cache here
    ceph_abort();
  }

  if (notify->is_ack_wanted()) {
    auto ack = make_message<MMDSFragmentNotifyAck>(notify->get_base_dirfrag(),
                                                   notify->get_bits(), notify->get_tid());
    mds->send_message_mds(ack, from);
  }
}
12257
12258 void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frags,
12259 LogSegment *ls, bufferlist *rollback)
12260 {
12261 dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl;
12262 ceph_assert(!uncommitted_fragments.count(basedirfrag));
12263 ufragment& uf = uncommitted_fragments[basedirfrag];
12264 uf.old_frags = old_frags;
12265 uf.bits = bits;
12266 uf.ls = ls;
12267 ls->uncommitted_fragments.insert(basedirfrag);
12268 if (rollback)
12269 uf.rollback.swap(*rollback);
12270 }
12271
12272 void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op)
12273 {
12274 dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag
12275 << " op " << EFragment::op_name(op) << dendl;
12276 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
12277 if (it != uncommitted_fragments.end()) {
12278 ufragment& uf = it->second;
12279 if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) {
12280 uf.committed = true;
12281 } else {
12282 uf.ls->uncommitted_fragments.erase(basedirfrag);
12283 mds->queue_waiters(uf.waiters);
12284 uncommitted_fragments.erase(it);
12285 }
12286 }
12287 }
12288
12289 void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags)
12290 {
12291 dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag
12292 << " old_frags (" << old_frags << ")" << dendl;
12293 map<dirfrag_t, ufragment>::iterator it = uncommitted_fragments.find(basedirfrag);
12294 if (it != uncommitted_fragments.end()) {
12295 ufragment& uf = it->second;
12296 if (!uf.old_frags.empty()) {
12297 uf.old_frags = std::move(old_frags);
12298 uf.committed = true;
12299 } else {
12300 uf.ls->uncommitted_fragments.erase(basedirfrag);
12301 uncommitted_fragments.erase(it);
12302 }
12303 }
12304 }
12305
12306 void MDCache::wait_for_uncommitted_fragments(MDSGather *gather)
12307 {
12308 for (auto& p : uncommitted_fragments)
12309 p.second.waiters.push_back(gather->new_sub());
12310 }
12311
/**
 * On recovery, roll back every fragment operation that was prepared but
 * never committed: restore the original dirfrags (from the serialized
 * rollback state when available), journal EFragment OP_ROLLBACK, and
 * purge the now-orphaned post-operation dirfrag objects. Records that
 * were already committed only need the old-object purge.
 */
void MDCache::rollback_uncommitted_fragments()
{
  dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl;
  for (map<dirfrag_t, ufragment>::iterator p = uncommitted_fragments.begin();
       p != uncommitted_fragments.end();
       ++p) {
    ufragment &uf = p->second;
    CInode *diri = get_inode(p->first.ino);
    ceph_assert(diri);

    if (uf.committed) {
      // operation made it to OP_COMMIT: only the old objects need purging
      _fragment_committed(p->first, MDRequestRef());
      continue;
    }

    dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl;

    LogSegment *ls = mds->mdlog->get_current_segment();
    EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits);
    mds->mdlog->start_entry(le);
    bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF);

    // the post-operation leaves: these become the orphans to purge
    frag_vec_t old_frags;
    diri->dirfragtree.get_leaves_under(p->first.frag, old_frags);

    std::vector<CDir*> resultfrags;
    if (uf.old_frags.empty()) {
      // created by old format EFragment
      MDSContext::vec waiters;
      adjust_dir_fragments(diri, p->first.frag, -uf.bits, &resultfrags, waiters, true);
    } else {
      // restore each original frag from the serialized rollback state
      auto bp = uf.rollback.cbegin();
      for (const auto& fg : uf.old_frags) {
        CDir *dir = force_dir_fragment(diri, fg);
        resultfrags.push_back(dir);

        dirfrag_rollback rollback;
        decode(rollback, bp);

        dir->set_version(rollback.fnode.version);
        dir->fnode = rollback.fnode;

        dir->_mark_dirty(ls);

        // re-dirty scatterlock state that no longer matches accounted values
        if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) {
          dout(10) << "    dirty nestinfo on " << *dir << dendl;
          mds->locker->mark_updated_scatterlock(&dir->inode->nestlock);
          ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest);
        }
        if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) {
          dout(10) << "    dirty fragstat on " << *dir << dendl;
          mds->locker->mark_updated_scatterlock(&dir->inode->filelock);
          ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir);
        }

        le->add_orig_frag(dir->get_frag());
        le->metablob.add_dir_context(dir);
        if (diri_auth) {
          le->metablob.add_fragmented_dir(dir, true, false);
        } else {
          dout(10) << "    dirty dirfragtree on " << *dir << dendl;
          dir->state_set(CDir::STATE_DIRTYDFT);
          le->metablob.add_fragmented_dir(dir, true, true);
        }
      }
    }

    if (diri_auth) {
      auto &pi = diri->project_inode();
      pi.inode.version = diri->pre_dirty();
      diri->pop_and_dirty_projected_inode(ls); // hacky
      le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true);
    } else {
      mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock);
      ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree);
    }

    if (g_conf()->mds_debug_frag)
      diri->verify_dirfrags();

    for (const auto& leaf : old_frags) {
      ceph_assert(!diri->dirfragtree.is_leaf(leaf));
    }

    mds->mdlog->submit_entry(le);

    // record the post-operation frags so _fragment_committed purges them
    uf.old_frags.swap(old_frags);
    _fragment_committed(p->first, MDRequestRef());
  }
}
12402
12403 void MDCache::force_readonly()
12404 {
12405 if (is_readonly())
12406 return;
12407
12408 dout(1) << "force file system read-only" << dendl;
12409 mds->clog->warn() << "force file system read-only";
12410
12411 set_readonly();
12412
12413 mds->server->force_clients_readonly();
12414
12415 // revoke write caps
12416 int count = 0;
12417 for (auto &p : inode_map) {
12418 CInode *in = p.second;
12419 if (in->is_head())
12420 mds->locker->eval(in, CEPH_CAP_LOCKS);
12421 if (!(++count % 1000))
12422 mds->heartbeat_reset();
12423 }
12424
12425 mds->mdlog->flush();
12426 }
12427
12428
12429 // ==============================================================
12430 // debug crap
12431
/**
 * Debug: log the subtree map as an indented tree. Two BFS passes over
 * the base dirfrags: the first computes max depth (and records which
 * subtree roots are reachable), the second prints. Skipped entirely if
 * the log level wouldn't emit anything, or (unless force_print) if the
 * map is too large/deep for a non-deep-debug log level. Finally asserts
 * that no subtree map entry was unreachable from a base inode.
 */
void MDCache::show_subtrees(int dbl, bool force_print)
{
  if (g_conf()->mds_thrash_exports)
    dbl += 15;

  //dout(10) << "show_subtrees" << dendl;

  if (!g_conf()->subsys.should_gather(ceph_subsys_mds, dbl))
    return;  // i won't print anything.

  if (subtrees.empty()) {
    dout(ceph::dout::need_dynamic(dbl)) << "show_subtrees - no subtrees"
                                        << dendl;
    return;
  }

  if (!force_print && subtrees.size() > SUBTREES_COUNT_THRESHOLD &&
      !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
    dout(ceph::dout::need_dynamic(dbl)) << "number of subtrees = " << subtrees.size() << "; not "
                "printing subtrees" << dendl;
    return;
  }

  // root frags
  std::vector<CDir*> basefrags;
  for (set<CInode*>::iterator p = base_inodes.begin();
       p != base_inodes.end();
       ++p)
    (*p)->get_dirfrags(basefrags);
  //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl;
  dout(15) << "show_subtrees" << dendl;

  // queue stuff
  list<pair<CDir*,int> > q;
  string indent;
  set<CDir*> seen;

  // calc max depth
  for (const auto& dir : basefrags) {
    q.emplace_back(dir, 0);
  }

  set<CDir*> subtrees_seen;

  unsigned int depth = 0;
  while (!q.empty()) {
    CDir *dir = q.front().first;
    unsigned int d = q.front().second;
    q.pop_front();

    // only subtree roots are of interest here
    if (subtrees.count(dir) == 0) continue;

    subtrees_seen.insert(dir);

    if (d > depth) depth = d;

    // sanity check
    //dout(25) << "saw depth " << d << " " << *dir << dendl;
    if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl;
    ceph_assert(seen.count(dir) == 0);
    seen.insert(dir);

    // nested items?
    if (!subtrees[dir].empty()) {
      for (set<CDir*>::iterator p = subtrees[dir].begin();
           p != subtrees[dir].end();
           ++p) {
        //dout(25) << " saw sub " << **p << dendl;
        q.push_front(pair<CDir*,int>(*p, d+1));
      }
    }
  }

  if (!force_print && depth > SUBTREES_DEPTH_THRESHOLD &&
      !g_conf()->subsys.should_gather<ceph_subsys_mds, 25>()) {
    dout(ceph::dout::need_dynamic(dbl)) << "max depth among subtrees = " << depth << "; not printing "
                "subtrees" << dendl;
    return;
  }

  // print tree
  for (const auto& dir : basefrags) {
    q.emplace_back(dir, 0);
  }

  while (!q.empty()) {
    CDir *dir = q.front().first;
    int d = q.front().second;
    q.pop_front();

    if (subtrees.count(dir) == 0) continue;

    // adjust indenter
    while ((unsigned)d < indent.size())
      indent.resize(d);

    // pad
    string pad = "______________________________________";
    pad.resize(depth*2+1-indent.size());
    if (!subtrees[dir].empty())
      pad[0] = '.'; // parent


    string auth;
    if (dir->is_auth())
      auth = "auth ";
    else
      auth = " rep ";

    char s[10];
    if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN)
      snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first));
    else
      snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second));

    // print
    dout(ceph::dout::need_dynamic(dbl)) << indent << "|_" << pad << s
                                        << " " << auth << *dir << dendl;

    // sanity-check well-known dirfrags against the cached singletons
    if (dir->ino() == MDS_INO_ROOT)
      ceph_assert(dir->inode == root);
    if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
      ceph_assert(dir->inode == myin);
    if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid()))
      ceph_assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode);

    // nested items?
    if (!subtrees[dir].empty()) {
      // more at my level?
      if (!q.empty() && q.front().second == d)
        indent += "| ";
      else
        indent += " ";

      for (set<CDir*>::iterator p = subtrees[dir].begin();
           p != subtrees[dir].end();
           ++p)
        q.push_front(pair<CDir*,int>(*p, d+2));
    }
  }

  // verify there isn't stray crap in subtree map
  int lost = 0;
  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
       p != subtrees.end();
       ++p) {
    if (subtrees_seen.count(p->first)) continue;
    dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl;
    lost++;
  }
  ceph_assert(lost == 0);
}
12584
12585 void MDCache::show_cache()
12586 {
12587 dout(7) << "show_cache" << dendl;
12588
12589 auto show_func = [this](CInode *in) {
12590 // unlinked?
12591 if (!in->parent)
12592 dout(7) << " unlinked " << *in << dendl;
12593
12594 // dirfrags?
12595 auto&& dfs = in->get_dirfrags();
12596 for (const auto& dir : dfs) {
12597 dout(7) << " dirfrag " << *dir << dendl;
12598
12599 for (auto &p : dir->items) {
12600 CDentry *dn = p.second;
12601 dout(7) << " dentry " << *dn << dendl;
12602 CDentry::linkage_t *dnl = dn->get_linkage();
12603 if (dnl->is_primary() && dnl->get_inode())
12604 dout(7) << " inode " << *dnl->get_inode() << dendl;
12605 }
12606 }
12607 };
12608
12609 for (auto &p : inode_map)
12610 show_func(p.second);
12611 for (auto &p : snap_inode_map)
12612 show_func(p.second);
12613 }
12614
12615 void MDCache::cache_status(Formatter *f)
12616 {
12617 f->open_object_section("cache");
12618
12619 f->open_object_section("pool");
12620 mempool::get_pool(mempool::mds_co::id).dump(f);
12621 f->close_section();
12622
12623 f->close_section();
12624 }
12625
12626 void MDCache::dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f)
12627 {
12628 ceph_assert(in);
12629 if ((max_depth >= 0) && (cur_depth > max_depth)) {
12630 return;
12631 }
12632 auto&& ls = in->get_dirfrags();
12633 for (const auto &subdir : ls) {
12634 for (const auto &p : subdir->items) {
12635 CDentry *dn = p.second;
12636 CInode *in = dn->get_linkage()->get_inode();
12637 if (in) {
12638 dump_tree(in, cur_depth + 1, max_depth, f);
12639 }
12640 }
12641 }
12642 f->open_object_section("inode");
12643 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12644 f->close_section();
12645 }
12646
12647 int MDCache::dump_cache(std::string_view file_name)
12648 {
12649 return dump_cache(file_name, NULL);
12650 }
12651
12652 int MDCache::dump_cache(Formatter *f)
12653 {
12654 return dump_cache(std::string_view(""), f);
12655 }
12656
12657 /**
12658 * Dump the metadata cache, either to a Formatter, if
12659 * provided, else to a plain text file.
12660 */
12661 int MDCache::dump_cache(std::string_view fn, Formatter *f)
12662 {
12663 int r = 0;
12664
12665 // dumping large caches may cause mds to hang or worse get killed.
12666 // so, disallow the dump if the cache size exceeds the configured
12667 // threshold, which is 1G for formatter and unlimited for file (note
12668 // that this can be jacked up by the admin... and is nothing but foot
12669 // shooting, but the option itself is for devs and hence dangerous to
12670 // tune). TODO: remove this when fixed.
12671 uint64_t threshold = f ?
12672 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_formatter") :
12673 g_conf().get_val<Option::size_t>("mds_dump_cache_threshold_file");
12674
12675 if (threshold && cache_size() > threshold) {
12676 if (f) {
12677 std::stringstream ss;
12678 ss << "cache usage exceeds dump threshold";
12679 f->open_object_section("result");
12680 f->dump_string("error", ss.str());
12681 f->close_section();
12682 } else {
12683 derr << "cache usage exceeds dump threshold" << dendl;
12684 r = -EINVAL;
12685 }
12686 return r;
12687 }
12688
12689 r = 0;
12690 int fd = -1;
12691
12692 if (f) {
12693 f->open_array_section("inodes");
12694 } else {
12695 char path[PATH_MAX] = "";
12696 if (fn.length()) {
12697 snprintf(path, sizeof path, "%s", fn.data());
12698 } else {
12699 snprintf(path, sizeof path, "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid()));
12700 }
12701
12702 dout(1) << "dump_cache to " << path << dendl;
12703
12704 fd = ::open(path, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC, 0600);
12705 if (fd < 0) {
12706 derr << "failed to open " << path << ": " << cpp_strerror(errno) << dendl;
12707 return errno;
12708 }
12709 }
12710
12711 auto dump_func = [fd, f](CInode *in) {
12712 int r;
12713 if (f) {
12714 f->open_object_section("inode");
12715 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_DIRFRAGS);
12716 f->close_section();
12717 return 1;
12718 }
12719 ostringstream ss;
12720 ss << *in << std::endl;
12721 std::string s = ss.str();
12722 r = safe_write(fd, s.c_str(), s.length());
12723 if (r < 0)
12724 return r;
12725 auto&& dfs = in->get_dirfrags();
12726 for (auto &dir : dfs) {
12727 ostringstream tt;
12728 tt << " " << *dir << std::endl;
12729 std::string t = tt.str();
12730 r = safe_write(fd, t.c_str(), t.length());
12731 if (r < 0)
12732 return r;
12733 for (auto &p : dir->items) {
12734 CDentry *dn = p.second;
12735 ostringstream uu;
12736 uu << " " << *dn << std::endl;
12737 std::string u = uu.str();
12738 r = safe_write(fd, u.c_str(), u.length());
12739 if (r < 0)
12740 return r;
12741 }
12742 dir->check_rstats();
12743 }
12744 return 1;
12745 };
12746
12747 for (auto &p : inode_map) {
12748 r = dump_func(p.second);
12749 if (r < 0)
12750 goto out;
12751 }
12752 for (auto &p : snap_inode_map) {
12753 r = dump_func(p.second);
12754 if (r < 0)
12755 goto out;
12756 }
12757 r = 0;
12758
12759 out:
12760 if (f) {
12761 f->close_section(); // inodes
12762 } else {
12763 ::close(fd);
12764 }
12765 return r;
12766 }
12767
12768
12769
// Retry context: holds a request so it can be re-dispatched through the
// cache once whatever it was waiting on completes.
C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r)
  : MDSInternalContext(c->mds), cache(c), mdr(r)
{}
12773
12774 void C_MDS_RetryRequest::finish(int r)
12775 {
12776 mdr->retry++;
12777 cache->dispatch_request(mdr);
12778 }
12779
12780
12781 class C_MDS_EnqueueScrub : public Context
12782 {
12783 std::string tag;
12784 Formatter *formatter;
12785 Context *on_finish;
12786 public:
12787 ScrubHeaderRef header;
12788 C_MDS_EnqueueScrub(std::string_view tag, Formatter *f, Context *fin) :
12789 tag(tag), formatter(f), on_finish(fin), header(nullptr) {}
12790
12791 Context *take_finisher() {
12792 Context *fin = on_finish;
12793 on_finish = NULL;
12794 return fin;
12795 }
12796
12797 void finish(int r) override {
12798 if (r == 0) {
12799 // since recursive scrub is asynchronous, dump minimal output
12800 // to not upset cli tools.
12801 if (header && header->get_recursive()) {
12802 formatter->open_object_section("results");
12803 formatter->dump_int("return_code", 0);
12804 formatter->dump_string("scrub_tag", tag);
12805 formatter->dump_string("mode", "asynchronous");
12806 formatter->close_section(); // results
12807 }
12808 } else { // we failed the lookup or something; dump ourselves
12809 formatter->open_object_section("results");
12810 formatter->dump_int("return_code", r);
12811 formatter->close_section(); // results
12812 r = 0; // already dumped in formatter
12813 }
12814 if (on_finish)
12815 on_finish->complete(r);
12816 }
12817 };
12818
12819 void MDCache::enqueue_scrub(
12820 std::string_view path,
12821 std::string_view tag,
12822 bool force, bool recursive, bool repair,
12823 Formatter *f, Context *fin)
12824 {
12825 dout(10) << __func__ << " " << path << dendl;
12826 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB);
12827 if (path == "~mdsdir") {
12828 filepath fp(MDS_INO_MDSDIR(mds->get_nodeid()));
12829 mdr->set_filepath(fp);
12830 } else {
12831 filepath fp(path);
12832 mdr->set_filepath(path);
12833 }
12834
12835 bool is_internal = false;
12836 std::string tag_str(tag);
12837 if (tag_str.empty()) {
12838 uuid_d uuid_gen;
12839 uuid_gen.generate_random();
12840 tag_str = uuid_gen.to_string();
12841 is_internal = true;
12842 }
12843
12844 C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(tag_str, f, fin);
12845 cs->header = std::make_shared<ScrubHeader>(
12846 tag_str, is_internal, force, recursive, repair, f);
12847
12848 mdr->internal_op_finish = cs;
12849 enqueue_scrub_work(mdr);
12850 }
12851
// Second half of enqueue_scrub(): resolve the path, set up the scrub
// header/finishers, and push the inode onto the scrub stack.  Replies to
// the internal request (mdr) itself on all paths.
void MDCache::enqueue_scrub_work(MDRequestRef& mdr)
{
  // Resolve the path and rdlock/pin the target inode.  On failure the
  // helper responds to the request itself, so a null return means we are
  // done here (possibly pending a retry).
  CInode *in = mds->server->rdlock_path_pin_ref(mdr, true);
  if (NULL == in)
    return;

  // TODO: Remove this restriction
  ceph_assert(in->is_auth());

  C_MDS_EnqueueScrub *cs = static_cast<C_MDS_EnqueueScrub*>(mdr->internal_op_finish);
  ScrubHeaderRef header = cs->header;

  // Cannot scrub same dentry twice at same time
  if (in->scrub_is_in_progress()) {
    mds->server->respond_to_request(mdr, -EBUSY);
    return;
  } else {
    in->scrub_info();  // instantiate scrub state on the inode
  }

  header->set_origin(in);

  // Pick the context to fire once the whole scrub completes.  For a
  // recursive scrub the CLI reply goes out immediately (asynchronous
  // mode), so here we instead pin the origin for the duration and run
  // recursive_scrub_finish() at the end.  For a non-recursive scrub we
  // take the caller's finisher so C_MDS_EnqueueScrub won't fire it too.
  Context *fin;
  if (header->get_recursive()) {
    header->get_origin()->get(CInode::PIN_SCRUBQUEUE);
    fin = new MDSInternalContextWrapper(mds,
	    new LambdaContext([this, header](int r) {
	      recursive_scrub_finish(header);
	      header->get_origin()->put(CInode::PIN_SCRUBQUEUE);
	    })
	  );
  } else {
    fin = cs->take_finisher();
  }

  // If the scrub did some repair, then flush the journal at the end of
  // the scrub. Otherwise in the case of e.g. rewriting a backtrace
  // the on disk state will still look damaged.
  auto scrub_finish = new LambdaContext([this, header, fin](int r){
    if (!header->get_repaired()) {
      // Nothing was repaired; just complete the finisher directly.
      if (fin)
        fin->complete(r);
      return;
    }

    // After the journal is safe, expire segments so the repaired state
    // actually reaches the backing store before we report completion.
    auto flush_finish = new LambdaContext([this, fin](int r){
      dout(4) << "Expiring log segments because scrub did some repairs" << dendl;
      mds->mdlog->trim_all();

      if (fin) {
	// Fire the finisher only once every expiring segment has expired.
	MDSGatherBuilder gather(g_ceph_context);
	auto& expiring_segments = mds->mdlog->get_expiring_segments();
	for (auto logseg : expiring_segments)
	  logseg->wait_for_expiry(gather.new_sub());
	ceph_assert(gather.has_subs());
	gather.set_finisher(new MDSInternalContextWrapper(mds, fin));
	gather.activate();
      }
    });

    dout(4) << "Flushing journal because scrub did some repairs" << dendl;
    mds->mdlog->start_new_segment();
    mds->mdlog->flush();
    mds->mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, flush_finish));
  });

  // Recursive scrubs go to the bottom of the stack (breadth of work),
  // single-inode scrubs jump the queue.
  if (!header->get_recursive()) {
    mds->scrubstack->enqueue_inode_top(in, header,
				       new MDSInternalContextWrapper(mds, scrub_finish));
  } else {
    mds->scrubstack->enqueue_inode_bottom(in, header,
				          new MDSInternalContextWrapper(mds, scrub_finish));
  }

  // The enqueue itself succeeded; the scrub proceeds asynchronously.
  mds->server->respond_to_request(mdr, 0);
  return;
}
12929
12930 void MDCache::recursive_scrub_finish(const ScrubHeaderRef& header)
12931 {
12932 if (header->get_origin()->is_base() &&
12933 header->get_force() && header->get_repair()) {
12934 // notify snapserver that base directory is recursively scrubbed.
12935 // After both root and mdsdir are recursively scrubbed, snapserver
12936 // knows that all old format snaprealms are converted to the new
12937 // format.
12938 if (mds->mdsmap->get_num_in_mds() == 1 &&
12939 mds->mdsmap->get_num_failed_mds() == 0 &&
12940 mds->mdsmap->get_tableserver() == mds->get_nodeid()) {
12941 mds->mark_base_recursively_scrubbed(header->get_origin()->ino());
12942 }
12943 }
12944 }
12945
12946 struct C_MDC_RespondInternalRequest : public MDCacheLogContext {
12947 MDRequestRef mdr;
12948 C_MDC_RespondInternalRequest(MDCache *c, MDRequestRef& m) :
12949 MDCacheLogContext(c), mdr(m) {}
12950 void finish(int r) override {
12951 mdr->apply();
12952 get_mds()->server->respond_to_request(mdr, r);
12953 }
12954 };
12955
12956 void MDCache::repair_dirfrag_stats(CDir *dir)
12957 {
12958 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS);
12959 mdr->pin(dir);
12960 mdr->internal_op_private = dir;
12961 mdr->internal_op_finish = new C_MDSInternalNoop;
12962 repair_dirfrag_stats_work(mdr);
12963 }
12964
// Worker for repair_dirfrag_stats(): recompute the dirfrag's fragstat and
// rstat from its dentries and journal corrected values if they disagree.
// Retries itself (via C_MDS_RetryRequest) when it must wait for an
// unfreeze, lock acquisition, or dirfrag fetch.
void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr)
{
  CDir *dir = static_cast<CDir*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *dir << dendl;

  // The dirfrag may have been exported since the request was queued.
  if (!dir->is_auth()) {
    mds->server->respond_to_request(mdr, -ESTALE);
    return;
  }

  if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) {
    // Frozen/freezing: wait for unfreeze, and drop our locks/pins so we
    // don't deadlock the freeze in the meantime.
    dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr));

    mds->locker->drop_locks(mdr.get());
    mdr->drop_local_auth_pins();
    if (mdr->is_any_remote_auth_pin())
      mds->locker->notify_freeze_waiter(dir);
    return;
  }

  mdr->auth_pin(dir);

  // Hold the fragtree steady and take write locks on the scatterlocks we
  // may dirty below.
  MutationImpl::LockOpVec lov;
  CInode *diri = dir->inode;
  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_wrlock(&diri->nestlock);
  lov.add_wrlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  // Need every dentry in memory to recompute the stats.
  if (!dir->is_complete()) {
    dir->fetch(new C_MDS_RetryRequest(this, mdr));
    return;
  }

  // Recompute fragstat (file/subdir counts) and rstat (recursive stats)
  // from the head-version dentries.
  frag_info_t frag_info;
  nest_info_t nest_info;
  for (auto it = dir->begin(); it != dir->end(); ++it) {
    CDentry *dn = it->second;
    if (dn->last != CEPH_NOSNAP)
      continue;  // only head dentries contribute
    CDentry::linkage_t *dnl = dn->get_projected_linkage();
    if (dnl->is_primary()) {
      CInode *in = dnl->get_inode();
      nest_info.add(in->get_projected_inode()->accounted_rstat);
      if (in->is_dir())
	frag_info.nsubdirs++;
      else
	frag_info.nfiles++;
    } else if (dnl->is_remote())
      frag_info.nfiles++;  // remote links count as files, no rstat of their own
  }

  fnode_t *pf = dir->get_projected_fnode();
  bool good_fragstat = frag_info.same_sums(pf->fragstat);
  bool good_rstat = nest_info.same_sums(pf->rstat);
  if (good_fragstat && good_rstat) {
    dout(10) << __func__ << " no corruption found" << dendl;
    mds->server->respond_to_request(mdr, 0);
    return;
  }

  // Project a new fnode version and journal the corrected stats.
  pf = dir->project_fnode();
  pf->version = dir->pre_dirty();
  mdr->add_projected_fnode(dir);

  mdr->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
  mds->mdlog->start_entry(le);

  if (!good_fragstat) {
    // Keep mtime/change_attr monotonic: never roll them backwards.
    if (pf->fragstat.mtime > frag_info.mtime)
      frag_info.mtime = pf->fragstat.mtime;
    if (pf->fragstat.change_attr > frag_info.change_attr)
      frag_info.change_attr = pf->fragstat.change_attr;
    pf->fragstat = frag_info;
    mds->locker->mark_updated_scatterlock(&diri->filelock);
    mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
    mdr->add_updated_lock(&diri->filelock);
  }

  if (!good_rstat) {
    // Same monotonicity rule for rctime.
    if (pf->rstat.rctime > nest_info.rctime)
      nest_info.rctime = pf->rstat.rctime;
    pf->rstat = nest_info;
    mds->locker->mark_updated_scatterlock(&diri->nestlock);
    mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
    mdr->add_updated_lock(&diri->nestlock);
  }

  le->metablob.add_dir_context(dir);
  le->metablob.add_dir(dir, true);

  // Respond (and apply) once the journal entry is durable.
  mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
}
13060
13061 void MDCache::repair_inode_stats(CInode *diri)
13062 {
13063 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
13064 mdr->pin(diri);
13065 mdr->internal_op_private = diri;
13066 mdr->internal_op_finish = new C_MDSInternalNoop;
13067 repair_inode_stats_work(mdr);
13068 }
13069
// Worker for repair_inode_stats(): two-phase repair of a directory
// inode's dirstat/rstat.  Phase one (first pass, mdr->ls unset) fetches
// all dirfrags and dirties filelock/nestlock; phase two (after the goto,
// or on retry once mdr->ls is set) takes rdlocks, forcing the
// scatter-gather that folds corrected dirfrag stats back into the inode.
void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
{
  CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *diri << dendl;

  if (!diri->is_auth()) {
    mds->server->respond_to_request(mdr, -ESTALE);
    return;
  }
  if (!diri->is_dir()) {
    mds->server->respond_to_request(mdr, -ENOTDIR);
    return;
  }

  MutationImpl::LockOpVec lov;

  if (mdr->ls) // already marked filelock/nestlock dirty ?
    goto do_rdlocks;

  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_wrlock(&diri->nestlock);
  lov.add_wrlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  // Fetch all dirfrags and mark filelock/nestlock dirty. This will tirgger
  // the scatter-gather process, which will fix any fragstat/rstat errors.
  {
    frag_vec_t leaves;
    diri->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = diri->get_dirfrag(leaf);
      if (!dir) {
	// We are auth-pinned, so we may safely open missing dirfrags.
	ceph_assert(mdr->is_auth_pinned(diri));
	dir = diri->get_or_open_dirfrag(this, leaf);
      }
      if (dir->get_version() == 0) {
	// Unfetched dirfrag: load it and retry the whole operation.
	ceph_assert(dir->is_auth());
	dir->fetch(new C_MDS_RetryRequest(this, mdr));
	return;
      }
    }
  }

  diri->state_set(CInode::STATE_REPAIRSTATS);
  // Setting mdr->ls also marks phase one done for any later retry.
  mdr->ls = mds->mdlog->get_current_segment();
  mds->locker->mark_updated_scatterlock(&diri->filelock);
  mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
  mds->locker->mark_updated_scatterlock(&diri->nestlock);
  mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);

  // Drop wrlocks so the rdlock acquisition below forces a scatter-gather.
  mds->locker->drop_locks(mdr.get());

do_rdlocks:
  // force the scatter-gather process
  lov.clear();
  lov.add_rdlock(&diri->dirfragtreelock);
  lov.add_rdlock(&diri->nestlock);
  lov.add_rdlock(&diri->filelock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  diri->state_clear(CInode::STATE_REPAIRSTATS);

  // Re-sum the (now gathered) dirfrag stats to verify the repair took.
  frag_info_t dir_info;
  nest_info_t nest_info;
  nest_info.rsubdirs = 1; // it gets one to account for self
  if (const sr_t *srnode = diri->get_projected_srnode(); srnode)
    nest_info.rsnaps = srnode->snaps.size();

  {
    frag_vec_t leaves;
    diri->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = diri->get_dirfrag(leaf);
      ceph_assert(dir);
      ceph_assert(dir->get_version() > 0);
      dir_info.add(dir->fnode.accounted_fragstat);
      nest_info.add(dir->fnode.accounted_rstat);
    }
  }

  if (!dir_info.same_sums(diri->inode.dirstat) ||
      !nest_info.same_sums(diri->inode.rstat)) {
    // Best-effort: log the residual mismatch but still report success,
    // since the scatter-gather pass has already been performed.
    dout(10) << __func__ << " failed to fix fragstat/rstat on "
	     << *diri << dendl;
  }

  mds->server->respond_to_request(mdr, 0);
}
13160
13161 void MDCache::upgrade_inode_snaprealm(CInode *in)
13162 {
13163 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_UPGRADE_SNAPREALM);
13164 mdr->pin(in);
13165 mdr->internal_op_private = in;
13166 mdr->internal_op_finish = new C_MDSInternalNoop;
13167 upgrade_inode_snaprealm_work(mdr);
13168 }
13169
// Worker for upgrade_inode_snaprealm(): xlock the snaplock, project the
// inode (which upgrades the snaprealm format as a side effect of
// project_snaprealm()), and journal the new inode state.
void MDCache::upgrade_inode_snaprealm_work(MDRequestRef& mdr)
{
  CInode *in = static_cast<CInode*>(mdr->internal_op_private);
  dout(10) << __func__ << " " << *in << dendl;

  // Authority may have moved since the request was queued.
  if (!in->is_auth()) {
    mds->server->respond_to_request(mdr, -ESTALE);
    return;
  }

  MutationImpl::LockOpVec lov;
  lov.add_xlock(&in->snaplock);
  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  // project_snaprealm() upgrades snaprealm format
  auto &pi = in->project_inode(false, true);
  mdr->add_projected_inode(in);
  pi.inode.version = in->pre_dirty();

  mdr->ls = mds->mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mds->mdlog, "upgrade_snaprealm");
  mds->mdlog->start_entry(le);

  // Base inodes (root, mdsdir) are journaled directly; anything else is
  // journaled via its primary dentry.
  if (in->is_base()) {
    le->metablob.add_root(true, in);
  } else {
    CDentry *pdn = in->get_projected_parent_dn();
    le->metablob.add_dir_context(pdn->get_dir());
    le->metablob.add_primary_dentry(pdn, in, true);
  }

  // Respond (and apply) once the journal entry is durable.
  mds->mdlog->submit_entry(le, new C_MDC_RespondInternalRequest(this, mdr));
}
13204
13205 void MDCache::flush_dentry(std::string_view path, Context *fin)
13206 {
13207 if (is_readonly()) {
13208 dout(10) << __func__ << ": read-only FS" << dendl;
13209 fin->complete(-EROFS);
13210 return;
13211 }
13212 dout(10) << "flush_dentry " << path << dendl;
13213 MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
13214 filepath fp(path);
13215 mdr->set_filepath(fp);
13216 mdr->internal_op_finish = fin;
13217 flush_dentry_work(mdr);
13218 }
13219
13220 class C_FinishIOMDR : public MDSContext {
13221 protected:
13222 MDSRank *mds;
13223 MDRequestRef mdr;
13224 MDSRank *get_mds() override { return mds; }
13225 public:
13226 C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
13227 void finish(int r) override { mds->server->respond_to_request(mdr, r); }
13228 };
13229
13230 void MDCache::flush_dentry_work(MDRequestRef& mdr)
13231 {
13232 MutationImpl::LockOpVec lov;
13233 CInode *in = mds->server->rdlock_path_pin_ref(mdr, true);
13234 if (!in)
13235 return;
13236
13237 ceph_assert(in->is_auth());
13238 in->flush(new C_FinishIOMDR(mds, mdr));
13239 }
13240
13241
13242 /**
13243 * Initialize performance counters with global perfcounter
13244 * collection.
13245 */
// Build the "mds_cache" perf-counter set, register it with the global
// collection, and hand the logger to the subsystems that report into it.
void MDCache::register_perfcounters()
{
  PerfCountersBuilder pcb(g_ceph_context, "mds_cache", l_mdc_first, l_mdc_last);

  // Stray/purge statistics
  pcb.add_u64(l_mdc_num_strays, "num_strays", "Stray dentries", "stry",
              PerfCountersBuilder::PRIO_INTERESTING);
  pcb.add_u64(l_mdc_num_recovering_enqueued,
              "num_recovering_enqueued", "Files waiting for recovery", "recy",
              PerfCountersBuilder::PRIO_INTERESTING);
  pcb.add_u64_counter(l_mdc_recovery_completed,
                      "recovery_completed", "File recoveries completed", "recd",
                      PerfCountersBuilder::PRIO_INTERESTING);

  // useful recovery queue statistics
  // (everything from here down defaults to PRIO_USEFUL)
  pcb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing",
              "Files currently being recovered");
  pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized",
              "Files waiting for recovery with elevated priority");
  pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started",
                      "File recoveries started");

  // along with other stray dentries stats
  pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed",
              "Stray dentries delayed");
  pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing",
              "Stray dentries enqueuing for purge");
  pcb.add_u64_counter(l_mdc_strays_created, "strays_created",
                      "Stray dentries created");
  pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
                      "Stray dentries enqueued for purge");
  pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated",
                      "Stray dentries reintegrated");
  pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated",
                      "Stray dentries migrated");

  // low prio internal request stats
  pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
                      "Internal Request type enqueue scrub");
  pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
                      "Internal Request type export dir");
  pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
                      "Internal Request type flush");
  pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
                      "Internal Request type fragmentdir");
  pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
                      "Internal Request type frag stats");
  pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
                      "Internal Request type inode stats");

  // Publish the counters and share the logger with the recovery queue and
  // stray manager, which update their own stats through it.
  logger.reset(pcb.create_perf_counters());
  g_ceph_context->get_perfcounters_collection()->add(logger.get());
  recovery_queue.set_logger(logger.get());
  stray_manager.set_logger(logger.get());
}
13302
13303 /**
13304 * Call this when putting references to an inode/dentry or
13305 * when attempting to trim it.
13306 *
13307 * If this inode is no longer linked by anyone, and this MDS
13308 * rank holds the primary dentry, and that dentry is in a stray
13309 * directory, then give up the dentry to the StrayManager, never
13310 * to be seen again by MDCache.
13311 *
13312 * @param delay if true, then purgeable inodes are stashed til
13313 * the next trim(), rather than being purged right
13314 * away.
13315 */
13316 void MDCache::maybe_eval_stray(CInode *in, bool delay) {
13317 if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
13318 mds->get_state() <= MDSMap::STATE_REJOIN)
13319 return;
13320
13321 CDentry *dn = in->get_projected_parent_dn();
13322
13323 if (dn->state_test(CDentry::STATE_PURGING)) {
13324 /* We have already entered the purging process, no need
13325 * to re-evaluate me ! */
13326 return;
13327 }
13328
13329 if (dn->get_dir()->get_inode()->is_stray()) {
13330 if (delay)
13331 stray_manager.queue_delayed(dn);
13332 else
13333 stray_manager.eval_stray(dn);
13334 }
13335 }
13336
13337 void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
13338 dout(10) << __func__ << " " << *diri << dendl;
13339 ceph_assert(diri->get_projected_parent_dir()->inode->is_stray());
13340 auto&& ls = diri->get_dirfrags();
13341 for (auto &p : ls) {
13342 if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
13343 p->try_remove_dentries_for_stray();
13344 }
13345 if (!diri->snaprealm) {
13346 if (diri->is_auth())
13347 diri->clear_dirty_rstat();
13348 diri->clear_scatter_dirty();
13349 }
13350 }
13351
13352 bool MDCache::dump_inode(Formatter *f, uint64_t number) {
13353 CInode *in = get_inode(number);
13354 if (!in) {
13355 return false;
13356 }
13357 f->open_object_section("inode");
13358 in->dump(f, CInode::DUMP_DEFAULT | CInode::DUMP_PATH);
13359 f->close_section();
13360 return true;
13361 }
13362
13363 void MDCache::handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap) {
13364 // process export_pin_delayed_queue whenever a new MDSMap received
13365 auto &q = export_pin_delayed_queue;
13366 for (auto it = q.begin(); it != q.end(); ) {
13367 auto *in = *it;
13368 mds_rank_t export_pin = in->get_export_pin(false);
13369 if (in->is_ephemerally_pinned()) {
13370 dout(10) << "ephemeral export pin to " << export_pin << " for " << *in << dendl;
13371 }
13372 dout(10) << " delayed export_pin=" << export_pin << " on " << *in
13373 << " max_mds=" << mdsmap.get_max_mds() << dendl;
13374 if (export_pin >= mdsmap.get_max_mds()) {
13375 it++;
13376 continue;
13377 }
13378
13379 in->state_clear(CInode::STATE_DELAYEDEXPORTPIN);
13380 it = q.erase(it);
13381 in->queue_export_pin(export_pin);
13382 }
13383
13384 if (mdsmap.get_max_mds() != oldmap.get_max_mds()) {
13385 dout(10) << "Checking ephemerally pinned directories for redistribute due to max_mds change." << dendl;
13386 /* copy to vector to avoid removals during iteration */
13387 std::vector<CInode*> migrate;
13388 migrate.assign(rand_ephemeral_pins.begin(), rand_ephemeral_pins.end());
13389 for (auto& in : migrate) {
13390 in->maybe_ephemeral_rand();
13391 }
13392 migrate.assign(dist_ephemeral_pins.begin(), dist_ephemeral_pins.end());
13393 for (auto& in : migrate) {
13394 in->maybe_ephemeral_dist();
13395 }
13396 }
13397 }