// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */
#ifndef CEPH_MDCACHE_H
#define CEPH_MDCACHE_H

#include <atomic>
#include <string_view>
#include <thread>

#include "common/DecayCounter.h"
#include "include/common_fwd.h"
#include "include/types.h"
#include "include/filepath.h"
#include "include/elist.h"

#include "messages/MCacheExpire.h"
#include "messages/MClientQuota.h"
#include "messages/MClientRequest.h"
#include "messages/MClientSnap.h"
#include "messages/MDentryLink.h"
#include "messages/MDentryUnlink.h"
#include "messages/MDirUpdate.h"
#include "messages/MDiscover.h"
#include "messages/MDiscoverReply.h"
#include "messages/MGatherCaps.h"
#include "messages/MGenericMessage.h"
#include "messages/MInodeFileCaps.h"
#include "messages/MLock.h"
#include "messages/MMDSCacheRejoin.h"
#include "messages/MMDSFindIno.h"
#include "messages/MMDSFindInoReply.h"
#include "messages/MMDSFragmentNotify.h"
#include "messages/MMDSFragmentNotifyAck.h"
#include "messages/MMDSOpenIno.h"
#include "messages/MMDSOpenInoReply.h"
#include "messages/MMDSResolve.h"
#include "messages/MMDSResolveAck.h"
#include "messages/MMDSPeerRequest.h"
#include "messages/MMDSSnapUpdate.h"

#include "osdc/Filer.h"
#include "CInode.h"
#include "CDentry.h"
#include "CDir.h"
#include "include/Context.h"
#include "events/EMetaBlob.h"
#include "RecoveryQueue.h"
#include "StrayManager.h"
#include "OpenFileTable.h"
#include "MDSContext.h"
#include "MDSMap.h"
#include "Mutation.h"

class MDSRank;
class Session;
class Migrator;

class ESubtreeMap;

enum {
  l_mdc_first = 3000,
  // How many inodes currently in stray dentries
  l_mdc_num_strays,
  // How many stray dentries are currently delayed for purge due to refs
  l_mdc_num_strays_delayed,
  // How many stray dentries are currently being enqueued for purge
  l_mdc_num_strays_enqueuing,

  // How many dentries have ever been added to stray dir
  l_mdc_strays_created,
  // How many dentries have been passed on to PurgeQueue
  l_mdc_strays_enqueued,
  // How many strays have been reintegrated?
  l_mdc_strays_reintegrated,
  // How many strays have been migrated?
  l_mdc_strays_migrated,

  // How many inode sizes currently being recovered
  l_mdc_num_recovering_processing,
  // How many inodes currently waiting to have size recovered
  l_mdc_num_recovering_enqueued,
  // How many inodes waiting with elevated priority for recovery
  l_mdc_num_recovering_prioritized,
  // How many inodes ever started size recovery
  l_mdc_recovery_started,
  // How many inodes ever completed size recovery
  l_mdc_recovery_completed,

  l_mdss_ireq_enqueue_scrub,
  l_mdss_ireq_exportdir,
  l_mdss_ireq_flush,
  l_mdss_ireq_fragmentdir,
  l_mdss_ireq_fragstats,
  l_mdss_ireq_inodestats,

  l_mdc_last,
};

// flags for path_traverse()
static const int MDS_TRAVERSE_DISCOVER = (1 << 0);
static const int MDS_TRAVERSE_PATH_LOCKED = (1 << 1);
static const int MDS_TRAVERSE_WANT_DENTRY = (1 << 2);
static const int MDS_TRAVERSE_WANT_AUTH = (1 << 3);
static const int MDS_TRAVERSE_RDLOCK_SNAP = (1 << 4);
static const int MDS_TRAVERSE_RDLOCK_SNAP2 = (1 << 5);
static const int MDS_TRAVERSE_WANT_DIRLAYOUT = (1 << 6);
static const int MDS_TRAVERSE_RDLOCK_PATH = (1 << 7);
static const int MDS_TRAVERSE_XLOCK_DENTRY = (1 << 8);
static const int MDS_TRAVERSE_RDLOCK_AUTHLOCK = (1 << 9);
static const int MDS_TRAVERSE_CHECK_LOCKCACHE = (1 << 10);
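
// These flags are combined bitwise. As an illustrative (not prescriptive)
// example, a lookup that must end on the auth MDS and wants the tail
// dentry xlocked might pass:
//
//   int flags = MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH |
//               MDS_TRAVERSE_XLOCK_DENTRY;
//
// See the path_traverse() documentation below for what each flag does.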


// flags for predirty_journal_parents()
static const int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting
static const int PREDIRTY_DIR = 2;     // update parent dir mtime/size
static const int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easier rollback)
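
// Illustrative sketch (names such as mut, metablob, newi and dn are assumed
// to be in scope): journalling a newly created primary dentry typically
// predirties both the nested accounting and the parent dir mtime/size:
//
//   predirty_journal_parents(mut, metablob, newi, dn->get_dir(),
//                            PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);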

class MDCache {
 public:
  typedef std::map<mds_rank_t, ref_t<MCacheExpire>> expiremap;

  using clock = ceph::coarse_mono_clock;
  using time = ceph::coarse_mono_time;

  // -- discover --
  struct discover_info_t {
    discover_info_t() {}
    ~discover_info_t() {
      if (basei)
        basei->put(MDSCacheObject::PIN_DISCOVERBASE);
    }
    void pin_base(CInode *b) {
      basei = b;
      basei->get(MDSCacheObject::PIN_DISCOVERBASE);
    }

    ceph_tid_t tid = 0;
    mds_rank_t mds = -1;
    inodeno_t ino;
    frag_t frag;
    snapid_t snap = CEPH_NOSNAP;
    filepath want_path;
    CInode *basei = nullptr;
    bool want_base_dir = false;
    bool path_locked = false;
  };

  // [reconnect/rejoin caps]
  struct reconnected_cap_info_t {
    reconnected_cap_info_t() {}
    inodeno_t realm_ino = 0;
    snapid_t snap_follows = 0;
    int dirty_caps = 0;
    bool snapflush = false;
  };

  // -- find_ino_peer --
  struct find_ino_peer_info_t {
    find_ino_peer_info_t() {}
    inodeno_t ino;
    ceph_tid_t tid = 0;
    MDSContext *fin = nullptr;
    bool path_locked = false;
    mds_rank_t hint = MDS_RANK_NONE;
    mds_rank_t checking = MDS_RANK_NONE;
    set<mds_rank_t> checked;
  };

  friend class C_MDC_RejoinOpenInoFinish;
  friend class C_MDC_RejoinSessionsOpened;

  friend class Locker;
  friend class Migrator;
  friend class MDBalancer;

  // StrayManager needs to be able to remove_inode() from us
  // when it is done purging
  friend class StrayManager;

  explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_);
  ~MDCache();

  uint64_t cache_limit_memory(void) {
    return cache_memory_limit;
  }
  double cache_toofull_ratio(void) const {
    double memory_reserve = cache_memory_limit*(1.0-cache_reservation);
    return fmax(0.0, (cache_size()-memory_reserve)/memory_reserve);
  }
  bool cache_toofull(void) const {
    return cache_toofull_ratio() > 0.0;
  }
  uint64_t cache_size(void) const {
    return mempool::get_pool(mempool::mds_co::id).allocated_bytes();
  }
  bool cache_overfull(void) const {
    return cache_size() > cache_memory_limit*cache_health_threshold;
  }
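
  // For example: with cache_memory_limit = 1 GiB and cache_reservation = 0.05,
  // memory_reserve is 1 GiB * (1.0 - 0.05) = 0.95 GiB. A cache_size() of
  // 1.14 GiB then gives cache_toofull_ratio() = (1.14 - 0.95) / 0.95 = 0.2,
  // so cache_toofull() returns true; cache_overfull() instead trips once
  // cache_size() exceeds cache_memory_limit * cache_health_threshold.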
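
  // Advance stray_index to the next of the NUM_STRAY stray directories
  // (round-robin over strays[]; see get_stray() below).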
  void advance_stray();

  unsigned get_ephemeral_dist_frag_bits() const {
    return export_ephemeral_dist_frag_bits;
  }
  bool get_export_ephemeral_distributed_config(void) const {
    return export_ephemeral_distributed_config;
  }

  bool get_export_ephemeral_random_config(void) const {
    return export_ephemeral_random_config;
  }

  /**
   * Call this when you know that a CDentry is ready to be passed
   * on to StrayManager (i.e. this is a stray you've just created)
   */
  void notify_stray(CDentry *dn) {
    ceph_assert(dn->get_dir()->get_inode()->is_stray());
    if (dn->state_test(CDentry::STATE_PURGING))
      return;

    stray_manager.eval_stray(dn);
  }
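
  // Maps an inode (and optionally a dirfrag) to an MDS rank; presumably
  // used by the ephemeral distributed export-pin logic to spread inodes
  // across the active ranks.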
  mds_rank_t hash_into_rank_bucket(inodeno_t ino, frag_t fg=0);

  void maybe_eval_stray(CInode *in, bool delay=false);
  void clear_dirty_bits_for_stray(CInode* diri);

  bool is_readonly() { return readonly; }
  void force_readonly();

  static file_layout_t gen_default_file_layout(const MDSMap &mdsmap);
  static file_layout_t gen_default_log_layout(const MDSMap &mdsmap);

  void register_perfcounters();

  void touch_client_lease(ClientLease *r, int pool, utime_t ttl) {
    client_leases[pool].push_back(&r->item_lease);
    r->ttl = ttl;
  }

  void notify_stray_removed()
  {
    stray_manager.notify_stray_removed();
  }

  void notify_stray_created()
  {
    stray_manager.notify_stray_created();
  }

  void eval_remote(CDentry *dn)
  {
    stray_manager.eval_remote(dn);
  }

  void _send_discover(discover_info_t& dis);
  discover_info_t& _create_discover(mds_rank_t mds) {
    ceph_tid_t t = ++discover_last_tid;
    discover_info_t& d = discovers[t];
    d.tid = t;
    d.mds = mds;
    return d;
  }

  void discover_base_ino(inodeno_t want_ino, MDSContext *onfinish, mds_rank_t from=MDS_RANK_NONE);
  void discover_dir_frag(CInode *base, frag_t approx_fg, MDSContext *onfinish,
                         mds_rank_t from=MDS_RANK_NONE);
  void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSContext *onfinish,
                     bool path_locked=false, mds_rank_t from=MDS_RANK_NONE);
  void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSContext *onfinish,
                     bool path_locked=false);
  void kick_discovers(mds_rank_t who);  // after a failure.

  // adjust subtree auth specification
  //  dir->dir_auth
  //  imports/exports/nested_exports
  //  join/split subtrees as appropriate
  bool is_subtrees() { return !subtrees.empty(); }
  template<typename T>
  void get_subtrees(T& c) {
    if constexpr (std::is_same_v<T, std::vector<CDir*>>)
      c.reserve(c.size() + subtrees.size());
    for (const auto& p : subtrees) {
      c.push_back(p.first);
    }
  }
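
  // Illustrative usage: collect all subtree roots into a vector; the
  // reserve() above avoids reallocations for the common
  // std::vector<CDir*> case.
  //
  //   std::vector<CDir*> roots;
  //   get_subtrees(roots);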
  void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool adjust_pop=true);
  void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) {
    adjust_subtree_auth(root, mds_authority_t(a,b));
  }
  void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth);
  void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_rank_t a) {
    adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
  }
  void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, const mds_authority_t &auth);
  void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, mds_rank_t a) {
    adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
  }
  void map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result);
  void try_subtree_merge(CDir *root);
  void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval, bool adjust_pop=true);
  void eval_subtree_root(CInode *diri);
  CDir *get_subtree_root(CDir *dir);
  CDir *get_projected_subtree_root(CDir *dir);
  bool is_leaf_subtree(CDir *dir) {
    ceph_assert(subtrees.count(dir));
    return subtrees[dir].empty();
  }
  void remove_subtree(CDir *dir);
  bool is_subtree(CDir *root) {
    return subtrees.count(root);
  }
  void get_subtree_bounds(CDir *root, set<CDir*>& bounds);
  void get_wouldbe_subtree_bounds(CDir *root, set<CDir*>& bounds);
  void verify_subtree_bounds(CDir *root, const set<CDir*>& bounds);
  void verify_subtree_bounds(CDir *root, const list<dirfrag_t>& bounds);

  void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir);
  void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop);

  auto get_auth_subtrees() {
    std::vector<CDir*> c;
    for (auto& p : subtrees) {
      auto& root = p.first;
      if (root->is_auth()) {
        c.push_back(root);
      }
    }
    return c;
  }

  auto get_fullauth_subtrees() {
    std::vector<CDir*> c;
    for (auto& p : subtrees) {
      auto& root = p.first;
      if (root->is_full_dir_auth()) {
        c.push_back(root);
      }
    }
    return c;
  }
  auto num_subtrees_fullauth() const {
    std::size_t n = 0;
    for (auto& p : subtrees) {
      auto& root = p.first;
      if (root->is_full_dir_auth()) {
        ++n;
      }
    }
    return n;
  }

  auto num_subtrees_fullnonauth() const {
    std::size_t n = 0;
    for (auto& p : subtrees) {
      auto& root = p.first;
      if (root->is_full_dir_nonauth()) {
        ++n;
      }
    }
    return n;
  }

  auto num_subtrees() const {
    return subtrees.size();
  }

  int get_num_client_requests();

  MDRequestRef request_start(const cref_t<MClientRequest>& req);
  MDRequestRef request_start_peer(metareqid_t rid, __u32 attempt, const cref_t<Message> &m);
  MDRequestRef request_start_internal(int op);
  bool have_request(metareqid_t rid) {
    return active_requests.count(rid);
  }
  MDRequestRef request_get(metareqid_t rid);
  void request_pin_ref(MDRequestRef& r, CInode *ref, vector<CDentry*>& trace);
  void request_finish(MDRequestRef& mdr);
  void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0);
  void dispatch_request(MDRequestRef& mdr);
  void request_drop_foreign_locks(MDRequestRef& mdr);
  void request_drop_non_rdlocks(MDRequestRef& r);
  void request_drop_locks(MDRequestRef& r);
  void request_cleanup(MDRequestRef& r);

  void request_kill(MDRequestRef& r);  // called when session closes

  // journal/snap helpers
  CInode *pick_inode_snap(CInode *in, snapid_t follows);
  CInode *cow_inode(CInode *in, snapid_t last);
  void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn,
                          snapid_t follows=CEPH_NOSNAP,
                          CInode **pcow_inode=0, CDentry::linkage_t *dnl=0);
  void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP);

  void project_rstat_inode_to_frag(const MutationRef& mut,
                                   CInode *cur, CDir *parent, snapid_t first,
                                   int linkunlink, SnapRealm *prealm);
  void _project_rstat_inode_to_frag(const CInode::mempool_inode* inode, snapid_t ofirst, snapid_t last,
                                    CDir *parent, int linkunlink, bool update_inode);
  void project_rstat_frag_to_inode(const nest_info_t& rstat, const nest_info_t& accounted_rstat,
                                   snapid_t ofirst, snapid_t last, CInode *pin, bool cow_head);
  void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1, bool quota_change = false);
  void predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
                                CInode *in, CDir *parent,
                                int flags, int linkunlink=0,
                                snapid_t follows=CEPH_NOSNAP);

  // peers
  void add_uncommitted_leader(metareqid_t reqid, LogSegment *ls, set<mds_rank_t> &peers, bool safe=false) {
    uncommitted_leaders[reqid].ls = ls;
    uncommitted_leaders[reqid].peers = peers;
    uncommitted_leaders[reqid].safe = safe;
  }
  void wait_for_uncommitted_leader(metareqid_t reqid, MDSContext *c) {
    uncommitted_leaders[reqid].waiters.push_back(c);
  }
  bool have_uncommitted_leader(metareqid_t reqid, mds_rank_t from) {
    auto p = uncommitted_leaders.find(reqid);
    return p != uncommitted_leaders.end() && p->second.peers.count(from) > 0;
  }
  void log_leader_commit(metareqid_t reqid);
  void logged_leader_update(metareqid_t reqid);
  void _logged_leader_commit(metareqid_t reqid);
  void committed_leader_peer(metareqid_t r, mds_rank_t from);
  void finish_committed_leaders();

  void add_uncommitted_peer(metareqid_t reqid, LogSegment*, mds_rank_t, MDPeerUpdate *su=nullptr);
  void wait_for_uncommitted_peer(metareqid_t reqid, MDSContext *c) {
    uncommitted_peers.at(reqid).waiters.push_back(c);
  }
  void finish_uncommitted_peer(metareqid_t reqid, bool assert_exist=true);
  MDPeerUpdate* get_uncommitted_peer(metareqid_t reqid, mds_rank_t leader);
  void _logged_peer_commit(mds_rank_t from, metareqid_t reqid);

  void set_recovery_set(set<mds_rank_t>& s);
  void handle_mds_failure(mds_rank_t who);
  void handle_mds_recovery(mds_rank_t who);

  void recalc_auth_bits(bool replay);
  void remove_inode_recursive(CInode *in);

  bool is_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) {
    auto p = ambiguous_peer_updates.find(leader);
    return p != ambiguous_peer_updates.end() && p->second.count(reqid);
  }
  void add_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) {
    ambiguous_peer_updates[leader].insert(reqid);
  }
  void remove_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) {
    auto p = ambiguous_peer_updates.find(leader);
    auto q = p->second.find(reqid);
    ceph_assert(q != p->second.end());
    p->second.erase(q);
    if (p->second.empty())
      ambiguous_peer_updates.erase(p);
  }

  void add_rollback(metareqid_t reqid, mds_rank_t leader) {
    resolve_need_rollback[reqid] = leader;
  }
  void finish_rollback(metareqid_t reqid, MDRequestRef& mdr);

  // ambiguous imports
  void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds);
  void add_ambiguous_import(CDir *base, const set<CDir*>& bounds);
  bool have_ambiguous_import(dirfrag_t base) {
    return my_ambiguous_imports.count(base);
  }
  void get_ambiguous_import_bounds(dirfrag_t base, vector<dirfrag_t>& bounds) {
    ceph_assert(my_ambiguous_imports.count(base));
    bounds = my_ambiguous_imports[base];
  }
  void cancel_ambiguous_import(CDir *);
  void finish_ambiguous_import(dirfrag_t dirino);
  void resolve_start(MDSContext *resolve_done_);
  void send_resolves();
  void maybe_send_pending_resolves() {
    if (resolves_pending)
      send_subtree_resolves();
  }

  void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
                               map<dirfrag_t,vector<dirfrag_t> >& subtrees);
  ESubtreeMap *create_subtree_map();

  void clean_open_file_lists();
  void dump_openfiles(Formatter *f);
  bool dump_inode(Formatter *f, uint64_t number);

  void rejoin_start(MDSContext *rejoin_done_);
  void rejoin_gather_finish();
  void rejoin_send_rejoins();
  void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
                          int target=-1, bool drop_path=false) {
    auto& ex = cap_exports[ino];
    ex.first = target;
    auto &_icr = ex.second[client] = icr;
    if (drop_path)
      _icr.path.clear();
  }
  void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
                             mds_rank_t frommds=MDS_RANK_NONE, bool drop_path=false) {
    auto &_icr = cap_imports[ino][client][frommds] = icr;
    if (drop_path)
      _icr.path.clear();
  }
  void rejoin_recovered_client(client_t client, const entity_inst_t& inst) {
    rejoin_client_map.emplace(client, inst);
  }
  bool rejoin_has_cap_reconnect(inodeno_t ino) const {
    return cap_imports.count(ino);
  }
  void add_replay_ino_alloc(inodeno_t ino) {
    cap_imports_missing.insert(ino); // avoid opening ino during cache rejoin
  }
  const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
    if (cap_imports.count(ino) &&
        cap_imports[ino].count(client) &&
        cap_imports[ino][client].count(MDS_RANK_NONE)) {
      return &cap_imports[ino][client][MDS_RANK_NONE];
    }
    return NULL;
  }
  void remove_replay_cap_reconnect(inodeno_t ino, client_t client) {
    ceph_assert(cap_imports[ino].size() == 1);
    ceph_assert(cap_imports[ino][client].size() == 1);
    cap_imports.erase(ino);
  }
  void wait_replay_cap_reconnect(inodeno_t ino, MDSContext *c) {
    cap_reconnect_waiters[ino].push_back(c);
  }

  void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) {
    reconnected_cap_info_t &info = reconnected_caps[ino][client];
    info.realm_ino = inodeno_t(icr.capinfo.snaprealm);
    info.snap_follows = icr.snap_follows;
  }
  void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty, bool snapflush) {
    reconnected_cap_info_t &info = reconnected_caps[ino][client];
    info.dirty_caps |= dirty;
    if (snapflush)
      info.snapflush = snapflush;
  }
  void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) {
    reconnected_snaprealms[ino][client] = seq;
  }

  void rejoin_open_ino_finish(inodeno_t ino, int ret);
  void rejoin_prefetch_ino_finish(inodeno_t ino, int ret);
  void rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map);
  bool process_imported_caps();
  void choose_lock_states_and_reconnect_caps();
  void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
                           map<client_t,ref_t<MClientSnap>>& splits);
  void prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm, map<client_t,ref_t<MClientSnap>>& splits);
  void send_snaps(map<client_t,ref_t<MClientSnap>>& splits);
  Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds);
  void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
                                  map<client_t,ref_t<MClientSnap>>& updates);
  Capability* try_reconnect_cap(CInode *in, Session *session);
  void export_remaining_imported_caps();

  void do_cap_import(Session *session, CInode *in, Capability *cap,
                     uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
                     int peer, int p_flags);
  void do_delayed_cap_imports();
  void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client,
                              snapid_t snap_follows);
  void open_snaprealms();

  bool open_undef_inodes_dirfrags();
  void opened_undef_inode(CInode *in);
  void opened_undef_dirfrag(CDir *dir) {
    rejoin_undef_dirfrags.erase(dir);
  }

  void reissue_all_caps();

  void start_files_to_recover();
  void do_file_recover();
  void queue_file_recover(CInode *in);
  void _queued_file_recover_cow(CInode *in, MutationRef& mut);

  void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map);

  // debug
  void log_stat();

  // root inode
  CInode *get_root() { return root; }
  CInode *get_myin() { return myin; }

  size_t get_cache_size() { return lru.lru_get_size(); }

  // trimming
  std::pair<bool, uint64_t> trim(uint64_t count=0);

  bool trim_non_auth_subtree(CDir *directory);
  void standby_trim_segment(LogSegment *ls);
  void try_trim_non_auth_subtree(CDir *dir);
  bool can_trim_non_auth_dirfrag(CDir *dir) {
    return my_ambiguous_imports.count((dir)->dirfrag()) == 0 &&
           uncommitted_peer_rename_olddir.count(dir->inode) == 0;
  }

  /**
   * For all unreferenced inodes, dirs, dentries below an inode, compose
   * expiry messages. This is used when giving up all replicas of entities
   * for an MDS peer in the 'stopping' state, such that the peer can
   * empty its cache and finish shutting down.
   *
   * We have to make sure we're only expiring un-referenced items to
   * avoid interfering with ongoing stray-movement (we can't distinguish
   * between the "moving my strays" and "waiting for my cache to empty"
   * phases within 'stopping').
   *
   * @return false if we completed cleanly, true if caller should stop
   * expiring because we hit something with refs.
   */
  bool expire_recursive(CInode *in, expiremap& expiremap);

  void trim_client_leases();
  void check_memory_usage();

  void shutdown_start();
  void shutdown_check();
  bool shutdown_pass();
  bool shutdown();  // clear cache (i.e., at shutdown)
  bool shutdown_export_strays();
  void shutdown_export_stray_finish(inodeno_t ino) {
    if (shutdown_exporting_strays.erase(ino))
      shutdown_export_strays();
  }

  // inode_map
  bool have_inode(vinodeno_t vino) {
    if (vino.snapid == CEPH_NOSNAP)
      return inode_map.count(vino.ino) ? true : false;
    else
      return snap_inode_map.count(vino) ? true : false;
  }
  bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) {
    return have_inode(vinodeno_t(ino, snap));
  }
  CInode* get_inode(vinodeno_t vino) {
    if (vino.snapid == CEPH_NOSNAP) {
      auto p = inode_map.find(vino.ino);
      if (p != inode_map.end())
        return p->second;
    } else {
      auto p = snap_inode_map.find(vino);
      if (p != snap_inode_map.end())
        return p->second;
    }
    return NULL;
  }
  CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) {
    return get_inode(vinodeno_t(ino, s));
  }
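  // lookup_snap_inode(): snap_inode_map is keyed by (ino, last), so
  // lower_bound() lands on the snapped inode with the smallest 'last' >=
  // the requested snapid; the check below confirms that its [first, last]
  // interval actually covers vino.snapid.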
  CInode* lookup_snap_inode(vinodeno_t vino) {
    auto p = snap_inode_map.lower_bound(vino);
    if (p != snap_inode_map.end() &&
        p->second->ino() == vino.ino && p->second->first <= vino.snapid)
      return p->second;
    return NULL;
  }

  CDir* get_dirfrag(dirfrag_t df) {
    CInode *in = get_inode(df.ino);
    if (!in)
      return NULL;
    return in->get_dirfrag(df.frag);
  }
  CDir* get_dirfrag(inodeno_t ino, std::string_view dn) {
    CInode *in = get_inode(ino);
    if (!in)
      return NULL;
    frag_t fg = in->pick_dirfrag(dn);
    return in->get_dirfrag(fg);
  }
  CDir* get_force_dirfrag(dirfrag_t df, bool replay) {
    CInode *diri = get_inode(df.ino);
    if (!diri)
      return NULL;
    CDir *dir = force_dir_fragment(diri, df.frag, replay);
    if (!dir)
      dir = diri->get_dirfrag(df.frag);
    return dir;
  }

  MDSCacheObject *get_object(const MDSCacheObjectInfo &info);

  void add_inode(CInode *in);

  void remove_inode(CInode *in);
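
  // Dentry LRU policy: dentries flagged BOTTOMLRU live in bottom_lru and
  // are trimmed first; auth dentries are touched to the hot end of the
  // main LRU, while replicas are only mid-touched so they age out sooner.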
  void touch_dentry(CDentry *dn) {
    if (dn->state_test(CDentry::STATE_BOTTOMLRU)) {
      bottom_lru.lru_midtouch(dn);
    } else {
      if (dn->is_auth())
        lru.lru_touch(dn);
      else
        lru.lru_midtouch(dn);
    }
  }
  void touch_dentry_bottom(CDentry *dn) {
    if (dn->state_test(CDentry::STATE_BOTTOMLRU))
      return;
    lru.lru_bottouch(dn);
  }

  // truncate
  void truncate_inode(CInode *in, LogSegment *ls);
  void _truncate_inode(CInode *in, LogSegment *ls);
  void truncate_inode_finish(CInode *in, LogSegment *ls);
  void truncate_inode_logged(CInode *in, MutationRef& mut);

  void add_recovered_truncate(CInode *in, LogSegment *ls);
  void remove_recovered_truncate(CInode *in, LogSegment *ls);
  void start_recovered_truncates();

  // purge unsafe inodes
  void start_purge_inodes();
  void purge_inodes(const interval_set<inodeno_t>& i, LogSegment *ls);

  CDir *get_auth_container(CDir *in);
  CDir *get_export_container(CDir *dir);
  void find_nested_exports(CDir *dir, set<CDir*>& s);
  void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);

  void init_layouts();
  void create_unlinked_system_inode(CInode *in, inodeno_t ino,
                                    int mode) const;
  CInode *create_system_inode(inodeno_t ino, int mode);
  CInode *create_root_inode();

  void create_empty_hierarchy(MDSGather *gather);
  void create_mydir_hierarchy(MDSGather *gather);

  bool is_open() { return open; }
  void wait_for_open(MDSContext *c) {
    waiting_for_open.push_back(c);
  }

  void open_root_inode(MDSContext *c);
  void open_root();
  void open_mydir_inode(MDSContext *c);
  void open_mydir_frag(MDSContext *c);
  void populate_mydir();

  void _create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin);
  void _create_system_file_finish(MutationRef& mut, CDentry *dn,
                                  version_t dpv, MDSContext *fin);

  void open_foreign_mdsdir(inodeno_t ino, MDSContext *c);
  CDir *get_stray_dir(CInode *in);

  /**
   * Find the given dentry (and whether it exists or not), its ancestors,
   * and get them all into memory and usable on this MDS. This function
   * makes a best-effort attempt to load everything; if it needs to
   * go away and do something then it will put the request on a waitlist.
   * Waiters are constructed through the context factory cf.
   *
   * @param mdr The MDRequest associated with the path. Can be null.
   * @param cf An MDSContextFactory for waiter building.
   * @param path The path to traverse to.
   *
   * @param flags Specifies different lookup behaviors.
   * By default, path_traverse() forwards the request to the auth MDS if that
   * is appropriate (i.e., if it doesn't know the contents of a directory).
   * MDS_TRAVERSE_DISCOVER: Instead of forwarding the request, path_traverse()
   * attempts to look up the path from a different MDS (and bring the items
   * into its cache as replicas).
   * MDS_TRAVERSE_PATH_LOCKED: path_traverse() will proceed when an xlocked
   * dentry is encountered.
   * MDS_TRAVERSE_WANT_DENTRY: Caller wants the tail dentry. Add a null dentry
   * if the tail dentry does not exist; return 0 even if the tail dentry is
   * null.
   * MDS_TRAVERSE_WANT_AUTH: Always forward the request to the auth MDS of the
   * target inode, or the auth MDS of the tail dentry (if
   * MDS_TRAVERSE_WANT_DENTRY is set).
   *
   * @param pdnvec Data return parameter -- on success, contains a
   * vector of dentries. On failure, is either empty or contains the
   * full trace of traversable dentries.
   * @param pin Data return parameter -- if successful, points to the inode
   * associated with filepath. If unsuccessful, is null.
   *
   * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
   * If it returns 1, the requester associated with this call has been placed
   * on the appropriate waitlist, and it should unwind itself and back out.
   * If it returns 2 the request has been forwarded, and again the requester
   * should unwind itself and back out.
   */
  int path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
                    const filepath& path, int flags,
                    vector<CDentry*> *pdnvec, CInode **pin=nullptr);
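
  // Illustrative call, mirroring the return-code contract documented above
  // (mdr, cf and path are assumed to be in scope; handle_error() is a
  // hypothetical stand-in):
  //
  //   std::vector<CDentry*> trace;
  //   CInode *in = nullptr;
  //   int r = path_traverse(mdr, cf, path, MDS_TRAVERSE_DISCOVER, &trace, &in);
  //   if (r > 0)
  //     return;           // 1 = waiting, 2 = forwarded; unwind and back out
  //   if (r < 0)
  //     handle_error(-r);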

  CInode *cache_traverse(const filepath& path);

  void open_remote_dirfrag(CInode *diri, frag_t fg, MDSContext *fin);
  CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false);

  bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing);
  bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
                                   set<CDir*>& fetch_queue, set<inodeno_t>& missing,
                                   C_GatherBuilder &gather_bld);

  void open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin,
                          bool want_xlocked=false);
  void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
                                  bool want_xlocked, int r);

  void make_trace(vector<CDentry*>& trace, CInode *in);

  void kick_open_ino_peers(mds_rank_t who);
  void open_ino(inodeno_t ino, int64_t pool, MDSContext *fin,
                bool want_replica=true, bool want_xlocked=false,
                vector<inode_backpointer_t> *ancestors_hint=nullptr,
                mds_rank_t auth_hint=MDS_RANK_NONE);

  void find_ino_peers(inodeno_t ino, MDSContext *c,
                      mds_rank_t hint=MDS_RANK_NONE, bool path_locked=false);
  void _do_find_ino_peer(find_ino_peer_info_t& fip);
  void handle_find_ino(const cref_t<MMDSFindIno> &m);
  void handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m);
  void kick_find_ino_peers(mds_rank_t who);

  SnapRealm *get_global_snaprealm() const { return global_snaprealm; }
  void create_global_snaprealm();
  void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients=true);
  void send_snap_update(CInode *in, version_t stid, int snap_op);
  void handle_snap_update(const cref_t<MMDSSnapUpdate> &m);
  void notify_global_snaprealm_update(int snap_op);

  // -- stray --
  void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
  uint64_t get_num_strays() const { return stray_manager.get_num_strays(); }

  // == messages ==
  void dispatch(const cref_t<Message> &m);

  void encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl);
  void encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl);
  void encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
                            uint64_t features);

  void decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from, MDSContext::vec& finished);
  void decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished);
  void decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished);

  void encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl);
  void decode_replica_stray(CDentry *&straydn, const bufferlist &bl, mds_rank_t from);

  // -- namespace --
  void encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl);
  void decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p);
  void send_dentry_link(CDentry *dn, MDRequestRef& mdr);
  void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr);

  void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSContext *c) {
    uncommitted_fragments.at(dirfrag).waiters.push_back(c);
  }
  bool is_any_uncommitted_fragment() const {
    return !uncommitted_fragments.empty();
  }
  void wait_for_uncommitted_fragments(MDSContext* finisher);
  void rollback_uncommitted_fragments();

  void split_dir(CDir *dir, int byn);
  void merge_dir(CInode *diri, frag_t fg);

  void find_stale_fragment_freeze();
  void fragment_freeze_inc_num_waiters(CDir *dir);
  bool fragment_are_all_frozen(CDir *dir);
  int get_num_fragmenting_dirs() { return fragments.size(); }

  // -- updates --
  //int send_inode_updates(CInode *in);
  //void handle_inode_update(MInodeUpdate *m);

  int send_dir_updates(CDir *in, bool bcast=false);
  void handle_dir_update(const cref_t<MDirUpdate> &m);

  // -- cache expiration --
  void handle_cache_expire(const cref_t<MCacheExpire> &m);
  void process_delayed_expire(CDir *dir);
  void discard_delayed_expire(CDir *dir);

  // -- mdsmap --
  void handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap);

  int dump_cache() { return dump_cache({}, nullptr); }
  int dump_cache(std::string_view filename);
  int dump_cache(Formatter *f);
  void dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f);

  void cache_status(Formatter *f);

  void dump_resolve_status(Formatter *f) const;
  void dump_rejoin_status(Formatter *f) const;

  // == crap fns ==
  void show_cache();
  void show_subtrees(int dbl=10, bool force_print=false);

  CInode *hack_pick_random_inode() {
    ceph_assert(!inode_map.empty());
    int n = rand() % inode_map.size();
    auto p = inode_map.begin();
    while (n--) ++p;
    return p->second;
  }

  void flush_dentry(std::string_view path, Context *fin);
  /**
   * Create and start an OP_ENQUEUE_SCRUB
   */
  void enqueue_scrub(std::string_view path, std::string_view tag,
                     bool force, bool recursive, bool repair,
                     Formatter *f, Context *fin);
  void repair_inode_stats(CInode *diri);
  void repair_dirfrag_stats(CDir *dir);
  void rdlock_dirfrags_stats(CInode *diri, MDSInternalContext *fin);

  // my leader
  MDSRank *mds;

  // -- my cache --
  LRU lru;         // dentry lru for expiring items from cache
  LRU bottom_lru;  // dentries that should be trimmed ASAP

  DecayRate decayrate;

  int num_shadow_inodes = 0;

  int num_inodes_with_caps = 0;

  unsigned max_dir_commit_size;

  file_layout_t default_file_layout;
  file_layout_t default_log_layout;

  // -- client leases --
  static constexpr std::size_t client_lease_pools = 3;
  std::array<float, client_lease_pools> client_lease_durations{5.0, 30.0, 300.0};

  // -- client caps --
  uint64_t last_cap_id = 0;

  map<ceph_tid_t, discover_info_t> discovers;
  ceph_tid_t discover_last_tid = 0;

  // waiters
  map<int, map<inodeno_t, MDSContext::vec > > waiting_for_base_ino;

  map<inodeno_t,map<client_t, reconnected_cap_info_t> > reconnected_caps;  // inode -> client -> snap_follows,realmino
  map<inodeno_t,map<client_t, snapid_t> > reconnected_snaprealms;  // realmino -> client -> realmseq

  // realm inodes
  set<CInode*> rejoin_pending_snaprealms;
  // cap imports. delayed snap parent opens.
  map<client_t,set<CInode*> > delayed_imported_caps;

  // subsystems
  std::unique_ptr<Migrator> migrator;

  bool did_shutdown_log_cap = false;

  map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer;
  ceph_tid_t find_ino_peer_last_tid = 0;

  // delayed cache expire
  map<CDir*, expiremap> delayed_expire; // subtree root -> expire msg

  /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
  std::set<CInode *> export_pin_queue;
  std::set<CInode *> export_pin_delayed_queue;
  std::set<CInode *> export_ephemeral_pins;

  OpenFileTable open_file_table;

  double export_ephemeral_random_max = 0.0;

 protected:
  // track leader requests whose peers haven't acknowledged commit
  struct uleader {
    uleader() {}
    set<mds_rank_t> peers;
    LogSegment *ls = nullptr;
    MDSContext::vec waiters;
    bool safe = false;
    bool committing = false;
    bool recovering = false;
  };

  struct upeer {
    upeer() {}
    mds_rank_t leader;
    LogSegment *ls = nullptr;
    MDPeerUpdate *su = nullptr;
    MDSContext::vec waiters;
  };

  struct open_ino_info_t {
    open_ino_info_t() {}
    vector<inode_backpointer_t> ancestors;
    set<mds_rank_t> checked;
    mds_rank_t checking = MDS_RANK_NONE;
    mds_rank_t auth_hint = MDS_RANK_NONE;
    bool check_peers = true;
    bool fetch_backtrace = true;
    bool discover = false;
    bool want_replica = false;
    bool want_xlocked = false;
    version_t tid = 0;
    int64_t pool = -1;
    int last_err = 0;
    MDSContext::vec waiters;
  };

  friend struct C_MDC_OpenInoTraverseDir;
  friend struct C_MDC_OpenInoParentOpened;
  friend struct C_MDC_RetryScanStray;

  friend class C_IO_MDC_OpenInoBacktraceFetched;
  friend class C_MDC_Join;
  friend class C_MDC_RespondInternalRequest;

  friend class EPeerUpdate;
  friend class ECommitted;

  void set_readonly() { readonly = true; }

  void handle_resolve(const cref_t<MMDSResolve> &m);
  void handle_resolve_ack(const cref_t<MMDSResolveAck> &m);
  void process_delayed_resolve();
  void discard_delayed_resolve(mds_rank_t who);
  void maybe_resolve_finish();
  void disambiguate_my_imports();
  void disambiguate_other_imports();
  void trim_unlinked_inodes();

  void send_peer_resolves();
  void send_subtree_resolves();
  void maybe_finish_peer_resolve();

  void rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin);
  void handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m);
  void handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &m);
  CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last);
  CDir* rejoin_invent_dirfrag(dirfrag_t df);
  void handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &m);
  void rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack,
                                      set<vinodeno_t>& acked_inodes,
                                      set<SimpleLock *>& gather_locks);
  void handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &m);
  void rejoin_send_acks();
  void rejoin_trim_undef_inodes();
  void maybe_send_pending_rejoins() {
    if (rejoins_pending)
      rejoin_send_rejoins();
  }

  void touch_inode(CInode *in) {
    if (in->get_parent_dn())
      touch_dentry(in->get_projected_parent_dn());
  }

  void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin,
                            set<SimpleLock *>& gather_locks);
  void dentry_remove_replica(CDentry *dn, mds_rank_t rep, set<SimpleLock *>& gather_locks);

  void rename_file(CDentry *srcdn, CDentry *destdn);

  void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err);
  void _open_ino_parent_opened(inodeno_t ino, int ret);
  void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err);
  void _open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, CDir *dir, bool parent);
  int open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m,
                            const vector<inode_backpointer_t>& ancestors,
                            bool discover, bool want_xlocked, mds_rank_t *hint);
  void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err);
  void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err);
  void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info);
  void handle_open_ino(const cref_t<MMDSOpenIno> &m, int err=0);
  void handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m);

  void scan_stray_dir(dirfrag_t next=dirfrag_t());
  // -- replicas --
  void handle_discover(const cref_t<MDiscover> &dis);
  void handle_discover_reply(const cref_t<MDiscoverReply> &m);
  void handle_dentry_link(const cref_t<MDentryLink> &m);
  void handle_dentry_unlink(const cref_t<MDentryUnlink> &m);

  int dump_cache(std::string_view fn, Formatter *f);

  void flush_dentry_work(MDRequestRef& mdr);
  /**
   * Resolve path to a dentry and pass it onto the ScrubStack.
   *
   * TODO: return enough information to the original mdr formatter
   * and completion that they can subsequently check the progress of
   * this scrub (we won't block them on a whole scrub as it can take a very
   * long time)
   */
  void enqueue_scrub_work(MDRequestRef& mdr);
  void repair_inode_stats_work(MDRequestRef& mdr);
  void repair_dirfrag_stats_work(MDRequestRef& mdr);
  void rdlock_dirfrags_stats_work(MDRequestRef& mdr);

  ceph::unordered_map<inodeno_t,CInode*> inode_map;  // map of head inodes by ino
  map<vinodeno_t, CInode*> snap_inode_map;  // map of snap inodes by ino
  CInode *root = nullptr;  // root inode
  CInode *myin = nullptr;  // .ceph/mds%d dir

  bool readonly = false;

  int stray_index = 0;
  int stray_fragmenting_index = -1;

  set<CInode*> base_inodes;

  std::unique_ptr<PerfCounters> logger;

  Filer filer;
  std::array<xlist<ClientLease*>, client_lease_pools> client_leases{};

  /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
  map<CDir*,set<CDir*> > subtrees;
  map<CInode*,list<pair<CDir*,CDir*> > > projected_subtree_renames;  // renamed ino -> target dir

  // -- requests --
  ceph::unordered_map<metareqid_t, MDRequestRef> active_requests;

  // -- recovery --
  set<mds_rank_t> recovery_set;

  // [resolve]
  // from EImportStart w/o EImportFinish during journal replay
  map<dirfrag_t, vector<dirfrag_t> > my_ambiguous_imports;
  // from MMDSResolves
  map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports;

  map<CInode*, int> uncommitted_peer_rename_olddir;  // peer: preserve the non-auth dir until seeing commit.
  map<CInode*, int> uncommitted_peer_unlink;  // peer: preserve the unlinked inode until seeing commit.

  map<metareqid_t, uleader> uncommitted_leaders;  // leader: req -> peer set
  map<metareqid_t, upeer> uncommitted_peers;  // peer: preserve the peer req until seeing commit.

  set<metareqid_t> pending_leaders;
  map<int, set<metareqid_t> > ambiguous_peer_updates;

  bool resolves_pending = false;
  set<mds_rank_t> resolve_gather;      // nodes i need resolves from
  set<mds_rank_t> resolve_ack_gather;  // nodes i need a resolve_ack from
  set<version_t> resolve_snapclient_commits;
  map<metareqid_t, mds_rank_t> resolve_need_rollback;  // rollbacks i'm writing to the journal
  map<mds_rank_t, cref_t<MMDSResolve>> delayed_resolve;

  // [rejoin]
  bool rejoins_pending = false;
  set<mds_rank_t> rejoin_gather;      // nodes from whom i need a rejoin
  set<mds_rank_t> rejoin_sent;        // nodes i sent a rejoin to
  set<mds_rank_t> rejoin_ack_sent;    // nodes i sent a rejoin to
  set<mds_rank_t> rejoin_ack_gather;  // nodes from whom i need a rejoin ack
  map<mds_rank_t,map<inodeno_t,map<client_t,Capability::Import> > > rejoin_imported_caps;
  map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > > rejoin_peer_exports;

  map<client_t,entity_inst_t> rejoin_client_map;
  map<client_t,client_metadata_t> rejoin_client_metadata_map;
  map<client_t,pair<Session*,uint64_t> > rejoin_session_map;

  map<inodeno_t,pair<mds_rank_t,map<client_t,cap_reconnect_t> > > cap_exports;  // ino -> target, client -> capex

  map<inodeno_t,map<client_t,map<mds_rank_t,cap_reconnect_t> > > cap_imports;  // ino -> client -> frommds -> capex
  set<inodeno_t> cap_imports_missing;
  map<inodeno_t, MDSContext::vec > cap_reconnect_waiters;
  int cap_imports_num_opening = 0;

  set<CInode*> rejoin_undef_inodes;
  set<CInode*> rejoin_potential_updated_scatterlocks;
  set<CDir*> rejoin_undef_dirfrags;
  map<mds_rank_t, set<CInode*> > rejoin_unlinked_inodes;

  vector<CInode*> rejoin_recover_q, rejoin_check_q;
  list<SimpleLock*> rejoin_eval_locks;
  MDSContext::vec rejoin_waiters;

  std::unique_ptr<MDSContext> rejoin_done;
  std::unique_ptr<MDSContext> resolve_done;

  ceph_tid_t open_ino_last_tid = 0;
  map<inodeno_t,open_ino_info_t> opening_inodes;

  StrayManager stray_manager;

 private:
  // -- fragmenting --
  struct ufragment {
    ufragment() {}
    int bits = 0;
    bool committed = false;
    LogSegment *ls = nullptr;
    MDSContext::vec waiters;
    frag_vec_t old_frags;
    bufferlist rollback;
  };

  struct fragment_info_t {
    fragment_info_t() {}
    bool is_fragmenting() { return !resultfrags.empty(); }
    uint64_t get_tid() { return mdr ? mdr->reqid.tid : 0; }
    int bits;
    std::vector<CDir*> dirs;
    std::vector<CDir*> resultfrags;
    MDRequestRef mdr;
    set<mds_rank_t> notify_ack_waiting;
    bool finishing = false;

    // for deadlock detection
    bool all_frozen = false;
    utime_t last_cum_auth_pins_change;
    int last_cum_auth_pins = 0;
    int num_remote_waiters = 0;  // number of remote authpin waiters
  };

  typedef map<dirfrag_t,fragment_info_t>::iterator fragment_info_iterator;

  friend class EFragment;
  friend class C_MDC_FragmentFrozen;
  friend class C_MDC_FragmentMarking;
  friend class C_MDC_FragmentPrep;
  friend class C_MDC_FragmentStore;
  friend class C_MDC_FragmentCommit;
  friend class C_MDC_FragmentRollback;
  friend class C_IO_MDC_FragmentPurgeOld;

  // -- subtrees --
  static const unsigned int SUBTREES_COUNT_THRESHOLD = 5;
  static const unsigned int SUBTREES_DEPTH_THRESHOLD = 5;
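
  // The current stray directory; advance_stray() rotates stray_index
  // round-robin through strays[].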
  CInode *get_stray() {
    return strays[stray_index];
  }

  void identify_files_to_recover();

  std::pair<bool, uint64_t> trim_lru(uint64_t count, expiremap& expiremap);
  bool trim_dentry(CDentry *dn, expiremap& expiremap);
  void trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap);
  bool trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap&);
  void send_expire_messages(expiremap& expiremap);
  void trim_non_auth();  // trim out trimmable non-auth items

  void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
                            std::vector<CDir*>* frags, MDSContext::vec& waiters, bool replay);
  void adjust_dir_fragments(CInode *diri,
                            const std::vector<CDir*>& srcfrags,
                            frag_t basefrag, int bits,
                            std::vector<CDir*>* resultfrags,
                            MDSContext::vec& waiters,
                            bool replay);
  CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true);
  void get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds);

  bool can_fragment(CInode *diri, const std::vector<CDir*>& dirs);
  void fragment_freeze_dirs(const std::vector<CDir*>& dirs);
  void fragment_mark_and_complete(MDRequestRef& mdr);
  void fragment_frozen(MDRequestRef& mdr, int r);
  void fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs);
  void fragment_drop_locks(fragment_info_t &info);
  void fragment_maybe_finish(const fragment_info_iterator& it);
  void dispatch_fragment_dir(MDRequestRef& mdr);
  void _fragment_logged(MDRequestRef& mdr);
  void _fragment_stored(MDRequestRef& mdr);
  void _fragment_committed(dirfrag_t f, const MDRequestRef& mdr);
  void _fragment_old_purged(dirfrag_t f, int bits, const MDRequestRef& mdr);

  void handle_fragment_notify(const cref_t<MMDSFragmentNotify> &m);
  void handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &m);

  void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frag,
                                LogSegment *ls, bufferlist *rollback=NULL);
  void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
  void rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags);

  void upkeep_main(void);

  uint64_t cache_memory_limit;
  double cache_reservation;
  double cache_health_threshold;
  std::array<CInode *, NUM_STRAY> strays{};  // my stray dir

  bool export_ephemeral_distributed_config;
  bool export_ephemeral_random_config;
  unsigned export_ephemeral_dist_frag_bits;

  // File size recovery
  RecoveryQueue recovery_queue;

  // shutdown
  set<inodeno_t> shutdown_exporting_strays;
  pair<dirfrag_t, string> shutdown_export_next;

  bool opening_root = false, open = false;
  MDSContext::vec waiting_for_open;

  // -- snaprealms --
  SnapRealm *global_snaprealm = nullptr;

  map<dirfrag_t, ufragment> uncommitted_fragments;

  map<dirfrag_t,fragment_info_t> fragments;

  DecayCounter trim_counter;

  std::thread upkeeper;
  ceph::mutex upkeep_mutex = ceph::make_mutex("MDCache::upkeep_mutex");
  ceph::condition_variable upkeep_cvar;
  time upkeep_last_trim = time::min();
  time upkeep_last_release = time::min();
  std::atomic<bool> upkeep_trim_shutdown{false};
};

class C_MDS_RetryRequest : public MDSInternalContext {
  MDCache *cache;
  MDRequestRef mdr;
 public:
  C_MDS_RetryRequest(MDCache *c, MDRequestRef& r) :
    MDSInternalContext(c->mds), cache(c), mdr(r) {}
  void finish(int r) override;
};
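
// Typical (illustrative) use, assuming an MDCache* mdcache, an MDRequestRef
// mdr and a CDir* dir are in scope: re-dispatch the whole request once the
// directory finishes fetching:
//
//   dir->add_waiter(CDir::WAIT_COMPLETE, new C_MDS_RetryRequest(mdcache, mdr));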

class CF_MDS_RetryRequestFactory : public MDSContextFactory {
 public:
  CF_MDS_RetryRequestFactory(MDCache *cache, MDRequestRef &mdr, bool dl) :
    mdcache(cache), mdr(mdr), drop_locks(dl) {}
  MDSContext *build() override;
 private:
  MDCache *mdcache;
  MDRequestRef mdr;
  bool drop_locks;
};

#endif