// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_MDCACHE_H
#define CEPH_MDCACHE_H

#include <atomic>
#include <string_view>
#include <thread>

#include "common/DecayCounter.h"
#include "include/common_fwd.h"
#include "include/types.h"
#include "include/filepath.h"
#include "include/elist.h"

#include "messages/MCacheExpire.h"
#include "messages/MClientQuota.h"
#include "messages/MClientRequest.h"
#include "messages/MClientSnap.h"
#include "messages/MDentryLink.h"
#include "messages/MDentryUnlink.h"
#include "messages/MDirUpdate.h"
#include "messages/MDiscover.h"
#include "messages/MDiscoverReply.h"
#include "messages/MGatherCaps.h"
#include "messages/MGenericMessage.h"
#include "messages/MInodeFileCaps.h"
#include "messages/MLock.h"
#include "messages/MMDSCacheRejoin.h"
#include "messages/MMDSFindIno.h"
#include "messages/MMDSFindInoReply.h"
#include "messages/MMDSFragmentNotify.h"
#include "messages/MMDSFragmentNotifyAck.h"
#include "messages/MMDSOpenIno.h"
#include "messages/MMDSOpenInoReply.h"
#include "messages/MMDSResolve.h"
#include "messages/MMDSResolveAck.h"
#include "messages/MMDSPeerRequest.h"
#include "messages/MMDSSnapUpdate.h"

#include "osdc/Filer.h"
#include "CInode.h"
#include "CDentry.h"
#include "CDir.h"
#include "include/Context.h"
#include "events/EMetaBlob.h"
#include "RecoveryQueue.h"
#include "StrayManager.h"
#include "OpenFileTable.h"
#include "MDSContext.h"
#include "MDSMap.h"
#include "Mutation.h"

class MDSRank;
class Session;
class Migrator;

class ESubtreeMap;

enum {
  l_mdc_first = 3000,

  // dir updates for replication
  l_mdc_dir_update,
  l_mdc_dir_update_receipt,
  l_mdc_dir_try_discover,
  l_mdc_dir_send_discover,
  l_mdc_dir_handle_discover,

  // How many inodes currently in stray dentries
  l_mdc_num_strays,
  // How many stray dentries are currently delayed for purge due to refs
  l_mdc_num_strays_delayed,
  // How many stray dentries are currently being enqueued for purge
  l_mdc_num_strays_enqueuing,

  // How many dentries have ever been added to stray dir
  l_mdc_strays_created,
  // How many dentries have been passed on to PurgeQueue
  l_mdc_strays_enqueued,
  // How many strays have been reintegrated?
  l_mdc_strays_reintegrated,
  // How many strays have been migrated?
  l_mdc_strays_migrated,

  // How many inode sizes currently being recovered
  l_mdc_num_recovering_processing,
  // How many inodes currently waiting to have size recovered
  l_mdc_num_recovering_enqueued,
  // How many inodes waiting with elevated priority for recovery
  l_mdc_num_recovering_prioritized,
  // How many inodes ever started size recovery
  l_mdc_recovery_started,
  // How many inodes ever completed size recovery
  l_mdc_recovery_completed,

  l_mdss_ireq_enqueue_scrub,
  l_mdss_ireq_exportdir,
  l_mdss_ireq_flush,
  l_mdss_ireq_fragmentdir,
  l_mdss_ireq_fragstats,
  l_mdss_ireq_inodestats,

  l_mdc_last,
};

// flags for path_traverse();
static const int MDS_TRAVERSE_DISCOVER = (1 << 0);
static const int MDS_TRAVERSE_PATH_LOCKED = (1 << 1);
static const int MDS_TRAVERSE_WANT_DENTRY = (1 << 2);
static const int MDS_TRAVERSE_WANT_AUTH = (1 << 3);
static const int MDS_TRAVERSE_RDLOCK_SNAP = (1 << 4);
static const int MDS_TRAVERSE_RDLOCK_SNAP2 = (1 << 5);
static const int MDS_TRAVERSE_WANT_DIRLAYOUT = (1 << 6);
static const int MDS_TRAVERSE_RDLOCK_PATH = (1 << 7);
static const int MDS_TRAVERSE_XLOCK_DENTRY = (1 << 8);
static const int MDS_TRAVERSE_RDLOCK_AUTHLOCK = (1 << 9);
static const int MDS_TRAVERSE_CHECK_LOCKCACHE = (1 << 10);
static const int MDS_TRAVERSE_WANT_INODE = (1 << 11);

// flags for predirty_journal_parents()
static const int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting
static const int PREDIRTY_DIR = 2;     // update parent dir mtime/size
static const int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easier rollback)

class MDCache {
 public:
  typedef std::map<mds_rank_t, ref_t<MCacheExpire>> expiremap;

  using clock = ceph::coarse_mono_clock;
  using time = ceph::coarse_mono_time;

  // -- discover --
  struct discover_info_t {
    discover_info_t() {}
    ~discover_info_t() {
      if (basei)
        basei->put(MDSCacheObject::PIN_DISCOVERBASE);
    }
    void pin_base(CInode *b) {
      basei = b;
      basei->get(MDSCacheObject::PIN_DISCOVERBASE);
    }

    ceph_tid_t tid = 0;
    mds_rank_t mds = -1;
    inodeno_t ino;
    frag_t frag;
    snapid_t snap = CEPH_NOSNAP;
    filepath want_path;
    CInode *basei = nullptr;
    bool want_base_dir = false;
    bool path_locked = false;
  };

  // [reconnect/rejoin caps]
  struct reconnected_cap_info_t {
    reconnected_cap_info_t() {}
    inodeno_t realm_ino = 0;
    snapid_t snap_follows = 0;
    int dirty_caps = 0;
    bool snapflush = 0;
  };

  // -- find_ino_peer --
  struct find_ino_peer_info_t {
    find_ino_peer_info_t() {}
    inodeno_t ino;
    ceph_tid_t tid = 0;
    MDSContext *fin = nullptr;
    bool path_locked = false;
    mds_rank_t hint = MDS_RANK_NONE;
    mds_rank_t checking = MDS_RANK_NONE;
    std::set<mds_rank_t> checked;
  };

  friend class C_MDC_RejoinOpenInoFinish;
  friend class C_MDC_RejoinSessionsOpened;

  friend class Locker;
  friend class Migrator;
  friend class MDBalancer;

  // StrayManager needs to be able to remove_inode() from us
  // when it is done purging
  friend class StrayManager;

  explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_);
  ~MDCache();

  void insert_taken_inos(inodeno_t ino) {
    replay_taken_inos.insert(ino);
  }
  void clear_taken_inos(inodeno_t ino) {
    replay_taken_inos.erase(ino);
  }
  bool test_and_clear_taken_inos(inodeno_t ino) {
    return replay_taken_inos.erase(ino) != 0;
  }
  bool is_taken_inos_empty(void) {
    return replay_taken_inos.empty();
  }

  uint64_t cache_limit_memory(void) {
    return cache_memory_limit;
  }
  double cache_toofull_ratio(void) const {
    double memory_reserve = cache_memory_limit*(1.0-cache_reservation);
    return fmax(0.0, (cache_size()-memory_reserve)/memory_reserve);
  }
  bool cache_toofull(void) const {
    return cache_toofull_ratio() > 0.0;
  }
  uint64_t cache_size(void) const {
    return mempool::get_pool(mempool::mds_co::id).allocated_bytes();
  }
  bool cache_overfull(void) const {
    return cache_size() > cache_memory_limit*cache_health_threshold;
  }
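  /*
   * Worked example for the sizing helpers above (illustrative only; the
   * 4 GiB limit and 0.05 reservation are hypothetical values, not defaults
   * asserted by this header):
   *
   *   cache_memory_limit = 4 GiB, cache_reservation = 0.05
   *   memory_reserve     = 4 GiB * (1.0 - 0.05) = 3.8 GiB
   *   cache_size() = 4.2 GiB -> cache_toofull_ratio() = (4.2 - 3.8) / 3.8 ~ 0.105
   *                             cache_toofull() == true (ratio > 0.0)
   *   cache_overfull() instead compares cache_size() against
   *   cache_memory_limit * cache_health_threshold.
   */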

  void advance_stray();

  unsigned get_ephemeral_dist_frag_bits() const {
    return export_ephemeral_dist_frag_bits;
  }
  bool get_export_ephemeral_distributed_config(void) const {
    return export_ephemeral_distributed_config;
  }

  bool get_export_ephemeral_random_config(void) const {
    return export_ephemeral_random_config;
  }

  bool get_symlink_recovery(void) const {
    return symlink_recovery;
  }

  /**
   * Call this when you know that a CDentry is ready to be passed
   * on to StrayManager (i.e. this is a stray you've just created)
   */
  void notify_stray(CDentry *dn) {
    ceph_assert(dn->get_dir()->get_inode()->is_stray());
    if (dn->state_test(CDentry::STATE_PURGING))
      return;

    stray_manager.eval_stray(dn);
  }

  mds_rank_t hash_into_rank_bucket(inodeno_t ino, frag_t fg=0);

  void maybe_eval_stray(CInode *in, bool delay=false);
  void clear_dirty_bits_for_stray(CInode* diri);

  bool is_readonly() { return readonly; }
  void force_readonly();

  static file_layout_t gen_default_file_layout(const MDSMap &mdsmap);
  static file_layout_t gen_default_log_layout(const MDSMap &mdsmap);

  void register_perfcounters();

  void touch_client_lease(ClientLease *r, int pool, utime_t ttl) {
    client_leases[pool].push_back(&r->item_lease);
    r->ttl = ttl;
  }

  void notify_stray_removed()
  {
    stray_manager.notify_stray_removed();
  }

  void notify_stray_created()
  {
    stray_manager.notify_stray_created();
  }

  void eval_remote(CDentry *dn)
  {
    stray_manager.eval_remote(dn);
  }

  void _send_discover(discover_info_t& dis);
  discover_info_t& _create_discover(mds_rank_t mds) {
    ceph_tid_t t = ++discover_last_tid;
    discover_info_t& d = discovers[t];
    d.tid = t;
    d.mds = mds;
    return d;
  }

  void discover_base_ino(inodeno_t want_ino, MDSContext *onfinish, mds_rank_t from=MDS_RANK_NONE);
  void discover_dir_frag(CInode *base, frag_t approx_fg, MDSContext *onfinish,
                         mds_rank_t from=MDS_RANK_NONE);
  void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSContext *onfinish,
                     bool path_locked=false, mds_rank_t from=MDS_RANK_NONE);
  void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSContext *onfinish,
                     bool path_locked=false);
  void kick_discovers(mds_rank_t who);  // after a failure.

  // adjust subtree auth specification
  //  dir->dir_auth
  //  imports/exports/nested_exports
  //  join/split subtrees as appropriate
  bool is_subtrees() { return !subtrees.empty(); }
  template<typename T>
  void get_subtrees(T& c) {
    if constexpr (std::is_same_v<T, std::vector<CDir*>>)
      c.reserve(c.size() + subtrees.size());
    for (const auto& p : subtrees) {
      c.push_back(p.first);
    }
  }
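  /*
   * Usage sketch for get_subtrees() (hypothetical caller, not part of this
   * header). Any container type with push_back(CDir*) works; capacity is
   * reserved up front only for std::vector<CDir*>:
   *
   *   std::vector<CDir*> roots;
   *   mdcache->get_subtrees(roots);
   */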
  void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool adjust_pop=true);
  void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) {
    adjust_subtree_auth(root, mds_authority_t(a,b));
  }
  void adjust_bounded_subtree_auth(CDir *dir, const std::set<CDir*>& bounds, mds_authority_t auth);
  void adjust_bounded_subtree_auth(CDir *dir, const std::set<CDir*>& bounds, mds_rank_t a) {
    adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
  }
  void adjust_bounded_subtree_auth(CDir *dir, const std::vector<dirfrag_t>& bounds, const mds_authority_t &auth);
  void adjust_bounded_subtree_auth(CDir *dir, const std::vector<dirfrag_t>& bounds, mds_rank_t a) {
    adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
  }
  void map_dirfrag_set(const std::list<dirfrag_t>& dfs, std::set<CDir*>& result);
  void try_subtree_merge(CDir *root);
  void try_subtree_merge_at(CDir *root, std::set<CInode*> *to_eval, bool adjust_pop=true);
  void eval_subtree_root(CInode *diri);
  CDir *get_subtree_root(CDir *dir);
  CDir *get_projected_subtree_root(CDir *dir);
  bool is_leaf_subtree(CDir *dir) {
    ceph_assert(subtrees.count(dir));
    return subtrees[dir].empty();
  }
  void remove_subtree(CDir *dir);
  bool is_subtree(CDir *root) {
    return subtrees.count(root);
  }
  void get_subtree_bounds(CDir *root, std::set<CDir*>& bounds);
  void get_wouldbe_subtree_bounds(CDir *root, std::set<CDir*>& bounds);
  void verify_subtree_bounds(CDir *root, const std::set<CDir*>& bounds);
  void verify_subtree_bounds(CDir *root, const std::list<dirfrag_t>& bounds);

  void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir);
  void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop);

  auto get_auth_subtrees() {
    std::vector<CDir*> c;
    for (auto& p : subtrees) {
      auto& root = p.first;
      if (root->is_auth()) {
        c.push_back(root);
      }
    }
    return c;
  }

  auto get_fullauth_subtrees() {
    std::vector<CDir*> c;
    for (auto& p : subtrees) {
      auto& root = p.first;
      if (root->is_full_dir_auth()) {
        c.push_back(root);
      }
    }
    return c;
  }
  auto num_subtrees_fullauth() const {
    std::size_t n = 0;
    for (auto& p : subtrees) {
      auto& root = p.first;
      if (root->is_full_dir_auth()) {
        ++n;
      }
    }
    return n;
  }

  auto num_subtrees_fullnonauth() const {
    std::size_t n = 0;
    for (auto& p : subtrees) {
      auto& root = p.first;
      if (root->is_full_dir_nonauth()) {
        ++n;
      }
    }
    return n;
  }

  auto num_subtrees() const {
    return subtrees.size();
  }

  int get_num_client_requests();

  MDRequestRef request_start(const cref_t<MClientRequest>& req);
  MDRequestRef request_start_peer(metareqid_t rid, __u32 attempt, const cref_t<Message> &m);
  MDRequestRef request_start_internal(int op);
  bool have_request(metareqid_t rid) {
    return active_requests.count(rid);
  }
  MDRequestRef request_get(metareqid_t rid);
  void request_pin_ref(MDRequestRef& r, CInode *ref, std::vector<CDentry*>& trace);
  void request_finish(MDRequestRef& mdr);
  void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0);
  void dispatch_request(MDRequestRef& mdr);
  void request_drop_foreign_locks(MDRequestRef& mdr);
  void request_drop_non_rdlocks(MDRequestRef& r);
  void request_drop_locks(MDRequestRef& r);
  void request_cleanup(MDRequestRef& r);

  void request_kill(MDRequestRef& r);  // called when session closes

  // journal/snap helpers
  CInode *pick_inode_snap(CInode *in, snapid_t follows);
  CInode *cow_inode(CInode *in, snapid_t last);
  void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn,
                          snapid_t follows=CEPH_NOSNAP,
                          CInode **pcow_inode=0, CDentry::linkage_t *dnl=0);
  void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP);

  void project_rstat_inode_to_frag(const MutationRef& mut,
                                   CInode *cur, CDir *parent, snapid_t first,
                                   int linkunlink, SnapRealm *prealm);
  void _project_rstat_inode_to_frag(const CInode::mempool_inode* inode, snapid_t ofirst, snapid_t last,
                                    CDir *parent, int linkunlink, bool update_inode);
  void project_rstat_frag_to_inode(const nest_info_t& rstat, const nest_info_t& accounted_rstat,
                                   snapid_t ofirst, snapid_t last, CInode *pin, bool cow_head);
  void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1, bool quota_change = false);
  void predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
                                CInode *in, CDir *parent,
                                int flags, int linkunlink=0,
                                snapid_t follows=CEPH_NOSNAP);
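  /*
   * Hedged usage sketch (hypothetical caller; real call sites live in the
   * MDS server/locker code, not in this header). When journalling a change
   * to an inode together with its parent directory, the PREDIRTY_* flags
   * above are typically combined:
   *
   *   predirty_journal_parents(mut, &le->metablob, in, dir,
   *                            PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
   *
   * where linkunlink=1 accounts for one newly linked dentry and -1 for an
   * unlink.
   */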

  // peers
  void add_uncommitted_leader(metareqid_t reqid, LogSegment *ls, std::set<mds_rank_t> &peers, bool safe=false) {
    uncommitted_leaders[reqid].ls = ls;
    uncommitted_leaders[reqid].peers = peers;
    uncommitted_leaders[reqid].safe = safe;
  }
  void wait_for_uncommitted_leader(metareqid_t reqid, MDSContext *c) {
    uncommitted_leaders[reqid].waiters.push_back(c);
  }
  bool have_uncommitted_leader(metareqid_t reqid, mds_rank_t from) {
    auto p = uncommitted_leaders.find(reqid);
    return p != uncommitted_leaders.end() && p->second.peers.count(from) > 0;
  }
  void log_leader_commit(metareqid_t reqid);
  void logged_leader_update(metareqid_t reqid);
  void _logged_leader_commit(metareqid_t reqid);
  void committed_leader_peer(metareqid_t r, mds_rank_t from);
  void finish_committed_leaders();

  void add_uncommitted_peer(metareqid_t reqid, LogSegment*, mds_rank_t, MDPeerUpdate *su=nullptr);
  void wait_for_uncommitted_peer(metareqid_t reqid, MDSContext *c) {
    uncommitted_peers.at(reqid).waiters.push_back(c);
  }
  void finish_uncommitted_peer(metareqid_t reqid, bool assert_exist=true);
  MDPeerUpdate* get_uncommitted_peer(metareqid_t reqid, mds_rank_t leader);
  void _logged_peer_commit(mds_rank_t from, metareqid_t reqid);

  void set_recovery_set(std::set<mds_rank_t>& s);
  void handle_mds_failure(mds_rank_t who);
  void handle_mds_recovery(mds_rank_t who);

  void recalc_auth_bits(bool replay);
  void remove_inode_recursive(CInode *in);

  bool is_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) {
    auto p = ambiguous_peer_updates.find(leader);
    return p != ambiguous_peer_updates.end() && p->second.count(reqid);
  }
  void add_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) {
    ambiguous_peer_updates[leader].insert(reqid);
  }
  void remove_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) {
    auto p = ambiguous_peer_updates.find(leader);
    auto q = p->second.find(reqid);
    ceph_assert(q != p->second.end());
    p->second.erase(q);
    if (p->second.empty())
      ambiguous_peer_updates.erase(p);
  }

  void add_rollback(metareqid_t reqid, mds_rank_t leader) {
    resolve_need_rollback[reqid] = leader;
  }
  void finish_rollback(metareqid_t reqid, MDRequestRef& mdr);

  // ambiguous imports
  void add_ambiguous_import(dirfrag_t base, const std::vector<dirfrag_t>& bounds);
  void add_ambiguous_import(CDir *base, const std::set<CDir*>& bounds);
  bool have_ambiguous_import(dirfrag_t base) {
    return my_ambiguous_imports.count(base);
  }
  void get_ambiguous_import_bounds(dirfrag_t base, std::vector<dirfrag_t>& bounds) {
    ceph_assert(my_ambiguous_imports.count(base));
    bounds = my_ambiguous_imports[base];
  }
  void cancel_ambiguous_import(CDir *);
  void finish_ambiguous_import(dirfrag_t dirino);
  void resolve_start(MDSContext *resolve_done_);
  void send_resolves();
  void maybe_send_pending_resolves() {
    if (resolves_pending)
      send_subtree_resolves();
  }

  void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
                               std::map<dirfrag_t,std::vector<dirfrag_t> >& subtrees);
  ESubtreeMap *create_subtree_map();

  void clean_open_file_lists();
  void dump_openfiles(Formatter *f);
  bool dump_inode(Formatter *f, uint64_t number);

  void rejoin_start(MDSContext *rejoin_done_);
  void rejoin_gather_finish();
  void rejoin_send_rejoins();
  void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
                          int target=-1, bool drop_path=false) {
    auto& ex = cap_exports[ino];
    ex.first = target;
    auto &_icr = ex.second[client] = icr;
    if (drop_path)
      _icr.path.clear();
  }
  void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
                             mds_rank_t frommds=MDS_RANK_NONE, bool drop_path=false) {
    auto &_icr = cap_imports[ino][client][frommds] = icr;
    if (drop_path)
      _icr.path.clear();
  }
  void rejoin_recovered_client(client_t client, const entity_inst_t& inst) {
    rejoin_client_map.emplace(client, inst);
  }
  bool rejoin_has_cap_reconnect(inodeno_t ino) const {
    return cap_imports.count(ino);
  }
  void add_replay_ino_alloc(inodeno_t ino) {
    cap_imports_missing.insert(ino); // avoid opening ino during cache rejoin
  }
  const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
    if (cap_imports.count(ino) &&
        cap_imports[ino].count(client) &&
        cap_imports[ino][client].count(MDS_RANK_NONE)) {
      return &cap_imports[ino][client][MDS_RANK_NONE];
    }
    return NULL;
  }
  void remove_replay_cap_reconnect(inodeno_t ino, client_t client) {
    ceph_assert(cap_imports[ino].size() == 1);
    ceph_assert(cap_imports[ino][client].size() == 1);
    cap_imports.erase(ino);
  }
  void wait_replay_cap_reconnect(inodeno_t ino, MDSContext *c) {
    cap_reconnect_waiters[ino].push_back(c);
  }

  void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) {
    reconnected_cap_info_t &info = reconnected_caps[ino][client];
    info.realm_ino = inodeno_t(icr.capinfo.snaprealm);
    info.snap_follows = icr.snap_follows;
  }
  void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty, bool snapflush) {
    reconnected_cap_info_t &info = reconnected_caps[ino][client];
    info.dirty_caps |= dirty;
    if (snapflush)
      info.snapflush = snapflush;
  }
  void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) {
    reconnected_snaprealms[ino][client] = seq;
  }

  void rejoin_open_ino_finish(inodeno_t ino, int ret);
  void rejoin_prefetch_ino_finish(inodeno_t ino, int ret);
  void rejoin_open_sessions_finish(std::map<client_t,std::pair<Session*,uint64_t> >& session_map);
  bool process_imported_caps();
  void choose_lock_states_and_reconnect_caps();
  void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
                           std::map<client_t,ref_t<MClientSnap>>& splits);
  void prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm, std::map<client_t,ref_t<MClientSnap>>& splits);
  void send_snaps(std::map<client_t,ref_t<MClientSnap>>& splits);
  Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds);
  void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
                                  std::map<client_t,ref_t<MClientSnap>>& updates);
  Capability* try_reconnect_cap(CInode *in, Session *session);
  void export_remaining_imported_caps();

  void do_cap_import(Session *session, CInode *in, Capability *cap,
                     uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
                     int peer, int p_flags);
  void do_delayed_cap_imports();
  void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client,
                              snapid_t snap_follows);
  void open_snaprealms();

  bool open_undef_inodes_dirfrags();
  void opened_undef_inode(CInode *in);
  void opened_undef_dirfrag(CDir *dir) {
    rejoin_undef_dirfrags.erase(dir);
  }

  void reissue_all_caps();

  void start_files_to_recover();
  void do_file_recover();
  void queue_file_recover(CInode *in);
  void _queued_file_recover_cow(CInode *in, MutationRef& mut);

  void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map);

  // debug
  void log_stat();

  // root inode
  CInode *get_root() { return root; }
  CInode *get_myin() { return myin; }

  size_t get_cache_size() { return lru.lru_get_size(); }

  // trimming
  std::pair<bool, uint64_t> trim(uint64_t count=0);

  bool trim_non_auth_subtree(CDir *directory);
  void standby_trim_segment(LogSegment *ls);
  void try_trim_non_auth_subtree(CDir *dir);
  bool can_trim_non_auth_dirfrag(CDir *dir) {
    return my_ambiguous_imports.count((dir)->dirfrag()) == 0 &&
           uncommitted_peer_rename_olddir.count(dir->inode) == 0;
  }

  /**
   * For all unreferenced inodes, dirs, dentries below an inode, compose
   * expiry messages. This is used when giving up all replicas of entities
   * for an MDS peer in the 'stopping' state, such that the peer can
   * empty its cache and finish shutting down.
   *
   * We have to make sure we're only expiring un-referenced items to
   * avoid interfering with ongoing stray-movement (we can't distinguish
   * between the "moving my strays" and "waiting for my cache to empty"
   * phases within 'stopping')
   *
   * @return false if we completed cleanly, true if caller should stop
   * expiring because we hit something with refs.
   */
  bool expire_recursive(CInode *in, expiremap& expiremap);

  void trim_client_leases();
  void check_memory_usage();

  void shutdown_start();
  void shutdown_check();
  bool shutdown_pass();
  bool shutdown();  // clear cache (i.e. at shutdown)
  bool shutdown_export_strays();
  void shutdown_export_stray_finish(inodeno_t ino) {
    if (shutdown_exporting_strays.erase(ino))
      shutdown_export_strays();
  }

  // inode_map
  bool have_inode(vinodeno_t vino) {
    if (vino.snapid == CEPH_NOSNAP)
      return inode_map.count(vino.ino) ? true : false;
    else
      return snap_inode_map.count(vino) ? true : false;
  }
  bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) {
    return have_inode(vinodeno_t(ino, snap));
  }
  CInode* get_inode(vinodeno_t vino) {
    if (vino.snapid == CEPH_NOSNAP) {
      auto p = inode_map.find(vino.ino);
      if (p != inode_map.end())
        return p->second;
    } else {
      auto p = snap_inode_map.find(vino);
      if (p != snap_inode_map.end())
        return p->second;
    }
    return NULL;
  }
  CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) {
    return get_inode(vinodeno_t(ino, s));
  }
  CInode* lookup_snap_inode(vinodeno_t vino) {
    auto p = snap_inode_map.lower_bound(vino);
    if (p != snap_inode_map.end() &&
        p->second->ino() == vino.ino && p->second->first <= vino.snapid)
      return p->second;
    return NULL;
  }

  CDir* get_dirfrag(dirfrag_t df) {
    CInode *in = get_inode(df.ino);
    if (!in)
      return NULL;
    return in->get_dirfrag(df.frag);
  }
  CDir* get_dirfrag(inodeno_t ino, std::string_view dn) {
    CInode *in = get_inode(ino);
    if (!in)
      return NULL;
    frag_t fg = in->pick_dirfrag(dn);
    return in->get_dirfrag(fg);
  }
  CDir* get_force_dirfrag(dirfrag_t df, bool replay) {
    CInode *diri = get_inode(df.ino);
    if (!diri)
      return NULL;
    CDir *dir = force_dir_fragment(diri, df.frag, replay);
    if (!dir)
      dir = diri->get_dirfrag(df.frag);
    return dir;
  }

  MDSCacheObject *get_object(const MDSCacheObjectInfo &info);

  void add_inode(CInode *in);

  void remove_inode(CInode *in);

  void touch_dentry(CDentry *dn) {
    if (dn->state_test(CDentry::STATE_BOTTOMLRU)) {
      bottom_lru.lru_midtouch(dn);
    } else {
      if (dn->is_auth())
        lru.lru_touch(dn);
      else
        lru.lru_midtouch(dn);
    }
  }
  void touch_dentry_bottom(CDentry *dn) {
    if (dn->state_test(CDentry::STATE_BOTTOMLRU))
      return;
    lru.lru_bottouch(dn);
  }

  // truncate
  void truncate_inode(CInode *in, LogSegment *ls);
  void _truncate_inode(CInode *in, LogSegment *ls);
  void truncate_inode_finish(CInode *in, LogSegment *ls);
  void truncate_inode_write_finish(CInode *in, LogSegment *ls,
                                   uint32_t block_size);
  void truncate_inode_logged(CInode *in, MutationRef& mut);

  void add_recovered_truncate(CInode *in, LogSegment *ls);
  void remove_recovered_truncate(CInode *in, LogSegment *ls);
  void start_recovered_truncates();

  // purge unsafe inodes
  void start_purge_inodes();
  void purge_inodes(const interval_set<inodeno_t>& i, LogSegment *ls);

  CDir *get_auth_container(CDir *in);
  CDir *get_export_container(CDir *dir);
  void find_nested_exports(CDir *dir, std::set<CDir*>& s);
  void find_nested_exports_under(CDir *import, CDir *dir, std::set<CDir*>& s);

  void init_layouts();
  void create_unlinked_system_inode(CInode *in, inodeno_t ino,
                                    int mode) const;
  CInode *create_system_inode(inodeno_t ino, int mode);
  CInode *create_root_inode();

  void create_empty_hierarchy(MDSGather *gather);
  void create_mydir_hierarchy(MDSGather *gather);

  bool is_open() { return open; }
  void wait_for_open(MDSContext *c) {
    waiting_for_open.push_back(c);
  }

  void open_root_inode(MDSContext *c);
  void open_root();
  void open_mydir_inode(MDSContext *c);
  void open_mydir_frag(MDSContext *c);
  void populate_mydir();

  void _create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin);
  void _create_system_file_finish(MutationRef& mut, CDentry *dn,
                                  version_t dpv, MDSContext *fin);

  void open_foreign_mdsdir(inodeno_t ino, MDSContext *c);
  CDir *get_stray_dir(CInode *in);

  /**
   * Find the given dentry (and whether it exists or not), its ancestors,
   * and get them all into memory and usable on this MDS. This function
   * makes a best-effort attempt to load everything; if it needs to
   * go away and do something then it will put the request on a waitlist.
   * It prefers the mdr, then the req, then the fin. (At least one of these
   * must be non-null.)
   *
   * @param mdr The MDRequest associated with the path. Can be null.
   * @param cf A MDSContextFactory for waiter building.
   * @param path The path to traverse to.
   *
   * @param flags Specifies different lookup behaviors.
   * By default, path_traverse() forwards the request to the auth MDS if that
   * is appropriate (i.e., if it doesn't know the contents of a directory).
   * MDS_TRAVERSE_DISCOVER: Instead of forwarding the request, path_traverse()
   * attempts to look up the path from a different MDS (and bring the items
   * into its cache as replicas).
   * MDS_TRAVERSE_PATH_LOCKED: path_traverse() will proceed when an xlocked
   * dentry is encountered.
   * MDS_TRAVERSE_WANT_DENTRY: Caller wants the tail dentry. Add a null dentry
   * if the tail dentry does not exist. Return 0 even if the tail dentry is null.
   * MDS_TRAVERSE_WANT_INODE: Caller only wants the target inode if it exists,
   * or wants the tail dentry if the target inode does not exist and
   * MDS_TRAVERSE_WANT_DENTRY is also set.
   * MDS_TRAVERSE_WANT_AUTH: Always forward the request to the auth MDS of the
   * target inode, or the auth MDS of the tail dentry (if
   * MDS_TRAVERSE_WANT_DENTRY is set).
   * MDS_TRAVERSE_XLOCK_DENTRY: Caller wants to xlock the tail dentry if
   * MDS_TRAVERSE_WANT_INODE is not set, or if MDS_TRAVERSE_WANT_INODE is set
   * but the target inode does not exist.
   *
   * @param pdnvec Data return parameter -- on success, contains a
   * vector of dentries. On failure, is either empty or contains the
   * full trace of traversable dentries.
   * @param pin Data return parameter -- if successful, points to the inode
   * associated with filepath. If unsuccessful, is null.
   *
   * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
   * If it returns 1, the requester associated with this call has been placed
   * on the appropriate waitlist, and it should unwind itself and back out.
   * If it returns 2 the request has been forwarded, and again the requester
   * should unwind itself and back out.
   */
  int path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
                    const filepath& path, int flags,
                    std::vector<CDentry*> *pdnvec, CInode **pin=nullptr);
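  /*
   * Hedged usage sketch of the return-code protocol documented above
   * (hypothetical caller; real call sites live in the MDS server code).
   * On 1 the request has been queued for retry and the caller unwinds; on 2
   * it has been forwarded; only on 0 are pdnvec/pin meaningful.
   *
   *   CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
   *   std::vector<CDentry*> trace;
   *   CInode *in = nullptr;
   *   int r = mdcache->path_traverse(mdr, cf, refpath,
   *                                  MDS_TRAVERSE_WANT_AUTH, &trace, &in);
   *   if (r > 0)
   *     return;   // waiting (1) or forwarded (2); retry arrives via cf
   *   if (r < 0) {
   *     // handle -errno (e.g. reply to the client with the error)
   *   }
   */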

  int maybe_request_forward_to_auth(MDRequestRef& mdr, MDSContextFactory& cf,
                                    MDSCacheObject *p);

  CInode *cache_traverse(const filepath& path);

  void open_remote_dirfrag(CInode *diri, frag_t fg, MDSContext *fin);
  CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false);

  bool parallel_fetch(std::map<inodeno_t,filepath>& pathmap, std::set<inodeno_t>& missing);
  bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
                                   std::set<CDir*>& fetch_queue, std::set<inodeno_t>& missing,
                                   C_GatherBuilder &gather_bld);

  void open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin,
                          bool want_xlocked=false);
  void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
                                  bool want_xlocked, int r);

  void make_trace(std::vector<CDentry*>& trace, CInode *in);

  void open_ino(inodeno_t ino, int64_t pool, MDSContext *fin,
                bool want_replica=true, bool want_xlocked=false,
                std::vector<inode_backpointer_t> *ancestors_hint=nullptr,
                mds_rank_t auth_hint=MDS_RANK_NONE);
  void open_ino_batch_start();
  void open_ino_batch_submit();
  void kick_open_ino_peers(mds_rank_t who);

  void find_ino_peers(inodeno_t ino, MDSContext *c,
                      mds_rank_t hint=MDS_RANK_NONE, bool path_locked=false);
  void _do_find_ino_peer(find_ino_peer_info_t& fip);
  void handle_find_ino(const cref_t<MMDSFindIno> &m);
  void handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m);
  void kick_find_ino_peers(mds_rank_t who);

  SnapRealm *get_global_snaprealm() const { return global_snaprealm; }
  void create_global_snaprealm();
  void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients=true);
  void send_snap_update(CInode *in, version_t stid, int snap_op);
  void handle_snap_update(const cref_t<MMDSSnapUpdate> &m);
  void notify_global_snaprealm_update(int snap_op);

  // -- stray --
  void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
  uint64_t get_num_strays() const { return stray_manager.get_num_strays(); }

  // == messages ==
  void dispatch(const cref_t<Message> &m);

  void encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl);
  void encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl);
  void encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
                            uint64_t features);

  void decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from, MDSContext::vec& finished);
  void decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished);
  void decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished);

  void encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl);
  void decode_replica_stray(CDentry *&straydn, CInode **in, const bufferlist &bl, mds_rank_t from);

  // -- namespace --
  void encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl);
  void decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p);
  void send_dentry_link(CDentry *dn, MDRequestRef& mdr);
  void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr);

  void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSContext *c) {
    uncommitted_fragments.at(dirfrag).waiters.push_back(c);
  }
  bool is_any_uncommitted_fragment() const {
    return !uncommitted_fragments.empty();
  }
  void wait_for_uncommitted_fragments(MDSContext* finisher);
  void rollback_uncommitted_fragments();

  void split_dir(CDir *dir, int byn);
  void merge_dir(CInode *diri, frag_t fg);

  void find_stale_fragment_freeze();
  void fragment_freeze_inc_num_waiters(CDir *dir);
  bool fragment_are_all_frozen(CDir *dir);
  int get_num_fragmenting_dirs() { return fragments.size(); }

  // -- updates --
  //int send_inode_updates(CInode *in);
  //void handle_inode_update(MInodeUpdate *m);

  int send_dir_updates(CDir *in, bool bcast=false);
  void handle_dir_update(const cref_t<MDirUpdate> &m);

  // -- cache expiration --
  void handle_cache_expire(const cref_t<MCacheExpire> &m);
  void process_delayed_expire(CDir *dir);
  void discard_delayed_expire(CDir *dir);

  // -- mdsmap --
  void handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap);

  int dump_cache() { return dump_cache({}, nullptr, 0); }
  int dump_cache(std::string_view filename, double timeout);
  int dump_cache(Formatter *f, double timeout);
  void dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f);

  void cache_status(Formatter *f);

  void dump_resolve_status(Formatter *f) const;
  void dump_rejoin_status(Formatter *f) const;

  // == crap fns ==
  void show_cache();
  void show_subtrees(int dbl=10, bool force_print=false);

  CInode *hack_pick_random_inode() {
    ceph_assert(!inode_map.empty());
    int n = rand() % inode_map.size();
    auto p = inode_map.begin();
    while (n--) ++p;
    return p->second;
  }

  void flush_dentry(std::string_view path, Context *fin);
  /**
   * Create and start an OP_ENQUEUE_SCRUB
   */
  void enqueue_scrub(std::string_view path, std::string_view tag,
                     bool force, bool recursive, bool repair,
                     bool scrub_mdsdir, Formatter *f, Context *fin);
  void repair_inode_stats(CInode *diri);
  void repair_dirfrag_stats(CDir *dir);
  void rdlock_dirfrags_stats(CInode *diri, MDSInternalContext *fin);

  // my leader
  MDSRank *mds;

  // -- my cache --
  LRU lru;         // dentry lru for expiring items from cache
  LRU bottom_lru;  // dentries that should be trimmed ASAP

  DecayRate decayrate;

  int num_shadow_inodes = 0;

  int num_inodes_with_caps = 0;

  unsigned max_dir_commit_size;

  file_layout_t default_file_layout;
  file_layout_t default_log_layout;

  // -- client leases --
  static constexpr std::size_t client_lease_pools = 3;
  std::array<float, client_lease_pools> client_lease_durations{5.0, 30.0, 300.0};

  // -- client caps --
  uint64_t last_cap_id = 0;

  std::map<ceph_tid_t, discover_info_t> discovers;
  ceph_tid_t discover_last_tid = 0;

  // waiters
  std::map<int, std::map<inodeno_t, MDSContext::vec > > waiting_for_base_ino;

  std::map<inodeno_t,std::map<client_t, reconnected_cap_info_t> > reconnected_caps;  // inode -> client -> snap_follows,realmino
  std::map<inodeno_t,std::map<client_t, snapid_t> > reconnected_snaprealms;  // realmino -> client -> realmseq

  // realm inodes
  std::set<CInode*> rejoin_pending_snaprealms;
  // cap imports.  delayed snap parent opens.
  std::map<client_t,std::set<CInode*> > delayed_imported_caps;

  // subsystems
  std::unique_ptr<Migrator> migrator;

  bool did_shutdown_log_cap = false;

  std::map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer;
  ceph_tid_t find_ino_peer_last_tid = 0;

  // delayed cache expire
  std::map<CDir*, expiremap> delayed_expire;  // subtree root -> expire msg

  /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
  std::set<CInode *> export_pin_queue;
  std::set<CInode *> export_pin_delayed_queue;
  std::set<CInode *> export_ephemeral_pins;

  OpenFileTable open_file_table;

  double export_ephemeral_random_max = 0.0;

 protected:
  // track leader requests whose peers haven't acknowledged commit
  struct uleader {
    uleader() {}
    std::set<mds_rank_t> peers;
    LogSegment *ls = nullptr;
    MDSContext::vec waiters;
    bool safe = false;
    bool committing = false;
    bool recovering = false;
  };

  struct upeer {
    upeer() {}
    mds_rank_t leader;
    LogSegment *ls = nullptr;
    MDPeerUpdate *su = nullptr;
    MDSContext::vec waiters;
  };

  struct open_ino_info_t {
    open_ino_info_t() {}
    std::vector<inode_backpointer_t> ancestors;
    std::set<mds_rank_t> checked;
    mds_rank_t checking = MDS_RANK_NONE;
    mds_rank_t auth_hint = MDS_RANK_NONE;
    bool check_peers = true;
    bool fetch_backtrace = true;
    bool discover = false;
    bool want_replica = false;
    bool want_xlocked = false;
    version_t tid = 0;
    int64_t pool = -1;
    int last_err = 0;
    MDSContext::vec waiters;
  };

  ceph_tid_t open_ino_last_tid = 0;
  std::map<inodeno_t,open_ino_info_t> opening_inodes;

  bool open_ino_batch = false;
  std::map<CDir*, std::pair<std::vector<std::string>, MDSContext::vec> > open_ino_batched_fetch;

  friend struct C_MDC_OpenInoTraverseDir;
  friend struct C_MDC_OpenInoParentOpened;
  friend struct C_MDC_RetryScanStray;

  friend class C_IO_MDC_OpenInoBacktraceFetched;
  friend class C_MDC_Join;
  friend class C_MDC_RespondInternalRequest;

  friend class EPeerUpdate;
  friend class ECommitted;

  void set_readonly() { readonly = true; }

  void handle_resolve(const cref_t<MMDSResolve> &m);
  void handle_resolve_ack(const cref_t<MMDSResolveAck> &m);
  void process_delayed_resolve();
  void discard_delayed_resolve(mds_rank_t who);
  void maybe_resolve_finish();
  void disambiguate_my_imports();
  void disambiguate_other_imports();
  void trim_unlinked_inodes();

  void send_peer_resolves();
  void send_subtree_resolves();
  void maybe_finish_peer_resolve();

  void rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin);
  void handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m);
  void handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &m);
  CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last);
  CDir* rejoin_invent_dirfrag(dirfrag_t df);
  void handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &m);
  void rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack,
                                      std::set<vinodeno_t>& acked_inodes,
                                      std::set<SimpleLock *>& gather_locks);
  void handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &m);
  void rejoin_send_acks();
  void rejoin_trim_undef_inodes();
  void maybe_send_pending_rejoins() {
    if (rejoins_pending)
      rejoin_send_rejoins();
  }

  void touch_inode(CInode *in) {
    if (in->get_parent_dn())
      touch_dentry(in->get_projected_parent_dn());
  }

  void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin,
                            std::set<SimpleLock *>& gather_locks);
  void dentry_remove_replica(CDentry *dn, mds_rank_t rep, std::set<SimpleLock *>& gather_locks);

  void rename_file(CDentry *srcdn, CDentry *destdn);

  void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err);
  void _open_ino_parent_opened(inodeno_t ino, int ret);
  void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err);
  void _open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, bool parent,
                           CDir *dir, std::string_view dname);
  int open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m,
                            const std::vector<inode_backpointer_t>& ancestors,
                            bool discover, bool want_xlocked, mds_rank_t *hint);
  void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err);
  void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err);
  void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info);
  void handle_open_ino(const cref_t<MMDSOpenIno> &m, int err=0);
  void handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m);

  void scan_stray_dir(dirfrag_t next=dirfrag_t());
  // -- replicas --
  void handle_discover(const cref_t<MDiscover> &dis);
  void handle_discover_reply(const cref_t<MDiscoverReply> &m);
  void handle_dentry_link(const cref_t<MDentryLink> &m);
  void handle_dentry_unlink(const cref_t<MDentryUnlink> &m);

  int dump_cache(std::string_view fn, Formatter *f, double timeout);

  void flush_dentry_work(MDRequestRef& mdr);
  /**
   * Resolve path to a dentry and pass it onto the ScrubStack.
   *
   * TODO: return enough information to the original mdr formatter
   * and completion that they can subsequently check the progress of
   * this scrub (we won't block them on a whole scrub as it can take a very
   * long time)
   */
  void enqueue_scrub_work(MDRequestRef& mdr);
  void repair_inode_stats_work(MDRequestRef& mdr);
  void repair_dirfrag_stats_work(MDRequestRef& mdr);
  void rdlock_dirfrags_stats_work(MDRequestRef& mdr);

  ceph::unordered_map<inodeno_t,CInode*> inode_map;  // map of head inodes by ino
  std::map<vinodeno_t, CInode*> snap_inode_map;      // map of snap inodes by ino
  CInode *root = nullptr;  // root inode
  CInode *myin = nullptr;  // .ceph/mds%d dir

  bool readonly = false;

  int stray_index = 0;
  int stray_fragmenting_index = -1;

  std::set<CInode*> base_inodes;

  std::unique_ptr<PerfCounters> logger;

  Filer filer;
  std::array<xlist<ClientLease*>, client_lease_pools> client_leases{};

  /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
  std::map<CDir*,std::set<CDir*> > subtrees;
  std::map<CInode*,std::list<std::pair<CDir*,CDir*> > > projected_subtree_renames;  // renamed ino -> target dir

  // -- requests --
  ceph::unordered_map<metareqid_t, MDRequestRef> active_requests;

  // -- recovery --
  std::set<mds_rank_t> recovery_set;

  // [resolve]
  // from EImportStart w/o EImportFinish during journal replay
  std::map<dirfrag_t, std::vector<dirfrag_t> > my_ambiguous_imports;
  // from MMDSResolves
  std::map<mds_rank_t, std::map<dirfrag_t, std::vector<dirfrag_t> > > other_ambiguous_imports;

  std::map<CInode*, int> uncommitted_peer_rename_olddir;  // peer: preserve the non-auth dir until seeing commit.
  std::map<CInode*, int> uncommitted_peer_unlink;         // peer: preserve the unlinked inode until seeing commit.

  std::map<metareqid_t, uleader> uncommitted_leaders;  // leader: req -> peer set
  std::map<metareqid_t, upeer> uncommitted_peers;      // peer: preserve the peer req until seeing commit.

  std::set<metareqid_t> pending_leaders;
  std::map<int, std::set<metareqid_t> > ambiguous_peer_updates;

  bool resolves_pending = false;
  std::set<mds_rank_t> resolve_gather;      // nodes i need resolves from
  std::set<mds_rank_t> resolve_ack_gather;  // nodes i need a resolve_ack from
  std::set<version_t> resolve_snapclient_commits;
  std::map<metareqid_t, mds_rank_t> resolve_need_rollback;  // rollbacks i'm writing to the journal
  std::map<mds_rank_t, cref_t<MMDSResolve>> delayed_resolve;

  // [rejoin]
  bool rejoins_pending = false;
  std::set<mds_rank_t> rejoin_gather;      // nodes from whom i need a rejoin
  std::set<mds_rank_t> rejoin_sent;        // nodes i sent a rejoin to
  std::set<mds_rank_t> rejoin_ack_sent;    // nodes i sent a rejoin to
  std::set<mds_rank_t> rejoin_ack_gather;  // nodes from whom i need a rejoin ack
  std::map<mds_rank_t,std::map<inodeno_t,std::map<client_t,Capability::Import> > > rejoin_imported_caps;
  std::map<inodeno_t,std::pair<mds_rank_t,std::map<client_t,Capability::Export> > > rejoin_peer_exports;

  std::map<client_t,entity_inst_t> rejoin_client_map;
  std::map<client_t,client_metadata_t> rejoin_client_metadata_map;
  std::map<client_t,std::pair<Session*,uint64_t> > rejoin_session_map;

  std::map<inodeno_t,std::pair<mds_rank_t,std::map<client_t,cap_reconnect_t> > > cap_exports;  // ino -> target, client -> capex

  std::map<inodeno_t,std::map<client_t,std::map<mds_rank_t,cap_reconnect_t> > > cap_imports;  // ino -> client -> frommds -> capex
  std::set<inodeno_t> cap_imports_missing;
  std::map<inodeno_t, MDSContext::vec > cap_reconnect_waiters;
  int cap_imports_num_opening = 0;

  std::set<CInode*> rejoin_undef_inodes;
  std::set<CInode*> rejoin_potential_updated_scatterlocks;
  std::set<CDir*> rejoin_undef_dirfrags;
  std::map<mds_rank_t, std::set<CInode*> > rejoin_unlinked_inodes;

  std::vector<CInode*> rejoin_recover_q, rejoin_check_q;
  std::list<SimpleLock*> rejoin_eval_locks;
  MDSContext::vec rejoin_waiters;

  std::unique_ptr<MDSContext> rejoin_done;
  std::unique_ptr<MDSContext> resolve_done;

  StrayManager stray_manager;

 private:
  std::set<inodeno_t> replay_taken_inos;  // inos that have already been taken during replay

  // -- fragmenting --
  struct ufragment {
    ufragment() {}
    int bits = 0;
    bool committed = false;
    LogSegment *ls = nullptr;
    MDSContext::vec waiters;
    frag_vec_t old_frags;
    bufferlist rollback;
  };

  struct fragment_info_t {
    fragment_info_t() {}
    bool is_fragmenting() { return !resultfrags.empty(); }
    uint64_t get_tid() { return mdr ? mdr->reqid.tid : 0; }
    int bits;
    std::vector<CDir*> dirs;
    std::vector<CDir*> resultfrags;
    MDRequestRef mdr;
    std::set<mds_rank_t> notify_ack_waiting;
    bool finishing = false;

    // for deadlock detection
    bool all_frozen = false;
    utime_t last_cum_auth_pins_change;
    int last_cum_auth_pins = 0;
    int num_remote_waiters = 0;  // number of remote authpin waiters
  };

  typedef std::map<dirfrag_t,fragment_info_t>::iterator fragment_info_iterator;

  friend class EFragment;
  friend class C_MDC_FragmentFrozen;
  friend class C_MDC_FragmentMarking;
  friend class C_MDC_FragmentPrep;
  friend class C_MDC_FragmentStore;
  friend class C_MDC_FragmentCommit;
  friend class C_MDC_FragmentRollback;
  friend class C_IO_MDC_FragmentPurgeOld;

  // -- subtrees --
  static const unsigned int SUBTREES_COUNT_THRESHOLD = 5;
  static const unsigned int SUBTREES_DEPTH_THRESHOLD = 5;

  CInode *get_stray() {
    return strays[stray_index];
  }

  void identify_files_to_recover();

  std::pair<bool, uint64_t> trim_lru(uint64_t count, expiremap& expiremap);
  bool trim_dentry(CDentry *dn, expiremap& expiremap);
  void trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap);
  bool trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap&);
  void send_expire_messages(expiremap& expiremap);
  void trim_non_auth();  // trim out trimmable non-auth items

  void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
                            std::vector<CDir*>* frags, MDSContext::vec& waiters, bool replay);
  void adjust_dir_fragments(CInode *diri,
                            const std::vector<CDir*>& srcfrags,
                            frag_t basefrag, int bits,
                            std::vector<CDir*>* resultfrags,
                            MDSContext::vec& waiters,
                            bool replay);
  CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true);
  void get_force_dirfrag_bound_set(const std::vector<dirfrag_t>& dfs, std::set<CDir*>& bounds);

  bool can_fragment(CInode *diri, const std::vector<CDir*>& dirs);
  void fragment_freeze_dirs(const std::vector<CDir*>& dirs);
  void fragment_mark_and_complete(MDRequestRef& mdr);
  void fragment_frozen(MDRequestRef& mdr, int r);
  void fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs);
  void fragment_drop_locks(fragment_info_t &info);
  void fragment_maybe_finish(const fragment_info_iterator& it);
  void dispatch_fragment_dir(MDRequestRef& mdr);
  void _fragment_logged(MDRequestRef& mdr);
  void _fragment_stored(MDRequestRef& mdr);
  void _fragment_committed(dirfrag_t f, const MDRequestRef& mdr);
  void _fragment_old_purged(dirfrag_t f, int bits, const MDRequestRef& mdr);

  void handle_fragment_notify(const cref_t<MMDSFragmentNotify> &m);
  void handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &m);

  void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frag,
                                LogSegment *ls, bufferlist *rollback=NULL);
  void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
  void rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags);

  void upkeep_main(void);

  uint64_t cache_memory_limit;
  double cache_reservation;
  double cache_health_threshold;
  std::array<CInode *, NUM_STRAY> strays{};  // my stray dir

  bool export_ephemeral_distributed_config;
  bool export_ephemeral_random_config;
  unsigned export_ephemeral_dist_frag_bits;

  // Stores the symlink target on the file object's head
  bool symlink_recovery;

  // File size recovery
  RecoveryQueue recovery_queue;

  // shutdown
  std::set<inodeno_t> shutdown_exporting_strays;
  std::pair<dirfrag_t, std::string> shutdown_export_next;

  bool opening_root = false, open = false;
  MDSContext::vec waiting_for_open;

  // -- snaprealms --
  SnapRealm *global_snaprealm = nullptr;

  std::map<dirfrag_t, ufragment> uncommitted_fragments;

  std::map<dirfrag_t,fragment_info_t> fragments;

  DecayCounter trim_counter;

  std::thread upkeeper;
  ceph::mutex upkeep_mutex = ceph::make_mutex("MDCache::upkeep_mutex");
  ceph::condition_variable upkeep_cvar;
  time upkeep_last_trim = time::min();
  time upkeep_last_release = time::min();
  std::atomic<bool> upkeep_trim_shutdown{false};
};

class C_MDS_RetryRequest : public MDSInternalContext {
  MDCache *cache;
  MDRequestRef mdr;
 public:
  C_MDS_RetryRequest(MDCache *c, MDRequestRef& r) :
    MDSInternalContext(c->mds), cache(c), mdr(r) {}
  void finish(int r) override;
};

class CF_MDS_RetryRequestFactory : public MDSContextFactory {
public:
  CF_MDS_RetryRequestFactory(MDCache *cache, MDRequestRef &mdr, bool dl) :
    mdcache(cache), mdr(mdr), drop_locks(dl) {}
  MDSContext *build() override;
private:
  MDCache *mdcache;
  MDRequestRef mdr;
  bool drop_locks;
};

/**
 * Only for contexts called back from an I/O completion
 *
 * Note: duplication of members wrt MDCacheContext, because
 * it's the lesser of two evils compared with introducing
 * yet another piece of (multiple) inheritance.
 */
class MDCacheIOContext : public virtual MDSIOContextBase {
protected:
  MDCache *mdcache;
  MDSRank *get_mds() override
  {
    ceph_assert(mdcache != NULL);
    return mdcache->mds;
  }
public:
  explicit MDCacheIOContext(MDCache *mdc_, bool track=true) :
    MDSIOContextBase(track), mdcache(mdc_) {}
};

#endif