// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef CEPH_MDCACHE_H
#define CEPH_MDCACHE_H

#include <atomic>
#include <string_view>
#include <thread>

#include "common/DecayCounter.h"
#include "include/common_fwd.h"

#include "include/types.h"
#include "include/filepath.h"
#include "include/elist.h"

#include "messages/MCacheExpire.h"
#include "messages/MClientQuota.h"
#include "messages/MClientRequest.h"
#include "messages/MClientSnap.h"
#include "messages/MDentryLink.h"
#include "messages/MDentryUnlink.h"
#include "messages/MDirUpdate.h"
#include "messages/MDiscover.h"
#include "messages/MDiscoverReply.h"
#include "messages/MGatherCaps.h"
#include "messages/MGenericMessage.h"
#include "messages/MInodeFileCaps.h"
#include "messages/MLock.h"
#include "messages/MMDSCacheRejoin.h"
#include "messages/MMDSFindIno.h"
#include "messages/MMDSFindInoReply.h"
#include "messages/MMDSFragmentNotify.h"
#include "messages/MMDSFragmentNotifyAck.h"
#include "messages/MMDSOpenIno.h"
#include "messages/MMDSOpenInoReply.h"
#include "messages/MMDSResolve.h"
#include "messages/MMDSResolveAck.h"
#include "messages/MMDSSlaveRequest.h"
#include "messages/MMDSSnapUpdate.h"

#include "osdc/Filer.h"
#include "CInode.h"
#include "CDentry.h"
#include "CDir.h"
#include "include/Context.h"
#include "events/EMetaBlob.h"
#include "RecoveryQueue.h"
#include "StrayManager.h"
#include "OpenFileTable.h"
#include "MDSContext.h"
#include "MDSMap.h"
#include "Mutation.h"

class MDSRank;
class Session;
class Migrator;

class Session;

class ESubtreeMap;

enum {
  l_mdc_first = 3000,
  // How many inodes currently in stray dentries
  l_mdc_num_strays,
  // How many stray dentries are currently delayed for purge due to refs
  l_mdc_num_strays_delayed,
  // How many stray dentries are currently being enqueued for purge
  l_mdc_num_strays_enqueuing,

  // How many dentries have ever been added to stray dir
  l_mdc_strays_created,
  // How many dentries have been passed on to PurgeQueue
  l_mdc_strays_enqueued,
  // How many strays have been reintegrated?
  l_mdc_strays_reintegrated,
  // How many strays have been migrated?
  l_mdc_strays_migrated,

  // How many inode sizes currently being recovered
  l_mdc_num_recovering_processing,
  // How many inodes currently waiting to have size recovered
  l_mdc_num_recovering_enqueued,
  // How many inodes waiting with elevated priority for recovery
  l_mdc_num_recovering_prioritized,
  // How many inodes ever started size recovery
  l_mdc_recovery_started,
  // How many inodes ever completed size recovery
  l_mdc_recovery_completed,

  l_mdss_ireq_enqueue_scrub,
  l_mdss_ireq_exportdir,
  l_mdss_ireq_flush,
  l_mdss_ireq_fragmentdir,
  l_mdss_ireq_fragstats,
  l_mdss_ireq_inodestats,

  l_mdc_last,
};

// flags for path_traverse();
static const int MDS_TRAVERSE_DISCOVER = (1 << 0);
static const int MDS_TRAVERSE_PATH_LOCKED = (1 << 1);
static const int MDS_TRAVERSE_WANT_DENTRY = (1 << 2);
static const int MDS_TRAVERSE_WANT_AUTH = (1 << 3);
static const int MDS_TRAVERSE_RDLOCK_SNAP = (1 << 4);
static const int MDS_TRAVERSE_RDLOCK_SNAP2 = (1 << 5);
static const int MDS_TRAVERSE_WANT_DIRLAYOUT = (1 << 6);
static const int MDS_TRAVERSE_RDLOCK_PATH = (1 << 7);
static const int MDS_TRAVERSE_XLOCK_DENTRY = (1 << 8);
static const int MDS_TRAVERSE_RDLOCK_AUTHLOCK = (1 << 9);
static const int MDS_TRAVERSE_CHECK_LOCKCACHE = (1 << 10);

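// Illustrative note: these values form a bitmask and are meant to be OR'd
// together in the flags argument of path_traverse() (declared further down in
// MDCache). For example, a lookup that should fetch replicas from other ranks
// and wants the tail dentry even if it does not yet exist might pass
// something like
//   MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_WANT_DENTRY
// See the call sketch next to path_traverse() below.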

// flags for predirty_journal_parents()
static const int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting
static const int PREDIRTY_DIR = 2;     // update parent dir mtime/size
static const int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easier rollback)

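// Illustrative note (usage hint only, not an exhaustive description): callers
// usually OR these together, e.g. journalling a primary-dentry change that
// should also bump the parent directory's mtime/size might pass
// PREDIRTY_PRIMARY|PREDIRTY_DIR to predirty_journal_parents(), while
// PREDIRTY_SHALLOW restricts the walk to the immediate parent.
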
class MDCache {
 public:
  typedef std::map<mds_rank_t, ref_t<MCacheExpire>> expiremap;

  using clock = ceph::coarse_mono_clock;
  using time = ceph::coarse_mono_time;

9f95a23c
TL
138 // -- discover --
139 struct discover_info_t {
140 discover_info_t() {}
141 ~discover_info_t() {
142 if (basei)
143 basei->put(MDSCacheObject::PIN_DISCOVERBASE);
144 }
145 void pin_base(CInode *b) {
146 basei = b;
147 basei->get(MDSCacheObject::PIN_DISCOVERBASE);
148 }
7c673cae 149
9f95a23c
TL
150 ceph_tid_t tid = 0;
151 mds_rank_t mds = -1;
152 inodeno_t ino;
153 frag_t frag;
154 snapid_t snap = CEPH_NOSNAP;
155 filepath want_path;
156 CInode *basei = nullptr;
157 bool want_base_dir = false;
158 bool path_locked = false;
159 };
7c673cae 160
9f95a23c
TL
161 // [reconnect/rejoin caps]
162 struct reconnected_cap_info_t {
163 reconnected_cap_info_t() {}
164 inodeno_t realm_ino = 0;
165 snapid_t snap_follows = 0;
166 int dirty_caps = 0;
167 bool snapflush = 0;
168 };
7c673cae 169
9f95a23c
TL
170 // -- find_ino_peer --
171 struct find_ino_peer_info_t {
172 find_ino_peer_info_t() {}
173 inodeno_t ino;
174 ceph_tid_t tid = 0;
175 MDSContext *fin = nullptr;
176 bool path_locked = false;
177 mds_rank_t hint = MDS_RANK_NONE;
178 mds_rank_t checking = MDS_RANK_NONE;
179 set<mds_rank_t> checked;
180 };
7c673cae 181
9f95a23c
TL
182 friend class C_MDC_RejoinOpenInoFinish;
183 friend class C_MDC_RejoinSessionsOpened;
7c673cae 184
9f95a23c
TL
185 friend class Locker;
186 friend class Migrator;
187 friend class MDBalancer;
7c673cae 188
9f95a23c
TL
189 // StrayManager needs to be able to remove_inode() from us
190 // when it is done purging
191 friend class StrayManager;
7c673cae 192
9f95a23c
TL
193 explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_);
194 ~MDCache();
91327a77 195
  uint64_t cache_limit_memory(void) {
    return cache_memory_limit;
  }
  double cache_toofull_ratio(void) const {
    double memory_reserve = cache_memory_limit*(1.0-cache_reservation);
    return fmax(0.0, (cache_size()-memory_reserve)/memory_reserve);
  }
  bool cache_toofull(void) const {
    return cache_toofull_ratio() > 0.0;
  }
  uint64_t cache_size(void) const {
    return mempool::get_pool(mempool::mds_co::id).allocated_bytes();
  }
  bool cache_overfull(void) const {
    return cache_size() > cache_memory_limit*cache_health_threshold;
  }

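  // Worked example (illustrative, assuming cache_memory_limit = 4 GiB and
  // cache_reservation = 0.05): memory_reserve = 4 GiB * 0.95 = 3.8 GiB, so a
  // cache_size() of 4.18 GiB yields cache_toofull_ratio() =
  // (4.18 - 3.8) / 3.8 = 0.1 and cache_toofull() returns true, while
  // cache_overfull() instead compares cache_size() against
  // cache_memory_limit * cache_health_threshold.
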
7c673cae
FG
213 void advance_stray() {
214 stray_index = (stray_index+1)%NUM_STRAY;
215 }
216
f6b5b4d7
TL
217 bool get_export_ephemeral_distributed_config(void) const {
218 return export_ephemeral_distributed_config;
219 }
220
221 bool get_export_ephemeral_random_config(void) const {
222 return export_ephemeral_random_config;
223 }
224
7c673cae
FG
225 /**
226 * Call this when you know that a CDentry is ready to be passed
227 * on to StrayManager (i.e. this is a stray you've just created)
228 */
229 void notify_stray(CDentry *dn) {
11fdf7f2 230 ceph_assert(dn->get_dir()->get_inode()->is_stray());
a8e16298
TL
231 if (dn->state_test(CDentry::STATE_PURGING))
232 return;
233
7c673cae
FG
234 stray_manager.eval_stray(dn);
235 }
236
f6b5b4d7
TL
237 mds_rank_t hash_into_rank_bucket(inodeno_t ino);
238
7c673cae 239 void maybe_eval_stray(CInode *in, bool delay=false);
31f18b77
FG
240 void clear_dirty_bits_for_stray(CInode* diri);
241
7c673cae
FG
242 bool is_readonly() { return readonly; }
243 void force_readonly();
244
7c673cae
FG
245 static file_layout_t gen_default_file_layout(const MDSMap &mdsmap);
246 static file_layout_t gen_default_log_layout(const MDSMap &mdsmap);
247
7c673cae
FG
248 void register_perfcounters();
249
7c673cae
FG
250 void touch_client_lease(ClientLease *r, int pool, utime_t ttl) {
251 client_leases[pool].push_back(&r->item_lease);
252 r->ttl = ttl;
253 }
254
255 void notify_stray_removed()
256 {
257 stray_manager.notify_stray_removed();
258 }
259
260 void notify_stray_created()
261 {
262 stray_manager.notify_stray_created();
263 }
264
31f18b77
FG
265 void eval_remote(CDentry *dn)
266 {
267 stray_manager.eval_remote(dn);
268 }
269
7c673cae
FG
270 void _send_discover(discover_info_t& dis);
271 discover_info_t& _create_discover(mds_rank_t mds) {
272 ceph_tid_t t = ++discover_last_tid;
273 discover_info_t& d = discovers[t];
274 d.tid = t;
275 d.mds = mds;
276 return d;
277 }
278
11fdf7f2
TL
279 void discover_base_ino(inodeno_t want_ino, MDSContext *onfinish, mds_rank_t from=MDS_RANK_NONE);
280 void discover_dir_frag(CInode *base, frag_t approx_fg, MDSContext *onfinish,
7c673cae 281 mds_rank_t from=MDS_RANK_NONE);
11fdf7f2 282 void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSContext *onfinish,
9f95a23c 283 bool path_locked=false, mds_rank_t from=MDS_RANK_NONE);
11fdf7f2 284 void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSContext *onfinish,
9f95a23c 285 bool path_locked=false);
7c673cae
FG
286 void kick_discovers(mds_rank_t who); // after a failure.
287
7c673cae
FG
288 // adjust subtree auth specification
289 // dir->dir_auth
290 // imports/exports/nested_exports
291 // join/split subtrees as appropriate
7c673cae 292 bool is_subtrees() { return !subtrees.empty(); }
  template<typename T>
  void get_subtrees(T& c) {
    if constexpr (std::is_same_v<T, std::vector<CDir*>>)
      c.reserve(c.size() + subtrees.size());
    for (const auto& p : subtrees) {
      c.push_back(p.first);
    }
  }
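  // Illustrative usage sketch (container name is hypothetical): collect all
  // subtree roots into a vector, the case the if-constexpr branch above
  // pre-reserves capacity for:
  //   std::vector<CDir*> roots;
  //   get_subtrees(roots);
  // Any container with push_back(CDir*) works; only std::vector<CDir*> gets
  // the reserve() optimization.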
28e407b8 301 void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool adjust_pop=true);
224ce89b
WB
302 void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) {
303 adjust_subtree_auth(root, mds_authority_t(a,b));
7c673cae 304 }
11fdf7f2
TL
305 void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_authority_t auth);
306 void adjust_bounded_subtree_auth(CDir *dir, const set<CDir*>& bounds, mds_rank_t a) {
7c673cae
FG
307 adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
308 }
11fdf7f2
TL
309 void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, const mds_authority_t &auth);
310 void adjust_bounded_subtree_auth(CDir *dir, const vector<dirfrag_t>& bounds, mds_rank_t a) {
7c673cae
FG
311 adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
312 }
11fdf7f2 313 void map_dirfrag_set(const list<dirfrag_t>& dfs, set<CDir*>& result);
7c673cae 314 void try_subtree_merge(CDir *root);
28e407b8 315 void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval, bool adjust_pop=true);
7c673cae
FG
316 void subtree_merge_writebehind_finish(CInode *in, MutationRef& mut);
317 void eval_subtree_root(CInode *diri);
318 CDir *get_subtree_root(CDir *dir);
319 CDir *get_projected_subtree_root(CDir *dir);
320 bool is_leaf_subtree(CDir *dir) {
11fdf7f2 321 ceph_assert(subtrees.count(dir));
7c673cae
FG
322 return subtrees[dir].empty();
323 }
324 void remove_subtree(CDir *dir);
325 bool is_subtree(CDir *root) {
326 return subtrees.count(root);
327 }
328 void get_subtree_bounds(CDir *root, set<CDir*>& bounds);
329 void get_wouldbe_subtree_bounds(CDir *root, set<CDir*>& bounds);
330 void verify_subtree_bounds(CDir *root, const set<CDir*>& bounds);
331 void verify_subtree_bounds(CDir *root, const list<dirfrag_t>& bounds);
332
333 void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir);
224ce89b 334 void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop);
7c673cae 335
11fdf7f2
TL
336 auto get_auth_subtrees() {
337 std::vector<CDir*> c;
338 for (auto& p : subtrees) {
339 auto& root = p.first;
340 if (root->is_auth()) {
341 c.push_back(root);
342 }
343 }
344 return c;
345 }
7c673cae 346
11fdf7f2
TL
347 auto get_fullauth_subtrees() {
348 std::vector<CDir*> c;
349 for (auto& p : subtrees) {
350 auto& root = p.first;
351 if (root->is_full_dir_auth()) {
352 c.push_back(root);
353 }
354 }
355 return c;
356 }
357 auto num_subtrees_fullauth() const {
358 std::size_t n = 0;
359 for (auto& p : subtrees) {
360 auto& root = p.first;
361 if (root->is_full_dir_auth()) {
362 ++n;
363 }
364 }
365 return n;
366 }
7c673cae 367
11fdf7f2
TL
368 auto num_subtrees_fullnonauth() const {
369 std::size_t n = 0;
370 for (auto& p : subtrees) {
371 auto& root = p.first;
372 if (root->is_full_dir_nonauth()) {
373 ++n;
374 }
375 }
376 return n;
377 }
7c673cae 378
11fdf7f2
TL
379 auto num_subtrees() const {
380 return subtrees.size();
381 }
7c673cae 382
7c673cae
FG
383 int get_num_client_requests();
384
9f95a23c
TL
385 MDRequestRef request_start(const cref_t<MClientRequest>& req);
386 MDRequestRef request_start_slave(metareqid_t rid, __u32 attempt, const cref_t<Message> &m);
7c673cae
FG
387 MDRequestRef request_start_internal(int op);
388 bool have_request(metareqid_t rid) {
389 return active_requests.count(rid);
390 }
391 MDRequestRef request_get(metareqid_t rid);
392 void request_pin_ref(MDRequestRef& r, CInode *ref, vector<CDentry*>& trace);
393 void request_finish(MDRequestRef& mdr);
394 void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0);
395 void dispatch_request(MDRequestRef& mdr);
396 void request_drop_foreign_locks(MDRequestRef& mdr);
397 void request_drop_non_rdlocks(MDRequestRef& r);
398 void request_drop_locks(MDRequestRef& r);
399 void request_cleanup(MDRequestRef& r);
400
401 void request_kill(MDRequestRef& r); // called when session closes
402
403 // journal/snap helpers
404 CInode *pick_inode_snap(CInode *in, snapid_t follows);
405 CInode *cow_inode(CInode *in, snapid_t last);
406 void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn,
407 snapid_t follows=CEPH_NOSNAP,
408 CInode **pcow_inode=0, CDentry::linkage_t *dnl=0);
409 void journal_cow_inode(MutationRef& mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP,
410 CInode **pcow_inode=0);
411 void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP);
412
413 void project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
414 int linkunlink, SnapRealm *prealm);
94b18763 415 void _project_rstat_inode_to_frag(CInode::mempool_inode & inode, snapid_t ofirst, snapid_t last,
7c673cae
FG
416 CDir *parent, int linkunlink, bool update_inode);
417 void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
418 snapid_t ofirst, snapid_t last,
419 CInode *pin, bool cow_head);
a8e16298 420 void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1, bool quota_change = false);
7c673cae
FG
421 void predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
422 CInode *in, CDir *parent,
423 int flags, int linkunlink=0,
424 snapid_t follows=CEPH_NOSNAP);
425
426 // slaves
427 void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set<mds_rank_t> &slaves, bool safe=false) {
428 uncommitted_masters[reqid].ls = ls;
429 uncommitted_masters[reqid].slaves = slaves;
430 uncommitted_masters[reqid].safe = safe;
431 }
11fdf7f2 432 void wait_for_uncommitted_master(metareqid_t reqid, MDSContext *c) {
7c673cae
FG
433 uncommitted_masters[reqid].waiters.push_back(c);
434 }
435 bool have_uncommitted_master(metareqid_t reqid, mds_rank_t from) {
436 auto p = uncommitted_masters.find(reqid);
437 return p != uncommitted_masters.end() && p->second.slaves.count(from) > 0;
438 }
439 void log_master_commit(metareqid_t reqid);
440 void logged_master_update(metareqid_t reqid);
441 void _logged_master_commit(metareqid_t reqid);
442 void committed_master_slave(metareqid_t r, mds_rank_t from);
443 void finish_committed_masters();
444
e306af50
TL
445 void add_uncommitted_slave(metareqid_t reqid, LogSegment*, mds_rank_t, MDSlaveUpdate *su=nullptr);
446 void wait_for_uncommitted_slave(metareqid_t reqid, MDSContext *c) {
447 uncommitted_slaves.at(reqid).waiters.push_back(c);
448 }
449 void finish_uncommitted_slave(metareqid_t reqid, bool assert_exist=true);
450 MDSlaveUpdate* get_uncommitted_slave(metareqid_t reqid, mds_rank_t master);
7c673cae
FG
451 void _logged_slave_commit(mds_rank_t from, metareqid_t reqid);
452
7c673cae
FG
453 void set_recovery_set(set<mds_rank_t>& s);
454 void handle_mds_failure(mds_rank_t who);
455 void handle_mds_recovery(mds_rank_t who);
456
7c673cae
FG
457 void recalc_auth_bits(bool replay);
458 void remove_inode_recursive(CInode *in);
459
460 bool is_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
461 auto p = ambiguous_slave_updates.find(master);
462 return p != ambiguous_slave_updates.end() && p->second.count(reqid);
463 }
464 void add_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
465 ambiguous_slave_updates[master].insert(reqid);
466 }
467 void remove_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
468 auto p = ambiguous_slave_updates.find(master);
469 auto q = p->second.find(reqid);
11fdf7f2 470 ceph_assert(q != p->second.end());
7c673cae
FG
471 p->second.erase(q);
472 if (p->second.empty())
473 ambiguous_slave_updates.erase(p);
474 }
475
476 void add_rollback(metareqid_t reqid, mds_rank_t master) {
11fdf7f2 477 resolve_need_rollback[reqid] = master;
7c673cae 478 }
e306af50 479 void finish_rollback(metareqid_t reqid, MDRequestRef& mdr);
7c673cae
FG
480
481 // ambiguous imports
482 void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds);
483 void add_ambiguous_import(CDir *base, const set<CDir*>& bounds);
484 bool have_ambiguous_import(dirfrag_t base) {
485 return my_ambiguous_imports.count(base);
486 }
487 void get_ambiguous_import_bounds(dirfrag_t base, vector<dirfrag_t>& bounds) {
11fdf7f2 488 ceph_assert(my_ambiguous_imports.count(base));
7c673cae
FG
489 bounds = my_ambiguous_imports[base];
490 }
491 void cancel_ambiguous_import(CDir *);
492 void finish_ambiguous_import(dirfrag_t dirino);
11fdf7f2 493 void resolve_start(MDSContext *resolve_done_);
7c673cae 494 void send_resolves();
7c673cae
FG
495 void maybe_send_pending_resolves() {
496 if (resolves_pending)
497 send_subtree_resolves();
498 }
499
500 void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
501 map<dirfrag_t,vector<dirfrag_t> >& subtrees);
502 ESubtreeMap *create_subtree_map();
503
7c673cae 504 void clean_open_file_lists();
11fdf7f2
TL
505 void dump_openfiles(Formatter *f);
506 bool dump_inode(Formatter *f, uint64_t number);
7c673cae 507
11fdf7f2 508 void rejoin_start(MDSContext *rejoin_done_);
7c673cae
FG
509 void rejoin_gather_finish();
510 void rejoin_send_rejoins();
511 void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
11fdf7f2 512 int target=-1, bool drop_path=false) {
28e407b8
AA
513 auto& ex = cap_exports[ino];
514 ex.first = target;
11fdf7f2
TL
515 auto &_icr = ex.second[client] = icr;
516 if (drop_path)
517 _icr.path.clear();
7c673cae
FG
518 }
519 void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
11fdf7f2
TL
520 mds_rank_t frommds=MDS_RANK_NONE, bool drop_path=false) {
521 auto &_icr = cap_imports[ino][client][frommds] = icr;
522 if (drop_path)
523 _icr.path.clear();
7c673cae 524 }
28e407b8
AA
525 void rejoin_recovered_client(client_t client, const entity_inst_t& inst) {
526 rejoin_client_map.emplace(client, inst);
527 }
11fdf7f2
TL
528 bool rejoin_has_cap_reconnect(inodeno_t ino) const {
529 return cap_imports.count(ino);
530 }
531 void add_replay_ino_alloc(inodeno_t ino) {
532 cap_imports_missing.insert(ino); // avoid opening ino during cache rejoin
533 }
7c673cae
FG
534 const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
535 if (cap_imports.count(ino) &&
536 cap_imports[ino].count(client) &&
537 cap_imports[ino][client].count(MDS_RANK_NONE)) {
538 return &cap_imports[ino][client][MDS_RANK_NONE];
539 }
540 return NULL;
541 }
542 void remove_replay_cap_reconnect(inodeno_t ino, client_t client) {
11fdf7f2
TL
543 ceph_assert(cap_imports[ino].size() == 1);
544 ceph_assert(cap_imports[ino][client].size() == 1);
7c673cae
FG
545 cap_imports.erase(ino);
546 }
11fdf7f2 547 void wait_replay_cap_reconnect(inodeno_t ino, MDSContext *c) {
7c673cae
FG
548 cap_reconnect_waiters[ino].push_back(c);
549 }
550
7c673cae
FG
551 void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) {
552 reconnected_cap_info_t &info = reconnected_caps[ino][client];
553 info.realm_ino = inodeno_t(icr.capinfo.snaprealm);
554 info.snap_follows = icr.snap_follows;
555 }
11fdf7f2 556 void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty, bool snapflush) {
7c673cae
FG
557 reconnected_cap_info_t &info = reconnected_caps[ino][client];
558 info.dirty_caps |= dirty;
11fdf7f2
TL
559 if (snapflush)
560 info.snapflush = snapflush;
7c673cae
FG
561 }
562 void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) {
563 reconnected_snaprealms[ino][client] = seq;
564 }
565
7c673cae 566 void rejoin_open_ino_finish(inodeno_t ino, int ret);
11fdf7f2 567 void rejoin_prefetch_ino_finish(inodeno_t ino, int ret);
28e407b8 568 void rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map);
7c673cae
FG
569 bool process_imported_caps();
570 void choose_lock_states_and_reconnect_caps();
571 void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
9f95a23c
TL
572 map<client_t,ref_t<MClientSnap>>& splits);
573 void prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm, map<client_t,ref_t<MClientSnap>>& splits);
574 void send_snaps(map<client_t,ref_t<MClientSnap>>& splits);
7c673cae 575 Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds);
11fdf7f2 576 void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq,
9f95a23c 577 map<client_t,ref_t<MClientSnap>>& updates);
a8e16298 578 Capability* try_reconnect_cap(CInode *in, Session *session);
7c673cae
FG
579 void export_remaining_imported_caps();
580
7c673cae
FG
581 void do_cap_import(Session *session, CInode *in, Capability *cap,
582 uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
583 int peer, int p_flags);
584 void do_delayed_cap_imports();
585 void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client,
586 snapid_t snap_follows);
11fdf7f2 587 void open_snaprealms();
7c673cae
FG
588
589 bool open_undef_inodes_dirfrags();
590 void opened_undef_inode(CInode *in);
591 void opened_undef_dirfrag(CDir *dir) {
592 rejoin_undef_dirfrags.erase(dir);
593 }
594
595 void reissue_all_caps();
7c673cae 596
7c673cae
FG
597 void start_files_to_recover();
598 void do_file_recover();
599 void queue_file_recover(CInode *in);
600 void _queued_file_recover_cow(CInode *in, MutationRef& mut);
601
92f5a8d4 602 void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map);
7c673cae
FG
603
604 // debug
605 void log_stat();
606
607 // root inode
608 CInode *get_root() { return root; }
609 CInode *get_myin() { return myin; }
610
7c673cae
FG
611 size_t get_cache_size() { return lru.lru_get_size(); }
612
613 // trimming
a8e16298 614 std::pair<bool, uint64_t> trim(uint64_t count=0);
9f95a23c 615
7c673cae
FG
616 bool trim_non_auth_subtree(CDir *directory);
617 void standby_trim_segment(LogSegment *ls);
618 void try_trim_non_auth_subtree(CDir *dir);
619 bool can_trim_non_auth_dirfrag(CDir *dir) {
620 return my_ambiguous_imports.count((dir)->dirfrag()) == 0 &&
621 uncommitted_slave_rename_olddir.count(dir->inode) == 0;
622 }
623
624 /**
625 * For all unreferenced inodes, dirs, dentries below an inode, compose
626 * expiry messages. This is used when giving up all replicas of entities
627 * for an MDS peer in the 'stopping' state, such that the peer can
628 * empty its cache and finish shutting down.
629 *
630 * We have to make sure we're only expiring un-referenced items to
631 * avoid interfering with ongoing stray-movement (we can't distinguish
632 * between the "moving my strays" and "waiting for my cache to empty"
633 * phases within 'stopping')
634 *
635 * @return false if we completed cleanly, true if caller should stop
636 * expiring because we hit something with refs.
637 */
11fdf7f2 638 bool expire_recursive(CInode *in, expiremap& expiremap);
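  // Illustrative caller sketch (the surrounding loop is an assumption, not
  // lifted from the implementation): when expiring replicas for a stopping
  // peer, a caller might do
  //   expiremap expires;
  //   for (CInode *base : bases) {
  //     if (expire_recursive(base, expires))
  //       break;  // hit something still referenced; stop expiring for now
  //   }
  //   send_expire_messages(expires);
  // i.e. a true return means "stop", per the comment above.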
7c673cae
FG
639
640 void trim_client_leases();
641 void check_memory_usage();
642
7c673cae
FG
643 void shutdown_start();
644 void shutdown_check();
645 bool shutdown_pass();
  bool shutdown(); // clear cache (ie at shutdown)
f64942e4
AA
647 bool shutdown_export_strays();
648 void shutdown_export_stray_finish(inodeno_t ino) {
649 if (shutdown_exporting_strays.erase(ino))
650 shutdown_export_strays();
651 }
7c673cae 652
7c673cae
FG
653 // inode_map
654 bool have_inode(vinodeno_t vino) {
b32b8144
FG
655 if (vino.snapid == CEPH_NOSNAP)
656 return inode_map.count(vino.ino) ? true : false;
657 else
658 return snap_inode_map.count(vino) ? true : false;
7c673cae
FG
659 }
660 bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) {
661 return have_inode(vinodeno_t(ino, snap));
662 }
663 CInode* get_inode(vinodeno_t vino) {
b32b8144
FG
664 if (vino.snapid == CEPH_NOSNAP) {
665 auto p = inode_map.find(vino.ino);
666 if (p != inode_map.end())
667 return p->second;
668 } else {
669 auto p = snap_inode_map.find(vino);
670 if (p != snap_inode_map.end())
671 return p->second;
672 }
7c673cae
FG
673 return NULL;
674 }
675 CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) {
676 return get_inode(vinodeno_t(ino, s));
677 }
  CInode* lookup_snap_inode(vinodeno_t vino) {
    auto p = snap_inode_map.lower_bound(vino);
    if (p != snap_inode_map.end() &&
        p->second->ino() == vino.ino && p->second->first <= vino.snapid)
      return p->second;
    return NULL;
  }
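  // Note on the lookup above (informal): snap_inode_map is keyed by
  // vinodeno_t whose snapid is the inode's "last" snap, so lower_bound(vino)
  // lands on the first entry with last >= vino.snapid; the extra checks then
  // confirm it is the same ino and that its [first, last] interval really
  // covers the requested snapid.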
7c673cae
FG
685
686 CDir* get_dirfrag(dirfrag_t df) {
687 CInode *in = get_inode(df.ino);
688 if (!in)
689 return NULL;
690 return in->get_dirfrag(df.frag);
691 }
11fdf7f2 692 CDir* get_dirfrag(inodeno_t ino, std::string_view dn) {
7c673cae
FG
693 CInode *in = get_inode(ino);
694 if (!in)
695 return NULL;
696 frag_t fg = in->pick_dirfrag(dn);
697 return in->get_dirfrag(fg);
698 }
699 CDir* get_force_dirfrag(dirfrag_t df, bool replay) {
700 CInode *diri = get_inode(df.ino);
701 if (!diri)
702 return NULL;
703 CDir *dir = force_dir_fragment(diri, df.frag, replay);
704 if (!dir)
705 dir = diri->get_dirfrag(df.frag);
706 return dir;
707 }
708
11fdf7f2 709 MDSCacheObject *get_object(const MDSCacheObjectInfo &info);
7c673cae 710
7c673cae
FG
711 void add_inode(CInode *in);
712
713 void remove_inode(CInode *in);
9f95a23c 714
7c673cae 715 void touch_dentry(CDentry *dn) {
31f18b77
FG
716 if (dn->state_test(CDentry::STATE_BOTTOMLRU)) {
717 bottom_lru.lru_midtouch(dn);
718 } else {
719 if (dn->is_auth())
720 lru.lru_touch(dn);
721 else
722 lru.lru_midtouch(dn);
723 }
7c673cae
FG
724 }
725 void touch_dentry_bottom(CDentry *dn) {
31f18b77
FG
726 if (dn->state_test(CDentry::STATE_BOTTOMLRU))
727 return;
7c673cae 728 lru.lru_bottouch(dn);
7c673cae 729 }
7c673cae 730
7c673cae
FG
731 // truncate
732 void truncate_inode(CInode *in, LogSegment *ls);
733 void _truncate_inode(CInode *in, LogSegment *ls);
734 void truncate_inode_finish(CInode *in, LogSegment *ls);
735 void truncate_inode_logged(CInode *in, MutationRef& mut);
736
737 void add_recovered_truncate(CInode *in, LogSegment *ls);
738 void remove_recovered_truncate(CInode *in, LogSegment *ls);
739 void start_recovered_truncates();
740
9f95a23c
TL
741 // purge unsafe inodes
742 void start_purge_inodes();
743 void purge_inodes(const interval_set<inodeno_t>& i, LogSegment *ls);
7c673cae 744
7c673cae
FG
745 CDir *get_auth_container(CDir *in);
746 CDir *get_export_container(CDir *dir);
747 void find_nested_exports(CDir *dir, set<CDir*>& s);
748 void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);
749
7c673cae
FG
750 void init_layouts();
751 void create_unlinked_system_inode(CInode *in, inodeno_t ino,
752 int mode) const;
753 CInode *create_system_inode(inodeno_t ino, int mode);
754 CInode *create_root_inode();
755
756 void create_empty_hierarchy(MDSGather *gather);
757 void create_mydir_hierarchy(MDSGather *gather);
758
759 bool is_open() { return open; }
11fdf7f2 760 void wait_for_open(MDSContext *c) {
7c673cae
FG
761 waiting_for_open.push_back(c);
762 }
763
11fdf7f2 764 void open_root_inode(MDSContext *c);
7c673cae 765 void open_root();
11fdf7f2
TL
766 void open_mydir_inode(MDSContext *c);
767 void open_mydir_frag(MDSContext *c);
7c673cae
FG
768 void populate_mydir();
769
11fdf7f2 770 void _create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin);
7c673cae 771 void _create_system_file_finish(MutationRef& mut, CDentry *dn,
11fdf7f2 772 version_t dpv, MDSContext *fin);
7c673cae 773
11fdf7f2 774 void open_foreign_mdsdir(inodeno_t ino, MDSContext *c);
7c673cae
FG
775 CDir *get_stray_dir(CInode *in);
776 CDentry *get_or_create_stray_dentry(CInode *in);
777
7c673cae
FG
  /**
   * Find the given dentry (and whether it exists or not), its ancestors,
   * and get them all into memory and usable on this MDS. This function
   * makes a best-effort attempt to load everything; if it needs to
   * go away and do something then it will put the request on a waitlist.
   * It prefers the mdr, then the req, then the fin. (At least one of these
   * must be non-null.)
   *
   * @param mdr The MDRequest associated with the path. Can be null.
   * @param cf An MDSContextFactory for waiter building.
   * @param path The path to traverse to.
   *
   * @param flags Specifies different lookup behaviors.
   * By default, path_traverse() forwards the request to the auth MDS if that
   * is appropriate (ie, if it doesn't know the contents of a directory).
   * MDS_TRAVERSE_DISCOVER: Instead of forwarding the request, path_traverse()
   * attempts to look up the path from a different MDS (and bring the items
   * into its cache as replicas).
   * MDS_TRAVERSE_PATH_LOCKED: path_traverse() will proceed when an xlocked
   * dentry is encountered.
   * MDS_TRAVERSE_WANT_DENTRY: Caller wants the tail dentry. Add a null dentry
   * if the tail dentry does not exist; return 0 even if the tail dentry is null.
   * MDS_TRAVERSE_WANT_AUTH: Always forward the request to the auth MDS of the
   * target inode, or the auth MDS of the tail dentry (if
   * MDS_TRAVERSE_WANT_DENTRY is set).
   *
   * @param pdnvec Data return parameter -- on success, contains a
   * vector of dentries. On failure, is either empty or contains the
   * full trace of traversable dentries.
   * @param pin Data return parameter -- if successful, points to the inode
   * associated with filepath. If unsuccessful, is null.
   *
   * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
   * If it returns 1, the requester associated with this call has been placed
   * on the appropriate waitlist, and it should unwind itself and back out.
   * If it returns 2 the request has been forwarded, and again the requester
   * should unwind itself and back out.
   */
  int path_traverse(MDRequestRef& mdr, MDSContextFactory& cf,
                    const filepath& path, int flags,
                    vector<CDentry*> *pdnvec, CInode **pin=nullptr);
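  // Illustrative call sketch (variable names are hypothetical): a lookup that
  // wants the tail dentry and may discover from other ranks might look like
  //   std::vector<CDentry*> trace;
  //   CInode *in = nullptr;
  //   int r = path_traverse(mdr, cf, req_path,
  //                         MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_WANT_DENTRY,
  //                         &trace, &in);
  //   if (r > 0) return;        // 1 = waiting, 2 = forwarded; unwind and back out
  //   if (r < 0) { /* handle -errno */ }
  //   // r == 0: trace holds the dentries; in may be null if only a null
  //   // tail dentry was added (the WANT_DENTRY case).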
7c673cae
FG
820
821 CInode *cache_traverse(const filepath& path);
822
11fdf7f2 823 void open_remote_dirfrag(CInode *diri, frag_t fg, MDSContext *fin);
7c673cae
FG
824 CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false);
825
826 bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing);
827 bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
828 set<CDir*>& fetch_queue, set<inodeno_t>& missing,
829 C_GatherBuilder &gather_bld);
830
11fdf7f2 831 void open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin,
7c673cae 832 bool want_xlocked=false);
11fdf7f2 833 void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin,
7c673cae
FG
834 bool want_xlocked, int r);
835
836 void make_trace(vector<CDentry*>& trace, CInode *in);
837
7c673cae 838 void kick_open_ino_peers(mds_rank_t who);
11fdf7f2 839 void open_ino(inodeno_t ino, int64_t pool, MDSContext *fin,
f91f0fd5
TL
840 bool want_replica=true, bool want_xlocked=false,
841 vector<inode_backpointer_t> *ancestors_hint=nullptr,
842 mds_rank_t auth_hint=MDS_RANK_NONE);
7c673cae 843
9f95a23c
TL
844 void find_ino_peers(inodeno_t ino, MDSContext *c,
845 mds_rank_t hint=MDS_RANK_NONE, bool path_locked=false);
7c673cae 846 void _do_find_ino_peer(find_ino_peer_info_t& fip);
9f95a23c
TL
847 void handle_find_ino(const cref_t<MMDSFindIno> &m);
848 void handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m);
7c673cae
FG
849 void kick_find_ino_peers(mds_rank_t who);
850
11fdf7f2
TL
851 SnapRealm *get_global_snaprealm() const { return global_snaprealm; }
852 void create_global_snaprealm();
853 void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients=true);
854 void send_snap_update(CInode *in, version_t stid, int snap_op);
9f95a23c 855 void handle_snap_update(const cref_t<MMDSSnapUpdate> &m);
11fdf7f2 856 void notify_global_snaprealm_update(int snap_op);
7c673cae
FG
857
858 // -- stray --
7c673cae
FG
859 void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
860 uint64_t get_num_strays() const { return stray_manager.get_num_strays(); }
861
7c673cae 862 // == messages ==
9f95a23c 863 void dispatch(const cref_t<Message> &m);
7c673cae 864
9f95a23c
TL
865 void encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl);
866 void encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl);
867 void encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl,
b32b8144 868 uint64_t features);
7c673cae 869
9f95a23c
TL
870 void decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from, MDSContext::vec& finished);
871 void decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished);
872 void decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished);
7c673cae 873
9f95a23c
TL
874 void encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl);
875 void decode_replica_stray(CDentry *&straydn, const bufferlist &bl, mds_rank_t from);
7c673cae
FG
876
877 // -- namespace --
9f95a23c
TL
878 void encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl);
879 void decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p);
7c673cae
FG
880 void send_dentry_link(CDentry *dn, MDRequestRef& mdr);
881 void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr);
a8e16298 882
11fdf7f2 883 void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSContext *c) {
e306af50
TL
884 uncommitted_fragments.at(dirfrag).waiters.push_back(c);
885 }
886 bool is_any_uncommitted_fragment() const {
887 return !uncommitted_fragments.empty();
7c673cae 888 }
f91f0fd5 889 void wait_for_uncommitted_fragments(MDSContext* finisher);
e306af50
TL
890 void rollback_uncommitted_fragments();
891
7c673cae
FG
892 void split_dir(CDir *dir, int byn);
893 void merge_dir(CInode *diri, frag_t fg);
7c673cae
FG
894
895 void find_stale_fragment_freeze();
896 void fragment_freeze_inc_num_waiters(CDir *dir);
897 bool fragment_are_all_frozen(CDir *dir);
898 int get_num_fragmenting_dirs() { return fragments.size(); }
899
900 // -- updates --
901 //int send_inode_updates(CInode *in);
902 //void handle_inode_update(MInodeUpdate *m);
903
904 int send_dir_updates(CDir *in, bool bcast=false);
9f95a23c 905 void handle_dir_update(const cref_t<MDirUpdate> &m);
7c673cae
FG
906
907 // -- cache expiration --
9f95a23c 908 void handle_cache_expire(const cref_t<MCacheExpire> &m);
7c673cae
FG
909 void process_delayed_expire(CDir *dir);
910 void discard_delayed_expire(CDir *dir);
911
eafe8130 912 // -- mdsmap --
f6b5b4d7 913 void handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap);
eafe8130 914
9f95a23c 915 int dump_cache() { return dump_cache({}, nullptr); }
11fdf7f2 916 int dump_cache(std::string_view filename);
31f18b77 917 int dump_cache(Formatter *f);
11fdf7f2 918 void dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f);
7c673cae 919
f64942e4 920 void cache_status(Formatter *f);
181888fb 921
7c673cae
FG
922 void dump_resolve_status(Formatter *f) const;
923 void dump_rejoin_status(Formatter *f) const;
924
925 // == crap fns ==
7c673cae 926 void show_cache();
81eedcae 927 void show_subtrees(int dbl=10, bool force_print=false);
7c673cae
FG
928
929 CInode *hack_pick_random_inode() {
11fdf7f2 930 ceph_assert(!inode_map.empty());
7c673cae 931 int n = rand() % inode_map.size();
b32b8144 932 auto p = inode_map.begin();
7c673cae
FG
933 while (n--) ++p;
934 return p->second;
935 }
936
11fdf7f2 937 void flush_dentry(std::string_view path, Context *fin);
7c673cae
FG
938 /**
939 * Create and start an OP_ENQUEUE_SCRUB
940 */
11fdf7f2 941 void enqueue_scrub(std::string_view path, std::string_view tag,
7c673cae
FG
942 bool force, bool recursive, bool repair,
943 Formatter *f, Context *fin);
944 void repair_inode_stats(CInode *diri);
945 void repair_dirfrag_stats(CDir *dir);
11fdf7f2 946 void upgrade_inode_snaprealm(CInode *in);
7c673cae 947
9f95a23c
TL
948 // my master
949 MDSRank *mds;
950
951 // -- my cache --
952 LRU lru; // dentry lru for expiring items from cache
953 LRU bottom_lru; // dentries that should be trimmed ASAP
954
955 DecayRate decayrate;
956
957 int num_shadow_inodes = 0;
958
959 int num_inodes_with_caps = 0;
960
961 unsigned max_dir_commit_size;
962
963 file_layout_t default_file_layout;
964 file_layout_t default_log_layout;
965
966 // -- client leases --
967 static constexpr std::size_t client_lease_pools = 3;
968 std::array<float, client_lease_pools> client_lease_durations{5.0, 30.0, 300.0};
969
970 // -- client caps --
971 uint64_t last_cap_id = 0;
972
973 map<ceph_tid_t, discover_info_t> discovers;
974 ceph_tid_t discover_last_tid = 0;
975
976 // waiters
977 map<int, map<inodeno_t, MDSContext::vec > > waiting_for_base_ino;
978
979 map<inodeno_t,map<client_t, reconnected_cap_info_t> > reconnected_caps; // inode -> client -> snap_follows,realmino
980 map<inodeno_t,map<client_t, snapid_t> > reconnected_snaprealms; // realmino -> client -> realmseq
981
982 // realm inodes
983 set<CInode*> rejoin_pending_snaprealms;
984 // cap imports. delayed snap parent opens.
985 map<client_t,set<CInode*> > delayed_imported_caps;
986
987 // subsystems
988 std::unique_ptr<Migrator> migrator;
989
990 bool did_shutdown_log_cap = false;
991
992 map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer;
993 ceph_tid_t find_ino_peer_last_tid = 0;
994
995 // delayed cache expire
996 map<CDir*, expiremap> delayed_expire; // subtree root -> expire msg
997
7c673cae
FG
998 /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
999 std::set<CInode *> export_pin_queue;
eafe8130 1000 std::set<CInode *> export_pin_delayed_queue;
f6b5b4d7
TL
1001 std::set<CInode *> rand_ephemeral_pins;
1002 std::set<CInode *> dist_ephemeral_pins;
11fdf7f2
TL
1003
1004 OpenFileTable open_file_table;
eafe8130 1005
f6b5b4d7
TL
1006 double export_ephemeral_random_max = 0.0;
1007
9f95a23c
TL
1008 protected:
1009 // track master requests whose slaves haven't acknowledged commit
1010 struct umaster {
1011 umaster() {}
1012 set<mds_rank_t> slaves;
1013 LogSegment *ls = nullptr;
1014 MDSContext::vec waiters;
1015 bool safe = false;
1016 bool committing = false;
1017 bool recovering = false;
1018 };
1019
e306af50
TL
1020 struct uslave {
1021 uslave() {}
1022 mds_rank_t master;
1023 LogSegment *ls = nullptr;
1024 MDSlaveUpdate *su = nullptr;
1025 MDSContext::vec waiters;
1026 };
1027
9f95a23c
TL
1028 struct open_ino_info_t {
1029 open_ino_info_t() {}
1030 vector<inode_backpointer_t> ancestors;
1031 set<mds_rank_t> checked;
1032 mds_rank_t checking = MDS_RANK_NONE;
1033 mds_rank_t auth_hint = MDS_RANK_NONE;
1034 bool check_peers = true;
1035 bool fetch_backtrace = true;
1036 bool discover = false;
1037 bool want_replica = false;
1038 bool want_xlocked = false;
1039 version_t tid = 0;
1040 int64_t pool = -1;
1041 int last_err = 0;
1042 MDSContext::vec waiters;
1043 };
1044
1045 friend struct C_MDC_OpenInoTraverseDir;
1046 friend struct C_MDC_OpenInoParentOpened;
1047 friend struct C_MDC_RetryScanStray;
1048
1049 friend class C_IO_MDC_OpenInoBacktraceFetched;
1050 friend class C_MDC_Join;
1051 friend class C_MDC_RespondInternalRequest;
1052
1053 friend class ESlaveUpdate;
1054 friend class ECommitted;
1055
1056 void set_readonly() { readonly = true; }
1057
1058 void handle_resolve(const cref_t<MMDSResolve> &m);
1059 void handle_resolve_ack(const cref_t<MMDSResolveAck> &m);
1060 void process_delayed_resolve();
1061 void discard_delayed_resolve(mds_rank_t who);
1062 void maybe_resolve_finish();
1063 void disambiguate_my_imports();
1064 void disambiguate_other_imports();
1065 void trim_unlinked_inodes();
9f95a23c
TL
1066
1067 void send_slave_resolves();
1068 void send_subtree_resolves();
1069 void maybe_finish_slave_resolve();
1070
1071 void rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin);
1072 void handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m);
1073 void handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &m);
1074 CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last);
1075 CDir* rejoin_invent_dirfrag(dirfrag_t df);
1076 void handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &m);
1077 void rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack,
1078 set<vinodeno_t>& acked_inodes,
1079 set<SimpleLock *>& gather_locks);
1080 void handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &m);
1081 void rejoin_send_acks();
1082 void rejoin_trim_undef_inodes();
1083 void maybe_send_pending_rejoins() {
1084 if (rejoins_pending)
1085 rejoin_send_rejoins();
1086 }
1087
1088 void touch_inode(CInode *in) {
1089 if (in->get_parent_dn())
1090 touch_dentry(in->get_projected_parent_dn());
1091 }
1092
1093 void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin,
1094 set<SimpleLock *>& gather_locks);
1095 void dentry_remove_replica(CDentry *dn, mds_rank_t rep, set<SimpleLock *>& gather_locks);
1096
1097 void rename_file(CDentry *srcdn, CDentry *destdn);
1098
1099 void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err);
1100 void _open_ino_parent_opened(inodeno_t ino, int ret);
1101 void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err);
1102 void _open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, CDir *dir, bool parent);
1103 int open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m,
1104 const vector<inode_backpointer_t>& ancestors,
1105 bool discover, bool want_xlocked, mds_rank_t *hint);
1106 void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err);
1107 void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err);
1108 void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info);
1109 void handle_open_ino(const cref_t<MMDSOpenIno> &m, int err=0);
1110 void handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m);
1111
1112 void scan_stray_dir(dirfrag_t next=dirfrag_t());
1113 // -- replicas --
1114 void handle_discover(const cref_t<MDiscover> &dis);
1115 void handle_discover_reply(const cref_t<MDiscoverReply> &m);
1116 void handle_dentry_link(const cref_t<MDentryLink> &m);
1117 void handle_dentry_unlink(const cref_t<MDentryUnlink> &m);
1118
1119 int dump_cache(std::string_view fn, Formatter *f);
1120
1121 void flush_dentry_work(MDRequestRef& mdr);
  /**
   * Resolve path to a dentry and pass it onto the ScrubStack.
   *
   * TODO: return enough information to the original mdr formatter
   * and completion that they can subsequently check the progress of
   * this scrub (we won't block them on a whole scrub as it can take a very
   * long time)
   */
1130 void enqueue_scrub_work(MDRequestRef& mdr);
1131 void recursive_scrub_finish(const ScrubHeaderRef& header);
1132 void repair_inode_stats_work(MDRequestRef& mdr);
1133 void repair_dirfrag_stats_work(MDRequestRef& mdr);
1134 void upgrade_inode_snaprealm_work(MDRequestRef& mdr);
1135
1136 ceph::unordered_map<inodeno_t,CInode*> inode_map; // map of head inodes by ino
1137 map<vinodeno_t, CInode*> snap_inode_map; // map of snap inodes by ino
1138 CInode *root = nullptr; // root inode
1139 CInode *myin = nullptr; // .ceph/mds%d dir
1140
1141 bool readonly = false;
1142
1143 int stray_index = 0;
1144
1145 set<CInode*> base_inodes;
1146
1147 std::unique_ptr<PerfCounters> logger;
1148
1149 Filer filer;
1150 bool exceeded_size_limit = false;
1151 std::array<xlist<ClientLease*>, client_lease_pools> client_leases{};
1152
1153 /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
1154 map<CDir*,set<CDir*> > subtrees;
1155 map<CInode*,list<pair<CDir*,CDir*> > > projected_subtree_renames; // renamed ino -> target dir
1156
1157 // -- requests --
1158 ceph::unordered_map<metareqid_t, MDRequestRef> active_requests;
1159
1160 // -- recovery --
1161 set<mds_rank_t> recovery_set;
1162
1163 // [resolve]
1164 // from EImportStart w/o EImportFinish during journal replay
1165 map<dirfrag_t, vector<dirfrag_t> > my_ambiguous_imports;
1166 // from MMDSResolves
1167 map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports;
1168
9f95a23c
TL
1169 map<CInode*, int> uncommitted_slave_rename_olddir; // slave: preserve the non-auth dir until seeing commit.
1170 map<CInode*, int> uncommitted_slave_unlink; // slave: preserve the unlinked inode until seeing commit.
1171
1172 map<metareqid_t, umaster> uncommitted_masters; // master: req -> slave set
e306af50 1173 map<metareqid_t, uslave> uncommitted_slaves; // slave: preserve the slave req until seeing commit.
9f95a23c
TL
1174
1175 set<metareqid_t> pending_masters;
1176 map<int, set<metareqid_t> > ambiguous_slave_updates;
1177
1178 bool resolves_pending = false;
1179 set<mds_rank_t> resolve_gather; // nodes i need resolves from
1180 set<mds_rank_t> resolve_ack_gather; // nodes i need a resolve_ack from
1181 set<version_t> resolve_snapclient_commits;
1182 map<metareqid_t, mds_rank_t> resolve_need_rollback; // rollbacks i'm writing to the journal
1183 map<mds_rank_t, cref_t<MMDSResolve>> delayed_resolve;
1184
1185 // [rejoin]
1186 bool rejoins_pending = false;
1187 set<mds_rank_t> rejoin_gather; // nodes from whom i need a rejoin
1188 set<mds_rank_t> rejoin_sent; // nodes i sent a rejoin to
1189 set<mds_rank_t> rejoin_ack_sent; // nodes i sent a rejoin to
1190 set<mds_rank_t> rejoin_ack_gather; // nodes from whom i need a rejoin ack
1191 map<mds_rank_t,map<inodeno_t,map<client_t,Capability::Import> > > rejoin_imported_caps;
1192 map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > > rejoin_slave_exports;
1193
1194 map<client_t,entity_inst_t> rejoin_client_map;
1195 map<client_t,client_metadata_t> rejoin_client_metadata_map;
1196 map<client_t,pair<Session*,uint64_t> > rejoin_session_map;
1197
1198 map<inodeno_t,pair<mds_rank_t,map<client_t,cap_reconnect_t> > > cap_exports; // ino -> target, client -> capex
1199
1200 map<inodeno_t,map<client_t,map<mds_rank_t,cap_reconnect_t> > > cap_imports; // ino -> client -> frommds -> capex
1201 set<inodeno_t> cap_imports_missing;
1202 map<inodeno_t, MDSContext::vec > cap_reconnect_waiters;
1203 int cap_imports_num_opening = 0;
1204
1205 set<CInode*> rejoin_undef_inodes;
1206 set<CInode*> rejoin_potential_updated_scatterlocks;
1207 set<CDir*> rejoin_undef_dirfrags;
1208 map<mds_rank_t, set<CInode*> > rejoin_unlinked_inodes;
1209
1210 vector<CInode*> rejoin_recover_q, rejoin_check_q;
1211 list<SimpleLock*> rejoin_eval_locks;
1212 MDSContext::vec rejoin_waiters;
1213
1214 std::unique_ptr<MDSContext> rejoin_done;
1215 std::unique_ptr<MDSContext> resolve_done;
1216
1217 ceph_tid_t open_ino_last_tid = 0;
1218 map<inodeno_t,open_ino_info_t> opening_inodes;
1219
1220 StrayManager stray_manager;
1221
1222 private:
1223 // -- fragmenting --
1224 struct ufragment {
1225 ufragment() {}
1226 int bits = 0;
1227 bool committed = false;
1228 LogSegment *ls = nullptr;
1229 MDSContext::vec waiters;
1230 frag_vec_t old_frags;
1231 bufferlist rollback;
1232 };
1233
1234 struct fragment_info_t {
1235 fragment_info_t() {}
1236 bool is_fragmenting() { return !resultfrags.empty(); }
1237 uint64_t get_tid() { return mdr ? mdr->reqid.tid : 0; }
1238 int bits;
1239 std::vector<CDir*> dirs;
1240 std::vector<CDir*> resultfrags;
1241 MDRequestRef mdr;
1242 set<mds_rank_t> notify_ack_waiting;
1243 bool finishing = false;
1244
1245 // for deadlock detection
1246 bool all_frozen = false;
1247 utime_t last_cum_auth_pins_change;
1248 int last_cum_auth_pins = 0;
1249 int num_remote_waiters = 0; // number of remote authpin waiters
1250 };
1251
1252 typedef map<dirfrag_t,fragment_info_t>::iterator fragment_info_iterator;
1253
1254 friend class EFragment;
1255 friend class C_MDC_FragmentFrozen;
1256 friend class C_MDC_FragmentMarking;
1257 friend class C_MDC_FragmentPrep;
1258 friend class C_MDC_FragmentStore;
1259 friend class C_MDC_FragmentCommit;
1260 friend class C_IO_MDC_FragmentPurgeOld;
1261
1262 // -- subtrees --
1263 static const unsigned int SUBTREES_COUNT_THRESHOLD = 5;
1264 static const unsigned int SUBTREES_DEPTH_THRESHOLD = 5;
1265
1266 CInode *get_stray() {
1267 return strays[stray_index];
1268 }
1269
1270 void identify_files_to_recover();
1271
1272 std::pair<bool, uint64_t> trim_lru(uint64_t count, expiremap& expiremap);
1273 bool trim_dentry(CDentry *dn, expiremap& expiremap);
1274 void trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap);
1275 bool trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap&);
1276 void send_expire_messages(expiremap& expiremap);
1277 void trim_non_auth(); // trim out trimmable non-auth items
1278
1279 void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
1280 std::vector<CDir*>* frags, MDSContext::vec& waiters, bool replay);
1281 void adjust_dir_fragments(CInode *diri,
1282 const std::vector<CDir*>& srcfrags,
1283 frag_t basefrag, int bits,
1284 std::vector<CDir*>* resultfrags,
1285 MDSContext::vec& waiters,
1286 bool replay);
1287 CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true);
1288 void get_force_dirfrag_bound_set(const vector<dirfrag_t>& dfs, set<CDir*>& bounds);
1289
1290 bool can_fragment(CInode *diri, const std::vector<CDir*>& dirs);
1291 void fragment_freeze_dirs(const std::vector<CDir*>& dirs);
1292 void fragment_mark_and_complete(MDRequestRef& mdr);
1293 void fragment_frozen(MDRequestRef& mdr, int r);
1294 void fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs);
1295 void fragment_drop_locks(fragment_info_t &info);
1296 void fragment_maybe_finish(const fragment_info_iterator& it);
1297 void dispatch_fragment_dir(MDRequestRef& mdr);
1298 void _fragment_logged(MDRequestRef& mdr);
1299 void _fragment_stored(MDRequestRef& mdr);
1300 void _fragment_committed(dirfrag_t f, const MDRequestRef& mdr);
1301 void _fragment_old_purged(dirfrag_t f, int bits, const MDRequestRef& mdr);
1302
1303 void handle_fragment_notify(const cref_t<MMDSFragmentNotify> &m);
1304 void handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &m);
1305
1306 void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frag,
1307 LogSegment *ls, bufferlist *rollback=NULL);
1308 void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
1309 void rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags);
1310
1311 uint64_t cache_memory_limit;
1312 double cache_reservation;
1313 double cache_health_threshold;
9f95a23c
TL
1314 std::array<CInode *, NUM_STRAY> strays{}; // my stray dir
1315
f6b5b4d7
TL
1316 bool export_ephemeral_distributed_config;
1317 bool export_ephemeral_random_config;
1318
9f95a23c
TL
1319 // File size recovery
1320 RecoveryQueue recovery_queue;
1321
1322 // shutdown
1323 set<inodeno_t> shutdown_exporting_strays;
1324 pair<dirfrag_t, string> shutdown_export_next;
1325
1326 bool opening_root = false, open = false;
1327 MDSContext::vec waiting_for_open;
1328
1329 // -- snaprealms --
1330 SnapRealm *global_snaprealm = nullptr;
1331
1332 map<dirfrag_t, ufragment> uncommitted_fragments;
1333
1334 map<dirfrag_t,fragment_info_t> fragments;
1335
1336 DecayCounter trim_counter;
1337
eafe8130
TL
1338 std::thread upkeeper;
1339 ceph::mutex upkeep_mutex = ceph::make_mutex("MDCache::upkeep_mutex");
1340 ceph::condition_variable upkeep_cvar;
1341 time upkeep_last_trim = time::min();
92f5a8d4 1342 time upkeep_last_release = time::min();
eafe8130 1343 std::atomic<bool> upkeep_trim_shutdown{false};
7c673cae
FG
1344};
1345
1346class C_MDS_RetryRequest : public MDSInternalContext {
1347 MDCache *cache;
1348 MDRequestRef mdr;
1349 public:
1350 C_MDS_RetryRequest(MDCache *c, MDRequestRef& r);
1351 void finish(int r) override;
1352};
1353
1354#endif