]>
Commit | Line | Data |
---|---|---|
11fdf7f2 | 1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
7c673cae FG |
2 | // vim: ts=8 sw=2 smarttab |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
7c673cae FG |
14 | #ifndef CEPH_MDCACHE_H |
15 | #define CEPH_MDCACHE_H | |
16 | ||
eafe8130 | 17 | #include <atomic> |
11fdf7f2 | 18 | #include <string_view> |
eafe8130 | 19 | #include <thread> |
94b18763 | 20 | |
a8e16298 | 21 | #include "common/DecayCounter.h" |
9f95a23c | 22 | #include "include/common_fwd.h" |
7c673cae FG |
23 | #include "include/types.h" |
24 | #include "include/filepath.h" | |
25 | #include "include/elist.h" | |
26 | ||
11fdf7f2 TL |
27 | #include "messages/MCacheExpire.h" |
28 | #include "messages/MClientQuota.h" | |
29 | #include "messages/MClientRequest.h" | |
30 | #include "messages/MClientSnap.h" | |
31 | #include "messages/MDentryLink.h" | |
32 | #include "messages/MDentryUnlink.h" | |
33 | #include "messages/MDirUpdate.h" | |
34 | #include "messages/MDiscover.h" | |
35 | #include "messages/MDiscoverReply.h" | |
36 | #include "messages/MGatherCaps.h" | |
37 | #include "messages/MGenericMessage.h" | |
38 | #include "messages/MInodeFileCaps.h" | |
39 | #include "messages/MLock.h" | |
40 | #include "messages/MMDSCacheRejoin.h" | |
41 | #include "messages/MMDSFindIno.h" | |
42 | #include "messages/MMDSFindInoReply.h" | |
43 | #include "messages/MMDSFragmentNotify.h" | |
44 | #include "messages/MMDSFragmentNotifyAck.h" | |
45 | #include "messages/MMDSOpenIno.h" | |
46 | #include "messages/MMDSOpenInoReply.h" | |
47 | #include "messages/MMDSResolve.h" | |
48 | #include "messages/MMDSResolveAck.h" | |
f67539c2 | 49 | #include "messages/MMDSPeerRequest.h" |
11fdf7f2 TL |
50 | #include "messages/MMDSSnapUpdate.h" |
51 | ||
7c673cae FG |
52 | #include "osdc/Filer.h" |
53 | #include "CInode.h" | |
54 | #include "CDentry.h" | |
55 | #include "CDir.h" | |
56 | #include "include/Context.h" | |
57 | #include "events/EMetaBlob.h" | |
58 | #include "RecoveryQueue.h" | |
59 | #include "StrayManager.h" | |
11fdf7f2 | 60 | #include "OpenFileTable.h" |
7c673cae FG |
61 | #include "MDSContext.h" |
62 | #include "MDSMap.h" | |
63 | #include "Mutation.h" | |
64 | ||
7c673cae FG |
65 | class MDSRank; |
66 | class Session; | |
67 | class Migrator; | |
68 | ||
7c673cae FG |
69 | class Session; |
70 | ||
7c673cae FG |
71 | class ESubtreeMap; |
72 | ||
// Counter identifiers for the MDS cache. Values are assigned sequentially
// from l_mdc_first, so enumerators must keep their relative order.
enum {
  l_mdc_first = 3000,

  // ---- directory updates for replication ----
  l_mdc_dir_update,
  l_mdc_dir_update_receipt,
  l_mdc_dir_try_discover,
  l_mdc_dir_send_discover,
  l_mdc_dir_handle_discover,

  // ---- strays: current state ----
  l_mdc_num_strays,            // inodes currently in stray dentries
  l_mdc_num_strays_delayed,    // stray dentries whose purge is delayed due to refs
  l_mdc_num_strays_enqueuing,  // stray dentries currently being enqueued for purge

  // ---- strays: running totals ----
  l_mdc_strays_created,        // dentries ever added to a stray dir
  l_mdc_strays_enqueued,       // dentries passed on to PurgeQueue
  l_mdc_strays_reintegrated,   // strays that have been reintegrated
  l_mdc_strays_migrated,       // strays that have been migrated

  // ---- inode size recovery ----
  l_mdc_num_recovering_processing,   // inode sizes currently being recovered
  l_mdc_num_recovering_enqueued,     // inodes waiting to have size recovered
  l_mdc_num_recovering_prioritized,  // inodes waiting with elevated priority
  l_mdc_recovery_started,            // inodes that ever started size recovery
  l_mdc_recovery_completed,          // inodes that ever completed size recovery

  // NOTE(review): the l_mdss_ireq_* names suggest per-type internal request
  // counters -- confirm against register_perfcounters().
  l_mdss_ireq_enqueue_scrub,
  l_mdss_ireq_exportdir,
  l_mdss_ireq_flush,
  l_mdss_ireq_fragmentdir,
  l_mdss_ireq_fragstats,
  l_mdss_ireq_inodestats,

  l_mdc_last,
};
119 | ||
9f95a23c TL |
// Bit flags for path_traverse(). Each constant occupies a distinct single
// bit so flags can be OR-ed together. constexpr guarantees compile-time
// evaluation while keeping the internal linkage of the previous
// `static const int` definitions.
static constexpr int MDS_TRAVERSE_DISCOVER        = (1 << 0);
static constexpr int MDS_TRAVERSE_PATH_LOCKED     = (1 << 1);
static constexpr int MDS_TRAVERSE_WANT_DENTRY     = (1 << 2);
static constexpr int MDS_TRAVERSE_WANT_AUTH       = (1 << 3);
static constexpr int MDS_TRAVERSE_RDLOCK_SNAP     = (1 << 4);
static constexpr int MDS_TRAVERSE_RDLOCK_SNAP2    = (1 << 5);
static constexpr int MDS_TRAVERSE_WANT_DIRLAYOUT  = (1 << 6);
static constexpr int MDS_TRAVERSE_RDLOCK_PATH     = (1 << 7);
static constexpr int MDS_TRAVERSE_XLOCK_DENTRY    = (1 << 8);
static constexpr int MDS_TRAVERSE_RDLOCK_AUTHLOCK = (1 << 9);
static constexpr int MDS_TRAVERSE_CHECK_LOCKCACHE = (1 << 10);
static constexpr int MDS_TRAVERSE_WANT_INODE      = (1 << 11);
9f95a23c | 133 | |
7c673cae FG |
134 | |
// Bit flags for predirty_journal_parents(). constexpr keeps the internal
// linkage of the previous `static const int` definitions.
static constexpr int PREDIRTY_PRIMARY = 1;  // primary dn, adjust nested accounting
static constexpr int PREDIRTY_DIR     = 2;  // update parent dir mtime/size
static constexpr int PREDIRTY_SHALLOW = 4;  // only go to immediate parent (for easier rollback)
139 | ||
140 | class MDCache { | |
141 | public: | |
9f95a23c TL |
142 | typedef std::map<mds_rank_t, ref_t<MCacheExpire>> expiremap; |
143 | ||
91327a77 AA |
144 | using clock = ceph::coarse_mono_clock; |
145 | using time = ceph::coarse_mono_time; | |
146 | ||
9f95a23c TL |
147 | // -- discover -- |
148 | struct discover_info_t { | |
149 | discover_info_t() {} | |
150 | ~discover_info_t() { | |
151 | if (basei) | |
152 | basei->put(MDSCacheObject::PIN_DISCOVERBASE); | |
153 | } | |
154 | void pin_base(CInode *b) { | |
155 | basei = b; | |
156 | basei->get(MDSCacheObject::PIN_DISCOVERBASE); | |
157 | } | |
7c673cae | 158 | |
9f95a23c TL |
159 | ceph_tid_t tid = 0; |
160 | mds_rank_t mds = -1; | |
161 | inodeno_t ino; | |
162 | frag_t frag; | |
163 | snapid_t snap = CEPH_NOSNAP; | |
164 | filepath want_path; | |
165 | CInode *basei = nullptr; | |
166 | bool want_base_dir = false; | |
167 | bool path_locked = false; | |
168 | }; | |
7c673cae | 169 | |
9f95a23c TL |
170 | // [reconnect/rejoin caps] |
171 | struct reconnected_cap_info_t { | |
172 | reconnected_cap_info_t() {} | |
173 | inodeno_t realm_ino = 0; | |
174 | snapid_t snap_follows = 0; | |
175 | int dirty_caps = 0; | |
176 | bool snapflush = 0; | |
177 | }; | |
7c673cae | 178 | |
9f95a23c TL |
179 | // -- find_ino_peer -- |
180 | struct find_ino_peer_info_t { | |
181 | find_ino_peer_info_t() {} | |
182 | inodeno_t ino; | |
183 | ceph_tid_t tid = 0; | |
184 | MDSContext *fin = nullptr; | |
185 | bool path_locked = false; | |
186 | mds_rank_t hint = MDS_RANK_NONE; | |
187 | mds_rank_t checking = MDS_RANK_NONE; | |
20effc67 | 188 | std::set<mds_rank_t> checked; |
9f95a23c | 189 | }; |
7c673cae | 190 | |
9f95a23c TL |
191 | friend class C_MDC_RejoinOpenInoFinish; |
192 | friend class C_MDC_RejoinSessionsOpened; | |
7c673cae | 193 | |
9f95a23c TL |
194 | friend class Locker; |
195 | friend class Migrator; | |
196 | friend class MDBalancer; | |
7c673cae | 197 | |
9f95a23c TL |
198 | // StrayManager needs to be able to remove_inode() from us |
199 | // when it is done purging | |
200 | friend class StrayManager; | |
7c673cae | 201 | |
9f95a23c TL |
202 | explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_); |
203 | ~MDCache(); | |
91327a77 | 204 | |
05a536ef TL |
205 | void insert_taken_inos(inodeno_t ino) { |
206 | replay_taken_inos.insert(ino); | |
207 | } | |
208 | void clear_taken_inos(inodeno_t ino) { | |
209 | replay_taken_inos.erase(ino); | |
210 | } | |
211 | bool test_and_clear_taken_inos(inodeno_t ino) { | |
212 | return replay_taken_inos.erase(ino) != 0; | |
213 | } | |
214 | bool is_taken_inos_empty(void) { | |
215 | return replay_taken_inos.empty(); | |
216 | } | |
217 | ||
91327a77 AA |
218 | uint64_t cache_limit_memory(void) { |
219 | return cache_memory_limit; | |
181888fb FG |
220 | } |
221 | double cache_toofull_ratio(void) const { | |
91327a77 | 222 | double memory_reserve = cache_memory_limit*(1.0-cache_reservation); |
9f95a23c | 223 | return fmax(0.0, (cache_size()-memory_reserve)/memory_reserve); |
181888fb FG |
224 | } |
225 | bool cache_toofull(void) const { | |
226 | return cache_toofull_ratio() > 0.0; | |
227 | } | |
228 | uint64_t cache_size(void) const { | |
229 | return mempool::get_pool(mempool::mds_co::id).allocated_bytes(); | |
230 | } | |
231 | bool cache_overfull(void) const { | |
9f95a23c | 232 | return cache_size() > cache_memory_limit*cache_health_threshold; |
181888fb FG |
233 | } |
234 | ||
f67539c2 | 235 | void advance_stray(); |
7c673cae | 236 | |
f67539c2 TL |
237 | unsigned get_ephemeral_dist_frag_bits() const { |
238 | return export_ephemeral_dist_frag_bits; | |
239 | } | |
f6b5b4d7 TL |
240 | bool get_export_ephemeral_distributed_config(void) const { |
241 | return export_ephemeral_distributed_config; | |
242 | } | |
243 | ||
244 | bool get_export_ephemeral_random_config(void) const { | |
245 | return export_ephemeral_random_config; | |
246 | } | |
247 | ||
20effc67 TL |
248 | bool get_symlink_recovery(void) const { |
249 | return symlink_recovery; | |
250 | } | |
251 | ||
7c673cae FG |
252 | /** |
253 | * Call this when you know that a CDentry is ready to be passed | |
254 | * on to StrayManager (i.e. this is a stray you've just created) | |
255 | */ | |
256 | void notify_stray(CDentry *dn) { | |
11fdf7f2 | 257 | ceph_assert(dn->get_dir()->get_inode()->is_stray()); |
a8e16298 TL |
258 | if (dn->state_test(CDentry::STATE_PURGING)) |
259 | return; | |
260 | ||
7c673cae FG |
261 | stray_manager.eval_stray(dn); |
262 | } | |
263 | ||
f67539c2 | 264 | mds_rank_t hash_into_rank_bucket(inodeno_t ino, frag_t fg=0); |
f6b5b4d7 | 265 | |
7c673cae | 266 | void maybe_eval_stray(CInode *in, bool delay=false); |
31f18b77 FG |
267 | void clear_dirty_bits_for_stray(CInode* diri); |
268 | ||
7c673cae FG |
269 | bool is_readonly() { return readonly; } |
270 | void force_readonly(); | |
271 | ||
7c673cae FG |
272 | static file_layout_t gen_default_file_layout(const MDSMap &mdsmap); |
273 | static file_layout_t gen_default_log_layout(const MDSMap &mdsmap); | |
274 | ||
7c673cae FG |
275 | void register_perfcounters(); |
276 | ||
7c673cae FG |
277 | void touch_client_lease(ClientLease *r, int pool, utime_t ttl) { |
278 | client_leases[pool].push_back(&r->item_lease); | |
279 | r->ttl = ttl; | |
280 | } | |
281 | ||
282 | void notify_stray_removed() | |
283 | { | |
284 | stray_manager.notify_stray_removed(); | |
285 | } | |
286 | ||
287 | void notify_stray_created() | |
288 | { | |
289 | stray_manager.notify_stray_created(); | |
290 | } | |
291 | ||
31f18b77 FG |
292 | void eval_remote(CDentry *dn) |
293 | { | |
294 | stray_manager.eval_remote(dn); | |
295 | } | |
296 | ||
7c673cae FG |
297 | void _send_discover(discover_info_t& dis); |
298 | discover_info_t& _create_discover(mds_rank_t mds) { | |
299 | ceph_tid_t t = ++discover_last_tid; | |
300 | discover_info_t& d = discovers[t]; | |
301 | d.tid = t; | |
302 | d.mds = mds; | |
303 | return d; | |
304 | } | |
305 | ||
11fdf7f2 TL |
306 | void discover_base_ino(inodeno_t want_ino, MDSContext *onfinish, mds_rank_t from=MDS_RANK_NONE); |
307 | void discover_dir_frag(CInode *base, frag_t approx_fg, MDSContext *onfinish, | |
7c673cae | 308 | mds_rank_t from=MDS_RANK_NONE); |
11fdf7f2 | 309 | void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSContext *onfinish, |
9f95a23c | 310 | bool path_locked=false, mds_rank_t from=MDS_RANK_NONE); |
11fdf7f2 | 311 | void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSContext *onfinish, |
9f95a23c | 312 | bool path_locked=false); |
7c673cae FG |
313 | void kick_discovers(mds_rank_t who); // after a failure. |
314 | ||
7c673cae FG |
315 | // adjust subtree auth specification |
316 | // dir->dir_auth | |
317 | // imports/exports/nested_exports | |
318 | // join/split subtrees as appropriate | |
7c673cae | 319 | bool is_subtrees() { return !subtrees.empty(); } |
11fdf7f2 TL |
320 | template<typename T> |
321 | void get_subtrees(T& c) { | |
322 | if constexpr (std::is_same_v<T, std::vector<CDir*>>) | |
323 | c.reserve(c.size() + subtrees.size()); | |
324 | for (const auto& p : subtrees) { | |
325 | c.push_back(p.first); | |
326 | } | |
327 | } | |
28e407b8 | 328 | void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool adjust_pop=true); |
224ce89b WB |
329 | void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) { |
330 | adjust_subtree_auth(root, mds_authority_t(a,b)); | |
7c673cae | 331 | } |
20effc67 TL |
332 | void adjust_bounded_subtree_auth(CDir *dir, const std::set<CDir*>& bounds, mds_authority_t auth); |
333 | void adjust_bounded_subtree_auth(CDir *dir, const std::set<CDir*>& bounds, mds_rank_t a) { | |
7c673cae FG |
334 | adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN)); |
335 | } | |
20effc67 TL |
336 | void adjust_bounded_subtree_auth(CDir *dir, const std::vector<dirfrag_t>& bounds, const mds_authority_t &auth); |
337 | void adjust_bounded_subtree_auth(CDir *dir, const std::vector<dirfrag_t>& bounds, mds_rank_t a) { | |
7c673cae FG |
338 | adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN)); |
339 | } | |
20effc67 | 340 | void map_dirfrag_set(const std::list<dirfrag_t>& dfs, std::set<CDir*>& result); |
7c673cae | 341 | void try_subtree_merge(CDir *root); |
20effc67 | 342 | void try_subtree_merge_at(CDir *root, std::set<CInode*> *to_eval, bool adjust_pop=true); |
7c673cae FG |
343 | void eval_subtree_root(CInode *diri); |
344 | CDir *get_subtree_root(CDir *dir); | |
345 | CDir *get_projected_subtree_root(CDir *dir); | |
346 | bool is_leaf_subtree(CDir *dir) { | |
11fdf7f2 | 347 | ceph_assert(subtrees.count(dir)); |
7c673cae FG |
348 | return subtrees[dir].empty(); |
349 | } | |
350 | void remove_subtree(CDir *dir); | |
351 | bool is_subtree(CDir *root) { | |
352 | return subtrees.count(root); | |
353 | } | |
20effc67 TL |
354 | void get_subtree_bounds(CDir *root, std::set<CDir*>& bounds); |
355 | void get_wouldbe_subtree_bounds(CDir *root, std::set<CDir*>& bounds); | |
356 | void verify_subtree_bounds(CDir *root, const std::set<CDir*>& bounds); | |
357 | void verify_subtree_bounds(CDir *root, const std::list<dirfrag_t>& bounds); | |
7c673cae FG |
358 | |
359 | void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir); | |
224ce89b | 360 | void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop); |
7c673cae | 361 | |
11fdf7f2 TL |
362 | auto get_auth_subtrees() { |
363 | std::vector<CDir*> c; | |
364 | for (auto& p : subtrees) { | |
365 | auto& root = p.first; | |
366 | if (root->is_auth()) { | |
367 | c.push_back(root); | |
368 | } | |
369 | } | |
370 | return c; | |
371 | } | |
7c673cae | 372 | |
11fdf7f2 TL |
373 | auto get_fullauth_subtrees() { |
374 | std::vector<CDir*> c; | |
375 | for (auto& p : subtrees) { | |
376 | auto& root = p.first; | |
377 | if (root->is_full_dir_auth()) { | |
378 | c.push_back(root); | |
379 | } | |
380 | } | |
381 | return c; | |
382 | } | |
383 | auto num_subtrees_fullauth() const { | |
384 | std::size_t n = 0; | |
385 | for (auto& p : subtrees) { | |
386 | auto& root = p.first; | |
387 | if (root->is_full_dir_auth()) { | |
388 | ++n; | |
389 | } | |
390 | } | |
391 | return n; | |
392 | } | |
7c673cae | 393 | |
11fdf7f2 TL |
394 | auto num_subtrees_fullnonauth() const { |
395 | std::size_t n = 0; | |
396 | for (auto& p : subtrees) { | |
397 | auto& root = p.first; | |
398 | if (root->is_full_dir_nonauth()) { | |
399 | ++n; | |
400 | } | |
401 | } | |
402 | return n; | |
403 | } | |
7c673cae | 404 | |
11fdf7f2 TL |
405 | auto num_subtrees() const { |
406 | return subtrees.size(); | |
407 | } | |
7c673cae | 408 | |
7c673cae FG |
409 | int get_num_client_requests(); |
410 | ||
9f95a23c | 411 | MDRequestRef request_start(const cref_t<MClientRequest>& req); |
f67539c2 | 412 | MDRequestRef request_start_peer(metareqid_t rid, __u32 attempt, const cref_t<Message> &m); |
7c673cae FG |
413 | MDRequestRef request_start_internal(int op); |
414 | bool have_request(metareqid_t rid) { | |
415 | return active_requests.count(rid); | |
416 | } | |
417 | MDRequestRef request_get(metareqid_t rid); | |
20effc67 | 418 | void request_pin_ref(MDRequestRef& r, CInode *ref, std::vector<CDentry*>& trace); |
7c673cae FG |
419 | void request_finish(MDRequestRef& mdr); |
420 | void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0); | |
421 | void dispatch_request(MDRequestRef& mdr); | |
422 | void request_drop_foreign_locks(MDRequestRef& mdr); | |
423 | void request_drop_non_rdlocks(MDRequestRef& r); | |
424 | void request_drop_locks(MDRequestRef& r); | |
425 | void request_cleanup(MDRequestRef& r); | |
426 | ||
427 | void request_kill(MDRequestRef& r); // called when session closes | |
428 | ||
429 | // journal/snap helpers | |
430 | CInode *pick_inode_snap(CInode *in, snapid_t follows); | |
431 | CInode *cow_inode(CInode *in, snapid_t last); | |
432 | void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn, | |
433 | snapid_t follows=CEPH_NOSNAP, | |
434 | CInode **pcow_inode=0, CDentry::linkage_t *dnl=0); | |
7c673cae FG |
435 | void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP); |
436 | ||
f67539c2 TL |
437 | void project_rstat_inode_to_frag(const MutationRef& mut, |
438 | CInode *cur, CDir *parent, snapid_t first, | |
7c673cae | 439 | int linkunlink, SnapRealm *prealm); |
f67539c2 | 440 | void _project_rstat_inode_to_frag(const CInode::mempool_inode* inode, snapid_t ofirst, snapid_t last, |
7c673cae | 441 | CDir *parent, int linkunlink, bool update_inode); |
f67539c2 TL |
442 | void project_rstat_frag_to_inode(const nest_info_t& rstat, const nest_info_t& accounted_rstat, |
443 | snapid_t ofirst, snapid_t last, CInode *pin, bool cow_head); | |
a8e16298 | 444 | void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1, bool quota_change = false); |
7c673cae FG |
445 | void predirty_journal_parents(MutationRef mut, EMetaBlob *blob, |
446 | CInode *in, CDir *parent, | |
447 | int flags, int linkunlink=0, | |
448 | snapid_t follows=CEPH_NOSNAP); | |
449 | ||
f67539c2 | 450 | // peers |
20effc67 | 451 | void add_uncommitted_leader(metareqid_t reqid, LogSegment *ls, std::set<mds_rank_t> &peers, bool safe=false) { |
f67539c2 TL |
452 | uncommitted_leaders[reqid].ls = ls; |
453 | uncommitted_leaders[reqid].peers = peers; | |
454 | uncommitted_leaders[reqid].safe = safe; | |
7c673cae | 455 | } |
f67539c2 TL |
456 | void wait_for_uncommitted_leader(metareqid_t reqid, MDSContext *c) { |
457 | uncommitted_leaders[reqid].waiters.push_back(c); | |
7c673cae | 458 | } |
f67539c2 TL |
459 | bool have_uncommitted_leader(metareqid_t reqid, mds_rank_t from) { |
460 | auto p = uncommitted_leaders.find(reqid); | |
461 | return p != uncommitted_leaders.end() && p->second.peers.count(from) > 0; | |
7c673cae | 462 | } |
f67539c2 TL |
463 | void log_leader_commit(metareqid_t reqid); |
464 | void logged_leader_update(metareqid_t reqid); | |
465 | void _logged_leader_commit(metareqid_t reqid); | |
466 | void committed_leader_peer(metareqid_t r, mds_rank_t from); | |
467 | void finish_committed_leaders(); | |
7c673cae | 468 | |
f67539c2 TL |
469 | void add_uncommitted_peer(metareqid_t reqid, LogSegment*, mds_rank_t, MDPeerUpdate *su=nullptr); |
470 | void wait_for_uncommitted_peer(metareqid_t reqid, MDSContext *c) { | |
471 | uncommitted_peers.at(reqid).waiters.push_back(c); | |
e306af50 | 472 | } |
f67539c2 TL |
473 | void finish_uncommitted_peer(metareqid_t reqid, bool assert_exist=true); |
474 | MDPeerUpdate* get_uncommitted_peer(metareqid_t reqid, mds_rank_t leader); | |
475 | void _logged_peer_commit(mds_rank_t from, metareqid_t reqid); | |
7c673cae | 476 | |
20effc67 | 477 | void set_recovery_set(std::set<mds_rank_t>& s); |
7c673cae FG |
478 | void handle_mds_failure(mds_rank_t who); |
479 | void handle_mds_recovery(mds_rank_t who); | |
480 | ||
7c673cae FG |
481 | void recalc_auth_bits(bool replay); |
482 | void remove_inode_recursive(CInode *in); | |
483 | ||
f67539c2 TL |
484 | bool is_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) { |
485 | auto p = ambiguous_peer_updates.find(leader); | |
486 | return p != ambiguous_peer_updates.end() && p->second.count(reqid); | |
7c673cae | 487 | } |
f67539c2 TL |
488 | void add_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) { |
489 | ambiguous_peer_updates[leader].insert(reqid); | |
7c673cae | 490 | } |
f67539c2 TL |
491 | void remove_ambiguous_peer_update(metareqid_t reqid, mds_rank_t leader) { |
492 | auto p = ambiguous_peer_updates.find(leader); | |
7c673cae | 493 | auto q = p->second.find(reqid); |
11fdf7f2 | 494 | ceph_assert(q != p->second.end()); |
7c673cae FG |
495 | p->second.erase(q); |
496 | if (p->second.empty()) | |
f67539c2 | 497 | ambiguous_peer_updates.erase(p); |
7c673cae FG |
498 | } |
499 | ||
f67539c2 TL |
500 | void add_rollback(metareqid_t reqid, mds_rank_t leader) { |
501 | resolve_need_rollback[reqid] = leader; | |
7c673cae | 502 | } |
e306af50 | 503 | void finish_rollback(metareqid_t reqid, MDRequestRef& mdr); |
7c673cae FG |
504 | |
505 | // ambiguous imports | |
20effc67 TL |
506 | void add_ambiguous_import(dirfrag_t base, const std::vector<dirfrag_t>& bounds); |
507 | void add_ambiguous_import(CDir *base, const std::set<CDir*>& bounds); | |
7c673cae FG |
508 | bool have_ambiguous_import(dirfrag_t base) { |
509 | return my_ambiguous_imports.count(base); | |
510 | } | |
20effc67 | 511 | void get_ambiguous_import_bounds(dirfrag_t base, std::vector<dirfrag_t>& bounds) { |
11fdf7f2 | 512 | ceph_assert(my_ambiguous_imports.count(base)); |
7c673cae FG |
513 | bounds = my_ambiguous_imports[base]; |
514 | } | |
515 | void cancel_ambiguous_import(CDir *); | |
516 | void finish_ambiguous_import(dirfrag_t dirino); | |
11fdf7f2 | 517 | void resolve_start(MDSContext *resolve_done_); |
7c673cae | 518 | void send_resolves(); |
7c673cae FG |
519 | void maybe_send_pending_resolves() { |
520 | if (resolves_pending) | |
521 | send_subtree_resolves(); | |
522 | } | |
523 | ||
524 | void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent, | |
20effc67 | 525 | std::map<dirfrag_t,std::vector<dirfrag_t> >& subtrees); |
7c673cae FG |
526 | ESubtreeMap *create_subtree_map(); |
527 | ||
7c673cae | 528 | void clean_open_file_lists(); |
11fdf7f2 TL |
529 | void dump_openfiles(Formatter *f); |
530 | bool dump_inode(Formatter *f, uint64_t number); | |
7c673cae | 531 | |
11fdf7f2 | 532 | void rejoin_start(MDSContext *rejoin_done_); |
7c673cae FG |
533 | void rejoin_gather_finish(); |
534 | void rejoin_send_rejoins(); | |
535 | void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr, | |
11fdf7f2 | 536 | int target=-1, bool drop_path=false) { |
28e407b8 AA |
537 | auto& ex = cap_exports[ino]; |
538 | ex.first = target; | |
11fdf7f2 TL |
539 | auto &_icr = ex.second[client] = icr; |
540 | if (drop_path) | |
541 | _icr.path.clear(); | |
7c673cae FG |
542 | } |
543 | void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr, | |
11fdf7f2 TL |
544 | mds_rank_t frommds=MDS_RANK_NONE, bool drop_path=false) { |
545 | auto &_icr = cap_imports[ino][client][frommds] = icr; | |
546 | if (drop_path) | |
547 | _icr.path.clear(); | |
7c673cae | 548 | } |
28e407b8 AA |
549 | void rejoin_recovered_client(client_t client, const entity_inst_t& inst) { |
550 | rejoin_client_map.emplace(client, inst); | |
551 | } | |
11fdf7f2 TL |
552 | bool rejoin_has_cap_reconnect(inodeno_t ino) const { |
553 | return cap_imports.count(ino); | |
554 | } | |
555 | void add_replay_ino_alloc(inodeno_t ino) { | |
556 | cap_imports_missing.insert(ino); // avoid opening ino during cache rejoin | |
557 | } | |
7c673cae FG |
558 | const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) { |
559 | if (cap_imports.count(ino) && | |
560 | cap_imports[ino].count(client) && | |
561 | cap_imports[ino][client].count(MDS_RANK_NONE)) { | |
562 | return &cap_imports[ino][client][MDS_RANK_NONE]; | |
563 | } | |
564 | return NULL; | |
565 | } | |
566 | void remove_replay_cap_reconnect(inodeno_t ino, client_t client) { | |
11fdf7f2 TL |
567 | ceph_assert(cap_imports[ino].size() == 1); |
568 | ceph_assert(cap_imports[ino][client].size() == 1); | |
7c673cae FG |
569 | cap_imports.erase(ino); |
570 | } | |
11fdf7f2 | 571 | void wait_replay_cap_reconnect(inodeno_t ino, MDSContext *c) { |
7c673cae FG |
572 | cap_reconnect_waiters[ino].push_back(c); |
573 | } | |
574 | ||
7c673cae FG |
575 | void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) { |
576 | reconnected_cap_info_t &info = reconnected_caps[ino][client]; | |
577 | info.realm_ino = inodeno_t(icr.capinfo.snaprealm); | |
578 | info.snap_follows = icr.snap_follows; | |
579 | } | |
11fdf7f2 | 580 | void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty, bool snapflush) { |
7c673cae FG |
581 | reconnected_cap_info_t &info = reconnected_caps[ino][client]; |
582 | info.dirty_caps |= dirty; | |
11fdf7f2 TL |
583 | if (snapflush) |
584 | info.snapflush = snapflush; | |
7c673cae FG |
585 | } |
586 | void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) { | |
587 | reconnected_snaprealms[ino][client] = seq; | |
588 | } | |
589 | ||
7c673cae | 590 | void rejoin_open_ino_finish(inodeno_t ino, int ret); |
11fdf7f2 | 591 | void rejoin_prefetch_ino_finish(inodeno_t ino, int ret); |
20effc67 | 592 | void rejoin_open_sessions_finish(std::map<client_t,std::pair<Session*,uint64_t> >& session_map); |
7c673cae FG |
593 | bool process_imported_caps(); |
594 | void choose_lock_states_and_reconnect_caps(); | |
595 | void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino, | |
20effc67 TL |
596 | std::map<client_t,ref_t<MClientSnap>>& splits); |
597 | void prepare_realm_merge(SnapRealm *realm, SnapRealm *parent_realm, std::map<client_t,ref_t<MClientSnap>>& splits); | |
598 | void send_snaps(std::map<client_t,ref_t<MClientSnap>>& splits); | |
7c673cae | 599 | Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds); |
11fdf7f2 | 600 | void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq, |
20effc67 | 601 | std::map<client_t,ref_t<MClientSnap>>& updates); |
a8e16298 | 602 | Capability* try_reconnect_cap(CInode *in, Session *session); |
7c673cae FG |
603 | void export_remaining_imported_caps(); |
604 | ||
7c673cae FG |
605 | void do_cap_import(Session *session, CInode *in, Capability *cap, |
606 | uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq, | |
607 | int peer, int p_flags); | |
608 | void do_delayed_cap_imports(); | |
609 | void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client, | |
610 | snapid_t snap_follows); | |
11fdf7f2 | 611 | void open_snaprealms(); |
7c673cae FG |
612 | |
613 | bool open_undef_inodes_dirfrags(); | |
614 | void opened_undef_inode(CInode *in); | |
615 | void opened_undef_dirfrag(CDir *dir) { | |
616 | rejoin_undef_dirfrags.erase(dir); | |
617 | } | |
618 | ||
619 | void reissue_all_caps(); | |
7c673cae | 620 | |
7c673cae FG |
621 | void start_files_to_recover(); |
622 | void do_file_recover(); | |
623 | void queue_file_recover(CInode *in); | |
624 | void _queued_file_recover_cow(CInode *in, MutationRef& mut); | |
625 | ||
92f5a8d4 | 626 | void handle_conf_change(const std::set<std::string>& changed, const MDSMap& mds_map); |
7c673cae FG |
627 | |
628 | // debug | |
629 | void log_stat(); | |
630 | ||
631 | // root inode | |
632 | CInode *get_root() { return root; } | |
633 | CInode *get_myin() { return myin; } | |
634 | ||
7c673cae FG |
635 | size_t get_cache_size() { return lru.lru_get_size(); } |
636 | ||
637 | // trimming | |
a8e16298 | 638 | std::pair<bool, uint64_t> trim(uint64_t count=0); |
9f95a23c | 639 | |
7c673cae FG |
640 | bool trim_non_auth_subtree(CDir *directory); |
641 | void standby_trim_segment(LogSegment *ls); | |
642 | void try_trim_non_auth_subtree(CDir *dir); | |
643 | bool can_trim_non_auth_dirfrag(CDir *dir) { | |
644 | return my_ambiguous_imports.count((dir)->dirfrag()) == 0 && | |
f67539c2 | 645 | uncommitted_peer_rename_olddir.count(dir->inode) == 0; |
7c673cae FG |
646 | } |
647 | ||
648 | /** | |
649 | * For all unreferenced inodes, dirs, dentries below an inode, compose | |
650 | * expiry messages. This is used when giving up all replicas of entities | |
651 | * for an MDS peer in the 'stopping' state, such that the peer can | |
652 | * empty its cache and finish shutting down. | |
653 | * | |
654 | * We have to make sure we're only expiring un-referenced items to | |
655 | * avoid interfering with ongoing stray-movement (we can't distinguish | |
656 | * between the "moving my strays" and "waiting for my cache to empty" | |
657 | * phases within 'stopping') | |
658 | * | |
659 | * @return false if we completed cleanly, true if caller should stop | |
660 | * expiring because we hit something with refs. | |
661 | */ | |
11fdf7f2 | 662 | bool expire_recursive(CInode *in, expiremap& expiremap); |
7c673cae FG |
663 | |
664 | void trim_client_leases(); | |
665 | void check_memory_usage(); | |
666 | ||
7c673cae FG |
667 | void shutdown_start(); |
668 | void shutdown_check(); | |
669 | bool shutdown_pass(); | |
7c673cae | 670 | bool shutdown(); // clear cache (i.e. at shutdown) |
f64942e4 AA |
671 | bool shutdown_export_strays(); |
672 | void shutdown_export_stray_finish(inodeno_t ino) { | |
673 | if (shutdown_exporting_strays.erase(ino)) | |
674 | shutdown_export_strays(); | |
675 | } | |
7c673cae | 676 | |
7c673cae FG |
677 | // inode_map |
678 | bool have_inode(vinodeno_t vino) { | |
b32b8144 FG |
679 | if (vino.snapid == CEPH_NOSNAP) |
680 | return inode_map.count(vino.ino) ? true : false; | |
681 | else | |
682 | return snap_inode_map.count(vino) ? true : false; | |
7c673cae FG |
683 | } |
684 | bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) { | |
685 | return have_inode(vinodeno_t(ino, snap)); | |
686 | } | |
687 | CInode* get_inode(vinodeno_t vino) { | |
b32b8144 FG |
688 | if (vino.snapid == CEPH_NOSNAP) { |
689 | auto p = inode_map.find(vino.ino); | |
690 | if (p != inode_map.end()) | |
691 | return p->second; | |
692 | } else { | |
693 | auto p = snap_inode_map.find(vino); | |
694 | if (p != snap_inode_map.end()) | |
695 | return p->second; | |
696 | } | |
7c673cae FG |
697 | return NULL; |
698 | } | |
699 | CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) { | |
700 | return get_inode(vinodeno_t(ino, s)); | |
701 | } | |
11fdf7f2 TL |
702 | CInode* lookup_snap_inode(vinodeno_t vino) { |
703 | auto p = snap_inode_map.lower_bound(vino); | |
704 | if (p != snap_inode_map.end() && | |
705 | p->second->ino() == vino.ino && p->second->first <= vino.snapid) | |
706 | return p->second; | |
707 | return NULL; | |
708 | } | |
7c673cae FG |
709 | |
710 | CDir* get_dirfrag(dirfrag_t df) { | |
711 | CInode *in = get_inode(df.ino); | |
712 | if (!in) | |
713 | return NULL; | |
714 | return in->get_dirfrag(df.frag); | |
715 | } | |
11fdf7f2 | 716 | CDir* get_dirfrag(inodeno_t ino, std::string_view dn) { |
7c673cae FG |
717 | CInode *in = get_inode(ino); |
718 | if (!in) | |
719 | return NULL; | |
720 | frag_t fg = in->pick_dirfrag(dn); | |
721 | return in->get_dirfrag(fg); | |
722 | } | |
723 | CDir* get_force_dirfrag(dirfrag_t df, bool replay) { | |
724 | CInode *diri = get_inode(df.ino); | |
725 | if (!diri) | |
726 | return NULL; | |
727 | CDir *dir = force_dir_fragment(diri, df.frag, replay); | |
728 | if (!dir) | |
729 | dir = diri->get_dirfrag(df.frag); | |
730 | return dir; | |
731 | } | |
732 | ||
11fdf7f2 | 733 | MDSCacheObject *get_object(const MDSCacheObjectInfo &info); |
7c673cae | 734 | |
7c673cae FG |
735 | void add_inode(CInode *in); |
736 | ||
737 | void remove_inode(CInode *in); | |
9f95a23c | 738 | |
7c673cae | 739 | void touch_dentry(CDentry *dn) { |
31f18b77 FG |
740 | if (dn->state_test(CDentry::STATE_BOTTOMLRU)) { |
741 | bottom_lru.lru_midtouch(dn); | |
742 | } else { | |
743 | if (dn->is_auth()) | |
744 | lru.lru_touch(dn); | |
745 | else | |
746 | lru.lru_midtouch(dn); | |
747 | } | |
7c673cae FG |
748 | } |
749 | void touch_dentry_bottom(CDentry *dn) { | |
31f18b77 FG |
750 | if (dn->state_test(CDentry::STATE_BOTTOMLRU)) |
751 | return; | |
7c673cae | 752 | lru.lru_bottouch(dn); |
7c673cae | 753 | } |
7c673cae | 754 | |
7c673cae FG |
755 | // truncate |
756 | void truncate_inode(CInode *in, LogSegment *ls); | |
757 | void _truncate_inode(CInode *in, LogSegment *ls); | |
758 | void truncate_inode_finish(CInode *in, LogSegment *ls); | |
1e59de90 TL |
759 | void truncate_inode_write_finish(CInode *in, LogSegment *ls, |
760 | uint32_t block_size); | |
7c673cae FG |
761 | void truncate_inode_logged(CInode *in, MutationRef& mut); |
762 | ||
763 | void add_recovered_truncate(CInode *in, LogSegment *ls); | |
764 | void remove_recovered_truncate(CInode *in, LogSegment *ls); | |
765 | void start_recovered_truncates(); | |
766 | ||
9f95a23c TL |
767 | // purge unsafe inodes |
768 | void start_purge_inodes(); | |
769 | void purge_inodes(const interval_set<inodeno_t>& i, LogSegment *ls); | |
7c673cae | 770 | |
7c673cae FG |
771 | CDir *get_auth_container(CDir *in); |
772 | CDir *get_export_container(CDir *dir); | |
20effc67 TL |
773 | void find_nested_exports(CDir *dir, std::set<CDir*>& s); |
774 | void find_nested_exports_under(CDir *import, CDir *dir, std::set<CDir*>& s); | |
7c673cae | 775 | |
7c673cae FG |
776 | void init_layouts(); |
777 | void create_unlinked_system_inode(CInode *in, inodeno_t ino, | |
778 | int mode) const; | |
779 | CInode *create_system_inode(inodeno_t ino, int mode); | |
780 | CInode *create_root_inode(); | |
781 | ||
782 | void create_empty_hierarchy(MDSGather *gather); | |
783 | void create_mydir_hierarchy(MDSGather *gather); | |
784 | ||
785 | bool is_open() { return open; } | |
11fdf7f2 | 786 | void wait_for_open(MDSContext *c) { |
7c673cae FG |
787 | waiting_for_open.push_back(c); |
788 | } | |
789 | ||
11fdf7f2 | 790 | void open_root_inode(MDSContext *c); |
7c673cae | 791 | void open_root(); |
11fdf7f2 TL |
792 | void open_mydir_inode(MDSContext *c); |
793 | void open_mydir_frag(MDSContext *c); | |
7c673cae FG |
794 | void populate_mydir(); |
795 | ||
11fdf7f2 | 796 | void _create_system_file(CDir *dir, std::string_view name, CInode *in, MDSContext *fin); |
7c673cae | 797 | void _create_system_file_finish(MutationRef& mut, CDentry *dn, |
11fdf7f2 | 798 | version_t dpv, MDSContext *fin); |
7c673cae | 799 | |
11fdf7f2 | 800 | void open_foreign_mdsdir(inodeno_t ino, MDSContext *c); |
7c673cae | 801 | CDir *get_stray_dir(CInode *in); |
7c673cae | 802 | |
7c673cae FG |
803 | /** |
804 | * Find the given dentry (and whether it exists or not), its ancestors, | |
805 | * and get them all into memory and usable on this MDS. This function | |
806 | * makes a best-effort attempt to load everything; if it needs to | |
807 | * go away and do something then it will put the request on a waitlist. | |
808 | * It prefers the mdr, then the req, then the fin. (At least one of these | |
809 | * must be non-null.) | |
810 | * | |
811 | * At least one of the params mdr, req, and fin must be non-null. | |
812 | * | |
813 | * @param mdr The MDRequest associated with the path. Can be null. | |
11fdf7f2 | 814 | * @param cf A MDSContextFactory for waiter building. |
7c673cae | 815 | * @param path The path to traverse to. |
9f95a23c TL |
816 | * |
817 | * @param flags Specifies different lookup behaviors. | |
818 | * By default, path_traverse() forwards the request to the auth MDS if that | |
819 | * is appropriate (ie, if it doesn't know the contents of a directory). | |
820 | * MDS_TRAVERSE_DISCOVER: Instead of forwarding request, path_traverse() | |
821 | * attempts to look up the path from a different MDS (and bring them into | |
822 | * its cache as replicas). | |
823 | * MDS_TRAVERSE_PATH_LOCKED: path_traverse() will proceed when xlocked | |
824 | * dentry is encountered. | |
825 | * MDS_TRAVERSE_WANT_DENTRY: Caller wants tail dentry. Add a null dentry if | |
826 | * tail dentry does not exist. Return 0 even if the tail dentry is null. | |
1e59de90 TL |
827 | * MDS_TRAVERSE_WANT_INODE: Caller only wants target inode if it exists, or |
828 | * wants tail dentry if target inode does not exist and MDS_TRAVERSE_WANT_DENTRY | |
829 | * is also set. | |
9f95a23c TL |
830 | * MDS_TRAVERSE_WANT_AUTH: Always forward request to auth MDS of target inode |
831 | * or auth MDS of tail dentry (MDS_TRAVERSE_WANT_DENTRY is set). | |
1e59de90 TL |
832 | * MDS_TRAVERSE_XLOCK_DENTRY: Caller wants to xlock tail dentry if MDS_TRAVERSE_WANT_INODE |
833 | * is not set or (MDS_TRAVERSE_WANT_INODE is set but target inode does not exist) | |
9f95a23c | 834 | * |
7c673cae FG |
835 | * @param pdnvec Data return parameter -- on success, contains a |
836 | * vector of dentries. On failure, is either empty or contains the | |
837 | * full trace of traversable dentries. | |
838 | * @param pin Data return parameter -- if successful, points to the inode | |
839 | * associated with filepath. If unsuccessful, is null. | |
7c673cae FG |
840 | * |
841 | * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise. | |
842 | * If it returns 1, the requester associated with this call has been placed | |
843 | * on the appropriate waitlist, and it should unwind itself and back out. | |
844 | * If it returns 2 the request has been forwarded, and again the requester | |
845 | * should unwind itself and back out. | |
846 | */ | |
9f95a23c TL |
847 | int path_traverse(MDRequestRef& mdr, MDSContextFactory& cf, |
848 | const filepath& path, int flags, | |
20effc67 | 849 | std::vector<CDentry*> *pdnvec, CInode **pin=nullptr); |
7c673cae | 850 | |
1e59de90 TL |
851 | int maybe_request_forward_to_auth(MDRequestRef& mdr, MDSContextFactory& cf, |
852 | MDSCacheObject *p); | |
853 | ||
7c673cae FG |
854 | CInode *cache_traverse(const filepath& path); |
855 | ||
11fdf7f2 | 856 | void open_remote_dirfrag(CInode *diri, frag_t fg, MDSContext *fin); |
7c673cae FG |
857 | CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false); |
858 | ||
20effc67 | 859 | bool parallel_fetch(std::map<inodeno_t,filepath>& pathmap, std::set<inodeno_t>& missing); |
7c673cae | 860 | bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path, |
20effc67 | 861 | std::set<CDir*>& fetch_queue, std::set<inodeno_t>& missing, |
7c673cae FG |
862 | C_GatherBuilder &gather_bld); |
863 | ||
11fdf7f2 | 864 | void open_remote_dentry(CDentry *dn, bool projected, MDSContext *fin, |
7c673cae | 865 | bool want_xlocked=false); |
11fdf7f2 | 866 | void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSContext *fin, |
7c673cae FG |
867 | bool want_xlocked, int r); |
868 | ||
20effc67 | 869 | void make_trace(std::vector<CDentry*>& trace, CInode *in); |
7c673cae | 870 | |
11fdf7f2 | 871 | void open_ino(inodeno_t ino, int64_t pool, MDSContext *fin, |
f91f0fd5 | 872 | bool want_replica=true, bool want_xlocked=false, |
20effc67 | 873 | std::vector<inode_backpointer_t> *ancestors_hint=nullptr, |
f91f0fd5 | 874 | mds_rank_t auth_hint=MDS_RANK_NONE); |
1e59de90 TL |
875 | void open_ino_batch_start(); |
876 | void open_ino_batch_submit(); | |
877 | void kick_open_ino_peers(mds_rank_t who); | |
7c673cae | 878 | |
9f95a23c TL |
879 | void find_ino_peers(inodeno_t ino, MDSContext *c, |
880 | mds_rank_t hint=MDS_RANK_NONE, bool path_locked=false); | |
7c673cae | 881 | void _do_find_ino_peer(find_ino_peer_info_t& fip); |
9f95a23c TL |
882 | void handle_find_ino(const cref_t<MMDSFindIno> &m); |
883 | void handle_find_ino_reply(const cref_t<MMDSFindInoReply> &m); | |
7c673cae FG |
884 | void kick_find_ino_peers(mds_rank_t who); |
885 | ||
11fdf7f2 TL |
886 | SnapRealm *get_global_snaprealm() const { return global_snaprealm; } |
887 | void create_global_snaprealm(); | |
888 | void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool notify_clients=true); | |
889 | void send_snap_update(CInode *in, version_t stid, int snap_op); | |
9f95a23c | 890 | void handle_snap_update(const cref_t<MMDSSnapUpdate> &m); |
11fdf7f2 | 891 | void notify_global_snaprealm_update(int snap_op); |
7c673cae FG |
892 | |
893 | // -- stray -- | |
7c673cae FG |
894 | void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin); |
895 | uint64_t get_num_strays() const { return stray_manager.get_num_strays(); } | |
896 | ||
7c673cae | 897 | // == messages == |
9f95a23c | 898 | void dispatch(const cref_t<Message> &m); |
7c673cae | 899 | |
9f95a23c TL |
900 | void encode_replica_dir(CDir *dir, mds_rank_t to, bufferlist& bl); |
901 | void encode_replica_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl); | |
902 | void encode_replica_inode(CInode *in, mds_rank_t to, bufferlist& bl, | |
b32b8144 | 903 | uint64_t features); |
7c673cae | 904 | |
9f95a23c TL |
905 | void decode_replica_dir(CDir *&dir, bufferlist::const_iterator& p, CInode *diri, mds_rank_t from, MDSContext::vec& finished); |
906 | void decode_replica_dentry(CDentry *&dn, bufferlist::const_iterator& p, CDir *dir, MDSContext::vec& finished); | |
907 | void decode_replica_inode(CInode *&in, bufferlist::const_iterator& p, CDentry *dn, MDSContext::vec& finished); | |
7c673cae | 908 | |
9f95a23c | 909 | void encode_replica_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl); |
33c7a0ef | 910 | void decode_replica_stray(CDentry *&straydn, CInode **in, const bufferlist &bl, mds_rank_t from); |
7c673cae FG |
911 | |
912 | // -- namespace -- | |
9f95a23c TL |
913 | void encode_remote_dentry_link(CDentry::linkage_t *dnl, bufferlist& bl); |
914 | void decode_remote_dentry_link(CDir *dir, CDentry *dn, bufferlist::const_iterator& p); | |
7c673cae | 915 | void send_dentry_link(CDentry *dn, MDRequestRef& mdr); |
aee94f69 | 916 | void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr); |
a8e16298 | 917 | |
11fdf7f2 | 918 | void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSContext *c) { |
e306af50 TL |
919 | uncommitted_fragments.at(dirfrag).waiters.push_back(c); |
920 | } | |
921 | bool is_any_uncommitted_fragment() const { | |
922 | return !uncommitted_fragments.empty(); | |
7c673cae | 923 | } |
f91f0fd5 | 924 | void wait_for_uncommitted_fragments(MDSContext* finisher); |
e306af50 TL |
925 | void rollback_uncommitted_fragments(); |
926 | ||
7c673cae FG |
927 | void split_dir(CDir *dir, int byn); |
928 | void merge_dir(CInode *diri, frag_t fg); | |
7c673cae FG |
929 | |
930 | void find_stale_fragment_freeze(); | |
931 | void fragment_freeze_inc_num_waiters(CDir *dir); | |
932 | bool fragment_are_all_frozen(CDir *dir); | |
933 | int get_num_fragmenting_dirs() { return fragments.size(); } | |
934 | ||
935 | // -- updates -- | |
936 | //int send_inode_updates(CInode *in); | |
937 | //void handle_inode_update(MInodeUpdate *m); | |
938 | ||
939 | int send_dir_updates(CDir *in, bool bcast=false); | |
9f95a23c | 940 | void handle_dir_update(const cref_t<MDirUpdate> &m); |
7c673cae FG |
941 | |
942 | // -- cache expiration -- | |
9f95a23c | 943 | void handle_cache_expire(const cref_t<MCacheExpire> &m); |
7c673cae FG |
944 | void process_delayed_expire(CDir *dir); |
945 | void discard_delayed_expire(CDir *dir); | |
946 | ||
eafe8130 | 947 | // -- mdsmap -- |
f6b5b4d7 | 948 | void handle_mdsmap(const MDSMap &mdsmap, const MDSMap &oldmap); |
eafe8130 | 949 | |
20effc67 TL |
950 | int dump_cache() { return dump_cache({}, nullptr, 0); } |
951 | int dump_cache(std::string_view filename, double timeout); | |
952 | int dump_cache(Formatter *f, double timeout); | |
11fdf7f2 | 953 | void dump_tree(CInode *in, const int cur_depth, const int max_depth, Formatter *f); |
7c673cae | 954 | |
f64942e4 | 955 | void cache_status(Formatter *f); |
181888fb | 956 | |
7c673cae FG |
957 | void dump_resolve_status(Formatter *f) const; |
958 | void dump_rejoin_status(Formatter *f) const; | |
959 | ||
960 | // == crap fns == | |
7c673cae | 961 | void show_cache(); |
81eedcae | 962 | void show_subtrees(int dbl=10, bool force_print=false); |
7c673cae FG |
963 | |
964 | CInode *hack_pick_random_inode() { | |
11fdf7f2 | 965 | ceph_assert(!inode_map.empty()); |
7c673cae | 966 | int n = rand() % inode_map.size(); |
b32b8144 | 967 | auto p = inode_map.begin(); |
7c673cae FG |
968 | while (n--) ++p; |
969 | return p->second; | |
970 | } | |
971 | ||
11fdf7f2 | 972 | void flush_dentry(std::string_view path, Context *fin); |
7c673cae FG |
973 | /** |
974 | * Create and start an OP_ENQUEUE_SCRUB | |
975 | */ | |
11fdf7f2 | 976 | void enqueue_scrub(std::string_view path, std::string_view tag, |
7c673cae | 977 | bool force, bool recursive, bool repair, |
aee94f69 | 978 | bool scrub_mdsdir, Formatter *f, Context *fin); |
7c673cae FG |
979 | void repair_inode_stats(CInode *diri); |
980 | void repair_dirfrag_stats(CDir *dir); | |
f67539c2 | 981 | void rdlock_dirfrags_stats(CInode *diri, MDSInternalContext *fin); |
7c673cae | 982 | |
f67539c2 | 983 | // my leader |
9f95a23c TL |
984 | MDSRank *mds; |
985 | ||
986 | // -- my cache -- | |
987 | LRU lru; // dentry lru for expiring items from cache | |
988 | LRU bottom_lru; // dentries that should be trimmed ASAP | |
989 | ||
990 | DecayRate decayrate; | |
991 | ||
992 | int num_shadow_inodes = 0; | |
993 | ||
994 | int num_inodes_with_caps = 0; | |
995 | ||
996 | unsigned max_dir_commit_size; | |
997 | ||
998 | file_layout_t default_file_layout; | |
999 | file_layout_t default_log_layout; | |
1000 | ||
1001 | // -- client leases -- | |
1002 | static constexpr std::size_t client_lease_pools = 3; | |
1003 | std::array<float, client_lease_pools> client_lease_durations{5.0, 30.0, 300.0}; | |
1004 | ||
1005 | // -- client caps -- | |
1006 | uint64_t last_cap_id = 0; | |
1007 | ||
20effc67 | 1008 | std::map<ceph_tid_t, discover_info_t> discovers; |
9f95a23c TL |
1009 | ceph_tid_t discover_last_tid = 0; |
1010 | ||
1011 | // waiters | |
20effc67 | 1012 | std::map<int, std::map<inodeno_t, MDSContext::vec > > waiting_for_base_ino; |
9f95a23c | 1013 | |
20effc67 TL |
1014 | std::map<inodeno_t,std::map<client_t, reconnected_cap_info_t> > reconnected_caps; // inode -> client -> snap_follows,realmino |
1015 | std::map<inodeno_t,std::map<client_t, snapid_t> > reconnected_snaprealms; // realmino -> client -> realmseq | |
9f95a23c TL |
1016 | |
1017 | // realm inodes | |
20effc67 | 1018 | std::set<CInode*> rejoin_pending_snaprealms; |
9f95a23c | 1019 | // cap imports. delayed snap parent opens. |
20effc67 | 1020 | std::map<client_t,std::set<CInode*> > delayed_imported_caps; |
9f95a23c TL |
1021 | |
1022 | // subsystems | |
1023 | std::unique_ptr<Migrator> migrator; | |
1024 | ||
1025 | bool did_shutdown_log_cap = false; | |
1026 | ||
20effc67 | 1027 | std::map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer; |
9f95a23c TL |
1028 | ceph_tid_t find_ino_peer_last_tid = 0; |
1029 | ||
1030 | // delayed cache expire | |
20effc67 | 1031 | std::map<CDir*, expiremap> delayed_expire; // subtree root -> expire msg |
9f95a23c | 1032 | |
7c673cae FG |
1033 | /* Because exports may fail, this set lets us keep track of inodes that need exporting. */ |
1034 | std::set<CInode *> export_pin_queue; | |
eafe8130 | 1035 | std::set<CInode *> export_pin_delayed_queue; |
f67539c2 | 1036 | std::set<CInode *> export_ephemeral_pins; |
11fdf7f2 TL |
1037 | |
1038 | OpenFileTable open_file_table; | |
eafe8130 | 1039 | |
f6b5b4d7 TL |
1040 | double export_ephemeral_random_max = 0.0; |
1041 | ||
9f95a23c | 1042 | protected: |
f67539c2 TL |
1043 | // track leader requests whose peers haven't acknowledged commit |
1044 | struct uleader { | |
1045 | uleader() {} | |
20effc67 | 1046 | std::set<mds_rank_t> peers; |
9f95a23c TL |
1047 | LogSegment *ls = nullptr; |
1048 | MDSContext::vec waiters; | |
1049 | bool safe = false; | |
1050 | bool committing = false; | |
1051 | bool recovering = false; | |
1052 | }; | |
1053 | ||
f67539c2 TL |
1054 | struct upeer { |
1055 | upeer() {} | |
1056 | mds_rank_t leader; | |
e306af50 | 1057 | LogSegment *ls = nullptr; |
f67539c2 | 1058 | MDPeerUpdate *su = nullptr; |
e306af50 TL |
1059 | MDSContext::vec waiters; |
1060 | }; | |
1061 | ||
9f95a23c TL |
1062 | struct open_ino_info_t { |
1063 | open_ino_info_t() {} | |
20effc67 TL |
1064 | std::vector<inode_backpointer_t> ancestors; |
1065 | std::set<mds_rank_t> checked; | |
9f95a23c TL |
1066 | mds_rank_t checking = MDS_RANK_NONE; |
1067 | mds_rank_t auth_hint = MDS_RANK_NONE; | |
1068 | bool check_peers = true; | |
1069 | bool fetch_backtrace = true; | |
1070 | bool discover = false; | |
1071 | bool want_replica = false; | |
1072 | bool want_xlocked = false; | |
1073 | version_t tid = 0; | |
1074 | int64_t pool = -1; | |
1075 | int last_err = 0; | |
1076 | MDSContext::vec waiters; | |
1077 | }; | |
1078 | ||
1e59de90 TL |
1079 | ceph_tid_t open_ino_last_tid = 0; |
1080 | std::map<inodeno_t,open_ino_info_t> opening_inodes; | |
1081 | ||
1082 | bool open_ino_batch = false; | |
1083 | std::map<CDir*, std::pair<std::vector<std::string>, MDSContext::vec> > open_ino_batched_fetch; | |
1084 | ||
9f95a23c TL |
1085 | friend struct C_MDC_OpenInoTraverseDir; |
1086 | friend struct C_MDC_OpenInoParentOpened; | |
1087 | friend struct C_MDC_RetryScanStray; | |
1088 | ||
1089 | friend class C_IO_MDC_OpenInoBacktraceFetched; | |
1090 | friend class C_MDC_Join; | |
1091 | friend class C_MDC_RespondInternalRequest; | |
1092 | ||
f67539c2 | 1093 | friend class EPeerUpdate; |
9f95a23c TL |
1094 | friend class ECommitted; |
1095 | ||
1096 | void set_readonly() { readonly = true; } | |
1097 | ||
1098 | void handle_resolve(const cref_t<MMDSResolve> &m); | |
1099 | void handle_resolve_ack(const cref_t<MMDSResolveAck> &m); | |
1100 | void process_delayed_resolve(); | |
1101 | void discard_delayed_resolve(mds_rank_t who); | |
1102 | void maybe_resolve_finish(); | |
1103 | void disambiguate_my_imports(); | |
1104 | void disambiguate_other_imports(); | |
1105 | void trim_unlinked_inodes(); | |
9f95a23c | 1106 | |
f67539c2 | 1107 | void send_peer_resolves(); |
9f95a23c | 1108 | void send_subtree_resolves(); |
f67539c2 | 1109 | void maybe_finish_peer_resolve(); |
9f95a23c TL |
1110 | |
1111 | void rejoin_walk(CDir *dir, const ref_t<MMDSCacheRejoin> &rejoin); | |
1112 | void handle_cache_rejoin(const cref_t<MMDSCacheRejoin> &m); | |
1113 | void handle_cache_rejoin_weak(const cref_t<MMDSCacheRejoin> &m); | |
1114 | CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last); | |
1115 | CDir* rejoin_invent_dirfrag(dirfrag_t df); | |
1116 | void handle_cache_rejoin_strong(const cref_t<MMDSCacheRejoin> &m); | |
1117 | void rejoin_scour_survivor_replicas(mds_rank_t from, const cref_t<MMDSCacheRejoin> &ack, | |
20effc67 TL |
1118 | std::set<vinodeno_t>& acked_inodes, |
1119 | std::set<SimpleLock *>& gather_locks); | |
9f95a23c TL |
1120 | void handle_cache_rejoin_ack(const cref_t<MMDSCacheRejoin> &m); |
1121 | void rejoin_send_acks(); | |
1122 | void rejoin_trim_undef_inodes(); | |
1123 | void maybe_send_pending_rejoins() { | |
1124 | if (rejoins_pending) | |
1125 | rejoin_send_rejoins(); | |
1126 | } | |
1127 | ||
1128 | void touch_inode(CInode *in) { | |
1129 | if (in->get_parent_dn()) | |
1130 | touch_dentry(in->get_projected_parent_dn()); | |
1131 | } | |
1132 | ||
1133 | void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin, | |
20effc67 TL |
1134 | std::set<SimpleLock *>& gather_locks); |
1135 | void dentry_remove_replica(CDentry *dn, mds_rank_t rep, std::set<SimpleLock *>& gather_locks); | |
9f95a23c TL |
1136 | |
1137 | void rename_file(CDentry *srcdn, CDentry *destdn); | |
1138 | ||
1139 | void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err); | |
1140 | void _open_ino_parent_opened(inodeno_t ino, int ret); | |
1141 | void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err); | |
1e59de90 TL |
1142 | void _open_ino_fetch_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, bool parent, |
1143 | CDir *dir, std::string_view dname); | |
9f95a23c | 1144 | int open_ino_traverse_dir(inodeno_t ino, const cref_t<MMDSOpenIno> &m, |
20effc67 | 1145 | const std::vector<inode_backpointer_t>& ancestors, |
9f95a23c TL |
1146 | bool discover, bool want_xlocked, mds_rank_t *hint); |
1147 | void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err); | |
1148 | void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err); | |
1149 | void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info); | |
1150 | void handle_open_ino(const cref_t<MMDSOpenIno> &m, int err=0); | |
1151 | void handle_open_ino_reply(const cref_t<MMDSOpenInoReply> &m); | |
1152 | ||
1153 | void scan_stray_dir(dirfrag_t next=dirfrag_t()); | |
1154 | // -- replicas -- | |
1155 | void handle_discover(const cref_t<MDiscover> &dis); | |
1156 | void handle_discover_reply(const cref_t<MDiscoverReply> &m); | |
1157 | void handle_dentry_link(const cref_t<MDentryLink> &m); | |
1158 | void handle_dentry_unlink(const cref_t<MDentryUnlink> &m); | |
1159 | ||
20effc67 | 1160 | int dump_cache(std::string_view fn, Formatter *f, double timeout); |
9f95a23c TL |
1161 | |
1162 | void flush_dentry_work(MDRequestRef& mdr); | |
1163 | /** | |
1164 | * Resolve path to a dentry and pass it onto the ScrubStack. | |
1165 | * | |
1166 | * TODO: return enough information to the original mdr formatter | |
1167 | * and completion that they can subsequently check the progress of | |
1168 | * this scrub (we won't block them on a whole scrub as it can take a very | |
1169 | * long time) | |
1170 | */ | |
1171 | void enqueue_scrub_work(MDRequestRef& mdr); | |
9f95a23c TL |
1172 | void repair_inode_stats_work(MDRequestRef& mdr); |
1173 | void repair_dirfrag_stats_work(MDRequestRef& mdr); | |
f67539c2 | 1174 | void rdlock_dirfrags_stats_work(MDRequestRef& mdr); |
9f95a23c TL |
1175 | |
1176 | ceph::unordered_map<inodeno_t,CInode*> inode_map; // map of head inodes by ino | |
20effc67 | 1177 | std::map<vinodeno_t, CInode*> snap_inode_map; // map of snap inodes by ino |
9f95a23c TL |
1178 | CInode *root = nullptr; // root inode |
1179 | CInode *myin = nullptr; // .ceph/mds%d dir | |
1180 | ||
1181 | bool readonly = false; | |
1182 | ||
1183 | int stray_index = 0; | |
f67539c2 | 1184 | int stray_fragmenting_index = -1; |
9f95a23c | 1185 | |
20effc67 | 1186 | std::set<CInode*> base_inodes; |
9f95a23c TL |
1187 | |
1188 | std::unique_ptr<PerfCounters> logger; | |
1189 | ||
1190 | Filer filer; | |
9f95a23c TL |
1191 | std::array<xlist<ClientLease*>, client_lease_pools> client_leases{}; |
1192 | ||
1193 | /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */ | |
20effc67 TL |
1194 | std::map<CDir*,std::set<CDir*> > subtrees; |
1195 | std::map<CInode*,std::list<std::pair<CDir*,CDir*> > > projected_subtree_renames; // renamed ino -> target dir | |
9f95a23c TL |
1196 | |
1197 | // -- requests -- | |
1198 | ceph::unordered_map<metareqid_t, MDRequestRef> active_requests; | |
1199 | ||
1200 | // -- recovery -- | |
20effc67 | 1201 | std::set<mds_rank_t> recovery_set; |
9f95a23c TL |
1202 | |
1203 | // [resolve] | |
1204 | // from EImportStart w/o EImportFinish during journal replay | |
20effc67 | 1205 | std::map<dirfrag_t, std::vector<dirfrag_t> > my_ambiguous_imports; |
9f95a23c | 1206 | // from MMDSResolves |
20effc67 | 1207 | std::map<mds_rank_t, std::map<dirfrag_t, std::vector<dirfrag_t> > > other_ambiguous_imports; |
9f95a23c | 1208 | |
20effc67 TL |
1209 | std::map<CInode*, int> uncommitted_peer_rename_olddir; // peer: preserve the non-auth dir until seeing commit. |
1210 | std::map<CInode*, int> uncommitted_peer_unlink; // peer: preserve the unlinked inode until seeing commit. | |
9f95a23c | 1211 | |
20effc67 TL |
1212 | std::map<metareqid_t, uleader> uncommitted_leaders; // leader: req -> peer set |
1213 | std::map<metareqid_t, upeer> uncommitted_peers; // peer: preserve the peer req until seeing commit. | |
9f95a23c | 1214 | |
20effc67 TL |
1215 | std::set<metareqid_t> pending_leaders; |
1216 | std::map<int, std::set<metareqid_t> > ambiguous_peer_updates; | |
9f95a23c TL |
1217 | |
1218 | bool resolves_pending = false; | |
20effc67 TL |
1219 | std::set<mds_rank_t> resolve_gather; // nodes i need resolves from |
1220 | std::set<mds_rank_t> resolve_ack_gather; // nodes i need a resolve_ack from | |
1221 | std::set<version_t> resolve_snapclient_commits; | |
1222 | std::map<metareqid_t, mds_rank_t> resolve_need_rollback; // rollbacks i'm writing to the journal | |
1223 | std::map<mds_rank_t, cref_t<MMDSResolve>> delayed_resolve; | |
9f95a23c TL |
1224 | |
1225 | // [rejoin] | |
1226 | bool rejoins_pending = false; | |
20effc67 TL |
1227 | std::set<mds_rank_t> rejoin_gather; // nodes from whom i need a rejoin |
1228 | std::set<mds_rank_t> rejoin_sent; // nodes i sent a rejoin to | |
1229 | std::set<mds_rank_t> rejoin_ack_sent; // nodes i sent a rejoin ack to | |
1230 | std::set<mds_rank_t> rejoin_ack_gather; // nodes from whom i need a rejoin ack | |
1231 | std::map<mds_rank_t,std::map<inodeno_t,std::map<client_t,Capability::Import> > > rejoin_imported_caps; | |
1232 | std::map<inodeno_t,std::pair<mds_rank_t,std::map<client_t,Capability::Export> > > rejoin_peer_exports; | |
1233 | ||
1234 | std::map<client_t,entity_inst_t> rejoin_client_map; | |
1235 | std::map<client_t,client_metadata_t> rejoin_client_metadata_map; | |
1236 | std::map<client_t,std::pair<Session*,uint64_t> > rejoin_session_map; | |
1237 | ||
1238 | std::map<inodeno_t,std::pair<mds_rank_t,std::map<client_t,cap_reconnect_t> > > cap_exports; // ino -> target, client -> capex | |
1239 | ||
1240 | std::map<inodeno_t,std::map<client_t,std::map<mds_rank_t,cap_reconnect_t> > > cap_imports; // ino -> client -> frommds -> capex | |
1241 | std::set<inodeno_t> cap_imports_missing; | |
1242 | std::map<inodeno_t, MDSContext::vec > cap_reconnect_waiters; | |
9f95a23c TL |
1243 | int cap_imports_num_opening = 0; |
1244 | ||
20effc67 TL |
1245 | std::set<CInode*> rejoin_undef_inodes; |
1246 | std::set<CInode*> rejoin_potential_updated_scatterlocks; | |
1247 | std::set<CDir*> rejoin_undef_dirfrags; | |
1248 | std::map<mds_rank_t, std::set<CInode*> > rejoin_unlinked_inodes; | |
9f95a23c | 1249 | |
20effc67 TL |
1250 | std::vector<CInode*> rejoin_recover_q, rejoin_check_q; |
1251 | std::list<SimpleLock*> rejoin_eval_locks; | |
9f95a23c TL |
1252 | MDSContext::vec rejoin_waiters; |
1253 | ||
1254 | std::unique_ptr<MDSContext> rejoin_done; | |
1255 | std::unique_ptr<MDSContext> resolve_done; | |
1256 | ||
9f95a23c TL |
1257 | StrayManager stray_manager; |
1258 | ||
1259 | private: | |
05a536ef TL |
1260 | std::set<inodeno_t> replay_taken_inos; // the inos have been taken when replaying |
1261 | ||
9f95a23c TL |
1262 | // -- fragmenting -- |
1263 | struct ufragment { | |
1264 | ufragment() {} | |
1265 | int bits = 0; | |
1266 | bool committed = false; | |
1267 | LogSegment *ls = nullptr; | |
1268 | MDSContext::vec waiters; | |
1269 | frag_vec_t old_frags; | |
1270 | bufferlist rollback; | |
1271 | }; | |
1272 | ||
1273 | struct fragment_info_t { | |
1274 | fragment_info_t() {} | |
1275 | bool is_fragmenting() { return !resultfrags.empty(); } | |
1276 | uint64_t get_tid() { return mdr ? mdr->reqid.tid : 0; } | |
1277 | int bits; | |
1278 | std::vector<CDir*> dirs; | |
1279 | std::vector<CDir*> resultfrags; | |
1280 | MDRequestRef mdr; | |
20effc67 | 1281 | std::set<mds_rank_t> notify_ack_waiting; |
9f95a23c TL |
1282 | bool finishing = false; |
1283 | ||
1284 | // for deadlock detection | |
1285 | bool all_frozen = false; | |
1286 | utime_t last_cum_auth_pins_change; | |
1287 | int last_cum_auth_pins = 0; | |
1288 | int num_remote_waiters = 0; // number of remote authpin waiters | |
1289 | }; | |
1290 | ||
20effc67 | 1291 | typedef std::map<dirfrag_t,fragment_info_t>::iterator fragment_info_iterator; |
9f95a23c TL |
1292 | |
1293 | friend class EFragment; | |
1294 | friend class C_MDC_FragmentFrozen; | |
1295 | friend class C_MDC_FragmentMarking; | |
1296 | friend class C_MDC_FragmentPrep; | |
1297 | friend class C_MDC_FragmentStore; | |
1298 | friend class C_MDC_FragmentCommit; | |
f67539c2 | 1299 | friend class C_MDC_FragmentRollback; |
9f95a23c TL |
1300 | friend class C_IO_MDC_FragmentPurgeOld; |
1301 | ||
1302 | // -- subtrees -- | |
1303 | static const unsigned int SUBTREES_COUNT_THRESHOLD = 5; | |
1304 | static const unsigned int SUBTREES_DEPTH_THRESHOLD = 5; | |
1305 | ||
1306 | CInode *get_stray() { | |
1307 | return strays[stray_index]; | |
1308 | } | |
1309 | ||
1310 | void identify_files_to_recover(); | |
1311 | ||
1312 | std::pair<bool, uint64_t> trim_lru(uint64_t count, expiremap& expiremap); | |
1313 | bool trim_dentry(CDentry *dn, expiremap& expiremap); | |
1314 | void trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap); | |
1315 | bool trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap&); | |
1316 | void send_expire_messages(expiremap& expiremap); | |
1317 | void trim_non_auth(); // trim out trimmable non-auth items | |
1318 | ||
1319 | void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, | |
1320 | std::vector<CDir*>* frags, MDSContext::vec& waiters, bool replay); | |
1321 | void adjust_dir_fragments(CInode *diri, | |
1322 | const std::vector<CDir*>& srcfrags, | |
1323 | frag_t basefrag, int bits, | |
1324 | std::vector<CDir*>* resultfrags, | |
1325 | MDSContext::vec& waiters, | |
1326 | bool replay); | |
1327 | CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true); | |
20effc67 | 1328 | void get_force_dirfrag_bound_set(const std::vector<dirfrag_t>& dfs, std::set<CDir*>& bounds); |
9f95a23c TL |
1329 | |
1330 | bool can_fragment(CInode *diri, const std::vector<CDir*>& dirs); | |
1331 | void fragment_freeze_dirs(const std::vector<CDir*>& dirs); | |
1332 | void fragment_mark_and_complete(MDRequestRef& mdr); | |
1333 | void fragment_frozen(MDRequestRef& mdr, int r); | |
1334 | void fragment_unmark_unfreeze_dirs(const std::vector<CDir*>& dirs); | |
1335 | void fragment_drop_locks(fragment_info_t &info); | |
1336 | void fragment_maybe_finish(const fragment_info_iterator& it); | |
1337 | void dispatch_fragment_dir(MDRequestRef& mdr); | |
1338 | void _fragment_logged(MDRequestRef& mdr); | |
1339 | void _fragment_stored(MDRequestRef& mdr); | |
1340 | void _fragment_committed(dirfrag_t f, const MDRequestRef& mdr); | |
1341 | void _fragment_old_purged(dirfrag_t f, int bits, const MDRequestRef& mdr); | |
1342 | ||
1343 | void handle_fragment_notify(const cref_t<MMDSFragmentNotify> &m); | |
1344 | void handle_fragment_notify_ack(const cref_t<MMDSFragmentNotifyAck> &m); | |
1345 | ||
1346 | void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, const frag_vec_t& old_frag, | |
1347 | LogSegment *ls, bufferlist *rollback=NULL); | |
1348 | void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op); | |
1349 | void rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags); | |
1350 | ||
b3b6e05e TL |
1351 | void upkeep_main(void); |
1352 | ||
9f95a23c TL |
1353 | uint64_t cache_memory_limit; |
1354 | double cache_reservation; | |
1355 | double cache_health_threshold; | |
9f95a23c TL |
1356 | std::array<CInode *, NUM_STRAY> strays{}; // my stray dir |
1357 | ||
f6b5b4d7 TL |
1358 | bool export_ephemeral_distributed_config; |
1359 | bool export_ephemeral_random_config; | |
f67539c2 | 1360 | unsigned export_ephemeral_dist_frag_bits; |
f6b5b4d7 | 1361 | |
20effc67 TL |
1362 | // Stores the symlink target on the file object's head |
1363 | bool symlink_recovery; | |
1364 | ||
9f95a23c TL |
1365 | // File size recovery |
1366 | RecoveryQueue recovery_queue; | |
1367 | ||
1368 | // shutdown | |
20effc67 TL |
1369 | std::set<inodeno_t> shutdown_exporting_strays; |
1370 | std::pair<dirfrag_t, std::string> shutdown_export_next; | |
9f95a23c TL |
1371 | |
1372 | bool opening_root = false, open = false; | |
1373 | MDSContext::vec waiting_for_open; | |
1374 | ||
1375 | // -- snaprealms -- | |
1376 | SnapRealm *global_snaprealm = nullptr; | |
1377 | ||
20effc67 | 1378 | std::map<dirfrag_t, ufragment> uncommitted_fragments; |
9f95a23c | 1379 | |
20effc67 | 1380 | std::map<dirfrag_t,fragment_info_t> fragments; |
9f95a23c TL |
1381 | |
1382 | DecayCounter trim_counter; | |
1383 | ||
eafe8130 TL |
1384 | std::thread upkeeper; |
1385 | ceph::mutex upkeep_mutex = ceph::make_mutex("MDCache::upkeep_mutex"); | |
1386 | ceph::condition_variable upkeep_cvar; | |
1387 | time upkeep_last_trim = time::min(); | |
92f5a8d4 | 1388 | time upkeep_last_release = time::min(); |
eafe8130 | 1389 | std::atomic<bool> upkeep_trim_shutdown{false}; |
7c673cae FG |
1390 | }; |
1391 | ||
1392 | class C_MDS_RetryRequest : public MDSInternalContext { | |
1393 | MDCache *cache; | |
1394 | MDRequestRef mdr; | |
1395 | public: | |
f67539c2 TL |
1396 | C_MDS_RetryRequest(MDCache *c, MDRequestRef& r) : |
1397 | MDSInternalContext(c->mds), cache(c), mdr(r) {} | |
7c673cae FG |
1398 | void finish(int r) override; |
1399 | }; | |
1400 | ||
f67539c2 TL |
1401 | class CF_MDS_RetryRequestFactory : public MDSContextFactory { |
1402 | public: | |
1403 | CF_MDS_RetryRequestFactory(MDCache *cache, MDRequestRef &mdr, bool dl) : | |
1404 | mdcache(cache), mdr(mdr), drop_locks(dl) {} | |
1405 | MDSContext *build() override; | |
1406 | private: | |
1407 | MDCache *mdcache; | |
1408 | MDRequestRef mdr; | |
1409 | bool drop_locks; | |
1410 | }; | |
1411 | ||
1e59de90 TL |
1412 | /** |
1413 | * Only for contexts called back from an I/O completion | |
1414 | * | |
1415 | * Note: duplication of members wrt MDCacheContext, because | |
1416 | * it'ls the lesser of two evils compared with introducing | |
1417 | * yet another piece of (multiple) inheritance. | |
1418 | */ | |
1419 | class MDCacheIOContext : public virtual MDSIOContextBase { | |
1420 | protected: | |
1421 | MDCache *mdcache; | |
1422 | MDSRank *get_mds() override | |
1423 | { | |
1424 | ceph_assert(mdcache != NULL); | |
1425 | return mdcache->mds; | |
1426 | } | |
1427 | public: | |
1428 | explicit MDCacheIOContext(MDCache *mdc_, bool track=true) : | |
1429 | MDSIOContextBase(track), mdcache(mdc_) {} | |
1430 | }; | |
1431 | ||
7c673cae | 1432 | #endif |