// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef CEPH_MDCACHE_H
#define CEPH_MDCACHE_H

#include <boost/utility/string_view.hpp>

#include "include/types.h"
#include "include/filepath.h"
#include "include/elist.h"

#include "osdc/Filer.h"
#include "CInode.h"
#include "CDentry.h"
#include "CDir.h"
#include "include/Context.h"
#include "events/EMetaBlob.h"
#include "RecoveryQueue.h"
#include "StrayManager.h"
#include "MDSContext.h"
#include "MDSMap.h"
#include "Mutation.h"

#include "messages/MClientRequest.h"
#include "messages/MMDSSlaveRequest.h"

class PerfCounters;

class MDSRank;
class Session;
class Migrator;

class Message;

class MMDSResolve;
class MMDSResolveAck;
class MMDSCacheRejoin;
class MDiscover;
class MDiscoverReply;
class MCacheExpire;
class MDirUpdate;
class MDentryLink;
class MDentryUnlink;
class MLock;
struct MMDSFindIno;
struct MMDSFindInoReply;
struct MMDSOpenIno;
struct MMDSOpenInoReply;

class MClientRequest;
class MMDSSlaveRequest;
struct MClientSnap;

class MMDSFragmentNotify;

class ESubtreeMap;
enum {
  l_mdc_first = 3000,
  // How many inodes are currently in stray dentries
  l_mdc_num_strays,
  // How many stray dentries are currently delayed for purge due to refs
  l_mdc_num_strays_delayed,
  // How many stray dentries are currently being enqueued for purge
  l_mdc_num_strays_enqueuing,

  // How many dentries have ever been added to the stray dir
  l_mdc_strays_created,
  // How many dentries have been passed on to PurgeQueue
  l_mdc_strays_enqueued,
  // How many strays have been reintegrated
  l_mdc_strays_reintegrated,
  // How many strays have been migrated
  l_mdc_strays_migrated,

  // How many inodes are currently having their size recovered
  l_mdc_num_recovering_processing,
  // How many inodes are currently waiting to have their size recovered
  l_mdc_num_recovering_enqueued,
  // How many inodes are waiting with elevated priority for recovery
  l_mdc_num_recovering_prioritized,
  // How many inodes have ever started size recovery
  l_mdc_recovery_started,
  // How many inodes have ever completed size recovery
  l_mdc_recovery_completed,

  l_mdss_ireq_enqueue_scrub,
  l_mdss_ireq_exportdir,
  l_mdss_ireq_flush,
  l_mdss_ireq_fragmentdir,
  l_mdss_ireq_fragstats,
  l_mdss_ireq_inodestats,

  l_mdc_last,
};

// flags for predirty_journal_parents()
static const int PREDIRTY_PRIMARY = 1;   // primary dn, adjust nested accounting
static const int PREDIRTY_DIR = 2;       // update parent dir mtime/size
static const int PREDIRTY_SHALLOW = 4;   // only go to immediate parent (for easier rollback)
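
// Illustrative combination (a sketch, not from this header): create/link-style
// update paths typically pass PREDIRTY_PRIMARY|PREDIRTY_DIR together with
// linkunlink=1, e.g.
//   predirty_journal_parents(mut, &le->metablob, in, dn->get_dir(),
//                            PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
// so the primary dentry's nested (rstat) accounting and the parent dir's
// mtime/size are projected in a single pass.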

class MDCache {
 public:
  using clock = ceph::coarse_mono_clock;
  using time = ceph::coarse_mono_time;

  // my master
  MDSRank *mds;

  // -- my cache --
  LRU lru;          // dentry lru for expiring items from cache
  LRU bottom_lru;   // dentries that should be trimmed ASAP
 protected:
  ceph::unordered_map<inodeno_t,CInode*> inode_map;  // map of head inodes by ino
  map<vinodeno_t, CInode*> snap_inode_map;           // map of snap inodes by ino
  CInode *root;     // root inode
  CInode *myin;     // .ceph/mds%d dir

  bool readonly;
  void set_readonly() { readonly = true; }

  CInode *strays[NUM_STRAY];  // my stray dir
  int stray_index;

  CInode *get_stray() {
    return strays[stray_index];
  }

  set<CInode*> base_inodes;

  std::unique_ptr<PerfCounters> logger;

  Filer filer;

  bool exceeded_size_limit;

private:
  uint64_t cache_inode_limit;
  uint64_t cache_memory_limit;
  double cache_reservation;
  double cache_health_threshold;

public:
  uint64_t cache_limit_inodes(void) {
    return cache_inode_limit;
  }
  uint64_t cache_limit_memory(void) {
    return cache_memory_limit;
  }
  double cache_toofull_ratio(void) const {
    double inode_reserve = cache_inode_limit*(1.0-cache_reservation);
    double memory_reserve = cache_memory_limit*(1.0-cache_reservation);
    return fmax(0.0, fmax((cache_size()-memory_reserve)/memory_reserve, cache_inode_limit == 0 ? 0.0 : (CInode::count()-inode_reserve)/inode_reserve));
  }
  bool cache_toofull(void) const {
    return cache_toofull_ratio() > 0.0;
  }
  uint64_t cache_size(void) const {
    return mempool::get_pool(mempool::mds_co::id).allocated_bytes();
  }
  bool cache_overfull(void) const {
    return (cache_inode_limit > 0 && CInode::count() > cache_inode_limit*cache_health_threshold) || (cache_size() > cache_memory_limit*cache_health_threshold);
  }
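  // Worked example (a sketch using round numbers): with
  // cache_memory_limit = 1 GiB and cache_reservation = 0.05, the memory
  // reserve is 1 GiB * 0.95 = 0.95 GiB.  If cache_size() reports 1.02 GiB,
  // cache_toofull_ratio() = (1.02 - 0.95) / 0.95 ~= 0.07 > 0, so
  // cache_toofull() is true; cache_overfull() additionally requires usage
  // to exceed the limit scaled by cache_health_threshold.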

  void advance_stray() {
    stray_index = (stray_index+1)%NUM_STRAY;
  }

  void activate_stray_manager();

  /**
   * Call this when you know that a CDentry is ready to be passed
   * on to StrayManager (i.e. this is a stray you've just created)
   */
  void notify_stray(CDentry *dn) {
    assert(dn->get_dir()->get_inode()->is_stray());
    stray_manager.eval_stray(dn);
  }

  void maybe_eval_stray(CInode *in, bool delay=false);
  void clear_dirty_bits_for_stray(CInode* diri);

  bool is_readonly() { return readonly; }
  void force_readonly();

  DecayRate decayrate;

  int num_shadow_inodes;

  int num_inodes_with_caps;

  unsigned max_dir_commit_size;

  static file_layout_t gen_default_file_layout(const MDSMap &mdsmap);
  static file_layout_t gen_default_log_layout(const MDSMap &mdsmap);

  file_layout_t default_file_layout;
  file_layout_t default_log_layout;

  void register_perfcounters();

  // -- client leases --
public:
  static const int client_lease_pools = 3;
  float client_lease_durations[client_lease_pools];
protected:
  xlist<ClientLease*> client_leases[client_lease_pools];
public:
  void touch_client_lease(ClientLease *r, int pool, utime_t ttl) {
    client_leases[pool].push_back(&r->item_lease);
    r->ttl = ttl;
  }

  void notify_stray_removed()
  {
    stray_manager.notify_stray_removed();
  }

  void notify_stray_created()
  {
    stray_manager.notify_stray_created();
  }

  void eval_remote(CDentry *dn)
  {
    stray_manager.eval_remote(dn);
  }

  // -- client caps --
  uint64_t last_cap_id;

  // -- discover --
  struct discover_info_t {
    ceph_tid_t tid;
    mds_rank_t mds;
    inodeno_t ino;
    frag_t frag;
    snapid_t snap;
    filepath want_path;
    CInode *basei;
    bool want_base_dir;
    bool want_xlocked;

    discover_info_t() :
      tid(0), mds(-1), snap(CEPH_NOSNAP), basei(NULL),
      want_base_dir(false), want_xlocked(false) {}
    ~discover_info_t() {
      if (basei)
        basei->put(MDSCacheObject::PIN_DISCOVERBASE);
    }
    void pin_base(CInode *b) {
      basei = b;
      basei->get(MDSCacheObject::PIN_DISCOVERBASE);
    }
  };

  map<ceph_tid_t, discover_info_t> discovers;
  ceph_tid_t discover_last_tid;

  void _send_discover(discover_info_t& dis);
  discover_info_t& _create_discover(mds_rank_t mds) {
    ceph_tid_t t = ++discover_last_tid;
    discover_info_t& d = discovers[t];
    d.tid = t;
    d.mds = mds;
    return d;
  }
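  // Illustrative call pattern (a sketch; the real callers are the
  // discover_* helpers below): allocate a slot, fill in what is wanted,
  // then send.
  //   discover_info_t& d = _create_discover(from);
  //   d.ino = base->ino();
  //   d.frag = approx_fg;
  //   d.pin_base(base);   // keeps the base inode pinned until the reply
  //   _send_discover(d);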

  // waiters
  map<int, map<inodeno_t, list<MDSInternalContextBase*> > > waiting_for_base_ino;

  void discover_base_ino(inodeno_t want_ino, MDSInternalContextBase *onfinish, mds_rank_t from=MDS_RANK_NONE);
  void discover_dir_frag(CInode *base, frag_t approx_fg, MDSInternalContextBase *onfinish,
                         mds_rank_t from=MDS_RANK_NONE);
  void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSInternalContextBase *onfinish,
                     bool want_xlocked=false, mds_rank_t from=MDS_RANK_NONE);
  void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSInternalContextBase *onfinish,
                     bool want_xlocked=false);
  void kick_discovers(mds_rank_t who);  // after a failure.

  // -- subtrees --
protected:
  /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */
  map<CDir*,set<CDir*> > subtrees;
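  // e.g. (a sketch): if this rank is auth for / while /usr is a separate
  // subtree, then subtrees[root_dirfrag] = { usr_dirfrag } (the bound) and
  // subtrees[usr_dirfrag] = {} (a leaf subtree with no nested bounds).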
  map<CInode*,list<pair<CDir*,CDir*> > > projected_subtree_renames;  // renamed ino -> target dir

  // adjust subtree auth specification
  //  dir->dir_auth
  //  imports/exports/nested_exports
  //  join/split subtrees as appropriate
public:
  bool is_subtrees() { return !subtrees.empty(); }
  void list_subtrees(list<CDir*>& ls);
  void adjust_subtree_auth(CDir *root, mds_authority_t auth, bool adjust_pop=true);
  void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) {
    adjust_subtree_auth(root, mds_authority_t(a,b));
  }
  void adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth);
  void adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_rank_t a) {
    adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
  }
  void adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bounds, mds_authority_t auth);
  void adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bounds, mds_rank_t a) {
    adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN));
  }
  void map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result);
  void try_subtree_merge(CDir *root);
  void try_subtree_merge_at(CDir *root, set<CInode*> *to_eval, bool adjust_pop=true);
  void subtree_merge_writebehind_finish(CInode *in, MutationRef& mut);
  void eval_subtree_root(CInode *diri);
  CDir *get_subtree_root(CDir *dir);
  CDir *get_projected_subtree_root(CDir *dir);
  bool is_leaf_subtree(CDir *dir) {
    assert(subtrees.count(dir));
    return subtrees[dir].empty();
  }
  void remove_subtree(CDir *dir);
  bool is_subtree(CDir *root) {
    return subtrees.count(root);
  }
  void get_subtree_bounds(CDir *root, set<CDir*>& bounds);
  void get_wouldbe_subtree_bounds(CDir *root, set<CDir*>& bounds);
  void verify_subtree_bounds(CDir *root, const set<CDir*>& bounds);
  void verify_subtree_bounds(CDir *root, const list<dirfrag_t>& bounds);

  void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir);
  void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop);

  void get_auth_subtrees(set<CDir*>& s);
  void get_fullauth_subtrees(set<CDir*>& s);

  int num_subtrees();
  int num_subtrees_fullauth();
  int num_subtrees_fullnonauth();


protected:
  // delayed cache expire
  map<CDir*, map<mds_rank_t, MCacheExpire*> > delayed_expire;  // subtree root -> expire msg


  // -- requests --
  ceph::unordered_map<metareqid_t, MDRequestRef> active_requests;

public:
  int get_num_client_requests();

  MDRequestRef request_start(MClientRequest *req);
  MDRequestRef request_start_slave(metareqid_t rid, __u32 attempt, Message *m);
  MDRequestRef request_start_internal(int op);
  bool have_request(metareqid_t rid) {
    return active_requests.count(rid);
  }
  MDRequestRef request_get(metareqid_t rid);
  void request_pin_ref(MDRequestRef& r, CInode *ref, vector<CDentry*>& trace);
  void request_finish(MDRequestRef& mdr);
  void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0);
  void dispatch_request(MDRequestRef& mdr);
  void request_drop_foreign_locks(MDRequestRef& mdr);
  void request_drop_non_rdlocks(MDRequestRef& r);
  void request_drop_locks(MDRequestRef& r);
  void request_cleanup(MDRequestRef& r);

  void request_kill(MDRequestRef& r);  // called when session closes

  // journal/snap helpers
  CInode *pick_inode_snap(CInode *in, snapid_t follows);
  CInode *cow_inode(CInode *in, snapid_t last);
  void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn,
                          snapid_t follows=CEPH_NOSNAP,
                          CInode **pcow_inode=0, CDentry::linkage_t *dnl=0);
  void journal_cow_inode(MutationRef& mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP,
                         CInode **pcow_inode=0);
  void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP);

  void project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
                                   int linkunlink, SnapRealm *prealm);
  void _project_rstat_inode_to_frag(CInode::mempool_inode& inode, snapid_t ofirst, snapid_t last,
                                    CDir *parent, int linkunlink, bool update_inode);
  void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
                                   snapid_t ofirst, snapid_t last,
                                   CInode *pin, bool cow_head);
  void broadcast_quota_to_client(CInode *in, client_t exclude_ct = -1);
  void predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
                                CInode *in, CDir *parent,
                                int flags, int linkunlink=0,
                                snapid_t follows=CEPH_NOSNAP);

  // slaves
  void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set<mds_rank_t> &slaves, bool safe=false) {
    uncommitted_masters[reqid].ls = ls;
    uncommitted_masters[reqid].slaves = slaves;
    uncommitted_masters[reqid].safe = safe;
  }
  void wait_for_uncommitted_master(metareqid_t reqid, MDSInternalContextBase *c) {
    uncommitted_masters[reqid].waiters.push_back(c);
  }
  bool have_uncommitted_master(metareqid_t reqid, mds_rank_t from) {
    auto p = uncommitted_masters.find(reqid);
    return p != uncommitted_masters.end() && p->second.slaves.count(from) > 0;
  }
  void log_master_commit(metareqid_t reqid);
  void logged_master_update(metareqid_t reqid);
  void _logged_master_commit(metareqid_t reqid);
  void committed_master_slave(metareqid_t r, mds_rank_t from);
  void finish_committed_masters();

  void _logged_slave_commit(mds_rank_t from, metareqid_t reqid);

  // -- recovery --
protected:
  set<mds_rank_t> recovery_set;

public:
  void set_recovery_set(set<mds_rank_t>& s);
  void handle_mds_failure(mds_rank_t who);
  void handle_mds_recovery(mds_rank_t who);

protected:
  // [resolve]
  // from EImportStart w/o EImportFinish during journal replay
  map<dirfrag_t, vector<dirfrag_t> > my_ambiguous_imports;
  // from MMDSResolves
  map<mds_rank_t, map<dirfrag_t, vector<dirfrag_t> > > other_ambiguous_imports;

  map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> > uncommitted_slave_updates;  // slave: for replay.
  map<CInode*, int> uncommitted_slave_rename_olddir;  // slave: preserve the non-auth dir until seeing commit.
  map<CInode*, int> uncommitted_slave_unlink;  // slave: preserve the unlinked inode until seeing commit.

  // track master requests whose slaves haven't acknowledged commit
  struct umaster {
    set<mds_rank_t> slaves;
    LogSegment *ls;
    list<MDSInternalContextBase*> waiters;
    bool safe;
    bool committing;
    bool recovering;
    umaster() : ls(NULL), safe(false), committing(false), recovering(false) {}
  };
  map<metareqid_t, umaster> uncommitted_masters;  // master: req -> slave set

  set<metareqid_t> pending_masters;
  map<int, set<metareqid_t> > ambiguous_slave_updates;

  friend class ESlaveUpdate;
  friend class ECommitted;

  bool resolves_pending;
  set<mds_rank_t> resolve_gather;      // nodes i need resolves from
  set<mds_rank_t> resolve_ack_gather;  // nodes i need a resolve_ack from
  map<metareqid_t, mds_rank_t> need_resolve_rollback;  // rollbacks i'm writing to the journal
  map<mds_rank_t, MMDSResolve*> delayed_resolve;

  void handle_resolve(MMDSResolve *m);
  void handle_resolve_ack(MMDSResolveAck *m);
  void process_delayed_resolve();
  void discard_delayed_resolve(mds_rank_t who);
  void maybe_resolve_finish();
  void disambiguate_my_imports();
  void disambiguate_other_imports();
  void trim_unlinked_inodes();
  void add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate*);
  void finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master);
  MDSlaveUpdate* get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master);
public:
  void recalc_auth_bits(bool replay);
  void remove_inode_recursive(CInode *in);

  bool is_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
    auto p = ambiguous_slave_updates.find(master);
    return p != ambiguous_slave_updates.end() && p->second.count(reqid);
  }
  void add_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
    ambiguous_slave_updates[master].insert(reqid);
  }
  void remove_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) {
    auto p = ambiguous_slave_updates.find(master);
    auto q = p->second.find(reqid);
    assert(q != p->second.end());
    p->second.erase(q);
    if (p->second.empty())
      ambiguous_slave_updates.erase(p);
  }

  void add_rollback(metareqid_t reqid, mds_rank_t master) {
    need_resolve_rollback[reqid] = master;
  }
  void finish_rollback(metareqid_t reqid);

  // ambiguous imports
  void add_ambiguous_import(dirfrag_t base, const vector<dirfrag_t>& bounds);
  void add_ambiguous_import(CDir *base, const set<CDir*>& bounds);
  bool have_ambiguous_import(dirfrag_t base) {
    return my_ambiguous_imports.count(base);
  }
  void get_ambiguous_import_bounds(dirfrag_t base, vector<dirfrag_t>& bounds) {
    assert(my_ambiguous_imports.count(base));
    bounds = my_ambiguous_imports[base];
  }
  void cancel_ambiguous_import(CDir *);
  void finish_ambiguous_import(dirfrag_t dirino);
  void resolve_start(MDSInternalContext *resolve_done_);
  void send_resolves();
  void send_slave_resolves();
  void send_subtree_resolves();
  void maybe_send_pending_resolves() {
    if (resolves_pending)
      send_subtree_resolves();
  }

  void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent,
                               map<dirfrag_t,vector<dirfrag_t> >& subtrees);
  ESubtreeMap *create_subtree_map();


  void clean_open_file_lists();

protected:
  // [rejoin]
  bool rejoins_pending;
  set<mds_rank_t> rejoin_gather;      // nodes from whom i need a rejoin
  set<mds_rank_t> rejoin_sent;        // nodes i sent a rejoin to
  set<mds_rank_t> rejoin_ack_sent;    // nodes to whom i sent a rejoin ack
  set<mds_rank_t> rejoin_ack_gather;  // nodes from whom i need a rejoin ack
  map<mds_rank_t,map<inodeno_t,map<client_t,Capability::Import> > > rejoin_imported_caps;
  map<inodeno_t,pair<mds_rank_t,map<client_t,Capability::Export> > > rejoin_slave_exports;
  map<client_t,entity_inst_t> rejoin_client_map;
  map<client_t,pair<Session*,uint64_t> > rejoin_session_map;

  map<inodeno_t,pair<mds_rank_t,map<client_t,cap_reconnect_t> > > cap_exports;  // ino -> target, client -> capex

  map<inodeno_t,map<client_t,map<mds_rank_t,cap_reconnect_t> > > cap_imports;  // ino -> client -> frommds -> capex
  set<inodeno_t> cap_imports_missing;
  map<inodeno_t, list<MDSInternalContextBase*> > cap_reconnect_waiters;
  int cap_imports_num_opening;

  set<CInode*> rejoin_undef_inodes;
  set<CInode*> rejoin_potential_updated_scatterlocks;
  set<CDir*> rejoin_undef_dirfrags;
  map<mds_rank_t, set<CInode*> > rejoin_unlinked_inodes;

  vector<CInode*> rejoin_recover_q, rejoin_check_q;
  list<SimpleLock*> rejoin_eval_locks;
  list<MDSInternalContextBase*> rejoin_waiters;

  void rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin);
  void handle_cache_rejoin(MMDSCacheRejoin *m);
  void handle_cache_rejoin_weak(MMDSCacheRejoin *m);
  CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last);
  CDir* rejoin_invent_dirfrag(dirfrag_t df);
  void handle_cache_rejoin_strong(MMDSCacheRejoin *m);
  void rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack,
                                      set<vinodeno_t>& acked_inodes,
                                      set<SimpleLock *>& gather_locks);
  void handle_cache_rejoin_ack(MMDSCacheRejoin *m);
  void rejoin_send_acks();
  void rejoin_trim_undef_inodes();
  void maybe_send_pending_rejoins() {
    if (rejoins_pending)
      rejoin_send_rejoins();
  }
  std::unique_ptr<MDSInternalContext> rejoin_done;
  std::unique_ptr<MDSInternalContext> resolve_done;
public:
  void rejoin_start(MDSInternalContext *rejoin_done_);
  void rejoin_gather_finish();
  void rejoin_send_rejoins();
  void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
                          int target=-1) {
    auto& ex = cap_exports[ino];
    ex.first = target;
    ex.second[client] = icr;
  }
  void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr,
                             mds_rank_t frommds=MDS_RANK_NONE) {
    cap_imports[ino][client][frommds] = icr;
  }
  void rejoin_recovered_client(client_t client, const entity_inst_t& inst) {
    rejoin_client_map.emplace(client, inst);
  }
  const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) {
    if (cap_imports.count(ino) &&
        cap_imports[ino].count(client) &&
        cap_imports[ino][client].count(MDS_RANK_NONE)) {
      return &cap_imports[ino][client][MDS_RANK_NONE];
    }
    return NULL;
  }
  void remove_replay_cap_reconnect(inodeno_t ino, client_t client) {
    assert(cap_imports[ino].size() == 1);
    assert(cap_imports[ino][client].size() == 1);
    cap_imports.erase(ino);
  }
  void wait_replay_cap_reconnect(inodeno_t ino, MDSInternalContextBase *c) {
    cap_reconnect_waiters[ino].push_back(c);
  }

  // [reconnect/rejoin caps]
  struct reconnected_cap_info_t {
    inodeno_t realm_ino;
    snapid_t snap_follows;
    int dirty_caps;
    reconnected_cap_info_t() :
      realm_ino(0), snap_follows(0), dirty_caps(0) {}
  };
  map<inodeno_t,map<client_t, reconnected_cap_info_t> > reconnected_caps;  // inode -> client -> snap_follows,realmino
  map<inodeno_t,map<client_t, snapid_t> > reconnected_snaprealms;  // realmino -> client -> realmseq

  void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) {
    reconnected_cap_info_t &info = reconnected_caps[ino][client];
    info.realm_ino = inodeno_t(icr.capinfo.snaprealm);
    info.snap_follows = icr.snap_follows;
  }
  void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty) {
    reconnected_cap_info_t &info = reconnected_caps[ino][client];
    info.dirty_caps |= dirty;
  }
  void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) {
    reconnected_snaprealms[ino][client] = seq;
  }

  friend class C_MDC_RejoinOpenInoFinish;
  friend class C_MDC_RejoinSessionsOpened;
  void rejoin_open_ino_finish(inodeno_t ino, int ret);
  void rejoin_open_sessions_finish(map<client_t,pair<Session*,uint64_t> >& session_map);
  bool process_imported_caps();
  void choose_lock_states_and_reconnect_caps();
  void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino,
                           map<client_t,MClientSnap*>& splits);
  void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend=false);
  void send_snaps(map<client_t,MClientSnap*>& splits);
  Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds);
  void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq);
  void try_reconnect_cap(CInode *in, Session *session);
  void export_remaining_imported_caps();

  // cap imports.  delayed snap parent opens.
  //  realm inode -> client -> cap inodes needing to split to this realm
  map<CInode*,set<CInode*> > missing_snap_parents;
  map<client_t,set<CInode*> > delayed_imported_caps;

  void do_cap_import(Session *session, CInode *in, Capability *cap,
                     uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq,
                     int peer, int p_flags);
  void do_delayed_cap_imports();
  void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client,
                              snapid_t snap_follows);
  void check_realm_past_parents(SnapRealm *realm, bool reconnect);
  void open_snap_parents();

  bool open_undef_inodes_dirfrags();
  void opened_undef_inode(CInode *in);
  void opened_undef_dirfrag(CDir *dir) {
    rejoin_undef_dirfrags.erase(dir);
  }

  void reissue_all_caps();


  friend class Locker;
  friend class Migrator;
  friend class MDBalancer;

  // StrayManager needs to be able to remove_inode() from us
  // when it is done purging
  friend class StrayManager;

  // File size recovery
private:
  RecoveryQueue recovery_queue;
  void identify_files_to_recover();
public:
  void start_files_to_recover();
  void do_file_recover();
  void queue_file_recover(CInode *in);
  void _queued_file_recover_cow(CInode *in, MutationRef& mut);

  // subsystems
  std::unique_ptr<Migrator> migrator;

 public:
  explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_);
  ~MDCache();
  void handle_conf_change(const struct md_config_t *conf,
                          const std::set <std::string> &changed,
                          const MDSMap &mds_map);

  // debug
  void log_stat();

  // root inode
  CInode *get_root() { return root; }
  CInode *get_myin() { return myin; }

  size_t get_cache_size() { return lru.lru_get_size(); }

  // trimming
  bool trim(uint64_t count=0);
private:
  void trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*>& expiremap);
  bool trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap);
  void trim_dirfrag(CDir *dir, CDir *con,
                    map<mds_rank_t, MCacheExpire*>& expiremap);
  bool trim_inode(CDentry *dn, CInode *in, CDir *con,
                  map<mds_rank_t,class MCacheExpire*>& expiremap);
  void send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap);
  void trim_non_auth();  // trim out trimmable non-auth items
public:
  bool trim_non_auth_subtree(CDir *directory);
  void standby_trim_segment(LogSegment *ls);
  void try_trim_non_auth_subtree(CDir *dir);
  bool can_trim_non_auth_dirfrag(CDir *dir) {
    return my_ambiguous_imports.count((dir)->dirfrag()) == 0 &&
           uncommitted_slave_rename_olddir.count(dir->inode) == 0;
  }

  /**
   * For all unreferenced inodes, dirs, dentries below an inode, compose
   * expiry messages.  This is used when giving up all replicas of entities
   * for an MDS peer in the 'stopping' state, such that the peer can
   * empty its cache and finish shutting down.
   *
   * We have to make sure we're only expiring un-referenced items to
   * avoid interfering with ongoing stray-movement (we can't distinguish
   * between the "moving my strays" and "waiting for my cache to empty"
   * phases within 'stopping')
   *
   * @return false if we completed cleanly, true if caller should stop
   * expiring because we hit something with refs.
   */
  bool expire_recursive(
    CInode *in,
    std::map<mds_rank_t, MCacheExpire*>& expiremap);

  void trim_client_leases();
  void check_memory_usage();

  time last_recall_state;

  // shutdown
private:
  set<inodeno_t> shutdown_exported_strays;
public:
  void shutdown_start();
  void shutdown_check();
  bool shutdown_pass();
  bool shutdown_export_strays();
  bool shutdown();  // clear cache (i.e., at shutdown)

  bool did_shutdown_log_cap;

  // inode_map
  bool have_inode(vinodeno_t vino) {
    if (vino.snapid == CEPH_NOSNAP)
      return inode_map.count(vino.ino) ? true : false;
    else
      return snap_inode_map.count(vino) ? true : false;
  }
  bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) {
    return have_inode(vinodeno_t(ino, snap));
  }
  CInode* get_inode(vinodeno_t vino) {
    if (vino.snapid == CEPH_NOSNAP) {
      auto p = inode_map.find(vino.ino);
      if (p != inode_map.end())
        return p->second;
    } else {
      auto p = snap_inode_map.find(vino);
      if (p != snap_inode_map.end())
        return p->second;
    }
    return NULL;
  }
  CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) {
    return get_inode(vinodeno_t(ino, s));
  }

  CDir* get_dirfrag(dirfrag_t df) {
    CInode *in = get_inode(df.ino);
    if (!in)
      return NULL;
    return in->get_dirfrag(df.frag);
  }
  CDir* get_dirfrag(inodeno_t ino, boost::string_view dn) {
    CInode *in = get_inode(ino);
    if (!in)
      return NULL;
    frag_t fg = in->pick_dirfrag(dn);
    return in->get_dirfrag(fg);
  }
  CDir* get_force_dirfrag(dirfrag_t df, bool replay) {
    CInode *diri = get_inode(df.ino);
    if (!diri)
      return NULL;
    CDir *dir = force_dir_fragment(diri, df.frag, replay);
    if (!dir)
      dir = diri->get_dirfrag(df.frag);
    return dir;
  }

  MDSCacheObject *get_object(MDSCacheObjectInfo &info);


 public:
  void add_inode(CInode *in);

  void remove_inode(CInode *in);
 protected:
  void touch_inode(CInode *in) {
    if (in->get_parent_dn())
      touch_dentry(in->get_projected_parent_dn());
  }
public:
  void touch_dentry(CDentry *dn) {
    if (dn->state_test(CDentry::STATE_BOTTOMLRU)) {
      bottom_lru.lru_midtouch(dn);
    } else {
      if (dn->is_auth())
        lru.lru_touch(dn);
      else
        lru.lru_midtouch(dn);
    }
  }
  void touch_dentry_bottom(CDentry *dn) {
    if (dn->state_test(CDentry::STATE_BOTTOMLRU))
      return;
    lru.lru_bottouch(dn);
  }
protected:

  void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin,
                            set<SimpleLock *>& gather_locks);
  void dentry_remove_replica(CDentry *dn, mds_rank_t rep, set<SimpleLock *>& gather_locks);

  void rename_file(CDentry *srcdn, CDentry *destdn);

 public:
  // truncate
  void truncate_inode(CInode *in, LogSegment *ls);
  void _truncate_inode(CInode *in, LogSegment *ls);
  void truncate_inode_finish(CInode *in, LogSegment *ls);
  void truncate_inode_logged(CInode *in, MutationRef& mut);

  void add_recovered_truncate(CInode *in, LogSegment *ls);
  void remove_recovered_truncate(CInode *in, LogSegment *ls);
  void start_recovered_truncates();


 public:
  CDir *get_auth_container(CDir *in);
  CDir *get_export_container(CDir *dir);
  void find_nested_exports(CDir *dir, set<CDir*>& s);
  void find_nested_exports_under(CDir *import, CDir *dir, set<CDir*>& s);


private:
  bool opening_root, open;
  list<MDSInternalContextBase*> waiting_for_open;

public:
  void init_layouts();
  void create_unlinked_system_inode(CInode *in, inodeno_t ino,
                                    int mode) const;
  CInode *create_system_inode(inodeno_t ino, int mode);
  CInode *create_root_inode();

  void create_empty_hierarchy(MDSGather *gather);
  void create_mydir_hierarchy(MDSGather *gather);

  bool is_open() { return open; }
  void wait_for_open(MDSInternalContextBase *c) {
    waiting_for_open.push_back(c);
  }

  void open_root_inode(MDSInternalContextBase *c);
  void open_root();
  void open_mydir_inode(MDSInternalContextBase *c);
  void open_mydir_frag(MDSInternalContextBase *c);
  void populate_mydir();

  void _create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin);
  void _create_system_file_finish(MutationRef& mut, CDentry *dn,
                                  version_t dpv, MDSInternalContextBase *fin);

  void open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *c);
  CDir *get_stray_dir(CInode *in);
  CDentry *get_or_create_stray_dentry(CInode *in);

  MDSInternalContextBase *_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin);

  /**
   * Find the given dentry (and whether it exists or not), its ancestors,
   * and get them all into memory and usable on this MDS.  This function
   * makes a best-effort attempt to load everything; if it needs to
   * go away and do something then it will put the request on a waitlist.
   * It prefers the mdr, then the req, then the fin.  (At least one of these
   * must be non-null.)
   *
   * @param mdr The MDRequest associated with the path.  Can be null.
   * @param req The Message associated with the path.  Can be null.
   * @param fin The Context associated with the path.  Can be null.
   * @param path The path to traverse to.
   * @param pdnvec Data return parameter -- on success, contains a
   * vector of dentries.  On failure, is either empty or contains the
   * full trace of traversable dentries.
   * @param pin Data return parameter -- if successful, points to the inode
   * associated with filepath.  If unsuccessful, is null.
   * @param onfail Specifies different lookup failure behaviors.  If set to
   * MDS_TRAVERSE_DISCOVERXLOCK, path_traverse will succeed on null
   * dentries (instead of returning -ENOENT).  If set to
   * MDS_TRAVERSE_FORWARD, it will forward the request to the auth
   * MDS if that becomes appropriate (i.e., if it doesn't know the contents
   * of a directory).  If set to MDS_TRAVERSE_DISCOVER, it
   * will attempt to look up the path from a different MDS (and bring them
   * into its cache as replicas).
   *
   * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise.
   * If it returns 1, the requester associated with this call has been placed
   * on the appropriate waitlist, and it should unwind itself and back out.
   * If it returns 2 the request has been forwarded, and again the requester
   * should unwind itself and back out.
   */
  int path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, const filepath& path,
                    vector<CDentry*> *pdnvec, CInode **pin, int onfail);
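
  // Illustrative caller pattern (a sketch; real callers live elsewhere,
  // e.g. in Server.cc), following the return-value contract above:
  //   vector<CDentry*> trace;
  //   CInode *in = NULL;
  //   int r = path_traverse(mdr, NULL, NULL, refpath, &trace, &in,
  //                         MDS_TRAVERSE_FORWARD);
  //   if (r > 0)
  //     return;    // 1: parked on a waitlist; 2: forwarded -- unwind
  //   if (r < 0)
  //     return;    // hard failure: reply to the requester with -errno
  //   // r == 0: 'trace' and 'in' are now usable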

  CInode *cache_traverse(const filepath& path);

  void open_remote_dirfrag(CInode *diri, frag_t fg, MDSInternalContextBase *fin);
  CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false);

  bool parallel_fetch(map<inodeno_t,filepath>& pathmap, set<inodeno_t>& missing);
  bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path,
                                   set<CDir*>& fetch_queue, set<inodeno_t>& missing,
                                   C_GatherBuilder &gather_bld);

  void open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin,
                          bool want_xlocked=false);
  void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin,
                                  bool want_xlocked, int r);

  void make_trace(vector<CDentry*>& trace, CInode *in);

protected:
  struct open_ino_info_t {
    vector<inode_backpointer_t> ancestors;
    set<mds_rank_t> checked;
    mds_rank_t checking;
    mds_rank_t auth_hint;
    bool check_peers;
    bool fetch_backtrace;
    bool discover;
    bool want_replica;
    bool want_xlocked;
    version_t tid;
    int64_t pool;
    int last_err;
    list<MDSInternalContextBase*> waiters;
    open_ino_info_t() : checking(MDS_RANK_NONE), auth_hint(MDS_RANK_NONE),
                        check_peers(true), fetch_backtrace(true), discover(false),
                        want_replica(false), want_xlocked(false), tid(0), pool(-1),
                        last_err(0) {}
  };
  ceph_tid_t open_ino_last_tid;
  map<inodeno_t,open_ino_info_t> opening_inodes;

  void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err);
  void _open_ino_parent_opened(inodeno_t ino, int ret);
  void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err);
  void _open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent);
  int open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m,
                            vector<inode_backpointer_t>& ancestors,
                            bool discover, bool want_xlocked, mds_rank_t *hint);
  void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err);
  void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err);
  void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info);
  void handle_open_ino(MMDSOpenIno *m, int err=0);
  void handle_open_ino_reply(MMDSOpenInoReply *m);
  friend class C_IO_MDC_OpenInoBacktraceFetched;
  friend struct C_MDC_OpenInoTraverseDir;
  friend struct C_MDC_OpenInoParentOpened;

public:
  void kick_open_ino_peers(mds_rank_t who);
  void open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase *fin,
                bool want_replica=true, bool want_xlocked=false);

  // -- find_ino_peer --
  struct find_ino_peer_info_t {
    inodeno_t ino;
    ceph_tid_t tid;
    MDSInternalContextBase *fin;
    mds_rank_t hint;
    mds_rank_t checking;
    set<mds_rank_t> checked;

    find_ino_peer_info_t() : tid(0), fin(NULL), hint(MDS_RANK_NONE), checking(MDS_RANK_NONE) {}
  };

  map<ceph_tid_t, find_ino_peer_info_t> find_ino_peer;
  ceph_tid_t find_ino_peer_last_tid;

  void find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint=MDS_RANK_NONE);
  void _do_find_ino_peer(find_ino_peer_info_t& fip);
  void handle_find_ino(MMDSFindIno *m);
  void handle_find_ino_reply(MMDSFindInoReply *m);
  void kick_find_ino_peers(mds_rank_t who);

  // -- snaprealms --
public:
  void snaprealm_create(MDRequestRef& mdr, CInode *in);
  void _snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in);

  // -- stray --
public:
  void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin);
  uint64_t get_num_strays() const { return stray_manager.get_num_strays(); }

protected:
  void scan_stray_dir(dirfrag_t next=dirfrag_t());
  StrayManager stray_manager;
  friend struct C_MDC_RetryScanStray;
  friend class C_IO_MDC_FetchedBacktrace;

  // == messages ==
 public:
  void dispatch(Message *m);

 protected:
  // -- replicas --
  void handle_discover(MDiscover *dis);
  void handle_discover_reply(MDiscoverReply *m);
  friend class C_MDC_Join;

public:
  void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl);
  void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl);
  void replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl,
                       uint64_t features);

  CDir* add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from, list<MDSInternalContextBase*>& finished);
  CDentry *add_replica_dentry(bufferlist::iterator& p, CDir *dir, list<MDSInternalContextBase*>& finished);
  CInode *add_replica_inode(bufferlist::iterator& p, CDentry *dn, list<MDSInternalContextBase*>& finished);

  void replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl);
  CDentry *add_replica_stray(bufferlist &bl, mds_rank_t from);

  // -- namespace --
public:
  void send_dentry_link(CDentry *dn, MDRequestRef& mdr);
  void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr);
protected:
  void handle_dentry_link(MDentryLink *m);
  void handle_dentry_unlink(MDentryUnlink *m);


  // -- fragmenting --
private:
  struct ufragment {
    int bits;
    bool committed;
    LogSegment *ls;
    list<MDSInternalContextBase*> waiters;
    list<frag_t> old_frags;
    bufferlist rollback;
    ufragment() : bits(0), committed(false), ls(NULL) {}
  };
  map<dirfrag_t, ufragment> uncommitted_fragments;

  struct fragment_info_t {
    int bits;
    list<CDir*> dirs;
    list<CDir*> resultfrags;
    MDRequestRef mdr;
    // for deadlock detection
    bool all_frozen;
    utime_t last_cum_auth_pins_change;
    int last_cum_auth_pins;
    int num_remote_waiters;  // number of remote authpin waiters
    fragment_info_t() : bits(0), all_frozen(false), last_cum_auth_pins(0), num_remote_waiters(0) {}
    bool is_fragmenting() { return !resultfrags.empty(); }
  };
  map<dirfrag_t,fragment_info_t> fragments;

  void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits,
                            list<CDir*>& frags, list<MDSInternalContextBase*>& waiters, bool replay);
  void adjust_dir_fragments(CInode *diri,
                            list<CDir*>& srcfrags,
                            frag_t basefrag, int bits,
                            list<CDir*>& resultfrags,
                            list<MDSInternalContextBase*>& waiters,
                            bool replay);
  CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true);
  void get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds);

  bool can_fragment(CInode *diri, list<CDir*>& dirs);
  void fragment_freeze_dirs(list<CDir*>& dirs);
  void fragment_mark_and_complete(MDRequestRef& mdr);
  void fragment_frozen(MDRequestRef& mdr, int r);
  void fragment_unmark_unfreeze_dirs(list<CDir*>& dirs);
  void dispatch_fragment_dir(MDRequestRef& mdr);
  void _fragment_logged(MDRequestRef& mdr);
  void _fragment_stored(MDRequestRef& mdr);
  void _fragment_committed(dirfrag_t f, list<CDir*>& resultfrags);
  void _fragment_finish(dirfrag_t f, list<CDir*>& resultfrags);

  friend class EFragment;
  friend class C_MDC_FragmentFrozen;
  friend class C_MDC_FragmentMarking;
  friend class C_MDC_FragmentPrep;
  friend class C_MDC_FragmentStore;
  friend class C_MDC_FragmentCommit;
  friend class C_IO_MDC_FragmentFinish;

  void handle_fragment_notify(MMDSFragmentNotify *m);

  void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list<frag_t>& old_frag,
                                LogSegment *ls, bufferlist *rollback=NULL);
  void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
  void rollback_uncommitted_fragment(dirfrag_t basedirfrag, list<frag_t>& old_frags);
public:
  void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSInternalContextBase *c) {
    assert(uncommitted_fragments.count(dirfrag));
    uncommitted_fragments[dirfrag].waiters.push_back(c);
  }
  void split_dir(CDir *dir, int byn);
  void merge_dir(CInode *diri, frag_t fg);
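  // e.g. (a sketch): split_dir(dir, 2) refragments one dirfrag into
  // 2^2 = 4 children, while merge_dir() coalesces the existing children
  // of 'fg' back into a single frag.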
  void rollback_uncommitted_fragments();

  void find_stale_fragment_freeze();
  void fragment_freeze_inc_num_waiters(CDir *dir);
  bool fragment_are_all_frozen(CDir *dir);
  int get_num_fragmenting_dirs() { return fragments.size(); }

  // -- updates --
  //int send_inode_updates(CInode *in);
  //void handle_inode_update(MInodeUpdate *m);

  int send_dir_updates(CDir *in, bool bcast=false);
  void handle_dir_update(MDirUpdate *m);

  // -- cache expiration --
  void handle_cache_expire(MCacheExpire *m);
  void process_delayed_expire(CDir *dir);
  void discard_delayed_expire(CDir *dir);

protected:
  int dump_cache(boost::string_view fn, Formatter *f,
                 boost::string_view dump_root = "",
                 int depth = -1);
public:
  int dump_cache() { return dump_cache(NULL, NULL); }
  int dump_cache(boost::string_view filename);
  int dump_cache(Formatter *f);
  int dump_cache(boost::string_view dump_root, int depth, Formatter *f);

  int cache_status(Formatter *f);

  void dump_resolve_status(Formatter *f) const;
  void dump_rejoin_status(Formatter *f) const;

  // == crap fns ==
 public:
  void show_cache();
  void show_subtrees(int dbl=10);

  CInode *hack_pick_random_inode() {
    assert(!inode_map.empty());
    int n = rand() % inode_map.size();
    auto p = inode_map.begin();
    while (n--) ++p;
    return p->second;
  }

protected:
  void flush_dentry_work(MDRequestRef& mdr);
  /**
   * Resolve path to a dentry and pass it onto the ScrubStack.
   *
   * TODO: return enough information to the original mdr formatter
   * and completion that they can subsequently check the progress of
   * this scrub (we won't block them on a whole scrub as it can take a very
   * long time)
   */
  void enqueue_scrub_work(MDRequestRef& mdr);
  void repair_inode_stats_work(MDRequestRef& mdr);
  void repair_dirfrag_stats_work(MDRequestRef& mdr);
  friend class C_MDC_RepairDirfragStats;
public:
  void flush_dentry(boost::string_view path, Context *fin);
  /**
   * Create and start an OP_ENQUEUE_SCRUB
   */
  void enqueue_scrub(boost::string_view path, boost::string_view tag,
                     bool force, bool recursive, bool repair,
                     Formatter *f, Context *fin);
  void repair_inode_stats(CInode *diri);
  void repair_dirfrag_stats(CDir *dir);

public:
  /* Because exports may fail, this set lets us keep track of inodes that need exporting. */
  std::set<CInode *> export_pin_queue;
};

class C_MDS_RetryRequest : public MDSInternalContext {
  MDCache *cache;
  MDRequestRef mdr;
 public:
  C_MDS_RetryRequest(MDCache *c, MDRequestRef& r);
  void finish(int r) override;
};
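
// Typical usage (a sketch, not from this header): request-handling code
// that must wait for cache state commonly re-queues itself with this
// context, e.g.
//   dir->fetch(new C_MDS_RetryRequest(mdcache, mdr));
// so the whole request is simply re-dispatched once the wait completes.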

#endif