// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2015 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#ifndef MDS_RANK_H_
#define MDS_RANK_H_

#include "common/DecayCounter.h"
#include "common/LogClient.h"
#include "common/Timer.h"
#include "common/TrackedOp.h"

#include "messages/MCommand.h"

#include "Beacon.h"
#include "DamageTable.h"
#include "MDSMap.h"
#include "SessionMap.h"
#include "MDCache.h"
#include "Migrator.h"
#include "MDLog.h"
#include "PurgeQueue.h"
#include "osdc/Journaler.h"
// Full .h import instead of forward declaration for PerfCounters, for the
// benefit of those including this header and using MDSRank::logger
#include "common/perf_counters.h"

enum {
  l_mds_first = 2000,
  l_mds_request,
  l_mds_reply,
  l_mds_reply_latency,
  l_mds_forward,
  l_mds_dir_fetch,
  l_mds_dir_commit,
  l_mds_dir_split,
  l_mds_dir_merge,
  l_mds_inode_max,
  l_mds_inodes,
  l_mds_inodes_top,
  l_mds_inodes_bottom,
  l_mds_inodes_pin_tail,
  l_mds_inodes_pinned,
  l_mds_inodes_expired,
  l_mds_inodes_with_caps,
  l_mds_caps,
  l_mds_subtrees,
  l_mds_traverse,
  l_mds_traverse_hit,
  l_mds_traverse_forward,
  l_mds_traverse_discover,
  l_mds_traverse_dir_fetch,
  l_mds_traverse_remote_ino,
  l_mds_traverse_lock,
  l_mds_load_cent,
  l_mds_dispatch_queue_len,
  l_mds_exported,
  l_mds_exported_inodes,
  l_mds_imported,
  l_mds_imported_inodes,
  l_mds_last,
};
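
// A sketch of how these counters are wired up (the real registration lives
// in MDSRank::create_logger() in MDSRank.cc; the names below are only
// illustrative of the PerfCountersBuilder pattern):
//
//   PerfCountersBuilder pcb(cct, "mds", l_mds_first, l_mds_last);
//   pcb.add_u64_counter(l_mds_request, "request", "Requests");
//   logger = pcb.create_perf_counters();
//   cct->get_perfcounters_collection()->add(logger);
//   ...
//   logger->inc(l_mds_request);   // bumped once per client request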

// memory utilization
enum {
  l_mdm_first = 2500,
  l_mdm_ino,
  l_mdm_inoa,
  l_mdm_inos,
  l_mdm_dir,
  l_mdm_dira,
  l_mdm_dirs,
  l_mdm_dn,
  l_mdm_dna,
  l_mdm_dns,
  l_mdm_cap,
  l_mdm_capa,
  l_mdm_caps,
  l_mdm_rss,
  l_mdm_heap,
  l_mdm_buf,
  l_mdm_last,
};

namespace ceph {
  struct heartbeat_handle_d;
}

class Server;
class Locker;
class MDCache;
class MDLog;
class MDBalancer;
class InoTable;
class SnapServer;
class SnapClient;
class MDSTableServer;
class MDSTableClient;
class Messenger;
class Objecter;
class MonClient;
class Finisher;
class MMDSMap;
class ScrubStack;

/**
 * The public part of this class's interface is what's exposed to all
 * the various subsystems (server, mdcache, etc), such as pointers
 * to the other subsystems, and message-sending calls.
 */
class MDSRank {
  protected:
    const mds_rank_t whoami;

    // Incarnation as seen in MDSMap at the point where a rank is
    // assigned.
    int incarnation;

  public:
    mds_rank_t get_nodeid() const { return whoami; }
    int64_t get_metadata_pool();
    // Reference to global MDS::mds_lock, so that users of MDSRank don't
    // carry around references to the outer MDS, and so that we could
    // potentially substitute a separate lock here in the future.
    Mutex &mds_lock;

    class CephContext *cct;

    bool is_daemon_stopping() const;

    // Reference to global cluster log client, just to avoid initialising
    // a separate one here.
    LogChannelRef &clog;

    // Reference to global timer utility, because MDSRank and MDSDaemon
    // currently both use the same mds_lock, so it makes sense for them
    // to share a timer.
    SafeTimer &timer;

    MDSMap *&mdsmap;

    Objecter *objecter;

    // sub systems
    Server *server;
    MDCache *mdcache;
    Locker *locker;
    MDLog *mdlog;
    MDBalancer *balancer;
    ScrubStack *scrubstack;
    DamageTable damage_table;

    InoTable *inotable;

    SnapServer *snapserver;
    SnapClient *snapclient;

    MDSTableClient *get_table_client(int t);
    MDSTableServer *get_table_server(int t);

    SessionMap sessionmap;
    Session *get_session(client_t client) {
      return sessionmap.get_session(entity_name_t::CLIENT(client.v));
    }

    PerfCounters *logger, *mlogger;
    OpTracker op_tracker;

    // The last different state I held before the current one
    MDSMap::DaemonState last_state;
    // The state assigned to me by the MDSMap
    MDSMap::DaemonState state;

    bool cluster_degraded;

    MDSMap::DaemonState get_state() const { return state; }
    MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); }

    bool is_creating() const { return state == MDSMap::STATE_CREATING; }
    bool is_starting() const { return state == MDSMap::STATE_STARTING; }
    bool is_standby() const { return state == MDSMap::STATE_STANDBY; }
    bool is_replay() const { return state == MDSMap::STATE_REPLAY; }
    bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; }
    bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; }
    bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; }
    bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; }
    bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; }
    bool is_active() const { return state == MDSMap::STATE_ACTIVE; }
    bool is_stopping() const { return state == MDSMap::STATE_STOPPING; }
    bool is_any_replay() const { return (is_replay() || is_standby_replay()); }
    bool is_stopped() const { return mdsmap->is_stopped(whoami); }
    bool is_cluster_degraded() const { return cluster_degraded; }

    void handle_write_error(int err);

    void handle_conf_change(const struct md_config_t *conf,
                            const std::set <std::string> &changed)
    {
      purge_queue.handle_conf_change(conf, changed, *mdsmap);
    }

    void update_mlogger();
  protected:
    // Flag to indicate we entered shutdown: anyone who sees this as true
    // after taking mds_lock must drop out.
    bool stopping;

    // PurgeQueue is only used by StrayManager, but it is owned by MDSRank
    // because its init/shutdown happens at the top level.
    PurgeQueue purge_queue;

    class ProgressThread : public Thread {
      MDSRank *mds;
      Cond cond;
    public:
      explicit ProgressThread(MDSRank *mds_) : mds(mds_) {}
      void *entry() override;
      void shutdown();
      void signal() { cond.Signal(); }
    } progress_thread;
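    // (A sketch of the flow, implemented in MDSRank.cc rather than here:
    // entry() loops under mds_lock waiting on `cond`, and each signal()
    // wakes it to drain finished_queue and the waiting_for_nolaggy list
    // declared below.)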

    list<Message*> waiting_for_nolaggy;
    list<MDSInternalContextBase*> finished_queue;
    // Dispatch, retry, queues
    int dispatch_depth;
    void inc_dispatch_depth() { ++dispatch_depth; }
    void dec_dispatch_depth() { --dispatch_depth; }
    void retry_dispatch(Message *m);
    bool handle_deferrable_message(Message *m);
    void _advance_queues();
    bool _dispatch(Message *m, bool new_msg);

    ceph::heartbeat_handle_d *hb;  // Heartbeat for threads using mds_lock

    bool is_stale_message(Message *m) const;

    map<mds_rank_t, version_t> peer_mdsmap_epoch;

    ceph_tid_t last_tid;    // for mds-initiated requests (e.g. stray rename)

    list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
    list<MDSInternalContextBase*> replay_queue;
    map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer;
    map<epoch_t, list<MDSInternalContextBase*> > waiting_for_mdsmap;

    epoch_t osd_epoch_barrier;

    // Const reference to the beacon so that we can behave differently
    // when it's laggy.
    Beacon &beacon;

    /**
     * Emit clog warnings for any ops reported as warnings by optracker
     */
    void check_ops_in_flight();

    int mds_slow_req_count;

    /**
     * Share MDSMap with clients
     */
    void bcast_mds_map();  // to mounted clients
    epoch_t last_client_mdsmap_bcast;

    map<mds_rank_t,DecayCounter> export_targets; /* targets this MDS is exporting to or wants/tries to */

    void create_logger();
  public:

    void queue_waiter(MDSInternalContextBase *c) {
      finished_queue.push_back(c);
      progress_thread.signal();
    }
    void queue_waiters(list<MDSInternalContextBase*>& ls) {
      finished_queue.splice(finished_queue.end(), ls);
      progress_thread.signal();
    }
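    // Illustrative use of queue_waiters (a sketch; the real call sites live
    // in the subsystems): gather contexts, then hand them over in one splice.
    //
    //   list<MDSInternalContextBase*> finished;
    //   // ... collect waiters into `finished` ...
    //   mds->queue_waiters(finished);   // wakes the progress thread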

    MDSRank(
        mds_rank_t whoami_,
        Mutex &mds_lock_,
        LogChannelRef &clog_,
        SafeTimer &timer_,
        Beacon &beacon_,
        MDSMap *& mdsmap_,
        Messenger *msgr,
        MonClient *monc_,
        Context *respawn_hook_,
        Context *suicide_hook_);

  protected:
    ~MDSRank();

  public:

    // Daemon lifetime functions: these break the abstraction and call up
    // into the parent MDSDaemon instance. That is hard to avoid: if calls
    // at any depth are to be able to e.g. tear down the whole process, a
    // reference has to go all the way down.
    // >>>
    void suicide();
    void respawn();
    // <<<

    /**
     * Call this periodically if inside a potentially long-running piece
     * of code while holding the mds_lock
     */
    void heartbeat_reset();
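    // Illustrative calling pattern (hypothetical helpers, shown only to
    // demonstrate the contract):
    //
    //   while (have_more_work()) {   // long-running loop, mds_lock held
    //     do_one_step();
    //     heartbeat_reset();         // keep the heartbeat from timing out
    //   }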

    /**
     * Report state DAMAGED to the mon, and then pass on to respawn(). Call
     * this when an unrecoverable error is encountered while attempting
     * to load an MDS rank's data structures. This is *not* for use with
     * errors affecting normal dirfrag/inode objects -- they should be handled
     * through cleaner scrub/repair mechanisms.
     *
     * Callers must already hold mds_lock.
     */
    void damaged();

    /**
     * Wrapper around `damaged` for users who are not
     * already holding mds_lock.
     *
     * Callers must not already hold mds_lock.
     */
    void damaged_unlocked();

    utime_t get_laggy_until() const;

    void send_message_mds(Message *m, mds_rank_t mds);
    void forward_message_mds(Message *req, mds_rank_t mds);

    void send_message_client_counted(Message *m, client_t client);
    void send_message_client_counted(Message *m, Session *session);
    void send_message_client_counted(Message *m, Connection *connection);
    void send_message_client_counted(Message *m, const ConnectionRef& con) {
      send_message_client_counted(m, con.get());
    }
    void send_message_client(Message *m, Session *session);
    void send_message(Message *m, Connection *c);
    void send_message(Message *m, const ConnectionRef& c) {
      send_message(m, c.get());
    }

    void wait_for_active_peer(mds_rank_t who, MDSInternalContextBase *c) {
      waiting_for_active_peer[who].push_back(c);
    }
    void wait_for_cluster_recovered(MDSInternalContextBase *c) {
      assert(cluster_degraded);
      waiting_for_active_peer[MDS_RANK_NONE].push_back(c);
    }

    void wait_for_active(MDSInternalContextBase *c) {
      waiting_for_active.push_back(c);
    }
    void wait_for_replay(MDSInternalContextBase *c) {
      waiting_for_replay.push_back(c);
    }
    void wait_for_reconnect(MDSInternalContextBase *c) {
      waiting_for_reconnect.push_back(c);
    }
    void wait_for_resolve(MDSInternalContextBase *c) {
      waiting_for_resolve.push_back(c);
    }
    void wait_for_mdsmap(epoch_t e, MDSInternalContextBase *c) {
      waiting_for_mdsmap[e].push_back(c);
    }
    void enqueue_replay(MDSInternalContextBase *c) {
      replay_queue.push_back(c);
    }
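    // Illustrative pattern for these wait_for_* queues (a sketch of how
    // message handlers elsewhere in the MDS use them): a handler that
    // cannot make progress yet parks its message for re-dispatch.
    //
    //   if (!is_active()) {
    //     wait_for_active(new C_MDS_RetryMessage(this, m));
    //     return;
    //   }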

    bool queue_one_replay();

    void set_osd_epoch_barrier(epoch_t e);
    epoch_t get_osd_epoch_barrier() const { return osd_epoch_barrier; }
    epoch_t get_osd_epoch() const;

    ceph_tid_t issue_tid() { return ++last_tid; }

    Finisher *finisher;

    MDSMap *get_mds_map() { return mdsmap; }

    int get_req_rate() const { return logger->get(l_mds_request); }

    int get_mds_slow_req_count() const { return mds_slow_req_count; }

    void dump_status(Formatter *f) const;

    void hit_export_target(utime_t now, mds_rank_t rank, double amount=-1.0);
    bool is_export_target(mds_rank_t rank) {
      const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
      return map_targets.count(rank);
    }

    bool evict_client(int64_t session_id, bool wait, bool blacklist,
                      std::stringstream& ss, Context *on_killed=nullptr);
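    // Illustrative call (hypothetical session id): synchronously evict a
    // session and blacklist the client, collecting error text in `ss`.
    //
    //   std::stringstream ss;
    //   bool ok = evict_client(4242, true /* wait */, true /* blacklist */, ss);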

  protected:
    void dump_clientreplay_status(Formatter *f) const;
    void command_scrub_path(Formatter *f, const string& path, vector<string>& scrubop_vec);
    void command_tag_path(Formatter *f, const string& path,
                          const string &tag);
    void command_flush_path(Formatter *f, const string& path);
    void command_flush_journal(Formatter *f);
    void command_get_subtrees(Formatter *f);
    void command_export_dir(Formatter *f,
        const std::string &path, mds_rank_t dest);
    bool command_dirfrag_split(
        cmdmap_t cmdmap,
        std::ostream &ss);
    bool command_dirfrag_merge(
        cmdmap_t cmdmap,
        std::ostream &ss);
    bool command_dirfrag_ls(
        cmdmap_t cmdmap,
        std::ostream &ss,
        Formatter *f);
    int _command_export_dir(const std::string &path, mds_rank_t dest);
    int _command_flush_journal(std::stringstream *ss);
    CDir *_command_dirfrag_get(
        const cmdmap_t &cmdmap,
        std::ostream &ss);

  protected:
    Messenger *messenger;
    MonClient *monc;

    Context *respawn_hook;
    Context *suicide_hook;

    // Friended to access retry_dispatch
    friend class C_MDS_RetryMessage;

    // FIXME the state machine logic should be separable from the dispatch
    // logic that calls it.
    // >>>
    void calc_recovery_set();
    void request_state(MDSMap::DaemonState s);

    bool standby_replaying;  // true if current replay pass is in standby-replay mode

    typedef enum {
      // The MDSMap is available, configure default layouts and structures
      MDS_BOOT_INITIAL = 0,
      // We are ready to open some inodes
      MDS_BOOT_OPEN_ROOT,
      // We are ready to do a replay if needed
      MDS_BOOT_PREPARE_LOG,
      // Replay is complete
      MDS_BOOT_REPLAY_DONE
    } BootStep;
    friend class C_MDS_BootStart;
    friend class C_MDS_InternalBootStart;
    void boot_create();   // I am a new MDS.
    void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0);   // starting|replay

    void replay_start();
    void creating_done();
    void starting_done();
    void replay_done();
    void standby_replay_restart();
    void _standby_replay_restart_finish(int r, uint64_t old_read_pos);
    class C_MDS_StandbyReplayRestart;
    class C_MDS_StandbyReplayRestartFinish;

    void reopen_log();

    void resolve_start();
    void resolve_done();
    void reconnect_start();
    void reconnect_done();
    void rejoin_joint_start();
    void rejoin_start();
    void rejoin_done();
    void recovery_done(int oldstate);
    void clientreplay_start();
    void clientreplay_done();
    void active_start();
    void stopping_start();
    void stopping_done();

    void validate_sessions();
    // <<<

    // >>>
    void handle_mds_recovery(mds_rank_t who);
    void handle_mds_failure(mds_rank_t who);
    // <<<

    /* Update MDSMap export_targets for this rank. Called on ::tick(). */
    void update_targets(utime_t now);
};

/* This expects to be given a reference which it is responsible for.
 * The finish function calls functions which
 * will put the Message exactly once. */
class C_MDS_RetryMessage : public MDSInternalContext {
protected:
  Message *m;
public:
  C_MDS_RetryMessage(MDSRank *mds, Message *m)
    : MDSInternalContext(mds)
  {
    assert(m);
    this->m = m;
  }
  void finish(int r) override {
    mds->retry_dispatch(m);
  }
};
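
// Illustrative note (not from the original source): because the context
// takes over the caller's reference to the message, a typical site hands
// the message off and returns immediately, e.g. parking a replayed op:
//
//   mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));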

/**
 * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e.
 * the service/dispatcher stuff like init/shutdown that subsystems should
 * never touch.
 */
class MDSRankDispatcher : public MDSRank
{
public:
  void init();
  void tick();
  void shutdown();
  bool handle_asok_command(std::string command, cmdmap_t& cmdmap,
                           Formatter *f, std::ostream& ss);
  void handle_mds_map(MMDSMap *m, MDSMap *oldmap);
  void handle_osd_map();
  void update_log_config();

  bool handle_command(
    const cmdmap_t &cmdmap,
    MCommand *m,
    int *r,
    std::stringstream *ds,
    std::stringstream *ss,
    bool *need_reply);

  void dump_sessions(const SessionFilter &filter, Formatter *f) const;
  void evict_clients(const SessionFilter &filter, MCommand *m);

  // Call into me from MDS::ms_dispatch
  bool ms_dispatch(Message *m);

  MDSRankDispatcher(
      mds_rank_t whoami_,
      Mutex &mds_lock_,
      LogChannelRef &clog_,
      SafeTimer &timer_,
      Beacon &beacon_,
      MDSMap *& mdsmap_,
      Messenger *msgr,
      MonClient *monc_,
      Context *respawn_hook_,
      Context *suicide_hook_);
};

// This utility is for the MDS and MDSRank dispatchers.
#define ALLOW_MESSAGES_FROM(peers) \
  do { \
    if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
      dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \
              << " allowing=" << #peers << " message=" << *m << dendl; \
      m->put(); \
      return true; \
    } \
  } while (0)
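
// Illustrative expansion site (a sketch; the real call sites are in the
// dispatchers' .cc files and require a Message *m in scope):
//
//   bool MDSDaemon::handle_some_message(Message *m)   // hypothetical name
//   {
//     ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_MDS);
//     // ... handle m; reaching here means the peer type was allowed ...
//     return true;
//   }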

#endif // MDS_RANK_H_