]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/MDSRank.h
bump version to 12.2.11-pve1
[ceph.git] / ceph / src / mds / MDSRank.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#ifndef MDS_RANK_H_
16#define MDS_RANK_H_
17
94b18763
FG
18#include <boost/utility/string_view.hpp>
19
7c673cae
FG
20#include "common/DecayCounter.h"
21#include "common/LogClient.h"
22#include "common/Timer.h"
23#include "common/TrackedOp.h"
24
25#include "messages/MCommand.h"
26
27#include "Beacon.h"
28#include "DamageTable.h"
29#include "MDSMap.h"
30#include "SessionMap.h"
31#include "MDCache.h"
7c673cae
FG
32#include "MDLog.h"
33#include "PurgeQueue.h"
91327a77 34#include "Server.h"
7c673cae
FG
35#include "osdc/Journaler.h"
36
37// Full .h import instead of forward declaration for PerfCounter, for the
38// benefit of those including this header and using MDSRank::logger
39#include "common/perf_counters.h"
40
// Perf counter indices for the MDS rank, in the range
// (l_mds_first, l_mds_last).  l_mds_request is read back via
// MDSRank::get_num_requests(); the rest are write-only telemetry.
enum {
  l_mds_first = 2000,          // sentinel: start of the l_mds range
  l_mds_request,
  l_mds_reply,
  l_mds_reply_latency,
  l_mds_forward,
  l_mds_dir_fetch,
  l_mds_dir_commit,
  l_mds_dir_split,
  l_mds_dir_merge,
  l_mds_inode_max,
  l_mds_inodes,
  l_mds_inodes_top,
  l_mds_inodes_bottom,
  l_mds_inodes_pin_tail,
  l_mds_inodes_pinned,
  l_mds_inodes_expired,
  l_mds_inodes_with_caps,
  l_mds_caps,
  l_mds_subtrees,
  l_mds_traverse,
  l_mds_traverse_hit,
  l_mds_traverse_forward,
  l_mds_traverse_discover,
  l_mds_traverse_dir_fetch,
  l_mds_traverse_remote_ino,
  l_mds_traverse_lock,
  l_mds_load_cent,
  l_mds_dispatch_queue_len,
  l_mds_exported,
  l_mds_exported_inodes,
  l_mds_imported,
  l_mds_imported_inodes,
  l_mds_last,                  // sentinel: one past the final counter
};
76
// memory utilization
// Perf counter indices in the range (l_mdm_first, l_mdm_last).
// The ino/dir/dn/cap groups appear to follow a current/allocated/size
// triple pattern (e.g. l_mdm_ino / l_mdm_inoa / l_mdm_inos) -- confirm
// against the counter registration in the .cc file.
enum {
  l_mdm_first = 2500,          // sentinel: start of the l_mdm range
  l_mdm_ino,
  l_mdm_inoa,
  l_mdm_inos,
  l_mdm_dir,
  l_mdm_dira,
  l_mdm_dirs,
  l_mdm_dn,
  l_mdm_dna,
  l_mdm_dns,
  l_mdm_cap,
  l_mdm_capa,
  l_mdm_caps,
  l_mdm_rss,
  l_mdm_heap,
  l_mdm_buf,
  l_mdm_last,                  // sentinel: one past the final counter
};
97
namespace ceph {
  struct heartbeat_handle_d;
}

// Forward declarations for types MDSRank only holds pointers/references
// to; keeping these as forward declarations avoids pulling in the
// corresponding headers here.
class Locker;
class MDCache;
class MDLog;
class MDBalancer;
class InoTable;
class SnapServer;
class SnapClient;
class MDSTableServer;
class MDSTableClient;
class Messenger;
class Objecter;
class MonClient;
class Finisher;
class MMDSMap;
class ScrubStack;
class C_MDS_Send_Command_Reply;
7c673cae
FG
118
119/**
120 * The public part of this class's interface is what's exposed to all
121 * the various subsystems (server, mdcache, etc), such as pointers
122 * to the other subsystems, and message-sending calls.
123 */
124class MDSRank {
125 protected:
126 const mds_rank_t whoami;
127
128 // Incarnation as seen in MDSMap at the point where a rank is
129 // assigned.
130 int incarnation;
131
132 public:
f64942e4
AA
133
134 friend class C_Flush_Journal;
135 friend class C_Drop_Cache;
136
7c673cae
FG
137 mds_rank_t get_nodeid() const { return whoami; }
138 int64_t get_metadata_pool();
139
140 // Reference to global MDS::mds_lock, so that users of MDSRank don't
141 // carry around references to the outer MDS, and we can substitute
142 // a separate lock here in future potentially.
143 Mutex &mds_lock;
144
94b18763
FG
145 mono_time get_starttime() const {
146 return starttime;
147 }
148 chrono::duration<double> get_uptime() const {
149 mono_time now = mono_clock::now();
150 return chrono::duration<double>(now-starttime);
151 }
152
b32b8144
FG
153 class CephContext *cct;
154
7c673cae
FG
155 bool is_daemon_stopping() const;
156
157 // Reference to global cluster log client, just to avoid initialising
158 // a separate one here.
159 LogChannelRef &clog;
160
161 // Reference to global timer utility, because MDSRank and MDSDaemon
162 // currently both use the same mds_lock, so it makes sense for them
163 // to share a timer.
164 SafeTimer &timer;
165
166 MDSMap *&mdsmap;
167
168 Objecter *objecter;
169
170 // sub systems
171 Server *server;
172 MDCache *mdcache;
173 Locker *locker;
174 MDLog *mdlog;
175 MDBalancer *balancer;
176 ScrubStack *scrubstack;
177 DamageTable damage_table;
178
179
180 InoTable *inotable;
181
182 SnapServer *snapserver;
183 SnapClient *snapclient;
184
185 MDSTableClient *get_table_client(int t);
186 MDSTableServer *get_table_server(int t);
187
188 SessionMap sessionmap;
189 Session *get_session(client_t client) {
190 return sessionmap.get_session(entity_name_t::CLIENT(client.v));
191 }
94b18763 192 Session *get_session(Message *m);
7c673cae
FG
193
194 PerfCounters *logger, *mlogger;
195 OpTracker op_tracker;
196
197 // The last different state I held before current
198 MDSMap::DaemonState last_state;
199 // The state assigned to me by the MDSMap
200 MDSMap::DaemonState state;
201
202 bool cluster_degraded;
203
204 MDSMap::DaemonState get_state() const { return state; }
205 MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); }
206
207 bool is_creating() const { return state == MDSMap::STATE_CREATING; }
208 bool is_starting() const { return state == MDSMap::STATE_STARTING; }
209 bool is_standby() const { return state == MDSMap::STATE_STANDBY; }
210 bool is_replay() const { return state == MDSMap::STATE_REPLAY; }
211 bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; }
212 bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; }
213 bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; }
214 bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; }
215 bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; }
216 bool is_active() const { return state == MDSMap::STATE_ACTIVE; }
217 bool is_stopping() const { return state == MDSMap::STATE_STOPPING; }
218 bool is_any_replay() const { return (is_replay() || is_standby_replay()); }
219 bool is_stopped() const { return mdsmap->is_stopped(whoami); }
220 bool is_cluster_degraded() const { return cluster_degraded; }
221
222 void handle_write_error(int err);
223
224 void handle_conf_change(const struct md_config_t *conf,
225 const std::set <std::string> &changed)
226 {
91327a77
AA
227 mdcache->handle_conf_change(conf, changed, *mdsmap);
228 sessionmap.handle_conf_change(conf, changed);
229 server->handle_conf_change(conf, changed);
7c673cae
FG
230 purge_queue.handle_conf_change(conf, changed, *mdsmap);
231 }
232
c07f9fc5 233 void update_mlogger();
7c673cae
FG
234 protected:
235 // Flag to indicate we entered shutdown: anyone seeing this to be true
236 // after taking mds_lock must drop out.
237 bool stopping;
238
239 // PurgeQueue is only used by StrayManager, but it is owned by MDSRank
240 // because its init/shutdown happens at the top level.
241 PurgeQueue purge_queue;
242
243 class ProgressThread : public Thread {
244 MDSRank *mds;
245 Cond cond;
246 public:
247 explicit ProgressThread(MDSRank *mds_) : mds(mds_) {}
248 void * entry() override;
249 void shutdown();
250 void signal() {cond.Signal();}
251 } progress_thread;
252
253 list<Message*> waiting_for_nolaggy;
254 list<MDSInternalContextBase*> finished_queue;
255 // Dispatch, retry, queues
256 int dispatch_depth;
257 void inc_dispatch_depth() { ++dispatch_depth; }
258 void dec_dispatch_depth() { --dispatch_depth; }
259 void retry_dispatch(Message *m);
260 bool handle_deferrable_message(Message *m);
261 void _advance_queues();
262 bool _dispatch(Message *m, bool new_msg);
263
264 ceph::heartbeat_handle_d *hb; // Heartbeat for threads using mds_lock
265
266 bool is_stale_message(Message *m) const;
267
268 map<mds_rank_t, version_t> peer_mdsmap_epoch;
269
270 ceph_tid_t last_tid; // for mds-initiated requests (e.g. stray rename)
271
272 list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
28e407b8 273 list<MDSInternalContextBase*> waiting_for_any_client_connection;
7c673cae
FG
274 list<MDSInternalContextBase*> replay_queue;
275 map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer;
276 map<epoch_t, list<MDSInternalContextBase*> > waiting_for_mdsmap;
277
278 epoch_t osd_epoch_barrier;
279
280 // Const reference to the beacon so that we can behave differently
281 // when it's laggy.
282 Beacon &beacon;
283
284 /**
285 * Emit clog warnings for any ops reported as warnings by optracker
286 */
287 void check_ops_in_flight();
288
289 int mds_slow_req_count;
290
291 /**
292 * Share MDSMap with clients
293 */
294 void bcast_mds_map(); // to mounted clients
295 epoch_t last_client_mdsmap_bcast;
296
297 map<mds_rank_t,DecayCounter> export_targets; /* targets this MDS is exporting to or wants/tries to */
298
299 void create_logger();
300 public:
301
302 void queue_waiter(MDSInternalContextBase *c) {
303 finished_queue.push_back(c);
304 progress_thread.signal();
305 }
91327a77
AA
306 void queue_waiter_front(MDSInternalContextBase *c) {
307 finished_queue.push_back(c);
308 progress_thread.signal();
309 }
94b18763 310 void queue_waiters(std::list<MDSInternalContextBase*>& ls) {
7c673cae
FG
311 finished_queue.splice( finished_queue.end(), ls );
312 progress_thread.signal();
313 }
91327a77
AA
314 void queue_waiters_front(std::list<MDSInternalContextBase*>& ls) {
315 finished_queue.splice(finished_queue.begin(), ls);
316 progress_thread.signal();
317 }
7c673cae
FG
318
319 MDSRank(
320 mds_rank_t whoami_,
321 Mutex &mds_lock_,
322 LogChannelRef &clog_,
323 SafeTimer &timer_,
324 Beacon &beacon_,
325 MDSMap *& mdsmap_,
326 Messenger *msgr,
327 MonClient *monc_,
328 Context *respawn_hook_,
329 Context *suicide_hook_);
330
331 protected:
332 ~MDSRank();
333
334 public:
335
336 // Daemon lifetime functions: these guys break the abstraction
337 // and call up into the parent MDSDaemon instance. It's kind
338 // of unavoidable: if we want any depth into our calls
339 // to be able to e.g. tear down the whole process, we have to
340 // have a reference going all the way down.
341 // >>>
342 void suicide();
343 void respawn();
344 // <<<
345
346 /**
347 * Call this periodically if inside a potentially long running piece
348 * of code while holding the mds_lock
349 */
350 void heartbeat_reset();
351
352 /**
353 * Report state DAMAGED to the mon, and then pass on to respawn(). Call
354 * this when an unrecoverable error is encountered while attempting
355 * to load an MDS rank's data structures. This is *not* for use with
356 * errors affecting normal dirfrag/inode objects -- they should be handled
357 * through cleaner scrub/repair mechanisms.
358 *
359 * Callers must already hold mds_lock.
360 */
361 void damaged();
362
363 /**
364 * Wrapper around `damaged` for users who are not
365 * already holding mds_lock.
366 *
367 * Callers must not already hold mds_lock.
368 */
369 void damaged_unlocked();
370
91327a77
AA
371 double last_cleared_laggy() const {
372 return beacon.last_cleared_laggy();
373 }
374
375 double get_dispatch_queue_max_age(utime_t now) const;
7c673cae
FG
376
377 void send_message_mds(Message *m, mds_rank_t mds);
378 void forward_message_mds(Message *req, mds_rank_t mds);
379
380 void send_message_client_counted(Message *m, client_t client);
381 void send_message_client_counted(Message *m, Session *session);
382 void send_message_client_counted(Message *m, Connection *connection);
383 void send_message_client_counted(Message *m, const ConnectionRef& con) {
384 send_message_client_counted(m, con.get());
385 }
386 void send_message_client(Message *m, Session *session);
387 void send_message(Message *m, Connection *c);
388 void send_message(Message *m, const ConnectionRef& c) {
389 send_message(m, c.get());
390 }
391
392 void wait_for_active_peer(mds_rank_t who, MDSInternalContextBase *c) {
393 waiting_for_active_peer[who].push_back(c);
394 }
395 void wait_for_cluster_recovered(MDSInternalContextBase *c) {
396 assert(cluster_degraded);
397 waiting_for_active_peer[MDS_RANK_NONE].push_back(c);
398 }
399
28e407b8
AA
400 void wait_for_any_client_connection(MDSInternalContextBase *c) {
401 waiting_for_any_client_connection.push_back(c);
402 }
403 void kick_waiters_for_any_client_connection(void) {
404 finish_contexts(g_ceph_context, waiting_for_any_client_connection);
405 }
7c673cae
FG
406 void wait_for_active(MDSInternalContextBase *c) {
407 waiting_for_active.push_back(c);
408 }
409 void wait_for_replay(MDSInternalContextBase *c) {
410 waiting_for_replay.push_back(c);
411 }
412 void wait_for_reconnect(MDSInternalContextBase *c) {
413 waiting_for_reconnect.push_back(c);
414 }
415 void wait_for_resolve(MDSInternalContextBase *c) {
416 waiting_for_resolve.push_back(c);
417 }
418 void wait_for_mdsmap(epoch_t e, MDSInternalContextBase *c) {
419 waiting_for_mdsmap[e].push_back(c);
420 }
421 void enqueue_replay(MDSInternalContextBase *c) {
422 replay_queue.push_back(c);
423 }
424
425 bool queue_one_replay();
426
427 void set_osd_epoch_barrier(epoch_t e);
428 epoch_t get_osd_epoch_barrier() const {return osd_epoch_barrier;}
429 epoch_t get_osd_epoch() const;
430
431 ceph_tid_t issue_tid() { return ++last_tid; }
432
433 Finisher *finisher;
434
435 MDSMap *get_mds_map() { return mdsmap; }
436
28e407b8 437 uint64_t get_num_requests() const { return logger->get(l_mds_request); }
7c673cae
FG
438
439 int get_mds_slow_req_count() const { return mds_slow_req_count; }
440
441 void dump_status(Formatter *f) const;
442
443 void hit_export_target(utime_t now, mds_rank_t rank, double amount=-1.0);
444 bool is_export_target(mds_rank_t rank) {
445 const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
446 return map_targets.count(rank);
447 }
448
31f18b77
FG
449 bool evict_client(int64_t session_id, bool wait, bool blacklist,
450 std::stringstream& ss, Context *on_killed=nullptr);
451
7c673cae
FG
452 protected:
453 void dump_clientreplay_status(Formatter *f) const;
94b18763
FG
454 void command_scrub_path(Formatter *f, boost::string_view path, vector<string>& scrubop_vec);
455 void command_tag_path(Formatter *f, boost::string_view path,
456 boost::string_view tag);
457 void command_flush_path(Formatter *f, boost::string_view path);
7c673cae
FG
458 void command_flush_journal(Formatter *f);
459 void command_get_subtrees(Formatter *f);
460 void command_export_dir(Formatter *f,
94b18763 461 boost::string_view path, mds_rank_t dest);
7c673cae
FG
462 bool command_dirfrag_split(
463 cmdmap_t cmdmap,
464 std::ostream &ss);
465 bool command_dirfrag_merge(
466 cmdmap_t cmdmap,
467 std::ostream &ss);
468 bool command_dirfrag_ls(
469 cmdmap_t cmdmap,
470 std::ostream &ss,
471 Formatter *f);
94b18763 472 int _command_export_dir(boost::string_view path, mds_rank_t dest);
7c673cae
FG
473 CDir *_command_dirfrag_get(
474 const cmdmap_t &cmdmap,
475 std::ostream &ss);
476
f64942e4
AA
477 void cache_drop_send_reply(Formatter *f, C_MDS_Send_Command_Reply *reply, int r);
478 void command_cache_drop(uint64_t timeout, Formatter *f, Context *on_finish);
479
7c673cae
FG
480 protected:
481 Messenger *messenger;
482 MonClient *monc;
483
484 Context *respawn_hook;
485 Context *suicide_hook;
486
487 // Friended to access retry_dispatch
488 friend class C_MDS_RetryMessage;
489
490 // FIXME the state machine logic should be separable from the dispatch
491 // logic that calls it.
492 // >>>
493 void calc_recovery_set();
494 void request_state(MDSMap::DaemonState s);
495
496 bool standby_replaying; // true if current replay pass is in standby-replay mode
497
498 typedef enum {
499 // The MDSMap is available, configure default layouts and structures
500 MDS_BOOT_INITIAL = 0,
501 // We are ready to open some inodes
502 MDS_BOOT_OPEN_ROOT,
503 // We are ready to do a replay if needed
504 MDS_BOOT_PREPARE_LOG,
505 // Replay is complete
506 MDS_BOOT_REPLAY_DONE
507 } BootStep;
508 friend class C_MDS_BootStart;
509 friend class C_MDS_InternalBootStart;
510 void boot_create(); // i am new mds.
511 void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0); // starting|replay
512
513 void replay_start();
514 void creating_done();
515 void starting_done();
516 void replay_done();
517 void standby_replay_restart();
518 void _standby_replay_restart_finish(int r, uint64_t old_read_pos);
519 class C_MDS_StandbyReplayRestart;
520 class C_MDS_StandbyReplayRestartFinish;
521
522 void reopen_log();
523
524 void resolve_start();
525 void resolve_done();
526 void reconnect_start();
527 void reconnect_done();
528 void rejoin_joint_start();
529 void rejoin_start();
530 void rejoin_done();
531 void recovery_done(int oldstate);
532 void clientreplay_start();
533 void clientreplay_done();
534 void active_start();
535 void stopping_start();
536 void stopping_done();
537
538 void validate_sessions();
539 // <<<
540
541 // >>>
542 void handle_mds_recovery(mds_rank_t who);
543 void handle_mds_failure(mds_rank_t who);
544 // <<<
545
546 /* Update MDSMap export_targets for this rank. Called on ::tick(). */
547 void update_targets(utime_t now);
94b18763
FG
548
549private:
550 mono_time starttime = mono_clock::zero();
7c673cae
FG
551};
552
553/* This expects to be given a reference which it is responsible for.
554 * The finish function calls functions which
555 * will put the Message exactly once.*/
556class C_MDS_RetryMessage : public MDSInternalContext {
557protected:
558 Message *m;
559public:
560 C_MDS_RetryMessage(MDSRank *mds, Message *m)
561 : MDSInternalContext(mds)
562 {
563 assert(m);
564 this->m = m;
565 }
566 void finish(int r) override {
567 mds->retry_dispatch(m);
568 }
569};
570
/**
 * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e.
 * the service/dispatcher stuff like init/shutdown that subsystems should
 * never touch.
 */
class MDSRankDispatcher : public MDSRank
{
public:
  void init();
  void tick();
  void shutdown();
  // Admin-socket command entry point; writes output via f/ss.
  bool handle_asok_command(std::string command, cmdmap_t& cmdmap,
                 Formatter *f, std::ostream& ss);
  void handle_mds_map(MMDSMap *m, MDSMap *oldmap);
  void handle_osd_map();
  void update_log_config();

  // Mon "tell"-style command entry point.  Outputs go to *ds/*ss, the
  // result code to *r; *need_reply tells the caller whether to reply to
  // `m`, and *run_later may receive a continuation to run after reply.
  // (Out-parameter semantics inferred from names -- confirm in the .cc.)
  bool handle_command(
    const cmdmap_t &cmdmap,
    MCommand *m,
    int *r,
    std::stringstream *ds,
    std::stringstream *ss,
    Context **run_later,
    bool *need_reply);

  void dump_sessions(const SessionFilter &filter, Formatter *f) const;
  void evict_clients(const SessionFilter &filter, MCommand *m);

  // Call into me from MDS::ms_dispatch
  bool ms_dispatch(Message *m);

  // Forwards all construction parameters to the MDSRank base.
  MDSRankDispatcher(
    mds_rank_t whoami_,
    Mutex &mds_lock_,
    LogChannelRef &clog_,
    SafeTimer &timer_,
    Beacon &beacon_,
    MDSMap *& mdsmap_,
    Messenger *msgr,
    MonClient *monc_,
    Context *respawn_hook_,
    Context *suicide_hook_);
};
615
// This utility for MDS and MDSRank dispatchers.
// Expects a local `Message *m` in the expanding scope.  If the sending
// connection's peer type is not in the `peers` bitmask, logs the rejection,
// puts the message, and does `return true` FROM THE ENCLOSING DISPATCH
// FUNCTION (i.e. "message consumed") -- so this macro is only safe inside
// a dispatcher returning bool.
#define ALLOW_MESSAGES_FROM(peers) \
do { \
  if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
    dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \
           << " allowing=" << #peers << " message=" << *m << dendl; \
    m->put(); \
    return true; \
  } \
} while (0)
626
627#endif // MDS_RANK_H_
628