]> git.proxmox.com Git - ceph.git/blame - ceph/src/mds/MDSRank.h
bump version to 12.1.1-pve1 while rebasing patches
[ceph.git] / ceph / src / mds / MDSRank.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2015 Red Hat
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#ifndef MDS_RANK_H_
16#define MDS_RANK_H_
17
18#include "common/DecayCounter.h"
19#include "common/LogClient.h"
20#include "common/Timer.h"
21#include "common/TrackedOp.h"
22
23#include "messages/MCommand.h"
24
25#include "Beacon.h"
26#include "DamageTable.h"
27#include "MDSMap.h"
28#include "SessionMap.h"
29#include "MDCache.h"
30#include "Migrator.h"
31#include "MDLog.h"
32#include "PurgeQueue.h"
33#include "osdc/Journaler.h"
34
35// Full .h import instead of forward declaration for PerfCounter, for the
36// benefit of those including this header and using MDSRank::logger
37#include "common/perf_counters.h"
38
// Performance counters for the MDS rank proper (requests, cache,
// traversal, import/export).  Registered by MDSRank::create_logger()
// under the l_mds_first..l_mds_last range.
enum {
  l_mds_first = 2000,
  l_mds_request,              // client requests received
  l_mds_reply,                // replies sent
  l_mds_reply_latency,        // request->reply latency
  l_mds_forward,              // requests forwarded to another rank
  l_mds_dir_fetch,            // dirfrag fetches from the metadata pool
  l_mds_dir_commit,           // dirfrag commits to the metadata pool
  l_mds_dir_split,
  l_mds_dir_merge,
  l_mds_inode_max,            // configured cache size limit
  l_mds_inodes,               // inodes currently in cache
  l_mds_inodes_top,
  l_mds_inodes_bottom,
  l_mds_inodes_pin_tail,
  l_mds_inodes_pinned,
  l_mds_inodes_expired,
  l_mds_inodes_with_caps,
  l_mds_caps,                 // client capabilities outstanding
  l_mds_subtrees,
  l_mds_traverse,             // path traversals and their outcomes:
  l_mds_traverse_hit,
  l_mds_traverse_forward,
  l_mds_traverse_discover,
  l_mds_traverse_dir_fetch,
  l_mds_traverse_remote_ino,
  l_mds_traverse_lock,
  l_mds_load_cent,
  l_mds_dispatch_queue_len,
  l_mds_exported,             // subtree exports (and inodes therein)
  l_mds_exported_inodes,
  l_mds_imported,             // subtree imports (and inodes therein)
  l_mds_imported_inodes,
  l_mds_last,
};
74
// Memory-utilization counters (registered separately from the main
// perf counters above, under the l_mdm_first..l_mdm_last range).
// The *a/*s suffixed entries appear to track allocations/size per
// object class -- NOTE(review): confirm against create_logger().
enum {
  l_mdm_first = 2500,
  l_mdm_ino,      // CInode count
  l_mdm_inoa,
  l_mdm_inos,
  l_mdm_dir,      // CDir count
  l_mdm_dira,
  l_mdm_dirs,
  l_mdm_dn,       // CDentry count
  l_mdm_dna,
  l_mdm_dns,
  l_mdm_cap,      // Capability count
  l_mdm_capa,
  l_mdm_caps,
  l_mdm_rss,      // process resident set size
  l_mdm_heap,     // heap usage
  l_mdm_buf,      // buffer usage
  l_mdm_last,
};
95
// Forward declarations: MDSRank holds only pointers/references to these
// subsystems, so the full definitions are not needed in this header.
namespace ceph {
  struct heartbeat_handle_d;
}

class Server;
class Locker;
class MDCache;
class MDLog;
class MDBalancer;
class InoTable;
class SnapServer;
class SnapClient;
class MDSTableServer;
class MDSTableClient;
class Messenger;
class Objecter;
class MonClient;
class Finisher;
class MMDSMap;
class ScrubStack;
116
117/**
118 * The public part of this class's interface is what's exposed to all
119 * the various subsystems (server, mdcache, etc), such as pointers
120 * to the other subsystems, and message-sending calls.
121 */
122class MDSRank {
123 protected:
124 const mds_rank_t whoami;
125
126 // Incarnation as seen in MDSMap at the point where a rank is
127 // assigned.
128 int incarnation;
129
130 public:
131 mds_rank_t get_nodeid() const { return whoami; }
132 int64_t get_metadata_pool();
133
134 // Reference to global MDS::mds_lock, so that users of MDSRank don't
135 // carry around references to the outer MDS, and we can substitute
136 // a separate lock here in future potentially.
137 Mutex &mds_lock;
138
139 bool is_daemon_stopping() const;
140
141 // Reference to global cluster log client, just to avoid initialising
142 // a separate one here.
143 LogChannelRef &clog;
144
145 // Reference to global timer utility, because MDSRank and MDSDaemon
146 // currently both use the same mds_lock, so it makes sense for them
147 // to share a timer.
148 SafeTimer &timer;
149
150 MDSMap *&mdsmap;
151
152 Objecter *objecter;
153
154 // sub systems
155 Server *server;
156 MDCache *mdcache;
157 Locker *locker;
158 MDLog *mdlog;
159 MDBalancer *balancer;
160 ScrubStack *scrubstack;
161 DamageTable damage_table;
162
163
164 InoTable *inotable;
165
166 SnapServer *snapserver;
167 SnapClient *snapclient;
168
169 MDSTableClient *get_table_client(int t);
170 MDSTableServer *get_table_server(int t);
171
172 SessionMap sessionmap;
173 Session *get_session(client_t client) {
174 return sessionmap.get_session(entity_name_t::CLIENT(client.v));
175 }
176
177 PerfCounters *logger, *mlogger;
178 OpTracker op_tracker;
179
180 // The last different state I held before current
181 MDSMap::DaemonState last_state;
182 // The state assigned to me by the MDSMap
183 MDSMap::DaemonState state;
184
185 bool cluster_degraded;
186
187 MDSMap::DaemonState get_state() const { return state; }
188 MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); }
189
190 bool is_creating() const { return state == MDSMap::STATE_CREATING; }
191 bool is_starting() const { return state == MDSMap::STATE_STARTING; }
192 bool is_standby() const { return state == MDSMap::STATE_STANDBY; }
193 bool is_replay() const { return state == MDSMap::STATE_REPLAY; }
194 bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; }
195 bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; }
196 bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; }
197 bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; }
198 bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; }
199 bool is_active() const { return state == MDSMap::STATE_ACTIVE; }
200 bool is_stopping() const { return state == MDSMap::STATE_STOPPING; }
201 bool is_any_replay() const { return (is_replay() || is_standby_replay()); }
202 bool is_stopped() const { return mdsmap->is_stopped(whoami); }
203 bool is_cluster_degraded() const { return cluster_degraded; }
204
205 void handle_write_error(int err);
206
207 void handle_conf_change(const struct md_config_t *conf,
208 const std::set <std::string> &changed)
209 {
210 purge_queue.handle_conf_change(conf, changed, *mdsmap);
211 }
212
213 protected:
214 // Flag to indicate we entered shutdown: anyone seeing this to be true
215 // after taking mds_lock must drop out.
216 bool stopping;
217
218 // PurgeQueue is only used by StrayManager, but it is owned by MDSRank
219 // because its init/shutdown happens at the top level.
220 PurgeQueue purge_queue;
221
222 class ProgressThread : public Thread {
223 MDSRank *mds;
224 Cond cond;
225 public:
226 explicit ProgressThread(MDSRank *mds_) : mds(mds_) {}
227 void * entry() override;
228 void shutdown();
229 void signal() {cond.Signal();}
230 } progress_thread;
231
232 list<Message*> waiting_for_nolaggy;
233 list<MDSInternalContextBase*> finished_queue;
234 // Dispatch, retry, queues
235 int dispatch_depth;
236 void inc_dispatch_depth() { ++dispatch_depth; }
237 void dec_dispatch_depth() { --dispatch_depth; }
238 void retry_dispatch(Message *m);
239 bool handle_deferrable_message(Message *m);
240 void _advance_queues();
241 bool _dispatch(Message *m, bool new_msg);
242
243 ceph::heartbeat_handle_d *hb; // Heartbeat for threads using mds_lock
244
245 bool is_stale_message(Message *m) const;
246
247 map<mds_rank_t, version_t> peer_mdsmap_epoch;
248
249 ceph_tid_t last_tid; // for mds-initiated requests (e.g. stray rename)
250
251 list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
252 list<MDSInternalContextBase*> replay_queue;
253 map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer;
254 map<epoch_t, list<MDSInternalContextBase*> > waiting_for_mdsmap;
255
256 epoch_t osd_epoch_barrier;
257
258 // Const reference to the beacon so that we can behave differently
259 // when it's laggy.
260 Beacon &beacon;
261
262 /**
263 * Emit clog warnings for any ops reported as warnings by optracker
264 */
265 void check_ops_in_flight();
266
267 int mds_slow_req_count;
268
269 /**
270 * Share MDSMap with clients
271 */
272 void bcast_mds_map(); // to mounted clients
273 epoch_t last_client_mdsmap_bcast;
274
275 map<mds_rank_t,DecayCounter> export_targets; /* targets this MDS is exporting to or wants/tries to */
276
277 void create_logger();
278 public:
279
280 void queue_waiter(MDSInternalContextBase *c) {
281 finished_queue.push_back(c);
282 progress_thread.signal();
283 }
284 void queue_waiters(list<MDSInternalContextBase*>& ls) {
285 finished_queue.splice( finished_queue.end(), ls );
286 progress_thread.signal();
287 }
288
289 MDSRank(
290 mds_rank_t whoami_,
291 Mutex &mds_lock_,
292 LogChannelRef &clog_,
293 SafeTimer &timer_,
294 Beacon &beacon_,
295 MDSMap *& mdsmap_,
296 Messenger *msgr,
297 MonClient *monc_,
298 Context *respawn_hook_,
299 Context *suicide_hook_);
300
301 protected:
302 ~MDSRank();
303
304 public:
305
306 // Daemon lifetime functions: these guys break the abstraction
307 // and call up into the parent MDSDaemon instance. It's kind
308 // of unavoidable: if we want any depth into our calls
309 // to be able to e.g. tear down the whole process, we have to
310 // have a reference going all the way down.
311 // >>>
312 void suicide();
313 void respawn();
314 // <<<
315
316 /**
317 * Call this periodically if inside a potentially long running piece
318 * of code while holding the mds_lock
319 */
320 void heartbeat_reset();
321
322 /**
323 * Report state DAMAGED to the mon, and then pass on to respawn(). Call
324 * this when an unrecoverable error is encountered while attempting
325 * to load an MDS rank's data structures. This is *not* for use with
326 * errors affecting normal dirfrag/inode objects -- they should be handled
327 * through cleaner scrub/repair mechanisms.
328 *
329 * Callers must already hold mds_lock.
330 */
331 void damaged();
332
333 /**
334 * Wrapper around `damaged` for users who are not
335 * already holding mds_lock.
336 *
337 * Callers must not already hold mds_lock.
338 */
339 void damaged_unlocked();
340
341 utime_t get_laggy_until() const;
342
343 void send_message_mds(Message *m, mds_rank_t mds);
344 void forward_message_mds(Message *req, mds_rank_t mds);
345
346 void send_message_client_counted(Message *m, client_t client);
347 void send_message_client_counted(Message *m, Session *session);
348 void send_message_client_counted(Message *m, Connection *connection);
349 void send_message_client_counted(Message *m, const ConnectionRef& con) {
350 send_message_client_counted(m, con.get());
351 }
352 void send_message_client(Message *m, Session *session);
353 void send_message(Message *m, Connection *c);
354 void send_message(Message *m, const ConnectionRef& c) {
355 send_message(m, c.get());
356 }
357
358 void wait_for_active_peer(mds_rank_t who, MDSInternalContextBase *c) {
359 waiting_for_active_peer[who].push_back(c);
360 }
361 void wait_for_cluster_recovered(MDSInternalContextBase *c) {
362 assert(cluster_degraded);
363 waiting_for_active_peer[MDS_RANK_NONE].push_back(c);
364 }
365
366 void wait_for_active(MDSInternalContextBase *c) {
367 waiting_for_active.push_back(c);
368 }
369 void wait_for_replay(MDSInternalContextBase *c) {
370 waiting_for_replay.push_back(c);
371 }
372 void wait_for_reconnect(MDSInternalContextBase *c) {
373 waiting_for_reconnect.push_back(c);
374 }
375 void wait_for_resolve(MDSInternalContextBase *c) {
376 waiting_for_resolve.push_back(c);
377 }
378 void wait_for_mdsmap(epoch_t e, MDSInternalContextBase *c) {
379 waiting_for_mdsmap[e].push_back(c);
380 }
381 void enqueue_replay(MDSInternalContextBase *c) {
382 replay_queue.push_back(c);
383 }
384
385 bool queue_one_replay();
386
387 void set_osd_epoch_barrier(epoch_t e);
388 epoch_t get_osd_epoch_barrier() const {return osd_epoch_barrier;}
389 epoch_t get_osd_epoch() const;
390
391 ceph_tid_t issue_tid() { return ++last_tid; }
392
393 Finisher *finisher;
394
395 MDSMap *get_mds_map() { return mdsmap; }
396
397 int get_req_rate() const { return logger->get(l_mds_request); }
398
399 int get_mds_slow_req_count() const { return mds_slow_req_count; }
400
401 void dump_status(Formatter *f) const;
402
403 void hit_export_target(utime_t now, mds_rank_t rank, double amount=-1.0);
404 bool is_export_target(mds_rank_t rank) {
405 const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
406 return map_targets.count(rank);
407 }
408
31f18b77
FG
409 bool evict_client(int64_t session_id, bool wait, bool blacklist,
410 std::stringstream& ss, Context *on_killed=nullptr);
411
7c673cae
FG
412 protected:
413 void dump_clientreplay_status(Formatter *f) const;
414 void command_scrub_path(Formatter *f, const string& path, vector<string>& scrubop_vec);
415 void command_tag_path(Formatter *f, const string& path,
416 const string &tag);
417 void command_flush_path(Formatter *f, const string& path);
418 void command_flush_journal(Formatter *f);
419 void command_get_subtrees(Formatter *f);
420 void command_export_dir(Formatter *f,
421 const std::string &path, mds_rank_t dest);
422 bool command_dirfrag_split(
423 cmdmap_t cmdmap,
424 std::ostream &ss);
425 bool command_dirfrag_merge(
426 cmdmap_t cmdmap,
427 std::ostream &ss);
428 bool command_dirfrag_ls(
429 cmdmap_t cmdmap,
430 std::ostream &ss,
431 Formatter *f);
432 int _command_export_dir(const std::string &path, mds_rank_t dest);
433 int _command_flush_journal(std::stringstream *ss);
434 CDir *_command_dirfrag_get(
435 const cmdmap_t &cmdmap,
436 std::ostream &ss);
437
438 protected:
439 Messenger *messenger;
440 MonClient *monc;
441
442 Context *respawn_hook;
443 Context *suicide_hook;
444
445 // Friended to access retry_dispatch
446 friend class C_MDS_RetryMessage;
447
448 // FIXME the state machine logic should be separable from the dispatch
449 // logic that calls it.
450 // >>>
451 void calc_recovery_set();
452 void request_state(MDSMap::DaemonState s);
453
454 bool standby_replaying; // true if current replay pass is in standby-replay mode
455
456 typedef enum {
457 // The MDSMap is available, configure default layouts and structures
458 MDS_BOOT_INITIAL = 0,
459 // We are ready to open some inodes
460 MDS_BOOT_OPEN_ROOT,
461 // We are ready to do a replay if needed
462 MDS_BOOT_PREPARE_LOG,
463 // Replay is complete
464 MDS_BOOT_REPLAY_DONE
465 } BootStep;
466 friend class C_MDS_BootStart;
467 friend class C_MDS_InternalBootStart;
468 void boot_create(); // i am new mds.
469 void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0); // starting|replay
470
471 void replay_start();
472 void creating_done();
473 void starting_done();
474 void replay_done();
475 void standby_replay_restart();
476 void _standby_replay_restart_finish(int r, uint64_t old_read_pos);
477 class C_MDS_StandbyReplayRestart;
478 class C_MDS_StandbyReplayRestartFinish;
479
480 void reopen_log();
481
482 void resolve_start();
483 void resolve_done();
484 void reconnect_start();
485 void reconnect_done();
486 void rejoin_joint_start();
487 void rejoin_start();
488 void rejoin_done();
489 void recovery_done(int oldstate);
490 void clientreplay_start();
491 void clientreplay_done();
492 void active_start();
493 void stopping_start();
494 void stopping_done();
495
496 void validate_sessions();
497 // <<<
498
499 // >>>
500 void handle_mds_recovery(mds_rank_t who);
501 void handle_mds_failure(mds_rank_t who);
502 // <<<
503
504 /* Update MDSMap export_targets for this rank. Called on ::tick(). */
505 void update_targets(utime_t now);
506};
507
508/* This expects to be given a reference which it is responsible for.
509 * The finish function calls functions which
510 * will put the Message exactly once.*/
511class C_MDS_RetryMessage : public MDSInternalContext {
512protected:
513 Message *m;
514public:
515 C_MDS_RetryMessage(MDSRank *mds, Message *m)
516 : MDSInternalContext(mds)
517 {
518 assert(m);
519 this->m = m;
520 }
521 void finish(int r) override {
522 mds->retry_dispatch(m);
523 }
524};
525
526/**
527 * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e.
528 * the service/dispatcher stuff like init/shutdown that subsystems should
529 * never touch.
530 */
531class MDSRankDispatcher : public MDSRank
532{
533public:
534 void init();
535 void tick();
536 void shutdown();
537 bool handle_asok_command(std::string command, cmdmap_t& cmdmap,
538 Formatter *f, std::ostream& ss);
539 void handle_mds_map(MMDSMap *m, MDSMap *oldmap);
540 void handle_osd_map();
7c673cae
FG
541 void update_log_config();
542
543 bool handle_command(
544 const cmdmap_t &cmdmap,
545 MCommand *m,
546 int *r,
547 std::stringstream *ds,
548 std::stringstream *ss,
549 bool *need_reply);
550
551 void dump_sessions(const SessionFilter &filter, Formatter *f) const;
31f18b77 552 void evict_clients(const SessionFilter &filter, MCommand *m);
7c673cae
FG
553
554 // Call into me from MDS::ms_dispatch
555 bool ms_dispatch(Message *m);
556
557 MDSRankDispatcher(
558 mds_rank_t whoami_,
559 Mutex &mds_lock_,
560 LogChannelRef &clog_,
561 SafeTimer &timer_,
562 Beacon &beacon_,
563 MDSMap *& mdsmap_,
564 Messenger *msgr,
565 MonClient *monc_,
566 Context *respawn_hook_,
567 Context *suicide_hook_);
568};
569
// This utility for MDS and MDSRank dispatchers.  Expands in a dispatch
// method where `m` (a Message*) is in scope: if the sender's peer type
// is not in `peers`, the message is logged, put(), and the enclosing
// function returns true (i.e. "handled").
#define ALLOW_MESSAGES_FROM(peers) \
do { \
  if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
    dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \
           << " allowing=" << #peers << " message=" << *m << dendl; \
    m->put(); \
    return true; \
  } \
} while (0)
580
581#endif // MDS_RANK_H_
582