// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2015 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#ifndef MDS_RANK_H_
#define MDS_RANK_H_

#include "common/DecayCounter.h"
#include "common/LogClient.h"
#include "common/Timer.h"
#include "common/TrackedOp.h"

#include "messages/MCommand.h"

#include "Beacon.h"
#include "DamageTable.h"
#include "MDSMap.h"
#include "SessionMap.h"
#include "MDCache.h"
#include "Migrator.h"
#include "MDLog.h"
#include "PurgeQueue.h"
#include "osdc/Journaler.h"

// Full .h import instead of forward declaration for PerfCounter, for the
// benefit of those including this header and using MDSRank::logger
#include "common/perf_counters.h"

enum {
  l_mds_first = 2000,
  l_mds_request,
  l_mds_reply,
  l_mds_reply_latency,
  l_mds_forward,
  l_mds_dir_fetch,
  l_mds_dir_commit,
  l_mds_dir_split,
  l_mds_dir_merge,
  l_mds_inode_max,
  l_mds_inodes,
  l_mds_inodes_top,
  l_mds_inodes_bottom,
  l_mds_inodes_pin_tail,
  l_mds_inodes_pinned,
  l_mds_inodes_expired,
  l_mds_inodes_with_caps,
  l_mds_caps,
  l_mds_subtrees,
  l_mds_traverse,
  l_mds_traverse_hit,
  l_mds_traverse_forward,
  l_mds_traverse_discover,
  l_mds_traverse_dir_fetch,
  l_mds_traverse_remote_ino,
  l_mds_traverse_lock,
  l_mds_load_cent,
  l_mds_dispatch_queue_len,
  l_mds_exported,
  l_mds_exported_inodes,
  l_mds_imported,
  l_mds_imported_inodes,
  l_mds_last,
};

// memory utilization
enum {
  l_mdm_first = 2500,
  l_mdm_ino,
  l_mdm_inoa,
  l_mdm_inos,
  l_mdm_dir,
  l_mdm_dira,
  l_mdm_dirs,
  l_mdm_dn,
  l_mdm_dna,
  l_mdm_dns,
  l_mdm_cap,
  l_mdm_capa,
  l_mdm_caps,
  l_mdm_rss,
  l_mdm_heap,
  l_mdm_buf,
  l_mdm_last,
};
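
// The index ranges above are what gets registered with the PerfCounters
// machinery (see MDSRank::create_logger() in MDSRank.cc for the
// authoritative list). A minimal sketch of that registration, for
// orientation only -- counter names/descriptions here are illustrative:
//
//   PerfCountersBuilder plb(g_ceph_context, "mds", l_mds_first, l_mds_last);
//   plb.add_u64_counter(l_mds_request, "request", "Requests");
//   plb.add_time_avg(l_mds_reply_latency, "reply_latency", "Reply latency");
//   PerfCounters *logger = plb.create_perf_counters();
//   g_ceph_context->get_perfcounters_collection()->add(logger);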

namespace ceph {
  struct heartbeat_handle_d;
}

class Server;
class Locker;
class MDCache;
class MDLog;
class MDBalancer;
class InoTable;
class SnapServer;
class SnapClient;
class MDSTableServer;
class MDSTableClient;
class Messenger;
class Objecter;
class MonClient;
class Finisher;
class MMDSMap;
class ScrubStack;

/**
 * The public part of this class's interface is what's exposed to all
 * the various subsystems (server, mdcache, etc), such as pointers
 * to the other subsystems, and message-sending calls.
 */
class MDSRank {
  protected:
    const mds_rank_t whoami;

    // Incarnation as seen in MDSMap at the point where a rank is
    // assigned.
    int incarnation;

  public:
    mds_rank_t get_nodeid() const { return whoami; }
    int64_t get_metadata_pool();

    // Reference to global MDS::mds_lock, so that users of MDSRank don't
    // carry around references to the outer MDS, and we can substitute
    // a separate lock here in future potentially.
    Mutex &mds_lock;

    bool is_daemon_stopping() const;

    // Reference to global cluster log client, just to avoid initialising
    // a separate one here.
    LogChannelRef &clog;

    // Reference to global timer utility, because MDSRank and MDSDaemon
    // currently both use the same mds_lock, so it makes sense for them
    // to share a timer.
    SafeTimer &timer;

    MDSMap *&mdsmap;

    Objecter *objecter;

    // sub systems
    Server *server;
    MDCache *mdcache;
    Locker *locker;
    MDLog *mdlog;
    MDBalancer *balancer;
    ScrubStack *scrubstack;
    DamageTable damage_table;

    InoTable *inotable;

    SnapServer *snapserver;
    SnapClient *snapclient;

    MDSTableClient *get_table_client(int t);
    MDSTableServer *get_table_server(int t);

    SessionMap sessionmap;
    Session *get_session(client_t client) {
      return sessionmap.get_session(entity_name_t::CLIENT(client.v));
    }

    PerfCounters *logger, *mlogger;
    OpTracker op_tracker;

    // The last state I held that differed from my current state
    MDSMap::DaemonState last_state;
    // The state assigned to me by the MDSMap
    MDSMap::DaemonState state;

    bool cluster_degraded;

    MDSMap::DaemonState get_state() const { return state; }
    MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); }

    bool is_creating() const { return state == MDSMap::STATE_CREATING; }
    bool is_starting() const { return state == MDSMap::STATE_STARTING; }
    bool is_standby() const { return state == MDSMap::STATE_STANDBY; }
    bool is_replay() const { return state == MDSMap::STATE_REPLAY; }
    bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; }
    bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; }
    bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; }
    bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; }
    bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; }
    bool is_active() const { return state == MDSMap::STATE_ACTIVE; }
    bool is_stopping() const { return state == MDSMap::STATE_STOPPING; }
    bool is_any_replay() const { return (is_replay() || is_standby_replay()); }
    bool is_stopped() const { return mdsmap->is_stopped(whoami); }
    bool is_cluster_degraded() const { return cluster_degraded; }
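
    // These predicates typically gate dispatch: a handler that cannot make
    // progress yet parks the message and retries later. An illustrative
    // pattern (see C_MDS_RetryMessage below; `m` is an incoming message):
    //
    //   if (!is_active() && !is_stopping()) {
    //     wait_for_active(new C_MDS_RetryMessage(this, m));
    //     return;
    //   }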

    void handle_write_error(int err);

    void handle_conf_change(const struct md_config_t *conf,
                            const std::set<std::string> &changed)
    {
      purge_queue.handle_conf_change(conf, changed, *mdsmap);
    }

  protected:
    // Flag to indicate we entered shutdown: anyone seeing this to be true
    // after taking mds_lock must drop out.
    bool stopping;

    // PurgeQueue is only used by StrayManager, but it is owned by MDSRank
    // because its init/shutdown happens at the top level.
    PurgeQueue purge_queue;

    class ProgressThread : public Thread {
      MDSRank *mds;
      Cond cond;
    public:
      explicit ProgressThread(MDSRank *mds_) : mds(mds_) {}
      void *entry() override;
      void shutdown();
      void signal() { cond.Signal(); }
    } progress_thread;
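
    // ProgressThread::entry() (defined in MDSRank.cc) loops waiting on
    // `cond` and, under mds_lock, calls _advance_queues() to drain
    // finished_queue and any messages deferred while the beacon was laggy.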

    list<Message*> waiting_for_nolaggy;
    list<MDSInternalContextBase*> finished_queue;
    // Dispatch, retry, queues
    int dispatch_depth;
    void inc_dispatch_depth() { ++dispatch_depth; }
    void dec_dispatch_depth() { --dispatch_depth; }
    void retry_dispatch(Message *m);
    bool handle_deferrable_message(Message *m);
    void _advance_queues();
    bool _dispatch(Message *m, bool new_msg);

    ceph::heartbeat_handle_d *hb;  // Heartbeat for threads using mds_lock

    bool is_stale_message(Message *m) const;

    map<mds_rank_t, version_t> peer_mdsmap_epoch;

    ceph_tid_t last_tid;  // for mds-initiated requests (e.g. stray rename)

    list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
    list<MDSInternalContextBase*> replay_queue;
    map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer;
    map<epoch_t, list<MDSInternalContextBase*> > waiting_for_mdsmap;

    epoch_t osd_epoch_barrier;

    // Const reference to the beacon so that we can behave differently
    // when it's laggy.
    Beacon &beacon;

    /**
     * Emit clog warnings for any ops reported as warnings by optracker
     */
    void check_ops_in_flight();

    int mds_slow_req_count;

    /**
     * Share MDSMap with clients
     */
    void bcast_mds_map();  // to mounted clients
    epoch_t last_client_mdsmap_bcast;

    map<mds_rank_t, DecayCounter> export_targets; /* targets this MDS is exporting to or wants/tries to */

    void create_logger();
  public:

    void queue_waiter(MDSInternalContextBase *c) {
      finished_queue.push_back(c);
      progress_thread.signal();
    }
    void queue_waiters(list<MDSInternalContextBase*> &ls) {
      finished_queue.splice(finished_queue.end(), ls);
      progress_thread.signal();
    }
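
    // Contexts queued here complete later on the progress thread, still
    // serialized under mds_lock. An illustrative hand-off of an entire
    // wait list (queue_waiters() splices the list empty):
    //
    //   list<MDSInternalContextBase*> ls;
    //   ls.swap(waiting_for_active);
    //   queue_waiters(ls);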

    MDSRank(
        mds_rank_t whoami_,
        Mutex &mds_lock_,
        LogChannelRef &clog_,
        SafeTimer &timer_,
        Beacon &beacon_,
        MDSMap *&mdsmap_,
        Messenger *msgr,
        MonClient *monc_,
        Context *respawn_hook_,
        Context *suicide_hook_);

  protected:
    ~MDSRank();

  public:

    // Daemon lifetime functions: these guys break the abstraction
    // and call up into the parent MDSDaemon instance. It's kind of
    // unavoidable: if we want code deep in our call stack to be able
    // to e.g. tear down the whole process, we have to have a reference
    // going all the way down.
    // >>>
    void suicide();
    void respawn();
    // <<<

    /**
     * Call this periodically from inside a potentially long-running piece
     * of code while holding mds_lock.
     */
    void heartbeat_reset();
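
    // Illustrative use inside a long loop run under mds_lock (the loop
    // body and names here are hypothetical):
    //
    //   for (const auto &in : inodes_to_scan) {
    //     scan_one(in);       // potentially long-running work
    //     heartbeat_reset();  // keep the internal heartbeat from timing out
    //   }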

    /**
     * Report state DAMAGED to the mon, and then pass on to respawn(). Call
     * this when an unrecoverable error is encountered while attempting
     * to load an MDS rank's data structures. This is *not* for use with
     * errors affecting normal dirfrag/inode objects -- they should be handled
     * through cleaner scrub/repair mechanisms.
     *
     * Callers must already hold mds_lock.
     */
    void damaged();

    /**
     * Wrapper around `damaged` for users who are not
     * already holding mds_lock.
     *
     * Callers must not already hold mds_lock.
     */
    void damaged_unlocked();

    utime_t get_laggy_until() const;

    void send_message_mds(Message *m, mds_rank_t mds);
    void forward_message_mds(Message *req, mds_rank_t mds);

    void send_message_client_counted(Message *m, client_t client);
    void send_message_client_counted(Message *m, Session *session);
    void send_message_client_counted(Message *m, Connection *connection);
    void send_message_client_counted(Message *m, const ConnectionRef &con) {
      send_message_client_counted(m, con.get());
    }
    void send_message_client(Message *m, Session *session);
    void send_message(Message *m, Connection *c);
    void send_message(Message *m, const ConnectionRef &c) {
      send_message(m, c.get());
    }

    void wait_for_active_peer(mds_rank_t who, MDSInternalContextBase *c) {
      waiting_for_active_peer[who].push_back(c);
    }
    void wait_for_cluster_recovered(MDSInternalContextBase *c) {
      assert(cluster_degraded);
      waiting_for_active_peer[MDS_RANK_NONE].push_back(c);
    }

    void wait_for_active(MDSInternalContextBase *c) {
      waiting_for_active.push_back(c);
    }
    void wait_for_replay(MDSInternalContextBase *c) {
      waiting_for_replay.push_back(c);
    }
    void wait_for_reconnect(MDSInternalContextBase *c) {
      waiting_for_reconnect.push_back(c);
    }
    void wait_for_resolve(MDSInternalContextBase *c) {
      waiting_for_resolve.push_back(c);
    }
    void wait_for_mdsmap(epoch_t e, MDSInternalContextBase *c) {
      waiting_for_mdsmap[e].push_back(c);
    }
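
    // Illustrative epoch gate: defer work until the local MDSMap catches up
    // to a required epoch, then retry (`required_epoch` and `m` are
    // hypothetical locals of the caller):
    //
    //   if (mdsmap->get_epoch() < required_epoch) {
    //     wait_for_mdsmap(required_epoch, new C_MDS_RetryMessage(this, m));
    //     return;
    //   }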
    void enqueue_replay(MDSInternalContextBase *c) {
      replay_queue.push_back(c);
    }

    bool queue_one_replay();

    void set_osd_epoch_barrier(epoch_t e);
    epoch_t get_osd_epoch_barrier() const { return osd_epoch_barrier; }
    epoch_t get_osd_epoch() const;

    ceph_tid_t issue_tid() { return ++last_tid; }

    Finisher *finisher;

    MDSMap *get_mds_map() { return mdsmap; }

    int get_req_rate() const { return logger->get(l_mds_request); }

    int get_mds_slow_req_count() const { return mds_slow_req_count; }

    void dump_status(Formatter *f) const;

    void hit_export_target(utime_t now, mds_rank_t rank, double amount=-1.0);
    bool is_export_target(mds_rank_t rank) {
      const set<mds_rank_t> &map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
      return map_targets.count(rank);
    }

  protected:
    void dump_clientreplay_status(Formatter *f) const;
    void command_scrub_path(Formatter *f, const string &path, vector<string> &scrubop_vec);
    void command_tag_path(Formatter *f, const string &path,
                          const string &tag);
    void command_flush_path(Formatter *f, const string &path);
    void command_flush_journal(Formatter *f);
    void command_get_subtrees(Formatter *f);
    void command_export_dir(Formatter *f,
        const std::string &path, mds_rank_t dest);
    bool command_dirfrag_split(
        cmdmap_t cmdmap,
        std::ostream &ss);
    bool command_dirfrag_merge(
        cmdmap_t cmdmap,
        std::ostream &ss);
    bool command_dirfrag_ls(
        cmdmap_t cmdmap,
        std::ostream &ss,
        Formatter *f);
    int _command_export_dir(const std::string &path, mds_rank_t dest);
    int _command_flush_journal(std::stringstream *ss);
    CDir *_command_dirfrag_get(
        const cmdmap_t &cmdmap,
        std::ostream &ss);

  protected:
    Messenger *messenger;
    MonClient *monc;

    Context *respawn_hook;
    Context *suicide_hook;

    // Friended to access retry_dispatch
    friend class C_MDS_RetryMessage;

    // FIXME the state machine logic should be separable from the dispatch
    // logic that calls it.
    // >>>
    void calc_recovery_set();
    void request_state(MDSMap::DaemonState s);

    bool standby_replaying;  // true if current replay pass is in standby-replay mode

    typedef enum {
      // The MDSMap is available, configure default layouts and structures
      MDS_BOOT_INITIAL = 0,
      // We are ready to open some inodes
      MDS_BOOT_OPEN_ROOT,
      // We are ready to do a replay if needed
      MDS_BOOT_PREPARE_LOG,
      // Replay is complete
      MDS_BOOT_REPLAY_DONE
    } BootStep;
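
    // boot_start() (defined in MDSRank.cc) advances through these steps in
    // order: each asynchronous load completes into a C_MDS_BootStart that
    // re-enters boot_start() with the next BootStep, carrying any error
    // code in `r`.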
    friend class C_MDS_BootStart;
    friend class C_MDS_InternalBootStart;
    void boot_create();  // i am new mds.
    void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0);  // starting|replay

    void replay_start();
    void creating_done();
    void starting_done();
    void replay_done();
    void standby_replay_restart();
    void _standby_replay_restart_finish(int r, uint64_t old_read_pos);
    class C_MDS_StandbyReplayRestart;
    class C_MDS_StandbyReplayRestartFinish;

    void reopen_log();

    void resolve_start();
    void resolve_done();
    void reconnect_start();
    void reconnect_done();
    void rejoin_joint_start();
    void rejoin_start();
    void rejoin_done();
    void recovery_done(int oldstate);
    void clientreplay_start();
    void clientreplay_done();
    void active_start();
    void stopping_start();
    void stopping_done();

    void validate_sessions();
    // <<<

    // >>>
    void handle_mds_recovery(mds_rank_t who);
    void handle_mds_failure(mds_rank_t who);
    // <<<

    /* Update MDSMap export_targets for this rank. Called on ::tick(). */
    void update_targets(utime_t now);
};

/* This context takes ownership of the Message reference it is given;
 * finish() hands the message on to code that will put() it exactly once. */
class C_MDS_RetryMessage : public MDSInternalContext {
  protected:
    Message *m;
  public:
    C_MDS_RetryMessage(MDSRank *mds, Message *m)
      : MDSInternalContext(mds)
    {
      assert(m);
      this->m = m;
    }
    void finish(int r) override {
      mds->retry_dispatch(m);
    }
};
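
// Because the context owns its message reference, a dispatcher that parks a
// message must not put() it afterwards. Illustrative hand-off (`m` is the
// Message* the caller currently holds):
//
//   wait_for_replay(new C_MDS_RetryMessage(this, m));
//   return;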

/**
 * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e.
 * the service/dispatcher stuff like init/shutdown that subsystems should
 * never touch.
 */
class MDSRankDispatcher : public MDSRank
{
  public:
    void init();
    void tick();
    void shutdown();
    bool handle_asok_command(std::string command, cmdmap_t &cmdmap,
                             Formatter *f, std::ostream &ss);
    void handle_mds_map(MMDSMap *m, MDSMap *oldmap);
    void handle_osd_map();
    bool kill_session(int64_t session_id, bool wait, std::stringstream &ss);
    void update_log_config();

    bool handle_command(
        const cmdmap_t &cmdmap,
        MCommand *m,
        int *r,
        std::stringstream *ds,
        std::stringstream *ss,
        bool *need_reply);

    void dump_sessions(const SessionFilter &filter, Formatter *f) const;
    void evict_sessions(const SessionFilter &filter, MCommand *m);

    // Call into me from MDS::ms_dispatch
    bool ms_dispatch(Message *m);

    MDSRankDispatcher(
        mds_rank_t whoami_,
        Mutex &mds_lock_,
        LogChannelRef &clog_,
        SafeTimer &timer_,
        Beacon &beacon_,
        MDSMap *&mdsmap_,
        Messenger *msgr,
        MonClient *monc_,
        Context *respawn_hook_,
        Context *suicide_hook_);
};

// This utility is for MDS and MDSRank dispatchers.
#define ALLOW_MESSAGES_FROM(peers) \
  do { \
    if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
      dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \
              << " allowing=" << #peers << " message=" << *m << dendl; \
      m->put(); \
      return true; \
    } \
  } while (0)
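
// Illustrative use at the top of a dispatch handler (the macro assumes a
// local `Message *m` in scope and a bool-returning function):
//
//   ALLOW_MESSAGES_FROM(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_MDS);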

#endif // MDS_RANK_H_