ceph/src/mds/MDSRank.h

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2015 Red Hat
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15 #ifndef MDS_RANK_H_
  16 #define MDS_RANK_H_
  17
  18 #include "common/DecayCounter.h"
  19 #include "common/LogClient.h"
  20 #include "common/Timer.h"
  21 #include "common/TrackedOp.h"
  22
  23 #include "messages/MCommand.h"
  24
  25 #include "Beacon.h"
  26 #include "DamageTable.h"
  27 #include "MDSMap.h"
  28 #include "SessionMap.h"
  29 #include "MDCache.h"
  30 #include "Migrator.h"
  31 #include "MDLog.h"
  32 #include "PurgeQueue.h"
  33 #include "osdc/Journaler.h"
  34
// Full header #include instead of a forward declaration for PerfCounters,
// for the benefit of those including this header and using MDSRank::logger
  37 #include "common/perf_counters.h"
  38
  39 enum {
  40   l_mds_first = 2000,
  41   l_mds_request,
  42   l_mds_reply,
  43   l_mds_reply_latency,
  44   l_mds_forward,
  45   l_mds_dir_fetch,
  46   l_mds_dir_commit,
  47   l_mds_dir_split,
  48   l_mds_dir_merge,
  49   l_mds_inode_max,
  50   l_mds_inodes,
  51   l_mds_inodes_top,
  52   l_mds_inodes_bottom,
  53   l_mds_inodes_pin_tail,
  54   l_mds_inodes_pinned,
  55   l_mds_inodes_expired,
  56   l_mds_inodes_with_caps,
  57   l_mds_caps,
  58   l_mds_subtrees,
  59   l_mds_traverse,
  60   l_mds_traverse_hit,
  61   l_mds_traverse_forward,
  62   l_mds_traverse_discover,
  63   l_mds_traverse_dir_fetch,
  64   l_mds_traverse_remote_ino,
  65   l_mds_traverse_lock,
  66   l_mds_load_cent,
  67   l_mds_dispatch_queue_len,
  68   l_mds_exported,
  69   l_mds_exported_inodes,
  70   l_mds_imported,
  71   l_mds_imported_inodes,
  72   l_mds_last,
  73 };
  74
  75 // memory utilization
  76 enum {
  77   l_mdm_first = 2500,
  78   l_mdm_ino,
  79   l_mdm_inoa,
  80   l_mdm_inos,
  81   l_mdm_dir,
  82   l_mdm_dira,
  83   l_mdm_dirs,
  84   l_mdm_dn,
  85   l_mdm_dna,
  86   l_mdm_dns,
  87   l_mdm_cap,
  88   l_mdm_capa,
  89   l_mdm_caps,
  90   l_mdm_rss,
  91   l_mdm_heap,
  92   l_mdm_buf,
  93   l_mdm_last,
  94 };
  95
  96 namespace ceph {
  97   struct heartbeat_handle_d;
  98 }
  99
 100 class Server;
 101 class Locker;
 102 class MDCache;
 103 class MDLog;
 104 class MDBalancer;
 105 class InoTable;
 106 class SnapServer;
 107 class SnapClient;
 108 class MDSTableServer;
 109 class MDSTableClient;
 110 class Messenger;
 111 class Objecter;
 112 class MonClient;
 113 class Finisher;
 114 class MMDSMap;
 115 class ScrubStack;
 116
 117 /**
 118  * The public part of this class's interface is what's exposed to all
 119  * the various subsystems (server, mdcache, etc), such as pointers
 120  * to the other subsystems, and message-sending calls.
 121  */
 122 class MDSRank {
 123   protected:
 124     const mds_rank_t whoami;
 125
 126     // Incarnation as seen in MDSMap at the point where a rank is
 127     // assigned.
 128     int incarnation;
 129
 130   public:
 131     mds_rank_t get_nodeid() const { return whoami; }
 132     int64_t get_metadata_pool();
 133
 134     // Reference to global MDS::mds_lock, so that users of MDSRank don't
 135     // carry around references to the outer MDS, and we can substitute
 136     // a separate lock here in future potentially.
 137     Mutex &mds_lock;
 138
 139     class CephContext *cct;
 140
 141     bool is_daemon_stopping() const;
 142
 143     // Reference to global cluster log client, just to avoid initialising
 144     // a separate one here.
 145     LogChannelRef &clog;
 146
 147     // Reference to global timer utility, because MDSRank and MDSDaemon
 148     // currently both use the same mds_lock, so it makes sense for them
 149     // to share a timer.
 150     SafeTimer &timer;
 151
 152     MDSMap *&mdsmap;
 153
 154     Objecter     *objecter;
 155
 156     // sub systems
 157     Server       *server;
 158     MDCache      *mdcache;
 159     Locker       *locker;
 160     MDLog        *mdlog;
 161     MDBalancer   *balancer;
 162     ScrubStack   *scrubstack;
 163     DamageTable  damage_table;
 164
 165
 166     InoTable     *inotable;
 167
 168     SnapServer   *snapserver;
 169     SnapClient   *snapclient;
 170
 171     MDSTableClient *get_table_client(int t);
 172     MDSTableServer *get_table_server(int t);
 173
 174     SessionMap   sessionmap;
 175     Session *get_session(client_t client) {
 176       return sessionmap.get_session(entity_name_t::CLIENT(client.v));
 177     }
 178
 179     PerfCounters       *logger, *mlogger;
 180     OpTracker    op_tracker;
 181
 182     // The last different state I held before current
 183     MDSMap::DaemonState last_state;
 184     // The state assigned to me by the MDSMap
 185     MDSMap::DaemonState state;
 186
 187     bool cluster_degraded;
 188
 189     MDSMap::DaemonState get_state() const { return state; }
 190     MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); }
 191
 192     bool is_creating() const { return state == MDSMap::STATE_CREATING; }
 193     bool is_starting() const { return state == MDSMap::STATE_STARTING; }
 194     bool is_standby() const { return state == MDSMap::STATE_STANDBY; }
 195     bool is_replay() const { return state == MDSMap::STATE_REPLAY; }
 196     bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; }
 197     bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; }
 198     bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; }
 199     bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; }
 200     bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; }
 201     bool is_active() const { return state == MDSMap::STATE_ACTIVE; }
 202     bool is_stopping() const { return state == MDSMap::STATE_STOPPING; }
 203     bool is_any_replay() const { return (is_replay() || is_standby_replay()); }
 204     bool is_stopped() const { return mdsmap->is_stopped(whoami); }
 205     bool is_cluster_degraded() const { return cluster_degraded; }
 206
 207     void handle_write_error(int err);
 208
 209     void handle_conf_change(const struct md_config_t *conf,
 210                             const std::set <std::string> &changed)
 211     {
 212       purge_queue.handle_conf_change(conf, changed, *mdsmap);
 213     }
 214
 215     void update_mlogger();
 216   protected:
 217     // Flag to indicate we entered shutdown: anyone seeing this to be true
 218     // after taking mds_lock must drop out.
 219     bool stopping;
 220
 221     // PurgeQueue is only used by StrayManager, but it is owned by MDSRank
 222     // because its init/shutdown happens at the top level.
 223     PurgeQueue   purge_queue;
 224
 225     class ProgressThread : public Thread {
 226       MDSRank *mds;
 227       Cond cond;
 228       public:
 229       explicit ProgressThread(MDSRank *mds_) : mds(mds_) {}
 230       void * entry() override;
 231       void shutdown();
 232       void signal() {cond.Signal();}
 233     } progress_thread;
 234
 235     list<Message*> waiting_for_nolaggy;
 236     list<MDSInternalContextBase*> finished_queue;
 237     // Dispatch, retry, queues
 238     int dispatch_depth;
 239     void inc_dispatch_depth() { ++dispatch_depth; }
 240     void dec_dispatch_depth() { --dispatch_depth; }
 241     void retry_dispatch(Message *m);
 242     bool handle_deferrable_message(Message *m);
 243     void _advance_queues();
 244     bool _dispatch(Message *m, bool new_msg);
 245
 246     ceph::heartbeat_handle_d *hb;  // Heartbeat for threads using mds_lock
 247
 248     bool is_stale_message(Message *m) const;
 249
 250     map<mds_rank_t, version_t> peer_mdsmap_epoch;
 251
 252     ceph_tid_t last_tid;    // for mds-initiated requests (e.g. stray rename)
 253
 254     list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
 255     list<MDSInternalContextBase*> replay_queue;
 256     map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer;
 257     map<epoch_t, list<MDSInternalContextBase*> > waiting_for_mdsmap;
 258
 259     epoch_t osd_epoch_barrier;
 260
 261     // Const reference to the beacon so that we can behave differently
 262     // when it's laggy.
 263     Beacon &beacon;
 264
 265     /**
 266      * Emit clog warnings for any ops reported as warnings by optracker
 267      */
 268     void check_ops_in_flight();
 269
 270     int mds_slow_req_count;
 271
 272     /**
 273      * Share MDSMap with clients
 274      */
 275     void bcast_mds_map();  // to mounted clients
 276     epoch_t      last_client_mdsmap_bcast;
 277
 278     map<mds_rank_t,DecayCounter> export_targets; /* targets this MDS is exporting to or wants/tries to */
 279
 280     void create_logger();
 281   public:
 282
 283     void queue_waiter(MDSInternalContextBase *c) {
 284       finished_queue.push_back(c);
 285       progress_thread.signal();
 286     }
 287     void queue_waiters(list<MDSInternalContextBase*>& ls) {
 288       finished_queue.splice( finished_queue.end(), ls );
 289       progress_thread.signal();
 290     }
 291
 292     MDSRank(
 293         mds_rank_t whoami_,
 294         Mutex &mds_lock_,
 295         LogChannelRef &clog_,
 296         SafeTimer &timer_,
 297         Beacon &beacon_,
 298         MDSMap *& mdsmap_,
 299         Messenger *msgr,
 300         MonClient *monc_,
 301         Context *respawn_hook_,
 302         Context *suicide_hook_);
 303
 304   protected:
 305     ~MDSRank();
 306
 307   public:
 308
 309     // Daemon lifetime functions: these guys break the abstraction
 310     // and call up into the parent MDSDaemon instance.  It's kind
 311     // of unavoidable: if we want any depth into our calls
 312     // to be able to e.g. tear down the whole process, we have to
 313     // have a reference going all the way down.
 314     // >>>
 315     void suicide();
 316     void respawn();
 317     // <<<
 318
 319     /**
 320      * Call this periodically if inside a potentially long running piece
 321      * of code while holding the mds_lock
 322      */
 323     void heartbeat_reset();
 324
 325     /**
 326      * Report state DAMAGED to the mon, and then pass on to respawn().  Call
 327      * this when an unrecoverable error is encountered while attempting
 328      * to load an MDS rank's data structures.  This is *not* for use with
 329      * errors affecting normal dirfrag/inode objects -- they should be handled
 330      * through cleaner scrub/repair mechanisms.
 331      *
 332      * Callers must already hold mds_lock.
 333      */
 334     void damaged();
 335
 336     /**
 337      * Wrapper around `damaged` for users who are not
 338      * already holding mds_lock.
 339      *
 340      * Callers must not already hold mds_lock.
 341      */
 342     void damaged_unlocked();
 343
 344     utime_t get_laggy_until() const;
 345
 346     void send_message_mds(Message *m, mds_rank_t mds);
 347     void forward_message_mds(Message *req, mds_rank_t mds);
 348
 349     void send_message_client_counted(Message *m, client_t client);
 350     void send_message_client_counted(Message *m, Session *session);
 351     void send_message_client_counted(Message *m, Connection *connection);
 352     void send_message_client_counted(Message *m, const ConnectionRef& con) {
 353       send_message_client_counted(m, con.get());
 354     }
 355     void send_message_client(Message *m, Session *session);
 356     void send_message(Message *m, Connection *c);
 357     void send_message(Message *m, const ConnectionRef& c) {
 358       send_message(m, c.get());
 359     }
 360
 361     void wait_for_active_peer(mds_rank_t who, MDSInternalContextBase *c) {
 362       waiting_for_active_peer[who].push_back(c);
 363     }
 364     void wait_for_cluster_recovered(MDSInternalContextBase *c) {
 365       assert(cluster_degraded);
 366       waiting_for_active_peer[MDS_RANK_NONE].push_back(c);
 367     }
 368
 369     void wait_for_active(MDSInternalContextBase *c) {
 370       waiting_for_active.push_back(c);
 371     }
 372     void wait_for_replay(MDSInternalContextBase *c) {
 373       waiting_for_replay.push_back(c);
 374     }
 375     void wait_for_reconnect(MDSInternalContextBase *c) {
 376       waiting_for_reconnect.push_back(c);
 377     }
 378     void wait_for_resolve(MDSInternalContextBase *c) {
 379       waiting_for_resolve.push_back(c);
 380     }
 381     void wait_for_mdsmap(epoch_t e, MDSInternalContextBase *c) {
 382       waiting_for_mdsmap[e].push_back(c);
 383     }
 384     void enqueue_replay(MDSInternalContextBase *c) {
 385       replay_queue.push_back(c);
 386     }
 387
 388     bool queue_one_replay();
 389
 390     void set_osd_epoch_barrier(epoch_t e);
 391     epoch_t get_osd_epoch_barrier() const {return osd_epoch_barrier;}
 392     epoch_t get_osd_epoch() const;
 393
 394     ceph_tid_t issue_tid() { return ++last_tid; }
 395
 396     Finisher     *finisher;
 397
 398     MDSMap *get_mds_map() { return mdsmap; }
 399
 400     int get_req_rate() const { return logger->get(l_mds_request); }
 401
 402     int get_mds_slow_req_count() const { return mds_slow_req_count; }
 403
 404     void dump_status(Formatter *f) const;
 405
 406     void hit_export_target(utime_t now, mds_rank_t rank, double amount=-1.0);
 407     bool is_export_target(mds_rank_t rank) {
 408       const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets;
 409       return map_targets.count(rank);
 410     }
 411
 412     bool evict_client(int64_t session_id, bool wait, bool blacklist,
 413                       std::stringstream& ss, Context *on_killed=nullptr);
 414
 415   protected:
 416     void dump_clientreplay_status(Formatter *f) const;
 417     void command_scrub_path(Formatter *f, const string& path, vector<string>& scrubop_vec);
 418     void command_tag_path(Formatter *f, const string& path,
 419                           const string &tag);
 420     void command_flush_path(Formatter *f, const string& path);
 421     void command_flush_journal(Formatter *f);
 422     void command_get_subtrees(Formatter *f);
 423     void command_export_dir(Formatter *f,
 424         const std::string &path, mds_rank_t dest);
 425     bool command_dirfrag_split(
 426         cmdmap_t cmdmap,
 427         std::ostream &ss);
 428     bool command_dirfrag_merge(
 429         cmdmap_t cmdmap,
 430         std::ostream &ss);
 431     bool command_dirfrag_ls(
 432         cmdmap_t cmdmap,
 433         std::ostream &ss,
 434         Formatter *f);
 435     int _command_export_dir(const std::string &path, mds_rank_t dest);
 436     int _command_flush_journal(std::stringstream *ss);
 437     CDir *_command_dirfrag_get(
 438         const cmdmap_t &cmdmap,
 439         std::ostream &ss);
 440
 441   protected:
 442     Messenger    *messenger;
 443     MonClient    *monc;
 444
 445     Context *respawn_hook;
 446     Context *suicide_hook;
 447
 448     // Friended to access retry_dispatch
 449     friend class C_MDS_RetryMessage;
 450
 451     // FIXME the state machine logic should be separable from the dispatch
 452     // logic that calls it.
 453     // >>>
 454     void calc_recovery_set();
 455     void request_state(MDSMap::DaemonState s);
 456
 457     bool standby_replaying;  // true if current replay pass is in standby-replay mode
 458
 459     typedef enum {
 460       // The MDSMap is available, configure default layouts and structures
 461       MDS_BOOT_INITIAL = 0,
 462       // We are ready to open some inodes
 463       MDS_BOOT_OPEN_ROOT,
 464       // We are ready to do a replay if needed
 465       MDS_BOOT_PREPARE_LOG,
 466       // Replay is complete
 467       MDS_BOOT_REPLAY_DONE
 468     } BootStep;
 469     friend class C_MDS_BootStart;
 470     friend class C_MDS_InternalBootStart;
 471     void boot_create();             // i am new mds.
 472     void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0);    // starting|replay
 473
 474     void replay_start();
 475     void creating_done();
 476     void starting_done();
 477     void replay_done();
 478     void standby_replay_restart();
 479     void _standby_replay_restart_finish(int r, uint64_t old_read_pos);
 480     class C_MDS_StandbyReplayRestart;
 481     class C_MDS_StandbyReplayRestartFinish;
 482
 483     void reopen_log();
 484
 485     void resolve_start();
 486     void resolve_done();
 487     void reconnect_start();
 488     void reconnect_done();
 489     void rejoin_joint_start();
 490     void rejoin_start();
 491     void rejoin_done();
 492     void recovery_done(int oldstate);
 493     void clientreplay_start();
 494     void clientreplay_done();
 495     void active_start();
 496     void stopping_start();
 497     void stopping_done();
 498
 499     void validate_sessions();
 500     // <<<
 501
 502     // >>>
 503     void handle_mds_recovery(mds_rank_t who);
 504     void handle_mds_failure(mds_rank_t who);
 505     // <<<
 506
 507     /* Update MDSMap export_targets for this rank. Called on ::tick(). */
 508     void update_targets(utime_t now);
 509 };
 510
 511 /* This expects to be given a reference which it is responsible for.
 512  * The finish function calls functions which
 513  * will put the Message exactly once.*/
 514 class C_MDS_RetryMessage : public MDSInternalContext {
 515 protected:
 516   Message *m;
 517 public:
 518   C_MDS_RetryMessage(MDSRank *mds, Message *m)
 519     : MDSInternalContext(mds)
 520   {
 521     assert(m);
 522     this->m = m;
 523   }
 524   void finish(int r) override {
 525     mds->retry_dispatch(m);
 526   }
 527 };
 528
 529 /**
 530  * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e.
 531  * the service/dispatcher stuff like init/shutdown that subsystems should
 532  * never touch.
 533  */
 534 class MDSRankDispatcher : public MDSRank
 535 {
 536 public:
 537   void init();
 538   void tick();
 539   void shutdown();
 540   bool handle_asok_command(std::string command, cmdmap_t& cmdmap,
 541                            Formatter *f, std::ostream& ss);
 542   void handle_mds_map(MMDSMap *m, MDSMap *oldmap);
 543   void handle_osd_map();
 544   void update_log_config();
 545
 546   bool handle_command(
 547     const cmdmap_t &cmdmap,
 548     MCommand *m,
 549     int *r,
 550     std::stringstream *ds,
 551     std::stringstream *ss,
 552     bool *need_reply);
 553
 554   void dump_sessions(const SessionFilter &filter, Formatter *f) const;
 555   void evict_clients(const SessionFilter &filter, MCommand *m);
 556
 557   // Call into me from MDS::ms_dispatch
 558   bool ms_dispatch(Message *m);
 559
 560   MDSRankDispatcher(
 561       mds_rank_t whoami_,
 562       Mutex &mds_lock_,
 563       LogChannelRef &clog_,
 564       SafeTimer &timer_,
 565       Beacon &beacon_,
 566       MDSMap *& mdsmap_,
 567       Messenger *msgr,
 568       MonClient *monc_,
 569       Context *respawn_hook_,
 570       Context *suicide_hook_);
 571 };
 572
 573 // This utility for MDS and MDSRank dispatchers.
 574 #define ALLOW_MESSAGES_FROM(peers) \
 575 do { \
 576   if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
 577     dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \
 578            << " allowing=" << #peers << " message=" << *m << dendl; \
 579     m->put();                                                       \
 580     return true; \
 581   } \
 582 } while (0)
 583
 584 #endif // MDS_RANK_H_
 585