ceph/src/mon/Monitor.h

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15 /*
  16  * This is the top level monitor. It runs on each machine in the Monitor
  17  * Cluster. The election of a leader for the paxos algorithm only happens
  18  * once per machine via the elector. There is a separate paxos instance (state)
  19  * kept for each of the system components: Object Store Device (OSD) Monitor,
  20  * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor.
  21  */
  22
  23 #ifndef CEPH_MONITOR_H
  24 #define CEPH_MONITOR_H
  25
  26 #include <errno.h>
  27 #include <cmath>
  28
  29 #include "include/types.h"
  30 #include "include/health.h"
  31 #include "msg/Messenger.h"
  32
  33 #include "common/Timer.h"
  34
  35 #include "health_check.h"
  36 #include "MonMap.h"
  37 #include "Elector.h"
  38 #include "Paxos.h"
  39 #include "Session.h"
  40 #include "PGStatService.h"
  41 #include "MonCommand.h"
  42
  43 #include "common/LogClient.h"
  44 #include "auth/cephx/CephxKeyServer.h"
  45 #include "auth/AuthMethodList.h"
  46 #include "auth/KeyRing.h"
  47 #include "messages/MMonCommand.h"
  48 #include "mon/MonitorDBStore.h"
  49 #include "include/memory.h"
  50 #include "mgr/MgrClient.h"
  51
  52 #include "mon/MonOpRequest.h"
  53 #include "common/WorkQueue.h"
  54
  55
  56 #define CEPH_MON_PROTOCOL     13 /* cluster internal */
  57
  58
  59 enum {  // l_cluster_* identifiers; values run consecutively upward from l_cluster_first
     // NOTE(review): presumably the counter ids used with `cluster_logger` -- confirm.
  60   l_cluster_first = 555000,  // lower bound marker of the range
  61   l_cluster_num_mon,
  62   l_cluster_num_mon_quorum,
  63   l_cluster_num_osd,
  64   l_cluster_num_osd_up,
  65   l_cluster_num_osd_in,
  66   l_cluster_osd_epoch,
  67   l_cluster_osd_bytes,
  68   l_cluster_osd_bytes_used,
  69   l_cluster_osd_bytes_avail,
  70   l_cluster_num_pool,
  71   l_cluster_num_pg,
  72   l_cluster_num_pg_active_clean,
  73   l_cluster_num_pg_active,
  74   l_cluster_num_pg_peering,
  75   l_cluster_num_object,
  76   l_cluster_num_object_degraded,
  77   l_cluster_num_object_misplaced,
  78   l_cluster_num_object_unfound,
  79   l_cluster_num_bytes,
  80   l_cluster_num_mds_up,
  81   l_cluster_num_mds_in,
  82   l_cluster_num_mds_failed,
  83   l_cluster_mds_epoch,
  84   l_cluster_last,  // upper bound marker: one past the last identifier above
  85 };
  86
  87 enum {  // l_mon_* identifiers; values run consecutively upward from l_mon_first
     // NOTE(review): presumably the counter ids used with `logger` -- confirm.
  88   l_mon_first = 456000,  // lower bound marker of the range
  89   l_mon_num_sessions,
  90   l_mon_session_add,
  91   l_mon_session_rm,
  92   l_mon_session_trim,
  93   l_mon_num_elections,
  94   l_mon_election_call,
  95   l_mon_election_win,
  96   l_mon_election_lose,
  97   l_mon_last,  // upper bound marker: one past the last identifier above
  98 };
  99
 100 class QuorumService;
 101 class PaxosService;
 102
 103 class PerfCounters;
 104 class AdminSocketHook;
 105
 106 class MMonGetMap;
 107 class MMonGetVersion;
 108 class MMonMetadata;
 109 class MMonSync;
 110 class MMonScrub;
 111 class MMonProbe;
 112 struct MMonSubscribe;
 113 struct MRoute;
 114 struct MForward;
 115 struct MTimeCheck;
 116 struct MMonHealth;
 117
 118 #define COMPAT_SET_LOC "feature_set"
 119
 120 class C_MonContext final : public FunctionContext {
       // FunctionContext specialization that additionally remembers the Monitor
       // it was created for.  The callback is forwarded (moved) to the
       // FunctionContext base; finish() is only declared here, so how `mon` is
       // consulted is decided by its out-of-line definition.
 121   const Monitor *mon;  ///< monitor this context was created for (const access only)
 122 public:
 123   explicit C_MonContext(Monitor *m, boost::function<void(int)>&& callback)
 124     : FunctionContext(std::move(callback)), mon(m) {}
 125   void finish(int r) override;
 126 };
 127
 128 class Monitor : public Dispatcher,
 129                 public md_config_obs_t {
 130 public:
 131   // me
 132   string name;
 133   int rank;
 134   Messenger *messenger;
 135   ConnectionRef con_self;
 136   Mutex lock;
 137   SafeTimer timer;
 138   Finisher finisher;
 139   ThreadPool cpu_tp;  ///< threadpool for CPU intensive work
 140
 141   /// true if we have ever joined a quorum.  if false, we are either a
 142   /// new cluster, a newly joining monitor, or a just-upgraded
 143   /// monitor.
 144   bool has_ever_joined;
 145
 146   PerfCounters *logger, *cluster_logger;
 147   bool cluster_logger_registered;
 148
 149   void register_cluster_logger();
 150   void unregister_cluster_logger();
 151
 152   MonMap *monmap;
 153   uuid_d fingerprint;
 154
 155   set<entity_addr_t> extra_probe_peers;
 156
 157   LogClient log_client;
 158   LogChannelRef clog;
 159   LogChannelRef audit_clog;
 160   KeyRing keyring;
 161   KeyServer key_server;
 162
 163   AuthMethodList auth_cluster_required;
 164   AuthMethodList auth_service_required;
 165
 166   CompatSet features;
 167
 168   vector<MonCommand> leader_mon_commands; // quorum leader's commands
 169   vector<MonCommand> local_mon_commands;  // commands i support
 170   bufferlist local_mon_commands_bl;       // encoded version of above
 171
 172   // for upgrading mon cluster that still uses PGMonitor
 173   vector<MonCommand> local_upgrading_mon_commands;  // mixed mon cluster commands
 174   bufferlist local_upgrading_mon_commands_bl;       // encoded version of above
 175
 176   Messenger *mgr_messenger;
 177   MgrClient mgr_client;
 178   uint64_t mgr_proxy_bytes = 0;  // in-flight proxied mgr command message bytes
 179
 180   const MonPGStatService *pgservice;
 181
 182 private:
 183   void new_tick();
 184
 185   // -- local storage --
 186 public:
 187   MonitorDBStore *store;
 188   static const string MONITOR_NAME;
 189   static const string MONITOR_STORE_PREFIX;
 190
 191   // -- monitor state --
 192 private:
 193   enum {  // values stored in `state`; numbered from 1 in declaration order
 194     STATE_PROBING = 1,
 195     STATE_SYNCHRONIZING,
 196     STATE_ELECTING,
 197     STATE_LEADER,
 198     STATE_PEON,
 199     STATE_SHUTDOWN
 200   };
 201   int state;
 202
 203 public:
 204   static const char *get_state_name(int s) {
         // Translate a STATE_* value into its printable name.  Any value that is
         // not one of the STATE_* constants keeps the "???" placeholder.
 205     const char *label = "???";
 206     switch (s) {
 207     case STATE_PROBING:       label = "probing";       break;
 208     case STATE_SYNCHRONIZING: label = "synchronizing"; break;
 209     case STATE_ELECTING:      label = "electing";      break;
 210     case STATE_LEADER:        label = "leader";        break;
 211     case STATE_PEON:          label = "peon";          break;
 212     case STATE_SHUTDOWN:      label = "shutdown";      break;
 213     }
 214     return label;
       }
 215   const char *get_state_name() const {
         // Convenience overload: printable name of this monitor's current `state`.
 216     return get_state_name(state);
 217   }
 218
 219   bool is_shutdown() const { return state == STATE_SHUTDOWN; }  // each predicate below compares `state` with exactly one STATE_* value
 220   bool is_probing() const { return state == STATE_PROBING; }
 221   bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; }
 222   bool is_electing() const { return state == STATE_ELECTING; }
 223   bool is_leader() const { return state == STATE_LEADER; }
 224   bool is_peon() const { return state == STATE_PEON; }
 225
 226   const utime_t &get_leader_since() const;
 227
 228   void prepare_new_fingerprint(MonitorDBStore::TransactionRef t);
 229
 230   // -- elector --
 231 private:
 232   Paxos *paxos;
 233   Elector elector;
 234   friend class Elector;
 235
 236   /// features we require of peers (based on on-disk compatset)
 237   uint64_t required_features;
 238
 239   int leader;            // current leader (to best of knowledge)
 240   set<int> quorum;       // current active set of monitors (if !starting)
 241   utime_t leader_since;  // when this monitor became the leader, if it is the leader
 242   utime_t exited_quorum; // time detected as not in quorum; 0 if in
 243
 244   // map of counts of connected clients, by type and features, for
 245   // each quorum mon
 246   map<int,FeatureMap> quorum_feature_map;
 247
 248   /**
 249    * Intersection of quorum members' connection feature bits.
 250    */
 251   uint64_t quorum_con_features;
 252   /**
 253    * Intersection of quorum members' mon-specific feature bits.
 254    */
 255   mon_feature_t quorum_mon_features;
 256
 257   set<string> outside_quorum;
 258
 259   /**
 260    * @defgroup Monitor_h_scrub
 261    * @{
 262    */
 263   version_t scrub_version;            ///< paxos version we are scrubbing
 264   map<int,ScrubResult> scrub_result;  ///< results so far
 265
 266   /**
 267    * trigger a cross-mon scrub
 268    *
 269    * Verify all mons are storing identical content
 270    */
 271   int scrub_start();
 272   int scrub();
 273   void handle_scrub(MonOpRequestRef op);
 274   bool _scrub(ScrubResult *r,
 275               pair<string,string> *start,
 276               int *num_keys);
 277   void scrub_check_results();
 278   void scrub_timeout();
 279   void scrub_finish();
 280   void scrub_reset();
 281   void scrub_update_interval(int secs);
 282
 283   Context *scrub_event;       ///< periodic event to trigger scrub (leader)
 284   Context *scrub_timeout_event;  ///< scrub round timeout (leader)
 285   void scrub_event_start();
 286   void scrub_event_cancel();
 287   void scrub_reset_timeout();
 288   void scrub_cancel_timeout();
 289
 290   struct ScrubState {
         // Progress record for the scrub currently in flight.
 291     pair<string,string> last_key;  ///< most recently scrubbed key
 292     bool finished = false;         ///< completion flag; a new state starts unfinished
 293
 294     ScrubState() { }
 295     virtual ~ScrubState() { }
 296   };
 297   ceph::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
 298
 299   /**
 300    * @defgroup Monitor_h_sync Synchronization
 301    * @{
 302    */
 303   /**
 304    * @} // provider state
 305    */
 306   struct SyncProvider {
         // Book-keeping for one sync attempt served by this monitor.
 307     entity_inst_t entity;                       ///< who is syncing
 308     uint64_t cookie = 0;                        ///< unique cookie for this sync attempt
 309     utime_t timeout;                            ///< when we give up and expire this attempt
 310     version_t last_committed = 0;               ///< last paxos version on peer
 311     pair<string,string> last_key;               ///< last key sent to (or on) peer
 312     bool full = false;                          ///< full scan?
 313     MonitorDBStore::Synchronizer synchronizer;  ///< iterator
 314
 315     SyncProvider() {}
 316
         /// Set `timeout` to the current clock plus `grace`.  `cct` is not used.
 317     void reset_timeout(CephContext *cct, int grace) {
 318       timeout = ceph_clock_now();
 319       timeout += grace;
 320     }
 321   };
 322
 323   map<uint64_t, SyncProvider> sync_providers;  ///< cookie -> SyncProvider for those syncing from us
 324   uint64_t sync_provider_count;   ///< counter for issued cookies to keep them unique
 325
 326   /**
 327    * @} // requester state
 328    */
 329   entity_inst_t sync_provider;   ///< who we are syncing from
 330   uint64_t sync_cookie;          ///< 0 if we are starting, non-zero otherwise
 331   bool sync_full;                ///< true if we are a full sync, false for recent catch-up
 332   version_t sync_start_version;  ///< last_committed at sync start
 333   Context *sync_timeout_event;   ///< timeout event
 334
 335   /**
 336    * floor for sync source
 337    *
 338    * When we sync we forget about our old last_committed value which
 339    * can be dangerous.  For example, if we have a cluster of:
 340    *
 341    *   mon.a: lc 100
 342    *   mon.b: lc 80
 343    *   mon.c: lc 100 (us)
 344    *
 345    * If something forces us to sync (say, corruption, or manual
 346    * intervention, or bug), we forget last_committed, and might abort.
 347    * If mon.a happens to be down when we come back, we will see:
 348    *
 349    *   mon.b: lc 80
 350    *   mon.c: lc 0 (us)
 351    *
 352    * and sync from mon.b, at which point a+b will both have lc 80 and
 353    * come online with a majority holding out of date commits.
 354    *
 355    * Avoid this by preserving our old last_committed value prior to
 356    * sync and never going backwards.
 357    */
 358   version_t sync_last_committed_floor;
 359
 360   /**
 361    * Obtain the synchronization target prefixes in set form.
 362    *
 363    * We consider a target prefix all those that are relevant when
 364    * synchronizing two stores. That is, all those that hold paxos service's
 365    * versions, as well as paxos versions, or any control keys such as the
 366    * first or last committed version.
 367    *
 368    * Given the current design, this function should return the name of all and
 369    * any available paxos service, plus the paxos name.
 370    *
 371    * @returns a set of strings referring to the prefixes being synchronized
 372    */
 373   set<string> get_sync_targets_names();
 374
 375   /**
 376    * Reset the monitor's sync-related data structures for syncing *from* a peer
 377    */
 378   void sync_reset_requester();
 379
 380   /**
 381    * Reset sync state related to allowing others to sync from us
 382    */
 383   void sync_reset_provider();
 384
 385   /**
 386    * Called when a sync attempt times out (requester-side)
 387    */
 388   void sync_timeout();
 389
 390   /**
 391    * Get the latest monmap for backup purposes during sync
 392    */
 393   void sync_obtain_latest_monmap(bufferlist &bl);
 394
 395   /**
 396    * Start sync process
 397    *
 398    * Start pulling committed state from another monitor.
 399    *
 400    * @param entity where to pull committed state from
 401    * @param full whether to do a full sync or just catch up on recent paxos
 402    */
 403   void sync_start(entity_inst_t &entity, bool full);
 404
 405 public:
 406   /**
 407    * force a sync on next mon restart
 408    */
 409   void sync_force(Formatter *f, ostream& ss);
 410
 411 private:
 412   /**
 413    * store critical state for safekeeping during sync
 414    *
 415    * We store a few things on the side that we don't want to get clobbered by sync.  This
 416    * includes the latest monmap and a lower bound on last_committed.
 417    */
 418   void sync_stash_critical_state(MonitorDBStore::TransactionRef tx);
 419
 420   /**
 421    * reset the sync timeout
 422    *
 423    * This is used on the client to restart if things aren't progressing
 424    */
 425   void sync_reset_timeout();
 426
 427   /**
 428    * trim stale sync provider state
 429    *
 430    * If someone is syncing from us and hasn't talked to us recently, expire their state.
 431    */
 432   void sync_trim_providers();
 433
 434   /**
 435    * Complete a sync
 436    *
 437    * Finish up a sync after we've gotten all of the chunks.
 438    *
 439    * @param last_committed final last_committed value from provider
 440    */
 441   void sync_finish(version_t last_committed);
 442
 443   /**
 444    * request the next chunk from the provider
 445    */
 446   void sync_get_next_chunk();
 447
 448   /**
 449    * handle sync message
 450    *
 451    * @param op Request carrying a Sync message with operation type MMonSync::OP_START_CHUNKS
 452    */
 453   void handle_sync(MonOpRequestRef op);
 454
 455   void _sync_reply_no_cookie(MonOpRequestRef op);
 456
 457   void handle_sync_get_cookie(MonOpRequestRef op);
 458   void handle_sync_get_chunk(MonOpRequestRef op);
 459   void handle_sync_finish(MonOpRequestRef op);
 460
 461   void handle_sync_cookie(MonOpRequestRef op);
 462   void handle_sync_forward(MonOpRequestRef op);
 463   void handle_sync_chunk(MonOpRequestRef op);
 464   void handle_sync_no_cookie(MonOpRequestRef op);
 465
 466   /**
 467    * @} // Synchronization
 468    */
 469
 470   list<Context*> waitfor_quorum;
 471   list<Context*> maybe_wait_for_quorum;
 472
 473   /**
 474    * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
 475    * @{
 476    *
 477    * We use time checks to keep track of any clock drifting going on in the
 478    * cluster. This is accomplished by periodically pinging each monitor in the
 479    * quorum and registering its response time on a map, assessing how much its
 480    * clock has drifted. We also take this opportunity to assess the latency
 481    * on response.
 482    *
 483    * This mechanism works as follows:
 484    *
 485    *  - Leader sends out a 'PING' message to each other monitor in the quorum.
 486    *    The message is timestamped with the leader's current time. The leader's
 487    *    current time is recorded in a map, associated with each peon's
 488    *    instance.
 489    *  - The peon replies to the leader with a timestamped 'PONG' message.
 490    *  - The leader calculates a delta between the peon's timestamp and its
 491    *    current time and stashes it.
 492    *  - The leader also calculates the time it took to receive the 'PONG'
 493    *    since the 'PING' was sent, and stashes an approximate latency estimate.
 494    *  - Once all the quorum members have pong'ed, the leader will share the
 495    *    clock skew and latency maps with all the monitors in the quorum.
 496    */
 497   map<entity_inst_t, utime_t> timecheck_waiting;
 498   map<entity_inst_t, double> timecheck_skews;
 499   map<entity_inst_t, double> timecheck_latencies;
 500   // odd value means we are mid-round; even value means the round has
 501   // finished.
 502   version_t timecheck_round;
 503   unsigned int timecheck_acks;
 504   utime_t timecheck_round_start;
 505   friend class HealthMonitor;
 506   /* When we hit a skew we will start a new round based off of
 507    * 'mon_timecheck_skew_interval'. Each new round will be backed off
 508    * until we hit 'mon_timecheck_interval' -- which is the typical
 509    * interval when not in the presence of a skew.
 510    *
 511    * This variable tracks the number of rounds with skews since last clean
 512    * so that we can report to the user and properly adjust the backoff.
 513    */
 514   uint64_t timecheck_rounds_since_clean;
 515   /**
 516    * Time Check event.
 517    */
 518   Context *timecheck_event;
 519
 520   void timecheck_start();
 521   void timecheck_finish();
 522   void timecheck_start_round();
 523   void timecheck_finish_round(bool success = true);
 524   void timecheck_cancel_round();
 525   void timecheck_cleanup();
 526   void timecheck_reset_event();
 527   void timecheck_check_skews();
 528   void timecheck_report();
 529   void timecheck();
 530   health_status_t timecheck_status(ostringstream &ss,
 531                                    const double skew_bound,
 532                                    const double latency);
 533   void handle_timecheck_leader(MonOpRequestRef op);
 534   void handle_timecheck_peon(MonOpRequestRef op);
 535   void handle_timecheck(MonOpRequestRef op);
 536
 537   /**
 538    * Decide whether a clock offset counts as a skew.
        *
        * @param skew_bound offset to test; only its magnitude is considered
        * @param abs if non-null, receives the magnitude of @p skew_bound
        * @returns 'true' if the magnitude exceeds g_conf->mon_clock_drift_allowed;
        *          'false' otherwise.
 539    */
 540   bool timecheck_has_skew(const double skew_bound, double *abs) const {
 541     const double magnitude = std::fabs(skew_bound);
 542     if (abs != nullptr)
 543       *abs = magnitude;
 544     return magnitude > g_conf->mon_clock_drift_allowed;
 545   }
 546
 547   /**
 548    * @}
 549    */
 550   /**
 551    * Handle ping messages from others.
 552    */
 553   void handle_ping(MonOpRequestRef op);
 554
 555   Context *probe_timeout_event = nullptr;  // for probing
 556
 557   void reset_probe_timeout();
 558   void cancel_probe_timeout();
 559   void probe_timeout(int r);
 560
 561   void _apply_compatset_features(CompatSet &new_features);
 562
 563 public:
 564   epoch_t get_epoch();
 565   int get_leader() const { return leader; }  // returns the `leader` member as-is
 566   string get_leader_name() {
         // Name of the first entry in `quorum` (the smallest value, since it is
         // an ordered set), or an empty string when there is no quorum.  Note
         // that the `leader` member is not consulted here.
 567     if (quorum.empty())
 568       return string();
         return monmap->get_name(*quorum.begin());
       }
 569   const set<int>& get_quorum() const { return quorum; }  // reference to the live set, not a copy
 570   list<string> get_quorum_names() {
         // Look up the monmap name of every quorum entry, preserving set order.
 571     list<string> names;
 572     for (const int member : quorum) {
 573       names.push_back(monmap->get_name(member));
 574     }
 575     return names;
       }
 576   uint64_t get_quorum_con_features() const {  // cached quorum_con_features
 577     return quorum_con_features;
 578   }
 579   mon_feature_t get_quorum_mon_features() const {  // cached quorum_mon_features
 580     return quorum_mon_features;
 581   }
 582   uint64_t get_required_features() const {  // cached required_features
 583     return required_features;
 584   }
 585   mon_feature_t get_required_mon_features() const {  // not cached: read from monmap on every call
 586     return monmap->get_required_features();
 587   }
 588   void apply_quorum_to_compatset_features();
 589   void apply_monmap_to_compatset_features();
 590   void calc_quorum_requirements();
 591
 592   void get_combined_feature_map(FeatureMap *fm);
 593
 594 private:
 595   void _reset();   ///< called from bootstrap, start_, or join_election
 596   void wait_for_paxos_write();
 597   void _finish_svc_election(); ///< called by {win,lose}_election
 598 public:
 599   void bootstrap();
 600   void join_election();
 601   void start_election();
 602   void win_standalone_election();
 603   // end election (called by Elector)
 604   void win_election(epoch_t epoch, set<int>& q,
 605                     uint64_t features,
 606                     const mon_feature_t& mon_features,
 607                     const map<int,Metadata>& metadata);
 608   void lose_election(epoch_t epoch, set<int>& q, int l,
 609                      uint64_t features,
 610                      const mon_feature_t& mon_features);
 611   // end election (called by Elector)
 612   void finish_election();
 613
 614   void update_logger();
 615
 616   /**
 617    * Vector holding the Services serviced by this Monitor.
 618    */
 619   vector<PaxosService*> paxos_service;
 620
 621   class PGMonitor *pgmon() {  // paxos_service[PAXOS_PGMAP] viewed as a PGMonitor
         // Every accessor in this group indexes paxos_service directly, with no
         // bounds or null check, and converts the entry with a C-style cast.
         // NOTE(review): presumably the C-style cast is deliberate because the
         // target classes are only named via `class X` here and may be incomplete
         // at this point -- confirm before converting these to static_cast.
 622     return (class PGMonitor *)paxos_service[PAXOS_PGMAP];
 623   }
 624
 625   class MDSMonitor *mdsmon() {  // paxos_service[PAXOS_MDSMAP] viewed as an MDSMonitor
 626     return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP];
 627   }
 628
 629   class MonmapMonitor *monmon() {  // paxos_service[PAXOS_MONMAP] viewed as a MonmapMonitor
 630     return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP];
 631   }
 632
 633   class OSDMonitor *osdmon() {  // paxos_service[PAXOS_OSDMAP] viewed as an OSDMonitor
 634     return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP];
 635   }
 636
 637   class AuthMonitor *authmon() {  // paxos_service[PAXOS_AUTH] viewed as an AuthMonitor
 638     return (class AuthMonitor *)paxos_service[PAXOS_AUTH];
 639   }
 640
 641   class LogMonitor *logmon() {  // paxos_service[PAXOS_LOG] viewed as a LogMonitor
 642     return (class LogMonitor*) paxos_service[PAXOS_LOG];
 643   }
 644
 645   class MgrMonitor *mgrmon() {  // paxos_service[PAXOS_MGR] viewed as a MgrMonitor
 646     return (class MgrMonitor*) paxos_service[PAXOS_MGR];
 647   }
 648
 649   class MgrStatMonitor *mgrstatmon() {  // paxos_service[PAXOS_MGRSTAT] viewed as a MgrStatMonitor
 650     return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT];
 651   }
 652
 653   class HealthMonitor *healthmon() {  // paxos_service[PAXOS_HEALTH] viewed as a HealthMonitor
 654     return (class HealthMonitor*) paxos_service[PAXOS_HEALTH];
 655   }
 656
 657   friend class Paxos;
 658   friend class OSDMonitor;
 659   friend class MDSMonitor;
 660   friend class MonmapMonitor;
 661   friend class PGMonitor;
 662   friend class LogMonitor;
 663   friend class ConfigKeyService;
 664
 665   QuorumService *health_monitor;
 666   QuorumService *config_key_service;
 667
 668   // -- sessions --
 669   MonSessionMap session_map;
 670   Mutex session_map_lock{"Monitor::session_map_lock"};
 671   AdminSocketHook *admin_hook;
 672
 673   template<typename Func, typename...Args>
 674   void with_session_map(Func&& func) {
         // Run `func(session_map)` while session_map_lock is held; the lock is
         // released when the guard goes out of scope.  The Args pack is unused.
 675     Mutex::Locker session_guard(session_map_lock);
 676     std::forward<Func>(func)(session_map);
 677   }
 678   void send_latest_monmap(Connection *con);
 679
 680   // messages
 681   void handle_get_version(MonOpRequestRef op);
 682   void handle_subscribe(MonOpRequestRef op);
 683   void handle_mon_get_map(MonOpRequestRef op);
 684
 685   static void _generate_command_map(map<string,cmd_vartype>& cmdmap,
 686                                     map<string,string> &param_str_map);
 687   static const MonCommand *_get_moncommand(
 688     const string &cmd_prefix,
 689     const vector<MonCommand>& cmds);
 690   bool _allowed_command(MonSession *s, string &module, string &prefix,
 691                         const map<string,cmd_vartype>& cmdmap,
 692                         const map<string,string>& param_str_map,
 693                         const MonCommand *this_cmd);
 694   void get_mon_status(Formatter *f, ostream& ss);
 695   void _quorum_status(Formatter *f, ostream& ss);
 696   bool _add_bootstrap_peer_hint(string cmd, cmdmap_t& cmdmap, ostream& ss);
 697   void handle_command(MonOpRequestRef op);
 698   void handle_route(MonOpRequestRef op);
 699
 700   void handle_mon_metadata(MonOpRequestRef op);
 701   int get_mon_metadata(int mon, Formatter *f, ostream& err);
 702   int print_nodes(Formatter *f, ostream& err);
 703
 704   // Accumulate metadata across calls to update_mon_metadata
 705   map<int, Metadata> mon_metadata;
 706   map<int, Metadata> pending_metadata;
 707
 708   /**
 709    * Cached health state: an overall status plus a summary string.
 710    */
 711   struct health_cache_t {
 712     health_status_t overall;  // no initializer here and reset() leaves it untouched
 713     string summary;
 714
         // Clears only `summary`; see the note inside for why `overall` is kept.
 715     void reset() {
 716       // health_status_t doesn't really have a NONE value and we're not
 717       // okay with setting something else (say, HEALTH_ERR).  so just
 718       // leave it be.
 719       summary.clear();
 720     }
 721   } health_status_cache;
 722
 723   Context *health_tick_event = nullptr;
 724   Context *health_interval_event = nullptr;
 725
 726   void health_tick_start();
 727   void health_tick_stop();
 728   utime_t health_interval_calc_next_update();
 729   void health_interval_start();
 730   void health_interval_stop();
 731   void health_events_cleanup();
 732
 733   void health_to_clog_update_conf(const std::set<std::string> &changed);
 734
 735   void do_health_to_clog_interval();
 736   void do_health_to_clog(bool force = false);
 737
 738   /**
 739    * Generate health report
 740    *
 741    * @param status one-line status summary
 742    * @param detailbl optional bufferlist* to fill with a detailed report
 743    * @returns health status
 744    */
 745   health_status_t get_health(list<string>& status, bufferlist *detailbl,
 746                              Formatter *f);
 747
 748   health_status_t get_health_status(
 749     bool want_detail,
 750     Formatter *f,
 751     std::string *plain,
 752     const char *sep1 = " ",
 753     const char *sep2 = "; ");
 754   void log_health(
 755     const health_check_map_t& updated,
 756     const health_check_map_t& previous,
 757     MonitorDBStore::TransactionRef t);
 758
 759 protected:
 760
 761   class HealthCheckLogStatus {
         // Value record stored per entry of health_check_log_times: a severity,
         // the last message, and a timestamp.  All three are set by the
         // constructor, which is the only one declared.
 762   public:
 763     health_status_t severity;
 764     std::string last_message;
 765     utime_t updated_at = 0;
 766
 767     HealthCheckLogStatus(health_status_t severity_,
 768                          const std::string &last_message_,
 769                          utime_t updated_at_)
 770       : severity(severity_), last_message(last_message_),
 771         updated_at(updated_at_) {}
 772   };
 774   std::map<std::string, HealthCheckLogStatus> health_check_log_times;
 775
 776 public:
 777
 778   void get_cluster_status(stringstream &ss, Formatter *f);
 779
 780   void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version);
 781   void reply_command(MonOpRequestRef op, int rc, const string &rs, bufferlist& rdata, version_t version);
 782
 783
 784   void handle_probe(MonOpRequestRef op);
 785   /**
 786    * Handle a Probe Operation, replying with our name, quorum and known versions.
 787    *
 788    * We use the MMonProbe message class for anything and everything related with
 789    * Monitor probing. One of the operations relates directly with the probing
 790    * itself, in which we receive a probe request and to which we reply with
 791    * our name, our quorum and the known versions for each Paxos service. Thus the
 792    * redundant function name. This reply will obviously be sent to the one
 793    * probing/requesting these infos.
 794    *
 795    * @todo Add @pre and @post
 796    *
 797    * @param op Request carrying a Probe message, with an operation of type Probe.
 798    */
 799   void handle_probe_probe(MonOpRequestRef op);
 800   void handle_probe_reply(MonOpRequestRef op);
 801
 802   // request routing
 803   struct RoutedRequest {
         // State kept for one routed request.  The destructor calls put() on
         // `session` when it is set.
         // NOTE(review): the struct is implicitly copyable, and destroying a copy
         // would call put() a second time on the same session -- presumably it is
         // only ever handled through pointers (routed_requests); confirm.
 804     uint64_t tid = 0;
 805     bufferlist request_bl;
 806     MonSession *session = nullptr;
 807     ConnectionRef con;
 808     uint64_t con_features = 0;
 809     entity_inst_t client_inst;
 810     MonOpRequestRef op;
 811
 812     RoutedRequest() {}
 813     ~RoutedRequest() {
 814       if (session != nullptr)
 815         session->put();
 816     }
 817   };
 818   uint64_t routed_request_tid;
 819   map<uint64_t, RoutedRequest*> routed_requests;
 820
 821   void forward_request_leader(MonOpRequestRef op);
 822   void handle_forward(MonOpRequestRef op);
 823   void try_send_message(Message *m, const entity_inst_t& to);
 824   void send_reply(MonOpRequestRef op, Message *reply);
 825   void no_reply(MonOpRequestRef op);
 826   void resend_routed_requests();
 827   void remove_session(MonSession *s);
 828   void remove_all_sessions();
 829   void waitlist_or_zap_client(MonOpRequestRef op);
 830
 831   void send_command(const entity_inst_t& inst,
 832                     const vector<string>& com);
 833
 834 public:
 835   struct C_Command : public C_MonOp {
         // Completion context that answers a monitor command: when finished with
         // a non-negative code it writes an audit log entry and sends the stored
         // reply (rc, rs, rdata, version) through mon->reply_command().
 836     Monitor *mon;       ///< monitor used to log, reply and re-dispatch
 837     int rc;             ///< return code handed to reply_command()
 838     string rs;          ///< status string handed to reply_command()
 839     bufferlist rdata;   ///< reply payload; default-constructed by the first constructor
 840     version_t version;  ///< version handed to reply_command()
         // `s` and `rd` are by-value sink parameters, so they are moved into the
         // members instead of being copied a second time.
 841     C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, version_t v) :
 842       C_MonOp(_op), mon(_mm), rc(r), rs(std::move(s)), version(v){}
 843     C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, bufferlist rd, version_t v) :
 844       C_MonOp(_op), mon(_mm), rc(r), rs(std::move(s)), rdata(std::move(rd)), version(v){}
 845
         /**
          * Act on the completion code @p r:
          *  - r >= 0: audit-log the command, then send the stored reply
          *  - -ECANCELED: do nothing
          *  - -EAGAIN: hand the op back to Monitor::dispatch_op()
          *  - any other value: assert
          */
 846     void _finish(int r) override {
 847       MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
 848       if (r >= 0) {
 849         ostringstream ss;
 850         if (!op->get_req()->get_connection()) {
 851           ss << "connection dropped for command ";
 852         } else {
 853           MonSession *s = op->get_session();
 854
 855           // if client drops we may not have a session to draw information from.
 856           if (s) {
 857             ss << "from='" << s->inst << "' "
 858               << "entity='" << s->entity_name << "' ";
 859           } else {
 860             ss << "session dropped for command ";
 861           }
 862         }
 863         ss << "cmd='" << m->cmd << "': finished";
 864
 865         mon->audit_clog->info() << ss.str();
 866         mon->reply_command(op, rc, rs, rdata, version);
 867       }
 868       else if (r == -ECANCELED)
 869         return;
 870       else if (r == -EAGAIN)
 871         mon->dispatch_op(op);
 872       else
 873         assert(0 == "bad C_Command return value");
 874     }
 875   };
 876
 877  private:
 878   class C_RetryMessage : public C_MonOp {
         // Completion context that feeds its op back into Monitor::dispatch_op().
 879     Monitor *mon;
 880   public:
 881     C_RetryMessage(Monitor *m, MonOpRequestRef op) :
 882       C_MonOp(op), mon(m) { }
 883
 884     void _finish(int r) override {
 885       const bool retry = (r >= 0 || r == -EAGAIN);
 886       if (retry) {
 887         mon->dispatch_op(op);
 888       } else if (r != -ECANCELED) {
             // -ECANCELED is silently dropped; any other negative code is a bug
 889         assert(0 == "bad C_RetryMessage return value");
 890       }
 891     }
 892   };
 893
 894   //ms_dispatch handles a lot of logic and we want to reuse it
 895   //on forwarded messages, so we create a non-locking version for this class
 896   void _ms_dispatch(Message *m);
 897   bool ms_dispatch(Message *m) override {
         // Locking wrapper around _ms_dispatch(): takes `lock` for the duration
         // of the call and always returns true.  A scoped Mutex::Locker replaces
         // the manual Lock()/Unlock() pair so the mutex is also released if
         // _ms_dispatch() exits by throwing.
 898     Mutex::Locker l(lock);
 899     _ms_dispatch(m);
 900     return true;
 901   }
 903   void dispatch_op(MonOpRequestRef op);
 904   //mon_caps is used for un-connected messages from monitors
 905   MonCap * mon_caps;
 906   bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override;
 907   bool ms_verify_authorizer(Connection *con, int peer_type,
 908                             int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
 909                             bool& isvalid, CryptoKey& session_key,
 910                             std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
 911   bool ms_handle_reset(Connection *con) override;
 912   void ms_handle_remote_reset(Connection *con) override {}  // empty body: remote resets are ignored here
 913   bool ms_handle_refused(Connection *con) override;
 914
 915   int write_default_keyring(bufferlist& bl);
 916   void extract_save_mon_key(KeyRing& keyring);
 917
 918   void collect_metadata(Metadata *m);
 919   void update_mon_metadata(int from, Metadata&& m);
 920   int load_metadata();
 921   void count_metadata(const string& field, Formatter *f);
 922   void count_metadata(const string& field, map<string,int> *out);
 923
 924   // features
 925   static CompatSet get_initial_supported_features();
 926   static CompatSet get_supported_features();
 927   static CompatSet get_legacy_features();
 928   /// read the ondisk features into the CompatSet pointed to by read_features
 929   static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features);
 930   void read_features();
 931   void write_features(MonitorDBStore::TransactionRef t);
 932
 933   OpTracker op_tracker;
 934
 935  public:
 936   Monitor(CephContext *cct_, string nm, MonitorDBStore *s,
 937           Messenger *m, Messenger *mgr_m, MonMap *map);
 938   ~Monitor() override;
 939
 940   static int check_features(MonitorDBStore *store);
 941
 942   // config observer
 943   const char** get_tracked_conf_keys() const override;
 944   void handle_conf_change(const struct md_config_t *conf,
 945                           const std::set<std::string> &changed) override;
 946
 947   void update_log_clients();
 948   int sanitize_options();
 949   int preinit();
 950   int init();
 951   void init_paxos();
 952   void refresh_from_paxos(bool *need_bootstrap);
 953   void shutdown();
 954   void tick();
 955
 956   void handle_signal(int sig);
 957
 958   int mkfs(bufferlist& osdmapbl);
 959
 960   /**
 961    * check cluster_fsid file
 962    *
 963    * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code
 964    */
 965   int check_fsid();
 966
 967   /**
 968    * write cluster_fsid file
 969    *
 970    * @return 0 on success, or negative error code
 971    */
 972   int write_fsid();
 973   int write_fsid(MonitorDBStore::TransactionRef t);
 974
 975   void do_admin_command(std::string command, cmdmap_t& cmdmap,
 976                         std::string format, ostream& ss);
 977
 978 private:
 979   // don't allow copying
 980   Monitor(const Monitor& rhs);
 981   Monitor& operator=(const Monitor &rhs);
 982
 983 public:
 984   static void format_command_descriptions(const std::vector<MonCommand> &commands,
 985                                           Formatter *f,
 986                                           bufferlist *rdata,
 987                                           bool hide_mgr_flag=false);
 988
 989   const std::vector<MonCommand> &get_local_commands(mon_feature_t f) {
 990     if (f.contains_all(ceph::features::mon::FEATURE_LUMINOUS))
 991       return local_mon_commands;
 992     else
 993       return local_upgrading_mon_commands;
 994   }
 995   const bufferlist& get_local_commands_bl(mon_feature_t f) {
 996     if (f.contains_all(ceph::features::mon::FEATURE_LUMINOUS))
 997       return local_mon_commands_bl;
 998     else
 999       return local_upgrading_mon_commands_bl;
1000   }
1001   void set_leader_commands(const std::vector<MonCommand>& cmds) {
1002     leader_mon_commands = cmds;
1003   }
1004
1005   static bool is_keyring_required();
1006 };
1007
/*
 * CEPH_MON_FEATURE_INCOMPAT_* definitions.  Each macro expands to a
 * CompatSet::Feature built from a numeric id and a description string.
 *
 * When you add a feature here, make sure you also add it to
 * Monitor::get_supported_features.
 */
#define CEPH_MON_FEATURE_INCOMPAT_BASE \
  CompatSet::Feature(1, "initial feature set (~v.18)")
#define CEPH_MON_FEATURE_INCOMPAT_GV \
  CompatSet::Feature(2, "global version sequencing (v0.52)")
#define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS \
  CompatSet::Feature(3, "single paxos with k/v store (v0.\?)")
#define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES \
  CompatSet::Feature(4, "support erasure code pools")
#define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC \
  CompatSet::Feature(5, "new-style osdmap encoding")
#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 \
  CompatSet::Feature(6, "support isa/lrc erasure code")
#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 \
  CompatSet::Feature(7, "support shec erasure code")
#define CEPH_MON_FEATURE_INCOMPAT_KRAKEN \
  CompatSet::Feature(8, "support monmap features")
#define CEPH_MON_FEATURE_INCOMPAT_LUMINOUS \
  CompatSet::Feature(9, "luminous ondisk layout")
1018
1019
1020
1021 #endif