ceph/src/mon/Monitor.h

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15 /*
  16  * This is the top level monitor. It runs on each machine in the Monitor
  17  * Cluster. The election of a leader for the paxos algorithm only happens
  18  * once per machine via the elector. There is a separate paxos instance (state)
  19  * kept for each of the system components: Object Store Device (OSD) Monitor,
  20  * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor.
  21  */
  22
  23 #ifndef CEPH_MONITOR_H
  24 #define CEPH_MONITOR_H
  25
  26 #include <errno.h>
  27 #include <cmath>
  28
  29 #include "include/types.h"
  30 #include "include/health.h"
  31 #include "msg/Messenger.h"
  32
  33 #include "common/Timer.h"
  34
  35 #include "health_check.h"
  36 #include "MonMap.h"
  37 #include "Elector.h"
  38 #include "Paxos.h"
  39 #include "Session.h"
  40 #include "PGStatService.h"
  41 #include "MonCommand.h"
  42
  43 #include "common/LogClient.h"
  44 #include "auth/cephx/CephxKeyServer.h"
  45 #include "auth/AuthMethodList.h"
  46 #include "auth/KeyRing.h"
  47 #include "messages/MMonCommand.h"
  48 #include "mon/MonitorDBStore.h"
  49 #include "include/memory.h"
  50 #include "mgr/MgrClient.h"
  51
  52 #include "mon/MonOpRequest.h"
  53 #include "common/WorkQueue.h"
  54
  55
  56 #define CEPH_MON_PROTOCOL     13 /* cluster internal */
  57
  58
/**
 * Identifiers for cluster-wide statistics.
 *
 * Enumerators are numbered consecutively, starting right after
 * l_cluster_first (555000); l_cluster_first and l_cluster_last bracket
 * the range.
 * NOTE(review): presumably these are the indices used with the
 * cluster_logger PerfCounters instance -- confirm against Monitor.cc.
 */
enum {
  l_cluster_first = 555000,
  l_cluster_num_mon,
  l_cluster_num_mon_quorum,
  l_cluster_num_osd,
  l_cluster_num_osd_up,
  l_cluster_num_osd_in,
  l_cluster_osd_epoch,
  l_cluster_osd_bytes,
  l_cluster_osd_bytes_used,
  l_cluster_osd_bytes_avail,
  l_cluster_num_pool,
  l_cluster_num_pg,
  l_cluster_num_pg_active_clean,
  l_cluster_num_pg_active,
  l_cluster_num_pg_peering,
  l_cluster_num_object,
  l_cluster_num_object_degraded,
  l_cluster_num_object_misplaced,
  l_cluster_num_object_unfound,
  l_cluster_num_bytes,
  l_cluster_num_mds_up,
  l_cluster_num_mds_in,
  l_cluster_num_mds_failed,
  l_cluster_mds_epoch,
  l_cluster_last,
};
  86
/**
 * Identifiers for per-monitor statistics (sessions and elections).
 *
 * Enumerators are numbered consecutively, starting right after
 * l_mon_first (456000); l_mon_first and l_mon_last bracket the range.
 * NOTE(review): presumably these are the indices used with the `logger`
 * PerfCounters instance -- confirm against Monitor.cc.
 */
enum {
  l_mon_first = 456000,
  l_mon_num_sessions,
  l_mon_session_add,
  l_mon_session_rm,
  l_mon_session_trim,
  l_mon_num_elections,
  l_mon_election_call,
  l_mon_election_win,
  l_mon_election_lose,
  l_mon_last,
};
  99
 100 class QuorumService;
 101 class PaxosService;
 102
 103 class PerfCounters;
 104 class AdminSocketHook;
 105
 106 class MMonGetMap;
 107 class MMonGetVersion;
 108 class MMonMetadata;
 109 class MMonSync;
 110 class MMonScrub;
 111 class MMonProbe;
 112 struct MMonSubscribe;
 113 struct MRoute;
 114 struct MForward;
 115 struct MTimeCheck;
 116 struct MMonHealth;
 117
 118 #define COMPAT_SET_LOC "feature_set"
 119
/**
 * Callback context associated with a Monitor.
 *
 * The supplied boost::function is moved into the FunctionContext base and
 * a pointer to the monitor is stored alongside it.  finish() is overridden
 * here; its definition lives outside this header.
 */
class C_MonContext final : public FunctionContext {
  const Monitor *mon;  // monitor this context was created for; never deleted by this class
public:
  /// @param m        monitor to associate with the callback
  /// @param callback callable taking an int; moved into the FunctionContext base
  explicit C_MonContext(Monitor *m, boost::function<void(int)>&& callback)
    : FunctionContext(std::move(callback)), mon(m) {}
  void finish(int r) override;
};
 127
 128 class Monitor : public Dispatcher,
 129                 public md_config_obs_t {
 130 public:
 131   // me
 132   string name;
 133   int rank;
 134   Messenger *messenger;
 135   ConnectionRef con_self;
 136   Mutex lock;
 137   SafeTimer timer;
 138   Finisher finisher;
 139   ThreadPool cpu_tp;  ///< threadpool for CPU intensive work
 140
 141   /// true if we have ever joined a quorum.  if false, we are either a
 142   /// new cluster, a newly joining monitor, or a just-upgraded
 143   /// monitor.
 144   bool has_ever_joined;
 145
 146   PerfCounters *logger, *cluster_logger;
 147   bool cluster_logger_registered;
 148
 149   void register_cluster_logger();
 150   void unregister_cluster_logger();
 151
 152   MonMap *monmap;
 153   uuid_d fingerprint;
 154
 155   set<entity_addr_t> extra_probe_peers;
 156
 157   LogClient log_client;
 158   LogChannelRef clog;
 159   LogChannelRef audit_clog;
 160   KeyRing keyring;
 161   KeyServer key_server;
 162
 163   AuthMethodList auth_cluster_required;
 164   AuthMethodList auth_service_required;
 165
 166   CompatSet features;
 167
 168   const MonCommand *leader_supported_mon_commands;
 169   int leader_supported_mon_commands_size;
 170
 171   Messenger *mgr_messenger;
 172   MgrClient mgr_client;
 173   uint64_t mgr_proxy_bytes = 0;  // in-flight proxied mgr command message bytes
 174
 175   const MonPGStatService *pgservice;
 176
 177 private:
 178   void new_tick();
 179
 180   // -- local storage --
 181 public:
 182   MonitorDBStore *store;
 183   static const string MONITOR_NAME;
 184   static const string MONITOR_STORE_PREFIX;
 185
 186   // -- monitor state --
 187 private:
 188   enum {
 189     STATE_PROBING = 1,
 190     STATE_SYNCHRONIZING,
 191     STATE_ELECTING,
 192     STATE_LEADER,
 193     STATE_PEON,
 194     STATE_SHUTDOWN
 195   };
 196   int state;
 197
 198 public:
 199   static const char *get_state_name(int s) {
 200     switch (s) {
 201     case STATE_PROBING: return "probing";
 202     case STATE_SYNCHRONIZING: return "synchronizing";
 203     case STATE_ELECTING: return "electing";
 204     case STATE_LEADER: return "leader";
 205     case STATE_PEON: return "peon";
 206     case STATE_SHUTDOWN: return "shutdown";
 207     default: return "???";
 208     }
 209   }
  /// Printable name of the state this monitor is currently in.
  const char *get_state_name() const {
    return get_state_name(state);
  }

  // Convenience predicates: each one compares `state` against a single
  // STATE_* value, so at most one of them is true at any time.
  bool is_shutdown() const { return state == STATE_SHUTDOWN; }
  bool is_probing() const { return state == STATE_PROBING; }
  bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; }
  bool is_electing() const { return state == STATE_ELECTING; }
  bool is_leader() const { return state == STATE_LEADER; }
  bool is_peon() const { return state == STATE_PEON; }
 220
 221   const utime_t &get_leader_since() const;
 222
 223   void prepare_new_fingerprint(MonitorDBStore::TransactionRef t);
 224
 225   // -- elector --
 226 private:
 227   Paxos *paxos;
 228   Elector elector;
 229   friend class Elector;
 230
 231   /// features we require of peers (based on on-disk compatset)
 232   uint64_t required_features;
 233
 234   int leader;            // current leader (to best of knowledge)
 235   set<int> quorum;       // current active set of monitors (if !starting)
 236   utime_t leader_since;  // when this monitor became the leader, if it is the leader
 237   utime_t exited_quorum; // time detected as not in quorum; 0 if in
 238
 239   // map of counts of connected clients, by type and features, for
 240   // each quorum mon
 241   map<int,FeatureMap> quorum_feature_map;
 242
 243   /**
 244    * Intersection of quorum member's connection feature bits.
 245    */
 246   uint64_t quorum_con_features;
 247   /**
 248    * Intersection of quorum members mon-specific feature bits
 249    */
 250   mon_feature_t quorum_mon_features;
 251   bufferlist supported_commands_bl; // encoded MonCommands we support
 252
 253   set<string> outside_quorum;
 254
 255   /**
 256    * @defgroup Monitor_h_scrub
 257    * @{
 258    */
 259   version_t scrub_version;            ///< paxos version we are scrubbing
 260   map<int,ScrubResult> scrub_result;  ///< results so far
 261
 262   /**
 263    * trigger a cross-mon scrub
 264    *
 265    * Verify all mons are storing identical content
 266    */
 267   int scrub_start();
 268   int scrub();
 269   void handle_scrub(MonOpRequestRef op);
 270   bool _scrub(ScrubResult *r,
 271               pair<string,string> *start,
 272               int *num_keys);
 273   void scrub_check_results();
 274   void scrub_timeout();
 275   void scrub_finish();
 276   void scrub_reset();
 277   void scrub_update_interval(int secs);
 278
 279   Context *scrub_event;       ///< periodic event to trigger scrub (leader)
 280   Context *scrub_timeout_event;  ///< scrub round timeout (leader)
 281   void scrub_event_start();
 282   void scrub_event_cancel();
 283   void scrub_reset_timeout();
 284   void scrub_cancel_timeout();
 285
 286   struct ScrubState {
 287     pair<string,string> last_key; ///< last scrubbed key
 288     bool finished;
 289
 290     ScrubState() : finished(false) { }
 291     virtual ~ScrubState() { }
 292   };
 293   ceph::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
 294
 295   /**
 296    * @defgroup Monitor_h_sync Synchronization
 297    * @{
 298    */
 299   /**
 300    * @} // provider state
 301    */
 302   struct SyncProvider {
 303     entity_inst_t entity;  ///< who
 304     uint64_t cookie;       ///< unique cookie for this sync attempt
 305     utime_t timeout;       ///< when we give up and expire this attempt
 306     version_t last_committed; ///< last paxos version on peer
 307     pair<string,string> last_key; ///< last key sent to (or on) peer
 308     bool full;             ///< full scan?
 309     MonitorDBStore::Synchronizer synchronizer;   ///< iterator
 310
 311     SyncProvider() : cookie(0), last_committed(0), full(false) {}
 312
 313     void reset_timeout(CephContext *cct, int grace) {
 314       timeout = ceph_clock_now();
 315       timeout += grace;
 316     }
 317   };
 318
 319   map<uint64_t, SyncProvider> sync_providers;  ///< cookie -> SyncProvider for those syncing from us
 320   uint64_t sync_provider_count;   ///< counter for issued cookies to keep them unique
 321
 322   /**
 323    * @} // requester state
 324    */
 325   entity_inst_t sync_provider;   ///< who we are syncing from
 326   uint64_t sync_cookie;          ///< 0 if we are starting, non-zero otherwise
 327   bool sync_full;                ///< true if we are a full sync, false for recent catch-up
 328   version_t sync_start_version;  ///< last_committed at sync start
 329   Context *sync_timeout_event;   ///< timeout event
 330
 331   /**
 332    * floor for sync source
 333    *
 334    * When we sync we forget about our old last_committed value which
 335    * can be dangerous.  For example, if we have a cluster of:
 336    *
 337    *   mon.a: lc 100
 338    *   mon.b: lc 80
 339    *   mon.c: lc 100 (us)
 340    *
 341    * If something forces us to sync (say, corruption, or manual
 342    * intervention, or bug), we forget last_committed, and might abort.
 343    * If mon.a happens to be down when we come back, we will see:
 344    *
 345    *   mon.b: lc 80
 346    *   mon.c: lc 0 (us)
 347    *
 348    * and sync from mon.b, at which point a+b will both have lc 80 and
 349    * come online with a majority holding out of date commits.
 350    *
 351    * Avoid this by preserving our old last_committed value prior to
 352    * sync and never going backwards.
 353    */
 354   version_t sync_last_committed_floor;
 355
 356   /**
 357    * Obtain the synchronization target prefixes in set form.
 358    *
 359    * We consider a target prefix all those that are relevant when
 360    * synchronizing two stores. That is, all those that hold paxos service's
 361    * versions, as well as paxos versions, or any control keys such as the
 362    * first or last committed version.
 363    *
 364    * Given the current design, this function should return the name of all and
 365    * any available paxos service, plus the paxos name.
 366    *
 367    * @returns a set of strings referring to the prefixes being synchronized
 368    */
 369   set<string> get_sync_targets_names();
 370
 371   /**
 372    * Reset the monitor's sync-related data structures for syncing *from* a peer
 373    */
 374   void sync_reset_requester();
 375
 376   /**
 377    * Reset sync state related to allowing others to sync from us
 378    */
 379   void sync_reset_provider();
 380
 381   /**
  382    * Called when a sync attempt times out (requester-side)
 383    */
 384   void sync_timeout();
 385
 386   /**
 387    * Get the latest monmap for backup purposes during sync
 388    */
 389   void sync_obtain_latest_monmap(bufferlist &bl);
 390
 391   /**
 392    * Start sync process
 393    *
 394    * Start pulling committed state from another monitor.
 395    *
 396    * @param entity where to pull committed state from
 397    * @param full whether to do a full sync or just catch up on recent paxos
 398    */
 399   void sync_start(entity_inst_t &entity, bool full);
 400
 401 public:
 402   /**
 403    * force a sync on next mon restart
 404    */
 405   void sync_force(Formatter *f, ostream& ss);
 406
 407 private:
 408   /**
 409    * store critical state for safekeeping during sync
 410    *
 411    * We store a few things on the side that we don't want to get clobbered by sync.  This
 412    * includes the latest monmap and a lower bound on last_committed.
 413    */
 414   void sync_stash_critical_state(MonitorDBStore::TransactionRef tx);
 415
 416   /**
 417    * reset the sync timeout
 418    *
 419    * This is used on the client to restart if things aren't progressing
 420    */
 421   void sync_reset_timeout();
 422
 423   /**
 424    * trim stale sync provider state
 425    *
 426    * If someone is syncing from us and hasn't talked to us recently, expire their state.
 427    */
 428   void sync_trim_providers();
 429
 430   /**
 431    * Complete a sync
 432    *
 433    * Finish up a sync after we've gotten all of the chunks.
 434    *
 435    * @param last_committed final last_committed value from provider
 436    */
 437   void sync_finish(version_t last_committed);
 438
 439   /**
 440    * request the next chunk from the provider
 441    */
 442   void sync_get_next_chunk();
 443
 444   /**
 445    * handle sync message
 446    *
 447    * @param m Sync message with operation type MMonSync::OP_START_CHUNKS
 448    */
 449   void handle_sync(MonOpRequestRef op);
 450
 451   void _sync_reply_no_cookie(MonOpRequestRef op);
 452
 453   void handle_sync_get_cookie(MonOpRequestRef op);
 454   void handle_sync_get_chunk(MonOpRequestRef op);
 455   void handle_sync_finish(MonOpRequestRef op);
 456
 457   void handle_sync_cookie(MonOpRequestRef op);
 458   void handle_sync_forward(MonOpRequestRef op);
 459   void handle_sync_chunk(MonOpRequestRef op);
 460   void handle_sync_no_cookie(MonOpRequestRef op);
 461
 462   /**
 463    * @} // Synchronization
 464    */
 465
 466   list<Context*> waitfor_quorum;
 467   list<Context*> maybe_wait_for_quorum;
 468
 469   /**
 470    * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
 471    * @{
 472    *
 473    * We use time checks to keep track of any clock drifting going on in the
  474    * cluster. This is accomplished by periodically pinging each monitor in the
  475    * quorum and registering its response time on a map, assessing how much its
 476    * clock has drifted. We also take this opportunity to assess the latency
 477    * on response.
 478    *
 479    * This mechanism works as follows:
 480    *
 481    *  - Leader sends out a 'PING' message to each other monitor in the quorum.
 482    *    The message is timestamped with the leader's current time. The leader's
 483    *    current time is recorded in a map, associated with each peon's
 484    *    instance.
 485    *  - The peon replies to the leader with a timestamped 'PONG' message.
 486    *  - The leader calculates a delta between the peon's timestamp and its
 487    *    current time and stashes it.
 488    *  - The leader also calculates the time it took to receive the 'PONG'
 489    *    since the 'PING' was sent, and stashes an approximate latency estimate.
 490    *  - Once all the quorum members have pong'ed, the leader will share the
 491    *    clock skew and latency maps with all the monitors in the quorum.
 492    */
 493   map<entity_inst_t, utime_t> timecheck_waiting;
 494   map<entity_inst_t, double> timecheck_skews;
 495   map<entity_inst_t, double> timecheck_latencies;
 496   // odd value means we are mid-round; even value means the round has
 497   // finished.
 498   version_t timecheck_round;
 499   unsigned int timecheck_acks;
 500   utime_t timecheck_round_start;
 501   friend class HealthMonitor;
 502   /* When we hit a skew we will start a new round based off of
 503    * 'mon_timecheck_skew_interval'. Each new round will be backed off
 504    * until we hit 'mon_timecheck_interval' -- which is the typical
 505    * interval when not in the presence of a skew.
 506    *
 507    * This variable tracks the number of rounds with skews since last clean
 508    * so that we can report to the user and properly adjust the backoff.
 509    */
 510   uint64_t timecheck_rounds_since_clean;
 511   /**
 512    * Time Check event.
 513    */
 514   Context *timecheck_event;
 515
 516   void timecheck_start();
 517   void timecheck_finish();
 518   void timecheck_start_round();
 519   void timecheck_finish_round(bool success = true);
 520   void timecheck_cancel_round();
 521   void timecheck_cleanup();
 522   void timecheck_reset_event();
 523   void timecheck_check_skews();
 524   void timecheck_report();
 525   void timecheck();
 526   health_status_t timecheck_status(ostringstream &ss,
 527                                    const double skew_bound,
 528                                    const double latency);
 529   void handle_timecheck_leader(MonOpRequestRef op);
 530   void handle_timecheck_peon(MonOpRequestRef op);
 531   void handle_timecheck(MonOpRequestRef op);
 532
 533   /**
 534    * Returns 'true' if this is considered to be a skew; 'false' otherwise.
 535    */
 536   bool timecheck_has_skew(const double skew_bound, double *abs) const {
 537     double abs_skew = std::fabs(skew_bound);
 538     if (abs)
 539       *abs = abs_skew;
 540     return (abs_skew > g_conf->mon_clock_drift_allowed);
 541   }
 542
 543   /**
 544    * @}
 545    */
 546   /**
 547    * Handle ping messages from others.
 548    */
 549   void handle_ping(MonOpRequestRef op);
 550
 551   Context *probe_timeout_event = nullptr;  // for probing
 552
 553   void reset_probe_timeout();
 554   void cancel_probe_timeout();
 555   void probe_timeout(int r);
 556
 557   void _apply_compatset_features(CompatSet &new_features);
 558
 559 public:
 560   epoch_t get_epoch();
 561   int get_leader() const { return leader; }
 562   string get_leader_name() {
 563     return quorum.empty() ? string() : monmap->get_name(*quorum.begin());
 564   }
 565   const set<int>& get_quorum() const { return quorum; }
 566   list<string> get_quorum_names() {
 567     list<string> q;
 568     for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p)
 569       q.push_back(monmap->get_name(*p));
 570     return q;
 571   }
 572   uint64_t get_quorum_con_features() const {
 573     return quorum_con_features;
 574   }
 575   mon_feature_t get_quorum_mon_features() const {
 576     return quorum_mon_features;
 577   }
 578   uint64_t get_required_features() const {
 579     return required_features;
 580   }
 581   mon_feature_t get_required_mon_features() const {
 582     return monmap->get_required_features();
 583   }
 584   void apply_quorum_to_compatset_features();
 585   void apply_monmap_to_compatset_features();
 586   void calc_quorum_requirements();
 587
 588   void get_combined_feature_map(FeatureMap *fm);
 589
 590 private:
 591   void _reset();   ///< called from bootstrap, start_, or join_election
 592   void wait_for_paxos_write();
 593   void _finish_svc_election(); ///< called by {win,lose}_election
 594 public:
 595   void bootstrap();
 596   void join_election();
 597   void start_election();
 598   void win_standalone_election();
 599   // end election (called by Elector)
 600   void win_election(epoch_t epoch, set<int>& q,
 601                     uint64_t features,
 602                     const mon_feature_t& mon_features,
 603                     const map<int,Metadata>& metadata,
 604                     const MonCommand *cmdset, int cmdsize);
 605   void lose_election(epoch_t epoch, set<int>& q, int l,
 606                      uint64_t features,
 607                      const mon_feature_t& mon_features);
 608   // end election (called by Elector)
 609   void finish_election();
 610
 611   const bufferlist& get_supported_commands_bl() {
 612     return supported_commands_bl;
 613   }
 614
 615   void update_logger();
 616
 617   /**
 618    * Vector holding the Services serviced by this Monitor.
 619    */
 620   vector<PaxosService*> paxos_service;
 621
  // Typed accessors for individual entries of paxos_service.  Each one
  // indexes the vector with a PAXOS_* constant and casts the stored
  // PaxosService* to the concrete service type; no bounds or null check
  // is performed here.
  // NOTE(review): the service classes are only named through elaborated
  // type specifiers and appear to be incomplete in this header, which is
  // presumably why C-style casts are used -- confirm before switching to
  // static_cast.
  class PGMonitor *pgmon() {
    return (class PGMonitor *)paxos_service[PAXOS_PGMAP];
  }

  class MDSMonitor *mdsmon() {
    return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP];
  }

  class MonmapMonitor *monmon() {
    return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP];
  }

  class OSDMonitor *osdmon() {
    return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP];
  }

  class AuthMonitor *authmon() {
    return (class AuthMonitor *)paxos_service[PAXOS_AUTH];
  }

  class LogMonitor *logmon() {
    return (class LogMonitor*) paxos_service[PAXOS_LOG];
  }

  class MgrMonitor *mgrmon() {
    return (class MgrMonitor*) paxos_service[PAXOS_MGR];
  }

  class MgrStatMonitor *mgrstatmon() {
    return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT];
  }

  // NOTE(review): identical to mgrstatmon() -- same return type and same
  // PAXOS_MGRSTAT slot.  This looks like a copy/paste leftover; confirm
  // whether a dedicated health service type and slot were intended.
  class MgrStatMonitor *healthmon() {
    return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT];
  }
 657
 658   friend class Paxos;
 659   friend class OSDMonitor;
 660   friend class MDSMonitor;
 661   friend class MonmapMonitor;
 662   friend class PGMonitor;
 663   friend class LogMonitor;
 664   friend class ConfigKeyService;
 665
 666   QuorumService *health_monitor;
 667   QuorumService *config_key_service;
 668
 669   // -- sessions --
 670   MonSessionMap session_map;
 671   Mutex session_map_lock{"Monitor::session_map_lock"};
 672   AdminSocketHook *admin_hook;
 673
  /**
   * Invoke @p func on session_map while holding session_map_lock.
   *
   * The lock is held through a scoped Mutex::Locker, so it is released
   * when this function returns.  The Args parameter pack is not used.
   *
   * @param func callable invoked with session_map as its only argument;
   *             its return value, if any, is discarded
   */
  template<typename Func, typename...Args>
  void with_session_map(Func&& func) {
    Mutex::Locker l(session_map_lock);
    std::forward<Func>(func)(session_map);
  }
 679   void send_latest_monmap(Connection *con);
 680
 681   // messages
 682   void handle_get_version(MonOpRequestRef op);
 683   void handle_subscribe(MonOpRequestRef op);
 684   void handle_mon_get_map(MonOpRequestRef op);
 685
 686   static void _generate_command_map(map<string,cmd_vartype>& cmdmap,
 687                                     map<string,string> &param_str_map);
 688   static const MonCommand *_get_moncommand(
 689     const string &cmd_prefix,
 690     const MonCommand *cmds,
 691     int cmds_size);
 692   bool _allowed_command(MonSession *s, string &module, string &prefix,
 693                         const map<string,cmd_vartype>& cmdmap,
 694                         const map<string,string>& param_str_map,
 695                         const MonCommand *this_cmd);
 696   void get_mon_status(Formatter *f, ostream& ss);
 697   void _quorum_status(Formatter *f, ostream& ss);
 698   bool _add_bootstrap_peer_hint(string cmd, cmdmap_t& cmdmap, ostream& ss);
 699   void handle_command(MonOpRequestRef op);
 700   void handle_route(MonOpRequestRef op);
 701
 702   void handle_mon_metadata(MonOpRequestRef op);
 703   int get_mon_metadata(int mon, Formatter *f, ostream& err);
 704   int print_nodes(Formatter *f, ostream& err);
 705
 706   // Accumulate metadata across calls to update_mon_metadata
 707   map<int, Metadata> mon_metadata;
 708   map<int, Metadata> pending_metadata;
 709
  /**
   * Health status cache: an overall status plus a summary string.
   *
   * `overall` has no initializer in this struct and reset() leaves it
   * untouched, so its value is whatever was last assigned to it (and is
   * indeterminate before the first assignment unless the owner
   * initializes it).
   */
  struct health_cache_t {
    health_status_t overall;  // not modified by reset(); see below
    string summary;

    /// Clear the cached summary text; `overall` is deliberately left as is.
    void reset() {
      // health_status_t doesn't really have a NONE value and we're not
      // okay with setting something else (say, HEALTH_ERR).  so just
      // leave it be.
      summary.clear();
    }
  } health_status_cache;
 724
 725   Context *health_tick_event = nullptr;
 726   Context *health_interval_event = nullptr;
 727
 728   void health_tick_start();
 729   void health_tick_stop();
 730   utime_t health_interval_calc_next_update();
 731   void health_interval_start();
 732   void health_interval_stop();
 733   void health_events_cleanup();
 734
 735   void health_to_clog_update_conf(const std::set<std::string> &changed);
 736
 737   void do_health_to_clog_interval();
 738   void do_health_to_clog(bool force = false);
 739
 740   /**
 741    * Generate health report
 742    *
 743    * @param status one-line status summary
 744    * @param detailbl optional bufferlist* to fill with a detailed report
 745    * @returns health status
 746    */
 747   health_status_t get_health(list<string>& status, bufferlist *detailbl,
 748                              Formatter *f);
 749
 750   health_status_t get_health_status(
 751     bool want_detail,
 752     Formatter *f,
 753     std::string *plain,
 754     const char *sep1 = " ",
 755     const char *sep2 = "; ");
 756   void log_health(
 757     const health_check_map_t& updated,
 758     const health_check_map_t& previous,
 759     MonitorDBStore::TransactionRef t);
 760
 761   void get_cluster_status(stringstream &ss, Formatter *f);
 762
 763   void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version);
 764   void reply_command(MonOpRequestRef op, int rc, const string &rs, bufferlist& rdata, version_t version);
 765
 766
 767   void handle_probe(MonOpRequestRef op);
 768   /**
 769    * Handle a Probe Operation, replying with our name, quorum and known versions.
 770    *
 771    * We use the MMonProbe message class for anything and everything related with
 772    * Monitor probing. One of the operations relates directly with the probing
 773    * itself, in which we receive a probe request and to which we reply with
 774    * our name, our quorum and the known versions for each Paxos service. Thus the
 775    * redundant function name. This reply will obviously be sent to the one
 776    * probing/requesting these infos.
 777    *
 778    * @todo Add @pre and @post
 779    *
 780    * @param m A Probe message, with an operation of type Probe.
 781    */
 782   void handle_probe_probe(MonOpRequestRef op);
 783   void handle_probe_reply(MonOpRequestRef op);
 784
 785   // request routing
 786   struct RoutedRequest {
 787     uint64_t tid;
 788     bufferlist request_bl;
 789     MonSession *session;
 790     ConnectionRef con;
 791     uint64_t con_features;
 792     entity_inst_t client_inst;
 793     MonOpRequestRef op;
 794
 795     RoutedRequest() : tid(0), session(NULL), con_features(0) {}
 796     ~RoutedRequest() {
 797       if (session)
 798         session->put();
 799     }
 800   };
 801   uint64_t routed_request_tid;
 802   map<uint64_t, RoutedRequest*> routed_requests;
 803
 804   void forward_request_leader(MonOpRequestRef op);
 805   void handle_forward(MonOpRequestRef op);
 806   void try_send_message(Message *m, const entity_inst_t& to);
 807   void send_reply(MonOpRequestRef op, Message *reply);
 808   void no_reply(MonOpRequestRef op);
 809   void resend_routed_requests();
 810   void remove_session(MonSession *s);
 811   void remove_all_sessions();
 812   void waitlist_or_zap_client(MonOpRequestRef op);
 813
 814   void send_command(const entity_inst_t& inst,
 815                     const vector<string>& com);
 816
 817 public:
 818   struct C_Command : public C_MonOp {
 819     Monitor *mon;
 820     int rc;
 821     string rs;
 822     bufferlist rdata;
 823     version_t version;
 824     C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, version_t v) :
 825       C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v){}
 826     C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, bufferlist rd, version_t v) :
 827       C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v){}
 828
 829     void _finish(int r) override {
 830       MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
 831       if (r >= 0) {
 832         ostringstream ss;
 833         if (!op->get_req()->get_connection()) {
 834           ss << "connection dropped for command ";
 835         } else {
 836           MonSession *s = op->get_session();
 837
 838           // if client drops we may not have a session to draw information from.
 839           if (s) {
 840             ss << "from='" << s->inst << "' "
 841               << "entity='" << s->entity_name << "' ";
 842           } else {
 843             ss << "session dropped for command ";
 844           }
 845         }
 846         ss << "cmd='" << m->cmd << "': finished";
 847
 848         mon->audit_clog->info() << ss.str();
 849         mon->reply_command(op, rc, rs, rdata, version);
 850       }
 851       else if (r == -ECANCELED)
 852         return;
 853       else if (r == -EAGAIN)
 854         mon->dispatch_op(op);
 855       else
 856         assert(0 == "bad C_Command return value");
 857     }
 858   };
 859
 860  private:
 861   class C_RetryMessage : public C_MonOp {
 862     Monitor *mon;
 863   public:
 864     C_RetryMessage(Monitor *m, MonOpRequestRef op) :
 865       C_MonOp(op), mon(m) { }
 866
 867     void _finish(int r) override {
 868       if (r == -EAGAIN || r >= 0)
 869         mon->dispatch_op(op);
 870       else if (r == -ECANCELED)
 871         return;
 872       else
 873         assert(0 == "bad C_RetryMessage return value");
 874     }
 875   };
 876
 877   //ms_dispatch handles a lot of logic and we want to reuse it
 878   //on forwarded messages, so we create a non-locking version for this class
 879   void _ms_dispatch(Message *m);
 880   bool ms_dispatch(Message *m) override {
 881     lock.Lock();
 882     _ms_dispatch(m);
 883     lock.Unlock();
 884     return true;
 885   }
 886   void dispatch_op(MonOpRequestRef op);
 887   //mon_caps is used for un-connected messages from monitors
 888   MonCap * mon_caps;
 889   bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override;
 890   bool ms_verify_authorizer(Connection *con, int peer_type,
 891                             int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
 892                             bool& isvalid, CryptoKey& session_key) override;
 893   bool ms_handle_reset(Connection *con) override;
 894   void ms_handle_remote_reset(Connection *con) override {}
 895   bool ms_handle_refused(Connection *con) override;
 896
 897   int write_default_keyring(bufferlist& bl);
 898   void extract_save_mon_key(KeyRing& keyring);
 899
 900   void collect_metadata(Metadata *m);
 901   void update_mon_metadata(int from, Metadata&& m);
 902   int load_metadata();
 903   void count_metadata(const string& field, Formatter *f);
 904   void count_metadata(const string& field, map<string,int> *out);
 905
 906   // features
 907   static CompatSet get_initial_supported_features();
 908   static CompatSet get_supported_features();
 909   static CompatSet get_legacy_features();
 910   /// read the ondisk features into the CompatSet pointed to by read_features
 911   static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features);
 912   void read_features();
 913   void write_features(MonitorDBStore::TransactionRef t);
 914
 915   OpTracker op_tracker;
 916
 917  public:
 918   Monitor(CephContext *cct_, string nm, MonitorDBStore *s,
 919           Messenger *m, Messenger *mgr_m, MonMap *map);
 920   ~Monitor() override;
 921
 922   static int check_features(MonitorDBStore *store);
 923
 924   // config observer
 925   const char** get_tracked_conf_keys() const override;
 926   void handle_conf_change(const struct md_config_t *conf,
 927                           const std::set<std::string> &changed) override;
 928
 929   void update_log_clients();
 930   int sanitize_options();
 931   int preinit();
 932   int init();
 933   void init_paxos();
 934   void refresh_from_paxos(bool *need_bootstrap);
 935   void shutdown();
 936   void tick();
 937
 938   void handle_signal(int sig);
 939
 940   int mkfs(bufferlist& osdmapbl);
 941
 942   /**
 943    * check cluster_fsid file
 944    *
 945    * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code
 946    */
 947   int check_fsid();
 948
 949   /**
 950    * write cluster_fsid file
 951    *
 952    * @return 0 on success, or negative error code
 953    */
 954   int write_fsid();
 955   int write_fsid(MonitorDBStore::TransactionRef t);
 956
 957   void do_admin_command(std::string command, cmdmap_t& cmdmap,
 958                         std::string format, ostream& ss);
 959
 960 private:
 961   // don't allow copying
 962   Monitor(const Monitor& rhs);
 963   Monitor& operator=(const Monitor &rhs);
 964
 965 public:
 966   static void format_command_descriptions(const std::vector<MonCommand> &commands,
 967                                           Formatter *f,
 968                                           bufferlist *rdata,
 969                                           bool hide_mgr_flag=false);
 970   void get_locally_supported_monitor_commands(const MonCommand **cmds, int *count);
 971   /// the Monitor owns this pointer once you pass it in
 972   void set_leader_supported_commands(const MonCommand *cmds, int size);
 973   static bool is_keyring_required();
 974 };
 975
// Monitor "incompat" feature descriptors.  Each macro expands to a
// CompatSet::Feature built from a numeric id and a description string;
// the ids run consecutively from 1 to 8.
// NOTE(review): ids and descriptions are presumably persisted in the
// on-disk compat set, so existing entries should not be renumbered or
// reworded -- confirm before editing.
#define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature (1, "initial feature set (~v.18)")
#define CEPH_MON_FEATURE_INCOMPAT_GV CompatSet::Feature (2, "global version sequencing (v0.52)")
// "\?" below is the standard escape sequence for a literal '?'
#define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS CompatSet::Feature (3, "single paxos with k/v store (v0.\?)")
#define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES CompatSet::Feature(4, "support erasure code pools")
#define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC CompatSet::Feature(5, "new-style osdmap encoding")
#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code")
#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code")
#define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features")
 984 // make sure you add your feature to Monitor::get_supported_features
 985
 986
 987
 988 #endif