ceph/src/mon/Monitor.h

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14
  15 /*
  16  * This is the top level monitor. It runs on each machine in the Monitor
  17  * Cluster. The election of a leader for the paxos algorithm only happens
  18  * once per machine via the elector. There is a separate paxos instance (state)
  19  * kept for each of the system components: Object Store Device (OSD) Monitor,
  20  * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor.
  21  */
  22
  23 #ifndef CEPH_MONITOR_H
  24 #define CEPH_MONITOR_H
  25
  26 #include <errno.h>
  27 #include <cmath>
  28 #include <string>
  29
  30 #include "include/types.h"
  31 #include "include/health.h"
  32 #include "msg/Messenger.h"
  33
  34 #include "common/Timer.h"
  35
  36 #include "health_check.h"
  37 #include "MonMap.h"
  38 #include "Elector.h"
  39 #include "Paxos.h"
  40 #include "Session.h"
  41 #include "MonCommand.h"
  42
  43
  44 #include "common/config_obs.h"
  45 #include "common/LogClient.h"
  46 #include "auth/AuthClient.h"
  47 #include "auth/AuthServer.h"
  48 #include "auth/cephx/CephxKeyServer.h"
  49 #include "auth/AuthMethodList.h"
  50 #include "auth/KeyRing.h"
  51 #include "include/common_fwd.h"
  52 #include "messages/MMonCommand.h"
  53 #include "mon/MonitorDBStore.h"
  54 #include "mgr/MgrClient.h"
  55
  56 #include "mon/MonOpRequest.h"
  57 #include "common/WorkQueue.h"
  58
  59
  60 #define CEPH_MON_PROTOCOL     13 /* cluster internal */
  61
  62
   63 enum {
        // Identifiers in the l_cluster_* range.  l_cluster_first and
        // l_cluster_last are sentinels bracketing the range; the entries in
        // between take consecutive values starting at 555001.
        // NOTE(review): presumably these index the counters of
        // Monitor::cluster_logger -- confirm against the registration code.
   64   l_cluster_first = 555000,
   65   l_cluster_num_mon,
   66   l_cluster_num_mon_quorum,
   67   l_cluster_num_osd,
   68   l_cluster_num_osd_up,
   69   l_cluster_num_osd_in,
   70   l_cluster_osd_epoch,
   71   l_cluster_osd_bytes,
   72   l_cluster_osd_bytes_used,
   73   l_cluster_osd_bytes_avail,
   74   l_cluster_num_pool,
   75   l_cluster_num_pg,
   76   l_cluster_num_pg_active_clean,
   77   l_cluster_num_pg_active,
   78   l_cluster_num_pg_peering,
   79   l_cluster_num_object,
   80   l_cluster_num_object_degraded,
   81   l_cluster_num_object_misplaced,
   82   l_cluster_num_object_unfound,
   83   l_cluster_num_bytes,
   84   l_cluster_last,
   85 };
  86
   87 enum {
        // Identifiers in the l_mon_* range.  l_mon_first and l_mon_last are
        // sentinels bracketing the range; the entries in between take
        // consecutive values starting at 456001.
        // NOTE(review): presumably these index the counters of
        // Monitor::logger (sessions and elections) -- confirm.
   88   l_mon_first = 456000,
   89   l_mon_num_sessions,
   90   l_mon_session_add,
   91   l_mon_session_rm,
   92   l_mon_session_trim,
   93   l_mon_num_elections,
   94   l_mon_election_call,
   95   l_mon_election_win,
   96   l_mon_election_lose,
   97   l_mon_last,
   98 };
  99
 100 class QuorumService;
 101 class PaxosService;
 102
 103 class AdminSocketHook;
 104
 105 #define COMPAT_SET_LOC "feature_set"
 106
 107 class Monitor : public Dispatcher,
 108                 public AuthClient,
 109                 public AuthServer,
 110                 public md_config_obs_t {
 111 public:
 112   int orig_argc = 0;
 113   const char **orig_argv = nullptr;
 114
 115   // me
 116   string name;
 117   int rank;
 118   Messenger *messenger;
 119   ConnectionRef con_self;
 120   ceph::mutex lock = ceph::make_mutex("Monitor::lock");
 121   SafeTimer timer;
 122   Finisher finisher;
 123   ThreadPool cpu_tp;  ///< threadpool for CPU intensive work
 124
 125   ceph::mutex auth_lock = ceph::make_mutex("Monitor::auth_lock");
 126
 127   /// true if we have ever joined a quorum.  if false, we are either a
 128   /// new cluster, a newly joining monitor, or a just-upgraded
 129   /// monitor.
 130   bool has_ever_joined;
 131
 132   PerfCounters *logger, *cluster_logger;
 133   bool cluster_logger_registered;
 134
 135   void register_cluster_logger();
 136   void unregister_cluster_logger();
 137
 138   MonMap *monmap;
 139   uuid_d fingerprint;
 140
 141   set<entity_addrvec_t> extra_probe_peers;
 142
 143   LogClient log_client;
 144   LogChannelRef clog;
 145   LogChannelRef audit_clog;
 146   KeyRing keyring;
 147   KeyServer key_server;
 148
 149   AuthMethodList auth_cluster_required;
 150   AuthMethodList auth_service_required;
 151
 152   CompatSet features;
 153
 154   vector<MonCommand> leader_mon_commands; // quorum leader's commands
 155   vector<MonCommand> local_mon_commands;  // commands i support
 156   bufferlist local_mon_commands_bl;       // encoded version of above
 157
 158   vector<MonCommand> prenautilus_local_mon_commands;
 159   bufferlist prenautilus_local_mon_commands_bl;
 160
 161   Messenger *mgr_messenger;
 162   MgrClient mgr_client;
 163   uint64_t mgr_proxy_bytes = 0;  // in-flight proxied mgr command message bytes
 164   std::string gss_ktfile_client{};
 165
 166 private:
 167   void new_tick();
 168
 169   // -- local storage --
 170 public:
 171   MonitorDBStore *store;
 172   static const string MONITOR_NAME;
 173   static const string MONITOR_STORE_PREFIX;
 174
 175   // -- monitor state --
 176 private:
  177   enum {
            // Values stored in `state`.  Numbering starts at 1, so no
            // enumerator has the value 0.  get_state_name() maps these to
            // text and the is_*() predicates test for each one.
  178     STATE_INIT = 1,
  179     STATE_PROBING,
  180     STATE_SYNCHRONIZING,
  181     STATE_ELECTING,
  182     STATE_LEADER,
  183     STATE_PEON,
  184     STATE_SHUTDOWN
  185   };
 186   int state = STATE_INIT;
 187
 188 public:
 189   static const char *get_state_name(int s) {
 190     switch (s) {
 191     case STATE_PROBING: return "probing";
 192     case STATE_SYNCHRONIZING: return "synchronizing";
 193     case STATE_ELECTING: return "electing";
 194     case STATE_LEADER: return "leader";
 195     case STATE_PEON: return "peon";
 196     case STATE_SHUTDOWN: return "shutdown";
 197     default: return "???";
 198     }
 199   }
  200   const char *get_state_name() const {
            // Label for this monitor's current `state`; delegates to the
            // static overload taking an int.
  201     return get_state_name(state);
  202   }
 203
  204   bool is_init() const { return state == STATE_INIT; }  // true while `state` is STATE_INIT
  205   bool is_shutdown() const { return state == STATE_SHUTDOWN; }  // true once `state` is STATE_SHUTDOWN
  206   bool is_probing() const { return state == STATE_PROBING; }  // true while `state` is STATE_PROBING
  207   bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; }  // true while `state` is STATE_SYNCHRONIZING
  208   bool is_electing() const { return state == STATE_ELECTING; }  // true while `state` is STATE_ELECTING
  209   bool is_leader() const { return state == STATE_LEADER; }  // true while `state` is STATE_LEADER
  210   bool is_peon() const { return state == STATE_PEON; }  // true while `state` is STATE_PEON
 211
 212   const utime_t &get_leader_since() const;
 213
 214   void prepare_new_fingerprint(MonitorDBStore::TransactionRef t);
 215
 216   std::vector<DaemonHealthMetric> get_health_metrics();
 217
 218   // -- elector --
 219 private:
 220   Paxos *paxos;
 221   Elector elector;
 222   friend class Elector;
 223
 224   /// features we require of peers (based on on-disk compatset)
 225   uint64_t required_features;
 226
 227   int leader;            // current leader (to best of knowledge)
 228   set<int> quorum;       // current active set of monitors (if !starting)
 229   mono_clock::time_point quorum_since;  // when quorum formed
 230   utime_t leader_since;  // when this monitor became the leader, if it is the leader
 231   utime_t exited_quorum; // time detected as not in quorum; 0 if in
 232
 233   // map of counts of connected clients, by type and features, for
 234   // each quorum mon
 235   map<int,FeatureMap> quorum_feature_map;
 236
 237   /**
  238    * Intersection of quorum members' connection feature bits.
 239    */
 240   uint64_t quorum_con_features;
 241   /**
  242    * Intersection of quorum members' mon-specific feature bits
 243    */
 244   mon_feature_t quorum_mon_features;
 245
 246   ceph_release_t quorum_min_mon_release{ceph_release_t::unknown};
 247
 248   set<string> outside_quorum;
 249
 250   /**
 251    * @defgroup Monitor_h_scrub
 252    * @{
 253    */
 254   version_t scrub_version;            ///< paxos version we are scrubbing
 255   map<int,ScrubResult> scrub_result;  ///< results so far
 256
 257   /**
 258    * trigger a cross-mon scrub
 259    *
 260    * Verify all mons are storing identical content
 261    */
 262   int scrub_start();
 263   int scrub();
 264   void handle_scrub(MonOpRequestRef op);
 265   bool _scrub(ScrubResult *r,
 266               pair<string,string> *start,
 267               int *num_keys);
 268   void scrub_check_results();
 269   void scrub_timeout();
 270   void scrub_finish();
 271   void scrub_reset();
 272   void scrub_update_interval(int secs);
 273
 274   Context *scrub_event;       ///< periodic event to trigger scrub (leader)
 275   Context *scrub_timeout_event;  ///< scrub round timeout (leader)
 276   void scrub_event_start();
 277   void scrub_event_cancel();
 278   void scrub_reset_timeout();
 279   void scrub_cancel_timeout();
 280
 281   struct ScrubState {
 282     pair<string,string> last_key; ///< last scrubbed key
 283     bool finished;
 284
 285     ScrubState() : finished(false) { }
 286     virtual ~ScrubState() { }
 287   };
 288   std::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
 289
 290   /**
 291    * @defgroup Monitor_h_sync Synchronization
 292    * @{
 293    */
 294   /**
 295    * @} // provider state
 296    */
  297   struct SyncProvider {
            // State kept for one sync attempt that is being served; entries
            // live in Monitor::sync_providers, keyed by `cookie`.
  298     entity_addrvec_t addrs;
  299     uint64_t cookie;       ///< unique cookie for this sync attempt
  300     utime_t timeout;       ///< when we give up and expire this attempt
  301     version_t last_committed; ///< last paxos version on peer
  302     pair<string,string> last_key; ///< last key sent to (or on) peer
  303     bool full;             ///< full scan?
  304     MonitorDBStore::Synchronizer synchronizer;   ///< iterator
  305
            // Starts with cookie 0, last_committed 0 and full == false; the
            // remaining members are default-constructed.
  306     SyncProvider() : cookie(0), last_committed(0), full(false) {}
  307
            // Set `timeout` to the current clock reading plus `grace`.
            // `cct` is not referenced by this body.
            // NOTE(review): `grace` is added directly to a utime_t;
            // presumably it is in seconds -- confirm.
  308     void reset_timeout(CephContext *cct, int grace) {
  309       timeout = ceph_clock_now();
  310       timeout += grace;
  311     }
  312   };
 313
 314   map<uint64_t, SyncProvider> sync_providers;  ///< cookie -> SyncProvider for those syncing from us
 315   uint64_t sync_provider_count;   ///< counter for issued cookies to keep them unique
 316
 317   /**
 318    * @} // requester state
 319    */
 320   entity_addrvec_t sync_provider;  ///< who we are syncing from
 321   uint64_t sync_cookie;          ///< 0 if we are starting, non-zero otherwise
 322   bool sync_full;                ///< true if we are a full sync, false for recent catch-up
 323   version_t sync_start_version;  ///< last_committed at sync start
 324   Context *sync_timeout_event;   ///< timeout event
 325
 326   /**
 327    * floor for sync source
 328    *
 329    * When we sync we forget about our old last_committed value which
 330    * can be dangerous.  For example, if we have a cluster of:
 331    *
 332    *   mon.a: lc 100
 333    *   mon.b: lc 80
 334    *   mon.c: lc 100 (us)
 335    *
 336    * If something forces us to sync (say, corruption, or manual
 337    * intervention, or bug), we forget last_committed, and might abort.
 338    * If mon.a happens to be down when we come back, we will see:
 339    *
 340    *   mon.b: lc 80
 341    *   mon.c: lc 0 (us)
 342    *
 343    * and sync from mon.b, at which point a+b will both have lc 80 and
 344    * come online with a majority holding out of date commits.
 345    *
 346    * Avoid this by preserving our old last_committed value prior to
 347    * sync and never going backwards.
 348    */
 349   version_t sync_last_committed_floor;
 350
 351   /**
 352    * Obtain the synchronization target prefixes in set form.
 353    *
 354    * We consider a target prefix all those that are relevant when
 355    * synchronizing two stores. That is, all those that hold paxos service's
 356    * versions, as well as paxos versions, or any control keys such as the
 357    * first or last committed version.
 358    *
 359    * Given the current design, this function should return the name of all and
 360    * any available paxos service, plus the paxos name.
 361    *
 362    * @returns a set of strings referring to the prefixes being synchronized
 363    */
 364   set<string> get_sync_targets_names();
 365
 366   /**
 367    * Reset the monitor's sync-related data structures for syncing *from* a peer
 368    */
 369   void sync_reset_requester();
 370
 371   /**
 372    * Reset sync state related to allowing others to sync from us
 373    */
 374   void sync_reset_provider();
 375
 376   /**
  377    * Called when a sync attempt times out (requester-side)
 378    */
 379   void sync_timeout();
 380
 381   /**
 382    * Get the latest monmap for backup purposes during sync
 383    */
 384   void sync_obtain_latest_monmap(bufferlist &bl);
 385
 386   /**
 387    * Start sync process
 388    *
 389    * Start pulling committed state from another monitor.
 390    *
  391    * @param addrs where to pull committed state from
 392    * @param full whether to do a full sync or just catch up on recent paxos
 393    */
 394   void sync_start(entity_addrvec_t &addrs, bool full);
 395
 396 public:
 397   /**
 398    * force a sync on next mon restart
 399    */
 400   void sync_force(Formatter *f);
 401
 402 private:
 403   /**
 404    * store critical state for safekeeping during sync
 405    *
 406    * We store a few things on the side that we don't want to get clobbered by sync.  This
 407    * includes the latest monmap and a lower bound on last_committed.
 408    */
 409   void sync_stash_critical_state(MonitorDBStore::TransactionRef tx);
 410
 411   /**
 412    * reset the sync timeout
 413    *
 414    * This is used on the client to restart if things aren't progressing
 415    */
 416   void sync_reset_timeout();
 417
 418   /**
 419    * trim stale sync provider state
 420    *
 421    * If someone is syncing from us and hasn't talked to us recently, expire their state.
 422    */
 423   void sync_trim_providers();
 424
 425   /**
 426    * Complete a sync
 427    *
 428    * Finish up a sync after we've gotten all of the chunks.
 429    *
 430    * @param last_committed final last_committed value from provider
 431    */
 432   void sync_finish(version_t last_committed);
 433
 434   /**
 435    * request the next chunk from the provider
 436    */
 437   void sync_get_next_chunk();
 438
 439   /**
 440    * handle sync message
 441    *
  442    * @param op Sync message with operation type MMonSync::OP_START_CHUNKS
 443    */
 444   void handle_sync(MonOpRequestRef op);
 445
 446   void _sync_reply_no_cookie(MonOpRequestRef op);
 447
 448   void handle_sync_get_cookie(MonOpRequestRef op);
 449   void handle_sync_get_chunk(MonOpRequestRef op);
 450   void handle_sync_finish(MonOpRequestRef op);
 451
 452   void handle_sync_cookie(MonOpRequestRef op);
 453   void handle_sync_forward(MonOpRequestRef op);
 454   void handle_sync_chunk(MonOpRequestRef op);
 455   void handle_sync_no_cookie(MonOpRequestRef op);
 456
 457   /**
 458    * @} // Synchronization
 459    */
 460
 461   list<Context*> waitfor_quorum;
 462   list<Context*> maybe_wait_for_quorum;
 463
 464   /**
 465    * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
 466    * @{
 467    *
 468    * We use time checks to keep track of any clock drifting going on in the
  469    * cluster. This is accomplished by periodically pinging each monitor in the
  470    * quorum and registering its response time on a map, assessing how much its
 471    * clock has drifted. We also take this opportunity to assess the latency
 472    * on response.
 473    *
 474    * This mechanism works as follows:
 475    *
 476    *  - Leader sends out a 'PING' message to each other monitor in the quorum.
 477    *    The message is timestamped with the leader's current time. The leader's
 478    *    current time is recorded in a map, associated with each peon's
 479    *    instance.
 480    *  - The peon replies to the leader with a timestamped 'PONG' message.
 481    *  - The leader calculates a delta between the peon's timestamp and its
 482    *    current time and stashes it.
 483    *  - The leader also calculates the time it took to receive the 'PONG'
 484    *    since the 'PING' was sent, and stashes an approximate latency estimate.
 485    *  - Once all the quorum members have pong'ed, the leader will share the
 486    *    clock skew and latency maps with all the monitors in the quorum.
 487    */
 488   map<int, utime_t> timecheck_waiting;
 489   map<int, double> timecheck_skews;
 490   map<int, double> timecheck_latencies;
 491   // odd value means we are mid-round; even value means the round has
 492   // finished.
 493   version_t timecheck_round;
 494   unsigned int timecheck_acks;
 495   utime_t timecheck_round_start;
 496   friend class HealthMonitor;
 497   /* When we hit a skew we will start a new round based off of
 498    * 'mon_timecheck_skew_interval'. Each new round will be backed off
 499    * until we hit 'mon_timecheck_interval' -- which is the typical
 500    * interval when not in the presence of a skew.
 501    *
 502    * This variable tracks the number of rounds with skews since last clean
 503    * so that we can report to the user and properly adjust the backoff.
 504    */
 505   uint64_t timecheck_rounds_since_clean;
 506   /**
 507    * Time Check event.
 508    */
 509   Context *timecheck_event;
 510
 511   void timecheck_start();
 512   void timecheck_finish();
 513   void timecheck_start_round();
 514   void timecheck_finish_round(bool success = true);
 515   void timecheck_cancel_round();
 516   void timecheck_cleanup();
 517   void timecheck_reset_event();
 518   void timecheck_check_skews();
 519   void timecheck_report();
 520   void timecheck();
 521   health_status_t timecheck_status(ostringstream &ss,
 522                                    const double skew_bound,
 523                                    const double latency);
 524   void handle_timecheck_leader(MonOpRequestRef op);
 525   void handle_timecheck_peon(MonOpRequestRef op);
 526   void handle_timecheck(MonOpRequestRef op);
 527
 528   /**
 529    * Returns 'true' if this is considered to be a skew; 'false' otherwise.
 530    */
 531   bool timecheck_has_skew(const double skew_bound, double *abs) const {
 532     double abs_skew = std::fabs(skew_bound);
 533     if (abs)
 534       *abs = abs_skew;
 535     return (abs_skew > g_conf()->mon_clock_drift_allowed);
 536   }
 537
 538   /**
 539    * @}
 540    */
 541   /**
 542    * Handle ping messages from others.
 543    */
 544   void handle_ping(MonOpRequestRef op);
 545
 546   Context *probe_timeout_event = nullptr;  // for probing
 547
 548   void reset_probe_timeout();
 549   void cancel_probe_timeout();
 550   void probe_timeout(int r);
 551
 552   void _apply_compatset_features(CompatSet &new_features);
 553
 554 public:
 555   epoch_t get_epoch();
  556   int get_leader() const { return leader; }  // value of `leader` (current leader, to the best of our knowledge)
  557   string get_leader_name() {
            // Empty string when the quorum is empty; otherwise the monmap
            // name of the first (smallest) element of `quorum`.
            // NOTE(review): this reads *quorum.begin() rather than `leader`;
            // presumably the two always coincide -- confirm.
  558     return quorum.empty() ? string() : monmap->get_name(*quorum.begin());
  559   }
  560   const set<int>& get_quorum() const { return quorum; }  // read-only view of the current quorum set
 561   list<string> get_quorum_names() {
 562     list<string> q;
 563     for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p)
 564       q.push_back(monmap->get_name(*p));
 565     return q;
 566   }
  567   uint64_t get_quorum_con_features() const {  // stored quorum_con_features value
  568     return quorum_con_features;
  569   }
  570   mon_feature_t get_quorum_mon_features() const {  // stored quorum_mon_features value
  571     return quorum_mon_features;
  572   }
  573   uint64_t get_required_features() const {  // stored required_features value
  574     return required_features;
  575   }
  576   mon_feature_t get_required_mon_features() const {  // asks the monmap rather than a local member
  577     return monmap->get_required_features();
  578   }
 579   void apply_quorum_to_compatset_features();
 580   void apply_monmap_to_compatset_features();
 581   void calc_quorum_requirements();
 582
 583   void get_combined_feature_map(FeatureMap *fm);
 584
 585 private:
 586   void _reset();   ///< called from bootstrap, start_, or join_election
 587   void wait_for_paxos_write();
 588   void _finish_svc_election(); ///< called by {win,lose}_election
 589   void respawn();
 590 public:
 591   void bootstrap();
 592   void join_election();
 593   void start_election();
 594   void win_standalone_election();
 595   // end election (called by Elector)
 596   void win_election(epoch_t epoch, const set<int>& q,
 597                     uint64_t features,
 598                     const mon_feature_t& mon_features,
 599                     ceph_release_t min_mon_release,
 600                     const map<int,Metadata>& metadata);
 601   void lose_election(epoch_t epoch, set<int>& q, int l,
 602                      uint64_t features,
 603                      const mon_feature_t& mon_features,
 604                      ceph_release_t min_mon_release);
 605   // end election (called by Elector)
 606   void finish_election();
 607
 608   void update_logger();
 609
 610   /**
 611    * Vector holding the Services serviced by this Monitor.
 612    */
 613   vector<std::unique_ptr<PaxosService>> paxos_service;
 614
  615   class MDSMonitor *mdsmon() {
            // Each accessor in this group returns the raw pointer held in
            // one paxos_service slot (selected by a PAXOS_* index), cast to
            // the concrete monitor type named in the return type.  The
            // C-style cast performs no runtime check, so correctness relies
            // on each slot holding the matching service type.
            // NOTE(review): presumably the slots are populated in PAXOS_*
            // order elsewhere -- confirm against the constructor.
  616     return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP].get();
  617   }
  618
  619   class MonmapMonitor *monmon() {
  620     return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP].get();
  621   }
  622
  623   class OSDMonitor *osdmon() {
  624     return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP].get();
  625   }
  626
  627   class AuthMonitor *authmon() {
  628     return (class AuthMonitor *)paxos_service[PAXOS_AUTH].get();
  629   }
  630
  631   class LogMonitor *logmon() {
  632     return (class LogMonitor*) paxos_service[PAXOS_LOG].get();
  633   }
  634
  635   class MgrMonitor *mgrmon() {
  636     return (class MgrMonitor*) paxos_service[PAXOS_MGR].get();
  637   }
  638
  639   class MgrStatMonitor *mgrstatmon() {
  640     return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT].get();
  641   }
  642
  643   class HealthMonitor *healthmon() {
  644     return (class HealthMonitor*) paxos_service[PAXOS_HEALTH].get();
  645   }
  646
  647   class ConfigMonitor *configmon() {
  648     return (class ConfigMonitor*) paxos_service[PAXOS_CONFIG].get();
  649   }
 650
 651   friend class Paxos;
 652   friend class OSDMonitor;
 653   friend class MDSMonitor;
 654   friend class MonmapMonitor;
 655   friend class LogMonitor;
 656   friend class ConfigKeyService;
 657
 658   QuorumService *config_key_service;
 659
 660   // -- sessions --
 661   MonSessionMap session_map;
 662   ceph::mutex session_map_lock = ceph::make_mutex("Monitor::session_map_lock");
 663   AdminSocketHook *admin_hook;
 664
 665   template<typename Func, typename...Args>
 666   void with_session_map(Func&& func) {
 667     std::lock_guard l(session_map_lock);
 668     std::forward<Func>(func)(session_map);
 669   }
 670   void send_latest_monmap(Connection *con);
 671
 672   // messages
 673   void handle_get_version(MonOpRequestRef op);
 674   void handle_subscribe(MonOpRequestRef op);
 675   void handle_mon_get_map(MonOpRequestRef op);
 676
 677   static void _generate_command_map(cmdmap_t& cmdmap,
 678                                     map<string,string> &param_str_map);
 679   static const MonCommand *_get_moncommand(
 680     const string &cmd_prefix,
 681     const vector<MonCommand>& cmds);
 682   bool _allowed_command(MonSession *s, const string& module,
 683                         const string& prefix,
 684                         const cmdmap_t& cmdmap,
 685                         const map<string,string>& param_str_map,
 686                         const MonCommand *this_cmd);
 687   void get_mon_status(Formatter *f);
 688   void _quorum_status(Formatter *f, ostream& ss);
 689   bool _add_bootstrap_peer_hint(std::string_view cmd, const cmdmap_t& cmdmap,
 690                                 std::ostream& ss);
 691   void handle_tell_command(MonOpRequestRef op);
 692   void handle_command(MonOpRequestRef op);
 693   void handle_route(MonOpRequestRef op);
 694
 695   void handle_mon_metadata(MonOpRequestRef op);
 696   int get_mon_metadata(int mon, Formatter *f, ostream& err);
 697   int print_nodes(Formatter *f, ostream& err);
 698
 699   // Accumulate metadata across calls to update_mon_metadata
 700   map<int, Metadata> mon_metadata;
 701   map<int, Metadata> pending_metadata;
 702
 703   /**
  704    * Cached overall health status and its summary string.
 705    */
  706   struct health_cache_t {
  707     health_status_t overall;  // has no initializer here and is left untouched by reset()
  708     string summary;           // emptied by reset()
  709
  710     void reset() {
  711       // health_status_t doesn't really have a NONE value and we're not
  712       // okay with setting something else (say, HEALTH_ERR).  so just
  713       // leave it be.
  714       summary.clear();
  715     }
  716   } health_status_cache;
 717
 718   Context *health_tick_event = nullptr;
 719   Context *health_interval_event = nullptr;
 720
 721   void health_tick_start();
 722   void health_tick_stop();
 723   ceph::real_clock::time_point health_interval_calc_next_update();
 724   void health_interval_start();
 725   void health_interval_stop();
 726   void health_events_cleanup();
 727
 728   void health_to_clog_update_conf(const std::set<std::string> &changed);
 729
 730   void do_health_to_clog_interval();
 731   void do_health_to_clog(bool force = false);
 732
 733   void log_health(
 734     const health_check_map_t& updated,
 735     const health_check_map_t& previous,
 736     MonitorDBStore::TransactionRef t);
 737
 738 protected:
 739
  740   class HealthCheckLogStatus {
            // Value type of the health_check_log_times map: a severity, a
            // message string and a timestamp, all supplied at construction.
  741     public:
  742     health_status_t severity;
  743     std::string last_message;
  744     utime_t updated_at = 0;  // always overwritten by the constructor below
  745     HealthCheckLogStatus(health_status_t severity_,
  746                          const std::string &last_message_,
  747                          utime_t updated_at_)
  748       : severity(severity_),
  749         last_message(last_message_),
  750         updated_at(updated_at_)
  751     {}
  752   };
 753   std::map<std::string, HealthCheckLogStatus> health_check_log_times;
 754
 755 public:
 756
 757   void get_cluster_status(stringstream &ss, Formatter *f);
 758
 759   void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version);
 760   void reply_command(MonOpRequestRef op, int rc, const string &rs, bufferlist& rdata, version_t version);
 761
 762   void reply_tell_command(MonOpRequestRef op, int rc, const string &rs);
 763
 764
 765
 766   void handle_probe(MonOpRequestRef op);
 767   /**
 768    * Handle a Probe Operation, replying with our name, quorum and known versions.
 769    *
 770    * We use the MMonProbe message class for anything and everything related with
 771    * Monitor probing. One of the operations relates directly with the probing
 772    * itself, in which we receive a probe request and to which we reply with
 773    * our name, our quorum and the known versions for each Paxos service. Thus the
 774    * redundant function name. This reply will obviously be sent to the one
 775    * probing/requesting these infos.
 776    *
 777    * @todo Add @pre and @post
 778    *
  779    * @param op A Probe message, with an operation of type Probe.
 780    */
 781   void handle_probe_probe(MonOpRequestRef op);
 782   void handle_probe_reply(MonOpRequestRef op);
 783
 784   // request routing
 785   struct RoutedRequest {
 786     uint64_t tid;
 787     bufferlist request_bl;
 788     MonSession *session;
 789     ConnectionRef con;
 790     uint64_t con_features;
 791     MonOpRequestRef op;
 792
 793     RoutedRequest() : tid(0), session(NULL), con_features(0) {}
 794     ~RoutedRequest() {
 795       if (session)
 796         session->put();
 797     }
 798   };
 799   uint64_t routed_request_tid;
 800   map<uint64_t, RoutedRequest*> routed_requests;
 801
 802   void forward_request_leader(MonOpRequestRef op);
 803   void handle_forward(MonOpRequestRef op);
 804   void send_reply(MonOpRequestRef op, Message *reply);
 805   void no_reply(MonOpRequestRef op);
 806   void resend_routed_requests();
 807   void remove_session(MonSession *s);
 808   void remove_all_sessions();
 809   void waitlist_or_zap_client(MonOpRequestRef op);
 810
 811   void send_mon_message(Message *m, int rank);
 812
 813 public:
  814   struct C_Command : public C_MonOp {
            // Completion context for a monitor command.  It stores the reply
            // (rc, rs, rdata, version) at construction time and delivers it
            // from _finish() once the context completes.
  815     Monitor *mon;
  816     int rc;
  817     string rs;
  818     bufferlist rdata;
  819     version_t version;
            // Reply without a data payload; rdata stays default-constructed.
  820     C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, version_t v) :
  821       C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v){}
            // Reply carrying the data payload `rd`.
  822     C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, bufferlist rd, version_t v) :
  823       C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v){}
  824
            // Dispatch on the completion code `r`:
            //   r >= 0       write an audit log entry, then send the stored reply
            //   -ECANCELED   do nothing
            //   -EAGAIN      hand the op back to Monitor::dispatch_op()
            //   other        abort
  825     void _finish(int r) override {
  826       auto m = op->get_req<MMonCommand>();
  827       if (r >= 0) {
  828         ostringstream ss;
                // The audit line starts with who issued the command, or a
                // note that the connection or session is gone.
  829         if (!op->get_req()->get_connection()) {
  830           ss << "connection dropped for command ";
  831         } else {
  832           MonSession *s = op->get_session();
  833
  834           // if client drops we may not have a session to draw information from.
  835           if (s) {
  836             ss << "from='" << s->name << " " << s->addrs << "' "
  837               << "entity='" << s->entity_name << "' ";
  838           } else {
  839             ss << "session dropped for command ";
  840           }
  841         }
  842         ss << "cmd='" << m->cmd << "': finished";
  843
  844         mon->audit_clog->info() << ss.str();
  845         mon->reply_command(op, rc, rs, rdata, version);
  846       }
  847       else if (r == -ECANCELED)
  848         return;
  849       else if (r == -EAGAIN)
  850         mon->dispatch_op(op);
  851       else
  852         ceph_abort_msg("bad C_Command return value");
  853     }
  854   };
 855
 856  private:
 857   class C_RetryMessage : public C_MonOp {
 858     Monitor *mon;
 859   public:
 860     C_RetryMessage(Monitor *m, MonOpRequestRef op) :
 861       C_MonOp(op), mon(m) { }
 862
 863     void _finish(int r) override {
 864       if (r == -EAGAIN || r >= 0)
 865         mon->dispatch_op(op);
 866       else if (r == -ECANCELED)
 867         return;
 868       else
 869         ceph_abort_msg("bad C_RetryMessage return value");
 870     }
 871   };
 872
 873   //ms_dispatch handles a lot of logic and we want to reuse it
 874   //on forwarded messages, so we create a non-locking version for this class
 875   void _ms_dispatch(Message *m);
  876   bool ms_dispatch(Message *m) override {
            // Locking entry point: holds `lock` for the duration of
            // _ms_dispatch(m) and always returns true.
  877     std::lock_guard l{lock};
  878     _ms_dispatch(m);
  879     return true;
  880   }
 881   void dispatch_op(MonOpRequestRef op);
 882   //mon_caps is used for un-connected messages from monitors
 883   MonCap mon_caps;
 884   bool get_authorizer(int dest_type, AuthAuthorizer **authorizer);
 885 public: // for AuthMonitor msgr1:
 886   int ms_handle_authentication(Connection *con) override;
 887 private:
 888   void ms_handle_accept(Connection *con) override;
 889   bool ms_handle_reset(Connection *con) override;
 890   void ms_handle_remote_reset(Connection *con) override {}
 891   bool ms_handle_refused(Connection *con) override;
 892
 893   // AuthClient
       // Overrides grouped under the AuthClient label (presumably the interface
       // from auth/AuthClient.h, which this header includes).  All four are
       // declared here only and return int.
       // NOTE(review): the return-code convention is not visible in this header
       // -- confirm against the interface definition.
 894   int get_auth_request(
 895     Connection *con,
 896     AuthConnectionMeta *auth_meta,
 897     uint32_t *method,
 898     vector<uint32_t> *preferred_modes,
 899     bufferlist *out) override;
       // `bl` is read-only input (const reference); `reply` is a pointer.
       // NOTE(review): presumably the response payload is written to *reply.
 900   int handle_auth_reply_more(
 901     Connection *con,
 902     AuthConnectionMeta *auth_meta,
 903    const bufferlist& bl,
 904     bufferlist *reply) override;
       // NOTE(review): `session_key` and `connection_secret` are non-const
       // pointers, presumably outputs filled in on success -- confirm.
 905   int handle_auth_done(
 906     Connection *con,
 907     AuthConnectionMeta *auth_meta,
 908     uint64_t global_id,
 909     uint32_t con_mode,
 910     const bufferlist& bl,
 911     CryptoKey *session_key,
 912     std::string *connection_secret) override;
       // Receives the previously used method, a result code and the lists of
       // allowed methods and modes (both passed by const reference).
 913   int handle_auth_bad_method(
 914     Connection *con,
 915     AuthConnectionMeta *auth_meta,
 916     uint32_t old_auth_method,
 917     int result,
 918     const std::vector<uint32_t>& allowed_methods,
 919     const std::vector<uint32_t>& allowed_modes) override;
 920   // /AuthClient
 921   // AuthServer
       // Single override grouped under the AuthServer label (presumably the
       // interface from auth/AuthServer.h, which this header includes).
       // `bl` is read-only input; `reply` is a non-const pointer.
       // NOTE(review): the meaning of `more` and of the int return value is not
       // visible here -- confirm against the interface definition.
 922   int handle_auth_request(
 923     Connection *con,
 924     AuthConnectionMeta *auth_meta,
 925     bool more,
 926     uint32_t auth_method,
 927     const bufferlist& bl,
 928     bufferlist *reply) override;
 929   // /AuthServer
 930
 931   int write_default_keyring(bufferlist& bl);  // bl is taken by non-const reference
 932   void extract_save_mon_key(KeyRing& keyring);
 933
       // Monitor metadata helpers (declarations only).
 934   void collect_metadata(Metadata *m);
       // `m` is an rvalue reference: callers pass a temporary or std::move().
 935   void update_mon_metadata(int from, Metadata&& m);
 936   int load_metadata();
       // Two overloads for the same metadata `field`: one takes a Formatter, the
       // other a string->int map.
       // NOTE(review): presumably both tally entries per value of `field` --
       // confirm at the definitions.
 937   void count_metadata(const string& field, Formatter *f);
 938   void count_metadata(const string& field, map<string,int> *out);
 939
 940   // features
       // CompatSet-based feature handling.  The three getters and
       // read_features_off_disk() are static, so they need no Monitor instance.
       // New CEPH_MON_FEATURE_INCOMPAT_* entries must also be added to
       // get_supported_features() (see the note below the feature macros).
 941   static CompatSet get_initial_supported_features();
 942   static CompatSet get_supported_features();
 943   static CompatSet get_legacy_features();
 944   /// read the ondisk features into the CompatSet pointed to by read_features
 945   static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features);
       // Instance-level counterparts; write_features() takes a store transaction.
       // NOTE(review): presumably the write is staged in `t` rather than
       // committed immediately -- confirm at the definition.
 946   void read_features();
 947   void write_features(MonitorDBStore::TransactionRef t);
 948
 949   OpTracker op_tracker;  // held by value as a Monitor member
 950
 951  public:
       // Constructs a monitor from its context, name, backing store, two
       // messengers and the monitor map.
       // NOTE(review): every pointer argument is raw; who owns them is not
       // visible from this header -- confirm at the definition.
 952   Monitor(CephContext *cct_, string nm, MonitorDBStore *s,
 953           Messenger *m, Messenger *mgr_m, MonMap *map);
 954   ~Monitor() override;
 955
       // Static: works on a store without needing a Monitor instance.
 956   static int check_features(MonitorDBStore *store);
 957
 958   // config observer
       // Overrides for the config-observer interface (presumably the one from
       // common/config_obs.h, which this header includes).
       // handle_conf_change() receives the config proxy and the `changed`
       // string set.
 959   const char** get_tracked_conf_keys() const override;
 960   void handle_conf_change(const ConfigProxy& conf,
 961                           const std::set<std::string> &changed) override;
 962
 963   void update_log_clients();
 964   int sanitize_options();
       // Start-up / shutdown entry points (declarations only).
       // NOTE(review): the required call order (preinit, init, ..., shutdown) is
       // not visible from this header -- confirm in the callers.
 965   int preinit();
 966   int init();
 967   void init_paxos();
       // `need_bootstrap` is a non-const bool pointer, presumably an output flag.
 968   void refresh_from_paxos(bool *need_bootstrap);
 969   void shutdown();
 970   void tick();
 971
 972   void handle_signal(int sig);
 973
       // NOTE(review): `osdmapbl` is taken by non-const reference; whether it is
       // read, written or both is not visible here.
 974   int mkfs(bufferlist& osdmapbl);
 975
 976   /**
 977    * Check the cluster_fsid file.
 978    *
 979    * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code
 980    */
 981   int check_fsid();
 982
 983   /**
 984    * Write the cluster_fsid file.
 985    *
 986    * @return 0 on success, or negative error code
 987    */
 988   int write_fsid();
       // Overload taking a store transaction.
       // NOTE(review): presumably stages the write in `t` instead of applying
       // it directly -- confirm at the definition.
 989   int write_fsid(MonitorDBStore::TransactionRef t);
 990
 991   int do_admin_command(std::string_view command, const cmdmap_t& cmdmap,
 992                        Formatter *f,        // formatter supplied by the caller
 993                        std::ostream& err,   // NOTE(review): presumably receives error text
 994                        std::ostream& out);  // NOTE(review): presumably receives plain output
 995
 996 private:
 997   // Monitor is non-copyable.  The copy operations are explicitly deleted so
       // that any attempted copy (including from members or friends) is rejected
       // at compile time instead of surfacing as an unresolved symbol at link time.
 998   Monitor(const Monitor& rhs) = delete;
 999   Monitor& operator=(const Monitor &rhs) = delete;
1000
1001 public:
       // Static helper (declaration only).
       // NOTE(review): from the signature it presumably renders `commands`
       // through `f` and stores the result in `*rdata`, with `features`
       // influencing the output -- confirm at the definition.
1002   static void format_command_descriptions(const std::vector<MonCommand> &commands,
1003                                           Formatter *f,
1004                                           uint64_t features,
1005                                           bufferlist *rdata);
1006
1007   const std::vector<MonCommand> &get_local_commands(mon_feature_t f) {
1008     // A feature set lacking FEATURE_NAUTILUS gets the pre-nautilus list.
1009     if (!f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) {
1010       return prenautilus_local_mon_commands;
1011     }
1012     return local_mon_commands;
1013   }
1014   const bufferlist& get_local_commands_bl(mon_feature_t f) {
1015     // A feature set lacking FEATURE_NAUTILUS gets the pre-nautilus buffer.
1016     if (!f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) {
1017       return prenautilus_local_mon_commands_bl;
1018     }
1019     return local_mon_commands_bl;
1020   }
1021   void set_leader_commands(const std::vector<MonCommand>& cmds) {
         // Overwrites leader_mon_commands by assigning `cmds` to it.
1022     leader_mon_commands = cmds;
1023   }
1024
       // Declaration only; the body is outside the class definition.
1025   bool is_keyring_required();
1026 };
1027
1028 #define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature (1, "initial feature set (~v.18)")
1029 #define CEPH_MON_FEATURE_INCOMPAT_GV CompatSet::Feature (2, "global version sequencing (v0.52)")
1030 #define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS CompatSet::Feature (3, "single paxos with k/v store (v0.\?)")
1031 #define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES CompatSet::Feature(4, "support erasure code pools")
1032 #define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC CompatSet::Feature(5, "new-style osdmap encoding")
1033 #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code")
1034 #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code")
1035 #define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features")
1036 #define CEPH_MON_FEATURE_INCOMPAT_LUMINOUS CompatSet::Feature(9, "luminous ondisk layout")
1037 #define CEPH_MON_FEATURE_INCOMPAT_MIMIC CompatSet::Feature(10, "mimic ondisk layout")
1038 #define CEPH_MON_FEATURE_INCOMPAT_NAUTILUS CompatSet::Feature(11, "nautilus ondisk layout")
1039 #define CEPH_MON_FEATURE_INCOMPAT_OCTOPUS CompatSet::Feature(12, "octopus ondisk layout")
1040 // make sure you add your feature to Monitor::get_supported_features
     //
     // Each macro above expands to a CompatSet::Feature built from a numeric id
     // and a human-readable description; the ids run consecutively from 1 to 12.
     // The "\?" in the SINGLE_PAXOS description is the escape sequence for a
     // literal '?' (NOTE(review): presumably written that way to keep the text
     // clear of trigraph parsing -- confirm before simplifying).
1041
1042
1043 /*
1044  * A LambdaContext bound to a Monitor: finish() forwards to
1045  * LambdaContext<T>::finish() only while mon->is_shutdown() is false, and
1046  * does nothing otherwise.
1047  *
1048  * Instantiate with braces,
1049  *
1050  *      new C_MonContext{...}
1051  *
1052  * not with parentheses,
1053  *
1054  *      new C_MonContext(...)
1055  *
1056  * because of gcc bug [1].
1057  *
1058  * [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85883
1059  */
1060 template<typename T>
1061 class C_MonContext : public LambdaContext<T> {
1062   const Monitor* mon;  // only queried for its shutdown state; never deleted here
1063 public:
1064   C_MonContext(const Monitor* m, T&& f)
1065     : LambdaContext<T>(std::forward<T>(f)), mon{m} {}
1066   void finish(int r) override {
1067     if (!mon->is_shutdown())
1068       LambdaContext<T>::finish(r);
1069   }
     };
1070
1071 #endif