]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | /* | |
16 | * This is the top level monitor. It runs on each machine in the Monitor | |
17 | * Cluster. The election of a leader for the paxos algorithm only happens | |
18 | * once per machine via the elector. There is a separate paxos instance (state) | |
19 | * kept for each of the system components: Object Store Device (OSD) Monitor, | |
20 | * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor. | |
21 | */ | |
22 | ||
23 | #ifndef CEPH_MONITOR_H | |
24 | #define CEPH_MONITOR_H | |
25 | ||
26 | #include <errno.h> | |
27 | #include <cmath> | |
28 | ||
29 | #include "include/types.h" | |
224ce89b | 30 | #include "include/health.h" |
7c673cae FG |
31 | #include "msg/Messenger.h" |
32 | ||
33 | #include "common/Timer.h" | |
34 | ||
224ce89b | 35 | #include "health_check.h" |
7c673cae FG |
36 | #include "MonMap.h" |
37 | #include "Elector.h" | |
38 | #include "Paxos.h" | |
39 | #include "Session.h" | |
31f18b77 | 40 | #include "PGStatService.h" |
c07f9fc5 | 41 | #include "MonCommand.h" |
7c673cae FG |
42 | |
43 | #include "common/LogClient.h" | |
44 | #include "auth/cephx/CephxKeyServer.h" | |
45 | #include "auth/AuthMethodList.h" | |
46 | #include "auth/KeyRing.h" | |
47 | #include "messages/MMonCommand.h" | |
48 | #include "mon/MonitorDBStore.h" | |
49 | #include "include/memory.h" | |
50 | #include "mgr/MgrClient.h" | |
51 | ||
52 | #include "mon/MonOpRequest.h" | |
53 | #include "common/WorkQueue.h" | |
54 | ||
55 | ||
56 | #define CEPH_MON_PROTOCOL 13 /* cluster internal */ | |
57 | ||
58 | ||
59 | enum { /* Cluster-wide statistic identifiers, numbered consecutively after l_cluster_first; l_cluster_last is the end sentinel. NOTE(review): presumably PerfCounters indices used with cluster_logger -- confirm. */ | |
60 | l_cluster_first = 555000, | |
61 | l_cluster_num_mon, | |
62 | l_cluster_num_mon_quorum, | |
63 | l_cluster_num_osd, | |
64 | l_cluster_num_osd_up, | |
65 | l_cluster_num_osd_in, | |
66 | l_cluster_osd_epoch, | |
67 | l_cluster_osd_bytes, | |
68 | l_cluster_osd_bytes_used, | |
69 | l_cluster_osd_bytes_avail, | |
70 | l_cluster_num_pool, | |
71 | l_cluster_num_pg, | |
72 | l_cluster_num_pg_active_clean, | |
73 | l_cluster_num_pg_active, | |
74 | l_cluster_num_pg_peering, | |
75 | l_cluster_num_object, | |
76 | l_cluster_num_object_degraded, | |
77 | l_cluster_num_object_misplaced, | |
78 | l_cluster_num_object_unfound, | |
79 | l_cluster_num_bytes, | |
80 | l_cluster_num_mds_up, | |
81 | l_cluster_num_mds_in, | |
82 | l_cluster_num_mds_failed, | |
83 | l_cluster_mds_epoch, | |
84 | l_cluster_last, | |
85 | }; | |
86 | ||
87 | enum { /* Per-monitor statistic identifiers (sessions and elections), numbered consecutively after l_mon_first; l_mon_last is the end sentinel. NOTE(review): presumably PerfCounters indices used with logger -- confirm. */ | |
88 | l_mon_first = 456000, | |
89 | l_mon_num_sessions, | |
90 | l_mon_session_add, | |
91 | l_mon_session_rm, | |
92 | l_mon_session_trim, | |
93 | l_mon_num_elections, | |
94 | l_mon_election_call, | |
95 | l_mon_election_win, | |
96 | l_mon_election_lose, | |
97 | l_mon_last, | |
98 | }; | |
99 | ||
100 | class QuorumService; | |
101 | class PaxosService; | |
102 | ||
103 | class PerfCounters; | |
104 | class AdminSocketHook; | |
105 | ||
106 | class MMonGetMap; | |
107 | class MMonGetVersion; | |
108 | class MMonMetadata; | |
109 | class MMonSync; | |
110 | class MMonScrub; | |
111 | class MMonProbe; | |
112 | struct MMonSubscribe; | |
113 | struct MRoute; | |
114 | struct MForward; | |
115 | struct MTimeCheck; | |
116 | struct MMonHealth; | |
7c673cae FG |
117 | |
118 | #define COMPAT_SET_LOC "feature_set" | |
119 | ||
120 | class C_MonContext final : public FunctionContext { | |
121 | const Monitor *mon; | |
122 | public: | |
123 | explicit C_MonContext(Monitor *m, boost::function<void(int)>&& callback) | |
124 | : FunctionContext(std::move(callback)), mon(m) {} | |
125 | void finish(int r) override; | |
126 | }; | |
127 | ||
128 | class Monitor : public Dispatcher, | |
129 | public md_config_obs_t { | |
130 | public: | |
131 | // me | |
132 | string name; | |
133 | int rank; | |
134 | Messenger *messenger; | |
135 | ConnectionRef con_self; | |
136 | Mutex lock; | |
137 | SafeTimer timer; | |
138 | Finisher finisher; | |
139 | ThreadPool cpu_tp; ///< threadpool for CPU intensive work | |
140 | ||
141 | /// true if we have ever joined a quorum. if false, we are either a | |
142 | /// new cluster, a newly joining monitor, or a just-upgraded | |
143 | /// monitor. | |
144 | bool has_ever_joined; | |
145 | ||
146 | PerfCounters *logger, *cluster_logger; | |
147 | bool cluster_logger_registered; | |
148 | ||
149 | void register_cluster_logger(); | |
150 | void unregister_cluster_logger(); | |
151 | ||
152 | MonMap *monmap; | |
153 | uuid_d fingerprint; | |
154 | ||
155 | set<entity_addr_t> extra_probe_peers; | |
156 | ||
157 | LogClient log_client; | |
158 | LogChannelRef clog; | |
159 | LogChannelRef audit_clog; | |
160 | KeyRing keyring; | |
161 | KeyServer key_server; | |
162 | ||
163 | AuthMethodList auth_cluster_required; | |
164 | AuthMethodList auth_service_required; | |
165 | ||
166 | CompatSet features; | |
167 | ||
d2e6a577 FG |
168 | vector<MonCommand> leader_mon_commands; // quorum leader's commands |
169 | vector<MonCommand> local_mon_commands; // commands i support | |
170 | bufferlist local_mon_commands_bl; // encoded version of above | |
171 | ||
172 | // for upgrading mon cluster that still uses PGMonitor | |
173 | vector<MonCommand> local_upgrading_mon_commands; // mixed mon cluster commands | |
174 | bufferlist local_upgrading_mon_commands_bl; // encoded version of above | |
7c673cae FG |
175 | |
176 | Messenger *mgr_messenger; | |
177 | MgrClient mgr_client; | |
178 | uint64_t mgr_proxy_bytes = 0; // in-flight proxied mgr command message bytes | |
179 | ||
31f18b77 FG |
180 | const MonPGStatService *pgservice; |
181 | ||
7c673cae FG |
182 | private: |
183 | void new_tick(); | |
184 | ||
185 | // -- local storage -- | |
186 | public: | |
187 | MonitorDBStore *store; | |
188 | static const string MONITOR_NAME; | |
189 | static const string MONITOR_STORE_PREFIX; | |
190 | ||
191 | // -- monitor state -- | |
192 | private: | |
193 | enum { | |
194 | STATE_PROBING = 1, | |
195 | STATE_SYNCHRONIZING, | |
196 | STATE_ELECTING, | |
197 | STATE_LEADER, | |
198 | STATE_PEON, | |
199 | STATE_SHUTDOWN | |
200 | }; | |
201 | int state; | |
202 | ||
203 | public: | |
204 | static const char *get_state_name(int s) { | |
205 | switch (s) { | |
206 | case STATE_PROBING: return "probing"; | |
207 | case STATE_SYNCHRONIZING: return "synchronizing"; | |
208 | case STATE_ELECTING: return "electing"; | |
209 | case STATE_LEADER: return "leader"; | |
210 | case STATE_PEON: return "peon"; | |
211 | case STATE_SHUTDOWN: return "shutdown"; | |
212 | default: return "???"; | |
213 | } | |
214 | } | |
215 | const char *get_state_name() const { | |
216 | return get_state_name(state); | |
217 | } | |
218 | ||
219 | bool is_shutdown() const { return state == STATE_SHUTDOWN; } | |
220 | bool is_probing() const { return state == STATE_PROBING; } | |
221 | bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; } | |
222 | bool is_electing() const { return state == STATE_ELECTING; } | |
223 | bool is_leader() const { return state == STATE_LEADER; } | |
224 | bool is_peon() const { return state == STATE_PEON; } | |
225 | ||
226 | const utime_t &get_leader_since() const; | |
227 | ||
228 | void prepare_new_fingerprint(MonitorDBStore::TransactionRef t); | |
229 | ||
230 | // -- elector -- | |
231 | private: | |
232 | Paxos *paxos; | |
233 | Elector elector; | |
234 | friend class Elector; | |
235 | ||
236 | /// features we require of peers (based on on-disk compatset) | |
237 | uint64_t required_features; | |
238 | ||
239 | int leader; // current leader (to best of knowledge) | |
240 | set<int> quorum; // current active set of monitors (if !starting) | |
241 | utime_t leader_since; // when this monitor became the leader, if it is the leader | |
242 | utime_t exited_quorum; // time detected as not in quorum; 0 if in | |
31f18b77 FG |
243 | |
244 | // map of counts of connected clients, by type and features, for | |
245 | // each quorum mon | |
246 | map<int,FeatureMap> quorum_feature_map; | |
247 | ||
7c673cae FG |
248 | /** |
249 | * Intersection of quorum member's connection feature bits. | |
250 | */ | |
251 | uint64_t quorum_con_features; | |
252 | /** | |
253 | * Intersection of quorum members mon-specific feature bits | |
254 | */ | |
255 | mon_feature_t quorum_mon_features; | |
7c673cae FG |
256 | |
257 | set<string> outside_quorum; | |
258 | ||
259 | /** | |
260 | * @defgroup Monitor_h_scrub | |
261 | * @{ | |
262 | */ | |
263 | version_t scrub_version; ///< paxos version we are scrubbing | |
264 | map<int,ScrubResult> scrub_result; ///< results so far | |
265 | ||
266 | /** | |
267 | * trigger a cross-mon scrub | |
268 | * | |
269 | * Verify all mons are storing identical content | |
270 | */ | |
271 | int scrub_start(); | |
272 | int scrub(); | |
273 | void handle_scrub(MonOpRequestRef op); | |
274 | bool _scrub(ScrubResult *r, | |
275 | pair<string,string> *start, | |
276 | int *num_keys); | |
277 | void scrub_check_results(); | |
278 | void scrub_timeout(); | |
279 | void scrub_finish(); | |
280 | void scrub_reset(); | |
281 | void scrub_update_interval(int secs); | |
282 | ||
283 | Context *scrub_event; ///< periodic event to trigger scrub (leader) | |
284 | Context *scrub_timeout_event; ///< scrub round timeout (leader) | |
285 | void scrub_event_start(); | |
286 | void scrub_event_cancel(); | |
287 | void scrub_reset_timeout(); | |
288 | void scrub_cancel_timeout(); | |
289 | ||
/// Progress marker for a scrub: the last key visited and a completion flag.
struct ScrubState {
  std::pair<std::string,std::string> last_key;  ///< last scrubbed key
  bool finished = false;                        ///< false until set by the scrub code

  ScrubState() {}
  virtual ~ScrubState() {}
};
297 | ceph::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub | |
298 | ||
299 | /** | |
300 | * @defgroup Monitor_h_sync Synchronization | |
301 | * @{ | |
302 | */ | |
303 | /** | |
304 | * @} // provider state | |
305 | */ | |
306 | struct SyncProvider { | |
307 | entity_inst_t entity; ///< who | |
308 | uint64_t cookie; ///< unique cookie for this sync attempt | |
309 | utime_t timeout; ///< when we give up and expire this attempt | |
310 | version_t last_committed; ///< last paxos version on peer | |
311 | pair<string,string> last_key; ///< last key sent to (or on) peer | |
312 | bool full; ///< full scan? | |
313 | MonitorDBStore::Synchronizer synchronizer; ///< iterator | |
314 | ||
315 | SyncProvider() : cookie(0), last_committed(0), full(false) {} | |
316 | ||
317 | void reset_timeout(CephContext *cct, int grace) { | |
318 | timeout = ceph_clock_now(); | |
319 | timeout += grace; | |
320 | } | |
321 | }; | |
322 | ||
323 | map<uint64_t, SyncProvider> sync_providers; ///< cookie -> SyncProvider for those syncing from us | |
324 | uint64_t sync_provider_count; ///< counter for issued cookies to keep them unique | |
325 | ||
326 | /** | |
327 | * @} // requester state | |
328 | */ | |
329 | entity_inst_t sync_provider; ///< who we are syncing from | |
330 | uint64_t sync_cookie; ///< 0 if we are starting, non-zero otherwise | |
331 | bool sync_full; ///< true if we are a full sync, false for recent catch-up | |
332 | version_t sync_start_version; ///< last_committed at sync start | |
333 | Context *sync_timeout_event; ///< timeout event | |
334 | ||
335 | /** | |
336 | * floor for sync source | |
337 | * | |
338 | * When we sync we forget about our old last_committed value which | |
339 | * can be dangerous. For example, if we have a cluster of: | |
340 | * | |
341 | * mon.a: lc 100 | |
342 | * mon.b: lc 80 | |
343 | * mon.c: lc 100 (us) | |
344 | * | |
345 | * If something forces us to sync (say, corruption, or manual | |
346 | * intervention, or bug), we forget last_committed, and might abort. | |
347 | * If mon.a happens to be down when we come back, we will see: | |
348 | * | |
349 | * mon.b: lc 80 | |
350 | * mon.c: lc 0 (us) | |
351 | * | |
352 | * and sync from mon.b, at which point b+c will both have lc 80 and | |
353 | * come online with a majority holding out of date commits. | |
354 | * | |
355 | * Avoid this by preserving our old last_committed value prior to | |
356 | * sync and never going backwards. | |
357 | */ | |
358 | version_t sync_last_committed_floor; | |
359 | ||
360 | /** | |
361 | * Obtain the synchronization target prefixes in set form. | |
362 | * | |
363 | * We consider a target prefix all those that are relevant when | |
364 | * synchronizing two stores. That is, all those that hold paxos service's | |
365 | * versions, as well as paxos versions, or any control keys such as the | |
366 | * first or last committed version. | |
367 | * | |
368 | * Given the current design, this function should return the name of all and | |
369 | * any available paxos service, plus the paxos name. | |
370 | * | |
371 | * @returns a set of strings referring to the prefixes being synchronized | |
372 | */ | |
373 | set<string> get_sync_targets_names(); | |
374 | ||
375 | /** | |
376 | * Reset the monitor's sync-related data structures for syncing *from* a peer | |
377 | */ | |
378 | void sync_reset_requester(); | |
379 | ||
380 | /** | |
381 | * Reset sync state related to allowing others to sync from us | |
382 | */ | |
383 | void sync_reset_provider(); | |
384 | ||
385 | /** | |
386 | * Called when a sync attempt times out (requester-side) | |
387 | */ | |
388 | void sync_timeout(); | |
389 | ||
390 | /** | |
391 | * Get the latest monmap for backup purposes during sync | |
392 | */ | |
393 | void sync_obtain_latest_monmap(bufferlist &bl); | |
394 | ||
395 | /** | |
396 | * Start sync process | |
397 | * | |
398 | * Start pulling committed state from another monitor. | |
399 | * | |
400 | * @param entity where to pull committed state from | |
401 | * @param full whether to do a full sync or just catch up on recent paxos | |
402 | */ | |
403 | void sync_start(entity_inst_t &entity, bool full); | |
404 | ||
405 | public: | |
406 | /** | |
407 | * force a sync on next mon restart | |
408 | */ | |
409 | void sync_force(Formatter *f, ostream& ss); | |
410 | ||
411 | private: | |
412 | /** | |
413 | * store critical state for safekeeping during sync | |
414 | * | |
415 | * We store a few things on the side that we don't want to get clobbered by sync. This | |
416 | * includes the latest monmap and a lower bound on last_committed. | |
417 | */ | |
418 | void sync_stash_critical_state(MonitorDBStore::TransactionRef tx); | |
419 | ||
420 | /** | |
421 | * reset the sync timeout | |
422 | * | |
423 | * This is used on the client to restart if things aren't progressing | |
424 | */ | |
425 | void sync_reset_timeout(); | |
426 | ||
427 | /** | |
428 | * trim stale sync provider state | |
429 | * | |
430 | * If someone is syncing from us and hasn't talked to us recently, expire their state. | |
431 | */ | |
432 | void sync_trim_providers(); | |
433 | ||
434 | /** | |
435 | * Complete a sync | |
436 | * | |
437 | * Finish up a sync after we've gotten all of the chunks. | |
438 | * | |
439 | * @param last_committed final last_committed value from provider | |
440 | */ | |
441 | void sync_finish(version_t last_committed); | |
442 | ||
443 | /** | |
444 | * request the next chunk from the provider | |
445 | */ | |
446 | void sync_get_next_chunk(); | |
447 | ||
448 | /** | |
449 | * handle sync message | |
450 | * | |
451 | * @param op Request wrapping a sync message with operation type MMonSync::OP_START_CHUNKS | |
452 | */ | |
453 | void handle_sync(MonOpRequestRef op); | |
454 | ||
455 | void _sync_reply_no_cookie(MonOpRequestRef op); | |
456 | ||
457 | void handle_sync_get_cookie(MonOpRequestRef op); | |
458 | void handle_sync_get_chunk(MonOpRequestRef op); | |
459 | void handle_sync_finish(MonOpRequestRef op); | |
460 | ||
461 | void handle_sync_cookie(MonOpRequestRef op); | |
462 | void handle_sync_forward(MonOpRequestRef op); | |
463 | void handle_sync_chunk(MonOpRequestRef op); | |
464 | void handle_sync_no_cookie(MonOpRequestRef op); | |
465 | ||
466 | /** | |
467 | * @} // Synchronization | |
468 | */ | |
469 | ||
470 | list<Context*> waitfor_quorum; | |
471 | list<Context*> maybe_wait_for_quorum; | |
472 | ||
473 | /** | |
474 | * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System | |
475 | * @{ | |
476 | * | |
477 | * We use time checks to keep track of any clock drifting going on in the | |
478 | * cluster. This is accomplished by periodically pinging each monitor in the | |
479 | * quorum and registering its response time in a map, assessing how much its | |
480 | * clock has drifted. We also take this opportunity to assess the latency | |
481 | * on response. | |
482 | * | |
483 | * This mechanism works as follows: | |
484 | * | |
485 | * - Leader sends out a 'PING' message to each other monitor in the quorum. | |
486 | * The message is timestamped with the leader's current time. The leader's | |
487 | * current time is recorded in a map, associated with each peon's | |
488 | * instance. | |
489 | * - The peon replies to the leader with a timestamped 'PONG' message. | |
490 | * - The leader calculates a delta between the peon's timestamp and its | |
491 | * current time and stashes it. | |
492 | * - The leader also calculates the time it took to receive the 'PONG' | |
493 | * since the 'PING' was sent, and stashes an approximate latency estimate. | |
494 | * - Once all the quorum members have pong'ed, the leader will share the | |
495 | * clock skew and latency maps with all the monitors in the quorum. | |
496 | */ | |
497 | map<entity_inst_t, utime_t> timecheck_waiting; | |
498 | map<entity_inst_t, double> timecheck_skews; | |
499 | map<entity_inst_t, double> timecheck_latencies; | |
500 | // odd value means we are mid-round; even value means the round has | |
501 | // finished. | |
502 | version_t timecheck_round; | |
503 | unsigned int timecheck_acks; | |
504 | utime_t timecheck_round_start; | |
224ce89b | 505 | friend class HealthMonitor; |
7c673cae FG |
506 | /* When we hit a skew we will start a new round based off of |
507 | * 'mon_timecheck_skew_interval'. Each new round will be backed off | |
508 | * until we hit 'mon_timecheck_interval' -- which is the typical | |
509 | * interval when not in the presence of a skew. | |
510 | * | |
511 | * This variable tracks the number of rounds with skews since last clean | |
512 | * so that we can report to the user and properly adjust the backoff. | |
513 | */ | |
514 | uint64_t timecheck_rounds_since_clean; | |
515 | /** | |
516 | * Time Check event. | |
517 | */ | |
518 | Context *timecheck_event; | |
519 | ||
520 | void timecheck_start(); | |
521 | void timecheck_finish(); | |
522 | void timecheck_start_round(); | |
523 | void timecheck_finish_round(bool success = true); | |
524 | void timecheck_cancel_round(); | |
525 | void timecheck_cleanup(); | |
526 | void timecheck_reset_event(); | |
527 | void timecheck_check_skews(); | |
528 | void timecheck_report(); | |
529 | void timecheck(); | |
530 | health_status_t timecheck_status(ostringstream &ss, | |
531 | const double skew_bound, | |
532 | const double latency); | |
533 | void handle_timecheck_leader(MonOpRequestRef op); | |
534 | void handle_timecheck_peon(MonOpRequestRef op); | |
535 | void handle_timecheck(MonOpRequestRef op); | |
536 | ||
537 | /** | |
538 | * Returns 'true' if this is considered to be a skew; 'false' otherwise. | |
539 | */ | |
540 | bool timecheck_has_skew(const double skew_bound, double *abs) const { | |
541 | double abs_skew = std::fabs(skew_bound); | |
542 | if (abs) | |
543 | *abs = abs_skew; | |
544 | return (abs_skew > g_conf->mon_clock_drift_allowed); | |
545 | } | |
546 | ||
547 | /** | |
548 | * @} | |
549 | */ | |
550 | /** | |
551 | * Handle ping messages from others. | |
552 | */ | |
553 | void handle_ping(MonOpRequestRef op); | |
554 | ||
555 | Context *probe_timeout_event = nullptr; // for probing | |
556 | ||
557 | void reset_probe_timeout(); | |
558 | void cancel_probe_timeout(); | |
559 | void probe_timeout(int r); | |
560 | ||
561 | void _apply_compatset_features(CompatSet &new_features); | |
562 | ||
563 | public: | |
564 | epoch_t get_epoch(); | |
565 | int get_leader() const { return leader; } | |
224ce89b WB |
566 | string get_leader_name() { |
567 | return quorum.empty() ? string() : monmap->get_name(*quorum.begin()); | |
568 | } | |
7c673cae FG |
569 | const set<int>& get_quorum() const { return quorum; } |
570 | list<string> get_quorum_names() { | |
571 | list<string> q; | |
572 | for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p) | |
573 | q.push_back(monmap->get_name(*p)); | |
574 | return q; | |
575 | } | |
576 | uint64_t get_quorum_con_features() const { | |
577 | return quorum_con_features; | |
578 | } | |
579 | mon_feature_t get_quorum_mon_features() const { | |
580 | return quorum_mon_features; | |
581 | } | |
582 | uint64_t get_required_features() const { | |
583 | return required_features; | |
584 | } | |
585 | mon_feature_t get_required_mon_features() const { | |
586 | return monmap->get_required_features(); | |
587 | } | |
588 | void apply_quorum_to_compatset_features(); | |
589 | void apply_monmap_to_compatset_features(); | |
590 | void calc_quorum_requirements(); | |
591 | ||
31f18b77 FG |
592 | void get_combined_feature_map(FeatureMap *fm); |
593 | ||
7c673cae FG |
594 | private: |
595 | void _reset(); ///< called from bootstrap, start_, or join_election | |
596 | void wait_for_paxos_write(); | |
597 | void _finish_svc_election(); ///< called by {win,lose}_election | |
598 | public: | |
599 | void bootstrap(); | |
600 | void join_election(); | |
601 | void start_election(); | |
602 | void win_standalone_election(); | |
603 | // end election (called by Elector) | |
604 | void win_election(epoch_t epoch, set<int>& q, | |
605 | uint64_t features, | |
606 | const mon_feature_t& mon_features, | |
d2e6a577 | 607 | const map<int,Metadata>& metadata); |
7c673cae FG |
608 | void lose_election(epoch_t epoch, set<int>& q, int l, |
609 | uint64_t features, | |
610 | const mon_feature_t& mon_features); | |
611 | // end election (called by Elector) | |
612 | void finish_election(); | |
613 | ||
7c673cae FG |
614 | void update_logger(); |
615 | ||
616 | /** | |
617 | * Vector holding the Services serviced by this Monitor. | |
618 | */ | |
619 | vector<PaxosService*> paxos_service; | |
620 | ||
7c673cae FG |
621 | class PGMonitor *pgmon() { |
622 | return (class PGMonitor *)paxos_service[PAXOS_PGMAP]; | |
623 | } | |
624 | ||
625 | class MDSMonitor *mdsmon() { | |
626 | return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP]; | |
627 | } | |
628 | ||
629 | class MonmapMonitor *monmon() { | |
630 | return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP]; | |
631 | } | |
632 | ||
633 | class OSDMonitor *osdmon() { | |
634 | return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP]; | |
635 | } | |
636 | ||
637 | class AuthMonitor *authmon() { | |
638 | return (class AuthMonitor *)paxos_service[PAXOS_AUTH]; | |
639 | } | |
640 | ||
641 | class LogMonitor *logmon() { | |
642 | return (class LogMonitor*) paxos_service[PAXOS_LOG]; | |
643 | } | |
644 | ||
645 | class MgrMonitor *mgrmon() { | |
646 | return (class MgrMonitor*) paxos_service[PAXOS_MGR]; | |
647 | } | |
648 | ||
31f18b77 FG |
649 | class MgrStatMonitor *mgrstatmon() { |
650 | return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT]; | |
651 | } | |
652 | ||
b32b8144 FG |
653 | class HealthMonitor *healthmon() { |
654 | return (class HealthMonitor*) paxos_service[PAXOS_HEALTH]; | |
224ce89b WB |
655 | } |
656 | ||
7c673cae FG |
657 | friend class Paxos; |
658 | friend class OSDMonitor; | |
659 | friend class MDSMonitor; | |
660 | friend class MonmapMonitor; | |
661 | friend class PGMonitor; | |
662 | friend class LogMonitor; | |
663 | friend class ConfigKeyService; | |
664 | ||
665 | QuorumService *health_monitor; | |
666 | QuorumService *config_key_service; | |
667 | ||
668 | // -- sessions -- | |
669 | MonSessionMap session_map; | |
670 | Mutex session_map_lock{"Monitor::session_map_lock"}; | |
671 | AdminSocketHook *admin_hook; | |
672 | ||
673 | template<typename Func, typename...Args> | |
674 | void with_session_map(Func&& func) { | |
675 | Mutex::Locker l(session_map_lock); | |
676 | std::forward<Func>(func)(session_map); | |
677 | } | |
678 | void send_latest_monmap(Connection *con); | |
679 | ||
680 | // messages | |
681 | void handle_get_version(MonOpRequestRef op); | |
682 | void handle_subscribe(MonOpRequestRef op); | |
683 | void handle_mon_get_map(MonOpRequestRef op); | |
684 | ||
685 | static void _generate_command_map(map<string,cmd_vartype>& cmdmap, | |
686 | map<string,string> ¶m_str_map); | |
c07f9fc5 FG |
687 | static const MonCommand *_get_moncommand( |
688 | const string &cmd_prefix, | |
d2e6a577 | 689 | const vector<MonCommand>& cmds); |
7c673cae FG |
690 | bool _allowed_command(MonSession *s, string &module, string &prefix, |
691 | const map<string,cmd_vartype>& cmdmap, | |
692 | const map<string,string>& param_str_map, | |
693 | const MonCommand *this_cmd); | |
694 | void get_mon_status(Formatter *f, ostream& ss); | |
695 | void _quorum_status(Formatter *f, ostream& ss); | |
696 | bool _add_bootstrap_peer_hint(string cmd, cmdmap_t& cmdmap, ostream& ss); | |
697 | void handle_command(MonOpRequestRef op); | |
698 | void handle_route(MonOpRequestRef op); | |
699 | ||
700 | void handle_mon_metadata(MonOpRequestRef op); | |
701 | int get_mon_metadata(int mon, Formatter *f, ostream& err); | |
702 | int print_nodes(Formatter *f, ostream& err); | |
703 | ||
704 | // Accumulate metadata across calls to update_mon_metadata | |
224ce89b | 705 | map<int, Metadata> mon_metadata; |
7c673cae FG |
706 | map<int, Metadata> pending_metadata; |
707 | ||
708 | /** | |
709 | * | |
710 | */ | |
711 | struct health_cache_t { | |
712 | health_status_t overall; | |
713 | string summary; | |
714 | ||
715 | void reset() { | |
716 | // health_status_t doesn't really have a NONE value and we're not | |
717 | // okay with setting something else (say, HEALTH_ERR). so just | |
718 | // leave it be. | |
719 | summary.clear(); | |
720 | } | |
721 | } health_status_cache; | |
722 | ||
723 | Context *health_tick_event = nullptr; | |
724 | Context *health_interval_event = nullptr; | |
725 | ||
726 | void health_tick_start(); | |
727 | void health_tick_stop(); | |
728 | utime_t health_interval_calc_next_update(); | |
729 | void health_interval_start(); | |
730 | void health_interval_stop(); | |
731 | void health_events_cleanup(); | |
732 | ||
733 | void health_to_clog_update_conf(const std::set<std::string> &changed); | |
734 | ||
735 | void do_health_to_clog_interval(); | |
736 | void do_health_to_clog(bool force = false); | |
737 | ||
738 | /** | |
739 | * Generate health report | |
740 | * | |
741 | * @param status one-line status summary | |
742 | * @param detailbl optional bufferlist* to fill with a detailed report | |
743 | * @returns health status | |
744 | */ | |
745 | health_status_t get_health(list<string>& status, bufferlist *detailbl, | |
746 | Formatter *f); | |
224ce89b WB |
747 | |
748 | health_status_t get_health_status( | |
749 | bool want_detail, | |
750 | Formatter *f, | |
751 | std::string *plain, | |
752 | const char *sep1 = " ", | |
753 | const char *sep2 = "; "); | |
754 | void log_health( | |
755 | const health_check_map_t& updated, | |
756 | const health_check_map_t& previous, | |
757 | MonitorDBStore::TransactionRef t); | |
758 | ||
181888fb FG |
759 | protected: |
760 | ||
761 | class HealthCheckLogStatus { | |
762 | public: | |
763 | health_status_t severity; | |
764 | std::string last_message; | |
765 | utime_t updated_at = 0; | |
766 | HealthCheckLogStatus(health_status_t severity_, | |
767 | const std::string &last_message_, | |
768 | utime_t updated_at_) | |
769 | : severity(severity_), | |
770 | last_message(last_message_), | |
771 | updated_at(updated_at_) | |
772 | {} | |
773 | }; | |
774 | std::map<std::string, HealthCheckLogStatus> health_check_log_times; | |
775 | ||
776 | public: | |
777 | ||
7c673cae FG |
778 | void get_cluster_status(stringstream &ss, Formatter *f); |
779 | ||
780 | void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version); | |
781 | void reply_command(MonOpRequestRef op, int rc, const string &rs, bufferlist& rdata, version_t version); | |
782 | ||
783 | ||
784 | void handle_probe(MonOpRequestRef op); | |
785 | /** | |
786 | * Handle a Probe Operation, replying with our name, quorum and known versions. | |
787 | * | |
788 | * We use the MMonProbe message class for anything and everything related to | |
789 | * Monitor probing. One of the operations relates directly to the probing | |
790 | * itself, in which we receive a probe request and to which we reply with | |
791 | * our name, our quorum and the known versions for each Paxos service. Thus the | |
792 | * redundant function name. This reply will obviously be sent to the one | |
793 | * probing/requesting this information. | |
794 | * | |
795 | * @todo Add @pre and @post | |
796 | * | |
797 | * @param op Request wrapping a Probe message, with an operation of type Probe. | |
798 | */ | |
799 | void handle_probe_probe(MonOpRequestRef op); | |
800 | void handle_probe_reply(MonOpRequestRef op); | |
801 | ||
802 | // request routing | |
803 | struct RoutedRequest { | |
804 | uint64_t tid; | |
805 | bufferlist request_bl; | |
806 | MonSession *session; | |
807 | ConnectionRef con; | |
808 | uint64_t con_features; | |
809 | entity_inst_t client_inst; | |
810 | MonOpRequestRef op; | |
811 | ||
812 | RoutedRequest() : tid(0), session(NULL), con_features(0) {} | |
813 | ~RoutedRequest() { | |
814 | if (session) | |
815 | session->put(); | |
816 | } | |
817 | }; | |
818 | uint64_t routed_request_tid; | |
819 | map<uint64_t, RoutedRequest*> routed_requests; | |
820 | ||
821 | void forward_request_leader(MonOpRequestRef op); | |
822 | void handle_forward(MonOpRequestRef op); | |
823 | void try_send_message(Message *m, const entity_inst_t& to); | |
824 | void send_reply(MonOpRequestRef op, Message *reply); | |
825 | void no_reply(MonOpRequestRef op); | |
826 | void resend_routed_requests(); | |
827 | void remove_session(MonSession *s); | |
828 | void remove_all_sessions(); | |
829 | void waitlist_or_zap_client(MonOpRequestRef op); | |
830 | ||
831 | void send_command(const entity_inst_t& inst, | |
832 | const vector<string>& com); | |
833 | ||
834 | public: | |
835 | struct C_Command : public C_MonOp { | |
836 | Monitor *mon; | |
837 | int rc; | |
838 | string rs; | |
839 | bufferlist rdata; | |
840 | version_t version; | |
841 | C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, version_t v) : | |
842 | C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v){} | |
843 | C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, bufferlist rd, version_t v) : | |
844 | C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v){} | |
845 | ||
846 | void _finish(int r) override { | |
847 | MMonCommand *m = static_cast<MMonCommand*>(op->get_req()); | |
848 | if (r >= 0) { | |
849 | ostringstream ss; | |
850 | if (!op->get_req()->get_connection()) { | |
851 | ss << "connection dropped for command "; | |
852 | } else { | |
853 | MonSession *s = op->get_session(); | |
854 | ||
855 | // if client drops we may not have a session to draw information from. | |
856 | if (s) { | |
857 | ss << "from='" << s->inst << "' " | |
858 | << "entity='" << s->entity_name << "' "; | |
859 | } else { | |
860 | ss << "session dropped for command "; | |
861 | } | |
862 | } | |
863 | ss << "cmd='" << m->cmd << "': finished"; | |
864 | ||
865 | mon->audit_clog->info() << ss.str(); | |
866 | mon->reply_command(op, rc, rs, rdata, version); | |
867 | } | |
868 | else if (r == -ECANCELED) | |
869 | return; | |
870 | else if (r == -EAGAIN) | |
871 | mon->dispatch_op(op); | |
872 | else | |
873 | assert(0 == "bad C_Command return value"); | |
874 | } | |
875 | }; | |
876 | ||
877 | private: | |
878 | class C_RetryMessage : public C_MonOp { | |
879 | Monitor *mon; | |
880 | public: | |
881 | C_RetryMessage(Monitor *m, MonOpRequestRef op) : | |
882 | C_MonOp(op), mon(m) { } | |
883 | ||
884 | void _finish(int r) override { | |
885 | if (r == -EAGAIN || r >= 0) | |
886 | mon->dispatch_op(op); | |
887 | else if (r == -ECANCELED) | |
888 | return; | |
889 | else | |
890 | assert(0 == "bad C_RetryMessage return value"); | |
891 | } | |
892 | }; | |
893 | ||
894 | //ms_dispatch handles a lot of logic and we want to reuse it | |
895 | //on forwarded messages, so we create a non-locking version for this class | |
896 | void _ms_dispatch(Message *m); | |
897 | bool ms_dispatch(Message *m) override { | |
898 | lock.Lock(); | |
899 | _ms_dispatch(m); | |
900 | lock.Unlock(); | |
901 | return true; | |
902 | } | |
903 | void dispatch_op(MonOpRequestRef op); | |
904 | //mon_caps is used for un-connected messages from monitors | |
905 | MonCap * mon_caps; | |
906 | bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override; | |
907 | bool ms_verify_authorizer(Connection *con, int peer_type, | |
908 | int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply, | |
28e407b8 AA |
909 | bool& isvalid, CryptoKey& session_key, |
910 | std::unique_ptr<AuthAuthorizerChallenge> *challenge) override; | |
7c673cae FG |
911 | bool ms_handle_reset(Connection *con) override; |
912 | void ms_handle_remote_reset(Connection *con) override {} | |
913 | bool ms_handle_refused(Connection *con) override; | |
914 | ||
915 | int write_default_keyring(bufferlist& bl); | |
916 | void extract_save_mon_key(KeyRing& keyring); | |
917 | ||
224ce89b | 918 | void collect_metadata(Metadata *m); |
7c673cae | 919 | void update_mon_metadata(int from, Metadata&& m); |
224ce89b | 920 | int load_metadata(); |
31f18b77 | 921 | void count_metadata(const string& field, Formatter *f); |
c07f9fc5 | 922 | void count_metadata(const string& field, map<string,int> *out); |
7c673cae FG |
923 | |
924 | // features | |
925 | static CompatSet get_initial_supported_features(); | |
926 | static CompatSet get_supported_features(); | |
927 | static CompatSet get_legacy_features(); | |
928 | /// read the ondisk features into the CompatSet pointed to by read_features | |
929 | static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features); | |
930 | void read_features(); | |
931 | void write_features(MonitorDBStore::TransactionRef t); | |
932 | ||
933 | OpTracker op_tracker; | |
934 | ||
935 | public: | |
936 | Monitor(CephContext *cct_, string nm, MonitorDBStore *s, | |
937 | Messenger *m, Messenger *mgr_m, MonMap *map); | |
938 | ~Monitor() override; | |
939 | ||
940 | static int check_features(MonitorDBStore *store); | |
941 | ||
942 | // config observer | |
943 | const char** get_tracked_conf_keys() const override; | |
944 | void handle_conf_change(const struct md_config_t *conf, | |
945 | const std::set<std::string> &changed) override; | |
946 | ||
947 | void update_log_clients(); | |
948 | int sanitize_options(); | |
949 | int preinit(); | |
950 | int init(); | |
951 | void init_paxos(); | |
952 | void refresh_from_paxos(bool *need_bootstrap); | |
953 | void shutdown(); | |
954 | void tick(); | |
955 | ||
956 | void handle_signal(int sig); | |
957 | ||
958 | int mkfs(bufferlist& osdmapbl); | |
959 | ||
960 | /** | |
961 | * check cluster_fsid file | |
962 | * | |
963 | * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code | |
964 | */ | |
965 | int check_fsid(); | |
966 | ||
967 | /** | |
968 | * write cluster_fsid file | |
969 | * | |
970 | * @return 0 on success, or negative error code | |
971 | */ | |
972 | int write_fsid(); | |
973 | int write_fsid(MonitorDBStore::TransactionRef t); | |
974 | ||
975 | void do_admin_command(std::string command, cmdmap_t& cmdmap, | |
976 | std::string format, ostream& ss); | |
977 | ||
978 | private: | |
979 | // don't allow copying | |
980 | Monitor(const Monitor& rhs); | |
981 | Monitor& operator=(const Monitor &rhs); | |
982 | ||
983 | public: | |
c07f9fc5 | 984 | static void format_command_descriptions(const std::vector<MonCommand> &commands, |
7c673cae FG |
985 | Formatter *f, |
986 | bufferlist *rdata, | |
987 | bool hide_mgr_flag=false); | |
d2e6a577 FG |
988 | |
989 | const std::vector<MonCommand> &get_local_commands(mon_feature_t f) { | |
990 | if (f.contains_all(ceph::features::mon::FEATURE_LUMINOUS)) | |
991 | return local_mon_commands; | |
992 | else | |
993 | return local_upgrading_mon_commands; | |
994 | } | |
995 | const bufferlist& get_local_commands_bl(mon_feature_t f) { | |
996 | if (f.contains_all(ceph::features::mon::FEATURE_LUMINOUS)) | |
997 | return local_mon_commands_bl; | |
998 | else | |
999 | return local_upgrading_mon_commands_bl; | |
1000 | } | |
1001 | void set_leader_commands(const std::vector<MonCommand>& cmds) { | |
1002 | leader_mon_commands = cmds; | |
1003 | } | |
1004 | ||
7c673cae FG |
1005 | static bool is_keyring_required(); |
1006 | }; | |
1007 | ||
// Incompat feature definitions for the monitor.  Each macro expands to a
// CompatSet::Feature built from a numeric id and a human-readable
// description.
#define CEPH_MON_FEATURE_INCOMPAT_BASE \
  CompatSet::Feature(1, "initial feature set (~v.18)")
#define CEPH_MON_FEATURE_INCOMPAT_GV \
  CompatSet::Feature(2, "global version sequencing (v0.52)")
#define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS \
  CompatSet::Feature(3, "single paxos with k/v store (v0.\?)")
#define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES \
  CompatSet::Feature(4, "support erasure code pools")
#define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC \
  CompatSet::Feature(5, "new-style osdmap encoding")
#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 \
  CompatSet::Feature(6, "support isa/lrc erasure code")
#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 \
  CompatSet::Feature(7, "support shec erasure code")
#define CEPH_MON_FEATURE_INCOMPAT_KRAKEN \
  CompatSet::Feature(8, "support monmap features")
#define CEPH_MON_FEATURE_INCOMPAT_LUMINOUS \
  CompatSet::Feature(9, "luminous ondisk layout")
// make sure you add your feature to Monitor::get_supported_features
1018 | ||
7c673cae | 1019 | |
7c673cae FG |
1020 | |
1021 | #endif |