]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | /* | |
16 | * This is the top level monitor. It runs on each machine in the Monitor | |
17 | * Cluster. The election of a leader for the paxos algorithm only happens | |
18 | * once per machine via the elector. There is a separate paxos instance (state) | |
19 | * kept for each of the system components: Object Store Device (OSD) Monitor, | |
20 | * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor. | |
21 | */ | |
22 | ||
23 | #ifndef CEPH_MONITOR_H | |
24 | #define CEPH_MONITOR_H | |
25 | ||
26 | #include <errno.h> | |
27 | #include <cmath> | |
28 | ||
29 | #include "include/types.h" | |
224ce89b | 30 | #include "include/health.h" |
7c673cae FG |
31 | #include "msg/Messenger.h" |
32 | ||
33 | #include "common/Timer.h" | |
34 | ||
224ce89b | 35 | #include "health_check.h" |
7c673cae FG |
36 | #include "MonMap.h" |
37 | #include "Elector.h" | |
38 | #include "Paxos.h" | |
39 | #include "Session.h" | |
31f18b77 | 40 | #include "PGStatService.h" |
c07f9fc5 | 41 | #include "MonCommand.h" |
7c673cae FG |
42 | |
43 | #include "common/LogClient.h" | |
44 | #include "auth/cephx/CephxKeyServer.h" | |
45 | #include "auth/AuthMethodList.h" | |
46 | #include "auth/KeyRing.h" | |
47 | #include "messages/MMonCommand.h" | |
48 | #include "mon/MonitorDBStore.h" | |
49 | #include "include/memory.h" | |
50 | #include "mgr/MgrClient.h" | |
51 | ||
52 | #include "mon/MonOpRequest.h" | |
53 | #include "common/WorkQueue.h" | |
54 | ||
55 | ||
56 | #define CEPH_MON_PROTOCOL 13 /* cluster internal */ | |
57 | ||
58 | ||
// Identifiers for the l_cluster_* counters.  l_cluster_first pins the base
// value; every following enumerator takes the next consecutive integer, and
// l_cluster_last closes the range.
enum {
  l_cluster_first = 555000,

  // monitors
  l_cluster_num_mon,
  l_cluster_num_mon_quorum,

  // OSDs
  l_cluster_num_osd,
  l_cluster_num_osd_up,
  l_cluster_num_osd_in,
  l_cluster_osd_epoch,
  l_cluster_osd_bytes,
  l_cluster_osd_bytes_used,
  l_cluster_osd_bytes_avail,

  // pools and placement groups
  l_cluster_num_pool,
  l_cluster_num_pg,
  l_cluster_num_pg_active_clean,
  l_cluster_num_pg_active,
  l_cluster_num_pg_peering,

  // objects
  l_cluster_num_object,
  l_cluster_num_object_degraded,
  l_cluster_num_object_misplaced,
  l_cluster_num_object_unfound,
  l_cluster_num_bytes,

  // MDS
  l_cluster_num_mds_up,
  l_cluster_num_mds_in,
  l_cluster_num_mds_failed,
  l_cluster_mds_epoch,

  l_cluster_last,
};
86 | ||
// Identifiers for the l_mon_* counters.  l_mon_first pins the base value;
// the remaining enumerators are consecutive and l_mon_last closes the range.
enum {
  l_mon_first = 456000,

  // sessions
  l_mon_num_sessions,
  l_mon_session_add,
  l_mon_session_rm,
  l_mon_session_trim,

  // elections
  l_mon_num_elections,
  l_mon_election_call,
  l_mon_election_win,
  l_mon_election_lose,

  l_mon_last,
};
99 | ||
100 | class QuorumService; | |
101 | class PaxosService; | |
102 | ||
103 | class PerfCounters; | |
104 | class AdminSocketHook; | |
105 | ||
106 | class MMonGetMap; | |
107 | class MMonGetVersion; | |
108 | class MMonMetadata; | |
109 | class MMonSync; | |
110 | class MMonScrub; | |
111 | class MMonProbe; | |
112 | struct MMonSubscribe; | |
113 | struct MRoute; | |
114 | struct MForward; | |
115 | struct MTimeCheck; | |
116 | struct MMonHealth; | |
7c673cae FG |
117 | |
118 | #define COMPAT_SET_LOC "feature_set" | |
119 | ||
120 | class C_MonContext final : public FunctionContext { | |
121 | const Monitor *mon; | |
122 | public: | |
123 | explicit C_MonContext(Monitor *m, boost::function<void(int)>&& callback) | |
124 | : FunctionContext(std::move(callback)), mon(m) {} | |
125 | void finish(int r) override; | |
126 | }; | |
127 | ||
128 | class Monitor : public Dispatcher, | |
129 | public md_config_obs_t { | |
130 | public: | |
131 | // me | |
132 | string name; | |
133 | int rank; | |
134 | Messenger *messenger; | |
135 | ConnectionRef con_self; | |
136 | Mutex lock; | |
137 | SafeTimer timer; | |
138 | Finisher finisher; | |
139 | ThreadPool cpu_tp; ///< threadpool for CPU intensive work | |
140 | ||
141 | /// true if we have ever joined a quorum. if false, we are either a | |
142 | /// new cluster, a newly joining monitor, or a just-upgraded | |
143 | /// monitor. | |
144 | bool has_ever_joined; | |
145 | ||
146 | PerfCounters *logger, *cluster_logger; | |
147 | bool cluster_logger_registered; | |
148 | ||
149 | void register_cluster_logger(); | |
150 | void unregister_cluster_logger(); | |
151 | ||
152 | MonMap *monmap; | |
153 | uuid_d fingerprint; | |
154 | ||
155 | set<entity_addr_t> extra_probe_peers; | |
156 | ||
157 | LogClient log_client; | |
158 | LogChannelRef clog; | |
159 | LogChannelRef audit_clog; | |
160 | KeyRing keyring; | |
161 | KeyServer key_server; | |
162 | ||
163 | AuthMethodList auth_cluster_required; | |
164 | AuthMethodList auth_service_required; | |
165 | ||
166 | CompatSet features; | |
167 | ||
168 | const MonCommand *leader_supported_mon_commands; | |
169 | int leader_supported_mon_commands_size; | |
170 | ||
171 | Messenger *mgr_messenger; | |
172 | MgrClient mgr_client; | |
173 | uint64_t mgr_proxy_bytes = 0; // in-flight proxied mgr command message bytes | |
174 | ||
31f18b77 FG |
175 | const MonPGStatService *pgservice; |
176 | ||
7c673cae FG |
177 | private: |
178 | void new_tick(); | |
179 | ||
180 | // -- local storage -- | |
181 | public: | |
182 | MonitorDBStore *store; | |
183 | static const string MONITOR_NAME; | |
184 | static const string MONITOR_STORE_PREFIX; | |
185 | ||
186 | // -- monitor state -- | |
187 | private: | |
188 | enum { | |
189 | STATE_PROBING = 1, | |
190 | STATE_SYNCHRONIZING, | |
191 | STATE_ELECTING, | |
192 | STATE_LEADER, | |
193 | STATE_PEON, | |
194 | STATE_SHUTDOWN | |
195 | }; | |
196 | int state; | |
197 | ||
198 | public: | |
199 | static const char *get_state_name(int s) { | |
200 | switch (s) { | |
201 | case STATE_PROBING: return "probing"; | |
202 | case STATE_SYNCHRONIZING: return "synchronizing"; | |
203 | case STATE_ELECTING: return "electing"; | |
204 | case STATE_LEADER: return "leader"; | |
205 | case STATE_PEON: return "peon"; | |
206 | case STATE_SHUTDOWN: return "shutdown"; | |
207 | default: return "???"; | |
208 | } | |
209 | } | |
210 | const char *get_state_name() const { | |
211 | return get_state_name(state); | |
212 | } | |
213 | ||
214 | bool is_shutdown() const { return state == STATE_SHUTDOWN; } | |
215 | bool is_probing() const { return state == STATE_PROBING; } | |
216 | bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; } | |
217 | bool is_electing() const { return state == STATE_ELECTING; } | |
218 | bool is_leader() const { return state == STATE_LEADER; } | |
219 | bool is_peon() const { return state == STATE_PEON; } | |
220 | ||
221 | const utime_t &get_leader_since() const; | |
222 | ||
223 | void prepare_new_fingerprint(MonitorDBStore::TransactionRef t); | |
224 | ||
225 | // -- elector -- | |
226 | private: | |
227 | Paxos *paxos; | |
228 | Elector elector; | |
229 | friend class Elector; | |
230 | ||
231 | /// features we require of peers (based on on-disk compatset) | |
232 | uint64_t required_features; | |
233 | ||
234 | int leader; // current leader (to best of knowledge) | |
235 | set<int> quorum; // current active set of monitors (if !starting) | |
236 | utime_t leader_since; // when this monitor became the leader, if it is the leader | |
237 | utime_t exited_quorum; // time detected as not in quorum; 0 if in | |
31f18b77 FG |
238 | |
239 | // map of counts of connected clients, by type and features, for | |
240 | // each quorum mon | |
241 | map<int,FeatureMap> quorum_feature_map; | |
242 | ||
7c673cae FG |
243 | /** |
244 | * Intersection of quorum members' connection feature bits. | |
245 | */ | |
246 | uint64_t quorum_con_features; | |
247 | /** | |
248 | * Intersection of quorum members mon-specific feature bits | |
249 | */ | |
250 | mon_feature_t quorum_mon_features; | |
251 | bufferlist supported_commands_bl; // encoded MonCommands we support | |
252 | ||
253 | set<string> outside_quorum; | |
254 | ||
255 | /** | |
256 | * @defgroup Monitor_h_scrub | |
257 | * @{ | |
258 | */ | |
259 | version_t scrub_version; ///< paxos version we are scrubbing | |
260 | map<int,ScrubResult> scrub_result; ///< results so far | |
261 | ||
262 | /** | |
263 | * trigger a cross-mon scrub | |
264 | * | |
265 | * Verify all mons are storing identical content | |
266 | */ | |
267 | int scrub_start(); | |
268 | int scrub(); | |
269 | void handle_scrub(MonOpRequestRef op); | |
270 | bool _scrub(ScrubResult *r, | |
271 | pair<string,string> *start, | |
272 | int *num_keys); | |
273 | void scrub_check_results(); | |
274 | void scrub_timeout(); | |
275 | void scrub_finish(); | |
276 | void scrub_reset(); | |
277 | void scrub_update_interval(int secs); | |
278 | ||
279 | Context *scrub_event; ///< periodic event to trigger scrub (leader) | |
280 | Context *scrub_timeout_event; ///< scrub round timeout (leader) | |
281 | void scrub_event_start(); | |
282 | void scrub_event_cancel(); | |
283 | void scrub_reset_timeout(); | |
284 | void scrub_cancel_timeout(); | |
285 | ||
/// Progress record for a scrub: the last key handled and a completion flag.
struct ScrubState {
  pair<string,string> last_key;  ///< last scrubbed key
  bool finished = false;         ///< a freshly built state is not finished

  ScrubState() = default;
  virtual ~ScrubState() = default;
};
293 | ceph::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub | |
294 | ||
295 | /** | |
296 | * @defgroup Monitor_h_sync Synchronization | |
297 | * @{ | |
298 | */ | |
299 | /** | |
300 | * @} // provider state | |
301 | */ | |
302 | struct SyncProvider { | |
303 | entity_inst_t entity; ///< who | |
304 | uint64_t cookie; ///< unique cookie for this sync attempt | |
305 | utime_t timeout; ///< when we give up and expire this attempt | |
306 | version_t last_committed; ///< last paxos version on peer | |
307 | pair<string,string> last_key; ///< last key sent to (or on) peer | |
308 | bool full; ///< full scan? | |
309 | MonitorDBStore::Synchronizer synchronizer; ///< iterator | |
310 | ||
311 | SyncProvider() : cookie(0), last_committed(0), full(false) {} | |
312 | ||
313 | void reset_timeout(CephContext *cct, int grace) { | |
314 | timeout = ceph_clock_now(); | |
315 | timeout += grace; | |
316 | } | |
317 | }; | |
318 | ||
319 | map<uint64_t, SyncProvider> sync_providers; ///< cookie -> SyncProvider for those syncing from us | |
320 | uint64_t sync_provider_count; ///< counter for issued cookies to keep them unique | |
321 | ||
322 | /** | |
323 | * @} // requester state | |
324 | */ | |
325 | entity_inst_t sync_provider; ///< who we are syncing from | |
326 | uint64_t sync_cookie; ///< 0 if we are starting, non-zero otherwise | |
327 | bool sync_full; ///< true if we are a full sync, false for recent catch-up | |
328 | version_t sync_start_version; ///< last_committed at sync start | |
329 | Context *sync_timeout_event; ///< timeout event | |
330 | ||
331 | /** | |
332 | * floor for sync source | |
333 | * | |
334 | * When we sync we forget about our old last_committed value which | |
335 | * can be dangerous. For example, if we have a cluster of: | |
336 | * | |
337 | * mon.a: lc 100 | |
338 | * mon.b: lc 80 | |
339 | * mon.c: lc 100 (us) | |
340 | * | |
341 | * If something forces us to sync (say, corruption, or manual | |
342 | * intervention, or bug), we forget last_committed, and might abort. | |
343 | * If mon.a happens to be down when we come back, we will see: | |
344 | * | |
345 | * mon.b: lc 80 | |
346 | * mon.c: lc 0 (us) | |
347 | * | |
348 | * and sync from mon.b, at which point a+b will both have lc 80 and | |
349 | * come online with a majority holding out of date commits. | |
350 | * | |
351 | * Avoid this by preserving our old last_committed value prior to | |
352 | * sync and never going backwards. | |
353 | */ | |
354 | version_t sync_last_committed_floor; | |
355 | ||
356 | /** | |
357 | * Obtain the synchronization target prefixes in set form. | |
358 | * | |
359 | * We consider a target prefix all those that are relevant when | |
360 | * synchronizing two stores. That is, all those that hold paxos service's | |
361 | * versions, as well as paxos versions, or any control keys such as the | |
362 | * first or last committed version. | |
363 | * | |
364 | * Given the current design, this function should return the name of all and | |
365 | * any available paxos service, plus the paxos name. | |
366 | * | |
367 | * @returns a set of strings referring to the prefixes being synchronized | |
368 | */ | |
369 | set<string> get_sync_targets_names(); | |
370 | ||
371 | /** | |
372 | * Reset the monitor's sync-related data structures for syncing *from* a peer | |
373 | */ | |
374 | void sync_reset_requester(); | |
375 | ||
376 | /** | |
377 | * Reset sync state related to allowing others to sync from us | |
378 | */ | |
379 | void sync_reset_provider(); | |
380 | ||
381 | /** | |
382 | * Called when a sync attempt times out (requester-side) | |
383 | */ | |
384 | void sync_timeout(); | |
385 | ||
386 | /** | |
387 | * Get the latest monmap for backup purposes during sync | |
388 | */ | |
389 | void sync_obtain_latest_monmap(bufferlist &bl); | |
390 | ||
391 | /** | |
392 | * Start sync process | |
393 | * | |
394 | * Start pulling committed state from another monitor. | |
395 | * | |
396 | * @param entity where to pull committed state from | |
397 | * @param full whether to do a full sync or just catch up on recent paxos | |
398 | */ | |
399 | void sync_start(entity_inst_t &entity, bool full); | |
400 | ||
401 | public: | |
402 | /** | |
403 | * force a sync on next mon restart | |
404 | */ | |
405 | void sync_force(Formatter *f, ostream& ss); | |
406 | ||
407 | private: | |
408 | /** | |
409 | * store critical state for safekeeping during sync | |
410 | * | |
411 | * We store a few things on the side that we don't want to get clobbered by sync. This | |
412 | * includes the latest monmap and a lower bound on last_committed. | |
413 | */ | |
414 | void sync_stash_critical_state(MonitorDBStore::TransactionRef tx); | |
415 | ||
416 | /** | |
417 | * reset the sync timeout | |
418 | * | |
419 | * This is used on the client to restart if things aren't progressing | |
420 | */ | |
421 | void sync_reset_timeout(); | |
422 | ||
423 | /** | |
424 | * trim stale sync provider state | |
425 | * | |
426 | * If someone is syncing from us and hasn't talked to us recently, expire their state. | |
427 | */ | |
428 | void sync_trim_providers(); | |
429 | ||
430 | /** | |
431 | * Complete a sync | |
432 | * | |
433 | * Finish up a sync after we've gotten all of the chunks. | |
434 | * | |
435 | * @param last_committed final last_committed value from provider | |
436 | */ | |
437 | void sync_finish(version_t last_committed); | |
438 | ||
439 | /** | |
440 | * request the next chunk from the provider | |
441 | */ | |
442 | void sync_get_next_chunk(); | |
443 | ||
444 | /** | |
445 | * handle sync message | |
446 | * | |
447 | * @param op Request carrying a Sync message with operation type MMonSync::OP_START_CHUNKS | |
448 | */ | |
449 | void handle_sync(MonOpRequestRef op); | |
450 | ||
451 | void _sync_reply_no_cookie(MonOpRequestRef op); | |
452 | ||
453 | void handle_sync_get_cookie(MonOpRequestRef op); | |
454 | void handle_sync_get_chunk(MonOpRequestRef op); | |
455 | void handle_sync_finish(MonOpRequestRef op); | |
456 | ||
457 | void handle_sync_cookie(MonOpRequestRef op); | |
458 | void handle_sync_forward(MonOpRequestRef op); | |
459 | void handle_sync_chunk(MonOpRequestRef op); | |
460 | void handle_sync_no_cookie(MonOpRequestRef op); | |
461 | ||
462 | /** | |
463 | * @} // Synchronization | |
464 | */ | |
465 | ||
466 | list<Context*> waitfor_quorum; | |
467 | list<Context*> maybe_wait_for_quorum; | |
468 | ||
469 | /** | |
470 | * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System | |
471 | * @{ | |
472 | * | |
473 | * We use time checks to keep track of any clock drifting going on in the | |
474 | * cluster. This is accomplished by periodically pinging each monitor in the | |
475 | * quorum and registering its response time on a map, assessing how much its | |
476 | * clock has drifted. We also take this opportunity to assess the latency | |
477 | * on response. | |
478 | * | |
479 | * This mechanism works as follows: | |
480 | * | |
481 | * - Leader sends out a 'PING' message to each other monitor in the quorum. | |
482 | * The message is timestamped with the leader's current time. The leader's | |
483 | * current time is recorded in a map, associated with each peon's | |
484 | * instance. | |
485 | * - The peon replies to the leader with a timestamped 'PONG' message. | |
486 | * - The leader calculates a delta between the peon's timestamp and its | |
487 | * current time and stashes it. | |
488 | * - The leader also calculates the time it took to receive the 'PONG' | |
489 | * since the 'PING' was sent, and stashes an approximate latency estimate. | |
490 | * - Once all the quorum members have pong'ed, the leader will share the | |
491 | * clock skew and latency maps with all the monitors in the quorum. | |
492 | */ | |
493 | map<entity_inst_t, utime_t> timecheck_waiting; | |
494 | map<entity_inst_t, double> timecheck_skews; | |
495 | map<entity_inst_t, double> timecheck_latencies; | |
496 | // odd value means we are mid-round; even value means the round has | |
497 | // finished. | |
498 | version_t timecheck_round; | |
499 | unsigned int timecheck_acks; | |
500 | utime_t timecheck_round_start; | |
224ce89b | 501 | friend class HealthMonitor; |
7c673cae FG |
502 | /* When we hit a skew we will start a new round based off of |
503 | * 'mon_timecheck_skew_interval'. Each new round will be backed off | |
504 | * until we hit 'mon_timecheck_interval' -- which is the typical | |
505 | * interval when not in the presence of a skew. | |
506 | * | |
507 | * This variable tracks the number of rounds with skews since last clean | |
508 | * so that we can report to the user and properly adjust the backoff. | |
509 | */ | |
510 | uint64_t timecheck_rounds_since_clean; | |
511 | /** | |
512 | * Time Check event. | |
513 | */ | |
514 | Context *timecheck_event; | |
515 | ||
516 | void timecheck_start(); | |
517 | void timecheck_finish(); | |
518 | void timecheck_start_round(); | |
519 | void timecheck_finish_round(bool success = true); | |
520 | void timecheck_cancel_round(); | |
521 | void timecheck_cleanup(); | |
522 | void timecheck_reset_event(); | |
523 | void timecheck_check_skews(); | |
524 | void timecheck_report(); | |
525 | void timecheck(); | |
526 | health_status_t timecheck_status(ostringstream &ss, | |
527 | const double skew_bound, | |
528 | const double latency); | |
529 | void handle_timecheck_leader(MonOpRequestRef op); | |
530 | void handle_timecheck_peon(MonOpRequestRef op); | |
531 | void handle_timecheck(MonOpRequestRef op); | |
532 | ||
533 | /** | |
534 | * Returns 'true' if this is considered to be a skew; 'false' otherwise. | |
535 | */ | |
536 | bool timecheck_has_skew(const double skew_bound, double *abs) const { | |
537 | double abs_skew = std::fabs(skew_bound); | |
538 | if (abs) | |
539 | *abs = abs_skew; | |
540 | return (abs_skew > g_conf->mon_clock_drift_allowed); | |
541 | } | |
542 | ||
543 | /** | |
544 | * @} | |
545 | */ | |
546 | /** | |
547 | * Handle ping messages from others. | |
548 | */ | |
549 | void handle_ping(MonOpRequestRef op); | |
550 | ||
551 | Context *probe_timeout_event = nullptr; // for probing | |
552 | ||
553 | void reset_probe_timeout(); | |
554 | void cancel_probe_timeout(); | |
555 | void probe_timeout(int r); | |
556 | ||
557 | void _apply_compatset_features(CompatSet &new_features); | |
558 | ||
559 | public: | |
560 | epoch_t get_epoch(); | |
561 | int get_leader() const { return leader; } | |
224ce89b WB |
562 | string get_leader_name() { |
563 | return quorum.empty() ? string() : monmap->get_name(*quorum.begin()); | |
564 | } | |
7c673cae FG |
565 | const set<int>& get_quorum() const { return quorum; } |
566 | list<string> get_quorum_names() { | |
567 | list<string> q; | |
568 | for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p) | |
569 | q.push_back(monmap->get_name(*p)); | |
570 | return q; | |
571 | } | |
572 | uint64_t get_quorum_con_features() const { | |
573 | return quorum_con_features; | |
574 | } | |
575 | mon_feature_t get_quorum_mon_features() const { | |
576 | return quorum_mon_features; | |
577 | } | |
578 | uint64_t get_required_features() const { | |
579 | return required_features; | |
580 | } | |
581 | mon_feature_t get_required_mon_features() const { | |
582 | return monmap->get_required_features(); | |
583 | } | |
584 | void apply_quorum_to_compatset_features(); | |
585 | void apply_monmap_to_compatset_features(); | |
586 | void calc_quorum_requirements(); | |
587 | ||
31f18b77 FG |
588 | void get_combined_feature_map(FeatureMap *fm); |
589 | ||
7c673cae FG |
590 | private: |
591 | void _reset(); ///< called from bootstrap, start_, or join_election | |
592 | void wait_for_paxos_write(); | |
593 | void _finish_svc_election(); ///< called by {win,lose}_election | |
594 | public: | |
595 | void bootstrap(); | |
596 | void join_election(); | |
597 | void start_election(); | |
598 | void win_standalone_election(); | |
599 | // end election (called by Elector) | |
600 | void win_election(epoch_t epoch, set<int>& q, | |
601 | uint64_t features, | |
602 | const mon_feature_t& mon_features, | |
224ce89b | 603 | const map<int,Metadata>& metadata, |
7c673cae FG |
604 | const MonCommand *cmdset, int cmdsize); |
605 | void lose_election(epoch_t epoch, set<int>& q, int l, | |
606 | uint64_t features, | |
607 | const mon_feature_t& mon_features); | |
608 | // end election (called by Elector) | |
609 | void finish_election(); | |
610 | ||
611 | const bufferlist& get_supported_commands_bl() { | |
612 | return supported_commands_bl; | |
613 | } | |
614 | ||
615 | void update_logger(); | |
616 | ||
617 | /** | |
618 | * Vector holding the Services serviced by this Monitor. | |
619 | */ | |
620 | vector<PaxosService*> paxos_service; | |
621 | ||
7c673cae FG |
622 | class PGMonitor *pgmon() { |
623 | return (class PGMonitor *)paxos_service[PAXOS_PGMAP]; | |
624 | } | |
625 | ||
626 | class MDSMonitor *mdsmon() { | |
627 | return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP]; | |
628 | } | |
629 | ||
630 | class MonmapMonitor *monmon() { | |
631 | return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP]; | |
632 | } | |
633 | ||
634 | class OSDMonitor *osdmon() { | |
635 | return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP]; | |
636 | } | |
637 | ||
638 | class AuthMonitor *authmon() { | |
639 | return (class AuthMonitor *)paxos_service[PAXOS_AUTH]; | |
640 | } | |
641 | ||
642 | class LogMonitor *logmon() { | |
643 | return (class LogMonitor*) paxos_service[PAXOS_LOG]; | |
644 | } | |
645 | ||
646 | class MgrMonitor *mgrmon() { | |
647 | return (class MgrMonitor*) paxos_service[PAXOS_MGR]; | |
648 | } | |
649 | ||
31f18b77 FG |
650 | class MgrStatMonitor *mgrstatmon() { |
651 | return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT]; | |
652 | } | |
653 | ||
224ce89b WB |
654 | class MgrStatMonitor *healthmon() { |
655 | return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT]; | |
656 | } | |
657 | ||
7c673cae FG |
658 | friend class Paxos; |
659 | friend class OSDMonitor; | |
660 | friend class MDSMonitor; | |
661 | friend class MonmapMonitor; | |
662 | friend class PGMonitor; | |
663 | friend class LogMonitor; | |
664 | friend class ConfigKeyService; | |
665 | ||
666 | QuorumService *health_monitor; | |
667 | QuorumService *config_key_service; | |
668 | ||
669 | // -- sessions -- | |
670 | MonSessionMap session_map; | |
671 | Mutex session_map_lock{"Monitor::session_map_lock"}; | |
672 | AdminSocketHook *admin_hook; | |
673 | ||
674 | template<typename Func, typename...Args> | |
675 | void with_session_map(Func&& func) { | |
676 | Mutex::Locker l(session_map_lock); | |
677 | std::forward<Func>(func)(session_map); | |
678 | } | |
679 | void send_latest_monmap(Connection *con); | |
680 | ||
681 | // messages | |
682 | void handle_get_version(MonOpRequestRef op); | |
683 | void handle_subscribe(MonOpRequestRef op); | |
684 | void handle_mon_get_map(MonOpRequestRef op); | |
685 | ||
686 | static void _generate_command_map(map<string,cmd_vartype>& cmdmap, | |
687 | map<string,string> &param_str_map); | |
c07f9fc5 FG |
688 | static const MonCommand *_get_moncommand( |
689 | const string &cmd_prefix, | |
690 | const MonCommand *cmds, | |
691 | int cmds_size); | |
7c673cae FG |
692 | bool _allowed_command(MonSession *s, string &module, string &prefix, |
693 | const map<string,cmd_vartype>& cmdmap, | |
694 | const map<string,string>& param_str_map, | |
695 | const MonCommand *this_cmd); | |
696 | void get_mon_status(Formatter *f, ostream& ss); | |
697 | void _quorum_status(Formatter *f, ostream& ss); | |
698 | bool _add_bootstrap_peer_hint(string cmd, cmdmap_t& cmdmap, ostream& ss); | |
699 | void handle_command(MonOpRequestRef op); | |
700 | void handle_route(MonOpRequestRef op); | |
701 | ||
702 | void handle_mon_metadata(MonOpRequestRef op); | |
703 | int get_mon_metadata(int mon, Formatter *f, ostream& err); | |
704 | int print_nodes(Formatter *f, ostream& err); | |
705 | ||
706 | // Accumulate metadata across calls to update_mon_metadata | |
224ce89b | 707 | map<int, Metadata> mon_metadata; |
7c673cae FG |
708 | map<int, Metadata> pending_metadata; |
709 | ||
710 | /** | |
711 | * | |
712 | */ | |
713 | struct health_cache_t { | |
714 | health_status_t overall; | |
715 | string summary; | |
716 | ||
717 | void reset() { | |
718 | // health_status_t doesn't really have a NONE value and we're not | |
719 | // okay with setting something else (say, HEALTH_ERR). so just | |
720 | // leave it be. | |
721 | summary.clear(); | |
722 | } | |
723 | } health_status_cache; | |
724 | ||
725 | Context *health_tick_event = nullptr; | |
726 | Context *health_interval_event = nullptr; | |
727 | ||
728 | void health_tick_start(); | |
729 | void health_tick_stop(); | |
730 | utime_t health_interval_calc_next_update(); | |
731 | void health_interval_start(); | |
732 | void health_interval_stop(); | |
733 | void health_events_cleanup(); | |
734 | ||
735 | void health_to_clog_update_conf(const std::set<std::string> &changed); | |
736 | ||
737 | void do_health_to_clog_interval(); | |
738 | void do_health_to_clog(bool force = false); | |
739 | ||
740 | /** | |
741 | * Generate health report | |
742 | * | |
743 | * @param status one-line status summary | |
744 | * @param detailbl optional bufferlist* to fill with a detailed report | |
745 | * @returns health status | |
746 | */ | |
747 | health_status_t get_health(list<string>& status, bufferlist *detailbl, | |
748 | Formatter *f); | |
224ce89b WB |
749 | |
750 | health_status_t get_health_status( | |
751 | bool want_detail, | |
752 | Formatter *f, | |
753 | std::string *plain, | |
754 | const char *sep1 = " ", | |
755 | const char *sep2 = "; "); | |
756 | void log_health( | |
757 | const health_check_map_t& updated, | |
758 | const health_check_map_t& previous, | |
759 | MonitorDBStore::TransactionRef t); | |
760 | ||
7c673cae FG |
761 | void get_cluster_status(stringstream &ss, Formatter *f); |
762 | ||
763 | void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version); | |
764 | void reply_command(MonOpRequestRef op, int rc, const string &rs, bufferlist& rdata, version_t version); | |
765 | ||
766 | ||
767 | void handle_probe(MonOpRequestRef op); | |
768 | /** | |
769 | * Handle a Probe Operation, replying with our name, quorum and known versions. | |
770 | * | |
771 | * We use the MMonProbe message class for anything and everything related with | |
772 | * Monitor probing. One of the operations relates directly with the probing | |
773 | * itself, in which we receive a probe request and to which we reply with | |
774 | * our name, our quorum and the known versions for each Paxos service. Thus the | |
775 | * redundant function name. This reply will obviously be sent to the one | |
776 | * probing/requesting these infos. | |
777 | * | |
778 | * @todo Add @pre and @post | |
779 | * | |
780 | * @param op Request carrying a Probe message, with an operation of type Probe. | |
781 | */ | |
782 | void handle_probe_probe(MonOpRequestRef op); | |
783 | void handle_probe_reply(MonOpRequestRef op); | |
784 | ||
785 | // request routing | |
786 | struct RoutedRequest { | |
787 | uint64_t tid; | |
788 | bufferlist request_bl; | |
789 | MonSession *session; | |
790 | ConnectionRef con; | |
791 | uint64_t con_features; | |
792 | entity_inst_t client_inst; | |
793 | MonOpRequestRef op; | |
794 | ||
795 | RoutedRequest() : tid(0), session(NULL), con_features(0) {} | |
796 | ~RoutedRequest() { | |
797 | if (session) | |
798 | session->put(); | |
799 | } | |
800 | }; | |
801 | uint64_t routed_request_tid; | |
802 | map<uint64_t, RoutedRequest*> routed_requests; | |
803 | ||
804 | void forward_request_leader(MonOpRequestRef op); | |
805 | void handle_forward(MonOpRequestRef op); | |
806 | void try_send_message(Message *m, const entity_inst_t& to); | |
807 | void send_reply(MonOpRequestRef op, Message *reply); | |
808 | void no_reply(MonOpRequestRef op); | |
809 | void resend_routed_requests(); | |
810 | void remove_session(MonSession *s); | |
811 | void remove_all_sessions(); | |
812 | void waitlist_or_zap_client(MonOpRequestRef op); | |
813 | ||
814 | void send_command(const entity_inst_t& inst, | |
815 | const vector<string>& com); | |
816 | ||
817 | public: | |
818 | struct C_Command : public C_MonOp { | |
819 | Monitor *mon; | |
820 | int rc; | |
821 | string rs; | |
822 | bufferlist rdata; | |
823 | version_t version; | |
824 | C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, version_t v) : | |
825 | C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v){} | |
826 | C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, bufferlist rd, version_t v) : | |
827 | C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v){} | |
828 | ||
829 | void _finish(int r) override { | |
830 | MMonCommand *m = static_cast<MMonCommand*>(op->get_req()); | |
831 | if (r >= 0) { | |
832 | ostringstream ss; | |
833 | if (!op->get_req()->get_connection()) { | |
834 | ss << "connection dropped for command "; | |
835 | } else { | |
836 | MonSession *s = op->get_session(); | |
837 | ||
838 | // if client drops we may not have a session to draw information from. | |
839 | if (s) { | |
840 | ss << "from='" << s->inst << "' " | |
841 | << "entity='" << s->entity_name << "' "; | |
842 | } else { | |
843 | ss << "session dropped for command "; | |
844 | } | |
845 | } | |
846 | ss << "cmd='" << m->cmd << "': finished"; | |
847 | ||
848 | mon->audit_clog->info() << ss.str(); | |
849 | mon->reply_command(op, rc, rs, rdata, version); | |
850 | } | |
851 | else if (r == -ECANCELED) | |
852 | return; | |
853 | else if (r == -EAGAIN) | |
854 | mon->dispatch_op(op); | |
855 | else | |
856 | assert(0 == "bad C_Command return value"); | |
857 | } | |
858 | }; | |
859 | ||
860 | private: | |
861 | class C_RetryMessage : public C_MonOp { | |
862 | Monitor *mon; | |
863 | public: | |
864 | C_RetryMessage(Monitor *m, MonOpRequestRef op) : | |
865 | C_MonOp(op), mon(m) { } | |
866 | ||
867 | void _finish(int r) override { | |
868 | if (r == -EAGAIN || r >= 0) | |
869 | mon->dispatch_op(op); | |
870 | else if (r == -ECANCELED) | |
871 | return; | |
872 | else | |
873 | assert(0 == "bad C_RetryMessage return value"); | |
874 | } | |
875 | }; | |
876 | ||
877 | //ms_dispatch handles a lot of logic and we want to reuse it | |
878 | //on forwarded messages, so we create a non-locking version for this class | |
879 | void _ms_dispatch(Message *m); | |
880 | bool ms_dispatch(Message *m) override { | |
881 | lock.Lock(); | |
882 | _ms_dispatch(m); | |
883 | lock.Unlock(); | |
884 | return true; | |
885 | } | |
886 | void dispatch_op(MonOpRequestRef op); | |
887 | //mon_caps is used for un-connected messages from monitors | |
888 | MonCap * mon_caps; | |
889 | bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override; | |
890 | bool ms_verify_authorizer(Connection *con, int peer_type, | |
891 | int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply, | |
892 | bool& isvalid, CryptoKey& session_key) override; | |
893 | bool ms_handle_reset(Connection *con) override; | |
894 | void ms_handle_remote_reset(Connection *con) override {} | |
895 | bool ms_handle_refused(Connection *con) override; | |
896 | ||
897 | int write_default_keyring(bufferlist& bl); | |
898 | void extract_save_mon_key(KeyRing& keyring); | |
899 | ||
224ce89b | 900 | void collect_metadata(Metadata *m); |
7c673cae | 901 | void update_mon_metadata(int from, Metadata&& m); |
224ce89b | 902 | int load_metadata(); |
31f18b77 | 903 | void count_metadata(const string& field, Formatter *f); |
c07f9fc5 | 904 | void count_metadata(const string& field, map<string,int> *out); |
7c673cae FG |
905 | |
906 | // features | |
907 | static CompatSet get_initial_supported_features(); | |
908 | static CompatSet get_supported_features(); | |
909 | static CompatSet get_legacy_features(); | |
910 | /// read the ondisk features into the CompatSet pointed to by read_features | |
911 | static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features); | |
912 | void read_features(); | |
913 | void write_features(MonitorDBStore::TransactionRef t); | |
914 | ||
915 | OpTracker op_tracker; | |
916 | ||
917 | public: | |
918 | Monitor(CephContext *cct_, string nm, MonitorDBStore *s, | |
919 | Messenger *m, Messenger *mgr_m, MonMap *map); | |
920 | ~Monitor() override; | |
921 | ||
922 | static int check_features(MonitorDBStore *store); | |
923 | ||
924 | // config observer | |
925 | const char** get_tracked_conf_keys() const override; | |
926 | void handle_conf_change(const struct md_config_t *conf, | |
927 | const std::set<std::string> &changed) override; | |
928 | ||
929 | void update_log_clients(); | |
930 | int sanitize_options(); | |
931 | int preinit(); | |
932 | int init(); | |
933 | void init_paxos(); | |
934 | void refresh_from_paxos(bool *need_bootstrap); | |
935 | void shutdown(); | |
936 | void tick(); | |
937 | ||
938 | void handle_signal(int sig); | |
939 | ||
940 | int mkfs(bufferlist& osdmapbl); | |
941 | ||
942 | /** | |
943 | * check cluster_fsid file | |
944 | * | |
945 | * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code | |
946 | */ | |
947 | int check_fsid(); | |
948 | ||
949 | /** | |
950 | * write cluster_fsid file | |
951 | * | |
952 | * @return 0 on success, or negative error code | |
953 | */ | |
954 | int write_fsid(); | |
955 | int write_fsid(MonitorDBStore::TransactionRef t); | |
956 | ||
957 | void do_admin_command(std::string command, cmdmap_t& cmdmap, | |
958 | std::string format, ostream& ss); | |
959 | ||
960 | private: | |
961 | // don't allow copying | |
962 | Monitor(const Monitor& rhs); | |
963 | Monitor& operator=(const Monitor &rhs); | |
964 | ||
965 | public: | |
c07f9fc5 | 966 | static void format_command_descriptions(const std::vector<MonCommand> &commands, |
7c673cae FG |
967 | Formatter *f, |
968 | bufferlist *rdata, | |
969 | bool hide_mgr_flag=false); | |
970 | void get_locally_supported_monitor_commands(const MonCommand **cmds, int *count); | |
971 | /// the Monitor owns this pointer once you pass it in | |
972 | void set_leader_supported_commands(const MonCommand *cmds, int size); | |
973 | static bool is_keyring_required(); | |
974 | }; | |
975 | ||
976 | #define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature (1, "initial feature set (~v.18)") | |
977 | #define CEPH_MON_FEATURE_INCOMPAT_GV CompatSet::Feature (2, "global version sequencing (v0.52)") | |
978 | #define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS CompatSet::Feature (3, "single paxos with k/v store (v0.\?)") | |
979 | #define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES CompatSet::Feature(4, "support erasure code pools") | |
980 | #define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC CompatSet::Feature(5, "new-style osdmap encoding") | |
981 | #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code") | |
982 | #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code") | |
983 | #define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features") | |
984 | // make sure you add your feature to Monitor::get_supported_features | |
985 | ||
7c673cae | 986 | |
7c673cae FG |
987 | |
988 | #endif |