]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2013 Inktank, Inc | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
7c673cae FG |
15 | #include <stdlib.h> |
16 | #include <limits.h> | |
224ce89b | 17 | #include <sstream> |
11fdf7f2 | 18 | #include <regex> |
7c673cae | 19 | |
11fdf7f2 | 20 | #include "include/ceph_assert.h" |
224ce89b | 21 | #include "include/stringify.h" |
7c673cae FG |
22 | |
23 | #include "mon/Monitor.h" | |
7c673cae | 24 | #include "mon/HealthMonitor.h" |
7c673cae | 25 | |
224ce89b WB |
26 | #include "messages/MMonHealthChecks.h" |
27 | ||
7c673cae | 28 | #include "common/Formatter.h" |
7c673cae FG |
29 | |
30 | #define dout_subsys ceph_subsys_mon | |
31 | #undef dout_prefix | |
32 | #define dout_prefix _prefix(_dout, mon, this) | |
33 | static ostream& _prefix(std::ostream *_dout, const Monitor *mon, | |
34 | const HealthMonitor *hmon) { | |
35 | return *_dout << "mon." << mon->name << "@" << mon->rank | |
224ce89b WB |
36 | << "(" << mon->get_state_name() << ").health "; |
37 | } | |
38 | ||
39 | HealthMonitor::HealthMonitor(Monitor *m, Paxos *p, const string& service_name) | |
40 | : PaxosService(m, p, service_name) { | |
7c673cae FG |
41 | } |
42 | ||
43 | void HealthMonitor::init() | |
44 | { | |
45 | dout(10) << __func__ << dendl; | |
224ce89b WB |
46 | } |
47 | ||
48 | void HealthMonitor::create_initial() | |
49 | { | |
50 | dout(10) << __func__ << dendl; | |
51 | } | |
52 | ||
53 | void HealthMonitor::update_from_paxos(bool *need_bootstrap) | |
54 | { | |
55 | version = get_last_committed(); | |
56 | dout(10) << __func__ << dendl; | |
57 | load_health(); | |
58 | ||
59 | bufferlist qbl; | |
60 | mon->store->get(service_name, "quorum", qbl); | |
61 | if (qbl.length()) { | |
11fdf7f2 TL |
62 | auto p = qbl.cbegin(); |
63 | decode(quorum_checks, p); | |
224ce89b WB |
64 | } else { |
65 | quorum_checks.clear(); | |
66 | } | |
67 | ||
68 | bufferlist lbl; | |
69 | mon->store->get(service_name, "leader", lbl); | |
70 | if (lbl.length()) { | |
11fdf7f2 TL |
71 | auto p = lbl.cbegin(); |
72 | decode(leader_checks, p); | |
224ce89b WB |
73 | } else { |
74 | leader_checks.clear(); | |
75 | } | |
7c673cae | 76 | |
224ce89b WB |
77 | dout(20) << "dump:"; |
78 | JSONFormatter jf(true); | |
79 | jf.open_object_section("health"); | |
80 | jf.open_object_section("quorum_health"); | |
81 | for (auto& p : quorum_checks) { | |
82 | string s = string("mon.") + stringify(p.first); | |
83 | jf.dump_object(s.c_str(), p.second); | |
7c673cae | 84 | } |
224ce89b WB |
85 | jf.close_section(); |
86 | jf.dump_object("leader_health", leader_checks); | |
87 | jf.close_section(); | |
88 | jf.flush(*_dout); | |
89 | *_dout << dendl; | |
7c673cae FG |
90 | } |
91 | ||
224ce89b | 92 | void HealthMonitor::create_pending() |
7c673cae | 93 | { |
224ce89b | 94 | dout(10) << " " << version << dendl; |
7c673cae FG |
95 | } |
96 | ||
224ce89b WB |
97 | void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t) |
98 | { | |
99 | ++version; | |
100 | dout(10) << " " << version << dendl; | |
101 | put_last_committed(t, version); | |
102 | ||
103 | bufferlist qbl; | |
11fdf7f2 | 104 | encode(quorum_checks, qbl); |
224ce89b WB |
105 | t->put(service_name, "quorum", qbl); |
106 | bufferlist lbl; | |
11fdf7f2 | 107 | encode(leader_checks, lbl); |
224ce89b WB |
108 | t->put(service_name, "leader", lbl); |
109 | ||
110 | health_check_map_t pending_health; | |
111 | ||
112 | // combine per-mon details carefully... | |
113 | map<string,set<string>> names; // code -> <mon names> | |
114 | for (auto p : quorum_checks) { | |
115 | for (auto q : p.second.checks) { | |
116 | names[q.first].insert(mon->monmap->get_name(p.first)); | |
117 | } | |
118 | pending_health.merge(p.second); | |
7c673cae | 119 | } |
c07f9fc5 | 120 | for (auto &p : pending_health.checks) { |
11fdf7f2 | 121 | p.second.summary = std::regex_replace( |
224ce89b | 122 | p.second.summary, |
11fdf7f2 | 123 | std::regex("%hasorhave%"), |
c07f9fc5 | 124 | names[p.first].size() > 1 ? "have" : "has"); |
11fdf7f2 | 125 | p.second.summary = std::regex_replace( |
224ce89b | 126 | p.second.summary, |
11fdf7f2 TL |
127 | std::regex("%names%"), stringify(names[p.first])); |
128 | p.second.summary = std::regex_replace( | |
224ce89b | 129 | p.second.summary, |
11fdf7f2 | 130 | std::regex("%plurals%"), |
224ce89b | 131 | names[p.first].size() > 1 ? "s" : ""); |
11fdf7f2 | 132 | p.second.summary = std::regex_replace( |
224ce89b | 133 | p.second.summary, |
11fdf7f2 | 134 | std::regex("%isorare%"), |
224ce89b WB |
135 | names[p.first].size() > 1 ? "are" : "is"); |
136 | } | |
137 | ||
138 | pending_health.merge(leader_checks); | |
139 | encode_health(pending_health, t); | |
7c673cae FG |
140 | } |
141 | ||
11fdf7f2 | 142 | version_t HealthMonitor::get_trim_to() const |
224ce89b WB |
143 | { |
144 | // we don't actually need *any* old states, but keep a few. | |
145 | if (version > 5) { | |
146 | return version - 5; | |
7c673cae | 147 | } |
224ce89b | 148 | return 0; |
7c673cae FG |
149 | } |
150 | ||
224ce89b | 151 | bool HealthMonitor::preprocess_query(MonOpRequestRef op) |
7c673cae | 152 | { |
c07f9fc5 FG |
153 | return false; |
154 | } | |
155 | ||
156 | bool HealthMonitor::prepare_update(MonOpRequestRef op) | |
157 | { | |
158 | Message *m = op->get_req(); | |
159 | dout(7) << "prepare_update " << *m | |
160 | << " from " << m->get_orig_source_inst() << dendl; | |
161 | switch (m->get_type()) { | |
224ce89b | 162 | case MSG_MON_HEALTH_CHECKS: |
c07f9fc5 FG |
163 | return prepare_health_checks(op); |
164 | default: | |
165 | return false; | |
224ce89b | 166 | } |
224ce89b WB |
167 | } |
168 | ||
c07f9fc5 | 169 | bool HealthMonitor::prepare_health_checks(MonOpRequestRef op) |
224ce89b WB |
170 | { |
171 | MMonHealthChecks *m = static_cast<MMonHealthChecks*>(op->get_req()); | |
c07f9fc5 FG |
172 | // no need to check if it's changed, the peon has done so |
173 | quorum_checks[m->get_source().num()] = std::move(m->health_checks); | |
224ce89b WB |
174 | return true; |
175 | } | |
176 | ||
177 | void HealthMonitor::tick() | |
178 | { | |
179 | if (!is_active()) { | |
180 | return; | |
181 | } | |
182 | dout(10) << __func__ << dendl; | |
183 | bool changed = false; | |
184 | if (check_member_health()) { | |
185 | changed = true; | |
186 | } | |
c07f9fc5 FG |
187 | if (!mon->is_leader()) { |
188 | return; | |
189 | } | |
190 | if (check_leader_health()) { | |
191 | changed = true; | |
7c673cae | 192 | } |
224ce89b WB |
193 | if (changed) { |
194 | propose_pending(); | |
195 | } | |
196 | } | |
197 | ||
198 | bool HealthMonitor::check_member_health() | |
199 | { | |
200 | dout(20) << __func__ << dendl; | |
201 | bool changed = false; | |
7c673cae | 202 | |
224ce89b WB |
203 | // snapshot of usage |
204 | DataStats stats; | |
11fdf7f2 | 205 | get_fs_stats(stats.fs_stats, g_conf()->mon_data.c_str()); |
224ce89b WB |
206 | map<string,uint64_t> extra; |
207 | uint64_t store_size = mon->store->get_estimated_size(extra); | |
11fdf7f2 | 208 | ceph_assert(store_size > 0); |
224ce89b WB |
209 | stats.store_stats.bytes_total = store_size; |
210 | stats.store_stats.bytes_sst = extra["sst"]; | |
211 | stats.store_stats.bytes_log = extra["log"]; | |
212 | stats.store_stats.bytes_misc = extra["misc"]; | |
213 | stats.last_update = ceph_clock_now(); | |
214 | dout(10) << __func__ << " avail " << stats.fs_stats.avail_percent << "%" | |
1adf2230 AA |
215 | << " total " << byte_u_t(stats.fs_stats.byte_total) |
216 | << ", used " << byte_u_t(stats.fs_stats.byte_used) | |
217 | << ", avail " << byte_u_t(stats.fs_stats.byte_avail) << dendl; | |
224ce89b WB |
218 | |
219 | // MON_DISK_{LOW,CRIT,BIG} | |
220 | health_check_map_t next; | |
11fdf7f2 | 221 | if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_crit) { |
224ce89b WB |
222 | stringstream ss, ss2; |
223 | ss << "mon%plurals% %names% %isorare% very low on available space"; | |
224 | auto& d = next.add("MON_DISK_CRIT", HEALTH_ERR, ss.str()); | |
225 | ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent | |
226 | << "% avail"; | |
227 | d.detail.push_back(ss2.str()); | |
11fdf7f2 | 228 | } else if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_warn) { |
224ce89b WB |
229 | stringstream ss, ss2; |
230 | ss << "mon%plurals% %names% %isorare% low on available space"; | |
c07f9fc5 | 231 | auto& d = next.add("MON_DISK_LOW", HEALTH_WARN, ss.str()); |
224ce89b WB |
232 | ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent |
233 | << "% avail"; | |
234 | d.detail.push_back(ss2.str()); | |
235 | } | |
11fdf7f2 | 236 | if (stats.store_stats.bytes_total >= g_conf()->mon_data_size_warn) { |
224ce89b WB |
237 | stringstream ss, ss2; |
238 | ss << "mon%plurals% %names% %isorare% using a lot of disk space"; | |
239 | auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str()); | |
240 | ss2 << "mon." << mon->name << " is " | |
1adf2230 | 241 | << byte_u_t(stats.store_stats.bytes_total) |
224ce89b | 242 | << " >= mon_data_size_warn (" |
11fdf7f2 | 243 | << byte_u_t(g_conf()->mon_data_size_warn) << ")"; |
224ce89b | 244 | d.detail.push_back(ss2.str()); |
7c673cae FG |
245 | } |
246 | ||
224ce89b WB |
247 | // OSD_NO_DOWN_OUT_INTERVAL |
248 | { | |
249 | // Warn if 'mon_osd_down_out_interval' is set to zero. | |
250 | // Having this option set to zero on the leader acts much like the | |
251 | // 'noout' flag. It's hard to figure out what's going wrong with clusters | |
252 | // without the 'noout' flag set but acting like that just the same, so | |
253 | // we report a HEALTH_WARN in case this option is set to zero. | |
254 | // This is an ugly hack to get the warning out, but until we find a way | |
255 | // to spread global options throughout the mon cluster and have all mons | |
256 | // using a base set of the same options, we need to work around this sort | |
257 | // of things. | |
258 | // There's also the obvious drawback that if this is set on a single | |
259 | // monitor on a 3-monitor cluster, this warning will only be shown every | |
260 | // third monitor connection. | |
11fdf7f2 TL |
261 | if (g_conf()->mon_warn_on_osd_down_out_interval_zero && |
262 | g_conf()->mon_osd_down_out_interval == 0) { | |
224ce89b | 263 | ostringstream ss, ds; |
c07f9fc5 | 264 | ss << "mon%plurals% %names% %hasorhave% mon_osd_down_out_interval set to 0"; |
224ce89b WB |
265 | auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str()); |
266 | ds << "mon." << mon->name << " has mon_osd_down_out_interval set to 0"; | |
267 | d.detail.push_back(ds.str()); | |
268 | } | |
269 | } | |
270 | ||
c07f9fc5 FG |
271 | auto p = quorum_checks.find(mon->rank); |
272 | if (p == quorum_checks.end()) { | |
273 | if (next.empty()) { | |
274 | return false; | |
275 | } | |
276 | } else { | |
277 | if (p->second == next) { | |
278 | return false; | |
279 | } | |
280 | } | |
281 | ||
282 | if (mon->is_leader()) { | |
283 | // prepare to propose | |
284 | quorum_checks[mon->rank] = next; | |
285 | changed = true; | |
286 | } else { | |
11fdf7f2 TL |
287 | // tell the leader |
288 | mon->send_mon_message(new MMonHealthChecks(next), mon->get_leader()); | |
c07f9fc5 FG |
289 | } |
290 | ||
224ce89b | 291 | return changed; |
7c673cae FG |
292 | } |
293 | ||
224ce89b WB |
294 | bool HealthMonitor::check_leader_health() |
295 | { | |
296 | dout(20) << __func__ << dendl; | |
297 | bool changed = false; | |
298 | ||
299 | // prune quorum_health | |
300 | { | |
301 | auto& qset = mon->get_quorum(); | |
302 | auto p = quorum_checks.begin(); | |
303 | while (p != quorum_checks.end()) { | |
304 | if (qset.count(p->first) == 0) { | |
305 | p = quorum_checks.erase(p); | |
306 | changed = true; | |
307 | } else { | |
308 | ++p; | |
309 | } | |
310 | } | |
311 | } | |
312 | ||
313 | health_check_map_t next; | |
314 | ||
315 | // MON_DOWN | |
316 | { | |
317 | int max = mon->monmap->size(); | |
318 | int actual = mon->get_quorum().size(); | |
319 | if (actual < max) { | |
320 | ostringstream ss; | |
321 | ss << (max-actual) << "/" << max << " mons down, quorum " | |
322 | << mon->get_quorum_names(); | |
323 | auto& d = next.add("MON_DOWN", HEALTH_WARN, ss.str()); | |
324 | set<int> q = mon->get_quorum(); | |
325 | for (int i=0; i<max; i++) { | |
326 | if (q.count(i) == 0) { | |
327 | ostringstream ss; | |
328 | ss << "mon." << mon->monmap->get_name(i) << " (rank " << i | |
11fdf7f2 | 329 | << ") addr " << mon->monmap->get_addrs(i) |
224ce89b WB |
330 | << " is down (out of quorum)"; |
331 | d.detail.push_back(ss.str()); | |
332 | } | |
333 | } | |
334 | } | |
335 | } | |
336 | ||
337 | // MON_CLOCK_SKEW | |
338 | if (!mon->timecheck_skews.empty()) { | |
339 | list<string> warns; | |
340 | list<string> details; | |
11fdf7f2 TL |
341 | for (auto& i : mon->timecheck_skews) { |
342 | double skew = i.second; | |
343 | double latency = mon->timecheck_latencies[i.first]; | |
344 | string name = mon->monmap->get_name(i.first); | |
224ce89b WB |
345 | ostringstream tcss; |
346 | health_status_t tcstatus = mon->timecheck_status(tcss, skew, latency); | |
347 | if (tcstatus != HEALTH_OK) { | |
348 | warns.push_back(name); | |
349 | ostringstream tmp_ss; | |
11fdf7f2 | 350 | tmp_ss << "mon." << name << " " << tcss.str() |
224ce89b WB |
351 | << " (latency " << latency << "s)"; |
352 | details.push_back(tmp_ss.str()); | |
353 | } | |
354 | } | |
355 | if (!warns.empty()) { | |
356 | ostringstream ss; | |
357 | ss << "clock skew detected on"; | |
358 | while (!warns.empty()) { | |
359 | ss << " mon." << warns.front(); | |
360 | warns.pop_front(); | |
361 | if (!warns.empty()) | |
362 | ss << ","; | |
363 | } | |
c07f9fc5 | 364 | auto& d = next.add("MON_CLOCK_SKEW", HEALTH_WARN, ss.str()); |
224ce89b WB |
365 | d.detail.swap(details); |
366 | } | |
367 | } | |
368 | ||
11fdf7f2 TL |
369 | // MON_MSGR2_NOT_ENABLED |
370 | if (g_conf().get_val<bool>("ms_bind_msgr2") && | |
371 | g_conf().get_val<bool>("mon_warn_on_msgr2_not_enabled") && | |
372 | mon->monmap->get_required_features().contains_all( | |
373 | ceph::features::mon::FEATURE_NAUTILUS)) { | |
374 | list<string> details; | |
375 | for (auto& i : mon->monmap->mon_info) { | |
376 | if (!i.second.public_addrs.has_msgr2()) { | |
377 | ostringstream ds; | |
378 | ds << "mon." << i.first << " is not bound to a msgr2 port, only " | |
379 | << i.second.public_addrs; | |
380 | details.push_back(ds.str()); | |
381 | } | |
382 | } | |
383 | if (!details.empty()) { | |
384 | ostringstream ss; | |
385 | ss << details.size() << " monitors have not enabled msgr2"; | |
386 | auto& d = next.add("MON_MSGR2_NOT_ENABLED", HEALTH_WARN, ss.str()); | |
387 | d.detail.swap(details); | |
388 | } | |
389 | } | |
390 | ||
224ce89b WB |
391 | if (next != leader_checks) { |
392 | changed = true; | |
393 | leader_checks = next; | |
394 | } | |
395 | return changed; | |
396 | } |