]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/HealthMonitor.cc
update sources to v12.1.1
[ceph.git] / ceph / src / mon / HealthMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2013 Inktank, Inc
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <stdlib.h>
16 #include <limits.h>
17 #include <sstream>
18 #include <boost/regex.hpp>
19
20 #include "include/assert.h"
21 #include "include/stringify.h"
22
23 #include "mon/Monitor.h"
24 #include "mon/HealthService.h"
25 #include "mon/HealthMonitor.h"
26 #include "mon/DataHealthService.h"
27
28 #include "messages/MMonHealth.h"
29 #include "messages/MMonHealthChecks.h"
30
31 #include "common/Formatter.h"
32
33 #define dout_subsys ceph_subsys_mon
34 #undef dout_prefix
35 #define dout_prefix _prefix(_dout, mon, this)
36 static ostream& _prefix(std::ostream *_dout, const Monitor *mon,
37 const HealthMonitor *hmon) {
38 return *_dout << "mon." << mon->name << "@" << mon->rank
39 << "(" << mon->get_state_name() << ").health ";
40 }
41
42 HealthMonitor::HealthMonitor(Monitor *m, Paxos *p, const string& service_name)
43 : PaxosService(m, p, service_name) {
44 }
45
46 void HealthMonitor::init()
47 {
48 dout(10) << __func__ << dendl;
49 }
50
51 void HealthMonitor::create_initial()
52 {
53 dout(10) << __func__ << dendl;
54 }
55
56 void HealthMonitor::update_from_paxos(bool *need_bootstrap)
57 {
58 version = get_last_committed();
59 dout(10) << __func__ << dendl;
60 load_health();
61
62 bufferlist qbl;
63 mon->store->get(service_name, "quorum", qbl);
64 if (qbl.length()) {
65 auto p = qbl.begin();
66 ::decode(quorum_checks, p);
67 } else {
68 quorum_checks.clear();
69 }
70
71 bufferlist lbl;
72 mon->store->get(service_name, "leader", lbl);
73 if (lbl.length()) {
74 auto p = lbl.begin();
75 ::decode(leader_checks, p);
76 } else {
77 leader_checks.clear();
78 }
79
80 dout(20) << "dump:";
81 JSONFormatter jf(true);
82 jf.open_object_section("health");
83 jf.open_object_section("quorum_health");
84 for (auto& p : quorum_checks) {
85 string s = string("mon.") + stringify(p.first);
86 jf.dump_object(s.c_str(), p.second);
87 }
88 jf.close_section();
89 jf.dump_object("leader_health", leader_checks);
90 jf.close_section();
91 jf.flush(*_dout);
92 *_dout << dendl;
93 }
94
95 void HealthMonitor::create_pending()
96 {
97 dout(10) << " " << version << dendl;
98 }
99
100 void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t)
101 {
102 ++version;
103 dout(10) << " " << version << dendl;
104 put_last_committed(t, version);
105
106 bufferlist qbl;
107 ::encode(quorum_checks, qbl);
108 t->put(service_name, "quorum", qbl);
109 bufferlist lbl;
110 ::encode(leader_checks, lbl);
111 t->put(service_name, "leader", lbl);
112
113 health_check_map_t pending_health;
114
115 // combine per-mon details carefully...
116 map<string,set<string>> names; // code -> <mon names>
117 for (auto p : quorum_checks) {
118 for (auto q : p.second.checks) {
119 names[q.first].insert(mon->monmap->get_name(p.first));
120 }
121 pending_health.merge(p.second);
122 }
123 for (auto p : pending_health.checks) {
124 p.second.summary = boost::regex_replace(
125 p.second.summary,
126 boost::regex("%num%"), stringify(names[p.first].size()));
127 p.second.summary = boost::regex_replace(
128 p.second.summary,
129 boost::regex("%names%"), stringify(names[p.first]));
130 p.second.summary = boost::regex_replace(
131 p.second.summary,
132 boost::regex("%plurals%"),
133 names[p.first].size() > 1 ? "s" : "");
134 p.second.summary = boost::regex_replace(
135 p.second.summary,
136 boost::regex("%isorare%"),
137 names[p.first].size() > 1 ? "are" : "is");
138 }
139
140 pending_health.merge(leader_checks);
141 encode_health(pending_health, t);
142 }
143
144 version_t HealthMonitor::get_trim_to()
145 {
146 // we don't actually need *any* old states, but keep a few.
147 if (version > 5) {
148 return version - 5;
149 }
150 return 0;
151 }
152
153 bool HealthMonitor::preprocess_query(MonOpRequestRef op)
154 {
155 switch (op->get_req()->get_type()) {
156 case MSG_MON_HEALTH:
157 {
158 MMonHealth *hm = static_cast<MMonHealth*>(op->get_req());
159 int service_type = hm->get_service_type();
160 if (services.count(service_type) == 0) {
161 dout(1) << __func__ << " service type " << service_type
162 << " not registered -- drop message!" << dendl;
163 return false;
164 }
165 return services[service_type]->service_dispatch(op);
166 }
167
168 case MSG_MON_HEALTH_CHECKS:
169 return preprocess_health_checks(op);
170 }
171 return false;
172 }
173
174 bool HealthMonitor::prepare_update(MonOpRequestRef op)
175 {
176 return false;
177 }
178
179 bool HealthMonitor::preprocess_health_checks(MonOpRequestRef op)
180 {
181 MMonHealthChecks *m = static_cast<MMonHealthChecks*>(op->get_req());
182 quorum_checks[m->get_source().num()] = m->health_checks;
183 return true;
184 }
185
186 void HealthMonitor::tick()
187 {
188 if (!is_active()) {
189 return;
190 }
191 dout(10) << __func__ << dendl;
192 bool changed = false;
193 if (check_member_health()) {
194 changed = true;
195 }
196 if (mon->is_leader()) {
197 if (check_leader_health()) {
198 changed = true;
199 }
200 }
201 if (changed) {
202 propose_pending();
203 }
204 }
205
206 bool HealthMonitor::check_member_health()
207 {
208 dout(20) << __func__ << dendl;
209 bool changed = false;
210
211 // snapshot of usage
212 DataStats stats;
213 get_fs_stats(stats.fs_stats, g_conf->mon_data.c_str());
214 map<string,uint64_t> extra;
215 uint64_t store_size = mon->store->get_estimated_size(extra);
216 assert(store_size > 0);
217 stats.store_stats.bytes_total = store_size;
218 stats.store_stats.bytes_sst = extra["sst"];
219 stats.store_stats.bytes_log = extra["log"];
220 stats.store_stats.bytes_misc = extra["misc"];
221 stats.last_update = ceph_clock_now();
222 dout(10) << __func__ << " avail " << stats.fs_stats.avail_percent << "%"
223 << " total " << prettybyte_t(stats.fs_stats.byte_total)
224 << ", used " << prettybyte_t(stats.fs_stats.byte_used)
225 << ", avail " << prettybyte_t(stats.fs_stats.byte_avail) << dendl;
226
227 // MON_DISK_{LOW,CRIT,BIG}
228 health_check_map_t next;
229 if (stats.fs_stats.avail_percent <= g_conf->mon_data_avail_crit) {
230 stringstream ss, ss2;
231 ss << "mon%plurals% %names% %isorare% very low on available space";
232 auto& d = next.add("MON_DISK_CRIT", HEALTH_ERR, ss.str());
233 ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent
234 << "% avail";
235 d.detail.push_back(ss2.str());
236 } else if (stats.fs_stats.avail_percent <= g_conf->mon_data_avail_warn) {
237 stringstream ss, ss2;
238 ss << "mon%plurals% %names% %isorare% low on available space";
239 auto& d = next.add("MON_DISK_LOW", HEALTH_ERR, ss.str());
240 ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent
241 << "% avail";
242 d.detail.push_back(ss2.str());
243 }
244 if (stats.store_stats.bytes_total >= g_conf->mon_data_size_warn) {
245 stringstream ss, ss2;
246 ss << "mon%plurals% %names% %isorare% using a lot of disk space";
247 auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str());
248 ss2 << "mon." << mon->name << " is "
249 << prettybyte_t(stats.store_stats.bytes_total)
250 << " >= mon_data_size_warn ("
251 << prettybyte_t(g_conf->mon_data_size_warn) << ")";
252 d.detail.push_back(ss2.str());
253 }
254
255 auto p = quorum_checks.find(mon->rank);
256 if (p == quorum_checks.end() ||
257 p->second != next) {
258 if (mon->is_leader()) {
259 // prepare to propose
260 quorum_checks[mon->rank] = next;
261 changed = true;
262 } else {
263 // tell the leader
264 mon->messenger->send_message(new MMonHealthChecks(next),
265 mon->monmap->get_inst(mon->get_leader()));
266 }
267 }
268
269 // OSD_NO_DOWN_OUT_INTERVAL
270 {
271 // Warn if 'mon_osd_down_out_interval' is set to zero.
272 // Having this option set to zero on the leader acts much like the
273 // 'noout' flag. It's hard to figure out what's going wrong with clusters
274 // without the 'noout' flag set but acting like that just the same, so
275 // we report a HEALTH_WARN in case this option is set to zero.
276 // This is an ugly hack to get the warning out, but until we find a way
277 // to spread global options throughout the mon cluster and have all mons
278 // using a base set of the same options, we need to work around this sort
279 // of things.
280 // There's also the obvious drawback that if this is set on a single
281 // monitor on a 3-monitor cluster, this warning will only be shown every
282 // third monitor connection.
283 if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
284 g_conf->mon_osd_down_out_interval == 0) {
285 ostringstream ss, ds;
286 ss << "mon%plurals% %names %hasorhave% mon_osd_down_out_interval set to 0";
287 auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str());
288 ds << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
289 d.detail.push_back(ds.str());
290 }
291 }
292
293 return changed;
294 }
295
296 bool HealthMonitor::check_leader_health()
297 {
298 dout(20) << __func__ << dendl;
299 bool changed = false;
300
301 // prune quorum_health
302 {
303 auto& qset = mon->get_quorum();
304 auto p = quorum_checks.begin();
305 while (p != quorum_checks.end()) {
306 if (qset.count(p->first) == 0) {
307 p = quorum_checks.erase(p);
308 changed = true;
309 } else {
310 ++p;
311 }
312 }
313 }
314
315 health_check_map_t next;
316
317 // MON_DOWN
318 {
319 int max = mon->monmap->size();
320 int actual = mon->get_quorum().size();
321 if (actual < max) {
322 ostringstream ss;
323 ss << (max-actual) << "/" << max << " mons down, quorum "
324 << mon->get_quorum_names();
325 auto& d = next.add("MON_DOWN", HEALTH_WARN, ss.str());
326 set<int> q = mon->get_quorum();
327 for (int i=0; i<max; i++) {
328 if (q.count(i) == 0) {
329 ostringstream ss;
330 ss << "mon." << mon->monmap->get_name(i) << " (rank " << i
331 << ") addr " << mon->monmap->get_addr(i)
332 << " is down (out of quorum)";
333 d.detail.push_back(ss.str());
334 }
335 }
336 }
337 }
338
339 // MON_CLOCK_SKEW
340 if (!mon->timecheck_skews.empty()) {
341 list<string> warns;
342 list<string> details;
343 for (map<entity_inst_t,double>::iterator i = mon->timecheck_skews.begin();
344 i != mon->timecheck_skews.end(); ++i) {
345 entity_inst_t inst = i->first;
346 double skew = i->second;
347 double latency = mon->timecheck_latencies[inst];
348 string name = mon->monmap->get_name(inst.addr);
349 ostringstream tcss;
350 health_status_t tcstatus = mon->timecheck_status(tcss, skew, latency);
351 if (tcstatus != HEALTH_OK) {
352 warns.push_back(name);
353 ostringstream tmp_ss;
354 tmp_ss << "mon." << name
355 << " addr " << inst.addr << " " << tcss.str()
356 << " (latency " << latency << "s)";
357 details.push_back(tmp_ss.str());
358 }
359 }
360 if (!warns.empty()) {
361 ostringstream ss;
362 ss << "clock skew detected on";
363 while (!warns.empty()) {
364 ss << " mon." << warns.front();
365 warns.pop_front();
366 if (!warns.empty())
367 ss << ",";
368 }
369 auto& d = next.add("MON_CLOCK_SKEW", HEALTH_WARN,
370 "monitor clock skew detected");
371 d.detail.swap(details);
372 }
373 }
374
375 if (next != leader_checks) {
376 changed = true;
377 leader_checks = next;
378 }
379 return changed;
380 }