]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/HealthMonitor.cc
update sources to ceph Nautilus 14.2.1
[ceph.git] / ceph / src / mon / HealthMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2013 Inktank, Inc
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
7c673cae
FG
15#include <stdlib.h>
16#include <limits.h>
224ce89b 17#include <sstream>
11fdf7f2 18#include <regex>
7c673cae 19
11fdf7f2 20#include "include/ceph_assert.h"
224ce89b 21#include "include/stringify.h"
7c673cae
FG
22
23#include "mon/Monitor.h"
7c673cae 24#include "mon/HealthMonitor.h"
7c673cae 25
224ce89b
WB
26#include "messages/MMonHealthChecks.h"
27
7c673cae 28#include "common/Formatter.h"
7c673cae
FG
29
30#define dout_subsys ceph_subsys_mon
31#undef dout_prefix
32#define dout_prefix _prefix(_dout, mon, this)
33static ostream& _prefix(std::ostream *_dout, const Monitor *mon,
34 const HealthMonitor *hmon) {
35 return *_dout << "mon." << mon->name << "@" << mon->rank
224ce89b
WB
36 << "(" << mon->get_state_name() << ").health ";
37}
38
39HealthMonitor::HealthMonitor(Monitor *m, Paxos *p, const string& service_name)
40 : PaxosService(m, p, service_name) {
7c673cae
FG
41}
42
43void HealthMonitor::init()
44{
45 dout(10) << __func__ << dendl;
224ce89b
WB
46}
47
48void HealthMonitor::create_initial()
49{
50 dout(10) << __func__ << dendl;
51}
52
53void HealthMonitor::update_from_paxos(bool *need_bootstrap)
54{
55 version = get_last_committed();
56 dout(10) << __func__ << dendl;
57 load_health();
58
59 bufferlist qbl;
60 mon->store->get(service_name, "quorum", qbl);
61 if (qbl.length()) {
11fdf7f2
TL
62 auto p = qbl.cbegin();
63 decode(quorum_checks, p);
224ce89b
WB
64 } else {
65 quorum_checks.clear();
66 }
67
68 bufferlist lbl;
69 mon->store->get(service_name, "leader", lbl);
70 if (lbl.length()) {
11fdf7f2
TL
71 auto p = lbl.cbegin();
72 decode(leader_checks, p);
224ce89b
WB
73 } else {
74 leader_checks.clear();
75 }
7c673cae 76
224ce89b
WB
77 dout(20) << "dump:";
78 JSONFormatter jf(true);
79 jf.open_object_section("health");
80 jf.open_object_section("quorum_health");
81 for (auto& p : quorum_checks) {
82 string s = string("mon.") + stringify(p.first);
83 jf.dump_object(s.c_str(), p.second);
7c673cae 84 }
224ce89b
WB
85 jf.close_section();
86 jf.dump_object("leader_health", leader_checks);
87 jf.close_section();
88 jf.flush(*_dout);
89 *_dout << dendl;
7c673cae
FG
90}
91
224ce89b 92void HealthMonitor::create_pending()
7c673cae 93{
224ce89b 94 dout(10) << " " << version << dendl;
7c673cae
FG
95}
96
224ce89b
WB
97void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t)
98{
99 ++version;
100 dout(10) << " " << version << dendl;
101 put_last_committed(t, version);
102
103 bufferlist qbl;
11fdf7f2 104 encode(quorum_checks, qbl);
224ce89b
WB
105 t->put(service_name, "quorum", qbl);
106 bufferlist lbl;
11fdf7f2 107 encode(leader_checks, lbl);
224ce89b
WB
108 t->put(service_name, "leader", lbl);
109
110 health_check_map_t pending_health;
111
112 // combine per-mon details carefully...
113 map<string,set<string>> names; // code -> <mon names>
114 for (auto p : quorum_checks) {
115 for (auto q : p.second.checks) {
116 names[q.first].insert(mon->monmap->get_name(p.first));
117 }
118 pending_health.merge(p.second);
7c673cae 119 }
c07f9fc5 120 for (auto &p : pending_health.checks) {
11fdf7f2 121 p.second.summary = std::regex_replace(
224ce89b 122 p.second.summary,
11fdf7f2 123 std::regex("%hasorhave%"),
c07f9fc5 124 names[p.first].size() > 1 ? "have" : "has");
11fdf7f2 125 p.second.summary = std::regex_replace(
224ce89b 126 p.second.summary,
11fdf7f2
TL
127 std::regex("%names%"), stringify(names[p.first]));
128 p.second.summary = std::regex_replace(
224ce89b 129 p.second.summary,
11fdf7f2 130 std::regex("%plurals%"),
224ce89b 131 names[p.first].size() > 1 ? "s" : "");
11fdf7f2 132 p.second.summary = std::regex_replace(
224ce89b 133 p.second.summary,
11fdf7f2 134 std::regex("%isorare%"),
224ce89b
WB
135 names[p.first].size() > 1 ? "are" : "is");
136 }
137
138 pending_health.merge(leader_checks);
139 encode_health(pending_health, t);
7c673cae
FG
140}
141
11fdf7f2 142version_t HealthMonitor::get_trim_to() const
224ce89b
WB
143{
144 // we don't actually need *any* old states, but keep a few.
145 if (version > 5) {
146 return version - 5;
7c673cae 147 }
224ce89b 148 return 0;
7c673cae
FG
149}
150
224ce89b 151bool HealthMonitor::preprocess_query(MonOpRequestRef op)
7c673cae 152{
c07f9fc5
FG
153 return false;
154}
155
156bool HealthMonitor::prepare_update(MonOpRequestRef op)
157{
158 Message *m = op->get_req();
159 dout(7) << "prepare_update " << *m
160 << " from " << m->get_orig_source_inst() << dendl;
161 switch (m->get_type()) {
224ce89b 162 case MSG_MON_HEALTH_CHECKS:
c07f9fc5
FG
163 return prepare_health_checks(op);
164 default:
165 return false;
224ce89b 166 }
224ce89b
WB
167}
168
c07f9fc5 169bool HealthMonitor::prepare_health_checks(MonOpRequestRef op)
224ce89b
WB
170{
171 MMonHealthChecks *m = static_cast<MMonHealthChecks*>(op->get_req());
c07f9fc5
FG
172 // no need to check if it's changed, the peon has done so
173 quorum_checks[m->get_source().num()] = std::move(m->health_checks);
224ce89b
WB
174 return true;
175}
176
177void HealthMonitor::tick()
178{
179 if (!is_active()) {
180 return;
181 }
182 dout(10) << __func__ << dendl;
183 bool changed = false;
184 if (check_member_health()) {
185 changed = true;
186 }
c07f9fc5
FG
187 if (!mon->is_leader()) {
188 return;
189 }
190 if (check_leader_health()) {
191 changed = true;
7c673cae 192 }
224ce89b
WB
193 if (changed) {
194 propose_pending();
195 }
196}
197
198bool HealthMonitor::check_member_health()
199{
200 dout(20) << __func__ << dendl;
201 bool changed = false;
7c673cae 202
224ce89b
WB
203 // snapshot of usage
204 DataStats stats;
11fdf7f2 205 get_fs_stats(stats.fs_stats, g_conf()->mon_data.c_str());
224ce89b
WB
206 map<string,uint64_t> extra;
207 uint64_t store_size = mon->store->get_estimated_size(extra);
11fdf7f2 208 ceph_assert(store_size > 0);
224ce89b
WB
209 stats.store_stats.bytes_total = store_size;
210 stats.store_stats.bytes_sst = extra["sst"];
211 stats.store_stats.bytes_log = extra["log"];
212 stats.store_stats.bytes_misc = extra["misc"];
213 stats.last_update = ceph_clock_now();
214 dout(10) << __func__ << " avail " << stats.fs_stats.avail_percent << "%"
1adf2230
AA
215 << " total " << byte_u_t(stats.fs_stats.byte_total)
216 << ", used " << byte_u_t(stats.fs_stats.byte_used)
217 << ", avail " << byte_u_t(stats.fs_stats.byte_avail) << dendl;
224ce89b
WB
218
219 // MON_DISK_{LOW,CRIT,BIG}
220 health_check_map_t next;
11fdf7f2 221 if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_crit) {
224ce89b
WB
222 stringstream ss, ss2;
223 ss << "mon%plurals% %names% %isorare% very low on available space";
224 auto& d = next.add("MON_DISK_CRIT", HEALTH_ERR, ss.str());
225 ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent
226 << "% avail";
227 d.detail.push_back(ss2.str());
11fdf7f2 228 } else if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_warn) {
224ce89b
WB
229 stringstream ss, ss2;
230 ss << "mon%plurals% %names% %isorare% low on available space";
c07f9fc5 231 auto& d = next.add("MON_DISK_LOW", HEALTH_WARN, ss.str());
224ce89b
WB
232 ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent
233 << "% avail";
234 d.detail.push_back(ss2.str());
235 }
11fdf7f2 236 if (stats.store_stats.bytes_total >= g_conf()->mon_data_size_warn) {
224ce89b
WB
237 stringstream ss, ss2;
238 ss << "mon%plurals% %names% %isorare% using a lot of disk space";
239 auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str());
240 ss2 << "mon." << mon->name << " is "
1adf2230 241 << byte_u_t(stats.store_stats.bytes_total)
224ce89b 242 << " >= mon_data_size_warn ("
11fdf7f2 243 << byte_u_t(g_conf()->mon_data_size_warn) << ")";
224ce89b 244 d.detail.push_back(ss2.str());
7c673cae
FG
245 }
246
224ce89b
WB
247 // OSD_NO_DOWN_OUT_INTERVAL
248 {
249 // Warn if 'mon_osd_down_out_interval' is set to zero.
250 // Having this option set to zero on the leader acts much like the
251 // 'noout' flag. It's hard to figure out what's going wrong with clusters
252 // without the 'noout' flag set but acting like that just the same, so
253 // we report a HEALTH_WARN in case this option is set to zero.
254 // This is an ugly hack to get the warning out, but until we find a way
255 // to spread global options throughout the mon cluster and have all mons
256 // using a base set of the same options, we need to work around this sort
257 // of things.
258 // There's also the obvious drawback that if this is set on a single
259 // monitor on a 3-monitor cluster, this warning will only be shown every
260 // third monitor connection.
11fdf7f2
TL
261 if (g_conf()->mon_warn_on_osd_down_out_interval_zero &&
262 g_conf()->mon_osd_down_out_interval == 0) {
224ce89b 263 ostringstream ss, ds;
c07f9fc5 264 ss << "mon%plurals% %names% %hasorhave% mon_osd_down_out_interval set to 0";
224ce89b
WB
265 auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str());
266 ds << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
267 d.detail.push_back(ds.str());
268 }
269 }
270
c07f9fc5
FG
271 auto p = quorum_checks.find(mon->rank);
272 if (p == quorum_checks.end()) {
273 if (next.empty()) {
274 return false;
275 }
276 } else {
277 if (p->second == next) {
278 return false;
279 }
280 }
281
282 if (mon->is_leader()) {
283 // prepare to propose
284 quorum_checks[mon->rank] = next;
285 changed = true;
286 } else {
11fdf7f2
TL
287 // tell the leader
288 mon->send_mon_message(new MMonHealthChecks(next), mon->get_leader());
c07f9fc5
FG
289 }
290
224ce89b 291 return changed;
7c673cae
FG
292}
293
224ce89b
WB
294bool HealthMonitor::check_leader_health()
295{
296 dout(20) << __func__ << dendl;
297 bool changed = false;
298
299 // prune quorum_health
300 {
301 auto& qset = mon->get_quorum();
302 auto p = quorum_checks.begin();
303 while (p != quorum_checks.end()) {
304 if (qset.count(p->first) == 0) {
305 p = quorum_checks.erase(p);
306 changed = true;
307 } else {
308 ++p;
309 }
310 }
311 }
312
313 health_check_map_t next;
314
315 // MON_DOWN
316 {
317 int max = mon->monmap->size();
318 int actual = mon->get_quorum().size();
319 if (actual < max) {
320 ostringstream ss;
321 ss << (max-actual) << "/" << max << " mons down, quorum "
322 << mon->get_quorum_names();
323 auto& d = next.add("MON_DOWN", HEALTH_WARN, ss.str());
324 set<int> q = mon->get_quorum();
325 for (int i=0; i<max; i++) {
326 if (q.count(i) == 0) {
327 ostringstream ss;
328 ss << "mon." << mon->monmap->get_name(i) << " (rank " << i
11fdf7f2 329 << ") addr " << mon->monmap->get_addrs(i)
224ce89b
WB
330 << " is down (out of quorum)";
331 d.detail.push_back(ss.str());
332 }
333 }
334 }
335 }
336
337 // MON_CLOCK_SKEW
338 if (!mon->timecheck_skews.empty()) {
339 list<string> warns;
340 list<string> details;
11fdf7f2
TL
341 for (auto& i : mon->timecheck_skews) {
342 double skew = i.second;
343 double latency = mon->timecheck_latencies[i.first];
344 string name = mon->monmap->get_name(i.first);
224ce89b
WB
345 ostringstream tcss;
346 health_status_t tcstatus = mon->timecheck_status(tcss, skew, latency);
347 if (tcstatus != HEALTH_OK) {
348 warns.push_back(name);
349 ostringstream tmp_ss;
11fdf7f2 350 tmp_ss << "mon." << name << " " << tcss.str()
224ce89b
WB
351 << " (latency " << latency << "s)";
352 details.push_back(tmp_ss.str());
353 }
354 }
355 if (!warns.empty()) {
356 ostringstream ss;
357 ss << "clock skew detected on";
358 while (!warns.empty()) {
359 ss << " mon." << warns.front();
360 warns.pop_front();
361 if (!warns.empty())
362 ss << ",";
363 }
c07f9fc5 364 auto& d = next.add("MON_CLOCK_SKEW", HEALTH_WARN, ss.str());
224ce89b
WB
365 d.detail.swap(details);
366 }
367 }
368
11fdf7f2
TL
369 // MON_MSGR2_NOT_ENABLED
370 if (g_conf().get_val<bool>("ms_bind_msgr2") &&
371 g_conf().get_val<bool>("mon_warn_on_msgr2_not_enabled") &&
372 mon->monmap->get_required_features().contains_all(
373 ceph::features::mon::FEATURE_NAUTILUS)) {
374 list<string> details;
375 for (auto& i : mon->monmap->mon_info) {
376 if (!i.second.public_addrs.has_msgr2()) {
377 ostringstream ds;
378 ds << "mon." << i.first << " is not bound to a msgr2 port, only "
379 << i.second.public_addrs;
380 details.push_back(ds.str());
381 }
382 }
383 if (!details.empty()) {
384 ostringstream ss;
385 ss << details.size() << " monitors have not enabled msgr2";
386 auto& d = next.add("MON_MSGR2_NOT_ENABLED", HEALTH_WARN, ss.str());
387 d.detail.swap(details);
388 }
389 }
390
224ce89b
WB
391 if (next != leader_checks) {
392 changed = true;
393 leader_checks = next;
394 }
395 return changed;
396}