1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2013 Inktank, Inc
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
14 #include "include/memory.h"
27 #ifdef HAVE_SYS_MOUNT_H
28 #include <sys/mount.h>
31 #ifdef HAVE_SYS_PARAM_H
32 #include <sys/param.h>
35 #include "messages/MMonHealth.h"
36 #include "include/assert.h"
37 #include "common/Formatter.h"
38 #include "common/errno.h"
40 #include "mon/Monitor.h"
41 #include "mon/DataHealthService.h"
// Logging configuration for this translation unit: dout_subsys is defined as
// ceph_subsys_mon, and dout_prefix expands to a call to the file-local
// _prefix() helper with the arguments (_dout, mon, this), so it can only be
// expanded where `mon` and `this` are in scope.
43 #define dout_subsys ceph_subsys_mon
45 #define dout_prefix _prefix(_dout, mon, this)
// Writes the log-line prefix
//   "mon.<name>@<rank>(<state>).<service>(<epoch>) "
// to *_dout and returns a reference to that stream.
//
// @param _dout  stream to write to
// @param mon    monitor supplying name, rank and get_state_name()
// @param svc    service supplying get_name() and get_epoch()
//
// All three pointers are dereferenced in the lines shown here.
// NOTE(review): the embedded numbering jumps from 47 to 50, so part of this
// function's body is not visible in this view -- confirm against the full file.
46 static ostream
& _prefix(std::ostream
*_dout
, const Monitor
*mon
,
47 const DataHealthService
*svc
) {
50 return *_dout
<< "mon." << mon
->name
<< "@" << mon
->rank
51 << "(" << mon
->get_state_name() << ")." << svc
->get_name()
52 << "(" << svc
->get_epoch() << ") ";
// Logs the current epoch at debug level 10 and resets last_warned_percent to
// 0.
// NOTE(review): the embedded numbering skips 62-63, so the end of the comment
// below and at least one following line are not visible in this view; the
// clearing of stats that the comment describes is presumably there -- confirm.
55 void DataHealthService::start_epoch()
57 dout(10) << __func__
<< " epoch " << get_epoch() << dendl
;
58 // we are not bound by election epochs, but we should clear the stats
59 // every time an election is triggered. As far as we know, a monitor might
60 // have been running out of disk space and someone fixed it. We don't want
61 // to hold the cluster back, even confusing the user, due to some possibly
// Reset the last warned percentage to 0.
64 last_warned_percent
= 0;
// Turns the per-monitor DataStats held in `stats` into health entries.
//
// For every (entity_inst_t, DataStats) entry:
//   - fs avail_percent <= g_conf->mon_data_avail_crit
//       -> HEALTH_ERR, "low disk space, shutdown imminent"
//   - else fs avail_percent <= g_conf->mon_data_avail_warn
//       -> HEALTH_WARN, "low disk space"
//   - store bytes_total >= g_conf->mon_data_size_warn
//       -> a "store is getting too big! <size> >= <limit>" message is appended
//          to the detail text (after "; " if the text is non-empty)
// Entries whose status is not HEALTH_OK are reported as
// "mon.<name> <detail>" in `summary`, and with " -- <avail>% avail" appended
// in `detail`.
//
// @param summary  receives one (status, message) pair per unhealthy monitor
// @param detail   receives the longer form of the same message
//
// NOTE(review): `health_detail` and `ss` are used below without a visible
// declaration; the embedded numbering skips 78, 92 and 100, which is
// presumably where they are declared -- confirm against the full file.
67 void DataHealthService::get_health(
68 list
<pair
<health_status_t
,string
> >& summary
,
69 list
<pair
<health_status_t
,string
> > *detail
)
71 dout(10) << __func__
<< dendl
;
72 for (map
<entity_inst_t
,DataStats
>::iterator it
= stats
.begin();
73 it
!= stats
.end(); ++it
) {
// Resolve the monitor's name from the address part of the map key.
74 string mon_name
= mon
->monmap
->get_name(it
->first
.addr
);
// From here on `stats` names this entry's DataStats; it shadows the outer
// `stats` map that the loop iterates over.
75 DataStats
& stats
= it
->second
;
77 health_status_t health_status
= HEALTH_OK
;
// Free-space checks: the critical threshold is tested before the warning one.
79 if (stats
.fs_stats
.avail_percent
<= g_conf
->mon_data_avail_crit
) {
80 health_status
= HEALTH_ERR
;
81 health_detail
= "low disk space, shutdown imminent";
82 } else if (stats
.fs_stats
.avail_percent
<= g_conf
->mon_data_avail_warn
) {
83 health_status
= HEALTH_WARN
;
84 health_detail
= "low disk space";
// Store-size check, evaluated independently of the free-space result above.
87 if (stats
.store_stats
.bytes_total
>= g_conf
->mon_data_size_warn
) {
// NOTE(review): presumably this raises HEALTH_OK to HEALTH_WARN without
// masking HEALTH_ERR; that depends on the ordering of health_status_t
// values, which is not visible here -- confirm.
88 if (health_status
> HEALTH_WARN
)
89 health_status
= HEALTH_WARN
;
90 if (!health_detail
.empty())
91 health_detail
.append("; ");
93 ss
<< "store is getting too big! "
94 << prettybyte_t(stats
.store_stats
.bytes_total
)
95 << " >= " << prettybyte_t(g_conf
->mon_data_size_warn
);
96 health_detail
.append(ss
.str());
// Only monitors with a non-OK status are reported.
99 if (health_status
!= HEALTH_OK
) {
101 ss
<< "mon." << mon_name
<< " " << health_detail
;
102 summary
.push_back(make_pair(health_status
, ss
.str()));
// The same stream is extended, so the detail entry is the summary text plus
// the available percentage.
103 ss
<< " -- " << stats
.fs_stats
.avail_percent
<< "% avail";
// NOTE(review): embedded line 104 is not visible; whether `detail` is checked
// for null before this dereference cannot be determined from this view.
105 detail
->push_back(make_pair(health_status
, ss
.str()));
// Refreshes ours.store_stats from the monitor store's estimated size.
//
// The value returned by mon->store->get_estimated_size(extra) is asserted to
// be greater than 0 and stored in bytes_total; the "sst", "log" and "misc"
// entries of `extra` are copied into bytes_sst, bytes_log and bytes_misc, and
// ours.last_update is set to ceph_clock_now().
//
// @param ours  DataStats record to update in place
//
// NOTE(review): the return statement is not visible in this view (the
// embedded numbering stops at 120), so the returned value is not documented.
110 int DataHealthService::update_store_stats(DataStats
&ours
)
// `extra` is presumably filled in by get_estimated_size() -- confirm.
112 map
<string
,uint64_t> extra
;
113 uint64_t store_size
= mon
->store
->get_estimated_size(extra
);
114 assert(store_size
> 0);
116 ours
.store_stats
.bytes_total
= store_size
;
// map::operator[] value-initialises a missing key, so an absent entry reads
// as 0 here (and is inserted into `extra`).
117 ours
.store_stats
.bytes_sst
= extra
["sst"];
118 ours
.store_stats
.bytes_log
= extra
["log"];
119 ours
.store_stats
.bytes_misc
= extra
["misc"];
120 ours
.last_update
= ceph_clock_now();
// Refreshes this monitor's own entry in `stats`.
//
// Calls get_fs_stats() for the g_conf->mon_data path to fill ours.fs_stats,
// logs the resulting figures at level 0, sets ours.last_update to
// ceph_clock_now(), and returns the result of update_store_stats(ours).
//
// NOTE(review): embedded lines 132 and 134-135 are not visible; the derr
// statement below presumably sits inside an error branch on `err` that
// returns early -- confirm against the full file.
126 int DataHealthService::update_stats()
128 entity_inst_t our_inst
= mon
->messenger
->get_myinst();
// operator[] creates the entry for this monitor if it does not exist yet.
129 DataStats
& ours
= stats
[our_inst
];
131 int err
= get_fs_stats(ours
.fs_stats
, g_conf
->mon_data
.c_str());
133 derr
<< __func__
<< " get_fs_stats error: " << cpp_strerror(err
) << dendl
;
136 dout(0) << __func__
<< " avail " << ours
.fs_stats
.avail_percent
<< "%"
137 << " total " << prettybyte_t(ours
.fs_stats
.byte_total
)
138 << ", used " << prettybyte_t(ours
.fs_stats
.byte_used
)
139 << ", avail " << prettybyte_t(ours
.fs_stats
.byte_avail
) << dendl
;
140 ours
.last_update
= ceph_clock_now();
// The store statistics are refreshed on the same record.
142 return update_store_stats(ours
);
// Sends this monitor's DataStats to the members of the quorum.
//
// Asserts that `stats` is non-empty and holds an entry for our own
// entity_inst_t, then walks mon->get_quorum(); for a rank it builds an
// MMonHealth(SERVICE_HEALTH_DATA, OP_TELL) message carrying a copy of our
// stats and sends it to that rank's monmap instance via the messenger.
//
// NOTE(review): embedded lines 148-150 and 159 are not visible. The
// comparison of the rank's name against mon->name presumably guards a
// statement that skips our own rank -- confirm against the full file.
145 void DataHealthService::share_stats()
147 dout(10) << __func__
<< dendl
;
151 assert(!stats
.empty());
152 entity_inst_t our_inst
= mon
->messenger
->get_myinst();
153 assert(stats
.count(our_inst
) > 0);
154 DataStats
&ours
= stats
[our_inst
];
155 const set
<int>& quorum
= mon
->get_quorum();
156 for (set
<int>::const_iterator it
= quorum
.begin();
157 it
!= quorum
.end(); ++it
) {
158 if (mon
->monmap
->get_name(*it
) == mon
->name
)
160 entity_inst_t inst
= mon
->monmap
->get_inst(*it
);
// A fresh message is allocated per recipient; it is handed to send_message()
// and not deleted here (presumably the messenger takes ownership -- confirm).
161 MMonHealth
*m
= new MMonHealth(HealthService::SERVICE_HEALTH_DATA
,
162 MMonHealth::OP_TELL
);
163 m
->data_stats
= ours
;
164 dout(20) << __func__
<< " send " << *m
<< " to " << inst
<< dendl
;
165 mon
->messenger
->send_message(m
, inst
);
// Periodic work for this service: refreshes the local stats through
// update_stats() and compares the local available-space percentage with the
// mon_data_avail_crit and mon_data_avail_warn thresholds.
//
// NOTE(review): many lines of this function are not visible in this view
// (embedded lines 174, 177-182, 188-191, 198, 202 and 204 onwards). The
// conditions guarding the derr statements, whatever action follows them, and
// the head of the warning statement at line 199 must be confirmed against
// the full file.
169 void DataHealthService::service_tick()
171 dout(10) << __func__
<< dendl
;
173 int err
= update_stats();
// Presumably only reached when `err` signals a failure -- confirm.
175 derr
<< "something went wrong obtaining our disk stats: "
176 << cpp_strerror(err
) << dendl
;
183 DataStats
&ours
= stats
[mon
->messenger
->get_myinst()];
// Critical threshold: logged through derr.
185 if (ours
.fs_stats
.avail_percent
<= g_conf
->mon_data_avail_crit
) {
186 derr
<< "reached critical levels of available space on local monitor storage"
187 << " -- shutdown!" << dendl
;
192 // we must backoff these warnings, and track how much data is being
193 // consumed in-between reports to assess if it's worth to log this info,
194 // otherwise we may very well contribute to the consumption of the
195 // already low available disk space.
196 if (ours
.fs_stats
.avail_percent
<= g_conf
->mon_data_avail_warn
) {
// The warning is only emitted when the percentage differs from the value
// recorded at the previous warning.
197 if (ours
.fs_stats
.avail_percent
!= last_warned_percent
)
199 << "reached concerning levels of available space on local monitor storage"
200 << " (" << ours
.fs_stats
.avail_percent
<< "% free)";
201 last_warned_percent
= ours
.fs_stats
.avail_percent
;
// Presumably the else branch of the warn-threshold check -- confirm.
203 last_warned_percent
= 0;
// Handles an OP_TELL health message: records the "datahealth:handle_tell"
// event on the op, asserts that the message's service op is OP_TELL, and
// stores the sender's data_stats in `stats` under m->get_source_inst(),
// replacing any entry already stored for that sender.
//
// @param op  request whose payload is treated as an MMonHealth
207 void DataHealthService::handle_tell(MonOpRequestRef op
)
209 op
->mark_event("datahealth:handle_tell");
// static_cast performs no runtime type check on the request.
210 MMonHealth
*m
= static_cast<MMonHealth
*>(op
->get_req());
211 dout(10) << __func__
<< " " << *m
<< dendl
;
212 assert(m
->get_service_op() == MMonHealth::OP_TELL
);
214 stats
[m
->get_source_inst()] = m
->data_stats
;
// Dispatches a health message addressed to this service: records the
// "datahealth:service_dispatch_op" event, asserts that the message's service
// type equals get_type(), and switches on m->service_op. OP_TELL is the only
// case label visible; an unrecognised op is logged at level 0 and trips an
// assert.
//
// @param op  request whose payload is treated as an MMonHealth
//
// NOTE(review): embedded lines 223, 225-227, 231-233 and everything after 235
// are not visible in this view: the condition guarding the "not in quorum"
// message, the body of the OP_TELL case, and the function's return value
// must be confirmed against the full file.
217 bool DataHealthService::service_dispatch_op(MonOpRequestRef op
)
219 op
->mark_event("datahealth:service_dispatch_op");
// static_cast performs no runtime type check on the request.
220 MMonHealth
*m
= static_cast<MMonHealth
*>(op
->get_req());
221 dout(10) << __func__
<< " " << *m
<< dendl
;
222 assert(m
->get_service_type() == get_type());
224 dout(1) << __func__
<< " not in quorum -- drop message" << dendl
;
228 switch (m
->service_op
) {
229 case MMonHealth::OP_TELL
:
230 // someone is telling us their stats
234 dout(0) << __func__
<< " unknown op " << m
->service_op
<< dendl
;
235 assert(0 == "Unknown service op");