1 #include "DaemonMetricCollector.h"
3 #include <boost/json/src.hpp>
13 #include "common/admin_socket_client.h"
14 #include "common/debug.h"
15 #include "common/hostname.h"
16 #include "common/perf_counters.h"
17 #include "common/split.h"
18 #include "global/global_context.h"
19 #include "global/global_init.h"
20 #include "include/common_fwd.h"
23 #define dout_context g_ceph_context
24 #define dout_subsys ceph_subsys_ceph_exporter
// Short local aliases for the Boost.JSON object, value and array types.
26 using json_object
= boost::json::object
;
27 using json_value
= boost::json::value
;
28 using json_array
= boost::json::array
;
30 void DaemonMetricCollector::request_loop(boost::asio::steady_timer
&timer
) {
31 timer
.async_wait([&](const boost::system::error_code
&e
) {
32 std::cerr
<< e
<< std::endl
;
35 auto stats_period
= g_conf().get_val
<int64_t>("exporter_stats_period");
36 // time to wait before sending requests again
37 timer
.expires_from_now(std::chrono::seconds(stats_period
));
42 void DaemonMetricCollector::main() {
43 // time to wait before sending requests again
45 boost::asio::io_service io
;
46 boost::asio::steady_timer timer
{io
, std::chrono::seconds(0)};
51 std::string
DaemonMetricCollector::get_metrics() {
52 const std::lock_guard
<std::mutex
> lock(metrics_mutex
);
57 void add_metric(std::unique_ptr
<MetricsBuilder
> &builder
, T value
,
58 std::string name
, std::string description
, std::string mtype
,
60 builder
->add(std::to_string(value
), name
, description
, mtype
, labels
);
63 void add_double_or_int_metric(std::unique_ptr
<MetricsBuilder
> &builder
,
64 json_value value
, std::string name
,
65 std::string description
, std::string mtype
,
67 if (value
.is_int64()) {
68 int64_t v
= value
.as_int64();
69 add_metric(builder
, v
, name
, description
, mtype
, labels
);
70 } else if (value
.is_double()) {
71 double v
= value
.as_double();
72 add_metric(builder
, v
, name
, description
, mtype
, labels
);
76 std::string
boost_string_to_std(boost::json::string js
) {
77 std::string
res(js
.data());
// Wrap `value` in double quotes so it can be emitted as a Prometheus label
// value.
//
// The Prometheus text exposition format requires backslash, double-quote and
// line-feed characters inside a label value to be written as `\\`, `\"` and
// `\n`. Without that escaping a value containing one of those characters
// would terminate the quoted string early (or split the sample line) and
// corrupt the scrape output. Values without such characters are returned
// exactly as before: `"` + value + `"`.
//
// @param value raw (unescaped) label value
// @return the escaped value surrounded by double quotes
std::string quote(std::string value) {
  std::string quoted;
  quoted.reserve(value.size() + 2);
  quoted.push_back('"');
  for (const char c : value) {
    switch (c) {
    case '\\':
      quoted += "\\\\";
      break;
    case '"':
      quoted += "\\\"";
      break;
    case '\n':
      quoted += "\\n";
      break;
    default:
      quoted.push_back(c);
      break;
    }
  }
  quoted.push_back('"');
  return quoted;
}
83 void DaemonMetricCollector::dump_asok_metrics() {
84 BlockTimer
timer(__FILE__
, __FUNCTION__
);
86 std::vector
<std::pair
<std::string
, int>> daemon_pids
;
89 bool sort
= g_conf().get_val
<bool>("exporter_sort_metrics");
92 std::unique_ptr
<OrderedMetricsBuilder
>(new OrderedMetricsBuilder());
95 std::unique_ptr
<UnorderedMetricsBuilder
>(new UnorderedMetricsBuilder());
97 auto prio_limit
= g_conf().get_val
<int64_t>("exporter_prio_limit");
98 for (auto &[daemon_name
, sock_client
] : clients
) {
100 sock_client
.ping(&ok
);
105 std::string counter_dump_response
=
106 asok_request(sock_client
, "counter dump", daemon_name
);
107 if (counter_dump_response
.size() == 0) {
111 std::string counter_schema_response
=
112 asok_request(sock_client
, "counter schema", daemon_name
);
113 if (counter_schema_response
.size() == 0) {
118 json_object counter_dump
= boost::json::parse(counter_dump_response
).as_object();
119 json_object counter_schema
= boost::json::parse(counter_schema_response
).as_object();
121 for (auto &perf_group_item
: counter_schema
) {
122 std::string perf_group
= {perf_group_item
.key().begin(),
123 perf_group_item
.key().end()};
124 json_object perf_group_object
= perf_group_item
.value().as_object();
125 auto counters
= perf_group_object
["counters"].as_object();
126 auto counters_labels
= perf_group_object
["labels"].as_object();
127 auto counters_values
=
128 counter_dump
[perf_group
].as_object()["counters"].as_object();
131 for(auto &label
: counters_labels
) {
132 std::string label_key
= {label
.key().begin(), label
.key().end()};
133 labels
[label_key
] = quote(label
.value().as_string().c_str());
135 for (auto &counter
: counters
) {
136 json_object counter_group
= counter
.value().as_object();
137 if (counter_group
["priority"].as_int64() < prio_limit
) {
140 std::string counter_name_init
= {counter
.key().begin(), counter
.key().end()};
141 std::string counter_name
= perf_group
+ "_" + counter_name_init
;
142 promethize(counter_name
);
144 if (counters_labels
.empty()) {
145 auto labels_and_name
= get_labels_and_metric_name(daemon_name
, counter_name
);
146 labels
= labels_and_name
.first
;
147 counter_name
= labels_and_name
.second
;
149 // For now this is only required for rgw multi-site metrics
150 auto multisite_labels_and_name
= add_fixed_name_metrics(counter_name
);
151 if (!multisite_labels_and_name
.first
.empty()) {
152 labels
.insert(multisite_labels_and_name
.first
.begin(), multisite_labels_and_name
.first
.end());
153 counter_name
= multisite_labels_and_name
.second
;
155 labels
.insert({"ceph_daemon", quote(daemon_name
)});
156 auto perf_values
= counters_values
.at(counter_name_init
);
157 dump_asok_metric(counter_group
, perf_values
, counter_name
, labels
);
160 std::string config_show
=
161 asok_request(sock_client
, "config show", daemon_name
);
162 if (config_show
.size() == 0) {
166 json_object pid_file_json
= boost::json::parse(config_show
).as_object();
167 std::string pid_path
=
168 boost_string_to_std(pid_file_json
["pid_file"].as_string());
169 std::string pid_str
= read_file_to_string(pid_path
);
170 if (!pid_path
.size()) {
171 dout(1) << "pid path is empty; process metrics won't be fetched for: "
172 << daemon_name
<< dendl
;
174 if (!pid_str
.empty()) {
175 daemon_pids
.push_back({daemon_name
, std::stoi(pid_str
)});
178 dout(10) << "Perf counters retrieved for " << clients
.size() - failures
<< "/"
179 << clients
.size() << " daemons." << dendl
;
180 // get time spent on this function
182 std::string
scrap_desc(
183 "Time spent scraping and transforming perf counters to metrics");
184 labels_t scrap_labels
;
185 scrap_labels
["host"] = quote(ceph_get_hostname());
186 scrap_labels
["function"] = quote(__FUNCTION__
);
187 add_metric(builder
, timer
.get_ms(), "ceph_exporter_scrape_time", scrap_desc
,
188 "gauge", scrap_labels
);
190 const std::lock_guard
<std::mutex
> lock(metrics_mutex
);
191 // only get process metrics if a pid was found for at least one daemon
192 if (daemon_pids
.size() != 0) {
193 get_process_metrics(daemon_pids
);
195 metrics
= builder
->dump();
198 std::vector
<std::string
> read_proc_stat_file(std::string path
) {
199 std::string stat
= read_file_to_string(path
);
200 std::vector
<std::string
> strings
;
201 auto parts
= ceph::split(stat
);
202 strings
.assign(parts
.begin(), parts
.end());
206 struct pstat
read_pid_stat(int pid
) {
207 std::string
stat_path("/proc/" + std::to_string(pid
) + "/stat");
208 std::vector
<std::string
> stats
= read_proc_stat_file(stat_path
);
210 stat
.minflt
= std::stoul(stats
[9]);
211 stat
.majflt
= std::stoul(stats
[11]);
212 stat
.utime
= std::stoul(stats
[13]);
213 stat
.stime
= std::stoul(stats
[14]);
214 stat
.num_threads
= std::stoul(stats
[19]);
215 stat
.start_time
= std::stoul(stats
[21]);
216 stat
.vm_size
= std::stoul(stats
[22]);
217 stat
.resident_size
= std::stoi(stats
[23]);
221 void DaemonMetricCollector::get_process_metrics(
222 std::vector
<std::pair
<std::string
, int>> daemon_pids
) {
223 std::string
path("/proc");
224 std::stringstream ss
;
225 for (auto &[daemon_name
, pid
] : daemon_pids
) {
226 std::vector
<std::string
> uptimes
= read_proc_stat_file("/proc/uptime");
227 struct pstat stat
= read_pid_stat(pid
);
228 int clk_tck
= sysconf(_SC_CLK_TCK
);
229 double start_time_seconds
= stat
.start_time
/ (double)clk_tck
;
230 double user_time
= stat
.utime
/ (double)clk_tck
;
231 double kernel_time
= stat
.stime
/ (double)clk_tck
;
232 double total_time_seconds
= user_time
+ kernel_time
;
233 double uptime
= std::stod(uptimes
[0]);
234 double elapsed_time
= uptime
- start_time_seconds
;
235 double idle_time
= elapsed_time
- total_time_seconds
;
236 double usage
= total_time_seconds
* 100 / elapsed_time
;
239 labels
["ceph_daemon"] = quote(daemon_name
);
240 add_metric(builder
, stat
.minflt
, "ceph_exporter_minflt_total",
241 "Number of minor page faults of daemon", "counter", labels
);
242 add_metric(builder
, stat
.majflt
, "ceph_exporter_majflt_total",
243 "Number of major page faults of daemon", "counter", labels
);
244 add_metric(builder
, stat
.num_threads
, "ceph_exporter_num_threads",
245 "Number of threads used by daemon", "gauge", labels
);
246 add_metric(builder
, usage
, "ceph_exporter_cpu_usage",
247 "CPU usage of a daemon", "gauge", labels
);
249 std::string cpu_time_desc
= "Process time in kernel/user/idle mode";
250 labels_t cpu_total_labels
;
251 cpu_total_labels
["ceph_daemon"] = quote(daemon_name
);
252 cpu_total_labels
["mode"] = quote("kernel");
253 add_metric(builder
, kernel_time
, "ceph_exporter_cpu_total", cpu_time_desc
,
254 "counter", cpu_total_labels
);
255 cpu_total_labels
["mode"] = quote("user");
256 add_metric(builder
, user_time
, "ceph_exporter_cpu_total", cpu_time_desc
,
257 "counter", cpu_total_labels
);
258 cpu_total_labels
["mode"] = quote("idle");
259 add_metric(builder
, idle_time
, "ceph_exporter_cpu_total", cpu_time_desc
,
260 "counter", cpu_total_labels
);
261 add_metric(builder
, stat
.vm_size
, "ceph_exporter_vm_size",
262 "Virtual memory used in a daemon", "gauge", labels
);
263 add_metric(builder
, stat
.resident_size
, "ceph_exporter_resident_size",
264 "Resident memory in a daemon", "gauge", labels
);
268 std::string
DaemonMetricCollector::asok_request(AdminSocketClient
&asok
,
270 std::string daemon_name
) {
271 std::string
request("{\"prefix\": \"" + command
+ "\"}");
272 std::string response
;
273 std::string err
= asok
.do_request(request
, &response
);
274 if (err
.length() > 0 || response
.substr(0, 5) == "ERROR") {
275 dout(1) << "command " << command
<< "failed for daemon " << daemon_name
276 << "with error: " << err
<< dendl
;
282 std::pair
<labels_t
, std::string
>
283 DaemonMetricCollector::get_labels_and_metric_name(std::string daemon_name
,
284 std::string metric_name
) {
285 std::string new_metric_name
;
287 new_metric_name
= metric_name
;
288 // In vstart cluster socket files for rgw are stored as radosgw.<instance_id>.asok
289 if (daemon_name
.find("radosgw") != std::string::npos
) {
290 std::size_t pos
= daemon_name
.find_last_of('.');
291 std::string tmp
= daemon_name
.substr(pos
+1);
292 labels
["instance_id"] = quote(tmp
);
294 else if (daemon_name
.find("rgw") != std::string::npos
) {
295 std::string tmp
= daemon_name
.substr(16, std::string::npos
);
296 std::string::size_type pos
= tmp
.find('.');
297 labels
["instance_id"] = quote("rgw." + tmp
.substr(0, pos
));
299 else if (daemon_name
.find("rbd-mirror") != std::string::npos
) {
301 "^rbd_mirror_image_([^/]+)/(?:(?:([^/]+)/"
302 ")?)(.*)\\.(replay(?:_bytes|_latency)?)$");
304 if (std::regex_search(daemon_name
, match
, re
) == true) {
305 new_metric_name
= "ceph_rbd_mirror_image_" + match
.str(4);
306 labels
["pool"] = quote(match
.str(1));
307 labels
["namespace"] = quote(match
.str(2));
308 labels
["image"] = quote(match
.str(3));
311 return {labels
, new_metric_name
};
314 // Add fixed name metrics from existing ones that have details in their names
315 // that should be in labels (not in name). For backward compatibility,
316 // a new fixed name metric is created (instead of replacing) and details are put
317 // in new labels. Intended for RGW sync perf counters but extendable as required.
318 // See: https://tracker.ceph.com/issues/45311
319 std::pair
<labels_t
, std::string
>
320 DaemonMetricCollector::add_fixed_name_metrics(std::string metric_name
) {
321 std::string new_metric_name
;
323 new_metric_name
= metric_name
;
325 std::regex
re("^data_sync_from_(.*)\\.");
327 if (std::regex_search(metric_name
, match
, re
) == true) {
328 new_metric_name
= std::regex_replace(metric_name
, re
, "from_([^.]*)', 'from_zone");
329 labels
["source_zone"] = quote(match
.str(1));
330 return {labels
, new_metric_name
};
336 perf_values can be either an int/double or a json_object. Since
337 json_value can hold either of them, we use that class.
339 void DaemonMetricCollector::dump_asok_metric(json_object perf_info
,
340 json_value perf_values
,
343 int64_t type
= perf_info
["type"].as_int64();
344 std::string metric_type
=
345 boost_string_to_std(perf_info
["metric_type"].as_string());
346 std::string description
=
347 boost_string_to_std(perf_info
["description"].as_string());
349 if (type
& PERFCOUNTER_LONGRUNAVG
) {
350 int64_t count
= perf_values
.as_object()["avgcount"].as_int64();
351 add_metric(builder
, count
, name
+ "_count", description
, metric_type
,
353 json_value sum_value
= perf_values
.as_object()["sum"];
354 add_double_or_int_metric(builder
, sum_value
, name
+ "_sum", description
,
355 metric_type
, labels
);
356 } else if (type
& PERFCOUNTER_TIME
) {
357 if (perf_values
.is_int64()) {
358 double value
= perf_values
.as_int64() / 1000000000.0f
;
359 add_metric(builder
, value
, name
, description
, metric_type
, labels
);
360 } else if (perf_values
.is_double()) {
361 double value
= perf_values
.as_double() / 1000000000.0f
;
362 add_metric(builder
, value
, name
, description
, metric_type
, labels
);
365 add_double_or_int_metric(builder
, perf_values
, name
, description
,
366 metric_type
, labels
);
370 void DaemonMetricCollector::update_sockets() {
371 std::string sock_dir
= g_conf().get_val
<std::string
>("exporter_sock_dir");
373 std::filesystem::path sock_path
= sock_dir
;
374 if (!std::filesystem::is_directory(sock_path
.parent_path())) {
375 dout(1) << "ERROR: No such directory exist" << sock_dir
<< dendl
;
378 for (const auto &entry
: std::filesystem::directory_iterator(sock_dir
)) {
379 if (entry
.path().extension() == ".asok") {
380 std::string daemon_socket_name
= entry
.path().filename().string();
381 std::string daemon_name
=
382 daemon_socket_name
.substr(0, daemon_socket_name
.size() - 5);
383 if (clients
.find(daemon_name
) == clients
.end() &&
384 !(daemon_name
.find("mgr") != std::string::npos
) &&
385 !(daemon_name
.find("ceph-exporter") != std::string::npos
)) {
386 AdminSocketClient
sock(entry
.path().string());
387 clients
.insert({daemon_name
, std::move(sock
)});
393 void OrderedMetricsBuilder::add(std::string value
, std::string name
,
394 std::string description
, std::string mtype
,
396 if (metrics
.find(name
) == metrics
.end()) {
397 Metric
metric(name
, mtype
, description
);
398 metrics
[name
] = std::move(metric
);
400 Metric
&metric
= metrics
[name
];
401 metric
.add(labels
, value
);
404 std::string
OrderedMetricsBuilder::dump() {
405 for (auto &[name
, metric
] : metrics
) {
406 out
+= metric
.dump() + "\n";
411 void UnorderedMetricsBuilder::add(std::string value
, std::string name
,
412 std::string description
, std::string mtype
,
414 Metric
metric(name
, mtype
, description
);
415 metric
.add(labels
, value
);
416 out
+= metric
.dump() + "\n\n";
419 std::string
UnorderedMetricsBuilder::dump() { return out
; }
421 void Metric::add(labels_t labels
, std::string value
) {
423 entry
.labels
= labels
;
425 entries
.push_back(entry
);
428 std::string
Metric::dump() {
429 std::stringstream metric_ss
;
430 metric_ss
<< "# HELP " << name
<< " " << description
<< "\n";
431 metric_ss
<< "# TYPE " << name
<< " " << mtype
<< "\n";
432 for (auto &entry
: entries
) {
433 std::stringstream labels_ss
;
435 for (auto &[label_name
, label_value
] : entry
.labels
) {
436 labels_ss
<< label_name
<< "=" << label_value
;
437 if (i
< entry
.labels
.size() - 1) {
442 metric_ss
<< name
<< "{" << labels_ss
.str() << "} " << entry
.value
;
443 if (&entry
!= &entries
.back()) {
447 return metric_ss
.str();
450 DaemonMetricCollector
&collector_instance() {
451 static DaemonMetricCollector instance
;