]>
Commit | Line | Data |
---|---|---|
2a845540 | 1 | #include "DaemonMetricCollector.h" |
2a845540 TL |
2 | |
#include <boost/json/src.hpp>
#include <chrono>
#include <exception>
#include <filesystem>
#include <iostream>
#include <map>
#include <memory>
#include <regex>
#include <string>
#include <system_error>
#include <utility>
12 | ||
39ae355f TL |
13 | #include "common/admin_socket_client.h" |
14 | #include "common/debug.h" | |
15 | #include "common/hostname.h" | |
16 | #include "common/perf_counters.h" | |
17 | #include "common/split.h" | |
18 | #include "global/global_context.h" | |
19 | #include "global/global_init.h" | |
20 | #include "include/common_fwd.h" | |
21 | #include "util.h" | |
22 | ||
2a845540 TL |
23 | #define dout_context g_ceph_context |
24 | #define dout_subsys ceph_subsys_ceph_exporter | |
25 | ||
26 | using json_object = boost::json::object; | |
27 | using json_value = boost::json::value; | |
28 | using json_array = boost::json::array; | |
29 | ||
30 | void DaemonMetricCollector::request_loop(boost::asio::steady_timer &timer) { | |
31 | timer.async_wait([&](const boost::system::error_code &e) { | |
32 | std::cerr << e << std::endl; | |
33 | update_sockets(); | |
34 | dump_asok_metrics(); | |
35 | auto stats_period = g_conf().get_val<int64_t>("exporter_stats_period"); | |
36 | // time to wait before sending requests again | |
37 | timer.expires_from_now(std::chrono::seconds(stats_period)); | |
38 | request_loop(timer); | |
39 | }); | |
40 | } | |
41 | ||
42 | void DaemonMetricCollector::main() { | |
43 | // time to wait before sending requests again | |
44 | ||
45 | boost::asio::io_service io; | |
46 | boost::asio::steady_timer timer{io, std::chrono::seconds(0)}; | |
47 | request_loop(timer); | |
48 | io.run(); | |
49 | } | |
50 | ||
51 | std::string DaemonMetricCollector::get_metrics() { | |
52 | const std::lock_guard<std::mutex> lock(metrics_mutex); | |
53 | return metrics; | |
54 | } | |
55 | ||
56 | template <class T> | |
57 | void add_metric(std::unique_ptr<MetricsBuilder> &builder, T value, | |
58 | std::string name, std::string description, std::string mtype, | |
59 | labels_t labels) { | |
60 | builder->add(std::to_string(value), name, description, mtype, labels); | |
61 | } | |
62 | ||
63 | void add_double_or_int_metric(std::unique_ptr<MetricsBuilder> &builder, | |
64 | json_value value, std::string name, | |
65 | std::string description, std::string mtype, | |
66 | labels_t labels) { | |
67 | if (value.is_int64()) { | |
68 | int64_t v = value.as_int64(); | |
69 | add_metric(builder, v, name, description, mtype, labels); | |
70 | } else if (value.is_double()) { | |
71 | double v = value.as_double(); | |
72 | add_metric(builder, v, name, description, mtype, labels); | |
73 | } | |
74 | } | |
75 | ||
76 | std::string boost_string_to_std(boost::json::string js) { | |
77 | std::string res(js.data()); | |
78 | return res; | |
79 | } | |
80 | ||
/**
 * Wrap `value` in double quotes so it can be used as a Prometheus label value.
 *
 * Backslash, double quote and line feed are escaped as `\\`, `\"` and `\n`,
 * as required by the Prometheus text exposition format; without this a label
 * value containing one of those characters yields an unparsable sample line.
 * Values without such characters are rendered exactly as before.
 */
std::string quote(std::string value) {
  std::string quoted;
  quoted.reserve(value.size() + 2);
  quoted.push_back('"');
  for (const char c : value) {
    switch (c) {
    case '\\':
      quoted += "\\\\";
      break;
    case '"':
      quoted += "\\\"";
      break;
    case '\n':
      quoted += "\\n";
      break;
    default:
      quoted.push_back(c);
      break;
    }
  }
  quoted.push_back('"');
  return quoted;
}
82 | ||
2a845540 TL |
83 | void DaemonMetricCollector::dump_asok_metrics() { |
84 | BlockTimer timer(__FILE__, __FUNCTION__); | |
85 | ||
86 | std::vector<std::pair<std::string, int>> daemon_pids; | |
87 | ||
39ae355f | 88 | int failures = 0; |
2a845540 TL |
89 | bool sort = g_conf().get_val<bool>("exporter_sort_metrics"); |
90 | if (sort) { | |
39ae355f TL |
91 | builder = |
92 | std::unique_ptr<OrderedMetricsBuilder>(new OrderedMetricsBuilder()); | |
2a845540 | 93 | } else { |
39ae355f TL |
94 | builder = |
95 | std::unique_ptr<UnorderedMetricsBuilder>(new UnorderedMetricsBuilder()); | |
2a845540 | 96 | } |
1e59de90 | 97 | auto prio_limit = g_conf().get_val<int64_t>("exporter_prio_limit"); |
2a845540 TL |
98 | for (auto &[daemon_name, sock_client] : clients) { |
99 | bool ok; | |
100 | sock_client.ping(&ok); | |
101 | if (!ok) { | |
39ae355f | 102 | failures++; |
2a845540 TL |
103 | continue; |
104 | } | |
1e59de90 TL |
105 | std::string counter_dump_response = |
106 | asok_request(sock_client, "counter dump", daemon_name); | |
107 | if (counter_dump_response.size() == 0) { | |
108 | failures++; | |
109 | continue; | |
2a845540 | 110 | } |
1e59de90 TL |
111 | std::string counter_schema_response = |
112 | asok_request(sock_client, "counter schema", daemon_name); | |
113 | if (counter_schema_response.size() == 0) { | |
39ae355f TL |
114 | failures++; |
115 | continue; | |
116 | } | |
1e59de90 TL |
117 | |
118 | json_object counter_dump = boost::json::parse(counter_dump_response).as_object(); | |
119 | json_object counter_schema = boost::json::parse(counter_schema_response).as_object(); | |
120 | ||
121 | for (auto &perf_group_item : counter_schema) { | |
122 | std::string perf_group = {perf_group_item.key().begin(), | |
123 | perf_group_item.key().end()}; | |
124 | json_object perf_group_object = perf_group_item.value().as_object(); | |
125 | auto counters = perf_group_object["counters"].as_object(); | |
126 | auto counters_labels = perf_group_object["labels"].as_object(); | |
127 | auto counters_values = | |
128 | counter_dump[perf_group].as_object()["counters"].as_object(); | |
129 | labels_t labels; | |
130 | ||
131 | for(auto &label: counters_labels) { | |
132 | std::string label_key = {label.key().begin(), label.key().end()}; | |
133 | labels[label_key] = quote(label.value().as_string().c_str()); | |
134 | } | |
135 | for (auto &counter : counters) { | |
136 | json_object counter_group = counter.value().as_object(); | |
137 | if (counter_group["priority"].as_int64() < prio_limit) { | |
138 | continue; | |
139 | } | |
140 | std::string counter_name_init = {counter.key().begin(), counter.key().end()}; | |
141 | std::string counter_name = perf_group + "_" + counter_name_init; | |
142 | promethize(counter_name); | |
143 | ||
144 | if (counters_labels.empty()) { | |
145 | auto labels_and_name = get_labels_and_metric_name(daemon_name, counter_name); | |
146 | labels = labels_and_name.first; | |
147 | counter_name = labels_and_name.second; | |
148 | } | |
149 | // For now this is only required for rgw multi-site metrics | |
150 | auto multisite_labels_and_name = add_fixed_name_metrics(counter_name); | |
151 | if (!multisite_labels_and_name.first.empty()) { | |
152 | labels.insert(multisite_labels_and_name.first.begin(), multisite_labels_and_name.first.end()); | |
153 | counter_name = multisite_labels_and_name.second; | |
154 | } | |
155 | labels.insert({"ceph_daemon", quote(daemon_name)}); | |
156 | auto perf_values = counters_values.at(counter_name_init); | |
157 | dump_asok_metric(counter_group, perf_values, counter_name, labels); | |
158 | } | |
159 | } | |
39ae355f TL |
160 | std::string config_show = |
161 | asok_request(sock_client, "config show", daemon_name); | |
162 | if (config_show.size() == 0) { | |
163 | failures++; | |
2a845540 TL |
164 | continue; |
165 | } | |
2a845540 TL |
166 | json_object pid_file_json = boost::json::parse(config_show).as_object(); |
167 | std::string pid_path = | |
39ae355f | 168 | boost_string_to_std(pid_file_json["pid_file"].as_string()); |
2a845540 TL |
169 | std::string pid_str = read_file_to_string(pid_path); |
170 | if (!pid_path.size()) { | |
39ae355f TL |
171 | dout(1) << "pid path is empty; process metrics won't be fetched for: " |
172 | << daemon_name << dendl; | |
173 | } | |
174 | if (!pid_str.empty()) { | |
175 | daemon_pids.push_back({daemon_name, std::stoi(pid_str)}); | |
2a845540 | 176 | } |
2a845540 | 177 | } |
39ae355f TL |
178 | dout(10) << "Perf counters retrieved for " << clients.size() - failures << "/" |
179 | << clients.size() << " daemons." << dendl; | |
2a845540 TL |
180 | // get time spent on this function |
181 | timer.stop(); | |
39ae355f TL |
182 | std::string scrap_desc( |
183 | "Time spent scraping and transforming perf counters to metrics"); | |
2a845540 TL |
184 | labels_t scrap_labels; |
185 | scrap_labels["host"] = quote(ceph_get_hostname()); | |
186 | scrap_labels["function"] = quote(__FUNCTION__); | |
187 | add_metric(builder, timer.get_ms(), "ceph_exporter_scrape_time", scrap_desc, | |
188 | "gauge", scrap_labels); | |
189 | ||
190 | const std::lock_guard<std::mutex> lock(metrics_mutex); | |
39ae355f TL |
191 | // only get metrics if there's pid path for some or all daemons isn't empty |
192 | if (daemon_pids.size() != 0) { | |
193 | get_process_metrics(daemon_pids); | |
194 | } | |
2a845540 TL |
195 | metrics = builder->dump(); |
196 | } | |
197 | ||
198 | std::vector<std::string> read_proc_stat_file(std::string path) { | |
199 | std::string stat = read_file_to_string(path); | |
200 | std::vector<std::string> strings; | |
201 | auto parts = ceph::split(stat); | |
202 | strings.assign(parts.begin(), parts.end()); | |
203 | return strings; | |
204 | } | |
205 | ||
206 | struct pstat read_pid_stat(int pid) { | |
207 | std::string stat_path("/proc/" + std::to_string(pid) + "/stat"); | |
208 | std::vector<std::string> stats = read_proc_stat_file(stat_path); | |
209 | struct pstat stat; | |
210 | stat.minflt = std::stoul(stats[9]); | |
211 | stat.majflt = std::stoul(stats[11]); | |
212 | stat.utime = std::stoul(stats[13]); | |
213 | stat.stime = std::stoul(stats[14]); | |
214 | stat.num_threads = std::stoul(stats[19]); | |
215 | stat.start_time = std::stoul(stats[21]); | |
216 | stat.vm_size = std::stoul(stats[22]); | |
217 | stat.resident_size = std::stoi(stats[23]); | |
218 | return stat; | |
219 | } | |
220 | ||
39ae355f TL |
221 | void DaemonMetricCollector::get_process_metrics( |
222 | std::vector<std::pair<std::string, int>> daemon_pids) { | |
2a845540 TL |
223 | std::string path("/proc"); |
224 | std::stringstream ss; | |
225 | for (auto &[daemon_name, pid] : daemon_pids) { | |
226 | std::vector<std::string> uptimes = read_proc_stat_file("/proc/uptime"); | |
227 | struct pstat stat = read_pid_stat(pid); | |
228 | int clk_tck = sysconf(_SC_CLK_TCK); | |
229 | double start_time_seconds = stat.start_time / (double)clk_tck; | |
230 | double user_time = stat.utime / (double)clk_tck; | |
231 | double kernel_time = stat.stime / (double)clk_tck; | |
232 | double total_time_seconds = user_time + kernel_time; | |
233 | double uptime = std::stod(uptimes[0]); | |
234 | double elapsed_time = uptime - start_time_seconds; | |
39ae355f | 235 | double idle_time = elapsed_time - total_time_seconds; |
2a845540 TL |
236 | double usage = total_time_seconds * 100 / elapsed_time; |
237 | ||
238 | labels_t labels; | |
239 | labels["ceph_daemon"] = quote(daemon_name); | |
240 | add_metric(builder, stat.minflt, "ceph_exporter_minflt_total", | |
241 | "Number of minor page faults of daemon", "counter", labels); | |
242 | add_metric(builder, stat.majflt, "ceph_exporter_majflt_total", | |
243 | "Number of major page faults of daemon", "counter", labels); | |
244 | add_metric(builder, stat.num_threads, "ceph_exporter_num_threads", | |
245 | "Number of threads used by daemon", "gauge", labels); | |
39ae355f TL |
246 | add_metric(builder, usage, "ceph_exporter_cpu_usage", |
247 | "CPU usage of a daemon", "gauge", labels); | |
2a845540 TL |
248 | |
249 | std::string cpu_time_desc = "Process time in kernel/user/idle mode"; | |
250 | labels_t cpu_total_labels; | |
251 | cpu_total_labels["ceph_daemon"] = quote(daemon_name); | |
252 | cpu_total_labels["mode"] = quote("kernel"); | |
253 | add_metric(builder, kernel_time, "ceph_exporter_cpu_total", cpu_time_desc, | |
254 | "counter", cpu_total_labels); | |
255 | cpu_total_labels["mode"] = quote("user"); | |
256 | add_metric(builder, user_time, "ceph_exporter_cpu_total", cpu_time_desc, | |
257 | "counter", cpu_total_labels); | |
258 | cpu_total_labels["mode"] = quote("idle"); | |
259 | add_metric(builder, idle_time, "ceph_exporter_cpu_total", cpu_time_desc, | |
260 | "counter", cpu_total_labels); | |
39ae355f TL |
261 | add_metric(builder, stat.vm_size, "ceph_exporter_vm_size", |
262 | "Virtual memory used in a daemon", "gauge", labels); | |
2a845540 TL |
263 | add_metric(builder, stat.resident_size, "ceph_exporter_resident_size", |
264 | "Resident memory in a daemon", "gauge", labels); | |
265 | } | |
266 | } | |
267 | ||
268 | std::string DaemonMetricCollector::asok_request(AdminSocketClient &asok, | |
39ae355f TL |
269 | std::string command, |
270 | std::string daemon_name) { | |
2a845540 TL |
271 | std::string request("{\"prefix\": \"" + command + "\"}"); |
272 | std::string response; | |
273 | std::string err = asok.do_request(request, &response); | |
274 | if (err.length() > 0 || response.substr(0, 5) == "ERROR") { | |
39ae355f TL |
275 | dout(1) << "command " << command << "failed for daemon " << daemon_name |
276 | << "with error: " << err << dendl; | |
2a845540 TL |
277 | return ""; |
278 | } | |
279 | return response; | |
280 | } | |
281 | ||
282 | std::pair<labels_t, std::string> | |
283 | DaemonMetricCollector::get_labels_and_metric_name(std::string daemon_name, | |
284 | std::string metric_name) { | |
285 | std::string new_metric_name; | |
286 | labels_t labels; | |
287 | new_metric_name = metric_name; | |
1e59de90 TL |
288 | // In vstart cluster socket files for rgw are stored as radosgw.<instance_id>.asok |
289 | if (daemon_name.find("radosgw") != std::string::npos) { | |
290 | std::size_t pos = daemon_name.find_last_of('.'); | |
291 | std::string tmp = daemon_name.substr(pos+1); | |
292 | labels["instance_id"] = quote(tmp); | |
293 | } | |
294 | else if (daemon_name.find("rgw") != std::string::npos) { | |
2a845540 TL |
295 | std::string tmp = daemon_name.substr(16, std::string::npos); |
296 | std::string::size_type pos = tmp.find('.'); | |
297 | labels["instance_id"] = quote("rgw." + tmp.substr(0, pos)); | |
1e59de90 TL |
298 | } |
299 | else if (daemon_name.find("rbd-mirror") != std::string::npos) { | |
300 | std::regex re( | |
301 | "^rbd_mirror_image_([^/]+)/(?:(?:([^/]+)/" | |
302 | ")?)(.*)\\.(replay(?:_bytes|_latency)?)$"); | |
303 | std::smatch match; | |
304 | if (std::regex_search(daemon_name, match, re) == true) { | |
305 | new_metric_name = "ceph_rbd_mirror_image_" + match.str(4); | |
306 | labels["pool"] = quote(match.str(1)); | |
307 | labels["namespace"] = quote(match.str(2)); | |
308 | labels["image"] = quote(match.str(3)); | |
2a845540 TL |
309 | } |
310 | } | |
311 | return {labels, new_metric_name}; | |
312 | } | |
313 | ||
1e59de90 TL |
314 | // Add fixed name metrics from existing ones that have details in their names |
315 | // that should be in labels (not in name). For backward compatibility, | |
316 | // a new fixed name metric is created (instead of replacing)and details are put | |
317 | // in new labels. Intended for RGW sync perf. counters but extendable as required. | |
318 | // See: https://tracker.ceph.com/issues/45311 | |
319 | std::pair<labels_t, std::string> | |
320 | DaemonMetricCollector::add_fixed_name_metrics(std::string metric_name) { | |
321 | std::string new_metric_name; | |
322 | labels_t labels; | |
323 | new_metric_name = metric_name; | |
324 | ||
325 | std::regex re("^data_sync_from_(.*)\\."); | |
326 | std::smatch match; | |
327 | if (std::regex_search(metric_name, match, re) == true) { | |
328 | new_metric_name = std::regex_replace(metric_name, re, "from_([^.]*)', 'from_zone"); | |
329 | labels["source_zone"] = quote(match.str(1)); | |
330 | return {labels, new_metric_name}; | |
331 | } | |
332 | return {}; | |
333 | } | |
334 | ||
2a845540 TL |
335 | /* |
336 | perf_values can be either a int/double or a json_object. Since | |
337 | json_value is a wrapper of both we use that class. | |
338 | */ | |
339 | void DaemonMetricCollector::dump_asok_metric(json_object perf_info, | |
340 | json_value perf_values, | |
341 | std::string name, | |
342 | labels_t labels) { | |
343 | int64_t type = perf_info["type"].as_int64(); | |
344 | std::string metric_type = | |
39ae355f | 345 | boost_string_to_std(perf_info["metric_type"].as_string()); |
2a845540 | 346 | std::string description = |
39ae355f | 347 | boost_string_to_std(perf_info["description"].as_string()); |
2a845540 TL |
348 | |
349 | if (type & PERFCOUNTER_LONGRUNAVG) { | |
350 | int64_t count = perf_values.as_object()["avgcount"].as_int64(); | |
351 | add_metric(builder, count, name + "_count", description, metric_type, | |
352 | labels); | |
353 | json_value sum_value = perf_values.as_object()["sum"]; | |
354 | add_double_or_int_metric(builder, sum_value, name + "_sum", description, | |
355 | metric_type, labels); | |
356 | } else if (type & PERFCOUNTER_TIME) { | |
357 | if (perf_values.is_int64()) { | |
358 | double value = perf_values.as_int64() / 1000000000.0f; | |
359 | add_metric(builder, value, name, description, metric_type, labels); | |
360 | } else if (perf_values.is_double()) { | |
361 | double value = perf_values.as_double() / 1000000000.0f; | |
362 | add_metric(builder, value, name, description, metric_type, labels); | |
363 | } | |
364 | } else { | |
365 | add_double_or_int_metric(builder, perf_values, name, description, | |
366 | metric_type, labels); | |
367 | } | |
368 | } | |
369 | ||
370 | void DaemonMetricCollector::update_sockets() { | |
371 | std::string sock_dir = g_conf().get_val<std::string>("exporter_sock_dir"); | |
372 | clients.clear(); | |
373 | std::filesystem::path sock_path = sock_dir; | |
39ae355f | 374 | if (!std::filesystem::is_directory(sock_path.parent_path())) { |
2a845540 TL |
375 | dout(1) << "ERROR: No such directory exist" << sock_dir << dendl; |
376 | return; | |
377 | } | |
39ae355f | 378 | for (const auto &entry : std::filesystem::directory_iterator(sock_dir)) { |
2a845540 TL |
379 | if (entry.path().extension() == ".asok") { |
380 | std::string daemon_socket_name = entry.path().filename().string(); | |
381 | std::string daemon_name = | |
39ae355f | 382 | daemon_socket_name.substr(0, daemon_socket_name.size() - 5); |
2a845540 TL |
383 | if (clients.find(daemon_name) == clients.end() && |
384 | !(daemon_name.find("mgr") != std::string::npos) && | |
385 | !(daemon_name.find("ceph-exporter") != std::string::npos)) { | |
386 | AdminSocketClient sock(entry.path().string()); | |
387 | clients.insert({daemon_name, std::move(sock)}); | |
388 | } | |
389 | } | |
390 | } | |
391 | } | |
392 | ||
393 | void OrderedMetricsBuilder::add(std::string value, std::string name, | |
394 | std::string description, std::string mtype, | |
395 | labels_t labels) { | |
2a845540 TL |
396 | if (metrics.find(name) == metrics.end()) { |
397 | Metric metric(name, mtype, description); | |
398 | metrics[name] = std::move(metric); | |
399 | } | |
400 | Metric &metric = metrics[name]; | |
401 | metric.add(labels, value); | |
402 | } | |
403 | ||
404 | std::string OrderedMetricsBuilder::dump() { | |
405 | for (auto &[name, metric] : metrics) { | |
406 | out += metric.dump() + "\n"; | |
407 | } | |
408 | return out; | |
409 | } | |
410 | ||
411 | void UnorderedMetricsBuilder::add(std::string value, std::string name, | |
412 | std::string description, std::string mtype, | |
413 | labels_t labels) { | |
2a845540 TL |
414 | Metric metric(name, mtype, description); |
415 | metric.add(labels, value); | |
416 | out += metric.dump() + "\n\n"; | |
417 | } | |
418 | ||
419 | std::string UnorderedMetricsBuilder::dump() { return out; } | |
420 | ||
421 | void Metric::add(labels_t labels, std::string value) { | |
422 | metric_entry entry; | |
423 | entry.labels = labels; | |
424 | entry.value = value; | |
425 | entries.push_back(entry); | |
426 | } | |
427 | ||
428 | std::string Metric::dump() { | |
429 | std::stringstream metric_ss; | |
430 | metric_ss << "# HELP " << name << " " << description << "\n"; | |
431 | metric_ss << "# TYPE " << name << " " << mtype << "\n"; | |
432 | for (auto &entry : entries) { | |
433 | std::stringstream labels_ss; | |
434 | size_t i = 0; | |
435 | for (auto &[label_name, label_value] : entry.labels) { | |
436 | labels_ss << label_name << "=" << label_value; | |
437 | if (i < entry.labels.size() - 1) { | |
438 | labels_ss << ","; | |
439 | } | |
440 | i++; | |
441 | } | |
442 | metric_ss << name << "{" << labels_ss.str() << "} " << entry.value; | |
443 | if (&entry != &entries.back()) { | |
444 | metric_ss << "\n"; | |
445 | } | |
446 | } | |
447 | return metric_ss.str(); | |
448 | } | |
449 | ||
450 | DaemonMetricCollector &collector_instance() { | |
451 | static DaemonMetricCollector instance; | |
452 | return instance; | |
453 | } |