]> git.proxmox.com Git - ceph.git/blob - ceph/src/common/HeartbeatMap.cc
bump version to 18.2.4-pve3
[ceph.git] / ceph / src / common / HeartbeatMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2011 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
#include <utime.h>
#include <signal.h>

#include <shared_mutex>

#include "HeartbeatMap.h"
#include "ceph_context.h"
#include "common/errno.h"
#include "common/valgrind.h"
#include "debug.h"
23
24 #define dout_subsys ceph_subsys_heartbeatmap
25 #undef dout_prefix
26 #define dout_prefix *_dout << "heartbeat_map "
27
28 using std::chrono::duration_cast;
29 using std::chrono::seconds;
30 using std::string;
31
32 namespace ceph {
33
34 HeartbeatMap::HeartbeatMap(CephContext *cct)
35 : m_cct(cct),
36 m_unhealthy_workers(0),
37 m_total_workers(0)
38 {
39 }
40
41 HeartbeatMap::~HeartbeatMap()
42 {
43 ceph_assert(m_workers.empty());
44 }
45
46 heartbeat_handle_d *HeartbeatMap::add_worker(const string& name, pthread_t thread_id)
47 {
48 std::unique_lock locker{m_rwlock};
49 ldout(m_cct, 10) << "add_worker '" << name << "'" << dendl;
50 heartbeat_handle_d *h = new heartbeat_handle_d(name);
51 ANNOTATE_BENIGN_RACE_SIZED(&h->timeout, sizeof(h->timeout),
52 "heartbeat_handle_d timeout");
53 ANNOTATE_BENIGN_RACE_SIZED(&h->suicide_timeout, sizeof(h->suicide_timeout),
54 "heartbeat_handle_d suicide_timeout");
55 m_workers.push_front(h);
56 h->list_item = m_workers.begin();
57 h->thread_id = thread_id;
58 return h;
59 }
60
61 void HeartbeatMap::remove_worker(const heartbeat_handle_d *h)
62 {
63 std::unique_lock locker{m_rwlock};
64 ldout(m_cct, 10) << "remove_worker '" << h->name << "'" << dendl;
65 m_workers.erase(h->list_item);
66 delete h;
67 }
68
69 bool HeartbeatMap::_check(const heartbeat_handle_d *h, const char *who,
70 ceph::coarse_mono_time now)
71 {
72 bool healthy = true;
73 if (auto was = h->timeout.load(std::memory_order_relaxed);
74 !clock::is_zero(was) && was < now) {
75 ldout(m_cct, 1) << who << " '" << h->name << "'"
76 << " had timed out after " << h->grace << dendl;
77 healthy = false;
78 }
79 if (auto was = h->suicide_timeout.load(std::memory_order_relaxed);
80 !clock::is_zero(was) && was < now) {
81 ldout(m_cct, 1) << who << " '" << h->name << "'"
82 << " had suicide timed out after " << h->suicide_grace << dendl;
83 pthread_kill(h->thread_id, SIGABRT);
84 sleep(1);
85 ceph_abort_msg("hit suicide timeout");
86 }
87 return healthy;
88 }
89
90 void HeartbeatMap::reset_timeout(heartbeat_handle_d *h,
91 ceph::timespan grace,
92 ceph::timespan suicide_grace)
93 {
94 ldout(m_cct, 20) << "reset_timeout '" << h->name << "' grace " << grace
95 << " suicide " << suicide_grace << dendl;
96 const auto now = clock::now();
97 _check(h, "reset_timeout", now);
98
99 h->timeout.store(now + grace, std::memory_order_relaxed);
100 h->grace = grace;
101
102 if (suicide_grace > ceph::timespan::zero()) {
103 h->suicide_timeout.store(now + suicide_grace, std::memory_order_relaxed);
104 } else {
105 h->suicide_timeout.store(clock::zero(), std::memory_order_relaxed);
106 }
107 h->suicide_grace = suicide_grace;
108 }
109
110 void HeartbeatMap::clear_timeout(heartbeat_handle_d *h)
111 {
112 ldout(m_cct, 20) << "clear_timeout '" << h->name << "'" << dendl;
113 auto now = clock::now();
114 _check(h, "clear_timeout", now);
115 h->timeout.store(clock::zero(), std::memory_order_relaxed);
116 h->suicide_timeout.store(clock::zero(), std::memory_order_relaxed);
117 }
118
119 bool HeartbeatMap::is_healthy()
120 {
121 int unhealthy = 0;
122 int total = 0;
123 m_rwlock.lock_shared();
124 auto now = ceph::coarse_mono_clock::now();
125 if (m_cct->_conf->heartbeat_inject_failure) {
126 ldout(m_cct, 0) << "is_healthy injecting failure for next " << m_cct->_conf->heartbeat_inject_failure << " seconds" << dendl;
127 m_inject_unhealthy_until = now + std::chrono::seconds(m_cct->_conf->heartbeat_inject_failure);
128 m_cct->_conf.set_val("heartbeat_inject_failure", "0");
129 }
130
131 bool healthy = true;
132 if (now < m_inject_unhealthy_until) {
133 auto sec = std::chrono::duration_cast<std::chrono::seconds>(m_inject_unhealthy_until - now).count();
134 ldout(m_cct, 0) << "is_healthy = false, injected failure for next "
135 << sec << " seconds" << dendl;
136 healthy = false;
137 }
138
139 for (auto p = m_workers.begin();
140 p != m_workers.end();
141 ++p) {
142 heartbeat_handle_d *h = *p;
143 if (!_check(h, "is_healthy", now)) {
144 healthy = false;
145 unhealthy++;
146 }
147 total++;
148 }
149 m_rwlock.unlock_shared();
150
151 m_unhealthy_workers = unhealthy;
152 m_total_workers = total;
153
154 ldout(m_cct, 20) << "is_healthy = " << (healthy ? "healthy" : "NOT HEALTHY")
155 << ", total workers: " << total << ", number of unhealthy: " << unhealthy << dendl;
156 return healthy;
157 }
158
159 int HeartbeatMap::get_unhealthy_workers() const
160 {
161 return m_unhealthy_workers;
162 }
163
164 int HeartbeatMap::get_total_workers() const
165 {
166 return m_total_workers;
167 }
168
169 void HeartbeatMap::check_touch_file()
170 {
171 string path = m_cct->_conf->heartbeat_file;
172 if (path.length() && is_healthy()) {
173 int fd = ::open(path.c_str(), O_WRONLY|O_CREAT|O_CLOEXEC, 0644);
174 if (fd >= 0) {
175 ::utime(path.c_str(), NULL);
176 ::close(fd);
177 } else {
178 ldout(m_cct, 0) << "unable to touch " << path << ": "
179 << cpp_strerror(errno) << dendl;
180 }
181 }
182 }
183
184 }