]>
git.proxmox.com Git - ceph.git/blob - ceph/src/common/HeartbeatMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2011 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include "HeartbeatMap.h"
19 #include "ceph_context.h"
20 #include "common/errno.h"
21 #include "common/valgrind.h"
24 #define dout_subsys ceph_subsys_heartbeatmap
26 #define dout_prefix *_dout << "heartbeat_map "
28 using std::chrono::duration_cast
;
29 using std::chrono::seconds
;
// Constructs a HeartbeatMap for the given CephContext.
// NOTE(review): only the m_unhealthy_workers(0) initializer is visible in
// this extract; the remaining member initializers and the constructor body
// are not shown here and should be checked against the full file.
34 HeartbeatMap::HeartbeatMap(CephContext
*cct
)
36 m_unhealthy_workers(0),
41 HeartbeatMap::~HeartbeatMap()
43 ceph_assert(m_workers
.empty());
// Registers a new worker with the heartbeat map.
//
//   name       label passed to the heartbeat_handle_d constructor; it is also
//              written to the log line below.
//   thread_id  pthread id stored on the handle; _check() passes it to
//              pthread_kill() when the suicide timeout is hit.
//
// The whole operation runs with m_rwlock held exclusively.
// NOTE(review): the return statement is not visible in this extract;
// presumably the newly allocated handle `h` is returned - confirm.
46 heartbeat_handle_d
*HeartbeatMap::add_worker(const string
& name
, pthread_t thread_id
)
48 std::unique_lock locker
{m_rwlock
};
49 ldout(m_cct
, 10) << "add_worker '" << name
<< "'" << dendl
;
// The handle is heap-allocated with a raw new; nothing in this function
// frees it.
50 heartbeat_handle_d
*h
= new heartbeat_handle_d(name
);
// Annotate both deadline fields as benign races: they are accessed with
// relaxed atomics elsewhere in this file (see _check / reset_timeout).
51 ANNOTATE_BENIGN_RACE_SIZED(&h
->timeout
, sizeof(h
->timeout
),
52 "heartbeat_handle_d timeout");
53 ANNOTATE_BENIGN_RACE_SIZED(&h
->suicide_timeout
, sizeof(h
->suicide_timeout
),
54 "heartbeat_handle_d suicide_timeout");
// Insert at the front and remember the iterator on the handle itself;
// remove_worker() erases through h->list_item.
55 m_workers
.push_front(h
);
56 h
->list_item
= m_workers
.begin();
57 h
->thread_id
= thread_id
;
// Unregisters a worker previously returned by add_worker().
//
// Takes m_rwlock exclusively, logs the worker name, and erases the handle's
// entry from m_workers via the iterator saved in h->list_item.
// NOTE(review): add_worker() allocates the handle with new; the statement
// that follows the erase is not visible in this extract - confirm the handle
// is freed here.
61 void HeartbeatMap::remove_worker(const heartbeat_handle_d
*h
)
63 std::unique_lock locker
{m_rwlock
};
64 ldout(m_cct
, 10) << "remove_worker '" << h
->name
<< "'" << dendl
;
65 m_workers
.erase(h
->list_item
);
// Evaluates one worker handle's deadlines against `now`.
//
//   h    handle whose timeout / suicide_timeout are inspected
//   who  caller label prefixed to the log messages
//   now  time the deadlines are compared with
//
// A deadline is only considered when it is non-zero (clock::is_zero() is
// false); clear_timeout() stores clock::zero() to disarm it.
// NOTE(review): the local result variable and the return statement are not
// visible in this extract; callers test the result with `!_check(...)`.
69 bool HeartbeatMap::_check(const heartbeat_handle_d
*h
, const char *who
,
70 ceph::coarse_mono_time now
)
// Ordinary timeout: armed and already in the past -> log at level 1.
73 if (auto was
= h
->timeout
.load(std::memory_order_relaxed
);
74 !clock::is_zero(was
) && was
< now
) {
75 ldout(m_cct
, 1) << who
<< " '" << h
->name
<< "'"
76 << " had timed out after " << h
->grace
<< dendl
;
// Suicide timeout: armed and already in the past -> log, signal the worker
// thread with SIGABRT, then abort the process.
79 if (auto was
= h
->suicide_timeout
.load(std::memory_order_relaxed
);
80 !clock::is_zero(was
) && was
< now
) {
81 ldout(m_cct
, 1) << who
<< " '" << h
->name
<< "'"
82 << " had suicide timed out after " << h
->suicide_grace
<< dendl
;
83 pthread_kill(h
->thread_id
, SIGABRT
);
85 ceph_abort_msg("hit suicide timeout");
// Re-arms a worker's deadlines relative to the current time.
//
//   h              handle to update
//   grace          interval added to `now` for the ordinary timeout
//                  (its parameter declaration is not visible in this extract,
//                  but it is used in the body)
//   suicide_grace  interval for the suicide timeout; only a value greater
//                  than zero arms it
//
// The existing deadlines are run through _check() first.
90 void HeartbeatMap::reset_timeout(heartbeat_handle_d
*h
,
92 ceph::timespan suicide_grace
)
94 ldout(m_cct
, 20) << "reset_timeout '" << h
->name
<< "' grace " << grace
95 << " suicide " << suicide_grace
<< dendl
;
96 const auto now
= clock::now();
97 _check(h
, "reset_timeout", now
);
// New ordinary deadline.
99 h
->timeout
.store(now
+ grace
, std::memory_order_relaxed
);
// NOTE(review): _check() logs h->grace, but no assignment to it is visible
// in this extract - confirm it is updated alongside the deadline.
102 if (suicide_grace
> ceph::timespan::zero()) {
103 h
->suicide_timeout
.store(now
+ suicide_grace
, std::memory_order_relaxed
);
// NOTE(review): presumably the else-branch (the branch keyword is not visible
// here): a non-positive suicide_grace disarms the suicide deadline.
105 h
->suicide_timeout
.store(clock::zero(), std::memory_order_relaxed
);
// Remember the interval; _check() prints it when the suicide deadline trips.
107 h
->suicide_grace
= suicide_grace
;
110 void HeartbeatMap::clear_timeout(heartbeat_handle_d
*h
)
112 ldout(m_cct
, 20) << "clear_timeout '" << h
->name
<< "'" << dendl
;
113 auto now
= clock::now();
114 _check(h
, "clear_timeout", now
);
115 h
->timeout
.store(clock::zero(), std::memory_order_relaxed
);
116 h
->suicide_timeout
.store(clock::zero(), std::memory_order_relaxed
);
// Reports whether the map is currently healthy and refreshes the cached
// worker counters.
//
// Visible steps, in order:
//   1. take m_rwlock in shared mode (manual lock_shared / unlock_shared);
//   2. if the heartbeat_inject_failure option is non-zero, set
//      m_inject_unhealthy_until to now + that many seconds and reset the
//      option to "0";
//   3. while now is before m_inject_unhealthy_until, log the injected failure;
//   4. run _check() on every registered worker;
//   5. release the lock, publish the counters, and log a summary.
//
// NOTE(review): the declarations and updates of `healthy`, `unhealthy` and
// `total`, the loop increment, and the return statement are not visible in
// this extract.
// NOTE(review): the shared lock is released by hand; if anything between
// lock_shared() and unlock_shared() can throw, the lock stays held - confirm,
// or consider a scoped std::shared_lock.
119 bool HeartbeatMap::is_healthy()
123 m_rwlock
.lock_shared();
124 auto now
= ceph::coarse_mono_clock::now();
// One-shot failure injection driven by configuration.
125 if (m_cct
->_conf
->heartbeat_inject_failure
) {
126 ldout(m_cct
, 0) << "is_healthy injecting failure for next " << m_cct
->_conf
->heartbeat_inject_failure
<< " seconds" << dendl
;
127 m_inject_unhealthy_until
= now
+ std::chrono::seconds(m_cct
->_conf
->heartbeat_inject_failure
);
// Reset the option so the injection window is armed only once.
128 m_cct
->_conf
.set_val("heartbeat_inject_failure", "0");
// Still inside a previously armed injection window.
132 if (now
< m_inject_unhealthy_until
) {
133 auto sec
= std::chrono::duration_cast
<std::chrono::seconds
>(m_inject_unhealthy_until
- now
).count();
134 ldout(m_cct
, 0) << "is_healthy = false, injected failure for next "
135 << sec
<< " seconds" << dendl
;
// Check every registered worker against the same `now`.
139 for (auto p
= m_workers
.begin();
140 p
!= m_workers
.end();
142 heartbeat_handle_d
*h
= *p
;
143 if (!_check(h
, "is_healthy", now
)) {
149 m_rwlock
.unlock_shared();
// Publish the counters read back by get_unhealthy_workers() and
// get_total_workers().
151 m_unhealthy_workers
= unhealthy
;
152 m_total_workers
= total
;
154 ldout(m_cct
, 20) << "is_healthy = " << (healthy
? "healthy" : "NOT HEALTHY")
155 << ", total workers: " << total
<< ", number of unhealthy: " << unhealthy
<< dendl
;
159 int HeartbeatMap::get_unhealthy_workers() const
161 return m_unhealthy_workers
;
164 int HeartbeatMap::get_total_workers() const
166 return m_total_workers
;
// Touches the configured heartbeat file when the map is healthy.
//
// Nothing is done when the heartbeat_file option is empty or when
// is_healthy() returns false. Otherwise the file is opened write-only
// (created with mode 0644 if it does not exist) and utime() is called with a
// NULL times argument, which sets its timestamps to the current time.
// A failure is logged at level 0 together with the errno text.
//
// NOTE(review): the condition that selects between the utime() call and the
// error log, and whatever follows the utime() call, are not visible in this
// extract - confirm that `fd` is closed on the success path.
169 void HeartbeatMap::check_touch_file()
171 string path
= m_cct
->_conf
->heartbeat_file
;
172 if (path
.length() && is_healthy()) {
173 int fd
= ::open(path
.c_str(), O_WRONLY
|O_CREAT
|O_CLOEXEC
, 0644);
175 ::utime(path
.c_str(), NULL
);
178 ldout(m_cct
, 0) << "unable to touch " << path
<< ": "
179 << cpp_strerror(errno
) << dendl
;