// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2011 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#include "HeartbeatMap.h"

#include <unistd.h>  // sleep(), close()

#include "ceph_context.h"
#include "common/errno.h"
// Logging configuration for this translation unit: messages are attributed
// to the heartbeatmap subsystem and start with the literal "heartbeat_map ".
// (Presumably both macros are consumed by ldout(); that macro is defined
// outside this file.)
//
// NOTE(review): the extracted text lost the short lines around these
// defines.  The #undef was restored so that a dout_prefix supplied by an
// included header is not redefined with a different body.  Upstream also
// appears to include a debug header just above and to open a namespace just
// below (with its closing brace at the end of the file); neither is restored
// here -- confirm against upstream.
#define dout_subsys ceph_subsys_heartbeatmap
#undef dout_prefix
#define dout_prefix *_dout << "heartbeat_map "
28 HeartbeatMap::HeartbeatMap(CephContext
*cct
)
30 m_rwlock("HeartbeatMap::m_rwlock"),
31 m_inject_unhealthy_until(0),
32 m_unhealthy_workers(0),
37 HeartbeatMap::~HeartbeatMap()
39 assert(m_workers
.empty());
42 heartbeat_handle_d
*HeartbeatMap::add_worker(const string
& name
, pthread_t thread_id
)
45 ldout(m_cct
, 10) << "add_worker '" << name
<< "'" << dendl
;
46 heartbeat_handle_d
*h
= new heartbeat_handle_d(name
);
47 ANNOTATE_BENIGN_RACE_SIZED(&h
->timeout
, sizeof(h
->timeout
),
48 "heartbeat_handle_d timeout");
49 ANNOTATE_BENIGN_RACE_SIZED(&h
->suicide_timeout
, sizeof(h
->suicide_timeout
),
50 "heartbeat_handle_d suicide_timeout");
51 m_workers
.push_front(h
);
52 h
->list_item
= m_workers
.begin();
53 h
->thread_id
= thread_id
;
58 void HeartbeatMap::remove_worker(const heartbeat_handle_d
*h
)
61 ldout(m_cct
, 10) << "remove_worker '" << h
->name
<< "'" << dendl
;
62 m_workers
.erase(h
->list_item
);
67 bool HeartbeatMap::_check(const heartbeat_handle_d
*h
, const char *who
, time_t now
)
73 if (was
&& was
< now
) {
74 ldout(m_cct
, 1) << who
<< " '" << h
->name
<< "'"
75 << " had timed out after " << h
->grace
<< dendl
;
78 was
= h
->suicide_timeout
;
79 if (was
&& was
< now
) {
80 ldout(m_cct
, 1) << who
<< " '" << h
->name
<< "'"
81 << " had suicide timed out after " << h
->suicide_grace
<< dendl
;
82 pthread_kill(h
->thread_id
, SIGABRT
);
84 assert(0 == "hit suicide timeout");
89 void HeartbeatMap::reset_timeout(heartbeat_handle_d
*h
, time_t grace
, time_t suicide_grace
)
91 ldout(m_cct
, 20) << "reset_timeout '" << h
->name
<< "' grace " << grace
92 << " suicide " << suicide_grace
<< dendl
;
93 time_t now
= time(NULL
);
94 _check(h
, "reset_timeout", now
);
96 h
->timeout
= now
+ grace
;
100 h
->suicide_timeout
= now
+ suicide_grace
;
102 h
->suicide_timeout
= 0;
103 h
->suicide_grace
= suicide_grace
;
106 void HeartbeatMap::clear_timeout(heartbeat_handle_d
*h
)
108 ldout(m_cct
, 20) << "clear_timeout '" << h
->name
<< "'" << dendl
;
109 time_t now
= time(NULL
);
110 _check(h
, "clear_timeout", now
);
112 h
->suicide_timeout
= 0;
115 bool HeartbeatMap::is_healthy()
120 time_t now
= time(NULL
);
121 if (m_cct
->_conf
->heartbeat_inject_failure
) {
122 ldout(m_cct
, 0) << "is_healthy injecting failure for next " << m_cct
->_conf
->heartbeat_inject_failure
<< " seconds" << dendl
;
123 m_inject_unhealthy_until
= now
+ m_cct
->_conf
->heartbeat_inject_failure
;
124 m_cct
->_conf
->set_val("heartbeat_inject_failure", "0");
128 if (now
< m_inject_unhealthy_until
) {
129 ldout(m_cct
, 0) << "is_healthy = false, injected failure for next " << (m_inject_unhealthy_until
- now
) << " seconds" << dendl
;
133 for (list
<heartbeat_handle_d
*>::iterator p
= m_workers
.begin();
134 p
!= m_workers
.end();
136 heartbeat_handle_d
*h
= *p
;
137 if (!_check(h
, "is_healthy", now
)) {
145 m_unhealthy_workers
= unhealthy
;
146 m_total_workers
= total
;
148 ldout(m_cct
, 20) << "is_healthy = " << (healthy
? "healthy" : "NOT HEALTHY")
149 << ", total workers: " << total
<< ", number of unhealthy: " << unhealthy
<< dendl
;
153 int HeartbeatMap::get_unhealthy_workers() const
155 return m_unhealthy_workers
;
158 int HeartbeatMap::get_total_workers() const
160 return m_total_workers
;
163 void HeartbeatMap::check_touch_file()
166 string path
= m_cct
->_conf
->heartbeat_file
;
168 int fd
= ::open(path
.c_str(), O_WRONLY
|O_CREAT
|O_CLOEXEC
, 0644);
170 ::utimes(path
.c_str(), NULL
);
173 ldout(m_cct
, 0) << "unable to touch " << path
<< ": "
174 << cpp_strerror(errno
) << dendl
;