#else
#include <sys/types.h>
#endif
+#include <time.h>
+#include <sys/time.h>
typedef int64_t cs_time_t;
#define CS_TRUE !CS_FALSE
#define CS_MAX_NAME_LENGTH 256
#define CS_TIME_END ((cs_time_t)0x7FFFFFFFFFFFFFFFULL)
+/*
+ * Larger of x and y.  Function-like macro: the argument that is selected
+ * is evaluated twice, so do not pass expressions with side effects.
+ */
+#define CS_MAX(x, y) (((x) > (y)) ? (x) : (y))
typedef struct {
uint16_t length;
} cs_error_t;
+/*
+ * Time unit conversion factors, named CS_TIME_<small unit>_IN_<large unit>.
+ * All are unsigned long long literals, so arithmetic that involves them is
+ * carried out in (at least) 64-bit unsigned precision.
+ */
+#define CS_TIME_MS_IN_SEC 1000ULL
+#define CS_TIME_US_IN_SEC 1000000ULL
+#define CS_TIME_NS_IN_SEC 1000000000ULL
+#define CS_TIME_US_IN_MSEC 1000ULL
+#define CS_TIME_NS_IN_MSEC 1000000ULL
+#define CS_TIME_NS_IN_USEC 1000ULL
+/*
+ * Return a timestamp in nanoseconds.
+ *
+ * CLOCK_MONOTONIC is used when _POSIX_MONOTONIC_CLOCK advertises it at
+ * compile time.  A value of 0 for that macro only means the clock *may*
+ * be available at run time, so clock_gettime() is allowed to fail; in
+ * that case, and when the macro is not usable at all, the timestamp is
+ * derived from gettimeofday() instead (time since the epoch, microsecond
+ * resolution scaled to nanoseconds).
+ *
+ * Returns 0 if no clock could be read.
+ *
+ * NOTE(review): _POSIX_MONOTONIC_CLOCK is provided by <unistd.h>; confirm
+ * that header is included before this point, otherwise the gettimeofday()
+ * path is selected silently.
+ */
+static inline uint64_t cs_timestamp_get(void)
+{
+	struct timeval time_from_epoch;
+#if defined _POSIX_MONOTONIC_CLOCK && _POSIX_MONOTONIC_CLOCK >= 0
+	struct timespec ts;
+
+	if (clock_gettime (CLOCK_MONOTONIC, &ts) == 0) {
+		return ((uint64_t)ts.tv_sec * CS_TIME_NS_IN_SEC) + (uint64_t)ts.tv_nsec;
+	}
+	/* monotonic clock not available at run time: fall through */
+#endif
+
+	if (gettimeofday (&time_from_epoch, NULL) != 0) {
+		return 0;
+	}
+
+	return ((uint64_t)time_from_epoch.tv_sec * CS_TIME_NS_IN_SEC) +
+		((uint64_t)time_from_epoch.tv_usec * CS_TIME_NS_IN_USEC);
+}
+
+
/*
* DEPRECATED
*/
#define QUORUM_ERR_SECURITY CS_ERR_SECURITY
#define quorum_error_t cs_error_t
-#endif
+#endif /* COROTYPES_H_DEFINED */
+
#include <signal.h>
#define SAM_CONFDB_S_FAILED "failed"
-#define SAM_CONFDB_S_REGISTERED "registered"
-#define SAM_CONFDB_S_STARTED "started"
+#define SAM_CONFDB_S_REGISTERED "stopped"
+#define SAM_CONFDB_S_STARTED "running"
#define SAM_CONFDB_S_Q_WAIT "waiting for quorum"
#define SAM_RP_MASK_Q(pol) (pol & (~SAM_RECOVERY_POLICY_QUORUM))
cs_error_t err;
const char *svalue;
uint64_t hc_period, last_hc;
- struct timeval tv;
const char *ssvalue[] = { [SAM_RECOVERY_POLICY_QUIT] = "quit", [SAM_RECOVERY_POLICY_RESTART] = "restart" };
switch (key) {
hc_period = sam_internal_data.time_interval;
if ((err = confdb_key_create_typed (sam_internal_data.confdb_handle, sam_internal_data.confdb_pid_handle,
- "hc_period", &hc_period, sizeof (uint64_t), CONFDB_VALUETYPE_UINT64)) != CS_OK) {
+ "poll_period", &hc_period, sizeof (hc_period), CONFDB_VALUETYPE_UINT64)) != CS_OK) {
goto exit_error;
}
break;
case SAM_CONFDB_KEY_LAST_HC:
- if (gettimeofday (&tv, NULL) == -1) {
- last_hc = 0;
- } else {
- last_hc = ((uint64_t)tv.tv_sec * 1000) + ((uint64_t)tv.tv_usec / 1000);
- }
+ last_hc = cs_timestamp_get();
if ((err = confdb_key_create_typed (sam_internal_data.confdb_handle, sam_internal_data.confdb_pid_handle,
- "hc_last", &last_hc, sizeof (uint64_t), CONFDB_VALUETYPE_UINT64)) != CS_OK) {
+ "last_updated", &last_hc, sizeof (last_hc), CONFDB_VALUETYPE_UINT64)) != CS_OK) {
goto exit_error;
}
break;
.IP \(bu 3
\fIrecovery\fR - will be quit or restart depending on policy
.IP \(bu 3
-\fIhc_period\fR - period of health checking in milliseconds
+\fIpoll_period\fR - period of health checking in milliseconds
.IP \(bu 3
-\fIhc_last\fR - last known GMT time in milliseconds when health check was received
+\fIlast_updated\fR - timestamp in nanoseconds of the last health check (taken from the monotonic clock when available, so it is not a wall-clock time)
.IP \(bu 3
-\fIstate\fR - state of process (can be one of registered, started, failed, waiting for quorum)
+\fIstate\fR - state of process (can be one of stopped, running, failed, waiting for quorum)
.RE
#include <corosync/lcr/lcr_comp.h>
#include <corosync/engine/coroapi.h>
#include <corosync/list.h>
-#include <corosync/totem/coropoll.h>
#include <corosync/engine/logsys.h>
#include "../exec/fsm.h"
LOGSYS_DECLARE_SUBSYS ("MON");
-#undef ENTER
-#define ENTER() log_printf (LOGSYS_LEVEL_INFO, "%s", __func__)
-
/*
* Service Interfaces required by service_message_handler struct
*/
static int mon_exec_init_fn (
struct corosync_api_v1 *corosync_api);
-hdb_handle_t mon_poll = 0;
static struct corosync_api_v1 *api;
static hdb_handle_t resources_obj;
-static pthread_t mon_poll_thread;
-#define MON_DEFAULT_PERIOD 3
+#define MON_DEFAULT_PERIOD 3000
+#define MON_MIN_PERIOD 500
+#define MON_MAX_PERIOD (120 * CS_TIME_MS_IN_SEC)
struct corosync_service_engine mon_service_engine = {
.name = "corosync resource monitoring service",
struct resource_instance {
hdb_handle_t handle;
const char *name;
- poll_timer_handle timer_handle;
+ corosync_timer_handle_t timer_handle;
void (*update_stats_fn) (void *data);
struct cs_fsm fsm;
- int32_t period;
+ uint64_t period;
objdb_value_types_t max_type;
union {
int32_t int32;
static void mon_config_changed (struct cs_fsm* fsm, int32_t event, void * data);
static void mon_resource_failed (struct cs_fsm* fsm, int32_t event, void * data);
-const char * mon_ok_str = "ok";
+const char * mon_running_str = "running";
const char * mon_failed_str = "failed";
const char * mon_failure_str = "failure";
-const char * mon_disabled_str = "disabled";
+const char * mon_stopped_str = "stopped";
const char * mon_config_changed_str = "config_changed";
enum mon_resource_state {
- MON_S_DISABLED,
- MON_S_OK,
+ MON_S_STOPPED,
+ MON_S_RUNNING,
MON_S_FAILED
};
enum mon_resource_event {
};
struct cs_fsm_entry mon_fsm_table[] = {
- { MON_S_DISABLED, MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_DISABLED, MON_S_OK, -1} },
- { MON_S_DISABLED, MON_E_FAILURE, NULL, {-1} },
- { MON_S_OK, MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_OK, MON_S_DISABLED, -1} },
- { MON_S_OK, MON_E_FAILURE, mon_resource_failed, {MON_S_FAILED, -1} },
- { MON_S_FAILED, MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_OK, MON_S_DISABLED, -1} },
- { MON_S_FAILED, MON_E_FAILURE, NULL, {-1} },
+ { MON_S_STOPPED, MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_STOPPED, MON_S_RUNNING, -1} },
+ { MON_S_STOPPED, MON_E_FAILURE, NULL, {-1} },
+ { MON_S_RUNNING, MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_RUNNING, MON_S_STOPPED, -1} },
+ { MON_S_RUNNING, MON_E_FAILURE, mon_resource_failed, {MON_S_FAILED, -1} },
+ { MON_S_FAILED, MON_E_CONFIG_CHANGED, mon_config_changed, {MON_S_RUNNING, MON_S_STOPPED, -1} },
+ { MON_S_FAILED, MON_E_FAILURE, NULL, {-1} },
};
/*
int32_t state)
{
switch (state) {
- case MON_S_DISABLED:
- return mon_disabled_str;
+ case MON_S_STOPPED:
+ return mon_stopped_str;
break;
- case MON_S_OK:
- return mon_ok_str;
+ case MON_S_RUNNING:
+ return mon_running_str;
break;
case MON_S_FAILED:
return mon_failed_str;
return NULL;
}
+/*
+ * Parse an unsigned integer from str and range-check it.
+ *
+ * The base is auto-detected (decimal, 0x hex, leading-0 octal).  Leading
+ * white space is skipped; characters following the number are ignored.
+ * A leading '-' is rejected, because strtoull() would otherwise negate
+ * the value silently and could wrap it back into the accepted range.
+ *
+ * str       - NUL terminated string to parse
+ * out_value - receives the parsed value; written only on success
+ * min, max  - inclusive bounds for an acceptable value
+ *
+ * Returns CS_OK on success, CS_ERR_INVALID_PARAM when no number could be
+ * parsed, the number overflows, or it lies outside [min, max].
+ */
+static cs_error_t str_to_uint64_t(const char* str, uint64_t *out_value, uint64_t min, uint64_t max)
+{
+	const char *digits;
+	char *endptr;
+	unsigned long long parsed;
+
+	digits = str + strspn(str, " \t\n\v\f\r");
+	if (*digits == '-') {
+		return CS_ERR_INVALID_PARAM;
+	}
+
+	errno = 0;
+	parsed = strtoull(digits, &endptr, 0);
+
+	/* errno reports overflow; endptr == digits means nothing was consumed */
+	if (errno != 0 || endptr == digits) {
+		return CS_ERR_INVALID_PARAM;
+	}
+
+	if (parsed > max || parsed < min) {
+		return CS_ERR_INVALID_PARAM;
+	}
+
+	*out_value = parsed;
+	return CS_OK;
+}
+
static void mon_fsm_state_set (struct cs_fsm* fsm,
enum mon_resource_state next_state, struct resource_instance* inst)
{
char *str;
size_t str_len;
objdb_value_types_t type;
- int32_t tmp_value;
+ uint64_t tmp_value;
int32_t res;
ENTER();
(void**)&str, &str_len,
&type);
if (res == 0) {
- tmp_value = strtol (str, NULL, 0);
- if (tmp_value > 0 && tmp_value < 120) {
- if (inst->period != tmp_value) {
- inst->period = tmp_value;
- }
+ if (str_to_uint64_t(str, &tmp_value, MON_MIN_PERIOD, MON_MAX_PERIOD) == CS_OK) {
+ log_printf (LOGSYS_LEVEL_DEBUG,
+ "poll_period changing from:%"PRIu64" to %"PRIu64".",
+ inst->period, tmp_value);
+ inst->period = tmp_value;
+ } else {
+ log_printf (LOGSYS_LEVEL_WARNING,
+ "Could NOT use poll_period:%s ms for resource %s",
+ str, inst->name);
}
}
+ if (inst->timer_handle) {
+ api->timer_delete(inst->timer_handle);
+ inst->timer_handle = 0;
+ }
res = api->object_key_get_typed (inst->handle, "max",
(void**)&str, &str_len, &type);
if (res != 0) {
if (inst->max_type == OBJDB_VALUETYPE_DOUBLE) {
inst->max.dbl = INT32_MAX;
}
- mon_fsm_state_set (fsm, MON_S_DISABLED, inst);
+ mon_fsm_state_set (fsm, MON_S_STOPPED, inst);
} else {
if (inst->max_type == OBJDB_VALUETYPE_INT32) {
inst->max.int32 = strtol (str, NULL, 0);
if (inst->max_type == OBJDB_VALUETYPE_DOUBLE) {
inst->max.dbl = strtod (str, NULL);
}
- mon_fsm_state_set (fsm, MON_S_OK, inst);
- }
-
- if (mon_poll == 0) {
- return;
+ mon_fsm_state_set (fsm, MON_S_RUNNING, inst);
+ /*
+ * Run the updater now, in case the period has shortened;
+ * the updater also starts the timer.
+ */
+ inst->update_stats_fn (inst);
}
- poll_timer_delete (mon_poll, inst->timer_handle);
- /*
- * run the updater, incase the period has shortened
- */
- inst->update_stats_fn (inst);
- poll_timer_add (mon_poll,
- inst->period * 1000, NULL,
- inst->update_stats_fn,
- &inst->timer_handle);
}
void mon_resource_failed (struct cs_fsm* fsm, int32_t event, void * data)
"current", strlen("current"),
&new_value, sizeof(new_value));
- timestamp = time (NULL);
+ timestamp = cs_timestamp_get();
api->object_key_replace (inst->handle,
"last_updated", strlen("last_updated"),
- ×tamp, sizeof(time_t));
+ ×tamp, sizeof(uint64_t));
- if (new_value > inst->max.int32) {
+ if (new_value > inst->max.int32 && inst->fsm.curr_state != MON_S_FAILED) {
cs_fsm_process (&inst->fsm, MON_E_FAILURE, inst);
}
}
- poll_timer_add (mon_poll,
- inst->period * 1000, inst,
- inst->update_stats_fn,
- &inst->timer_handle);
+ api->timer_add_duration(inst->period * MILLI_2_NANO_SECONDS,
+ inst, inst->update_stats_fn, &inst->timer_handle);
}
static double min15_loadavg_get(void)
int32_t res = 0;
double min15 = min15_loadavg_get();
- if (min15 < 0) {
- }
- res = api->object_key_replace (inst->handle,
- "current", strlen("current"),
- &min15, sizeof (min15));
- if (res != 0)
- log_printf (LOGSYS_LEVEL_ERROR, "replace current failed: %d", res);
-
- timestamp = cs_timestamp_get();
-
- res = api->object_key_replace (inst->handle,
- "last_updated", strlen("last_updated"),
- ×tamp, sizeof(uint64_t));
- if (res != 0)
- log_printf (LOGSYS_LEVEL_ERROR, "replace last_updated failed: %d", res);
-
- if (min15 > inst->max.dbl) {
- cs_fsm_process (&inst->fsm, MON_E_FAILURE, &inst);
- }
-
- poll_timer_add (mon_poll,
- inst->period * 1000, inst,
- inst->update_stats_fn,
- &inst->timer_handle);
-}
-
-static void *mon_thread_handler (void * unused)
-{
-#ifdef HAVE_LIBSTATGRAB
- sg_init();
-#endif /* HAVE_LIBSTATGRAB */
- mon_poll = poll_create ();
-
- poll_timer_add (mon_poll,
- memory_used_inst.period * 1000,
- &memory_used_inst,
- memory_used_inst.update_stats_fn,
- &memory_used_inst.timer_handle);
+ if (min15 > 0) {
+ res = api->object_key_replace (inst->handle,
+ "current", strlen("current"),
+ &min15, sizeof (min15));
+ if (res != 0) {
+ log_printf (LOGSYS_LEVEL_ERROR, "replace current failed: %d", res);
+ }
+ timestamp = cs_timestamp_get();
- poll_timer_add (mon_poll,
- load_15min_inst.period * 1000,
- &load_15min_inst,
- load_15min_inst.update_stats_fn,
- &load_15min_inst.timer_handle);
- poll_run (mon_poll);
+ res = api->object_key_replace (inst->handle,
+ "last_updated", strlen("last_updated"),
+ ×tamp, sizeof(uint64_t));
+ if (res != 0) {
+ log_printf (LOGSYS_LEVEL_ERROR, "replace last_updated failed: %d", res);
+ }
+ if (min15 > inst->max.dbl && inst->fsm.curr_state != MON_S_FAILED) {
+ cs_fsm_process (&inst->fsm, MON_E_FAILURE, &inst);
+ }
+ }
- return NULL;
+ api->timer_add_duration(inst->period * MILLI_2_NANO_SECONDS,
+ inst, inst->update_stats_fn, &inst->timer_handle);
}
static int object_find_or_create (
return ret;
}
+/*
+ * Callback handed to api->object_track_start() together with the
+ * resource_instance as private data.  When a resource_instance is
+ * supplied, log a warning naming it and feed MON_E_CONFIG_CHANGED into
+ * its state machine; without one, do nothing.
+ */
+static void mon_object_destroyed(
+	hdb_handle_t parent_object_handle,
+	const void *name_pt, size_t name_len,
+	void *priv_data_pt)
+{
+	struct resource_instance *resource = priv_data_pt;
+
+	if (resource == NULL) {
+		return;
+	}
+
+	log_printf (LOGSYS_LEVEL_WARNING,
+		"resource \"%s\" deleted from objdb!",
+		resource->name);
+
+	cs_fsm_process (&resource->fsm, MON_E_CONFIG_CHANGED, resource);
+}
+
+
static void mon_key_change_notify (object_change_type_t change_type,
hdb_handle_t parent_object_handle,
hdb_handle_t object_handle,
{
struct resource_instance* inst = (struct resource_instance*)priv_data_pt;
- if ((strcmp ((char*)key_name_pt, "max") == 0) ||
- (strcmp ((char*)key_name_pt, "poll_period") == 0)) {
+ if ((strncmp ((char*)key_name_pt, "max", key_len) == 0) ||
+ (strncmp ((char*)key_name_pt, "poll_period", key_len) == 0)) {
ENTER();
cs_fsm_process (&inst->fsm, MON_E_CONFIG_CHANGED, inst);
}
{
int32_t res;
char mon_period_str[32];
+ char *str;
size_t mon_period_len;
objdb_value_types_t mon_period_type;
- int32_t tmp_value;
+ uint64_t tmp_value;
int32_t zero_32 = 0;
time_t zero_64 = 0;
double zero_double = 0;
- ENTER();
-
object_find_or_create (parent,
&inst->handle,
inst->name, strlen (inst->name));
api->object_key_create_typed (inst->handle,
"last_updated", &zero_64,
- sizeof (time_t), OBJDB_VALUETYPE_INT64);
+ sizeof (uint64_t), OBJDB_VALUETYPE_UINT64);
api->object_key_create_typed (inst->handle,
- "state", mon_disabled_str, strlen (mon_disabled_str),
+ "state", mon_stopped_str, strlen (mon_stopped_str),
OBJDB_VALUETYPE_STRING);
inst->fsm.name = inst->name;
inst->fsm.curr_entry = 0;
- inst->fsm.curr_state = MON_S_DISABLED;
+ inst->fsm.curr_state = MON_S_STOPPED;
inst->fsm.table = mon_fsm_table;
inst->fsm.entries = sizeof(mon_fsm_table) / sizeof(struct cs_fsm_entry);
inst->fsm.state_to_str = mon_res_state_to_str;
res = api->object_key_get_typed (inst->handle,
"poll_period",
- (void**)&mon_period_str, &mon_period_len,
+ (void**)&str, &mon_period_len,
&mon_period_type);
if (res != 0) {
- mon_period_len = snprintf (mon_period_str, 32, "%d",
+ mon_period_len = snprintf (mon_period_str, 32, "%"PRIu64"",
inst->period);
api->object_key_create_typed (inst->handle,
"poll_period", &mon_period_str,
OBJDB_VALUETYPE_STRING);
}
else {
- tmp_value = strtol (mon_period_str, NULL, 0);
- if (tmp_value > 0 && tmp_value < 120)
+ if (str_to_uint64_t(str, &tmp_value, MON_MIN_PERIOD, MON_MAX_PERIOD) == CS_OK) {
inst->period = tmp_value;
+ } else {
+ log_printf (LOGSYS_LEVEL_WARNING,
+ "Could NOT use poll_period:%s ms for resource %s",
+ str, inst->name);
+ }
}
cs_fsm_process (&inst->fsm, MON_E_CONFIG_CHANGED, inst);
- poll_timer_add (mon_poll,
- inst->period * 1000, inst,
- inst->update_stats_fn,
- &inst->timer_handle);
-
- api->object_track_start (inst->handle, OBJECT_TRACK_DEPTH_ONE,
+ api->object_track_start (inst->handle, OBJECT_TRACK_DEPTH_RECURSIVE,
mon_key_change_notify,
- NULL, NULL, NULL, NULL);
+ NULL, mon_object_destroyed, NULL, inst);
}
hdb_handle_t obj;
hdb_handle_t parent;
+#ifdef HAVE_LIBSTATGRAB
+ sg_init();
+#endif /* HAVE_LIBSTATGRAB */
+
#ifdef COROSYNC_SOLARIS
logsys_subsys_init();
#endif
api = corosync_api;
- ENTER();
object_find_or_create (OBJECT_PARENT_HANDLE,
&resources_obj,
mon_instance_init (parent, &memory_used_inst);
mon_instance_init (parent, &load_15min_inst);
-
- pthread_create (&mon_poll_thread, NULL, mon_thread_handler, NULL);
-
return 0;
}
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/watchdog.h>
-#include <linux/reboot.h>
+#include <sys/reboot.h>
#include <corosync/corotypes.h>
#include <corosync/corodefs.h>
struct resource {
hdb_handle_t handle;
char *recovery;
- char name[128];
+ char name[CS_MAX_NAME_LENGTH];
time_t last_updated;
struct cs_fsm fsm;
corosync_timer_handle_t check_timer;
- uint32_t check_timeout;
+ uint64_t check_timeout;
};
LOGSYS_DECLARE_SUBSYS("WD");
static void wd_resource_check_fn (void* resource_ref);
static struct corosync_api_v1 *api;
-#define WD_DEFAULT_TIMEOUT 6
-static uint32_t watchdog_timeout = WD_DEFAULT_TIMEOUT;
-static uint32_t tickle_timeout = (WD_DEFAULT_TIMEOUT / 2);
+#define WD_DEFAULT_TIMEOUT_SEC 6
+#define WD_DEFAULT_TIMEOUT_MS (WD_DEFAULT_TIMEOUT_SEC * CS_TIME_MS_IN_SEC)
+#define WD_MIN_TIMEOUT_MS 500
+#define WD_MAX_TIMEOUT_MS (120 * CS_TIME_MS_IN_SEC)
+static uint32_t watchdog_timeout = WD_DEFAULT_TIMEOUT_SEC;
+static uint64_t tickle_timeout = (WD_DEFAULT_TIMEOUT_MS / 2);
static int dog = -1;
static corosync_timer_handle_t wd_timer;
static hdb_handle_t resources_obj;
static int watchdog_ok = 1;
struct corosync_service_engine wd_service_engine = {
- .name = "corosync self-fencing service",
+ .name = "corosync watchdog service",
.id = WD_SERVICE,
.priority = 1,
.private_data_size = 0,
- .flow_control = CS_LIB_FLOW_CONTROL_REQUIRED,
+ .flow_control = CS_LIB_FLOW_CONTROL_NOT_REQUIRED,
.lib_init_fn = NULL,
.lib_exit_fn = NULL,
.lib_engine = NULL,
static void wd_resource_failed (struct cs_fsm* fsm, int32_t event, void * data);
enum wd_resource_state {
- WD_S_GOOD,
+ WD_S_RUNNING,
WD_S_FAILED,
- WD_S_DISABLED
+ WD_S_STOPPED
};
enum wd_resource_event {
WD_E_CONFIG_CHANGED
};
-const char * wd_ok_str = "ok";
+const char * wd_running_str = "running";
const char * wd_failed_str = "failed";
const char * wd_failure_str = "failure";
-const char * wd_disabled_str = "disabled";
+const char * wd_stopped_str = "stopped";
const char * wd_config_changed_str = "config_changed";
struct cs_fsm_entry wd_fsm_table[] = {
- { WD_S_DISABLED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_DISABLED, WD_S_GOOD, -1} },
- { WD_S_DISABLED, WD_E_FAILURE, NULL, {-1} },
- { WD_S_GOOD, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_GOOD, WD_S_DISABLED, -1} },
- { WD_S_GOOD, WD_E_FAILURE, wd_resource_failed, {WD_S_FAILED, -1} },
- { WD_S_FAILED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_GOOD, WD_S_DISABLED, -1} },
- { WD_S_FAILED, WD_E_FAILURE, NULL, {-1} },
+ { WD_S_STOPPED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_STOPPED, WD_S_RUNNING, -1} },
+ { WD_S_STOPPED, WD_E_FAILURE, NULL, {-1} },
+ { WD_S_RUNNING, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_RUNNING, WD_S_STOPPED, -1} },
+ { WD_S_RUNNING, WD_E_FAILURE, wd_resource_failed, {WD_S_FAILED, -1} },
+ { WD_S_FAILED, WD_E_CONFIG_CHANGED, wd_config_changed, {WD_S_RUNNING, WD_S_STOPPED, -1} },
+ { WD_S_FAILED, WD_E_FAILURE, NULL, {-1} },
};
/*
return ret;
}
+/*
+ * Parse an unsigned integer from str and range-check it.
+ *
+ * The base is auto-detected (decimal, 0x hex, leading-0 octal).  Leading
+ * white space is skipped; characters following the number are ignored.
+ * A leading '-' is rejected, because strtoull() would otherwise negate
+ * the value silently and could wrap it back into the accepted range.
+ *
+ * str       - NUL terminated string to parse
+ * out_value - receives the parsed value; written only on success
+ * min, max  - inclusive bounds for an acceptable value
+ *
+ * Returns CS_OK on success, CS_ERR_INVALID_PARAM when no number could be
+ * parsed, the number overflows, or it lies outside [min, max].
+ */
+static cs_error_t str_to_uint64_t(const char* str, uint64_t *out_value, uint64_t min, uint64_t max)
+{
+	const char *digits;
+	char *endptr;
+	unsigned long long parsed;
+
+	digits = str + strspn(str, " \t\n\v\f\r");
+	if (*digits == '-') {
+		return CS_ERR_INVALID_PARAM;
+	}
+
+	errno = 0;
+	parsed = strtoull(digits, &endptr, 0);
+
+	/* errno reports overflow; endptr == digits means nothing was consumed */
+	if (errno != 0 || endptr == digits) {
+		return CS_ERR_INVALID_PARAM;
+	}
+
+	if (parsed > max || parsed < min) {
+		return CS_ERR_INVALID_PARAM;
+	}
+
+	*out_value = parsed;
+	return CS_OK;
+}
+
static const char * wd_res_state_to_str(struct cs_fsm* fsm,
int32_t state)
{
switch (state) {
- case WD_S_DISABLED:
- return wd_disabled_str;
+ case WD_S_STOPPED:
+ return wd_stopped_str;
break;
- case WD_S_GOOD:
- return wd_ok_str;
+ case WD_S_RUNNING:
+ return wd_running_str;
break;
case WD_S_FAILED:
return wd_failed_str;
}
/*
- * returns (0 == OK, 1 == failed)
+ * returns (CS_TRUE == OK, CS_FALSE == failed)
*/
-static int32_t wd_resource_has_failed (struct resource *ref)
+static int32_t wd_resource_state_is_ok (struct resource *ref)
{
hdb_handle_t resource = ref->handle;
int res;
char* state;
size_t state_len;
objdb_value_types_t type;
- time_t *last_updated;
- time_t my_time;
+ uint64_t *last_updated;
+ uint64_t my_time;
+ uint64_t allowed_period;
size_t last_updated_len;
res = api->object_key_get_typed (resource,
if (res != 0) {
/* key does not exist.
*/
- return 1;
+ return CS_FALSE;
}
res = api->object_key_get_typed (resource,
"state", (void**)&state, &state_len, &type);
if (res != 0 || strncmp (state, "disabled", strlen ("disabled")) == 0) {
/* key does not exist.
*/
- return 1;
+ return CS_FALSE;
+ }
+ if (*last_updated == 0) {
+ /* initial value */
+ return CS_TRUE;
}
- my_time = time (NULL);
+ my_time = cs_timestamp_get();
- if ((*last_updated + ref->check_timeout) < my_time) {
- log_printf (LOGSYS_LEVEL_INFO, "delayed %ld + %d < %ld",
- *last_updated, ref->check_timeout, my_time);
- return 1;
+ /*
+ * Here we check that the monitor has written a timestamp within the poll_period
+ * plus a grace factor of (0.5 * poll_period).
+ */
+ allowed_period = (ref->check_timeout * MILLI_2_NANO_SECONDS * 3) / 2;
+ if ((*last_updated + allowed_period) < my_time) {
+ log_printf (LOGSYS_LEVEL_ERROR,
+ "last_updated %"PRIu64" ms too late, period:%"PRIu64".",
+ (uint64_t)(my_time/MILLI_2_NANO_SECONDS - ((*last_updated + allowed_period) / MILLI_2_NANO_SECONDS)),
+ ref->check_timeout);
+ return CS_FALSE;
}
- if ((*last_updated + ref->check_timeout) < my_time ||
- strcmp (state, "bad") == 0) {
- return 1;
+ if (strcmp (state, wd_failed_str) == 0) {
+ return CS_FALSE;
}
- return 0;
+ return CS_TRUE;
}
static void wd_config_changed (struct cs_fsm* fsm, int32_t event, void * data)
size_t len;
char *state;
objdb_value_types_t type;
- char mon_period_str[32];
- int32_t tmp_value;
+ char *str;
+ uint64_t tmp_value;
+ uint64_t next_timeout;
struct resource *ref = (struct resource*)data;
+ next_timeout = ref->check_timeout;
+
res = api->object_key_get_typed (ref->handle,
"poll_period",
- (void**)&mon_period_str, &len,
+ (void**)&str, &len,
&type);
if (res == 0) {
- tmp_value = strtol (mon_period_str, NULL, 0);
- if (tmp_value > 0 && tmp_value < 120)
- ref->check_timeout = (tmp_value * 5)/4;
+ if (str_to_uint64_t(str, &tmp_value, WD_MIN_TIMEOUT_MS, WD_MAX_TIMEOUT_MS) == CS_OK) {
+ log_printf (LOGSYS_LEVEL_DEBUG,
+ "poll_period changing from:%"PRIu64" to %"PRIu64".",
+ ref->check_timeout, tmp_value);
+ /*
+ * To ease the transition between poll_periods, make the
+ * first timeout the larger of the new and old values.
+ * This gives the monitoring system time to adjust.
+ */
+ next_timeout = CS_MAX(tmp_value, ref->check_timeout);
+ ref->check_timeout = tmp_value;
+ } else {
+ log_printf (LOGSYS_LEVEL_WARNING,
+ "Could NOT use poll_period:%s ms for resource %s",
+ str, ref->name);
+ }
}
res = api->object_key_get_typed (ref->handle,
*/
log_printf (LOGSYS_LEVEL_WARNING,
"resource %s missing a recovery key.", ref->name);
- cs_fsm_state_set(&ref->fsm, WD_S_DISABLED, ref);
+ cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref);
return;
}
res = api->object_key_get_typed (ref->handle,
*/
log_printf (LOGSYS_LEVEL_WARNING,
"resource %s missing a state key.", ref->name);
- cs_fsm_state_set(&ref->fsm, WD_S_DISABLED, ref);
+ cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref);
return;
}
-
- cs_fsm_state_set(&ref->fsm, WD_S_GOOD, ref);
-
if (ref->check_timer) {
api->timer_delete(ref->check_timer);
+ ref->check_timer = NULL;
}
- api->timer_add_duration((unsigned long long)ref->check_timeout*1000000000,
- ref,
- wd_resource_check_fn, &ref->check_timer);
+ if (strcmp(wd_stopped_str, state) == 0) {
+ cs_fsm_state_set(&ref->fsm, WD_S_STOPPED, ref);
+ } else {
+ api->timer_add_duration(next_timeout * MILLI_2_NANO_SECONDS,
+ ref, wd_resource_check_fn, &ref->check_timer);
+ cs_fsm_state_set(&ref->fsm, WD_S_RUNNING, ref);
+ }
}
static void wd_resource_failed (struct cs_fsm* fsm, int32_t event, void * data)
if (ref->check_timer) {
api->timer_delete(ref->check_timer);
+ ref->check_timer = NULL;
}
log_printf (LOGSYS_LEVEL_CRIT, "%s resource \"%s\" failed!",
watchdog_ok = 0;
}
else if (strcmp (ref->recovery, "reboot") == 0) {
- //reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, LINUX_REBOOT_CMD_RESTART, NULL);
+ reboot(RB_AUTOBOOT);
}
else if (strcmp (ref->recovery, "shutdown") == 0) {
- //reboot(LINUX_REBOOT_MAGIC1, LINUX_REBOOT_MAGIC2, LINUX_REBOOT_CMD_POWER_OFF, NULL);
+ reboot(RB_POWER_OFF);
}
cs_fsm_state_set(fsm, WD_S_FAILED, data);
}
{
struct resource* ref = (struct resource*)priv_data_pt;
- if (strcmp(key_name_pt, "last_updated") == 0 ||
- strcmp(key_name_pt, "current") == 0) {
+ if (strncmp(key_name_pt, "last_updated", key_len) == 0 ||
+ strncmp(key_name_pt, "current", key_len) == 0) {
return;
}
-// log_printf (LOGSYS_LEVEL_WARNING,
-// "watchdog resource key changed: %s.%s=%s ref=%p.",
-// (char*)object_name_pt, (char*)key_name_pt, (char*)key_value_pt, ref);
if (ref == NULL) {
return;
{
struct resource* ref = (struct resource*)priv_data_pt;
- log_printf (LOGSYS_LEVEL_WARNING,
- "watchdog resource \"%s\" deleted from objdb!",
- (char*)name_pt);
-
if (ref) {
+ log_printf (LOGSYS_LEVEL_WARNING,
+ "resource \"%s\" deleted from objdb!",
+ ref->name);
+
api->timer_delete(ref->check_timer);
ref->check_timer = NULL;
+ free(ref);
}
}
{
struct resource* ref = (struct resource*)resource_ref;
- log_printf (LOGSYS_LEVEL_INFO,
- "checking watchdog resource \"%s\".",
- ref->name);
- if (wd_resource_has_failed (ref) ) {
+ if (wd_resource_state_is_ok (ref) == CS_FALSE) {
cs_fsm_process(&ref->fsm, WD_E_FAILURE, ref);
- log_printf (LOGSYS_LEVEL_CRIT,
- "watchdog resource \"%s\" failed!",
- (char*)ref->name);
return;
}
- api->timer_add_duration((unsigned long long)ref->check_timeout*1000000000,
+ api->timer_add_duration(ref->check_timeout*MILLI_2_NANO_SECONDS,
ref, wd_resource_check_fn, &ref->check_timer);
}
-
-static void wd_resource_create (hdb_handle_t resource_obj)
+/*
+ * return 0 - fully configured
+ * return -1 - partially configured
+ */
+static int32_t wd_resource_create (hdb_handle_t resource_obj)
{
int res;
size_t len;
char *state;
objdb_value_types_t type;
- char mon_period_str[32];
- int32_t tmp_value;
+ char period_str[32];
+ char *str;
+ uint64_t tmp_value;
struct resource *ref = malloc (sizeof (struct resource));
ref->handle = resource_obj;
- ref->check_timeout = WD_DEFAULT_TIMEOUT;
+ ref->check_timeout = WD_DEFAULT_TIMEOUT_MS;
ref->check_timer = NULL;
api->object_name_get (resource_obj,
ref->name,
ref->fsm.table = wd_fsm_table;
ref->fsm.entries = sizeof(wd_fsm_table) / sizeof(struct cs_fsm_entry);
ref->fsm.curr_entry = 0;
- ref->fsm.curr_state = WD_S_DISABLED;
+ ref->fsm.curr_state = WD_S_STOPPED;
ref->fsm.state_to_str = wd_res_state_to_str;
ref->fsm.event_to_str = wd_res_event_to_str;
api->object_priv_set (resource_obj, NULL);
res = api->object_key_get_typed (resource_obj,
"poll_period",
- (void**)&mon_period_str, &len,
+ (void**)&str, &len,
&type);
if (res != 0) {
- log_printf (LOGSYS_LEVEL_ERROR, "%s : %d",__func__, res);
- len = snprintf (mon_period_str, 32, "%d", ref->check_timeout);
+ len = snprintf (period_str, 32, "%"PRIu64"", ref->check_timeout);
api->object_key_create_typed (resource_obj,
- "poll_period", &mon_period_str,
+ "poll_period", &period_str,
len,
OBJDB_VALUETYPE_STRING);
}
else {
- tmp_value = strtol (mon_period_str, NULL, 0);
- if (tmp_value > 0 && tmp_value < 120)
- ref->check_timeout = (tmp_value * 5)/4;
+ if (str_to_uint64_t(str, &tmp_value, WD_MIN_TIMEOUT_MS, WD_MAX_TIMEOUT_MS) == CS_OK) {
+ ref->check_timeout = tmp_value;
+ } else {
+ log_printf (LOGSYS_LEVEL_WARNING,
+ "Could NOT use poll_period:%s ms for resource %s",
+ str, ref->name);
+ }
}
- api->object_track_start (resource_obj, OBJECT_TRACK_DEPTH_ONE,
+ api->object_track_start (resource_obj, OBJECT_TRACK_DEPTH_RECURSIVE,
wd_key_changed, NULL, wd_object_destroyed,
NULL, ref);
*/
log_printf (LOGSYS_LEVEL_WARNING,
"resource %s missing a recovery key.", ref->name);
- return;
+ return -1;
}
res = api->object_key_get_typed (resource_obj,
"state", (void*)&state, &len, &type);
*/
log_printf (LOGSYS_LEVEL_WARNING,
"resource %s missing a state key.", ref->name);
- return;
+ return -1;
}
res = api->object_key_get_typed (resource_obj,
ref->last_updated = 0;
}
- api->timer_add_duration((unsigned long long)ref->check_timeout*1000000000,
+ /*
+ * delay the first check to give the monitor time to start working.
+ */
+ tmp_value = CS_MAX(ref->check_timeout * 2, WD_DEFAULT_TIMEOUT_MS);
+ api->timer_add_duration(tmp_value * MILLI_2_NANO_SECONDS,
ref,
wd_resource_check_fn, &ref->check_timer);
- cs_fsm_state_set(&ref->fsm, WD_S_GOOD, ref);
+ cs_fsm_state_set(&ref->fsm, WD_S_RUNNING, ref);
+ return 0;
}
ENTER();
if (watchdog_ok) {
- if (dog > 0)
+ if (dog > 0) {
ioctl(dog, WDIOC_KEEPALIVE, &watchdog_ok);
+ }
+ api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL,
+ wd_tickle_fn, &wd_timer);
}
else {
log_printf (LOGSYS_LEVEL_ALERT, "NOT tickling the watchdog!");
}
- api->timer_add_duration((unsigned long long)tickle_timeout*1000000000, NULL,
- wd_tickle_fn, &wd_timer);
}
static void wd_resource_object_created(hdb_handle_t parent_object_handle,
hdb_handle_t obj_finder2;
hdb_handle_t resource_type;
hdb_handle_t resource;
- int res;
+ int res_count = 0;
ENTER();
"resources", strlen ("resources"),
&obj_finder);
- res = api->object_find_next (obj_finder, &resources_obj);
+ api->object_find_next (obj_finder, &resources_obj);
api->object_find_destroy (obj_finder);
- if (res != 0) {
- log_printf (LOGSYS_LEVEL_INFO, "no resources.");
- return;
- }
/* this will be the system or process level
*/
while (api->object_find_next (obj_finder2,
&resource) == 0) {
- wd_resource_create (resource);
+ if (wd_resource_create (resource) == 0) {
+ res_count++;
+ }
}
api->object_find_destroy (obj_finder2);
NULL, NULL);
}
api->object_find_destroy (obj_finder);
+ if (res_count == 0) {
+ log_printf (LOGSYS_LEVEL_INFO, "no resources configured.");
+ }
}
static void watchdog_timeout_apply (uint32_t new)
{
struct watchdog_info ident;
+ uint32_t original_timeout = watchdog_timeout;
- if (new < 2) {
- watchdog_timeout = 2;
- }
- else if (new > 120) {
- watchdog_timeout = 120;
- }
- else {
- watchdog_timeout = new;
+ if (new == original_timeout) {
+ return;
}
+ watchdog_timeout = new;
+
if (dog > 0) {
ioctl(dog, WDIOC_GETSUPPORT, &ident);
if (ident.options & WDIOF_SETTIMEOUT) {
}
ioctl(dog, WDIOC_GETTIMEOUT, &watchdog_timeout);
}
- tickle_timeout = watchdog_timeout / 2;
- log_printf (LOGSYS_LEVEL_DEBUG, "The Watchdog timeout is %d seconds\n", watchdog_timeout);
- log_printf (LOGSYS_LEVEL_DEBUG, "The tickle timeout is %d seconds\n", tickle_timeout);
+ if (watchdog_timeout == new) {
+ tickle_timeout = (watchdog_timeout * CS_TIME_MS_IN_SEC)/ 2;
+
+ /* reset the tickle timer in case it was reduced.
+ */
+ api->timer_delete (wd_timer);
+ api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL,
+ wd_tickle_fn, &wd_timer);
+
+ log_printf (LOGSYS_LEVEL_DEBUG, "The Watchdog timeout is %d seconds\n", watchdog_timeout);
+ log_printf (LOGSYS_LEVEL_DEBUG, "The tickle timeout is %"PRIu64" ms\n", tickle_timeout);
+ } else {
+ log_printf (LOGSYS_LEVEL_WARNING,
+ "Could not change the Watchdog timeout from %d to %d seconds\n",
+ original_timeout, new);
+ }
+
}
static int setup_watchdog(void)
const void *key_value_pt, size_t key_value_len,
void *priv_data_pt)
{
- uint32_t tmp_value;
+ uint64_t tmp_value;
+ int32_t tmp_value_32;
ENTER();
if (change_type != OBJECT_KEY_DELETED &&
strncmp ((char*)key_name_pt, "watchdog_timeout", key_value_len) == 0) {
- tmp_value = strtol (key_value_pt, NULL, 0);
- watchdog_timeout_apply (tmp_value);
+ if (str_to_uint64_t(key_value_pt, &tmp_value, 2, 120) == CS_OK) {
+ tmp_value_32 = tmp_value;
+ watchdog_timeout_apply (tmp_value_32);
+ }
}
else {
- watchdog_timeout_apply (WD_DEFAULT_TIMEOUT);
+ watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC);
}
- log_printf (LOGSYS_LEVEL_INFO, "new(%d) tickle_timeout: %d", change_type, tickle_timeout);
}
-
static void watchdog_timeout_get_initial (void)
{
int32_t res;
char watchdog_timeout_str[32];
size_t watchdog_timeout_len;
objdb_value_types_t watchdog_timeout_type;
- uint32_t tmp_value;
+ uint32_t tmp_value_32;
+ uint64_t tmp_value;
ENTER();
(void**)&watchdog_timeout_str, &watchdog_timeout_len,
&watchdog_timeout_type);
if (res != 0) {
- watchdog_timeout_apply (WD_DEFAULT_TIMEOUT);
+ watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC);
watchdog_timeout_len = snprintf (watchdog_timeout_str, 32, "%d", watchdog_timeout);
api->object_key_create_typed (resources_obj,
OBJDB_VALUETYPE_STRING);
}
else {
- tmp_value = strtol (watchdog_timeout_str, NULL, 0);
- watchdog_timeout_apply (tmp_value);
+ if (str_to_uint64_t(watchdog_timeout_str, &tmp_value, 2, 120) == CS_OK) {
+ tmp_value_32 = tmp_value;
+ watchdog_timeout_apply (tmp_value_32);
+ } else {
+ watchdog_timeout_apply (WD_DEFAULT_TIMEOUT_SEC);
+ }
}
api->object_track_start (resources_obj, OBJECT_TRACK_DEPTH_ONE,
wd_scan_resources();
- api->timer_add_duration((unsigned long long)tickle_timeout*1000000000, NULL,
+ api->timer_add_duration(tickle_timeout*MILLI_2_NANO_SECONDS, NULL,
wd_tickle_fn, &wd_timer);
return 0;
hdb_handle_t res_handle, proc_handle, pid_handle;
size_t value_len;
uint64_t tstamp1, tstamp2;
+ int32_t msec_diff;
char key_value[256];
unsigned int instance_id;
char tmp_obj[PATH_MAX];
return (2);
}
- if (value_len != strlen ("registered") || memcmp (key_value, "registered", value_len) != 0) {
- printf ("State key is not \"registered\".\n");
+ if (value_len != strlen ("stopped") || memcmp (key_value, "stopped", value_len) != 0) {
+ printf ("State key is not \"stopped\".\n");
return (2);
}
return (2);
}
- if (value_len != strlen ("started") || memcmp (key_value, "started", value_len) != 0) {
- printf ("State key is not \"started\".\n");
+ if (value_len != strlen ("running") || memcmp (key_value, "running", value_len) != 0) {
+ printf ("State key is not \"running\".\n");
return (2);
}
return (2);
}
- if (value_len != strlen ("registered") || memcmp (key_value, "registered", value_len) != 0) {
- printf ("State key is not \"registered\".\n");
+ if (value_len != strlen ("stopped") || memcmp (key_value, "stopped", value_len) != 0) {
+ printf ("State key is not \"stopped\".\n");
return (2);
}
return (2);
}
- if (value_len != strlen ("registered") || memcmp (key_value, "registered", value_len) != 0) {
- printf ("State key is not \"registered\".\n");
+ if (value_len != strlen ("stopped") || memcmp (key_value, "stopped", value_len) != 0) {
+ printf ("State key is not \"stopped\".\n");
return (2);
}
return (2);
}
- if (value_len != strlen ("started") || memcmp (key_value, "started", value_len) != 0) {
- printf ("State key is not \"started\".\n");
+ if (value_len != strlen ("running") || memcmp (key_value, "running", value_len) != 0) {
+ printf ("State key is not \"running\".\n");
return (2);
}
fprintf (stderr, "Can't send hc. Error %d\n", err);
return 2;
}
- err = confdb_key_get_typed (cdb_handle, pid_handle, "hc_last", &tstamp1, &value_len, &cdbtype);
+ err = confdb_key_get_typed (cdb_handle, pid_handle, "last_updated", &tstamp1, &value_len, &cdbtype);
if (err != CS_OK) {
printf ("Could not get \"state\" key: %d.\n", err);
return (2);
return 2;
}
sleep (1);
- err = confdb_key_get_typed (cdb_handle, pid_handle, "hc_last", &tstamp2, &value_len, &cdbtype);
+ err = confdb_key_get_typed (cdb_handle, pid_handle, "last_updated", &tstamp2, &value_len, &cdbtype);
if (err != CS_OK) {
printf ("Could not get \"state\" key: %d.\n", err);
return (2);
}
- if (tstamp2 - tstamp1 < 500 || tstamp2 - tstamp1 > 2000) {
- printf ("Difference %d is not within <500, 2000> interval.\n", (int)(tstamp2 - tstamp1));
+ msec_diff = (tstamp2 - tstamp1)/CS_TIME_NS_IN_MSEC;
+
+ if (msec_diff < 500 || msec_diff > 2000) {
+ printf ("Difference %d is not within <500, 2000> interval.\n", msec_diff);
return (2);
}
return (2);
}
- if (value_len != strlen ("registered") || memcmp (key_value, "registered", value_len) != 0) {
- printf ("State key is not \"registered\".\n");
+ if (value_len != strlen ("stopped") || memcmp (key_value, "stopped", value_len) != 0) {
+ printf ("State key is not \"stopped\".\n");
return (2);
}
return (2);
}
- if (value_len != strlen ("registered") || memcmp (key_value, "registered", value_len) != 0) {
- printf ("State key is not \"registered\".\n");
+ if (value_len != strlen ("stopped") || memcmp (key_value, "stopped", value_len) != 0) {
+ printf ("State key is not \"stopped\".\n");
return (2);
}
return (2);
}
- if (value_len != strlen ("started") || memcmp (key_value, "started", value_len) != 0) {
- printf ("State key is not \"started\".\n");
+ if (value_len != strlen ("running") || memcmp (key_value, "running", value_len) != 0) {
+ printf ("State key is not \"running\".\n");
return (2);
}