git.proxmox.com Git - mirror_corosync.git/commitdiff
Add token_warning configuration option
author: Chris Walker <cwalker@cray.com>
Fri, 27 Jul 2018 05:06:32 +0000 (01:06 -0400)
committer: Jan Friesse <jfriesse@redhat.com>
Tue, 14 Aug 2018 08:34:49 +0000 (10:34 +0200)
The token_warning option makes corosync periodically log how long ago the
token was last received. It is a percentage of the token timeout (0 disables it).

Signed-off-by: Chris Walker <cwalker@cray.com>
Reviewed-by: Jan Friesse <jfriesse@redhat.com>
exec/coroparse.c
exec/main.c
exec/stats.c
exec/totemconfig.c
exec/totemsrp.c
include/corosync/totem/totem.h
include/corosync/totem/totemstats.h
man/corosync.conf.5

index 54b76defbbc123d7c6f634761dd97b87974c1c22..d20af01071481b3a8178261b78bf4cf02c869bc4 100644 (file)
@@ -597,6 +597,7 @@ static int main_config_parser_cb(const char *path,
                            (strcmp(path, "totem.token") == 0) ||
                            (strcmp(path, "totem.token_coefficient") == 0) ||
                            (strcmp(path, "totem.token_retransmit") == 0) ||
+                           (strcmp(path, "totem.token_warning") == 0) ||
                            (strcmp(path, "totem.hold") == 0) ||
                            (strcmp(path, "totem.token_retransmits_before_loss_const") == 0) ||
                            (strcmp(path, "totem.join") == 0) ||
index 970f05b1a7d007f14c9fd28b86ca0c2ffcf0d963..efa91814c2b2805d92836d0eb5545f33a7b7f368 100644 (file)
@@ -546,6 +546,9 @@ static void corosync_totem_stats_updater (void *data)
                stats->srp->avg_backlog_calc = (total_backlog_calc / token_count);
        }
 
+       stats->srp->time_since_token_last_received = qb_util_nano_current_get () / QB_TIME_NS_IN_MSEC -
+               stats->srp->token[stats->srp->latest_token].rx;
+
        stats_trigger_trackers();
 
        api->timer_add_duration (1500 * MILLI_2_NANO_SECONDS, NULL,
index c66cfef359b399456ebe059b556fa5a996ff1ad4..d23a5edb877dc06e50b460cea4011bd37b15b7f3 100644 (file)
@@ -96,6 +96,7 @@ struct cs_stats_conv cs_srp_stats[] = {
        { STAT_SRP, "recovery_token_lost",    offsetof(totemsrp_stats_t, recovery_token_lost),    ICMAP_VALUETYPE_UINT64},
        { STAT_SRP, "consensus_timeouts",     offsetof(totemsrp_stats_t, consensus_timeouts),     ICMAP_VALUETYPE_UINT64},
        { STAT_SRP, "rx_msg_dropped",         offsetof(totemsrp_stats_t, rx_msg_dropped),         ICMAP_VALUETYPE_UINT64},
+       { STAT_SRP, "time_since_token_last_received", offsetof(totemsrp_stats_t, time_since_token_last_received), ICMAP_VALUETYPE_UINT64},
        { STAT_SRP, "continuous_gather",      offsetof(totemsrp_stats_t, continuous_gather),      ICMAP_VALUETYPE_UINT32},
        { STAT_SRP, "continuous_sendmsg_failures", offsetof(totemsrp_stats_t, continuous_sendmsg_failures), ICMAP_VALUETYPE_UINT32},
        { STAT_SRP, "firewall_enabled_or_nic_failure", offsetof(totemsrp_stats_t, firewall_enabled_or_nic_failure), ICMAP_VALUETYPE_UINT8},
index 630ffbed4a5a93481a52bd11586791835fbc96e8..7e6985ee6c112e78f64c7810b9350bd21d2117c9 100644 (file)
@@ -66,6 +66,7 @@
 
 #define TOKEN_RETRANSMITS_BEFORE_LOSS_CONST    4
 #define TOKEN_TIMEOUT                          1000
+#define TOKEN_WARNING                          75 /* default totem.token_warning, as a percentage of the token timeout */
 #define TOKEN_COEFFICIENT                      650
 #define JOIN_TIMEOUT                           50
 #define MERGE_TIMEOUT                          200
@@ -96,6 +97,8 @@ static void *totem_get_param_by_name(struct totem_config *totem_config, const ch
 {
        if (strcmp(param_name, "totem.token") == 0)
                return &totem_config->token_timeout;
+       if (strcmp(param_name, "totem.token_warning") == 0)
+               return &totem_config->token_warning;
        if (strcmp(param_name, "totem.token_retransmit") == 0)
                return &totem_config->token_retransmit_timeout;
        if (strcmp(param_name, "totem.hold") == 0)
@@ -246,6 +249,8 @@ static void totem_volatile_config_read (struct totem_config *totem_config, const
 
        totem_volatile_config_set_uint32_value(totem_config, "totem.token", deleted_key, TOKEN_TIMEOUT, 0);
 
+       totem_volatile_config_set_uint32_value(totem_config, "totem.token_warning", deleted_key, TOKEN_WARNING, 1);
+
        if (totem_config->interfaces[0].member_count > 2) {
                u32 = TOKEN_COEFFICIENT;
                icmap_get_uint32("totem.token_coefficient", &u32);
@@ -323,6 +328,13 @@ static int totem_volatile_config_validate (
                goto parse_error;
        }
 
+       if (totem_config->token_warning > 100 || totem_config->token_warning < 0) {
+               snprintf (local_error_reason, sizeof(local_error_reason),
+                       "The token warning parameter (%d%%) must be between 0 (disabled) and 100.",
+                       totem_config->token_warning);
+               goto parse_error;
+       }
+
        if (totem_config->token_retransmit_timeout < MINIMUM_TIMEOUT) {
                snprintf (local_error_reason, sizeof(local_error_reason),
                        "The token retransmit timeout parameter (%d ms) may not be less than (%d ms).",
@@ -1986,6 +1998,18 @@ static void debug_dump_totem_config(const struct totem_config *totem_config)
 
        log_printf(LOGSYS_LEVEL_DEBUG, "Token Timeout (%d ms) retransmit timeout (%d ms)",
            totem_config->token_timeout, totem_config->token_retransmit_timeout);
+       if (totem_config->token_warning) {
+               uint32_t token_warning_ms = totem_config->token_warning * totem_config->token_timeout / 100;
+               log_printf(LOGSYS_LEVEL_DEBUG, "Token warning every %d ms (%d%% of Token Timeout)",
+                   token_warning_ms, totem_config->token_warning);
+               if (token_warning_ms < totem_config->token_retransmit_timeout)
+                       log_printf (LOGSYS_LEVEL_DEBUG,
+                               "The token warning interval (%d ms) is less than the token retransmit timeout (%d ms) "
+                               "which can lead to spurious token warnings. Consider increasing the token_warning parameter.",
+                               token_warning_ms, totem_config->token_retransmit_timeout);
+
+       } else
+               log_printf(LOGSYS_LEVEL_DEBUG, "Token warnings disabled");
        log_printf(LOGSYS_LEVEL_DEBUG, "token hold (%d ms) retransmits before loss (%d retrans)",
            totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const);
        log_printf(LOGSYS_LEVEL_DEBUG, "join (%d ms) send_join (%d ms) consensus (%d ms) merge (%d ms)",
index 7d2d426d6bd2bf08ee016fc5eb643167f3146d94..aab6c809d0a301cbf6b0e9ec40d68ac46d126b26 100644 (file)
@@ -401,6 +401,8 @@ struct totemsrp_instance {
 
        qb_loop_timer_handle timer_orf_token_timeout;
 
+       qb_loop_timer_handle timer_orf_token_warning;
+
        qb_loop_timer_handle timer_orf_token_retransmit_timeout;
 
        qb_loop_timer_handle timer_orf_token_hold_retransmit_timeout;
@@ -653,6 +655,7 @@ static void memb_merge_detect_endian_convert (
        struct memb_merge_detect *out);
 static void srp_addr_copy_endian_convert (struct srp_addr *out, const struct srp_addr *in);
 static void timer_function_orf_token_timeout (void *data);
+static void timer_function_orf_token_warning (void *data);
 static void timer_function_pause_timeout (void *data);
 static void timer_function_heartbeat_timeout (void *data);
 static void timer_function_token_retransmit_timeout (void *data);
@@ -883,6 +886,20 @@ int totemsrp_initialize (
        log_printf (instance->totemsrp_log_level_debug,
                "Token Timeout (%d ms) retransmit timeout (%d ms)",
                totem_config->token_timeout, totem_config->token_retransmit_timeout);
+       if (totem_config->token_warning) {
+               uint32_t token_warning_ms = totem_config->token_warning * totem_config->token_timeout / 100;
+               log_printf(instance->totemsrp_log_level_debug,
+                       "Token warning every %d ms (%d%% of Token Timeout)",
+                       token_warning_ms, totem_config->token_warning);
+               if (token_warning_ms < totem_config->token_retransmit_timeout)
+                       log_printf (LOGSYS_LEVEL_DEBUG,
+                               "The token warning interval (%d ms) is less than the token retransmit timeout (%d ms) "
+                               "which can lead to spurious token warnings. Consider increasing the token_warning parameter.",
+                               token_warning_ms, totem_config->token_retransmit_timeout);
+       } else {
+               log_printf(instance->totemsrp_log_level_debug,
+                       "Token warnings disabled");
+       }
        log_printf (instance->totemsrp_log_level_debug,
                "token hold (%d ms) retransmits before loss (%d retrans)",
                totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const);
@@ -1566,6 +1583,21 @@ static void reset_pause_timeout (struct totemsrp_instance *instance)
        }
 }
 
+/* Re-arm the token warning timer to expire after token_warning percent of token_timeout. */
+static void reset_token_warning (struct totemsrp_instance *instance) {
+       /* Widen to 64 bits before multiplying so percent * timeout (ms) cannot wrap. */
+       uint64_t warning_ms = (uint64_t)instance->totem_config->token_warning * instance->totem_config->token_timeout / 100;
+       int32_t res;
+
+       qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_warning);
+       res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED,
+               warning_ms * QB_TIME_NS_IN_MSEC, (void *)instance,
+               timer_function_orf_token_warning, &instance->timer_orf_token_warning);
+       if (res != 0) {
+               log_printf(instance->totemsrp_log_level_error, "reset_token_warning - qb_loop_timer_add error : %d", res);
+       }
+}
+
 static void reset_token_timeout (struct totemsrp_instance *instance) {
        int32_t res;
 
@@ -1579,6 +1611,9 @@ static void reset_token_timeout (struct totemsrp_instance *instance) {
        if (res != 0) {
                log_printf(instance->totemsrp_log_level_error, "reset_token_timeout - qb_loop_timer_add error : %d", res);
        }
+
+       if (instance->totem_config->token_warning)
+               reset_token_warning(instance);
 }
 
 static void reset_heartbeat_timeout (struct totemsrp_instance *instance) {
@@ -1597,8 +1632,15 @@ static void reset_heartbeat_timeout (struct totemsrp_instance *instance) {
 }
 
 
+static void cancel_token_warning (struct totemsrp_instance *instance) { /* stop the token warning timer */
+       qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_warning); /* same handle reset_token_warning() arms */
+}
+
 static void cancel_token_timeout (struct totemsrp_instance *instance) {
        qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout);
+
+        if (instance->totem_config->token_warning)
+                cancel_token_warning(instance);
 }
 
 static void cancel_heartbeat_timeout (struct totemsrp_instance *instance) {
@@ -1680,6 +1722,23 @@ static void memb_recovery_state_token_loss (struct totemsrp_instance *instance)
        instance->stats.recovery_token_lost++;
 }
 
+/* Timer callback: log the time since the token was last received, then re-arm the warning. */
+static void timer_function_orf_token_warning (void *data)
+{
+       struct totemsrp_instance *instance = data;
+       uint64_t tv_diff;
+       /* token_warning may have been set to 0 dynamically after this timer was armed */
+       if (instance->totem_config->token_warning == 0) {
+               cancel_token_warning(instance);
+               return;
+       }
+       tv_diff = qb_util_nano_current_get () / QB_TIME_NS_IN_MSEC -
+               instance->stats.token[instance->stats.latest_token].rx;
+       log_printf (instance->totemsrp_log_level_notice, /* %u matches the unsigned int argument */
+               "Token has not been received in %u ms ", (unsigned int) tv_diff);
+       reset_token_warning(instance);
+}
+
 static void timer_function_orf_token_timeout (void *data)
 {
        struct totemsrp_instance *instance = data;
index d278e478a4fab49084f178c8f418c8927c9a9d3d..a087119c118b4e36e7782a148d8c4e3246bde747 100644 (file)
@@ -173,6 +173,8 @@ struct totem_config {
         */
        unsigned int token_timeout;
 
+       unsigned int token_warning;
+
        unsigned int token_retransmit_timeout;
 
        unsigned int token_hold_timeout;
index b87cd396aef860a3bc6e7259ffbbfe8d1dabaf66..51e604cfafa9a4886825824c1436ad8585830ff6 100644 (file)
@@ -77,6 +77,7 @@ typedef struct {
        uint64_t rx_msg_dropped;
        uint32_t continuous_gather;
        uint32_t continuous_sendmsg_failures;
+       uint64_t time_since_token_last_received; // relative time
 
        uint8_t  firewall_enabled_or_nic_failure;
        uint32_t mtt_rx_token;
index 4e6d69bcce1ff9f33f620a70237e459f2917d434..e7a31cd5508f908c521977a2afac24a791d3563d 100644 (file)
@@ -320,6 +320,14 @@ key.
 
 The default is 1000 milliseconds.
 
+.TP
+token_warning
+Specifies the interval between warnings that the token has not been received.  The
+value is a percentage of the token timeout, from 0 to 100; setting it to 0 disables
+the warnings.
+
+The default is 75%.
+
 .TP
 token_coefficient
 This value is used only when