Add token_warning configuration option

author Chris Walker <cwalker@cray.com>

Fri, 27 Jul 2018 05:06:32 +0000 (01:06 -0400)

committer Jan Friesse <jfriesse@redhat.com>

Tue, 14 Aug 2018 08:34:49 +0000 (10:34 +0200)
author Chris Walker <cwalker@cray.com>
Fri, 27 Jul 2018 05:06:32 +0000 (01:06 -0400)
committer Jan Friesse <jfriesse@redhat.com>
Tue, 14 Aug 2018 08:34:49 +0000 (10:34 +0200)
diff --git a/exec/coroparse.c b/exec/coroparse.c

index 54b76defbbc123d7c6f634761dd97b87974c1c22..d20af01071481b3a8178261b78bf4cf02c869bc4 100644 (file)
--- a/exec/coroparse.c
+++ b/exec/coroparse.c
@@ -597,6 +597,7 @@ static int main_config_parser_cb(const char *path,
                             (strcmp(path, "totem.token") == 0) ||
                             (strcmp(path, "totem.token_coefficient") == 0) ||
                             (strcmp(path, "totem.token_retransmit") == 0) ||
+                           (strcmp(path, "totem.token_warning") == 0) ||
                             (strcmp(path, "totem.hold") == 0) ||
                             (strcmp(path, "totem.token_retransmits_before_loss_const") == 0) ||
                             (strcmp(path, "totem.join") == 0) ||
diff --git a/exec/main.c b/exec/main.c

index 970f05b1a7d007f14c9fd28b86ca0c2ffcf0d963..efa91814c2b2805d92836d0eb5545f33a7b7f368 100644 (file)
--- a/exec/main.c
+++ b/exec/main.c
@@ -546,6 +546,9 @@ static void corosync_totem_stats_updater (void *data)
                 stats->srp->avg_backlog_calc = (total_backlog_calc / token_count);
         }
  
+       stats->srp->time_since_token_last_received = qb_util_nano_current_get () / QB_TIME_NS_IN_MSEC -
+               stats->srp->token[stats->srp->latest_token].rx;
+
         stats_trigger_trackers();
  
         api->timer_add_duration (1500 * MILLI_2_NANO_SECONDS, NULL,
diff --git a/exec/stats.c b/exec/stats.c

index c66cfef359b399456ebe059b556fa5a996ff1ad4..d23a5edb877dc06e50b460cea4011bd37b15b7f3 100644 (file)
--- a/exec/stats.c
+++ b/exec/stats.c
@@ -96,6 +96,7 @@ struct cs_stats_conv cs_srp_stats[] = {
         { STAT_SRP, "recovery_token_lost",    offsetof(totemsrp_stats_t, recovery_token_lost),    ICMAP_VALUETYPE_UINT64},
         { STAT_SRP, "consensus_timeouts",     offsetof(totemsrp_stats_t, consensus_timeouts),     ICMAP_VALUETYPE_UINT64},
         { STAT_SRP, "rx_msg_dropped",         offsetof(totemsrp_stats_t, rx_msg_dropped),         ICMAP_VALUETYPE_UINT64},
+       { STAT_SRP, "time_since_token_last_received", offsetof(totemsrp_stats_t, time_since_token_last_received), ICMAP_VALUETYPE_UINT64},
         { STAT_SRP, "continuous_gather",      offsetof(totemsrp_stats_t, continuous_gather),      ICMAP_VALUETYPE_UINT32},
         { STAT_SRP, "continuous_sendmsg_failures", offsetof(totemsrp_stats_t, continuous_sendmsg_failures), ICMAP_VALUETYPE_UINT32},
         { STAT_SRP, "firewall_enabled_or_nic_failure", offsetof(totemsrp_stats_t, firewall_enabled_or_nic_failure), ICMAP_VALUETYPE_UINT8},
diff --git a/exec/totemconfig.c b/exec/totemconfig.c

index 630ffbed4a5a93481a52bd11586791835fbc96e8..7e6985ee6c112e78f64c7810b9350bd21d2117c9 100644 (file)
--- a/exec/totemconfig.c
+++ b/exec/totemconfig.c
@@ -66,6 +66,7 @@
  
  #define TOKEN_RETRANSMITS_BEFORE_LOSS_CONST    4
  #define TOKEN_TIMEOUT                          1000
+#define TOKEN_WARNING                          75
  #define TOKEN_COEFFICIENT                      650
  #define JOIN_TIMEOUT                           50
  #define MERGE_TIMEOUT                          200
@@ -96,6 +97,8 @@ static void *totem_get_param_by_name(struct totem_config *totem_config, const ch
  {
         if (strcmp(param_name, "totem.token") == 0)
                 return &totem_config->token_timeout;
+       if (strcmp(param_name, "totem.token_warning") == 0)
+               return &totem_config->token_warning;
         if (strcmp(param_name, "totem.token_retransmit") == 0)
                 return &totem_config->token_retransmit_timeout;
         if (strcmp(param_name, "totem.hold") == 0)
@@ -246,6 +249,8 @@ static void totem_volatile_config_read (struct totem_config *totem_config, const
  
         totem_volatile_config_set_uint32_value(totem_config, "totem.token", deleted_key, TOKEN_TIMEOUT, 0);
  
+       totem_volatile_config_set_uint32_value(totem_config, "totem.token_warning", deleted_key, TOKEN_WARNING, 1);
+
         if (totem_config->interfaces[0].member_count > 2) {
                 u32 = TOKEN_COEFFICIENT;
                 icmap_get_uint32("totem.token_coefficient", &u32);
@@ -323,6 +328,13 @@ static int totem_volatile_config_validate (
                 goto parse_error;
         }
  
+       if (totem_config->token_warning > 100 || totem_config->token_warning < 0) {
+               snprintf (local_error_reason, sizeof(local_error_reason),
+                       "The token warning parameter (%d%%) must be between 0 (disabled) and 100.",
+                       totem_config->token_warning);
+               goto parse_error;
+       }
+
         if (totem_config->token_retransmit_timeout < MINIMUM_TIMEOUT) {
                 snprintf (local_error_reason, sizeof(local_error_reason),
                         "The token retransmit timeout parameter (%d ms) may not be less than (%d ms).",
@@ -1986,6 +1998,18 @@ static void debug_dump_totem_config(const struct totem_config *totem_config)
  
         log_printf(LOGSYS_LEVEL_DEBUG, "Token Timeout (%d ms) retransmit timeout (%d ms)",
             totem_config->token_timeout, totem_config->token_retransmit_timeout);
+       if (totem_config->token_warning) {
+               uint32_t token_warning_ms = totem_config->token_warning * totem_config->token_timeout / 100;
+               log_printf(LOGSYS_LEVEL_DEBUG, "Token warning every %d ms (%d%% of Token Timeout)",
+                   token_warning_ms, totem_config->token_warning);
+               if (token_warning_ms < totem_config->token_retransmit_timeout)
+                       log_printf (LOGSYS_LEVEL_DEBUG,
+                               "The token warning interval (%d ms) is less than the token retransmit timeout (%d ms) "
+                               "which can lead to spurious token warnings. Consider increasing the token_warning parameter.",
+                               token_warning_ms, totem_config->token_retransmit_timeout);
+
+       } else
+               log_printf(LOGSYS_LEVEL_DEBUG, "Token warnings disabled");
         log_printf(LOGSYS_LEVEL_DEBUG, "token hold (%d ms) retransmits before loss (%d retrans)",
             totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const);
         log_printf(LOGSYS_LEVEL_DEBUG, "join (%d ms) send_join (%d ms) consensus (%d ms) merge (%d ms)",
diff --git a/exec/totemsrp.c b/exec/totemsrp.c

index 7d2d426d6bd2bf08ee016fc5eb643167f3146d94..aab6c809d0a301cbf6b0e9ec40d68ac46d126b26 100644 (file)
--- a/exec/totemsrp.c
+++ b/exec/totemsrp.c
@@ -401,6 +401,8 @@ struct totemsrp_instance {
  
         qb_loop_timer_handle timer_orf_token_timeout;
  
+       qb_loop_timer_handle timer_orf_token_warning;
+
         qb_loop_timer_handle timer_orf_token_retransmit_timeout;
  
         qb_loop_timer_handle timer_orf_token_hold_retransmit_timeout;
@@ -653,6 +655,7 @@ static void memb_merge_detect_endian_convert (
         struct memb_merge_detect *out);
  static void srp_addr_copy_endian_convert (struct srp_addr *out, const struct srp_addr *in);
  static void timer_function_orf_token_timeout (void *data);
+static void timer_function_orf_token_warning (void *data);
  static void timer_function_pause_timeout (void *data);
  static void timer_function_heartbeat_timeout (void *data);
  static void timer_function_token_retransmit_timeout (void *data);
@@ -883,6 +886,20 @@ int totemsrp_initialize (
         log_printf (instance->totemsrp_log_level_debug,
                 "Token Timeout (%d ms) retransmit timeout (%d ms)",
                 totem_config->token_timeout, totem_config->token_retransmit_timeout);
+       if (totem_config->token_warning) {
+               uint32_t token_warning_ms = totem_config->token_warning * totem_config->token_timeout / 100;
+               log_printf(instance->totemsrp_log_level_debug,
+                       "Token warning every %d ms (%d%% of Token Timeout)",
+                       token_warning_ms, totem_config->token_warning);
+               if (token_warning_ms < totem_config->token_retransmit_timeout)
+                       log_printf (LOGSYS_LEVEL_DEBUG,
+                               "The token warning interval (%d ms) is less than the token retransmit timeout (%d ms) "
+                               "which can lead to spurious token warnings. Consider increasing the token_warning parameter.",
+                               token_warning_ms, totem_config->token_retransmit_timeout);
+       } else {
+               log_printf(instance->totemsrp_log_level_debug,
+                       "Token warnings disabled");
+       }
         log_printf (instance->totemsrp_log_level_debug,
                 "token hold (%d ms) retransmits before loss (%d retrans)",
                 totem_config->token_hold_timeout, totem_config->token_retransmits_before_loss_const);
@@ -1566,6 +1583,21 @@ static void reset_pause_timeout (struct totemsrp_instance *instance)
         }
  }
  
+static void reset_token_warning (struct totemsrp_instance *instance) {
+       int32_t res;
+
+       qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_warning);
+       /* (re)arm at token_warning% of token_timeout; widen first so the product cannot wrap in 32 bits */
+       res = qb_loop_timer_add (instance->totemsrp_poll_handle, QB_LOOP_MED,
+               (uint64_t)instance->totem_config->token_warning * instance->totem_config->token_timeout / 100 * QB_TIME_NS_IN_MSEC,
+               (void *)instance,
+               timer_function_orf_token_warning,
+               &instance->timer_orf_token_warning);
+       if (res != 0) {
+               log_printf(instance->totemsrp_log_level_error, "reset_token_warning - qb_loop_timer_add error : %d", res);
+       }
+}
+
  static void reset_token_timeout (struct totemsrp_instance *instance) {
         int32_t res;
  
@@ -1579,6 +1611,9 @@ static void reset_token_timeout (struct totemsrp_instance *instance) {
         if (res != 0) {
                 log_printf(instance->totemsrp_log_level_error, "reset_token_timeout - qb_loop_timer_add error : %d", res);
         }
+
+       if (instance->totem_config->token_warning)
+               reset_token_warning(instance);
  }
  
  static void reset_heartbeat_timeout (struct totemsrp_instance *instance) {
@@ -1597,8 +1632,15 @@ static void reset_heartbeat_timeout (struct totemsrp_instance *instance) {
  }
  
  
+static void cancel_token_warning (struct totemsrp_instance *instance) { /* delete the token-warning timer from the poll loop */
+       qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_warning);
+}
+
  static void cancel_token_timeout (struct totemsrp_instance *instance) {
         qb_loop_timer_del (instance->totemsrp_poll_handle, instance->timer_orf_token_timeout);
+
+        if (instance->totem_config->token_warning)
+                cancel_token_warning(instance);
  }
  
  static void cancel_heartbeat_timeout (struct totemsrp_instance *instance) {
@@ -1680,6 +1722,23 @@ static void memb_recovery_state_token_loss (struct totemsrp_instance *instance)
         instance->stats.recovery_token_lost++;
  }
  
+static void timer_function_orf_token_warning (void *data)
+{
+       struct totemsrp_instance *instance = data;
+       uint64_t tv_diff;
+
+       /* token_warning may be set to 0 dynamically: then cancel the timer instead of logging and re-arming */
+       if (instance->totem_config->token_warning) {
+               tv_diff = qb_util_nano_current_get () / QB_TIME_NS_IN_MSEC -
+                       instance->stats.token[instance->stats.latest_token].rx;
+               log_printf (instance->totemsrp_log_level_notice,
+                       "Token has not been received in %u ms ", (unsigned int) tv_diff);
+               reset_token_warning(instance);
+       } else {
+               cancel_token_warning(instance);
+       }
+}
+
  static void timer_function_orf_token_timeout (void *data)
  {
         struct totemsrp_instance *instance = data;
diff --git a/include/corosync/totem/totem.h b/include/corosync/totem/totem.h

index d278e478a4fab49084f178c8f418c8927c9a9d3d..a087119c118b4e36e7782a148d8c4e3246bde747 100644 (file)
--- a/include/corosync/totem/totem.h
+++ b/include/corosync/totem/totem.h
@@ -173,6 +173,8 @@ struct totem_config {
          */
         unsigned int token_timeout;
  
+       unsigned int token_warning;
+
         unsigned int token_retransmit_timeout;
  
         unsigned int token_hold_timeout;
diff --git a/include/corosync/totem/totemstats.h b/include/corosync/totem/totemstats.h

index b87cd396aef860a3bc6e7259ffbbfe8d1dabaf66..51e604cfafa9a4886825824c1436ad8585830ff6 100644 (file)
--- a/include/corosync/totem/totemstats.h
+++ b/include/corosync/totem/totemstats.h
@@ -77,6 +77,7 @@ typedef struct {
         uint64_t rx_msg_dropped;
         uint32_t continuous_gather;
         uint32_t continuous_sendmsg_failures;
+       uint64_t time_since_token_last_received; // relative time
  
         uint8_t  firewall_enabled_or_nic_failure;
         uint32_t mtt_rx_token;
diff --git a/man/corosync.conf.5 b/man/corosync.conf.5

index 4e6d69bcce1ff9f33f620a70237e459f2917d434..e7a31cd5508f908c521977a2afac24a791d3563d 100644 (file)
--- a/man/corosync.conf.5
+++ b/man/corosync.conf.5
@@ -320,6 +320,14 @@ key.
  
  The default is 1000 milliseconds.
  
+.TP
+token_warning
+Specifies the interval between warnings that the token has not been received.  The
+value is a percentage (0-100) of the token timeout; setting it to 0 disables the
+warnings.
+
+The default is 75%.
+
  .TP
  token_coefficient
  This value is used only when
author	Chris Walker <cwalker@cray.com>
	Fri, 27 Jul 2018 05:06:32 +0000 (01:06 -0400)
committer	Jan Friesse <jfriesse@redhat.com>
	Tue, 14 Aug 2018 08:34:49 +0000 (10:34 +0200)
exec/coroparse.c		patch \| blob \| blame \| history
exec/main.c		patch \| blob \| blame \| history
exec/stats.c		patch \| blob \| blame \| history
exec/totemconfig.c		patch \| blob \| blame \| history
exec/totemsrp.c		patch \| blob \| blame \| history
include/corosync/totem/totem.h		patch \| blob \| blame \| history
include/corosync/totem/totemstats.h		patch \| blob \| blame \| history
man/corosync.conf.5		patch \| blob \| blame \| history