git.proxmox.com Git - ovs.git/commitdiff
reconnect: Add ability to do a number of retries without backoff.
author Ben Pfaff <blp@ovn.org>
Mon, 22 Jan 2018 19:04:58 +0000 (11:04 -0800)
committer Ben Pfaff <blp@ovn.org>
Tue, 1 May 2018 00:12:23 +0000 (17:12 -0700)
This is aimed at an upcoming database clustering implementation, where it's
desirable to try all of the cluster members quickly before backing off to
retry them again in sequence.

Signed-off-by: Ben Pfaff <blp@ovn.org>
Acked-by: Russell Bryant <russell@ovn.org>
Signed-off-by: Ben Pfaff <blp@ovn.org>
Acked-by: Justin Pettit <jpettit@ovn.org>
lib/reconnect.c
lib/reconnect.h
python/ovs/reconnect.py
tests/reconnect.at
tests/test-reconnect.c
tests/test-reconnect.py

index 04cb15b7ce8c16eee2af0c398c6b34373c6e8f96..0f21378d7af2154e1c373c53c0bf5f3b16221966 100644 (file)
@@ -62,6 +62,7 @@ struct reconnect {
     long long int last_connected;
     long long int last_disconnected;
     unsigned int max_tries;
+    unsigned int backoff_free_tries;
 
     /* These values are simply for statistics reporting, not otherwise used
      * directly by anything internal. */
@@ -206,6 +207,15 @@ reconnect_get_max_tries(struct reconnect *fsm)
     return fsm->max_tries;
 }
 
+/* Sets the number of connection attempts that will be made without backoff to
+ * 'backoff_free_tries'.  Values 0 and 1 both represent a single attempt. */
+void
+reconnect_set_backoff_free_tries(struct reconnect *fsm,
+                                 unsigned int backoff_free_tries)
+{
+    fsm->backoff_free_tries = backoff_free_tries;
+}
+
 /* Configures the backoff parameters for 'fsm'.  'min_backoff' is the minimum
  * number of milliseconds, and 'max_backoff' is the maximum, between connection
  * attempts.  The current backoff is also the duration that 'fsm' is willing to
@@ -346,7 +356,7 @@ reconnect_disconnected(struct reconnect *fsm, long long int now, int error)
                 VLOG(fsm->info, "%s: error listening for connections",
                      fsm->name);
             }
-        } else {
+        } else if (fsm->backoff < fsm->max_backoff) {
             const char *type = fsm->passive ? "listen" : "connection";
             if (error > 0) {
                 VLOG_INFO("%s: %s attempt failed (%s)",
@@ -354,35 +364,47 @@ reconnect_disconnected(struct reconnect *fsm, long long int now, int error)
             } else {
                 VLOG(fsm->info, "%s: %s attempt timed out", fsm->name, type);
             }
+        } else {
+            /* We have reached the maximum backoff, so suppress logging to
+             * avoid wastefully filling the log.  (Previously we logged that we
+             * were suppressing further logging, see below.) */
         }
 
         if (fsm->state & (S_ACTIVE | S_IDLE)) {
             fsm->last_disconnected = now;
         }
+
+        if (!reconnect_may_retry(fsm)) {
+            reconnect_transition__(fsm, now, S_VOID);
+            return;
+        }
+
         /* Back off. */
-        if (fsm->state & (S_ACTIVE | S_IDLE)
-             && (fsm->last_activity - fsm->last_connected >= fsm->backoff
-                 || fsm->passive)) {
+        if (fsm->backoff_free_tries > 1) {
+            fsm->backoff_free_tries--;
+            fsm->backoff = 0;
+        } else if (fsm->state & (S_ACTIVE | S_IDLE)
+                   && (fsm->last_activity - fsm->last_connected >= fsm->backoff
+                       || fsm->passive)) {
             fsm->backoff = fsm->passive ? 0 : fsm->min_backoff;
         } else {
             if (fsm->backoff < fsm->min_backoff) {
                 fsm->backoff = fsm->min_backoff;
-            } else if (fsm->backoff >= fsm->max_backoff / 2) {
-                fsm->backoff = fsm->max_backoff;
-            } else {
+            } else if (fsm->backoff < fsm->max_backoff / 2) {
                 fsm->backoff *= 2;
-            }
-            if (fsm->passive) {
-                VLOG(fsm->info, "%s: waiting %.3g seconds before trying to "
-                          "listen again", fsm->name, fsm->backoff / 1000.0);
+                VLOG(fsm->info, "%s: waiting %.3g seconds before %s",
+                     fsm->name, fsm->backoff / 1000.0,
+                     fsm->passive ? "trying to listen again" : "reconnect");
             } else {
-                VLOG(fsm->info, "%s: waiting %.3g seconds before reconnect",
-                          fsm->name, fsm->backoff / 1000.0);
+                if (fsm->backoff < fsm->max_backoff) {
+                    VLOG_INFO("%s: continuing to %s in the background but "
+                              "suppressing further logging", fsm->name,
+                              fsm->passive ? "try to listen" : "reconnect");
+                }
+                fsm->backoff = fsm->max_backoff;
             }
         }
-
-        reconnect_transition__(fsm, now,
-                               reconnect_may_retry(fsm) ? S_BACKOFF : S_VOID);
+        reconnect_transition__(fsm, now, S_BACKOFF);
     }
 }
 
@@ -397,7 +419,7 @@ reconnect_connecting(struct reconnect *fsm, long long int now)
     if (fsm->state != S_CONNECTING) {
         if (fsm->passive) {
             VLOG(fsm->info, "%s: listening...", fsm->name);
-        } else {
+        } else if (fsm->backoff < fsm->max_backoff) {
             VLOG(fsm->info, "%s: connecting...", fsm->name);
         }
         reconnect_transition__(fsm, now, S_CONNECTING);
index 4446713ce8734d078ea8c9266341f7f856eddb82..9f2d469e2ddd5c69e2138008103a4e862d4ff64e 100644 (file)
@@ -51,6 +51,8 @@ int reconnect_get_probe_interval(const struct reconnect *);
 
 void reconnect_set_max_tries(struct reconnect *, unsigned int max_tries);
 unsigned int reconnect_get_max_tries(struct reconnect *);
+void reconnect_set_backoff_free_tries(struct reconnect *,
+                                      unsigned int backoff_free_tries);
 
 void reconnect_set_backoff(struct reconnect *,
                            int min_backoff, int max_backoff);
@@ -65,6 +67,7 @@ void reconnect_enable(struct reconnect *, long long int now);
 void reconnect_disable(struct reconnect *, long long int now);
 
 void reconnect_force_reconnect(struct reconnect *, long long int now);
+void reconnect_skip_backoff(struct reconnect *);
 
 bool reconnect_is_connected(const struct reconnect *);
 unsigned int reconnect_get_last_connect_elapsed(const struct reconnect *,
index ec52ebb7affc4d1be0919f1b8c79bb7cac63c72c..34cc769870315c14ddd66507d5320ad00dc282e1 100644 (file)
@@ -154,6 +154,7 @@ class Reconnect(object):
         self.last_connected = None
         self.last_disconnected = None
         self.max_tries = None
+        self.backoff_free_tries = 0
 
         self.creation_time = now
         self.n_attempted_connections = 0
@@ -242,6 +243,12 @@ class Reconnect(object):
             self.backoff > self.max_backoff):
                 self.backoff = self.max_backoff
 
+    def set_backoff_free_tries(self, backoff_free_tries):
+        """Sets the number of connection attempts that will be made without
+        backoff to 'backoff_free_tries'.  Values 0 and 1 both
+        represent a single attempt."""
+        self.backoff_free_tries = backoff_free_tries
+
     def set_probe_interval(self, probe_interval):
         """Sets the "probe interval" to 'probe_interval', in milliseconds.  If
         this is zero, it disables the connection keepalive feature.  If it is
@@ -337,7 +344,7 @@ class Reconnect(object):
                 else:
                     self.info_level("%s: error listening for connections"
                                     % self.name)
-            else:
+            elif self.backoff < self.max_backoff:
                 if self.passive:
                     type_ = "listen"
                 else:
@@ -352,8 +359,15 @@ class Reconnect(object):
             if (self.state in (Reconnect.Active, Reconnect.Idle)):
                 self.last_disconnected = now
 
+            if not self.__may_retry():
+                self._transition(now, Reconnect.Void)
+                return
+
             # Back off
-            if (self.state in (Reconnect.Active, Reconnect.Idle) and
+            if self.backoff_free_tries > 1:
+                self.backoff_free_tries -= 1
+                self.backoff = 0
+            elif (self.state in (Reconnect.Active, Reconnect.Idle) and
                 (self.last_activity - self.last_connected >= self.backoff or
                  self.passive)):
                 if self.passive:
@@ -363,23 +377,26 @@ class Reconnect(object):
             else:
                 if self.backoff < self.min_backoff:
                     self.backoff = self.min_backoff
-                elif self.backoff >= self.max_backoff / 2:
-                    self.backoff = self.max_backoff
-                else:
+                elif self.backoff < self.max_backoff / 2:
                     self.backoff *= 2
-
-                if self.passive:
-                    self.info_level("%s: waiting %.3g seconds before trying "
-                                    "to listen again"
-                                    % (self.name, self.backoff / 1000.0))
+                    if self.passive:
+                        action = "trying to listen again"
+                    else:
+                        action = "reconnect"
+                    self.info_level("%s: waiting %.3g seconds before %s"
+                                    % (self.name, self.backoff / 1000.0,
+                                       action))
                 else:
-                    self.info_level("%s: waiting %.3g seconds before reconnect"
-                                    % (self.name, self.backoff / 1000.0))
-
-            if self.__may_retry():
-                self._transition(now, Reconnect.Backoff)
-            else:
-                self._transition(now, Reconnect.Void)
+                    if self.backoff < self.max_backoff:
+                        if self.passive:
+                            action = "try to listen"
+                        else:
+                            action = "reconnect"
+                        self.info_level("%s: continuing to %s in the "
+                                        "background but suppressing further "
+                                        "logging" % (self.name, action))
+                    self.backoff = self.max_backoff
+            self._transition(now, Reconnect.Backoff)
 
     def connecting(self, now):
         """Tell this FSM that a connection or listening attempt is in progress.
@@ -390,7 +407,7 @@ class Reconnect(object):
         if self.state != Reconnect.ConnectInProgress:
             if self.passive:
                 self.info_level("%s: listening..." % self.name)
-            else:
+            elif self.backoff < self.max_backoff:
                 self.info_level("%s: connecting..." % self.name)
             self._transition(now, Reconnect.ConnectInProgress)
 
index c88ca785cad262c5ffd5ee81cedc11a60eded058..59c95d95bdd30190bea0b60d46f5f16eab32cfd5 100644 (file)
@@ -1036,6 +1036,60 @@ timeout
   in BACKOFF for 2000 ms (2000 ms backoff)
 ])
 
+######################################################################
+RECONNECT_CHECK([backoff-free tries work],
+  [set-backoff-free-tries 2
+enable
+
+# Connection fails quickly.
+run
+connect-failed ECONNREFUSED
+
+# No backoff.
+run
+timeout
+
+# Connection fails quickly again.
+run
+connect-failed ECONNREFUSED
+
+# Back off for 1000 ms.
+run
+timeout
+],
+   [### t=1000 ###
+set-backoff-free-tries 2
+enable
+  in BACKOFF for 0 ms (0 ms backoff)
+
+# Connection fails quickly.
+run
+  should connect
+connect-failed ECONNREFUSED
+  0 successful connections out of 1 attempts, seqno 0
+
+# No backoff.
+run
+  should connect
+timeout
+  advance 0 ms
+
+# Connection fails quickly again.
+run
+  should connect
+connect-failed ECONNREFUSED
+  in BACKOFF for 0 ms (1000 ms backoff)
+  0 successful connections out of 2 attempts, seqno 0
+
+# Back off for 1000 ms.
+run
+timeout
+  advance 1000 ms
+
+### t=2000 ###
+  in BACKOFF for 1000 ms (1000 ms backoff)
+])
+
 ######################################################################
 RECONNECT_CHECK([max-tries of 1 honored],
   [set-max-tries 1
@@ -1090,7 +1144,7 @@ timeout
 run
   should disconnect
 disconnected
-  in VOID for 0 ms (1000 ms backoff)
+  in VOID for 0 ms (0 ms backoff)
   1 successful connections out of 1 attempts, seqno 2
   disconnected
   disconnected at 11000 ms (0 ms ago)
index 72252b8f707bf3fb87430a14deadc33d21153b7f..5a14e7fe58dae58254fa5e5fcbb1c3d8ebaae974 100644 (file)
@@ -207,6 +207,12 @@ do_set_max_tries(struct ovs_cmdl_context *ctx)
     reconnect_set_max_tries(reconnect, atoi(ctx->argv[1]));
 }
 
+static void
+do_set_backoff_free_tries(struct ovs_cmdl_context *ctx)
+{
+    reconnect_set_backoff_free_tries(reconnect, atoi(ctx->argv[1]));
+}
+
 static void
 diff_stats(const struct reconnect_stats *old,
            const struct reconnect_stats *new,
@@ -284,6 +290,8 @@ static const struct ovs_cmdl_command all_commands[] = {
     { "advance", NULL, 1, 1, do_advance, OVS_RO },
     { "timeout", NULL, 0, 0, do_timeout, OVS_RO },
     { "set-max-tries", NULL, 1, 1, do_set_max_tries, OVS_RO },
+    { "set-backoff-free-tries", NULL, 1, 1, do_set_backoff_free_tries,
+      OVS_RO },
     { "passive", NULL, 0, 0, do_set_passive, OVS_RO },
     { "listening", NULL, 0, 0, do_listening, OVS_RO },
     { "listen-error", NULL, 1, 1, do_listen_error, OVS_RO },
index 8132fd9258ef63a6f949778460ff740938239c04..6cd052878eb1c09c3e2fac70e13bd67a74a73d5e 100644 (file)
@@ -104,6 +104,10 @@ def do_set_max_tries(arg):
     r.set_max_tries(int(arg))
 
 
+def do_set_backoff_free_tries(arg):
+    r.set_backoff_free_tries(int(arg))
+
+
 def diff_stats(old, new, delta):
     if (old.state != new.state or
         old.state_elapsed != new.state_elapsed or
@@ -173,6 +177,7 @@ def main():
         "advance": do_advance,
         "timeout": do_timeout,
         "set-max-tries": do_set_max_tries,
+        "set-backoff-free-tries": do_set_backoff_free_tries,
         "passive": do_set_passive,
         "listening": do_listening,
         "listen-error": do_listen_error