From: Han Zhou
Date: Mon, 19 Aug 2019 16:29:59 +0000 (-0700)
Subject: raft.c: Set candidate_retrying if no leader elected since last election.
X-Git-Url: https://git.proxmox.com/?a=commitdiff_plain;h=923f01cad678228224ae4fe86466e2f61ab2c9d0;p=mirror_ovs.git

raft.c: Set candidate_retrying if no leader elected since last election.

candidate_retrying is used to determine whether the current node is
disconnected from the cluster while the node is in the candidate role.
However, when a majority of the cluster is down, a node can flap between
the candidate and follower roles before a leader is elected, so
is_connected() flaps, too, which confuses clients.

This patch avoids the flapping with the help of a new member, had_leader:
if no leader has been elected since the last election was initiated, we
know we are still retrying and keep reporting the node as disconnected
from the cluster.
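To make the failure mode concrete, the sketch below simulates the role flap
described above.  It is illustrative only, not part of the patch: the struct,
the event sequence, and the helper names are simplified stand-ins for the
real ones in ovsdb/raft.c.

/* Minimal, self-contained sketch of the flapping problem and the fix. */
#include <stdbool.h>
#include <stdio.h>

enum role { FOLLOWER, CANDIDATE };

struct node {
    enum role role;
    bool candidate_retrying;
    bool had_leader;            /* Leader elected since last election? */
};

/* Old predicate: "retrying" only counted while in the candidate role. */
static bool
old_is_connected(const struct node *n)
{
    return !(n->role == CANDIDATE && n->candidate_retrying);
}

/* New predicate: "retrying" is remembered across role flaps. */
static bool
new_is_connected(const struct node *n)
{
    return !n->candidate_retrying;
}

static void
start_election(struct node *n)
{
    n->role = CANDIDATE;
    n->candidate_retrying = !n->had_leader;
    n->had_leader = false;
}

int
main(void)
{
    struct node n = { FOLLOWER, false, true }; /* Had a leader before outage. */

    start_election(&n);         /* First election after losing the leader. */
    start_election(&n);         /* It timed out; this one is a retry. */
    n.role = FOLLOWER;          /* A peer's vote request flips us back. */

    printf("old: %s, new: %s\n",
           old_is_connected(&n) ? "connected" : "disconnected",
           new_is_connected(&n) ? "connected" : "disconnected");
    /* Prints "old: connected, new: disconnected": the old predicate starts
     * reporting connected as soon as the role flaps back to follower, even
     * though the cluster still has no leader; the new one keeps reporting
     * disconnected until a leader is actually elected. */
    return 0;
}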
Signed-off-by: Han Zhou
Signed-off-by: Ben Pfaff
---

diff --git a/ovsdb/raft.c b/ovsdb/raft.c
index 71c5a7e1a..ca86830f3 100644
--- a/ovsdb/raft.c
+++ b/ovsdb/raft.c
@@ -285,8 +285,13 @@ struct raft {
 
     /* Candidates only.  Reinitialized at start of election. */
     int n_votes;                /* Number of votes for me. */
-    bool candidate_retrying;    /* The first round of election timed-out and it
-                                   is now retrying. */
+
+    /* Followers and candidates only. */
+    bool candidate_retrying;    /* The earlier election timed out and we are
+                                   now retrying. */
+    bool had_leader;            /* A leader has been elected since the last
+                                   election was initiated.  This helps to set
+                                   candidate_retrying. */
 };
 
 /* All Raft structures. */
@@ -344,6 +349,7 @@ static bool raft_handle_write_error(struct raft *,
                                     struct ovsdb_error *);
 
 static void raft_run_reconfigure(struct raft *);
+static void raft_set_leader(struct raft *, const struct uuid *sid);
 
 static struct raft_server *
 raft_find_server(const struct raft *raft, const struct uuid *sid)
 {
@@ -996,11 +1002,13 @@ raft_get_sid(const struct raft *raft)
 bool
 raft_is_connected(const struct raft *raft)
 {
-    return (!(raft->role == RAFT_CANDIDATE && raft->candidate_retrying)
+    bool ret = (!raft->candidate_retrying
             && !raft->joining
             && !raft->leaving
             && !raft->left
             && !raft->failed);
+    VLOG_DBG("raft_is_connected: %s\n", ret? "true": "false");
+    return ret;
 }
 
 /* Returns true if 'raft' is the cluster leader. */
@@ -1615,8 +1623,11 @@ raft_start_election(struct raft *raft, bool leadership_transfer)
     }
 
     ovs_assert(raft->role != RAFT_LEADER);
-    raft->candidate_retrying = (raft->role == RAFT_CANDIDATE);
     raft->role = RAFT_CANDIDATE;
+    /* If no leader was elected since the last election, we know we are
+     * retrying now. */
+    raft->candidate_retrying = !raft->had_leader;
+    raft->had_leader = false;
 
     raft->n_votes = 0;
 
@@ -2485,6 +2496,14 @@ raft_server_init_leader(struct raft *raft, struct raft_server *s)
     s->replied = false;
 }
 
+static void
+raft_set_leader(struct raft *raft, const struct uuid *sid)
+{
+    raft->leader_sid = *sid;
+    raft->had_leader = true;
+    raft->candidate_retrying = false;
+}
+
 static void
 raft_become_leader(struct raft *raft)
 {
@@ -2497,7 +2516,7 @@ raft_become_leader(struct raft *raft)
 
     ovs_assert(raft->role != RAFT_LEADER);
     raft->role = RAFT_LEADER;
-    raft->leader_sid = raft->sid;
+    raft_set_leader(raft, &raft->sid);
     raft_reset_election_timer(raft);
     raft_reset_ping_timer(raft);
 
@@ -2891,7 +2910,7 @@ raft_update_leader(struct raft *raft, const struct uuid *sid)
                   raft_get_nickname(raft, sid, buf, sizeof buf),
                   raft->term);
     }
-    raft->leader_sid = *sid;
+    raft_set_leader(raft, sid);
 
     /* Record the leader to the log.  This is not used by the algorithm
      * (although it could be, for quick restart), but it is used for
diff --git a/tests/ovsdb-cluster.at b/tests/ovsdb-cluster.at
index 0fbb2681f..c8532fa27 100644
--- a/tests/ovsdb-cluster.at
+++ b/tests/ovsdb-cluster.at
@@ -66,23 +66,30 @@ EXECUTION_EXAMPLES
 AT_BANNER([OVSDB - disconnect from cluster])
 
 OVS_START_SHELL_HELPERS
-# ovsdb_test_cluster_disconnect LEADER_OR_FOLLOWER
+# ovsdb_test_cluster_disconnect N_SERVERS LEADER_OR_FOLLOWER [CHECK_FLAPPING]
+# Tests a server that is disconnected from the cluster.
+# N_SERVERS: Number of servers in the cluster.
+# LEADER_OR_FOLLOWER: The role of the server that is disconnected from the
+#                     cluster: "leader" or "follower".
+# CHECK_FLAPPING: Whether to check that is_connected does not flap: "yes", "no".
 ovsdb_test_cluster_disconnect () {
-    leader_or_follower=$1
+    n=$1
+    leader_or_follower=$2
+    check_flapping=$3
     schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
     ordinal_schema > schema
     AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
     cid=`ovsdb-tool db-cid s1.db`
     schema_name=`ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema`
-    for i in `seq 2 3`; do
+    for i in `seq 2 $n`; do
         AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
     done
 
     on_exit 'kill `cat *.pid`'
-    for i in `seq 3`; do
+    for i in `seq $n`; do
         AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir --log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i --remote=punix:s$i.ovsdb s$i.db])
     done
-    for i in `seq 3`; do
+    for i in `seq $n`; do
         AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
     done
 
@@ -96,14 +103,18 @@ ovsdb_test_cluster_disconnect () {
     # a VIP on a load-balance. So we use single remote to test here.
     if test $leader_or_follower == "leader"; then
         target=1
-        shutdown="2 3"
+        shutdown=`seq $(($n/2 + 1)) $n`
+        cleanup=`seq $(($n/2))`
     else
-        target=3
+        target=$n
 
-        # shutdown follower before the leader so that there is no chance for s3
-        # become leader during the process.
-        shutdown="2 1"
+        # Shut down the followers before the leader (s1) so that there is no
+        # chance for s$n to become leader during the process.
+        shutdown="`seq 2 $(($n/2 + 1))` 1"
+        cleanup=`seq $(($n/2 + 2)) $n`
     fi
+    echo shutdown=$shutdown
+    echo cleanup=$cleanup
 
     # Connect to $target. Use "wait" to trigger a non-op transaction so
     # that test-ovsdb will not quit.
@@ -119,6 +130,11 @@ ovsdb_test_cluster_disconnect () {
 
     OVS_WAIT_UNTIL([grep "000: i=1" test-ovsdb.log])
 
+    # Start collecting raft_is_connected logs for $target before shutting down
+    # any servers.
+    tail -f s$target.log > raft_is_connected.log &
+    echo $! > tail.pid
+
     # Shutdown the other servers so that $target is disconnected from the cluster.
     for i in $shutdown; do
         OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
     done
@@ -127,18 +143,40 @@ ovsdb_test_cluster_disconnect () {
 
     # The test-ovsdb should detect the disconnect and retry.
     OVS_WAIT_UNTIL([grep disconnect test-ovsdb.log])
 
-    OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$target], [s$target.pid])
+    # The $target debug log should show "raft_is_connected: false".
+    OVS_WAIT_UNTIL([grep "raft_is_connected: false" raft_is_connected.log])
+
+    # Save the current count of "raft_is_connected: true".
+    count_old=`grep "raft_is_connected: true" raft_is_connected.log | wc -l`
+    echo count_old $count_old
+
+    if test X$check_flapping == X"yes"; then
+        sleep 10
+    fi
+    # Make sure raft_is_connected didn't flap from false to true.
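+    # (The sleep gives any flap time to be logged; if the count below still
+    # matches count_old, no new "raft_is_connected: true" line appeared, so
+    # the state never flapped back while the cluster lacked quorum.)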
+    count_new=`grep "raft_is_connected: true" raft_is_connected.log | wc -l`
+    echo count_new $count_new
+    AT_CHECK([test $count_new == $count_old])
+
+    for i in $cleanup; do
+        OVS_APP_EXIT_AND_WAIT_BY_TARGET([`pwd`/s$i], [s$i.pid])
+    done
 }
 OVS_END_SHELL_HELPERS
 
 AT_SETUP([OVSDB cluster - follower disconnect from cluster, single remote])
 AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
-ovsdb_test_cluster_disconnect follower
+ovsdb_test_cluster_disconnect 3 follower
 AT_CLEANUP
 
 AT_SETUP([OVSDB cluster - leader disconnect from cluster, single remote])
 AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
-ovsdb_test_cluster_disconnect leader
+ovsdb_test_cluster_disconnect 3 leader
+AT_CLEANUP
+
+AT_SETUP([OVSDB cluster - leader disconnect from cluster, check flapping])
+AT_KEYWORDS([ovsdb server negative unix cluster disconnect])
+ovsdb_test_cluster_disconnect 5 leader yes
 AT_CLEANUP
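For reference, here is how the helper's shutdown/cleanup arithmetic above
works out for the new five-server flapping test.  This is a worked example
only, assuming $n is 5; it is plain shell and independent of the test
harness:

n=5
# Leader case: target=1; stopping a bare majority (s3 s4 s5) removes quorum.
shutdown=`seq $(($n/2 + 1)) $n`; echo shutdown=$shutdown    # shutdown=3 4 5
cleanup=`seq $(($n/2))`;         echo cleanup=$cleanup      # cleanup=1 2
# Follower case: target=s5; the leader (s1) is shut down last.
shutdown="`seq 2 $(($n/2 + 1))` 1"; echo shutdown=$shutdown # shutdown=2 3 1
cleanup=`seq $(($n/2 + 2)) $n`;     echo cleanup=$cleanup   # cleanup=4 5

Either way the target server stays up but can no longer reach a majority,
which is exactly the state in which raft_is_connected() must keep returning
false.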