]> git.proxmox.com Git - mirror_ovs.git/commitdiff
ovsdb raft: Fix duplicated transaction execution when leader failover.
authorHan Zhou <hzhou8@ebay.com>
Fri, 12 Apr 2019 23:26:28 +0000 (16:26 -0700)
committerBen Pfaff <blp@ovn.org>
Mon, 15 Apr 2019 20:16:02 +0000 (13:16 -0700)
When a transaction is submitted from a client connected to a follower,
the leader may crash after receiving the execute_command_request from
the follower and sending out the append request to the majority of
followers, but before sending the execute_command_reply to the
follower. The transaction would finally get committed by the new
leader. However, with the current implementation the transaction would
be committed twice.

For the root cause, there are two cases:

Case 1, the connected follower becomes the new leader. In this case,
the pending command of the follower will be cancelled during its role
change to leader, so the trigger for the transaction will be retried.

Case 2, another follower becomes the new leader. In this case, since
there is no execute_command_reply from the original leader (which has
crashed), the command will finally time out, causing the trigger for
the transaction to be retried.

In both cases, the transaction will be retried by the server node's
trigger retrying logic. This patch fixes the problem by below changes:

1) A pending command can be completed not only by
execute_command_reply, but also when the eid is committed, if the
execute_command_reply never came.

2) Instead of cancelling all pending commands during role change, let
the commands continue waiting to be completed when the eid is
committed. The timeout is increased to twice the election base time,
so that a pending command still has a chance to be completed when the
leader crashes.

This patch fixes the two raft failure test cases previously disabled.
See the test case for details of how to reproduce the problem.

Signed-off-by: Han Zhou <hzhou8@ebay.com>
Signed-off-by: Ben Pfaff <blp@ovn.org>
ovsdb/raft.c
tests/ovsdb-cluster.at

index 02bdf115d000f73c7ccad67d0dd18be1ebec5f56..77ad365c260162d6a8e75906d5f7e4b676cb7099 100644 (file)
@@ -1607,10 +1607,7 @@ raft_start_election(struct raft *raft, bool leadership_transfer)
         return;
     }
 
-    raft_complete_all_commands(raft, RAFT_CMD_LOST_LEADERSHIP);
-
     ovs_assert(raft->role != RAFT_LEADER);
-    ovs_assert(hmap_is_empty(&raft->commands));
     raft->role = RAFT_CANDIDATE;
 
     raft->n_votes = 0;
@@ -1793,17 +1790,22 @@ raft_run(struct raft *raft)
         }
     }
 
-    if (time_msec() >= raft->ping_timeout) {
+    long long int now = time_msec();
+    if (now >= raft->ping_timeout) {
         if (raft->role == RAFT_LEADER) {
             raft_send_heartbeats(raft);
-        } else {
-            long long int now = time_msec();
-            struct raft_command *cmd, *next_cmd;
-            HMAP_FOR_EACH_SAFE (cmd, next_cmd, hmap_node, &raft->commands) {
-                if (cmd->timestamp
-                    && now - cmd->timestamp > ELECTION_BASE_MSEC) {
-                    raft_command_complete(raft, cmd, RAFT_CMD_TIMEOUT);
-                }
+        }
+        /* Check if any commands timeout. Timeout is set to twice the time of
+         * election base time so that commands can complete properly during
+         * leader election. E.g. a leader crashed and current node with pending
+         * commands becomes new leader: the pending commands can still complete
+         * if the crashed leader has replicated the transactions to majority of
+         * followers before it crashed. */
+        struct raft_command *cmd, *next_cmd;
+        HMAP_FOR_EACH_SAFE (cmd, next_cmd, hmap_node, &raft->commands) {
+            if (cmd->timestamp
+                && now - cmd->timestamp > ELECTION_BASE_MSEC * 2) {
+                raft_command_complete(raft, cmd, RAFT_CMD_TIMEOUT);
             }
             raft_reset_ping_timer(raft);
         }
@@ -1974,6 +1976,7 @@ raft_command_initiate(struct raft *raft,
     struct raft_command *cmd = raft_command_create_incomplete(raft, index);
     ovs_assert(eid);
     cmd->eid = *eid;
+    cmd->timestamp = time_msec();
 
     raft_waiter_create(raft, RAFT_W_ENTRY, true)->entry.index = cmd->index;
 
@@ -1996,6 +1999,15 @@ raft_command_initiate(struct raft *raft,
     return cmd;
 }
 
+static void
+log_all_commands(struct raft *raft)
+{
+    struct raft_command *cmd, *next;
+    HMAP_FOR_EACH_SAFE (cmd, next, hmap_node, &raft->commands) {
+        VLOG_DBG("raft command eid: "UUID_FMT, UUID_ARGS(&cmd->eid));
+    }
+}
+
 static struct raft_command * OVS_WARN_UNUSED_RESULT
 raft_command_execute__(struct raft *raft,
                        const struct json *data, const struct json *servers,
@@ -2051,6 +2063,7 @@ raft_command_execute__(struct raft *raft,
         struct raft_command *cmd = raft_command_create_incomplete(raft, 0);
         cmd->timestamp = time_msec();
         cmd->eid = eid;
+        log_all_commands(raft);
         return cmd;
     }
 
@@ -2126,6 +2139,8 @@ raft_command_complete(struct raft *raft,
                       struct raft_command *cmd,
                       enum raft_command_status status)
 {
+    VLOG_DBG("raft_command_complete eid "UUID_FMT" status: %s",
+             UUID_ARGS(&cmd->eid), raft_command_status_to_string(status));
     if (!uuid_is_zero(&cmd->sid)) {
         uint64_t commit_index = status == RAFT_CMD_SUCCESS ? cmd->index : 0;
         raft_send_execute_command_reply(raft, &cmd->sid, &cmd->eid, status,
@@ -2148,19 +2163,6 @@ raft_complete_all_commands(struct raft *raft, enum raft_command_status status)
     }
 }
 
-static struct raft_command *
-raft_find_command_by_index(struct raft *raft, uint64_t index)
-{
-    struct raft_command *cmd;
-
-    HMAP_FOR_EACH_IN_BUCKET (cmd, hmap_node, index, &raft->commands) {
-        if (cmd->index == index) {
-            return cmd;
-        }
-    }
-    return NULL;
-}
-
 static struct raft_command *
 raft_find_command_by_eid(struct raft *raft, const struct uuid *eid)
 {
@@ -2441,7 +2443,7 @@ raft_server_init_leader(struct raft *raft, struct raft_server *s)
 static void
 raft_become_leader(struct raft *raft)
 {
-    raft_complete_all_commands(raft, RAFT_CMD_LOST_LEADERSHIP);
+    log_all_commands(raft);
 
     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
     VLOG_INFO_RL(&rl, "term %"PRIu64": elected leader by %d+ of "
@@ -2627,8 +2629,14 @@ raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
             const struct raft_entry *e = raft_get_entry(raft, index);
             if (e->data) {
                 struct raft_command *cmd
-                    = raft_find_command_by_index(raft, index);
+                    = raft_find_command_by_eid(raft, &e->eid);
                 if (cmd) {
+                    if (!cmd->index) {
+                        VLOG_DBG("Command completed after role change from"
+                                 " follower to leader "UUID_FMT,
+                                 UUID_ARGS(&e->eid));
+                        cmd->index = index;
+                    }
                     raft_command_complete(raft, cmd, RAFT_CMD_SUCCESS);
                 }
             }
@@ -2641,6 +2649,20 @@ raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
         }
     } else {
         raft->commit_index = new_commit_index;
+        /* Check if any pending command can be completed, and complete it.
+         * This can happen when leader fail-over before sending
+         * execute_command_reply. */
+        const struct uuid *eid = raft_get_eid(raft, new_commit_index);
+        struct raft_command *cmd = raft_find_command_by_eid(raft, eid);
+        if (cmd) {
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
+            VLOG_INFO_RL(&rl,
+                         "Command completed without reply (eid: "UUID_FMT", "
+                         "commit index: %"PRIu64")",
+                         UUID_ARGS(eid), new_commit_index);
+            cmd->index = new_commit_index;
+            raft_command_complete(raft, cmd, RAFT_CMD_SUCCESS);
+        }
     }
 
     /* Write the commit index to the log.  The next time we restart, this
index 4e8876617b8740f61f24e00cfba9203980225d1c..931da6e7dec6a9c5c2f0abb2dcc6eaf509afab0b 100644 (file)
@@ -138,8 +138,6 @@ AT_BANNER([OVSDB - cluster failure with pending transaction])
 
 AT_SETUP([OVSDB cluster - txn on follower-2, leader crash before sending appendReq, follower-2 becomes leader])
 AT_KEYWORDS([ovsdb server negative unix cluster pending-txn])
-# XXX: fix bug before enabling this test
-AT_CHECK([exit 77])
 ovsdb_cluster_failure_test 2 3 1 crash-before-sending-append-request 2
 AT_CLEANUP
 
@@ -157,8 +155,6 @@ AT_CLEANUP
 
 AT_SETUP([OVSDB cluster - txn on follower-2, leader crash before sending execRep, follower-3 becomes leader])
 AT_KEYWORDS([ovsdb server negative unix cluster pending-txn])
-# XXX: fix bug before enabling this test
-AT_CHECK([exit 77])
 ovsdb_cluster_failure_test 2 3 1 crash-before-sending-execute-command-reply 3
 AT_CLEANUP