ovsdb/raft.c

   1 /*
   2  * Copyright (c) 2017, 2018 Nicira, Inc.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at:
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 #include <config.h>
  18
  19 #include "raft.h"
  20 #include "raft-private.h"
  21
  22 #include <errno.h>
  23 #include <unistd.h>
  24
  25 #include "hash.h"
  26 #include "jsonrpc.h"
  27 #include "lockfile.h"
  28 #include "openvswitch/dynamic-string.h"
  29 #include "openvswitch/hmap.h"
  30 #include "openvswitch/json.h"
  31 #include "openvswitch/list.h"
  32 #include "openvswitch/poll-loop.h"
  33 #include "openvswitch/vlog.h"
  34 #include "ovsdb-error.h"
  35 #include "ovsdb-parser.h"
  36 #include "ovsdb/log.h"
  37 #include "raft-rpc.h"
  38 #include "random.h"
  39 #include "socket-util.h"
  40 #include "stream.h"
  41 #include "timeval.h"
  42 #include "unicode.h"
  43 #include "unixctl.h"
  44 #include "util.h"
  45 #include "uuid.h"
  46
  47 VLOG_DEFINE_THIS_MODULE(raft);
  48
/* Roles for a Raft server:
 *
 *    - Followers: Servers in touch with the current leader.
 *
 *    - Candidate: Servers unaware of a current leader and seeking election to
 *      leader.
 *
 *    - Leader: Handles all client requests.  At most one at a time.
 *
 * In normal operation there is exactly one leader and all of the other servers
 * are followers.
 *
 * RAFT_FOLLOWER is the role that raft_alloc() assigns to a new instance. */
enum raft_role {
    RAFT_FOLLOWER,              /* In touch with the current leader. */
    RAFT_CANDIDATE,             /* No known leader; seeking election. */
    RAFT_LEADER                 /* Handles all client requests. */
};
  65
/* Flags for unit tests.
 *
 * NOTE(review): judging by their names, the FT_CRASH_* values each select a
 * point (before/after sending or receiving a particular RPC) at which a crash
 * is injected, and FT_DELAY_ELECTION delays an election.  The code that acts
 * on these values is not in this part of the file -- confirm there. */
enum raft_failure_test {
    FT_NO_TEST,                 /* First enumerator, so its value is 0. */
    FT_CRASH_BEFORE_SEND_APPEND_REQ,
    FT_CRASH_AFTER_SEND_APPEND_REQ,
    FT_CRASH_BEFORE_SEND_EXEC_REP,
    FT_CRASH_AFTER_SEND_EXEC_REP,
    FT_CRASH_BEFORE_SEND_EXEC_REQ,
    FT_CRASH_AFTER_SEND_EXEC_REQ,
    FT_CRASH_AFTER_RECV_APPEND_REQ_UPDATE,
    FT_DELAY_ELECTION
};
/* The failure test currently selected.  Being a static object without an
 * initializer, it starts out as 0, that is, FT_NO_TEST. */
static enum raft_failure_test failure_test;
  79
/* A connection between this Raft server and another one. */
struct raft_conn {
    struct ovs_list list_node;  /* In struct raft's 'conns' list. */
    struct jsonrpc_session *js; /* JSON-RPC connection. */
    /* NOTE(review): "this server" presumably means the server at the other
     * end of the connection rather than the local server -- confirm. */
    struct uuid sid;            /* This server's unique ID. */
    char *nickname;             /* Short name for use in log messages. */
    bool incoming;              /* True if incoming, false if outgoing. */
    unsigned int js_seqno;      /* Seqno for noticing (re)connections. */
};

/* Forward declaration; the definition is elsewhere in this file. */
static void raft_conn_close(struct raft_conn *);
  91
/* A "command", that is, a request to append an entry to the log.
 *
 * The Raft specification only allows clients to issue commands to the leader.
 * With this implementation, clients may issue a command on any server, which
 * then relays the command to the leader if necessary.
 *
 * This structure is thus used in three cases:
 *
 *     1. We are the leader and the command was issued to us directly.
 *
 *     2. We are a follower and relayed the command to the leader.
 *
 *     3. We are the leader and a follower relayed the command to us.
 *
 * The members below are grouped by the case(s) in which they are meaningful.
 */
struct raft_command {
    /* All cases. */
    struct hmap_node hmap_node; /* In struct raft's 'commands' hmap. */
    unsigned int n_refs;        /* Reference count.  */
    enum raft_command_status status; /* Execution status. */
    struct uuid eid;            /* Entry ID of result. */

    /* Case 1 only. */
    uint64_t index;             /* Index in log (0 if being relayed). */

    /* Case 2 only. */
    long long int timestamp;    /* Issue or last ping time, for expiration. */

    /* Case 3 only. */
    struct uuid sid;            /* The follower (otherwise UUID_ZERO). */
};

/* Forward declarations; the definitions are elsewhere in this file.
 *
 * NOTE(review): by their names and parameters these finish one command, or
 * every command in a struct raft, with the given status -- confirm against
 * the definitions. */
static void raft_command_complete(struct raft *, struct raft_command *,
                                  enum raft_command_status);

static void raft_complete_all_commands(struct raft *,
                                       enum raft_command_status);
 128
/* Type of deferred action, see struct raft_waiter.  Each value selects the
 * like-named member of the union inside struct raft_waiter. */
enum raft_waiter_type {
    RAFT_W_ENTRY,               /* Waits for a RAFT_REC_ENTRY write. */
    RAFT_W_TERM,                /* Waits for a RAFT_REC_TERM/VOTE write. */
    RAFT_W_RPC,                 /* Holds an RPC until earlier waiters commit. */
};
 135
/* An action deferred until a log write commits to disk. */
struct raft_waiter {
    struct ovs_list list_node;  /* In struct raft's 'waiters' list. */
    /* NOTE(review): presumably identifies the log commit that this waiter is
     * waiting for -- confirm where it is assigned. */
    uint64_t commit_ticket;

    enum raft_waiter_type type; /* Selects the valid union member below. */
    union {
        /* RAFT_W_ENTRY.
         *
         * Waits for a RAFT_REC_ENTRY write to our local log to commit.  Upon
         * completion, updates 'log_synced' to indicate that the new log entry
         * or entries are committed and, if we are leader, also updates our
         * local 'match_index'. */
        struct {
            uint64_t index;
        } entry;

        /* RAFT_W_TERM.
         *
         * Waits for a RAFT_REC_TERM or RAFT_REC_VOTE record write to commit.
         * Upon completion, updates 'synced_term' and 'synced_vote', which
         * triggers sending RPCs deferred by the uncommitted 'term' and
         * 'vote'. */
        struct {
            uint64_t term;
            struct uuid vote;
        } term;

        /* RAFT_W_RPC.
         *
         * Sometimes, sending an RPC to a peer must be delayed until an entry,
         * a term, or a vote mentioned in the RPC is synced to disk.  This
         * waiter keeps a copy of such an RPC until the previous waiters have
         * committed. */
        union raft_rpc *rpc;
    };
};

/* Forward declarations; the definitions are elsewhere in this file. */
static struct raft_waiter *raft_waiter_create(struct raft *,
                                              enum raft_waiter_type,
                                              bool start_commit);
static void raft_waiters_destroy(struct raft *);
 178
/* The Raft state machine. */
struct raft {
    struct hmap_node hmap_node; /* In 'all_rafts'. */
    struct ovsdb_log *log;      /* Log that records are written to. */

/* Persistent derived state.
 *
 * This must be updated on stable storage before responding to RPCs.  It can be
 * derived from the header, snapshot, and log in 'log'. */

    struct uuid cid;            /* Cluster ID (immutable for the cluster). */
    struct uuid sid;            /* Server ID (immutable for the server). */
    char *local_address;        /* Local address (immutable for the server). */
    char *local_nickname;       /* Used for local server in log messages. */
    char *name;                 /* Schema name (immutable for the cluster). */

    /* Contains "struct raft_server"s and represents the server configuration
     * most recently added to 'log'. */
    struct hmap servers;

#define ELECTION_BASE_MSEC 1000
#define ELECTION_RANGE_MSEC 1000
    /* The election timeout base value for leader election, in milliseconds.
     * It can be set by unixctl cluster/change-election-timer. Default value is
     * ELECTION_BASE_MSEC.
     *
     * NOTE(review): ELECTION_RANGE_MSEC is presumably the width of the random
     * range added on top of the base -- confirm where it is used. */
    uint64_t election_timer;
    /* If not 0, it is the new value of election_timer being proposed. */
    uint64_t election_timer_new;

/* Persistent state on all servers.
 *
 * Must be updated on stable storage before responding to RPCs. */

    /* Current term and the vote for that term.  These might be on the way to
     * disk now. */
    uint64_t term;              /* Initialized to 0 and only increases. */
    struct uuid vote;           /* All-zeros if no vote yet in 'term'. */

    /* The term and vote that have been synced to disk. */
    uint64_t synced_term;
    struct uuid synced_vote;

    /* The log.
     *
     * A log entry with index 1 never really exists; the initial snapshot for a
     * Raft is considered to include this index.  The first real log entry has
     * index 2.
     *
     * A new Raft instance contains an empty log:  log_start=2, log_end=2.
     * Over time, the log grows:                   log_start=2, log_end=N.
     * At some point, the server takes a snapshot: log_start=N, log_end=N.
     * The log continues to grow:                  log_start=N, log_end=N+1...
     *
     * Must be updated on stable storage before responding to RPCs. */
    struct raft_entry *entries; /* Entry i is entries[i - log_start]. */
    uint64_t log_start;         /* Index of first entry in log. */
    uint64_t log_end;           /* Index of last entry in log, plus 1. */
    uint64_t log_synced;        /* Index of last synced entry. */
    size_t allocated_log;       /* Allocated elements in 'entries'. */

    /* Snapshot state (see Figure 5.1)
     *
     * This is the state of the cluster as of the last discarded log entry,
     * that is, at log index 'log_start - 1' (called prevIndex in Figure 5.1).
     * Only committed log entries can be included in a snapshot. */
    struct raft_entry snap;

/* Volatile state.
 *
 * The snapshot is always committed, but the rest of the log might not be yet.
 * 'last_applied' tracks what entries have been passed to the client.  If the
 * client hasn't yet read the latest snapshot, then even the snapshot isn't
 * applied yet.  Thus, the invariants are different for these members:
 *
 *     log_start - 2 <= last_applied <= commit_index < log_end.
 *     log_start - 1                 <= commit_index < log_end.
 */

    enum raft_role role;        /* Current role. */
    uint64_t commit_index;      /* Max log index known to be committed. */
    uint64_t last_applied;      /* Max log index applied to state machine. */
    struct uuid leader_sid;     /* Server ID of leader (zero, if unknown). */

    long long int election_base;    /* Time of last heartbeat from leader. */
    long long int election_timeout; /* Time at which we start an election. */

    /* Used for joining a cluster. */
    bool joining;                 /* Attempting to join the cluster? */
    struct sset remote_addresses; /* Addresses to try to find other servers. */
    long long int join_timeout;   /* Time to re-send add server request. */

    /* Used for leaving a cluster. */
    bool leaving;               /* True if we are leaving the cluster. */
    bool left;                  /* True if we have finished leaving. */
    long long int leave_timeout; /* Time to re-send remove server request. */

    /* Failure. */
    bool failed;                /* True if unrecoverable error has occurred. */

    /* File synchronization. */
    struct ovs_list waiters;    /* Contains "struct raft_waiter"s. */

    /* Network connections. */
    struct pstream *listener;   /* For connections from other Raft servers. */
    long long int listen_backoff; /* For retrying creating 'listener'. */
    struct ovs_list conns;      /* Contains struct raft_conns. */

    /* Leaders only.  Reinitialized after becoming leader. */
    struct hmap add_servers;    /* Contains "struct raft_server"s to add. */
    struct raft_server *remove_server; /* Server being removed. */
    struct hmap commands;       /* Contains "struct raft_command"s. */
    long long int ping_timeout; /* Time at which to send a heartbeat. */

    /* Candidates only.  Reinitialized at start of election. */
    int n_votes;                /* Number of votes for me. */

    /* Followers and candidates only. */
    bool candidate_retrying;    /* The earlier election timed out and we are
                                   now retrying. */
    bool had_leader;            /* A leader has been elected since the last
                                   election was initiated.  This helps to set
                                   candidate_retrying. */
};
 302
/* All Raft structures.  Each struct raft links in through its 'hmap_node'. */
static struct hmap all_rafts = HMAP_INITIALIZER(&all_rafts);

/* Forward declarations for file-local helpers.  Every function below is
 * 'static', so its definition is elsewhere in this file. */

static void raft_init(void);

static struct ovsdb_error *raft_read_header(struct raft *)
    OVS_WARN_UNUSED_RESULT;

static void raft_send_execute_command_reply(struct raft *,
                                            const struct uuid *sid,
                                            const struct uuid *eid,
                                            enum raft_command_status,
                                            uint64_t commit_index);

static void raft_update_our_match_index(struct raft *, uint64_t min_index);

static void raft_send_remove_server_reply__(
    struct raft *, const struct uuid *target_sid,
    const struct uuid *requester_sid, struct unixctl_conn *requester_conn,
    bool success, const char *comment);
static void raft_finished_leaving_cluster(struct raft *);

static void raft_server_init_leader(struct raft *, struct raft_server *);

static bool raft_rpc_is_heartbeat(const union raft_rpc *);
static bool raft_is_rpc_synced(const struct raft *, const union raft_rpc *);

static void raft_handle_rpc(struct raft *, const union raft_rpc *);

/* raft_send() wraps raft_send_at(), passing the caller's source line
 * (__LINE__) as 'line_number'. */
static bool raft_send_at(struct raft *, const union raft_rpc *,
                         int line_number);
#define raft_send(raft, rpc) raft_send_at(raft, rpc, __LINE__)

/* raft_send_to_conn() wraps raft_send_to_conn_at() in the same way, again
 * passing __LINE__ as 'line_number'. */
static bool raft_send_to_conn_at(struct raft *, const union raft_rpc *,
                                 struct raft_conn *, int line_number);
#define raft_send_to_conn(raft, rpc, conn) \
    raft_send_to_conn_at(raft, rpc, conn, __LINE__)

static void raft_send_append_request(struct raft *,
                                     struct raft_server *, unsigned int n,
                                     const char *comment);

static void raft_become_leader(struct raft *);
static void raft_become_follower(struct raft *);
static void raft_reset_election_timer(struct raft *);
static void raft_reset_ping_timer(struct raft *);
static void raft_send_heartbeats(struct raft *);
static void raft_start_election(struct raft *, bool leadership_transfer);
static bool raft_truncate(struct raft *, uint64_t new_end);
static void raft_get_servers_from_log(struct raft *, enum vlog_level);
static void raft_get_election_timer_from_log(struct raft *);

static bool raft_handle_write_error(struct raft *, struct ovsdb_error *);

static void raft_run_reconfigure(struct raft *);

static void raft_set_leader(struct raft *, const struct uuid *sid);
 360 static struct raft_server *
 361 raft_find_server(const struct raft *raft, const struct uuid *sid)
 362 {
 363     return raft_server_find(&raft->servers, sid);
 364 }
 365
 366 static char *
 367 raft_make_address_passive(const char *address_)
 368 {
 369     if (!strncmp(address_, "unix:", 5)) {
 370         return xasprintf("p%s", address_);
 371     } else {
 372         char *address = xstrdup(address_);
 373         char *host, *port;
 374         inet_parse_host_port_tokens(strchr(address, ':') + 1, &host, &port);
 375
 376         struct ds paddr = DS_EMPTY_INITIALIZER;
 377         ds_put_format(&paddr, "p%.3s:%s:", address, port);
 378         if (strchr(host, ':')) {
 379             ds_put_format(&paddr, "[%s]", host);
 380         } else {
 381             ds_put_cstr(&paddr, host);
 382         }
 383         free(address);
 384         return ds_steal_cstr(&paddr);
 385     }
 386 }
 387
 388 static struct raft *
 389 raft_alloc(void)
 390 {
 391     raft_init();
 392
 393     struct raft *raft = xzalloc(sizeof *raft);
 394     hmap_node_nullify(&raft->hmap_node);
 395     hmap_init(&raft->servers);
 396     raft->log_start = raft->log_end = 1;
 397     raft->role = RAFT_FOLLOWER;
 398     sset_init(&raft->remote_addresses);
 399     raft->join_timeout = LLONG_MAX;
 400     ovs_list_init(&raft->waiters);
 401     raft->listen_backoff = LLONG_MIN;
 402     ovs_list_init(&raft->conns);
 403     hmap_init(&raft->add_servers);
 404     hmap_init(&raft->commands);
 405
 406     raft->election_timer = ELECTION_BASE_MSEC;
 407
 408     return raft;
 409 }
 410
 411 /* Creates an on-disk file that represents a new Raft cluster and initializes
 412  * it to consist of a single server, the one on which this function is called.
 413  *
 414  * Creates the local copy of the cluster's log in 'file_name', which must not
 415  * already exist.  Gives it the name 'name', which should be the database
 416  * schema name and which is used only to match up this database with the server
 417  * added to the cluster later if the cluster ID is unavailable.
 418  *
 419  * The new server is located at 'local_address', which must take one of the
 420  * forms "tcp:IP:PORT" or "ssl:IP:PORT", where IP is an IPv4 address or a
 421  * square bracket enclosed IPv6 address and PORT is a TCP port number.
 422  *
 423  * This only creates the on-disk file.  Use raft_open() to start operating the
 424  * new server.
 425  *
 426  * Returns null if successful, otherwise an ovsdb_error describing the
 427  * problem. */
 428 struct ovsdb_error * OVS_WARN_UNUSED_RESULT
 429 raft_create_cluster(const char *file_name, const char *name,
 430                     const char *local_address, const struct json *data)
 431 {
 432     /* Parse and verify validity of the local address. */
 433     struct ovsdb_error *error = raft_address_validate(local_address);
 434     if (error) {
 435         return error;
 436     }
 437
 438     /* Create log file. */
 439     struct ovsdb_log *log;
 440     error = ovsdb_log_open(file_name, RAFT_MAGIC, OVSDB_LOG_CREATE_EXCL,
 441                            -1, &log);
 442     if (error) {
 443         return error;
 444     }
 445
 446     /* Write log file. */
 447     struct raft_header h = {
 448         .sid = uuid_random(),
 449         .cid = uuid_random(),
 450         .name = xstrdup(name),
 451         .local_address = xstrdup(local_address),
 452         .joining = false,
 453         .remote_addresses = SSET_INITIALIZER(&h.remote_addresses),
 454         .snap_index = 1,
 455         .snap = {
 456             .term = 1,
 457             .data = json_nullable_clone(data),
 458             .eid = uuid_random(),
 459             .servers = json_object_create(),
 460         },
 461     };
 462     shash_add_nocopy(json_object(h.snap.servers),
 463                      xasprintf(UUID_FMT, UUID_ARGS(&h.sid)),
 464                      json_string_create(local_address));
 465     error = ovsdb_log_write_and_free(log, raft_header_to_json(&h));
 466     raft_header_uninit(&h);
 467     if (!error) {
 468         error = ovsdb_log_commit_block(log);
 469     }
 470     ovsdb_log_close(log);
 471
 472     return error;
 473 }
 474
 475 /* Creates a database file that represents a new server in an existing Raft
 476  * cluster.
 477  *
 478  * Creates the local copy of the cluster's log in 'file_name', which must not
 479  * already exist.  Gives it the name 'name', which must be the same name
 480  * passed in to raft_create_cluster() earlier.
 481  *
 482  * 'cid' is optional.  If specified, the new server will join only the cluster
 483  * with the given cluster ID.
 484  *
 485  * The new server is located at 'local_address', which must take one of the
 486  * forms "tcp:IP:PORT" or "ssl:IP:PORT", where IP is an IPv4 address or a
 487  * square bracket enclosed IPv6 address and PORT is a TCP port number.
 488  *
 489  * Joining the cluster requires contacting it.  Thus, 'remote_addresses'
 490  * specifies the addresses of existing servers in the cluster.  One server out
 491  * of the existing cluster is sufficient, as long as that server is reachable
 492  * and not partitioned from the current cluster leader.  If multiple servers
 493  * from the cluster are specified, then it is sufficient for any of them to
 494  * meet this criterion.
 495  *
 496  * This only creates the on-disk file and does no network access.  Use
 497  * raft_open() to start operating the new server.  (Until this happens, the
 498  * new server has not joined the cluster.)
 499  *
 500  * Returns null if successful, otherwise an ovsdb_error describing the
 501  * problem. */
 502 struct ovsdb_error * OVS_WARN_UNUSED_RESULT
 503 raft_join_cluster(const char *file_name,
 504                   const char *name, const char *local_address,
 505                   const struct sset *remote_addresses,
 506                   const struct uuid *cid)
 507 {
 508     ovs_assert(!sset_is_empty(remote_addresses));
 509
 510     /* Parse and verify validity of the addresses. */
 511     struct ovsdb_error *error = raft_address_validate(local_address);
 512     if (error) {
 513         return error;
 514     }
 515     const char *addr;
 516     SSET_FOR_EACH (addr, remote_addresses) {
 517         error = raft_address_validate(addr);
 518         if (error) {
 519             return error;
 520         }
 521         if (!strcmp(addr, local_address)) {
 522             return ovsdb_error(NULL, "remote addresses cannot be the same "
 523                                "as the local address");
 524         }
 525     }
 526
 527     /* Verify validity of the cluster ID (if provided). */
 528     if (cid && uuid_is_zero(cid)) {
 529         return ovsdb_error(NULL, "all-zero UUID is not valid cluster ID");
 530     }
 531
 532     /* Create log file. */
 533     struct ovsdb_log *log;
 534     error = ovsdb_log_open(file_name, RAFT_MAGIC, OVSDB_LOG_CREATE_EXCL,
 535                            -1, &log);
 536     if (error) {
 537         return error;
 538     }
 539
 540     /* Write log file. */
 541     struct raft_header h = {
 542         .sid = uuid_random(),
 543         .cid = cid ? *cid : UUID_ZERO,
 544         .name = xstrdup(name),
 545         .local_address = xstrdup(local_address),
 546         .joining = true,
 547         /* No snapshot yet. */
 548     };
 549     sset_clone(&h.remote_addresses, remote_addresses);
 550     error = ovsdb_log_write_and_free(log, raft_header_to_json(&h));
 551     raft_header_uninit(&h);
 552     if (!error) {
 553         error = ovsdb_log_commit_block(log);
 554     }
 555     ovsdb_log_close(log);
 556
 557     return error;
 558 }
 559
 560 /* Reads the initial header record from 'log', which must be a Raft clustered
 561  * database log, and populates '*md' with the information read from it.  The
 562  * caller must eventually destroy 'md' with raft_metadata_destroy().
 563  *
 564  * On success, returns NULL.  On failure, returns an error that the caller must
 565  * eventually destroy and zeros '*md'. */
 566 struct ovsdb_error * OVS_WARN_UNUSED_RESULT
 567 raft_read_metadata(struct ovsdb_log *log, struct raft_metadata *md)
 568 {
 569     struct raft *raft = raft_alloc();
 570     raft->log = log;
 571
 572     struct ovsdb_error *error = raft_read_header(raft);
 573     if (!error) {
 574         md->sid = raft->sid;
 575         md->name = xstrdup(raft->name);
 576         md->local = xstrdup(raft->local_address);
 577         md->cid = raft->cid;
 578     } else {
 579         memset(md, 0, sizeof *md);
 580     }
 581
 582     raft->log = NULL;
 583     raft_close(raft);
 584     return error;
 585 }
 586
 587 /* Frees the metadata in 'md'. */
 588 void
 589 raft_metadata_destroy(struct raft_metadata *md)
 590 {
 591     if (md) {
 592         free(md->name);
 593         free(md->local);
 594     }
 595 }
 596
 597 static const struct raft_entry *
 598 raft_get_entry(const struct raft *raft, uint64_t index)
 599 {
 600     ovs_assert(index >= raft->log_start);
 601     ovs_assert(index < raft->log_end);
 602     return &raft->entries[index - raft->log_start];
 603 }
 604
 605 static uint64_t
 606 raft_get_term(const struct raft *raft, uint64_t index)
 607 {
 608     return (index == raft->log_start - 1
 609             ? raft->snap.term
 610             : raft_get_entry(raft, index)->term);
 611 }
 612
 613 static const struct json *
 614 raft_servers_for_index(const struct raft *raft, uint64_t index)
 615 {
 616     ovs_assert(index >= raft->log_start - 1);
 617     ovs_assert(index < raft->log_end);
 618
 619     const struct json *servers = raft->snap.servers;
 620     for (uint64_t i = raft->log_start; i <= index; i++) {
 621         const struct raft_entry *e = raft_get_entry(raft, i);
 622         if (e->servers) {
 623             servers = e->servers;
 624         }
 625     }
 626     return servers;
 627 }
 628
 629 static void
 630 raft_set_servers(struct raft *raft, const struct hmap *new_servers,
 631                  enum vlog_level level)
 632 {
 633     struct raft_server *s, *next;
 634     HMAP_FOR_EACH_SAFE (s, next, hmap_node, &raft->servers) {
 635         if (!raft_server_find(new_servers, &s->sid)) {
 636             ovs_assert(s != raft->remove_server);
 637
 638             hmap_remove(&raft->servers, &s->hmap_node);
 639             VLOG(level, "server %s removed from configuration", s->nickname);
 640             raft_server_destroy(s);
 641         }
 642     }
 643
 644     HMAP_FOR_EACH_SAFE (s, next, hmap_node, new_servers) {
 645         if (!raft_find_server(raft, &s->sid)) {
 646             VLOG(level, "server %s added to configuration", s->nickname);
 647
 648             struct raft_server *new
 649                 = raft_server_add(&raft->servers, &s->sid, s->address);
 650             raft_server_init_leader(raft, new);
 651         }
 652     }
 653 }
 654
 655 static uint64_t
 656 raft_add_entry(struct raft *raft,
 657                uint64_t term, struct json *data, const struct uuid *eid,
 658                struct json *servers, uint64_t election_timer)
 659 {
 660     if (raft->log_end - raft->log_start >= raft->allocated_log) {
 661         raft->entries = x2nrealloc(raft->entries, &raft->allocated_log,
 662                                    sizeof *raft->entries);
 663     }
 664
 665     uint64_t index = raft->log_end++;
 666     struct raft_entry *entry = &raft->entries[index - raft->log_start];
 667     entry->term = term;
 668     entry->data = data;
 669     entry->eid = eid ? *eid : UUID_ZERO;
 670     entry->servers = servers;
 671     entry->election_timer = election_timer;
 672     return index;
 673 }
 674
 675 /* Writes a RAFT_REC_ENTRY record for 'term', 'data', 'eid', 'servers',
 676  * 'election_timer' to * 'raft''s log and returns an error indication. */
 677 static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
 678 raft_write_entry(struct raft *raft, uint64_t term, struct json *data,
 679                  const struct uuid *eid, struct json *servers,
 680                  uint64_t election_timer)
 681 {
 682     struct raft_record r = {
 683         .type = RAFT_REC_ENTRY,
 684         .term = term,
 685         .entry = {
 686             .index = raft_add_entry(raft, term, data, eid, servers,
 687                                     election_timer),
 688             .data = data,
 689             .servers = servers,
 690             .election_timer = election_timer,
 691             .eid = eid ? *eid : UUID_ZERO,
 692         },
 693     };
 694     return ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r));
 695 }
 696
 697 static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
 698 raft_write_state(struct ovsdb_log *log,
 699                  uint64_t term, const struct uuid *vote)
 700 {
 701     struct raft_record r = { .term = term };
 702     if (vote && !uuid_is_zero(vote)) {
 703         r.type = RAFT_REC_VOTE;
 704         r.sid = *vote;
 705     } else {
 706         r.type = RAFT_REC_TERM;
 707     }
 708     return ovsdb_log_write_and_free(log, raft_record_to_json(&r));
 709 }
 710
 711 static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
 712 raft_apply_record(struct raft *raft, unsigned long long int rec_idx,
 713                   const struct raft_record *r)
 714 {
 715     /* Apply "term", which is present in most kinds of records (and otherwise
 716      * 0).
 717      *
 718      * A Raft leader can replicate entries from previous terms to the other
 719      * servers in the cluster, retaining the original terms on those entries
 720      * (see section 3.6.2 "Committing entries from previous terms" for more
 721      * information), so it's OK for the term in a log record to precede the
 722      * current term. */
 723     if (r->term > raft->term) {
 724         raft->term = raft->synced_term = r->term;
 725         raft->vote = raft->synced_vote = UUID_ZERO;
 726     }
 727
 728     switch (r->type) {
 729     case RAFT_REC_ENTRY:
 730         if (r->entry.index < raft->commit_index) {
 731             return ovsdb_error(NULL, "record %llu attempts to truncate log "
 732                                "from %"PRIu64" to %"PRIu64" entries, but "
 733                                "commit index is already %"PRIu64,
 734                                rec_idx, raft->log_end, r->entry.index,
 735                                raft->commit_index);
 736         } else if (r->entry.index > raft->log_end) {
 737             return ovsdb_error(NULL, "record %llu with index %"PRIu64" skips "
 738                                "past expected index %"PRIu64,
 739                                rec_idx, r->entry.index, raft->log_end);
 740         }
 741
 742         if (r->entry.index < raft->log_end) {
 743             /* This can happen, but it is notable. */
 744             VLOG_DBG("record %llu truncates log from %"PRIu64" to %"PRIu64
 745                      " entries", rec_idx, raft->log_end, r->entry.index);
 746             raft_truncate(raft, r->entry.index);
 747         }
 748
 749         uint64_t prev_term = (raft->log_end > raft->log_start
 750                               ? raft->entries[raft->log_end
 751                                               - raft->log_start - 1].term
 752                               : raft->snap.term);
 753         if (r->term < prev_term) {
 754             return ovsdb_error(NULL, "record %llu with index %"PRIu64" term "
 755                                "%"PRIu64" precedes previous entry's term "
 756                                "%"PRIu64,
 757                                rec_idx, r->entry.index, r->term, prev_term);
 758         }
 759
 760         raft->log_synced = raft_add_entry(
 761             raft, r->term,
 762             json_nullable_clone(r->entry.data), &r->entry.eid,
 763             json_nullable_clone(r->entry.servers),
 764             r->entry.election_timer);
 765         return NULL;
 766
 767     case RAFT_REC_TERM:
 768         return NULL;
 769
 770     case RAFT_REC_VOTE:
 771         if (r->term < raft->term) {
 772             return ovsdb_error(NULL, "record %llu votes for term %"PRIu64" "
 773                                "but current term is %"PRIu64,
 774                                rec_idx, r->term, raft->term);
 775         } else if (!uuid_is_zero(&raft->vote)
 776                    && !uuid_equals(&raft->vote, &r->sid)) {
 777             return ovsdb_error(NULL, "record %llu votes for "SID_FMT" in term "
 778                                "%"PRIu64" but a previous record for the "
 779                                "same term voted for "SID_FMT, rec_idx,
 780                                SID_ARGS(&raft->vote), r->term,
 781                                SID_ARGS(&r->sid));
 782         } else {
 783             raft->vote = raft->synced_vote = r->sid;
 784             return NULL;
 785         }
 786         break;
 787
 788     case RAFT_REC_NOTE:
 789         if (!strcmp(r->note, "left")) {
 790             return ovsdb_error(NULL, "record %llu indicates server has left "
 791                                "the cluster; it cannot be added back (use "
 792                                "\"ovsdb-tool join-cluster\" to add a new "
 793                                "server)", rec_idx);
 794         }
 795         return NULL;
 796
 797     case RAFT_REC_COMMIT_INDEX:
 798         if (r->commit_index < raft->commit_index) {
 799             return ovsdb_error(NULL, "record %llu regresses commit index "
 800                                "from %"PRIu64 " to %"PRIu64,
 801                                rec_idx, raft->commit_index, r->commit_index);
 802         } else if (r->commit_index >= raft->log_end) {
 803             return ovsdb_error(NULL, "record %llu advances commit index to "
 804                                "%"PRIu64 " but last log index is %"PRIu64,
 805                                rec_idx, r->commit_index, raft->log_end - 1);
 806         } else {
 807             raft->commit_index = r->commit_index;
 808             return NULL;
 809         }
 810         break;
 811
 812     case RAFT_REC_LEADER:
 813         /* XXX we could use this to take back leadership for quick restart */
 814         return NULL;
 815
 816     default:
 817         OVS_NOT_REACHED();
 818     }
 819 }
 820
 821 static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
 822 raft_read_header(struct raft *raft)
 823 {
 824     /* Read header record. */
 825     struct json *json;
 826     struct ovsdb_error *error = ovsdb_log_read(raft->log, &json);
 827     if (error || !json) {
 828         /* Report error or end-of-file. */
 829         return error;
 830     }
 831     ovsdb_log_mark_base(raft->log);
 832
 833     struct raft_header h;
 834     error = raft_header_from_json(&h, json);
 835     json_destroy(json);
 836     if (error) {
 837         return error;
 838     }
 839
 840     raft->sid = h.sid;
 841     raft->cid = h.cid;
 842     raft->name = xstrdup(h.name);
 843     raft->local_address = xstrdup(h.local_address);
 844     raft->local_nickname = raft_address_to_nickname(h.local_address, &h.sid);
 845     raft->joining = h.joining;
 846
 847     if (h.joining) {
 848         sset_clone(&raft->remote_addresses, &h.remote_addresses);
 849     } else {
 850         raft_entry_clone(&raft->snap, &h.snap);
 851         raft->log_start = raft->log_end = h.snap_index + 1;
 852         raft->commit_index = h.snap_index;
 853         raft->last_applied = h.snap_index - 1;
 854     }
 855
 856     raft_header_uninit(&h);
 857
 858     return NULL;
 859 }
 860
 861 static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
 862 raft_read_log(struct raft *raft)
 863 {
 864     for (unsigned long long int i = 1; ; i++) {
 865         struct json *json;
 866         struct ovsdb_error *error = ovsdb_log_read(raft->log, &json);
 867         if (!json) {
 868             if (error) {
 869                 /* We assume that the error is due to a partial write while
 870                  * appending to the file before a crash, so log it and
 871                  * continue. */
 872                 char *error_string = ovsdb_error_to_string_free(error);
 873                 VLOG_WARN("%s", error_string);
 874                 free(error_string);
 875                 error = NULL;
 876             }
 877             break;
 878         }
 879
 880         struct raft_record r;
 881         error = raft_record_from_json(&r, json);
 882         if (!error) {
 883             error = raft_apply_record(raft, i, &r);
 884             raft_record_uninit(&r);
 885         }
 886         json_destroy(json);
 887         if (error) {
 888             return ovsdb_wrap_error(error, "error reading record %llu from "
 889                                     "%s log", i, raft->name);
 890         }
 891     }
 892
 893     /* Set the most recent servers. */
 894     raft_get_servers_from_log(raft, VLL_DBG);
 895
 896     /* Set the most recent election_timer. */
 897     raft_get_election_timer_from_log(raft);
 898
 899     return NULL;
 900 }
 901
 902 static void
 903 raft_reset_election_timer(struct raft *raft)
 904 {
 905     unsigned int duration = (raft->election_timer
 906                              + random_range(ELECTION_RANGE_MSEC));
 907     raft->election_base = time_msec();
 908     if (failure_test == FT_DELAY_ELECTION) {
 909         /* Slow down this node so that it won't win the next election. */
 910         duration += raft->election_timer;
 911     }
 912     raft->election_timeout = raft->election_base + duration;
 913 }
 914
 915 static void
 916 raft_reset_ping_timer(struct raft *raft)
 917 {
 918     raft->ping_timeout = time_msec() + raft->election_timer / 3;
 919 }
 920
 921 static void
 922 raft_add_conn(struct raft *raft, struct jsonrpc_session *js,
 923               const struct uuid *sid, bool incoming)
 924 {
 925     struct raft_conn *conn = xzalloc(sizeof *conn);
 926     ovs_list_push_back(&raft->conns, &conn->list_node);
 927     conn->js = js;
 928     if (sid) {
 929         conn->sid = *sid;
 930     }
 931     conn->nickname = raft_address_to_nickname(jsonrpc_session_get_name(js),
 932                                               &conn->sid);
 933     conn->incoming = incoming;
 934     conn->js_seqno = jsonrpc_session_get_seqno(conn->js);
 935 }
 936
 937 /* Starts the local server in an existing Raft cluster, using the local copy of
 938  * the cluster's log in 'file_name'.  Takes ownership of 'log', whether
 939  * successful or not. */
 940 struct ovsdb_error * OVS_WARN_UNUSED_RESULT
 941 raft_open(struct ovsdb_log *log, struct raft **raftp)
 942 {
 943     struct raft *raft = raft_alloc();
 944     raft->log = log;
 945
 946     struct ovsdb_error *error = raft_read_header(raft);
 947     if (error) {
 948         goto error;
 949     }
 950
 951     if (!raft->joining) {
 952         error = raft_read_log(raft);
 953         if (error) {
 954             goto error;
 955         }
 956
 957         /* Find our own server. */
 958         if (!raft_find_server(raft, &raft->sid)) {
 959             error = ovsdb_error(NULL, "server does not belong to cluster");
 960             goto error;
 961         }
 962
 963         /* If there's only one server, start an election right away so that the
 964          * cluster bootstraps quickly. */
 965         if (hmap_count(&raft->servers) == 1) {
 966             raft_start_election(raft, false);
 967         }
 968     } else {
 969         raft->join_timeout = time_msec() + 1000;
 970     }
 971
 972     raft_reset_ping_timer(raft);
 973     raft_reset_election_timer(raft);
 974
 975     *raftp = raft;
 976     hmap_insert(&all_rafts, &raft->hmap_node, hash_string(raft->name, 0));
 977     return NULL;
 978
 979 error:
 980     raft_close(raft);
 981     *raftp = NULL;
 982     return error;
 983 }
 984
 985 /* Returns the name of 'raft', which in OVSDB is the database schema name. */
 986 const char *
 987 raft_get_name(const struct raft *raft)
 988 {
 989     return raft->name;
 990 }
 991
 992 /* Returns the cluster ID of 'raft'.  If 'raft' has not yet completed joining
 993  * its cluster, then 'cid' will be all-zeros (unless the administrator
 994  * specified a cluster ID running "ovsdb-tool join-cluster").
 995  *
 996  * Each cluster has a unique cluster ID. */
 997 const struct uuid *
 998 raft_get_cid(const struct raft *raft)
 999 {
1000     return &raft->cid;
1001 }
1002
1003 /* Returns the server ID of 'raft'.  Each server has a unique server ID. */
1004 const struct uuid *
1005 raft_get_sid(const struct raft *raft)
1006 {
1007     return &raft->sid;
1008 }
1009
1010 /* Returns true if 'raft' has completed joining its cluster, has not left or
1011  * initiated leaving the cluster, does not have failed disk storage, and is
1012  * apparently connected to the leader in a healthy way (or is itself the
1013  * leader).
1014  *
1015  * If 'raft' is candidate:
1016  * a) if it is the first round of election, consider it as connected, hoping
1017  *    it will successfully elect a new leader soon.
1018  * b) if it is already retrying, consider it as disconnected (so that clients
1019  *    may decide to reconnect to other members). */
1020 bool
1021 raft_is_connected(const struct raft *raft)
1022 {
1023     bool ret = (!raft->candidate_retrying
1024             && !raft->joining
1025             && !raft->leaving
1026             && !raft->left
1027             && !raft->failed);
1028     VLOG_DBG("raft_is_connected: %s\n", ret? "true": "false");
1029     return ret;
1030 }
1031
1032 /* Returns true if 'raft' is the cluster leader. */
1033 bool
1034 raft_is_leader(const struct raft *raft)
1035 {
1036     return raft->role == RAFT_LEADER;
1037 }
1038
1039 /* Returns true if 'raft' is the process of joining its cluster. */
1040 bool
1041 raft_is_joining(const struct raft *raft)
1042 {
1043     return raft->joining;
1044 }
1045
1046 /* Only returns *connected* connections. */
1047 static struct raft_conn *
1048 raft_find_conn_by_sid(struct raft *raft, const struct uuid *sid)
1049 {
1050     if (!uuid_is_zero(sid)) {
1051         struct raft_conn *conn;
1052         LIST_FOR_EACH (conn, list_node, &raft->conns) {
1053             if (uuid_equals(sid, &conn->sid)
1054                 && jsonrpc_session_is_connected(conn->js)) {
1055                 return conn;
1056             }
1057         }
1058     }
1059     return NULL;
1060 }
1061
1062 static struct raft_conn *
1063 raft_find_conn_by_address(struct raft *raft, const char *address)
1064 {
1065     struct raft_conn *conn;
1066     LIST_FOR_EACH (conn, list_node, &raft->conns) {
1067         if (!strcmp(jsonrpc_session_get_name(conn->js), address)) {
1068             return conn;
1069         }
1070     }
1071     return NULL;
1072 }
1073
1074 static void OVS_PRINTF_FORMAT(3, 4)
1075 raft_record_note(struct raft *raft, const char *note,
1076                  const char *comment_format, ...)
1077 {
1078     va_list args;
1079     va_start(args, comment_format);
1080     char *comment = xvasprintf(comment_format, args);
1081     va_end(args);
1082
1083     struct raft_record r = {
1084         .type = RAFT_REC_NOTE,
1085         .comment = comment,
1086         .note = CONST_CAST(char *, note),
1087     };
1088     ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r)));
1089
1090     free(comment);
1091 }
1092
1093 /* If we're leader, try to transfer leadership to another server, logging
1094  * 'reason' as the human-readable reason (it should be a phrase suitable for
1095  * following "because") . */
1096 void
1097 raft_transfer_leadership(struct raft *raft, const char *reason)
1098 {
1099     if (raft->role != RAFT_LEADER) {
1100         return;
1101     }
1102
1103     struct raft_server *s;
1104     HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
1105         if (!uuid_equals(&raft->sid, &s->sid)
1106             && s->phase == RAFT_PHASE_STABLE) {
1107             struct raft_conn *conn = raft_find_conn_by_sid(raft, &s->sid);
1108             if (!conn) {
1109                 continue;
1110             }
1111
1112             union raft_rpc rpc = {
1113                 .become_leader = {
1114                     .common = {
1115                         .comment = CONST_CAST(char *, reason),
1116                         .type = RAFT_RPC_BECOME_LEADER,
1117                         .sid = s->sid,
1118                     },
1119                     .term = raft->term,
1120                 }
1121             };
1122             raft_send_to_conn(raft, &rpc, conn);
1123
1124             raft_record_note(raft, "transfer leadership",
1125                              "transferring leadership to %s because %s",
1126                              s->nickname, reason);
1127             break;
1128         }
1129     }
1130 }
1131
1132 /* Send a RemoveServerRequest to the rest of the servers in the cluster.
1133  *
1134  * If we know which server is the leader, we can just send the request to it.
1135  * However, we might not know which server is the leader, and we might never
1136  * find out if the remove request was actually previously committed by a
1137  * majority of the servers (because in that case the new leader will not send
1138  * AppendRequests or heartbeats to us).  Therefore, we instead send
1139  * RemoveRequests to every server.  This theoretically has the same problem, if
1140  * the current cluster leader was not previously a member of the cluster, but
1141  * it seems likely to be more robust in practice.  */
1142 static void
1143 raft_send_remove_server_requests(struct raft *raft)
1144 {
1145     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
1146     VLOG_INFO_RL(&rl, "sending remove request (joining=%s, leaving=%s)",
1147                  raft->joining ? "true" : "false",
1148                  raft->leaving ? "true" : "false");
1149     const struct raft_server *s;
1150     HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
1151         if (!uuid_equals(&s->sid, &raft->sid)) {
1152             union raft_rpc rpc = (union raft_rpc) {
1153                 .remove_server_request = {
1154                     .common = {
1155                         .type = RAFT_RPC_REMOVE_SERVER_REQUEST,
1156                         .sid = s->sid,
1157                     },
1158                     .sid = raft->sid,
1159                 },
1160             };
1161             raft_send(raft, &rpc);
1162         }
1163     }
1164
1165     raft->leave_timeout = time_msec() + raft->election_timer;
1166 }
1167
1168 /* Attempts to start 'raft' leaving its cluster.  The caller can check progress
1169  * using raft_is_leaving() and raft_left(). */
1170 void
1171 raft_leave(struct raft *raft)
1172 {
1173     if (raft->joining || raft->failed || raft->leaving || raft->left) {
1174         return;
1175     }
1176     VLOG_INFO(SID_FMT": starting to leave cluster "CID_FMT,
1177               SID_ARGS(&raft->sid), CID_ARGS(&raft->cid));
1178     raft->leaving = true;
1179     raft_transfer_leadership(raft, "this server is leaving the cluster");
1180     raft_become_follower(raft);
1181     raft_send_remove_server_requests(raft);
1182     raft->leave_timeout = time_msec() + raft->election_timer;
1183 }
1184
1185 /* Returns true if 'raft' is currently attempting to leave its cluster. */
1186 bool
1187 raft_is_leaving(const struct raft *raft)
1188 {
1189     return raft->leaving;
1190 }
1191
1192 /* Returns true if 'raft' successfully left its cluster. */
1193 bool
1194 raft_left(const struct raft *raft)
1195 {
1196     return raft->left;
1197 }
1198
1199 /* Returns true if 'raft' has experienced a disk I/O failure.  When this
1200  * returns true, only closing and reopening 'raft' allows for recovery. */
1201 bool
1202 raft_failed(const struct raft *raft)
1203 {
1204     return raft->failed;
1205 }
1206
1207 /* Forces 'raft' to attempt to take leadership of the cluster by deposing the
1208  * current cluster. */
1209 void
1210 raft_take_leadership(struct raft *raft)
1211 {
1212     if (raft->role != RAFT_LEADER) {
1213         raft_start_election(raft, true);
1214     }
1215 }
1216
1217 /* Closes everything owned by 'raft' that might be visible outside the process:
1218  * network connections, commands, etc.  This is part of closing 'raft'; it is
1219  * also used if 'raft' has failed in an unrecoverable way. */
1220 static void
1221 raft_close__(struct raft *raft)
1222 {
1223     if (!hmap_node_is_null(&raft->hmap_node)) {
1224         hmap_remove(&all_rafts, &raft->hmap_node);
1225         hmap_node_nullify(&raft->hmap_node);
1226     }
1227
1228     raft_complete_all_commands(raft, RAFT_CMD_SHUTDOWN);
1229
1230     struct raft_server *rs = raft->remove_server;
1231     if (rs) {
1232         raft_send_remove_server_reply__(raft, &rs->sid, &rs->requester_sid,
1233                                         rs->requester_conn, false,
1234                                         RAFT_SERVER_SHUTDOWN);
1235         raft_server_destroy(raft->remove_server);
1236         raft->remove_server = NULL;
1237     }
1238
1239     struct raft_conn *conn, *next;
1240     LIST_FOR_EACH_SAFE (conn, next, list_node, &raft->conns) {
1241         raft_conn_close(conn);
1242     }
1243 }
1244
1245 /* Closes and frees 'raft'.
1246  *
1247  * A server's cluster membership is independent of whether the server is
1248  * actually running.  When a server that is a member of a cluster closes, the
1249  * cluster treats this as a server failure. */
1250 void
1251 raft_close(struct raft *raft)
1252 {
1253     if (!raft) {
1254         return;
1255     }
1256
1257     raft_transfer_leadership(raft, "this server is shutting down");
1258
1259     raft_close__(raft);
1260
1261     ovsdb_log_close(raft->log);
1262
1263     raft_servers_destroy(&raft->servers);
1264
1265     for (uint64_t index = raft->log_start; index < raft->log_end; index++) {
1266         struct raft_entry *e = &raft->entries[index - raft->log_start];
1267         raft_entry_uninit(e);
1268     }
1269     free(raft->entries);
1270
1271     raft_entry_uninit(&raft->snap);
1272
1273     raft_waiters_destroy(raft);
1274
1275     raft_servers_destroy(&raft->add_servers);
1276
1277     hmap_destroy(&raft->commands);
1278
1279     pstream_close(raft->listener);
1280
1281     sset_destroy(&raft->remote_addresses);
1282     free(raft->local_address);
1283     free(raft->local_nickname);
1284     free(raft->name);
1285
1286     free(raft);
1287 }
1288
1289 static bool
1290 raft_conn_receive(struct raft *raft, struct raft_conn *conn,
1291                   union raft_rpc *rpc)
1292 {
1293     struct jsonrpc_msg *msg = jsonrpc_session_recv(conn->js);
1294     if (!msg) {
1295         return false;
1296     }
1297
1298     struct ovsdb_error *error = raft_rpc_from_jsonrpc(&raft->cid, &raft->sid,
1299                                                       msg, rpc);
1300     jsonrpc_msg_destroy(msg);
1301     if (error) {
1302         char *s = ovsdb_error_to_string_free(error);
1303         VLOG_INFO("%s: %s", jsonrpc_session_get_name(conn->js), s);
1304         free(s);
1305         return false;
1306     }
1307
1308     if (uuid_is_zero(&conn->sid)) {
1309         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(50, 50);
1310         conn->sid = rpc->common.sid;
1311         VLOG_INFO_RL(&rl, "%s: learned server ID "SID_FMT,
1312                      jsonrpc_session_get_name(conn->js), SID_ARGS(&conn->sid));
1313     } else if (!uuid_equals(&conn->sid, &rpc->common.sid)) {
1314         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
1315         VLOG_WARN_RL(&rl, "%s: ignoring message with unexpected server ID "
1316                      SID_FMT" (expected "SID_FMT")",
1317                      jsonrpc_session_get_name(conn->js),
1318                      SID_ARGS(&rpc->common.sid), SID_ARGS(&conn->sid));
1319         raft_rpc_uninit(rpc);
1320         return false;
1321     }
1322
1323     const char *address = (rpc->type == RAFT_RPC_HELLO_REQUEST
1324                            ? rpc->hello_request.address
1325                            : rpc->type == RAFT_RPC_ADD_SERVER_REQUEST
1326                            ? rpc->add_server_request.address
1327                            : NULL);
1328     if (address) {
1329         char *new_nickname = raft_address_to_nickname(address, &conn->sid);
1330         if (strcmp(conn->nickname, new_nickname)) {
1331             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(50, 50);
1332             VLOG_INFO_RL(&rl, "%s: learned remote address %s",
1333                          jsonrpc_session_get_name(conn->js), address);
1334
1335             free(conn->nickname);
1336             conn->nickname = new_nickname;
1337         } else {
1338             free(new_nickname);
1339         }
1340     }
1341
1342     return true;
1343 }
1344
1345 static const char *
1346 raft_get_nickname(const struct raft *raft, const struct uuid *sid,
1347                   char buf[SID_LEN + 1], size_t bufsize)
1348 {
1349     if (uuid_equals(sid, &raft->sid)) {
1350         return raft->local_nickname;
1351     }
1352
1353     const char *s = raft_servers_get_nickname__(&raft->servers, sid);
1354     if (s) {
1355         return s;
1356     }
1357
1358     return raft_servers_get_nickname(&raft->add_servers, sid, buf, bufsize);
1359 }
1360
1361 static void
1362 log_rpc(const union raft_rpc *rpc, const char *direction,
1363         const struct raft_conn *conn, int line_number)
1364 {
1365     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(600, 600);
1366     if (!raft_rpc_is_heartbeat(rpc) && !VLOG_DROP_DBG(&rl)) {
1367         struct ds s = DS_EMPTY_INITIALIZER;
1368         if (line_number) {
1369             ds_put_format(&s, "raft.c:%d ", line_number);
1370         }
1371         ds_put_format(&s, "%s%s ", direction, conn->nickname);
1372         raft_rpc_format(rpc, &s);
1373         VLOG_DBG("%s", ds_cstr(&s));
1374         ds_destroy(&s);
1375     }
1376 }
1377
1378 static void
1379 raft_send_add_server_request(struct raft *raft, struct raft_conn *conn)
1380 {
1381     union raft_rpc rq = {
1382         .add_server_request = {
1383             .common = {
1384                 .type = RAFT_RPC_ADD_SERVER_REQUEST,
1385                 .sid = UUID_ZERO,
1386                 .comment = NULL,
1387             },
1388             .address = raft->local_address,
1389         },
1390     };
1391     raft_send_to_conn(raft, &rq, conn);
1392 }
1393
1394 static void
1395 raft_conn_run(struct raft *raft, struct raft_conn *conn)
1396 {
1397     jsonrpc_session_run(conn->js);
1398
1399     unsigned int new_seqno = jsonrpc_session_get_seqno(conn->js);
1400     bool just_connected = (new_seqno != conn->js_seqno
1401                            && jsonrpc_session_is_connected(conn->js));
1402     conn->js_seqno = new_seqno;
1403     if (just_connected) {
1404         if (raft->joining) {
1405             raft_send_add_server_request(raft, conn);
1406         } else if (raft->leaving) {
1407             union raft_rpc rq = {
1408                 .remove_server_request = {
1409                     .common = {
1410                         .type = RAFT_RPC_REMOVE_SERVER_REQUEST,
1411                         .sid = conn->sid,
1412                     },
1413                     .sid = raft->sid,
1414                 },
1415             };
1416             raft_send_to_conn(raft, &rq, conn);
1417         } else {
1418             union raft_rpc rq = (union raft_rpc) {
1419                 .hello_request = {
1420                     .common = {
1421                         .type = RAFT_RPC_HELLO_REQUEST,
1422                         .sid = conn->sid,
1423                     },
1424                     .address = raft->local_address,
1425                 },
1426             };
1427             raft_send_to_conn(raft, &rq, conn);
1428         }
1429     }
1430
1431     for (size_t i = 0; i < 50; i++) {
1432         union raft_rpc rpc;
1433         if (!raft_conn_receive(raft, conn, &rpc)) {
1434             break;
1435         }
1436
1437         log_rpc(&rpc, "<--", conn, 0);
1438         raft_handle_rpc(raft, &rpc);
1439         raft_rpc_uninit(&rpc);
1440     }
1441 }
1442
1443 static void
1444 raft_waiter_complete_rpc(struct raft *raft, const union raft_rpc *rpc)
1445 {
1446     uint64_t term = raft_rpc_get_term(rpc);
1447     if (term && term < raft->term) {
1448         /* Drop the message because it's for an expired term. */
1449         return;
1450     }
1451
1452     if (!raft_is_rpc_synced(raft, rpc)) {
1453         /* This is a bug.  A reply message is deferred because some state in
1454          * the message, such as a term or index, has not been committed to
1455          * disk, and they should only be completed when that commit is done.
1456          * But this message is being completed before the commit is finished.
1457          * Complain, and hope that someone reports the bug. */
1458         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
1459         if (VLOG_DROP_ERR(&rl)) {
1460             return;
1461         }
1462
1463         struct ds s = DS_EMPTY_INITIALIZER;
1464
1465         if (term > raft->synced_term) {
1466             ds_put_format(&s, " because message term %"PRIu64" is "
1467                           "past synced term %"PRIu64,
1468                           term, raft->synced_term);
1469         }
1470
1471         uint64_t index = raft_rpc_get_min_sync_index(rpc);
1472         if (index > raft->log_synced) {
1473             ds_put_format(&s, " %s message index %"PRIu64" is past last "
1474                           "synced index %"PRIu64,
1475                           s.length ? "and" : "because",
1476                           index, raft->log_synced);
1477         }
1478
1479         const struct uuid *vote = raft_rpc_get_vote(rpc);
1480         if (vote && !uuid_equals(vote, &raft->synced_vote)) {
1481             char buf1[SID_LEN + 1];
1482             char buf2[SID_LEN + 1];
1483             ds_put_format(&s, " %s vote %s differs from synced vote %s",
1484                           s.length ? "and" : "because",
1485                           raft_get_nickname(raft, vote, buf1, sizeof buf1),
1486                           raft_get_nickname(raft, &raft->synced_vote,
1487                                             buf2, sizeof buf2));
1488         }
1489
1490         char buf[SID_LEN + 1];
1491         ds_put_format(&s, ": %s ",
1492                       raft_get_nickname(raft, &rpc->common.sid,
1493                                         buf, sizeof buf));
1494         raft_rpc_format(rpc, &s);
1495         VLOG_ERR("internal error: deferred %s message completed "
1496                  "but not ready to send%s",
1497                  raft_rpc_type_to_string(rpc->type), ds_cstr(&s));
1498         ds_destroy(&s);
1499
1500         return;
1501     }
1502
1503     struct raft_conn *dst = raft_find_conn_by_sid(raft, &rpc->common.sid);
1504     if (dst) {
1505         raft_send_to_conn(raft, rpc, dst);
1506     }
1507 }
1508
1509 static void
1510 raft_waiter_complete(struct raft *raft, struct raft_waiter *w)
1511 {
1512     switch (w->type) {
1513     case RAFT_W_ENTRY:
1514         if (raft->role == RAFT_LEADER) {
1515             raft_update_our_match_index(raft, w->entry.index);
1516         }
1517         raft->log_synced = w->entry.index;
1518         break;
1519
1520     case RAFT_W_TERM:
1521         raft->synced_term = w->term.term;
1522         raft->synced_vote = w->term.vote;
1523         break;
1524
1525     case RAFT_W_RPC:
1526         raft_waiter_complete_rpc(raft, w->rpc);
1527         break;
1528     }
1529 }
1530
1531 static void
1532 raft_waiter_destroy(struct raft_waiter *w)
1533 {
1534     if (!w) {
1535         return;
1536     }
1537
1538     ovs_list_remove(&w->list_node);
1539
1540     switch (w->type) {
1541     case RAFT_W_ENTRY:
1542     case RAFT_W_TERM:
1543         break;
1544
1545     case RAFT_W_RPC:
1546         raft_rpc_uninit(w->rpc);
1547         free(w->rpc);
1548         break;
1549     }
1550     free(w);
1551 }
1552
1553 static void
1554 raft_waiters_run(struct raft *raft)
1555 {
1556     if (ovs_list_is_empty(&raft->waiters)) {
1557         return;
1558     }
1559
1560     uint64_t cur = ovsdb_log_commit_progress(raft->log);
1561     struct raft_waiter *w, *next;
1562     LIST_FOR_EACH_SAFE (w, next, list_node, &raft->waiters) {
1563         if (cur < w->commit_ticket) {
1564             break;
1565         }
1566         raft_waiter_complete(raft, w);
1567         raft_waiter_destroy(w);
1568     }
1569 }
1570
1571 static void
1572 raft_waiters_wait(struct raft *raft)
1573 {
1574     struct raft_waiter *w;
1575     LIST_FOR_EACH (w, list_node, &raft->waiters) {
1576         ovsdb_log_commit_wait(raft->log, w->commit_ticket);
1577         break;
1578     }
1579 }
1580
1581 static void
1582 raft_waiters_destroy(struct raft *raft)
1583 {
1584     struct raft_waiter *w, *next;
1585     LIST_FOR_EACH_SAFE (w, next, list_node, &raft->waiters) {
1586         raft_waiter_destroy(w);
1587     }
1588 }
1589
1590 static bool OVS_WARN_UNUSED_RESULT
1591 raft_set_term(struct raft *raft, uint64_t term, const struct uuid *vote)
1592 {
1593     struct ovsdb_error *error = raft_write_state(raft->log, term, vote);
1594     if (!raft_handle_write_error(raft, error)) {
1595         return false;
1596     }
1597
1598     struct raft_waiter *w = raft_waiter_create(raft, RAFT_W_TERM, true);
1599     raft->term = w->term.term = term;
1600     raft->vote = w->term.vote = vote ? *vote : UUID_ZERO;
1601     return true;
1602 }
1603
1604 static void
1605 raft_accept_vote(struct raft *raft, struct raft_server *s,
1606                  const struct uuid *vote)
1607 {
1608     if (uuid_equals(&s->vote, vote)) {
1609         return;
1610     }
1611     if (!uuid_is_zero(&s->vote)) {
1612         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
1613         char buf1[SID_LEN + 1];
1614         char buf2[SID_LEN + 1];
1615         VLOG_WARN_RL(&rl, "server %s changed its vote from %s to %s",
1616                      s->nickname,
1617                      raft_get_nickname(raft, &s->vote, buf1, sizeof buf1),
1618                      raft_get_nickname(raft, vote, buf2, sizeof buf2));
1619     }
1620     s->vote = *vote;
1621     if (uuid_equals(vote, &raft->sid)
1622         && ++raft->n_votes > hmap_count(&raft->servers) / 2) {
1623         raft_become_leader(raft);
1624     }
1625 }
1626
1627 static void
1628 raft_start_election(struct raft *raft, bool leadership_transfer)
1629 {
1630     if (raft->leaving) {
1631         return;
1632     }
1633
1634     struct raft_server *me = raft_find_server(raft, &raft->sid);
1635     if (!me) {
1636         return;
1637     }
1638
1639     if (!raft_set_term(raft, raft->term + 1, &raft->sid)) {
1640         return;
1641     }
1642
1643     ovs_assert(raft->role != RAFT_LEADER);
1644     raft->role = RAFT_CANDIDATE;
1645     /* If there was no leader elected since last election, we know we are
1646      * retrying now. */
1647     raft->candidate_retrying = !raft->had_leader;
1648     raft->had_leader = false;
1649
1650     raft->n_votes = 0;
1651
1652     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
1653     if (!VLOG_DROP_INFO(&rl)) {
1654         long long int now = time_msec();
1655         if (now >= raft->election_timeout) {
1656             VLOG_INFO("term %"PRIu64": %lld ms timeout expired, "
1657                       "starting election",
1658                       raft->term, now - raft->election_base);
1659         } else {
1660             VLOG_INFO("term %"PRIu64": starting election", raft->term);
1661         }
1662     }
1663     raft_reset_election_timer(raft);
1664
1665     struct raft_server *peer;
1666     HMAP_FOR_EACH (peer, hmap_node, &raft->servers) {
1667         peer->vote = UUID_ZERO;
1668         if (uuid_equals(&raft->sid, &peer->sid)) {
1669             continue;
1670         }
1671
1672         union raft_rpc rq = {
1673             .vote_request = {
1674                 .common = {
1675                     .type = RAFT_RPC_VOTE_REQUEST,
1676                     .sid = peer->sid,
1677                 },
1678                 .term = raft->term,
1679                 .last_log_index = raft->log_end - 1,
1680                 .last_log_term = (
1681                     raft->log_end > raft->log_start
1682                     ? raft->entries[raft->log_end - raft->log_start - 1].term
1683                     : raft->snap.term),
1684                 .leadership_transfer = leadership_transfer,
1685             },
1686         };
1687         raft_send(raft, &rq);
1688     }
1689
1690     /* Vote for ourselves. */
1691     raft_accept_vote(raft, me, &raft->sid);
1692 }
1693
1694 static void
1695 raft_open_conn(struct raft *raft, const char *address, const struct uuid *sid)
1696 {
1697     if (strcmp(address, raft->local_address)
1698         && !raft_find_conn_by_address(raft, address)) {
1699         raft_add_conn(raft, jsonrpc_session_open(address, true), sid, false);
1700     }
1701 }
1702
1703 static void
1704 raft_conn_close(struct raft_conn *conn)
1705 {
1706     jsonrpc_session_close(conn->js);
1707     ovs_list_remove(&conn->list_node);
1708     free(conn->nickname);
1709     free(conn);
1710 }
1711
1712 /* Returns true if 'conn' should stay open, false if it should be closed. */
1713 static bool
1714 raft_conn_should_stay_open(struct raft *raft, struct raft_conn *conn)
1715 {
1716     /* Close the connection if it's actually dead.  If necessary, we'll
1717      * initiate a new session later. */
1718     if (!jsonrpc_session_is_alive(conn->js)) {
1719         return false;
1720     }
1721
1722     /* Keep incoming sessions.  We trust the originator to decide to drop
1723      * it. */
1724     if (conn->incoming) {
1725         return true;
1726     }
1727
1728     /* If we are joining the cluster, keep sessions to the remote addresses
1729      * that are supposed to be part of the cluster we're joining. */
1730     if (raft->joining && sset_contains(&raft->remote_addresses,
1731                                        jsonrpc_session_get_name(conn->js))) {
1732         return true;
1733     }
1734
1735     /* We have joined the cluster.  If we did that "recently", then there is a
1736      * chance that we do not have the most recent server configuration log
1737      * entry.  If so, it's a waste to disconnect from the servers that were in
1738      * remote_addresses and that will probably appear in the configuration,
1739      * just to reconnect to them a moment later when we do get the
1740      * configuration update.  If we are not ourselves in the configuration,
1741      * then we know that there must be a new configuration coming up, so in
1742      * that case keep the connection. */
1743     if (!raft_find_server(raft, &raft->sid)) {
1744         return true;
1745     }
1746
1747     /* Keep the connection only if the server is part of the configuration. */
1748     return raft_find_server(raft, &conn->sid);
1749 }
1750
/* Allows 'raft' to maintain the distributed log.  Call this function as part
 * of the process's main loop.
 *
 * Each call, in order: runs pending waiters, (re)opens the listener and
 * accepts at most one incoming connection, runs every session, closes
 * sessions that are no longer needed and opens missing ones, handles election
 * timeout expiry, resends leave/join requests whose timers expired, and on
 * ping timeout sends heartbeats (leader only) and times out stale commands.
 *
 * Returns immediately if 'raft' has already left the cluster or failed. */
void
raft_run(struct raft *raft)
{
    if (raft->left || raft->failed) {
        return;
    }

    raft_waiters_run(raft);

    /* (Re)open the passive listener, retrying no sooner than 'listen_backoff'
     * (pushed one second into the future after each failure). */
    if (!raft->listener && time_msec() >= raft->listen_backoff) {
        char *paddr = raft_make_address_passive(raft->local_address);
        int error = pstream_open(paddr, &raft->listener, DSCP_DEFAULT);
        if (error) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
            VLOG_WARN_RL(&rl, "%s: listen failed (%s)",
                         paddr, ovs_strerror(error));
            raft->listen_backoff = time_msec() + 1000;
        }
        free(paddr);
    }

    /* Accept at most one new incoming connection per call.  EAGAIN is not
     * logged. */
    if (raft->listener) {
        struct stream *stream;
        int error = pstream_accept(raft->listener, &stream);
        if (!error) {
            raft_add_conn(raft, jsonrpc_session_open_unreliably(
                              jsonrpc_open(stream), DSCP_DEFAULT), NULL,
                          true);
        } else if (error != EAGAIN) {
            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
            VLOG_WARN_RL(&rl, "%s: accept failed: %s",
                         pstream_get_name(raft->listener),
                         ovs_strerror(error));
        }
    }

    /* Run RPCs for all open sessions. */
    struct raft_conn *conn;
    LIST_FOR_EACH (conn, list_node, &raft->conns) {
        raft_conn_run(raft, conn);
    }

    /* Close unneeded sessions.  The _SAFE iterator is required because
     * raft_conn_close() unlinks and frees 'conn'. */
    struct raft_conn *next;
    LIST_FOR_EACH_SAFE (conn, next, list_node, &raft->conns) {
        if (!raft_conn_should_stay_open(raft, conn)) {
            raft_conn_close(conn);
        }
    }

    /* Open needed sessions: one per configured server and, while joining, one
     * per remote address (with no server ID yet). */
    struct raft_server *server;
    HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
        raft_open_conn(raft, server->address, &server->sid);
    }
    if (raft->joining) {
        const char *address;
        SSET_FOR_EACH (address, &raft->remote_addresses) {
            raft_open_conn(raft, address, NULL);
        }
    }

    /* Election timeout handling (skipped while joining). */
    if (!raft->joining && time_msec() >= raft->election_timeout) {
        if (raft->role == RAFT_LEADER) {
            /* Check if majority of followers replied, then reset
             * election_timeout and reset s->replied. Otherwise, become
             * follower.
             *
             * Raft paper section 6.2: Leaders: A server might be in the leader
             * state, but if it isn't the current leader, it could be
             * needlessly delaying client requests. For example, suppose a
             * leader is partitioned from the rest of the cluster, but it can
             * still communicate with a particular client. Without additional
             * mechanism, it could delay a request from that client forever,
             * being unable to replicate a log entry to any other servers.
             * Meanwhile, there might be another leader of a newer term that is
             * able to communicate with a majority of the cluster and would be
             * able to commit the client's request. Thus, a leader in Raft
             * steps down if an election timeout elapses without a successful
             * round of heartbeats to a majority of its cluster; this allows
             * clients to retry their requests with another server.  */
            int count = 0;
            HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
                if (server->replied) {
                    count ++;
                }
            }
            /* The threshold is half the server count, rounded down. */
            if (count >= hmap_count(&raft->servers) / 2) {
                HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
                    server->replied = false;
                }
                raft_reset_election_timer(raft);
            } else {
                raft_become_follower(raft);
                raft_start_election(raft, false);
            }
        } else {
            raft_start_election(raft, false);
        }

    }

    /* Resend removal requests while we are trying to leave. */
    if (raft->leaving && time_msec() >= raft->leave_timeout) {
        raft_send_remove_server_requests(raft);
    }

    /* While joining, ask over every connection once per second to be
     * added. */
    if (raft->joining && time_msec() >= raft->join_timeout) {
        raft->join_timeout = time_msec() + 1000;
        LIST_FOR_EACH (conn, list_node, &raft->conns) {
            raft_send_add_server_request(raft, conn);
        }
    }

    long long int now = time_msec();
    if (now >= raft->ping_timeout) {
        if (raft->role == RAFT_LEADER) {
            raft_send_heartbeats(raft);
        }
        /* Check if any commands timeout. Timeout is set to twice the time of
         * election base time so that commands can complete properly during
         * leader election. E.g. a leader crashed and current node with pending
         * commands becomes new leader: the pending commands can still complete
         * if the crashed leader has replicated the transactions to majority of
         * followers before it crashed.
         *
         * Commands whose timestamp is zero are never timed out here.  The
         * _SAFE iterator is required because raft_command_complete() removes
         * 'cmd' from 'raft->commands'. */
        struct raft_command *cmd, *next_cmd;
        HMAP_FOR_EACH_SAFE (cmd, next_cmd, hmap_node, &raft->commands) {
            if (cmd->timestamp
                && now - cmd->timestamp > raft->election_timer * 2) {
                raft_command_complete(raft, cmd, RAFT_CMD_TIMEOUT);
            }
        }
        raft_reset_ping_timer(raft);
    }

    /* Do this only at the end; if we did it as soon as we set raft->left or
     * raft->failed in handling the RemoveServerReply, then it could easily
     * cause references to freed memory in RPC sessions, etc. */
    if (raft->left || raft->failed) {
        raft_close__(raft);
    }
}
1894
/* Arranges for poll_block() to wake up when 'js' needs to run or has a
 * message to receive.  A null 'js' is ignored. */
static void
raft_wait_session(struct jsonrpc_session *js)
{
    if (!js) {
        return;
    }

    jsonrpc_session_wait(js);
    jsonrpc_session_recv_wait(js);
}
1903
1904 /* Causes the next call to poll_block() to wake up when 'raft' needs to do
1905  * something. */
1906 void
1907 raft_wait(struct raft *raft)
1908 {
1909     if (raft->left || raft->failed) {
1910         return;
1911     }
1912
1913     raft_waiters_wait(raft);
1914
1915     if (raft->listener) {
1916         pstream_wait(raft->listener);
1917     } else {
1918         poll_timer_wait_until(raft->listen_backoff);
1919     }
1920
1921     struct raft_conn *conn;
1922     LIST_FOR_EACH (conn, list_node, &raft->conns) {
1923         raft_wait_session(conn->js);
1924     }
1925
1926     if (!raft->joining) {
1927         poll_timer_wait_until(raft->election_timeout);
1928     } else {
1929         poll_timer_wait_until(raft->join_timeout);
1930     }
1931     if (raft->leaving) {
1932         poll_timer_wait_until(raft->leave_timeout);
1933     }
1934     if (raft->role == RAFT_LEADER || !hmap_is_empty(&raft->commands)) {
1935         poll_timer_wait_until(raft->ping_timeout);
1936     }
1937 }
1938
1939 static struct raft_waiter *
1940 raft_waiter_create(struct raft *raft, enum raft_waiter_type type,
1941                    bool start_commit)
1942 {
1943     struct raft_waiter *w = xzalloc(sizeof *w);
1944     ovs_list_push_back(&raft->waiters, &w->list_node);
1945     w->commit_ticket = start_commit ? ovsdb_log_commit_start(raft->log) : 0;
1946     w->type = type;
1947     return w;
1948 }
1949
1950 /* Returns a human-readable representation of 'status' (or NULL if 'status' is
1951  * invalid). */
1952 const char *
1953 raft_command_status_to_string(enum raft_command_status status)
1954 {
1955     switch (status) {
1956     case RAFT_CMD_INCOMPLETE:
1957         return "operation still in progress";
1958     case RAFT_CMD_SUCCESS:
1959         return "success";
1960     case RAFT_CMD_NOT_LEADER:
1961         return "not leader";
1962     case RAFT_CMD_BAD_PREREQ:
1963         return "prerequisite check failed";
1964     case RAFT_CMD_LOST_LEADERSHIP:
1965         return "lost leadership";
1966     case RAFT_CMD_SHUTDOWN:
1967         return "server shutdown";
1968     case RAFT_CMD_IO_ERROR:
1969         return "I/O error";
1970     case RAFT_CMD_TIMEOUT:
1971         return "timeout";
1972     default:
1973         return NULL;
1974     }
1975 }
1976
1977 /* Converts human-readable status in 's' into status code in '*statusp'.
1978  * Returns true if successful, false if 's' is unknown. */
1979 bool
1980 raft_command_status_from_string(const char *s,
1981                                 enum raft_command_status *statusp)
1982 {
1983     for (enum raft_command_status status = 0; ; status++) {
1984         const char *s2 = raft_command_status_to_string(status);
1985         if (!s2) {
1986             *statusp = 0;
1987             return false;
1988         } else if (!strcmp(s, s2)) {
1989             *statusp = status;
1990             return true;
1991         }
1992     }
1993 }
1994
1995 static const struct uuid *
1996 raft_get_eid(const struct raft *raft, uint64_t index)
1997 {
1998     for (; index >= raft->log_start; index--) {
1999         const struct raft_entry *e = raft_get_entry(raft, index);
2000         if (e->data) {
2001             return &e->eid;
2002         }
2003     }
2004     return &raft->snap.eid;
2005 }
2006
2007 const struct uuid *
2008 raft_current_eid(const struct raft *raft)
2009 {
2010     return raft_get_eid(raft, raft->log_end - 1);
2011 }
2012
2013 static struct raft_command *
2014 raft_command_create_completed(enum raft_command_status status)
2015 {
2016     ovs_assert(status != RAFT_CMD_INCOMPLETE);
2017
2018     struct raft_command *cmd = xzalloc(sizeof *cmd);
2019     cmd->n_refs = 1;
2020     cmd->status = status;
2021     return cmd;
2022 }
2023
2024 static struct raft_command *
2025 raft_command_create_incomplete(struct raft *raft, uint64_t index)
2026 {
2027     struct raft_command *cmd = xzalloc(sizeof *cmd);
2028     cmd->n_refs = 2;            /* One for client, one for raft->commands. */
2029     cmd->index = index;
2030     cmd->status = RAFT_CMD_INCOMPLETE;
2031     hmap_insert(&raft->commands, &cmd->hmap_node, cmd->index);
2032     return cmd;
2033 }
2034
2035 static struct raft_command * OVS_WARN_UNUSED_RESULT
2036 raft_command_initiate(struct raft *raft,
2037                       const struct json *data, const struct json *servers,
2038                       uint64_t election_timer, const struct uuid *eid)
2039 {
2040     /* Write to local log. */
2041     uint64_t index = raft->log_end;
2042     if (!raft_handle_write_error(
2043             raft, raft_write_entry(
2044                 raft, raft->term, json_nullable_clone(data), eid,
2045                 json_nullable_clone(servers),
2046                 election_timer))) {
2047         return raft_command_create_completed(RAFT_CMD_IO_ERROR);
2048     }
2049
2050     struct raft_command *cmd = raft_command_create_incomplete(raft, index);
2051     ovs_assert(eid);
2052     cmd->eid = *eid;
2053     cmd->timestamp = time_msec();
2054
2055     raft_waiter_create(raft, RAFT_W_ENTRY, true)->entry.index = cmd->index;
2056
2057     if (failure_test == FT_CRASH_BEFORE_SEND_APPEND_REQ) {
2058         ovs_fatal(0, "Raft test: crash before sending append_request.");
2059     }
2060     /* Write to remote logs. */
2061     struct raft_server *s;
2062     HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
2063         if (!uuid_equals(&s->sid, &raft->sid) && s->next_index == index) {
2064             raft_send_append_request(raft, s, 1, "execute command");
2065             s->next_index++;
2066         }
2067     }
2068     if (failure_test == FT_CRASH_AFTER_SEND_APPEND_REQ) {
2069         ovs_fatal(0, "Raft test: crash after sending append_request.");
2070     }
2071     raft_reset_ping_timer(raft);
2072
2073     return cmd;
2074 }
2075
2076 static void
2077 log_all_commands(struct raft *raft)
2078 {
2079     struct raft_command *cmd, *next;
2080     HMAP_FOR_EACH_SAFE (cmd, next, hmap_node, &raft->commands) {
2081         VLOG_DBG("raft command eid: "UUID_FMT, UUID_ARGS(&cmd->eid));
2082     }
2083 }
2084
2085 static struct raft_command * OVS_WARN_UNUSED_RESULT
2086 raft_command_execute__(struct raft *raft, const struct json *data,
2087                        const struct json *servers, uint64_t election_timer,
2088                        const struct uuid *prereq, struct uuid *result)
2089 {
2090     if (raft->joining || raft->leaving || raft->left || raft->failed) {
2091         return raft_command_create_completed(RAFT_CMD_SHUTDOWN);
2092     }
2093
2094     if (raft->role != RAFT_LEADER) {
2095         /* Consider proxying the command to the leader.  We can only do that if
2096          * we know the leader and the command does not change the set of
2097          * servers.  We do not proxy commands without prerequisites, even
2098          * though we could, because in an OVSDB context a log entry doesn't
2099          * make sense without context. */
2100         if (servers || election_timer || !data
2101             || raft->role != RAFT_FOLLOWER || uuid_is_zero(&raft->leader_sid)
2102             || !prereq) {
2103             return raft_command_create_completed(RAFT_CMD_NOT_LEADER);
2104         }
2105     }
2106
2107     struct uuid eid = data ? uuid_random() : UUID_ZERO;
2108     if (result) {
2109         *result = eid;
2110     }
2111
2112     if (raft->role != RAFT_LEADER) {
2113         const union raft_rpc rpc = {
2114             .execute_command_request = {
2115                 .common = {
2116                     .type = RAFT_RPC_EXECUTE_COMMAND_REQUEST,
2117                     .sid = raft->leader_sid,
2118                 },
2119                 .data = CONST_CAST(struct json *, data),
2120                 .prereq = *prereq,
2121                 .result = eid,
2122             }
2123         };
2124         if (failure_test == FT_CRASH_BEFORE_SEND_EXEC_REQ) {
2125             ovs_fatal(0, "Raft test: crash before sending "
2126                       "execute_command_request");
2127         }
2128         if (!raft_send(raft, &rpc)) {
2129             /* Couldn't send command, so it definitely failed. */
2130             return raft_command_create_completed(RAFT_CMD_NOT_LEADER);
2131         }
2132         if (failure_test == FT_CRASH_AFTER_SEND_EXEC_REQ) {
2133             ovs_fatal(0, "Raft test: crash after sending "
2134                       "execute_command_request");
2135         }
2136
2137         struct raft_command *cmd = raft_command_create_incomplete(raft, 0);
2138         cmd->timestamp = time_msec();
2139         cmd->eid = eid;
2140         log_all_commands(raft);
2141         return cmd;
2142     }
2143
2144     const struct uuid *current_eid = raft_current_eid(raft);
2145     if (prereq && !uuid_equals(prereq, current_eid)) {
2146         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
2147         VLOG_INFO_RL(&rl, "current entry eid "UUID_FMT" does not match "
2148                      "prerequisite "UUID_FMT,
2149                      UUID_ARGS(current_eid), UUID_ARGS(prereq));
2150         return raft_command_create_completed(RAFT_CMD_BAD_PREREQ);
2151     }
2152
2153     return raft_command_initiate(raft, data, servers, election_timer, &eid);
2154 }
2155
2156 /* Initiates appending a log entry to 'raft'.  The log entry consists of 'data'
2157  * and, if 'prereq' is nonnull, it is only added to the log if the previous
2158  * entry in the log has entry ID 'prereq'.  If 'result' is nonnull, it is
2159  * populated with the entry ID for the new log entry.
2160  *
2161  * Returns a "struct raft_command" that may be used to track progress adding
2162  * the log entry.  The caller must eventually free the returned structure, with
2163  * raft_command_unref(). */
2164 struct raft_command * OVS_WARN_UNUSED_RESULT
2165 raft_command_execute(struct raft *raft, const struct json *data,
2166                      const struct uuid *prereq, struct uuid *result)
2167 {
2168     return raft_command_execute__(raft, data, NULL, 0, prereq, result);
2169 }
2170
2171 /* Returns the status of 'cmd'. */
2172 enum raft_command_status
2173 raft_command_get_status(const struct raft_command *cmd)
2174 {
2175     ovs_assert(cmd->n_refs > 0);
2176     return cmd->status;
2177 }
2178
2179 /* Returns the index of the log entry at which 'cmd' was committed.
2180  *
2181  * This function works only with successful commands. */
2182 uint64_t
2183 raft_command_get_commit_index(const struct raft_command *cmd)
2184 {
2185     ovs_assert(cmd->n_refs > 0);
2186     ovs_assert(cmd->status == RAFT_CMD_SUCCESS);
2187     return cmd->index;
2188 }
2189
2190 /* Frees 'cmd'. */
2191 void
2192 raft_command_unref(struct raft_command *cmd)
2193 {
2194     if (cmd) {
2195         ovs_assert(cmd->n_refs > 0);
2196         if (!--cmd->n_refs) {
2197             free(cmd);
2198         }
2199     }
2200 }
2201
2202 /* Causes poll_block() to wake up when 'cmd' has status to report. */
2203 void
2204 raft_command_wait(const struct raft_command *cmd)
2205 {
2206     if (cmd->status != RAFT_CMD_INCOMPLETE) {
2207         poll_immediate_wake();
2208     }
2209 }
2210
2211 static void
2212 raft_command_complete(struct raft *raft,
2213                       struct raft_command *cmd,
2214                       enum raft_command_status status)
2215 {
2216     VLOG_DBG("raft_command_complete eid "UUID_FMT" status: %s",
2217              UUID_ARGS(&cmd->eid), raft_command_status_to_string(status));
2218     if (!uuid_is_zero(&cmd->sid)) {
2219         uint64_t commit_index = status == RAFT_CMD_SUCCESS ? cmd->index : 0;
2220         raft_send_execute_command_reply(raft, &cmd->sid, &cmd->eid, status,
2221                                         commit_index);
2222     }
2223
2224     ovs_assert(cmd->status == RAFT_CMD_INCOMPLETE);
2225     ovs_assert(cmd->n_refs > 0);
2226     hmap_remove(&raft->commands, &cmd->hmap_node);
2227     cmd->status = status;
2228     raft_command_unref(cmd);
2229 }
2230
2231 static void
2232 raft_complete_all_commands(struct raft *raft, enum raft_command_status status)
2233 {
2234     struct raft_command *cmd, *next;
2235     HMAP_FOR_EACH_SAFE (cmd, next, hmap_node, &raft->commands) {
2236         raft_command_complete(raft, cmd, status);
2237     }
2238 }
2239
2240 static struct raft_command *
2241 raft_find_command_by_eid(struct raft *raft, const struct uuid *eid)
2242 {
2243     struct raft_command *cmd;
2244
2245     HMAP_FOR_EACH (cmd, hmap_node, &raft->commands) {
2246         if (uuid_equals(&cmd->eid, eid)) {
2247             return cmd;
2248         }
2249     }
2250     return NULL;
2251 }
2252 \f
/* Forward-declares one static handler, raft_handle_<NAME>(), for each
 * RAFT_RPC(ENUM, NAME) entry that RAFT_RPC_TYPES expands to.  Each handler
 * takes the raft instance and a const pointer to the matching
 * "struct raft_<NAME>". */
#define RAFT_RPC(ENUM, NAME) \
    static void raft_handle_##NAME(struct raft *, const struct raft_##NAME *);
RAFT_RPC_TYPES
#undef RAFT_RPC
2257
/* Handler for hello requests.  Does nothing: the body is empty and both
 * parameters are marked unused. */
static void
raft_handle_hello_request(struct raft *raft OVS_UNUSED,
                          const struct raft_hello_request *hello OVS_UNUSED)
{
}
2263
2264 /* 'sid' is the server being added. */
2265 static void
2266 raft_send_add_server_reply__(struct raft *raft, const struct uuid *sid,
2267                              const char *address,
2268                              bool success, const char *comment)
2269 {
2270     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
2271     if (!VLOG_DROP_INFO(&rl)) {
2272         struct ds s = DS_EMPTY_INITIALIZER;
2273         char buf[SID_LEN + 1];
2274         ds_put_format(&s, "adding %s ("SID_FMT" at %s) "
2275                       "to cluster "CID_FMT" %s",
2276                       raft_get_nickname(raft, sid, buf, sizeof buf),
2277                       SID_ARGS(sid), address, CID_ARGS(&raft->cid),
2278                       success ? "succeeded" : "failed");
2279         if (comment) {
2280             ds_put_format(&s, " (%s)", comment);
2281         }
2282         VLOG_INFO("%s", ds_cstr(&s));
2283         ds_destroy(&s);
2284     }
2285
2286     union raft_rpc rpy = {
2287         .add_server_reply = {
2288             .common = {
2289                 .type = RAFT_RPC_ADD_SERVER_REPLY,
2290                 .sid = *sid,
2291                 .comment = CONST_CAST(char *, comment),
2292             },
2293             .success = success,
2294         }
2295     };
2296
2297     struct sset *remote_addresses = &rpy.add_server_reply.remote_addresses;
2298     sset_init(remote_addresses);
2299     if (!raft->joining) {
2300         struct raft_server *s;
2301         HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
2302             if (!uuid_equals(&s->sid, &raft->sid)) {
2303                 sset_add(remote_addresses, s->address);
2304             }
2305         }
2306     }
2307
2308     raft_send(raft, &rpy);
2309
2310     sset_destroy(remote_addresses);
2311 }
2312
2313 static void
2314 raft_send_remove_server_reply_rpc(struct raft *raft,
2315                                   const struct uuid *dst_sid,
2316                                   const struct uuid *target_sid,
2317                                   bool success, const char *comment)
2318 {
2319     if (uuid_equals(&raft->sid, dst_sid)) {
2320         if (success && uuid_equals(&raft->sid, target_sid)) {
2321             raft_finished_leaving_cluster(raft);
2322         }
2323         return;
2324     }
2325
2326     const union raft_rpc rpy = {
2327         .remove_server_reply = {
2328             .common = {
2329                 .type = RAFT_RPC_REMOVE_SERVER_REPLY,
2330                 .sid = *dst_sid,
2331                 .comment = CONST_CAST(char *, comment),
2332             },
2333             .target_sid = (uuid_equals(dst_sid, target_sid)
2334                            ? UUID_ZERO
2335                            : *target_sid),
2336             .success = success,
2337         }
2338     };
2339     raft_send(raft, &rpy);
2340 }
2341
2342 static void
2343 raft_send_remove_server_reply__(struct raft *raft,
2344                                 const struct uuid *target_sid,
2345                                 const struct uuid *requester_sid,
2346                                 struct unixctl_conn *requester_conn,
2347                                 bool success, const char *comment)
2348 {
2349     struct ds s = DS_EMPTY_INITIALIZER;
2350     ds_put_format(&s, "request ");
2351     if (!uuid_is_zero(requester_sid)) {
2352         char buf[SID_LEN + 1];
2353         ds_put_format(&s, "by %s",
2354                       raft_get_nickname(raft, requester_sid, buf, sizeof buf));
2355     } else {
2356         ds_put_cstr(&s, "via unixctl");
2357     }
2358     ds_put_cstr(&s, " to remove ");
2359     if (!requester_conn && uuid_equals(target_sid, requester_sid)) {
2360         ds_put_cstr(&s, "itself");
2361     } else {
2362         char buf[SID_LEN + 1];
2363         ds_put_cstr(&s, raft_get_nickname(raft, target_sid, buf, sizeof buf));
2364         if (uuid_equals(target_sid, &raft->sid)) {
2365             ds_put_cstr(&s, " (ourselves)");
2366         }
2367     }
2368     ds_put_format(&s, " from cluster "CID_FMT" %s",
2369                   CID_ARGS(&raft->cid),
2370                   success ? "succeeded" : "failed");
2371     if (comment) {
2372         ds_put_format(&s, " (%s)", comment);
2373     }
2374
2375     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
2376     VLOG_INFO_RL(&rl, "%s", ds_cstr(&s));
2377
2378     /* Send RemoveServerReply to the requester (which could be a server or a
2379      * unixctl connection.  Also always send it to the removed server; this
2380      * allows it to be sure that it's really removed and update its log and
2381      * disconnect permanently.  */
2382     if (!uuid_is_zero(requester_sid)) {
2383         raft_send_remove_server_reply_rpc(raft, requester_sid, target_sid,
2384                                           success, comment);
2385     }
2386     if (!uuid_equals(requester_sid, target_sid)) {
2387         raft_send_remove_server_reply_rpc(raft, target_sid, target_sid,
2388                                           success, comment);
2389     }
2390     if (requester_conn) {
2391         if (success) {
2392             unixctl_command_reply(requester_conn, ds_cstr(&s));
2393         } else {
2394             unixctl_command_reply_error(requester_conn, ds_cstr(&s));
2395         }
2396     }
2397
2398     ds_destroy(&s);
2399 }
2400
2401 static void
2402 raft_send_add_server_reply(struct raft *raft,
2403                            const struct raft_add_server_request *rq,
2404                            bool success, const char *comment)
2405 {
2406     return raft_send_add_server_reply__(raft, &rq->common.sid, rq->address,
2407                                         success, comment);
2408 }
2409
2410 static void
2411 raft_send_remove_server_reply(struct raft *raft,
2412                               const struct raft_remove_server_request *rq,
2413                               bool success, const char *comment)
2414 {
2415     return raft_send_remove_server_reply__(raft, &rq->sid, &rq->common.sid,
2416                                            rq->requester_conn, success,
2417                                            comment);
2418 }
2419
2420 static void
2421 raft_become_follower(struct raft *raft)
2422 {
2423     raft->leader_sid = UUID_ZERO;
2424     if (raft->role == RAFT_FOLLOWER) {
2425         return;
2426     }
2427
2428     raft->role = RAFT_FOLLOWER;
2429     raft_reset_election_timer(raft);
2430
2431     /* Notify clients about lost leadership.
2432      *
2433      * We do not reverse our changes to 'raft->servers' because the new
2434      * configuration is already part of the log.  Possibly the configuration
2435      * log entry will not be committed, but until we know that we must use the
2436      * new configuration.  Our AppendEntries processing will properly update
2437      * the server configuration later, if necessary. */
2438     struct raft_server *s;
2439     HMAP_FOR_EACH (s, hmap_node, &raft->add_servers) {
2440         raft_send_add_server_reply__(raft, &s->sid, s->address, false,
2441                                      RAFT_SERVER_LOST_LEADERSHIP);
2442     }
2443     if (raft->remove_server) {
2444         raft_send_remove_server_reply__(raft, &raft->remove_server->sid,
2445                                         &raft->remove_server->requester_sid,
2446                                         raft->remove_server->requester_conn,
2447                                         false, RAFT_SERVER_LOST_LEADERSHIP);
2448         raft_server_destroy(raft->remove_server);
2449         raft->remove_server = NULL;
2450     }
2451
2452     raft_complete_all_commands(raft, RAFT_CMD_LOST_LEADERSHIP);
2453 }
2454
2455 static void
2456 raft_send_append_request(struct raft *raft,
2457                          struct raft_server *peer, unsigned int n,
2458                          const char *comment)
2459 {
2460     ovs_assert(raft->role == RAFT_LEADER);
2461
2462     const union raft_rpc rq = {
2463         .append_request = {
2464             .common = {
2465                 .type = RAFT_RPC_APPEND_REQUEST,
2466                 .sid = peer->sid,
2467                 .comment = CONST_CAST(char *, comment),
2468             },
2469             .term = raft->term,
2470             .prev_log_index = peer->next_index - 1,
2471             .prev_log_term = (peer->next_index - 1 >= raft->log_start
2472                               ? raft->entries[peer->next_index - 1
2473                                               - raft->log_start].term
2474                               : raft->snap.term),
2475             .leader_commit = raft->commit_index,
2476             .entries = &raft->entries[peer->next_index - raft->log_start],
2477             .n_entries = n,
2478         },
2479     };
2480     raft_send(raft, &rq);
2481 }
2482
2483 static void
2484 raft_send_heartbeats(struct raft *raft)
2485 {
2486     struct raft_server *s;
2487     HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
2488         if (!uuid_equals(&raft->sid, &s->sid)) {
2489             raft_send_append_request(raft, s, 0, "heartbeat");
2490         }
2491     }
2492
2493     /* Send anyone waiting for a command to complete a ping to let them
2494      * know we're still working on it. */
2495     struct raft_command *cmd;
2496     HMAP_FOR_EACH (cmd, hmap_node, &raft->commands) {
2497         if (!uuid_is_zero(&cmd->sid)) {
2498             raft_send_execute_command_reply(raft, &cmd->sid,
2499                                             &cmd->eid,
2500                                             RAFT_CMD_INCOMPLETE, 0);
2501         }
2502     }
2503
2504     raft_reset_ping_timer(raft);
2505 }
2506
2507 /* Initializes the fields in 's' that represent the leader's view of the
2508  * server. */
2509 static void
2510 raft_server_init_leader(struct raft *raft, struct raft_server *s)
2511 {
2512     s->next_index = raft->log_end;
2513     s->match_index = 0;
2514     s->phase = RAFT_PHASE_STABLE;
2515     s->replied = false;
2516 }
2517
2518 static void
2519 raft_set_leader(struct raft *raft, const struct uuid *sid)
2520 {
2521     raft->leader_sid = *sid;
2522     raft->had_leader = true;
2523     raft->candidate_retrying = false;
2524 }
2525
/* Makes 'raft', which must not already be leader, the leader for its current
 * term: switches the role, resets the election and ping timers,
 * reinitializes the leader's view of every server, sends an initial round of
 * heartbeats, records the leadership in the on-disk log, and starts a no-op
 * commit. */
static void
raft_become_leader(struct raft *raft)
{
    log_all_commands(raft);

    static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
    VLOG_INFO_RL(&rl, "term %"PRIu64": elected leader by %d+ of "
                 "%"PRIuSIZE" servers", raft->term,
                 raft->n_votes, hmap_count(&raft->servers));

    ovs_assert(raft->role != RAFT_LEADER);
    raft->role = RAFT_LEADER;
    raft_set_leader(raft, &raft->sid);
    raft_reset_election_timer(raft);
    raft_reset_ping_timer(raft);

    /* Start from a clean leader's-eye view of every server in the
     * configuration. */
    struct raft_server *s;
    HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
        raft_server_init_leader(raft, s);
    }

    raft->election_timer_new = 0;

    /* Our own log trivially matches itself up to its last entry.  This must
     * follow the role change above because the match index helpers assert
     * that we are leader. */
    raft_update_our_match_index(raft, raft->log_end - 1);
    raft_send_heartbeats(raft);

    /* Write the fact that we are leader to the log.  This is not used by the
     * algorithm (although it could be, for quick restart), but it is used for
     * offline analysis to check for conformance with the properties that Raft
     * guarantees.  A failure to write the record is deliberately ignored. */
    struct raft_record r = {
        .type = RAFT_REC_LEADER,
        .term = raft->term,
        .sid = raft->sid,
    };
    ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r)));

    /* Initiate a no-op commit.  Otherwise we might never find out what's in
     * the log.  See section 6.4 item 1:
     *
     *     The Leader Completeness Property guarantees that a leader has all
     *     committed entries, but at the start of its term, it may not know
     *     which those are.  To find out, it needs to commit an entry from its
     *     term.  Raft handles this by having each leader commit a blank no-op
     *     entry into the log at the start of its term.  As soon as this no-op
     *     entry is committed, the leader’s commit index will be at least as
     *     large as any other servers’ during its term.
     */
    raft_command_unref(raft_command_execute__(raft, NULL, NULL, 0, NULL,
                                              NULL));
}
2577
2578 /* Processes term 'term' received as part of RPC 'common'.  Returns true if the
2579  * caller should continue processing the RPC, false if the caller should reject
2580  * it due to a stale term. */
2581 static bool
2582 raft_receive_term__(struct raft *raft, const struct raft_rpc_common *common,
2583                     uint64_t term)
2584 {
2585     /* Section 3.3 says:
2586      *
2587      *     Current terms are exchanged whenever servers communicate; if one
2588      *     server’s current term is smaller than the other’s, then it updates
2589      *     its current term to the larger value.  If a candidate or leader
2590      *     discovers that its term is out of date, it immediately reverts to
2591      *     follower state.  If a server receives a request with a stale term
2592      *     number, it rejects the request.
2593      */
2594     if (term > raft->term) {
2595         if (!raft_set_term(raft, term, NULL)) {
2596             /* Failed to update the term to 'term'. */
2597             return false;
2598         }
2599         raft_become_follower(raft);
2600     } else if (term < raft->term) {
2601         char buf[SID_LEN + 1];
2602         VLOG_INFO("rejecting term %"PRIu64" < current term %"PRIu64" received "
2603                   "in %s message from server %s",
2604                   term, raft->term,
2605                   raft_rpc_type_to_string(common->type),
2606                   raft_get_nickname(raft, &common->sid, buf, sizeof buf));
2607         return false;
2608     }
2609     return true;
2610 }
2611
2612 static void
2613 raft_get_servers_from_log(struct raft *raft, enum vlog_level level)
2614 {
2615     const struct json *servers_json = raft->snap.servers;
2616     for (uint64_t index = raft->log_end - 1; index >= raft->log_start;
2617          index--) {
2618         struct raft_entry *e = &raft->entries[index - raft->log_start];
2619         if (e->servers) {
2620             servers_json = e->servers;
2621             break;
2622         }
2623     }
2624
2625     struct hmap servers;
2626     struct ovsdb_error *error = raft_servers_from_json(servers_json, &servers);
2627     ovs_assert(!error);
2628     raft_set_servers(raft, &servers, level);
2629     raft_servers_destroy(&servers);
2630 }
2631
2632 /* Truncates the log, so that raft->log_end becomes 'new_end'.
2633  *
2634  * Doesn't write anything to disk.  In theory, we could truncate the on-disk
2635  * log file, but we don't have the right information to know how long it should
2636  * be.  What we actually do is to append entries for older indexes to the
2637  * on-disk log; when we re-read it later, these entries truncate the log.
2638  *
2639  * Returns true if any of the removed log entries were server configuration
2640  * entries, false otherwise. */
2641 static bool
2642 raft_truncate(struct raft *raft, uint64_t new_end)
2643 {
2644     ovs_assert(new_end >= raft->log_start);
2645     if (raft->log_end > new_end) {
2646         char buf[SID_LEN + 1];
2647         VLOG_INFO("%s truncating %"PRIu64 " entries from end of log",
2648                   raft_get_nickname(raft, &raft->sid, buf, sizeof buf),
2649                   raft->log_end - new_end);
2650     }
2651
2652     bool servers_changed = false;
2653     while (raft->log_end > new_end) {
2654         struct raft_entry *entry = &raft->entries[--raft->log_end
2655                                                   - raft->log_start];
2656         if (entry->servers) {
2657             servers_changed = true;
2658         }
2659         raft_entry_uninit(entry);
2660     }
2661     return servers_changed;
2662 }
2663
2664 static const struct json *
2665 raft_peek_next_entry(struct raft *raft, struct uuid *eid)
2666 {
2667     /* Invariant: log_start - 2 <= last_applied <= commit_index < log_end. */
2668     ovs_assert(raft->log_start <= raft->last_applied + 2);
2669     ovs_assert(raft->last_applied <= raft->commit_index);
2670     ovs_assert(raft->commit_index < raft->log_end);
2671
2672     if (raft->joining || raft->failed) {
2673         return NULL;
2674     }
2675
2676     if (raft->log_start == raft->last_applied + 2) {
2677         *eid = raft->snap.eid;
2678         return raft->snap.data;
2679     }
2680
2681     while (raft->last_applied < raft->commit_index) {
2682         const struct raft_entry *e = raft_get_entry(raft,
2683                                                     raft->last_applied + 1);
2684         if (e->data) {
2685             *eid = e->eid;
2686             return e->data;
2687         }
2688         raft->last_applied++;
2689     }
2690     return NULL;
2691 }
2692
2693 static const struct json *
2694 raft_get_next_entry(struct raft *raft, struct uuid *eid)
2695 {
2696     const struct json *data = raft_peek_next_entry(raft, eid);
2697     if (data) {
2698         raft->last_applied++;
2699     }
2700     return data;
2701 }
2702
2703 /* Updates commit index in raft log. If commit index is already up-to-date
2704  * it does nothing and return false, otherwise, returns true. */
2705 static bool
2706 raft_update_commit_index(struct raft *raft, uint64_t new_commit_index)
2707 {
2708     if (new_commit_index <= raft->commit_index) {
2709         return false;
2710     }
2711
2712     if (raft->role == RAFT_LEADER) {
2713         while (raft->commit_index < new_commit_index) {
2714             uint64_t index = ++raft->commit_index;
2715             const struct raft_entry *e = raft_get_entry(raft, index);
2716             if (e->data) {
2717                 struct raft_command *cmd
2718                     = raft_find_command_by_eid(raft, &e->eid);
2719                 if (cmd) {
2720                     if (!cmd->index) {
2721                         VLOG_DBG("Command completed after role change from"
2722                                  " follower to leader "UUID_FMT,
2723                                  UUID_ARGS(&e->eid));
2724                         cmd->index = index;
2725                     }
2726                     raft_command_complete(raft, cmd, RAFT_CMD_SUCCESS);
2727                 }
2728             }
2729             if (e->election_timer) {
2730                 VLOG_INFO("Election timer changed from %"PRIu64" to %"PRIu64,
2731                           raft->election_timer, e->election_timer);
2732                 raft->election_timer = e->election_timer;
2733                 raft->election_timer_new = 0;
2734             }
2735             if (e->servers) {
2736                 /* raft_run_reconfigure() can write a new Raft entry, which can
2737                  * reallocate raft->entries, which would invalidate 'e', so
2738                  * this case must be last, after the one for 'e->data'. */
2739                 raft_run_reconfigure(raft);
2740             }
2741         }
2742     } else {
2743         while (raft->commit_index < new_commit_index) {
2744             uint64_t index = ++raft->commit_index;
2745             const struct raft_entry *e = raft_get_entry(raft, index);
2746             if (e->election_timer) {
2747                 VLOG_INFO("Election timer changed from %"PRIu64" to %"PRIu64,
2748                           raft->election_timer, e->election_timer);
2749                 raft->election_timer = e->election_timer;
2750             }
2751         }
2752         /* Check if any pending command can be completed, and complete it.
2753          * This can happen when leader fail-over before sending
2754          * execute_command_reply. */
2755         const struct uuid *eid = raft_get_eid(raft, new_commit_index);
2756         struct raft_command *cmd = raft_find_command_by_eid(raft, eid);
2757         if (cmd) {
2758             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
2759             VLOG_INFO_RL(&rl,
2760                          "Command completed without reply (eid: "UUID_FMT", "
2761                          "commit index: %"PRIu64")",
2762                          UUID_ARGS(eid), new_commit_index);
2763             cmd->index = new_commit_index;
2764             raft_command_complete(raft, cmd, RAFT_CMD_SUCCESS);
2765         }
2766     }
2767
2768     /* Write the commit index to the log.  The next time we restart, this
2769      * allows us to start exporting a reasonably fresh log, instead of a log
2770      * that only contains the snapshot. */
2771     struct raft_record r = {
2772         .type = RAFT_REC_COMMIT_INDEX,
2773         .commit_index = raft->commit_index,
2774     };
2775     ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r)));
2776     return true;
2777 }
2778
2779 /* This doesn't use rq->entries (but it does use rq->n_entries). */
2780 static void
2781 raft_send_append_reply(struct raft *raft, const struct raft_append_request *rq,
2782                        enum raft_append_result result, const char *comment)
2783 {
2784     /* Figure 3.1: "If leaderCommit > commitIndex, set commitIndex =
2785      * min(leaderCommit, index of last new entry)" */
2786     if (result == RAFT_APPEND_OK && rq->leader_commit > raft->commit_index) {
2787         raft_update_commit_index(
2788             raft, MIN(rq->leader_commit, rq->prev_log_index + rq->n_entries));
2789     }
2790
2791     /* Send reply. */
2792     union raft_rpc reply = {
2793         .append_reply = {
2794             .common = {
2795                 .type = RAFT_RPC_APPEND_REPLY,
2796                 .sid = rq->common.sid,
2797                 .comment = CONST_CAST(char *, comment),
2798             },
2799             .term = raft->term,
2800             .log_end = raft->log_end,
2801             .prev_log_index = rq->prev_log_index,
2802             .prev_log_term = rq->prev_log_term,
2803             .n_entries = rq->n_entries,
2804             .result = result,
2805         }
2806     };
2807     raft_send(raft, &reply);
2808 }
2809
2810 /* If 'prev_log_index' exists in 'raft''s log, in term 'prev_log_term', returns
2811  * NULL.  Otherwise, returns an explanation for the mismatch.  */
2812 static const char *
2813 match_index_and_term(const struct raft *raft,
2814                      uint64_t prev_log_index, uint64_t prev_log_term)
2815 {
2816     if (prev_log_index < raft->log_start - 1) {
2817         return "mismatch before start of log";
2818     } else if (prev_log_index == raft->log_start - 1) {
2819         if (prev_log_term != raft->snap.term) {
2820             return "prev_term mismatch";
2821         }
2822     } else if (prev_log_index < raft->log_end) {
2823         if (raft->entries[prev_log_index - raft->log_start].term
2824             != prev_log_term) {
2825             return "term mismatch";
2826         }
2827     } else {
2828         /* prev_log_index >= raft->log_end */
2829         return "mismatch past end of log";
2830     }
2831     return NULL;
2832 }
2833
2834 static void
2835 raft_handle_append_entries(struct raft *raft,
2836                            const struct raft_append_request *rq,
2837                            uint64_t prev_log_index, uint64_t prev_log_term,
2838                            const struct raft_entry *entries,
2839                            unsigned int n_entries)
2840 {
2841     /* Section 3.5: "When sending an AppendEntries RPC, the leader includes
2842      * the index and term of the entry in its log that immediately precedes
2843      * the new entries. If the follower does not find an entry in its log
2844      * with the same index and term, then it refuses the new entries." */
2845     const char *mismatch = match_index_and_term(raft, prev_log_index,
2846                                                 prev_log_term);
2847     if (mismatch) {
2848         VLOG_INFO("rejecting append_request because previous entry "
2849                   "%"PRIu64",%"PRIu64" not in local log (%s)",
2850                   prev_log_term, prev_log_index, mismatch);
2851         raft_send_append_reply(raft, rq, RAFT_APPEND_INCONSISTENCY, mismatch);
2852         return;
2853     }
2854
2855     /* Figure 3.1: "If an existing entry conflicts with a new one (same
2856      * index but different terms), delete the existing entry and all that
2857      * follow it." */
2858     unsigned int i;
2859     bool servers_changed = false;
2860     for (i = 0; ; i++) {
2861         if (i >= n_entries) {
2862             /* No change. */
2863             if (rq->common.comment
2864                 && !strcmp(rq->common.comment, "heartbeat")) {
2865                 raft_send_append_reply(raft, rq, RAFT_APPEND_OK, "heartbeat");
2866             } else {
2867                 raft_send_append_reply(raft, rq, RAFT_APPEND_OK, "no change");
2868             }
2869             return;
2870         }
2871
2872         uint64_t log_index = (prev_log_index + 1) + i;
2873         if (log_index >= raft->log_end) {
2874             break;
2875         }
2876         if (raft->entries[log_index - raft->log_start].term
2877             != entries[i].term) {
2878             if (raft_truncate(raft, log_index)) {
2879                 servers_changed = true;
2880             }
2881             break;
2882         }
2883     }
2884
2885     if (failure_test == FT_CRASH_AFTER_RECV_APPEND_REQ_UPDATE) {
2886         ovs_fatal(0, "Raft test: crash after receiving append_request with "
2887                   "update.");
2888     }
2889     /* Figure 3.1: "Append any entries not already in the log." */
2890     struct ovsdb_error *error = NULL;
2891     bool any_written = false;
2892     for (; i < n_entries; i++) {
2893         const struct raft_entry *e = &entries[i];
2894         error = raft_write_entry(raft, e->term,
2895                                  json_nullable_clone(e->data), &e->eid,
2896                                  json_nullable_clone(e->servers),
2897                                  e->election_timer);
2898         if (error) {
2899             break;
2900         }
2901         any_written = true;
2902         if (e->servers) {
2903             servers_changed = true;
2904         }
2905     }
2906
2907     if (any_written) {
2908         raft_waiter_create(raft, RAFT_W_ENTRY, true)->entry.index
2909             = raft->log_end - 1;
2910     }
2911     if (servers_changed) {
2912         /* The set of servers might have changed; check. */
2913         raft_get_servers_from_log(raft, VLL_INFO);
2914     }
2915
2916     if (error) {
2917         char *s = ovsdb_error_to_string_free(error);
2918         VLOG_ERR("%s", s);
2919         free(s);
2920         raft_send_append_reply(raft, rq, RAFT_APPEND_IO_ERROR, "I/O error");
2921         return;
2922     }
2923
2924     raft_send_append_reply(raft, rq, RAFT_APPEND_OK, "log updated");
2925 }
2926
2927 static bool
2928 raft_update_leader(struct raft *raft, const struct uuid *sid)
2929 {
2930     if (raft->role == RAFT_LEADER) {
2931         char buf[SID_LEN + 1];
2932         VLOG_ERR("this server is leader but server %s claims to be",
2933                  raft_get_nickname(raft, sid, buf, sizeof buf));
2934         return false;
2935     } else if (!uuid_equals(sid, &raft->leader_sid)) {
2936         if (!uuid_is_zero(&raft->leader_sid)) {
2937             char buf1[SID_LEN + 1];
2938             char buf2[SID_LEN + 1];
2939             VLOG_ERR("leader for term %"PRIu64" changed from %s to %s",
2940                      raft->term,
2941                      raft_get_nickname(raft, &raft->leader_sid,
2942                                        buf1, sizeof buf1),
2943                      raft_get_nickname(raft, sid, buf2, sizeof buf2));
2944         } else {
2945             char buf[SID_LEN + 1];
2946             VLOG_INFO("server %s is leader for term %"PRIu64,
2947                       raft_get_nickname(raft, sid, buf, sizeof buf),
2948                       raft->term);
2949         }
2950         raft_set_leader(raft, sid);
2951
2952         /* Record the leader to the log.  This is not used by the algorithm
2953          * (although it could be, for quick restart), but it is used for
2954          * offline analysis to check for conformance with the properties
2955          * that Raft guarantees. */
2956         struct raft_record r = {
2957             .type = RAFT_REC_LEADER,
2958             .term = raft->term,
2959             .sid = *sid,
2960         };
2961         ignore(ovsdb_log_write_and_free(raft->log, raft_record_to_json(&r)));
2962     }
2963     return true;
2964 }
2965
2966 static void
2967 raft_handle_append_request(struct raft *raft,
2968                            const struct raft_append_request *rq)
2969 {
2970     /* We do not check whether the server that sent the request is part of the
2971      * cluster.  As section 4.1 says, "A server accepts AppendEntries requests
2972      * from a leader that is not part of the server’s latest configuration.
2973      * Otherwise, a new server could never be added to the cluster (it would
2974      * never accept any log entries preceding the configuration entry that adds
2975      * the server)." */
2976     if (!raft_update_leader(raft, &rq->common.sid)) {
2977         raft_send_append_reply(raft, rq, RAFT_APPEND_INCONSISTENCY,
2978                                "usurped leadership");
2979         return;
2980     }
2981     raft_reset_election_timer(raft);
2982
2983     /* First check for the common case, where the AppendEntries request is
2984      * entirely for indexes covered by 'log_start' ... 'log_end - 1', something
2985      * like this:
2986      *
2987      *     rq->prev_log_index
2988      *       | first_entry_index
2989      *       |   |         nth_entry_index
2990      *       |   |           |
2991      *       v   v           v
2992      *         +---+---+---+---+
2993      *       T | T | T | T | T |
2994      *         +---+-------+---+
2995      *     +---+---+---+---+
2996      *   T | T | T | T | T |
2997      *     +---+---+---+---+
2998      *       ^               ^
2999      *       |               |
3000      *   log_start        log_end
3001      * */
3002     uint64_t first_entry_index = rq->prev_log_index + 1;
3003     uint64_t nth_entry_index = rq->prev_log_index + rq->n_entries;
3004     if (OVS_LIKELY(first_entry_index >= raft->log_start)) {
3005         raft_handle_append_entries(raft, rq,
3006                                    rq->prev_log_index, rq->prev_log_term,
3007                                    rq->entries, rq->n_entries);
3008         return;
3009     }
3010
3011     /* Now a series of checks for odd cases, where the AppendEntries request
3012      * extends earlier than the beginning of our log, into the log entries
3013      * discarded by the most recent snapshot. */
3014
3015     /*
3016      * Handle the case where the indexes covered by rq->entries[] are entirely
3017      * disjoint with 'log_start - 1' ... 'log_end - 1', as shown below.  So,
3018      * everything in the AppendEntries request must already have been
3019      * committed, and we might as well return true.
3020      *
3021      *     rq->prev_log_index
3022      *       | first_entry_index
3023      *       |   |         nth_entry_index
3024      *       |   |           |
3025      *       v   v           v
3026      *         +---+---+---+---+
3027      *       T | T | T | T | T |
3028      *         +---+-------+---+
3029      *                             +---+---+---+---+
3030      *                           T | T | T | T | T |
3031      *                             +---+---+---+---+
3032      *                               ^               ^
3033      *                               |               |
3034      *                           log_start        log_end
3035      */
3036     if (nth_entry_index < raft->log_start - 1) {
3037         raft_send_append_reply(raft, rq, RAFT_APPEND_OK,
3038                                "append before log start");
3039         return;
3040     }
3041
3042     /*
3043      * Handle the case where the last entry in rq->entries[] has the same index
3044      * as 'log_start - 1', so we can compare their terms:
3045      *
3046      *     rq->prev_log_index
3047      *       | first_entry_index
3048      *       |   |         nth_entry_index
3049      *       |   |           |
3050      *       v   v           v
3051      *         +---+---+---+---+
3052      *       T | T | T | T | T |
3053      *         +---+-------+---+
3054      *                         +---+---+---+---+
3055      *                       T | T | T | T | T |
3056      *                         +---+---+---+---+
3057      *                           ^               ^
3058      *                           |               |
3059      *                       log_start        log_end
3060      *
3061      * There's actually a sub-case where rq->n_entries == 0, in which we
3062      * compare rq->prev_term:
3063      *
3064      *     rq->prev_log_index
3065      *       |
3066      *       |
3067      *       |
3068      *       v
3069      *       T
3070      *
3071      *         +---+---+---+---+
3072      *       T | T | T | T | T |
3073      *         +---+---+---+---+
3074      *           ^               ^
3075      *           |               |
3076      *       log_start        log_end
3077      */
3078     if (nth_entry_index == raft->log_start - 1) {
3079         if (rq->n_entries
3080             ? raft->snap.term == rq->entries[rq->n_entries - 1].term
3081             : raft->snap.term == rq->prev_log_term) {
3082             raft_send_append_reply(raft, rq, RAFT_APPEND_OK, "no change");
3083         } else {
3084             raft_send_append_reply(raft, rq, RAFT_APPEND_INCONSISTENCY,
3085                                    "term mismatch");
3086         }
3087         return;
3088     }
3089
3090     /*
3091      * We now know that the data in rq->entries[] overlaps the data in
3092      * raft->entries[], as shown below, with some positive 'ofs':
3093      *
3094      *     rq->prev_log_index
3095      *       | first_entry_index
3096      *       |   |             nth_entry_index
3097      *       |   |               |
3098      *       v   v               v
3099      *         +---+---+---+---+---+
3100      *       T | T | T | T | T | T |
3101      *         +---+-------+---+---+
3102      *                     +---+---+---+---+
3103      *                   T | T | T | T | T |
3104      *                     +---+---+---+---+
3105      *                       ^               ^
3106      *                       |               |
3107      *                   log_start        log_end
3108      *
3109      *         |<-- ofs -->|
3110      *
3111      * We transform this into the following by trimming the first 'ofs'
3112      * elements off of rq->entries[], ending up with the following.  Notice how
3113      * we retain the term but not the data for rq->entries[ofs - 1]:
3114      *
3115      *                  first_entry_index + ofs - 1
3116      *                   | first_entry_index + ofs
3117      *                   |   |  nth_entry_index + ofs
3118      *                   |   |   |
3119      *                   v   v   v
3120      *                     +---+---+
3121      *                   T | T | T |
3122      *                     +---+---+
3123      *                     +---+---+---+---+
3124      *                   T | T | T | T | T |
3125      *                     +---+---+---+---+
3126      *                       ^               ^
3127      *                       |               |
3128      *                   log_start        log_end
3129      */
3130     uint64_t ofs = raft->log_start - first_entry_index;
3131     raft_handle_append_entries(raft, rq,
3132                                raft->log_start - 1, rq->entries[ofs - 1].term,
3133                                &rq->entries[ofs], rq->n_entries - ofs);
3134 }
3135
3136 /* Returns true if 'raft' has another log entry or snapshot to read. */
3137 bool
3138 raft_has_next_entry(const struct raft *raft_)
3139 {
3140     struct raft *raft = CONST_CAST(struct raft *, raft_);
3141     struct uuid eid;
3142     return raft_peek_next_entry(raft, &eid) != NULL;
3143 }
3144
3145 /* Returns the next log entry or snapshot from 'raft', or NULL if there are
3146  * none left to read.  Stores the entry ID of the log entry in '*eid'.  Stores
3147  * true in '*is_snapshot' if the returned data is a snapshot, false if it is a
3148  * log entry. */
3149 const struct json *
3150 raft_next_entry(struct raft *raft, struct uuid *eid, bool *is_snapshot)
3151 {
3152     const struct json *data = raft_get_next_entry(raft, eid);
3153     *is_snapshot = data == raft->snap.data;
3154     return data;
3155 }
3156
3157 /* Returns the log index of the last-read snapshot or log entry. */
3158 uint64_t
3159 raft_get_applied_index(const struct raft *raft)
3160 {
3161     return raft->last_applied;
3162 }
3163
3164 /* Returns the log index of the last snapshot or log entry that is available to
3165  * be read. */
3166 uint64_t
3167 raft_get_commit_index(const struct raft *raft)
3168 {
3169     return raft->commit_index;
3170 }
3171
3172 static struct raft_server *
3173 raft_find_peer(struct raft *raft, const struct uuid *uuid)
3174 {
3175     struct raft_server *s = raft_find_server(raft, uuid);
3176     return s && !uuid_equals(&raft->sid, &s->sid) ? s : NULL;
3177 }
3178
3179 static struct raft_server *
3180 raft_find_new_server(struct raft *raft, const struct uuid *uuid)
3181 {
3182     return raft_server_find(&raft->add_servers, uuid);
3183 }
3184
3185 /* Figure 3.1: "If there exists an N such that N > commitIndex, a
3186  * majority of matchIndex[i] >= N, and log[N].term == currentTerm, set
3187  * commitIndex = N (sections 3.5 and 3.6)." */
3188 static void
3189 raft_consider_updating_commit_index(struct raft *raft)
3190 {
3191     /* This loop cannot just bail out when it comes across a log entry that
3192      * does not match the criteria.  For example, Figure 3.7(d2) shows a
3193      * case where the log entry for term 2 cannot be committed directly
3194      * (because it is not for the current term) but it can be committed as
3195      * a side effect of commit the entry for term 4 (the current term).
3196      * XXX Is there a more efficient way to do this? */
3197     ovs_assert(raft->role == RAFT_LEADER);
3198
3199     uint64_t new_commit_index = raft->commit_index;
3200     for (uint64_t idx = MAX(raft->commit_index + 1, raft->log_start);
3201          idx < raft->log_end; idx++) {
3202         if (raft->entries[idx - raft->log_start].term == raft->term) {
3203             size_t count = 0;
3204             struct raft_server *s2;
3205             HMAP_FOR_EACH (s2, hmap_node, &raft->servers) {
3206                 if (s2->match_index >= idx) {
3207                     count++;
3208                 }
3209             }
3210             if (count > hmap_count(&raft->servers) / 2) {
3211                 VLOG_DBG("index %"PRIu64" committed to %"PRIuSIZE" servers, "
3212                           "applying", idx, count);
3213                 new_commit_index = idx;
3214             }
3215         }
3216     }
3217     if (raft_update_commit_index(raft, new_commit_index)) {
3218         raft_send_heartbeats(raft);
3219     }
3220 }
3221
3222 static void
3223 raft_update_match_index(struct raft *raft, struct raft_server *s,
3224                         uint64_t min_index)
3225 {
3226     ovs_assert(raft->role == RAFT_LEADER);
3227     if (min_index > s->match_index) {
3228         s->match_index = min_index;
3229         raft_consider_updating_commit_index(raft);
3230     }
3231 }
3232
3233 static void
3234 raft_update_our_match_index(struct raft *raft, uint64_t min_index)
3235 {
3236     struct raft_server *server = raft_find_server(raft, &raft->sid);
3237     if (server) {
3238         raft_update_match_index(raft, server, min_index);
3239     }
3240 }
3241
3242 static void
3243 raft_send_install_snapshot_request(struct raft *raft,
3244                                    const struct raft_server *s,
3245                                    const char *comment)
3246 {
3247     union raft_rpc rpc = {
3248         .install_snapshot_request = {
3249             .common = {
3250                 .type = RAFT_RPC_INSTALL_SNAPSHOT_REQUEST,
3251                 .sid = s->sid,
3252                 .comment = CONST_CAST(char *, comment),
3253             },
3254             .term = raft->term,
3255             .last_index = raft->log_start - 1,
3256             .last_term = raft->snap.term,
3257             .last_servers = raft->snap.servers,
3258             .last_eid = raft->snap.eid,
3259             .data = raft->snap.data,
3260             .election_timer = raft->election_timer,
3261         }
3262     };
3263     raft_send(raft, &rpc);
3264 }
3265
3266 static void
3267 raft_handle_append_reply(struct raft *raft,
3268                          const struct raft_append_reply *rpy)
3269 {
3270     if (raft->role != RAFT_LEADER) {
3271         VLOG_INFO("rejected append_reply (not leader)");
3272         return;
3273     }
3274
3275     /* Most commonly we'd be getting an AppendEntries reply from a configured
3276      * server (e.g. a peer), but we can also get them from servers in the
3277      * process of being added. */
3278     struct raft_server *s = raft_find_peer(raft, &rpy->common.sid);
3279     if (!s) {
3280         s = raft_find_new_server(raft, &rpy->common.sid);
3281         if (!s) {
3282             VLOG_INFO("rejected append_reply from unknown server "SID_FMT,
3283                       SID_ARGS(&rpy->common.sid));
3284             return;
3285         }
3286     }
3287
3288     s->replied = true;
3289     if (rpy->result == RAFT_APPEND_OK) {
3290         /* Figure 3.1: "If successful, update nextIndex and matchIndex for
3291          * follower (section 3.5)." */
3292         uint64_t min_index = rpy->prev_log_index + rpy->n_entries + 1;
3293         if (s->next_index < min_index) {
3294             s->next_index = min_index;
3295         }
3296         raft_update_match_index(raft, s, min_index - 1);
3297     } else {
3298         /* Figure 3.1: "If AppendEntries fails because of log inconsistency,
3299          * decrement nextIndex and retry (section 3.5)."
3300          *
3301          * We also implement the optimization suggested in section 4.2.1:
3302          * "Various approaches can make nextIndex converge to its correct value
3303          * more quickly, including those described in Chapter 3. The simplest
3304          * approach to solving this particular problem of adding a new server,
3305          * however, is to have followers return the length of their logs in the
3306          * AppendEntries response; this allows the leader to cap the follower’s
3307          * nextIndex accordingly." */
3308         s->next_index = (s->next_index > 0
3309                          ? MIN(s->next_index - 1, rpy->log_end)
3310                          : 0);
3311
3312         if (rpy->result == RAFT_APPEND_IO_ERROR) {
3313             /* Append failed but not because of a log inconsistency.  Because
3314              * of the I/O error, there's no point in re-sending the append
3315              * immediately. */
3316             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3317             VLOG_INFO_RL(&rl, "%s reported I/O error", s->nickname);
3318             return;
3319         }
3320     }
3321
3322     /*
3323      * Our behavior here must depend on the value of next_index relative to
3324      * log_start and log_end.  There are three cases:
3325      *
3326      *        Case 1       |    Case 2     |      Case 3
3327      *   <---------------->|<------------->|<------------------>
3328      *                     |               |
3329      *
3330      *                     +---+---+---+---+
3331      *                   T | T | T | T | T |
3332      *                     +---+---+---+---+
3333      *                       ^               ^
3334      *                       |               |
3335      *                   log_start        log_end
3336      */
3337     if (s->next_index < raft->log_start) {
3338         /* Case 1. */
3339         raft_send_install_snapshot_request(raft, s, NULL);
3340     } else if (s->next_index < raft->log_end) {
3341         /* Case 2. */
3342         raft_send_append_request(raft, s, 1, NULL);
3343     } else {
3344         /* Case 3. */
3345         if (s->phase == RAFT_PHASE_CATCHUP) {
3346             s->phase = RAFT_PHASE_CAUGHT_UP;
3347             raft_run_reconfigure(raft);
3348         }
3349     }
3350 }
3351
3352 static bool
3353 raft_should_suppress_disruptive_server(struct raft *raft,
3354                                        const union raft_rpc *rpc)
3355 {
3356     if (rpc->type != RAFT_RPC_VOTE_REQUEST) {
3357         return false;
3358     }
3359
3360     /* Section 4.2.3 "Disruptive Servers" says:
3361      *
3362      *    ...if a server receives a RequestVote request within the minimum
3363      *    election timeout of hearing from a current leader, it does not update
3364      *    its term or grant its vote...
3365      *
3366      *    ...This change conflicts with the leadership transfer mechanism as
3367      *    described in Chapter 3, in which a server legitimately starts an
3368      *    election without waiting an election timeout.  In that case,
3369      *    RequestVote messages should be processed by other servers even when
3370      *    they believe a current cluster leader exists.  Those RequestVote
3371      *    requests can include a special flag to indicate this behavior (“I
3372      *    have permission to disrupt the leader--it told me to!”).
3373      *
3374      * This clearly describes how the followers should act, but not the leader.
3375      * We just ignore vote requests that arrive at a current leader.  This
3376      * seems to be fairly safe, since a majority other than the current leader
3377      * can still elect a new leader and the first AppendEntries from that new
3378      * leader will depose the current leader. */
3379     const struct raft_vote_request *rq = raft_vote_request_cast(rpc);
3380     if (rq->leadership_transfer) {
3381         return false;
3382     }
3383
3384     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3385     long long int now = time_msec();
3386     switch (raft->role) {
3387     case RAFT_LEADER:
3388         VLOG_WARN_RL(&rl, "ignoring vote request received as leader");
3389         return true;
3390
3391     case RAFT_FOLLOWER:
3392         if (now < raft->election_base + raft->election_timer) {
3393             VLOG_WARN_RL(&rl, "ignoring vote request received after only "
3394                          "%lld ms (minimum election time is %"PRIu64" ms)",
3395                          now - raft->election_base, raft->election_timer);
3396             return true;
3397         }
3398         return false;
3399
3400     case RAFT_CANDIDATE:
3401         return false;
3402
3403     default:
3404         OVS_NOT_REACHED();
3405     }
3406 }
3407
3408 /* Returns true if a reply should be sent. */
3409 static bool
3410 raft_handle_vote_request__(struct raft *raft,
3411                            const struct raft_vote_request *rq)
3412 {
3413     /* Figure 3.1: "If votedFor is null or candidateId, and candidate's vote is
3414      * at least as up-to-date as receiver's log, grant vote (sections 3.4,
3415      * 3.6)." */
3416     if (uuid_equals(&raft->vote, &rq->common.sid)) {
3417         /* Already voted for this candidate in this term.  Resend vote. */
3418         return true;
3419     } else if (!uuid_is_zero(&raft->vote)) {
3420         /* Already voted for different candidate in this term.  Send a reply
3421          * saying what candidate we did vote for.  This isn't a necessary part
3422          * of the Raft protocol but it can make debugging easier. */
3423         return true;
3424     }
3425
3426     /* Section 3.6.1: "The RequestVote RPC implements this restriction: the RPC
3427      * includes information about the candidate’s log, and the voter denies its
3428      * vote if its own log is more up-to-date than that of the candidate.  Raft
3429      * determines which of two logs is more up-to-date by comparing the index
3430      * and term of the last entries in the logs.  If the logs have last entries
3431      * with different terms, then the log with the later term is more
3432      * up-to-date.  If the logs end with the same term, then whichever log is
3433      * longer is more up-to-date." */
3434     uint64_t last_term = (raft->log_end > raft->log_start
3435                           ? raft->entries[raft->log_end - 1
3436                                           - raft->log_start].term
3437                           : raft->snap.term);
3438     if (last_term > rq->last_log_term
3439         || (last_term == rq->last_log_term
3440             && raft->log_end - 1 > rq->last_log_index)) {
3441         /* Our log is more up-to-date than the peer's.   Withhold vote. */
3442         return false;
3443     }
3444
3445     /* Record a vote for the peer. */
3446     if (!raft_set_term(raft, raft->term, &rq->common.sid)) {
3447         return false;
3448     }
3449
3450     raft_reset_election_timer(raft);
3451
3452     return true;
3453 }
3454
3455 static void
3456 raft_send_vote_reply(struct raft *raft, const struct uuid *dst,
3457                      const struct uuid *vote)
3458 {
3459     union raft_rpc rpy = {
3460         .vote_reply = {
3461             .common = {
3462                 .type = RAFT_RPC_VOTE_REPLY,
3463                 .sid = *dst,
3464             },
3465             .term = raft->term,
3466             .vote = *vote,
3467         },
3468     };
3469     raft_send(raft, &rpy);
3470 }
3471
3472 static void
3473 raft_handle_vote_request(struct raft *raft,
3474                          const struct raft_vote_request *rq)
3475 {
3476     if (raft_handle_vote_request__(raft, rq)) {
3477         raft_send_vote_reply(raft, &rq->common.sid, &raft->vote);
3478     }
3479 }
3480
3481 static void
3482 raft_handle_vote_reply(struct raft *raft,
3483                        const struct raft_vote_reply *rpy)
3484 {
3485     if (!raft_receive_term__(raft, &rpy->common, rpy->term)) {
3486         return;
3487     }
3488
3489     if (raft->role != RAFT_CANDIDATE) {
3490         return;
3491     }
3492
3493     struct raft_server *s = raft_find_peer(raft, &rpy->common.sid);
3494     if (s) {
3495         raft_accept_vote(raft, s, &rpy->vote);
3496     }
3497 }
3498
3499 /* Returns true if 'raft''s log contains reconfiguration entries that have not
3500  * yet been committed. */
3501 static bool
3502 raft_has_uncommitted_configuration(const struct raft *raft)
3503 {
3504     for (uint64_t i = raft->commit_index + 1; i < raft->log_end; i++) {
3505         ovs_assert(i >= raft->log_start);
3506         const struct raft_entry *e = &raft->entries[i - raft->log_start];
3507         if (e->servers) {
3508             return true;
3509         }
3510     }
3511     return false;
3512 }
3513
3514 static void
3515 raft_log_reconfiguration(struct raft *raft)
3516 {
3517     struct json *servers_json = raft_servers_to_json(&raft->servers);
3518     raft_command_unref(raft_command_execute__(
3519                            raft, NULL, servers_json, 0, NULL, NULL));
3520     json_destroy(servers_json);
3521 }
3522
3523 static void
3524 raft_run_reconfigure(struct raft *raft)
3525 {
3526     ovs_assert(raft->role == RAFT_LEADER);
3527
3528     /* Reconfiguration only progresses when configuration changes commit. */
3529     if (raft_has_uncommitted_configuration(raft)) {
3530         return;
3531     }
3532
3533     /* If we were waiting for a configuration change to commit, it's done. */
3534     struct raft_server *s;
3535     HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
3536         if (s->phase == RAFT_PHASE_COMMITTING) {
3537             raft_send_add_server_reply__(raft, &s->sid, s->address,
3538                                          true, RAFT_SERVER_COMPLETED);
3539             s->phase = RAFT_PHASE_STABLE;
3540         }
3541     }
3542     if (raft->remove_server) {
3543         raft_send_remove_server_reply__(raft, &raft->remove_server->sid,
3544                                         &raft->remove_server->requester_sid,
3545                                         raft->remove_server->requester_conn,
3546                                         true, RAFT_SERVER_COMPLETED);
3547         raft_server_destroy(raft->remove_server);
3548         raft->remove_server = NULL;
3549     }
3550
3551     /* If a new server is caught up, add it to the configuration.  */
3552     HMAP_FOR_EACH (s, hmap_node, &raft->add_servers) {
3553         if (s->phase == RAFT_PHASE_CAUGHT_UP) {
3554             /* Move 's' from 'raft->add_servers' to 'raft->servers'. */
3555             hmap_remove(&raft->add_servers, &s->hmap_node);
3556             hmap_insert(&raft->servers, &s->hmap_node, uuid_hash(&s->sid));
3557
3558             /* Mark 's' as waiting for commit. */
3559             s->phase = RAFT_PHASE_COMMITTING;
3560
3561             raft_log_reconfiguration(raft);
3562
3563             /* When commit completes we'll transition to RAFT_PHASE_STABLE and
3564              * send a RAFT_SERVER_OK reply. */
3565
3566             return;
3567         }
3568     }
3569
3570     /* Remove a server, if one is scheduled for removal. */
3571     HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
3572         if (s->phase == RAFT_PHASE_REMOVE) {
3573             hmap_remove(&raft->servers, &s->hmap_node);
3574             raft->remove_server = s;
3575
3576             raft_log_reconfiguration(raft);
3577
3578             return;
3579         }
3580     }
3581 }
3582
3583 static void
3584 raft_handle_add_server_request(struct raft *raft,
3585                                const struct raft_add_server_request *rq)
3586 {
3587     /* Figure 4.1: "1. Reply NOT_LEADER if not leader (section 6.2)." */
3588     if (raft->role != RAFT_LEADER) {
3589         raft_send_add_server_reply(raft, rq, false, RAFT_SERVER_NOT_LEADER);
3590         return;
3591     }
3592
3593     /* Check for an existing server. */
3594     struct raft_server *s = raft_find_server(raft, &rq->common.sid);
3595     if (s) {
3596         /* If the server is scheduled to be removed, cancel it. */
3597         if (s->phase == RAFT_PHASE_REMOVE) {
3598             s->phase = RAFT_PHASE_STABLE;
3599             raft_send_add_server_reply(raft, rq, false, RAFT_SERVER_CANCELED);
3600             return;
3601         }
3602
3603         /* If the server is being added, then it's in progress. */
3604         if (s->phase != RAFT_PHASE_STABLE) {
3605             raft_send_add_server_reply(raft, rq,
3606                                        false, RAFT_SERVER_IN_PROGRESS);
3607         }
3608
3609         /* Nothing to do--server is already part of the configuration. */
3610         raft_send_add_server_reply(raft, rq,
3611                                    true, RAFT_SERVER_ALREADY_PRESENT);
3612         return;
3613     }
3614
3615     /* Check for a server being removed. */
3616     if (raft->remove_server
3617         && uuid_equals(&rq->common.sid, &raft->remove_server->sid)) {
3618         raft_send_add_server_reply(raft, rq, false, RAFT_SERVER_COMMITTING);
3619         return;
3620     }
3621
3622     /* Check for a server already being added. */
3623     if (raft_find_new_server(raft, &rq->common.sid)) {
3624         raft_send_add_server_reply(raft, rq, false, RAFT_SERVER_IN_PROGRESS);
3625         return;
3626     }
3627
3628     /* Add server to 'add_servers'. */
3629     s = raft_server_add(&raft->add_servers, &rq->common.sid, rq->address);
3630     raft_server_init_leader(raft, s);
3631     s->requester_sid = rq->common.sid;
3632     s->requester_conn = NULL;
3633     s->phase = RAFT_PHASE_CATCHUP;
3634
3635     /* Start sending the log.  If this is the first time we've tried to add
3636      * this server, then this will quickly degenerate into an InstallSnapshot
3637      * followed by a series of AddEntries, but if it's a retry of an earlier
3638      * AddRequest that was interrupted (e.g. by a timeout or a loss of
3639      * leadership) then it will gracefully resume populating the log.
3640      *
3641      * See the last few paragraphs of section 4.2.1 for further insight. */
3642     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
3643     VLOG_INFO_RL(&rl,
3644                  "starting to add server %s ("SID_FMT" at %s) "
3645                  "to cluster "CID_FMT, s->nickname, SID_ARGS(&s->sid),
3646                  rq->address, CID_ARGS(&raft->cid));
3647     raft_send_append_request(raft, s, 0, "initialize new server");
3648 }
3649
3650 static void
3651 raft_handle_add_server_reply(struct raft *raft,
3652                              const struct raft_add_server_reply *rpy)
3653 {
3654     if (!raft->joining) {
3655         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3656         VLOG_WARN_RL(&rl, "received add_server_reply even though we're "
3657                      "already part of the cluster");
3658         return;
3659     }
3660
3661     if (rpy->success) {
3662         raft->joining = false;
3663
3664         /* It is tempting, at this point, to check that this server is part of
3665          * the current configuration.  However, this is not necessarily the
3666          * case, because the log entry that added this server to the cluster
3667          * might have been committed by a majority of the cluster that does not
3668          * include this one.  This actually happens in testing. */
3669     } else {
3670         const char *address;
3671         SSET_FOR_EACH (address, &rpy->remote_addresses) {
3672             if (sset_add(&raft->remote_addresses, address)) {
3673                 VLOG_INFO("%s: learned new server address for joining cluster",
3674                           address);
3675             }
3676         }
3677     }
3678 }
3679
3680 /* This is called by raft_unixctl_kick() as well as via RPC. */
3681 static void
3682 raft_handle_remove_server_request(struct raft *raft,
3683                                   const struct raft_remove_server_request *rq)
3684 {
3685     /* Figure 4.1: "1. Reply NOT_LEADER if not leader (section 6.2)." */
3686     if (raft->role != RAFT_LEADER) {
3687         raft_send_remove_server_reply(raft, rq, false, RAFT_SERVER_NOT_LEADER);
3688         return;
3689     }
3690
3691     /* If the server to remove is currently waiting to be added, cancel it. */
3692     struct raft_server *target = raft_find_new_server(raft, &rq->sid);
3693     if (target) {
3694         raft_send_add_server_reply__(raft, &target->sid, target->address,
3695                                      false, RAFT_SERVER_CANCELED);
3696         hmap_remove(&raft->add_servers, &target->hmap_node);
3697         raft_server_destroy(target);
3698         return;
3699     }
3700
3701     /* If the server isn't configured, report that. */
3702     target = raft_find_server(raft, &rq->sid);
3703     if (!target) {
3704         raft_send_remove_server_reply(raft, rq,
3705                                       true, RAFT_SERVER_ALREADY_GONE);
3706         return;
3707     }
3708
3709     /* Check whether we're waiting for the addition of the server to commit. */
3710     if (target->phase == RAFT_PHASE_COMMITTING) {
3711         raft_send_remove_server_reply(raft, rq, false, RAFT_SERVER_COMMITTING);
3712         return;
3713     }
3714
3715     /* Check whether the server is already scheduled for removal. */
3716     if (target->phase == RAFT_PHASE_REMOVE) {
3717         raft_send_remove_server_reply(raft, rq,
3718                                       false, RAFT_SERVER_IN_PROGRESS);
3719         return;
3720     }
3721
3722     /* Make sure that if we remove this server then that at least one other
3723      * server will be left.  We don't count servers currently being added (in
3724      * 'add_servers') since those could fail. */
3725     struct raft_server *s;
3726     int n = 0;
3727     HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
3728         if (s != target && s->phase != RAFT_PHASE_REMOVE) {
3729             n++;
3730         }
3731     }
3732     if (!n) {
3733         raft_send_remove_server_reply(raft, rq, false, RAFT_SERVER_EMPTY);
3734         return;
3735     }
3736
3737     /* Mark the server for removal. */
3738     target->phase = RAFT_PHASE_REMOVE;
3739     if (rq->requester_conn) {
3740         target->requester_sid = UUID_ZERO;
3741         unixctl_command_reply(rq->requester_conn, "started removal");
3742     } else {
3743         target->requester_sid = rq->common.sid;
3744         target->requester_conn = NULL;
3745     }
3746
3747     raft_run_reconfigure(raft);
3748     /* Operation in progress, reply will be sent later. */
3749 }
3750
3751 static void
3752 raft_finished_leaving_cluster(struct raft *raft)
3753 {
3754     VLOG_INFO(SID_FMT": finished leaving cluster "CID_FMT,
3755               SID_ARGS(&raft->sid), CID_ARGS(&raft->cid));
3756
3757     raft_record_note(raft, "left", "this server left the cluster");
3758
3759     raft->leaving = false;
3760     raft->left = true;
3761 }
3762
3763 static void
3764 raft_handle_remove_server_reply(struct raft *raft,
3765                                 const struct raft_remove_server_reply *rpc)
3766 {
3767     if (rpc->success
3768         && (uuid_is_zero(&rpc->target_sid)
3769             || uuid_equals(&rpc->target_sid, &raft->sid))) {
3770         raft_finished_leaving_cluster(raft);
3771     }
3772 }
3773
3774 static bool
3775 raft_handle_write_error(struct raft *raft, struct ovsdb_error *error)
3776 {
3777     if (error && !raft->failed) {
3778         raft->failed = true;
3779
3780         char *s = ovsdb_error_to_string_free(error);
3781         VLOG_WARN("%s: entering failure mode due to I/O error (%s)",
3782                   raft->name, s);
3783         free(s);
3784     }
3785     return !raft->failed;
3786 }
3787
3788 static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
3789 raft_write_snapshot(struct raft *raft, struct ovsdb_log *log,
3790                     uint64_t new_log_start,
3791                     const struct raft_entry *new_snapshot)
3792 {
3793     struct raft_header h = {
3794         .sid = raft->sid,
3795         .cid = raft->cid,
3796         .name = raft->name,
3797         .local_address = raft->local_address,
3798         .snap_index = new_log_start - 1,
3799         .snap = *new_snapshot,
3800     };
3801     struct ovsdb_error *error = ovsdb_log_write_and_free(
3802         log, raft_header_to_json(&h));
3803     if (error) {
3804         return error;
3805     }
3806     ovsdb_log_mark_base(raft->log);
3807
3808     /* Write log records. */
3809     for (uint64_t index = new_log_start; index < raft->log_end; index++) {
3810         const struct raft_entry *e = &raft->entries[index - raft->log_start];
3811         struct raft_record r = {
3812             .type = RAFT_REC_ENTRY,
3813             .term = e->term,
3814             .entry = {
3815                 .index = index,
3816                 .data = e->data,
3817                 .servers = e->servers,
3818                 .election_timer = e->election_timer,
3819                 .eid = e->eid,
3820             },
3821         };
3822         error = ovsdb_log_write_and_free(log, raft_record_to_json(&r));
3823         if (error) {
3824             return error;
3825         }
3826     }
3827
3828     /* Write term and vote (if any).
3829      *
3830      * The term is redundant if we wrote a log record for that term above.  The
3831      * vote, if any, is never redundant.
3832      */
3833     error = raft_write_state(log, raft->term, &raft->vote);
3834     if (error) {
3835         return error;
3836     }
3837
3838     /* Write commit_index if it's beyond the new start of the log. */
3839     if (raft->commit_index >= new_log_start) {
3840         struct raft_record r = {
3841             .type = RAFT_REC_COMMIT_INDEX,
3842             .commit_index = raft->commit_index,
3843         };
3844         return ovsdb_log_write_and_free(log, raft_record_to_json(&r));
3845     }
3846     return NULL;
3847 }
3848
3849 static struct ovsdb_error * OVS_WARN_UNUSED_RESULT
3850 raft_save_snapshot(struct raft *raft,
3851                    uint64_t new_start, const struct raft_entry *new_snapshot)
3852
3853 {
3854     struct ovsdb_log *new_log;
3855     struct ovsdb_error *error;
3856     error = ovsdb_log_replace_start(raft->log, &new_log);
3857     if (error) {
3858         return error;
3859     }
3860
3861     error = raft_write_snapshot(raft, new_log, new_start, new_snapshot);
3862     if (error) {
3863         ovsdb_log_replace_abort(new_log);
3864         return error;
3865     }
3866
3867     return ovsdb_log_replace_commit(raft->log, new_log);
3868 }
3869
3870 static bool
3871 raft_handle_install_snapshot_request__(
3872     struct raft *raft, const struct raft_install_snapshot_request *rq)
3873 {
3874     raft_reset_election_timer(raft);
3875
3876     /*
3877      * Our behavior here depend on new_log_start in the snapshot compared to
3878      * log_start and log_end.  There are three cases:
3879      *
3880      *        Case 1       |    Case 2     |      Case 3
3881      *   <---------------->|<------------->|<------------------>
3882      *                     |               |
3883      *
3884      *                     +---+---+---+---+
3885      *                   T | T | T | T | T |
3886      *                     +---+---+---+---+
3887      *                       ^               ^
3888      *                       |               |
3889      *                   log_start        log_end
3890      */
3891     uint64_t new_log_start = rq->last_index + 1;
3892     if (new_log_start < raft->log_start) {
3893         /* Case 1: The new snapshot covers less than our current one.  Nothing
3894          * to do. */
3895         return true;
3896     } else if (new_log_start < raft->log_end) {
3897         /* Case 2: The new snapshot starts in the middle of our log.  We could
3898          * discard the first 'new_log_start - raft->log_start' entries in the
3899          * log.  But there's not much value in that, since snapshotting is
3900          * supposed to be a local decision.  Just skip it. */
3901         return true;
3902     }
3903
3904     /* Case 3: The new snapshot starts past the end of our current log, so
3905      * discard all of our current log. */
3906     const struct raft_entry new_snapshot = {
3907         .term = rq->last_term,
3908         .data = rq->data,
3909         .eid = rq->last_eid,
3910         .servers = rq->last_servers,
3911         .election_timer = rq->election_timer,
3912     };
3913     struct ovsdb_error *error = raft_save_snapshot(raft, new_log_start,
3914                                                    &new_snapshot);
3915     if (error) {
3916         char *error_s = ovsdb_error_to_string(error);
3917         VLOG_WARN("could not save snapshot: %s", error_s);
3918         free(error_s);
3919         return false;
3920     }
3921
3922     for (size_t i = 0; i < raft->log_end - raft->log_start; i++) {
3923         raft_entry_uninit(&raft->entries[i]);
3924     }
3925     raft->log_start = raft->log_end = new_log_start;
3926     raft->log_synced = raft->log_end - 1;
3927     raft->commit_index = raft->log_start - 1;
3928     if (raft->last_applied < raft->commit_index) {
3929         raft->last_applied = raft->log_start - 2;
3930     }
3931
3932     raft_entry_uninit(&raft->snap);
3933     raft_entry_clone(&raft->snap, &new_snapshot);
3934
3935     raft_get_servers_from_log(raft, VLL_INFO);
3936     raft_get_election_timer_from_log(raft);
3937
3938     return true;
3939 }
3940
3941 static void
3942 raft_handle_install_snapshot_request(
3943     struct raft *raft, const struct raft_install_snapshot_request *rq)
3944 {
3945     if (raft_handle_install_snapshot_request__(raft, rq)) {
3946         union raft_rpc rpy = {
3947             .install_snapshot_reply = {
3948                 .common = {
3949                     .type = RAFT_RPC_INSTALL_SNAPSHOT_REPLY,
3950                     .sid = rq->common.sid,
3951                 },
3952                 .term = raft->term,
3953                 .last_index = rq->last_index,
3954                 .last_term = rq->last_term,
3955             },
3956         };
3957         raft_send(raft, &rpy);
3958     }
3959 }
3960
3961 static void
3962 raft_handle_install_snapshot_reply(
3963     struct raft *raft, const struct raft_install_snapshot_reply *rpy)
3964 {
3965     /* We might get an InstallSnapshot reply from a configured server (e.g. a
3966      * peer) or a server in the process of being added. */
3967     struct raft_server *s = raft_find_peer(raft, &rpy->common.sid);
3968     if (!s) {
3969         s = raft_find_new_server(raft, &rpy->common.sid);
3970         if (!s) {
3971             static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3972             VLOG_INFO_RL(&rl, "cluster "CID_FMT": received %s from "
3973                          "unknown server "SID_FMT, CID_ARGS(&raft->cid),
3974                          raft_rpc_type_to_string(rpy->common.type),
3975                          SID_ARGS(&rpy->common.sid));
3976             return;
3977         }
3978     }
3979
3980     if (rpy->last_index != raft->log_start - 1 ||
3981         rpy->last_term != raft->snap.term) {
3982         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
3983         VLOG_INFO_RL(&rl, "cluster "CID_FMT": server %s installed "
3984                      "out-of-date snapshot, starting over",
3985                      CID_ARGS(&raft->cid), s->nickname);
3986         raft_send_install_snapshot_request(raft, s,
3987                                            "installed obsolete snapshot");
3988         return;
3989     }
3990
3991     static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(10, 10);
3992     VLOG_INFO_RL(&rl, "cluster "CID_FMT": installed snapshot on server %s "
3993                  " up to %"PRIu64":%"PRIu64, CID_ARGS(&raft->cid),
3994                  s->nickname, rpy->last_term, rpy->last_index);
3995     s->next_index = raft->log_end;
3996     raft_send_append_request(raft, s, 0, "snapshot installed");
3997 }
3998
3999 /* Returns true if 'raft' has grown enough since the last snapshot that
4000  * reducing the log to a snapshot would be valuable, false otherwise.  */
4001 bool
4002 raft_grew_lots(const struct raft *raft)
4003 {
4004     return ovsdb_log_grew_lots(raft->log);
4005 }
4006
4007 /* Returns the number of log entries that could be trimmed off the on-disk log
4008  * by snapshotting. */
4009 uint64_t
4010 raft_get_log_length(const struct raft *raft)
4011 {
4012     return (raft->last_applied < raft->log_start
4013             ? 0
4014             : raft->last_applied - raft->log_start + 1);
4015 }
4016
4017 /* Returns true if taking a snapshot of 'raft', with raft_store_snapshot(), is
4018  * possible. */
4019 bool
4020 raft_may_snapshot(const struct raft *raft)
4021 {
4022     return (!raft->joining
4023             && !raft->leaving
4024             && !raft->left
4025             && !raft->failed
4026             && raft->last_applied >= raft->log_start);
4027 }
4028
4029 /* Replaces the log for 'raft', up to the last log entry read, by
4030  * 'new_snapshot_data'.  Returns NULL if successful, otherwise an error that
4031  * the caller must eventually free.
4032  *
4033  * This function can only succeed if raft_may_snapshot() returns true.  It is
4034  * only valuable to call it if raft_get_log_length() is significant and
4035  * especially if raft_grew_lots() returns true. */
4036 struct ovsdb_error * OVS_WARN_UNUSED_RESULT
4037 raft_store_snapshot(struct raft *raft, const struct json *new_snapshot_data)
4038 {
4039     if (raft->joining) {
4040         return ovsdb_error(NULL,
4041                            "cannot store a snapshot while joining cluster");
4042     } else if (raft->leaving) {
4043         return ovsdb_error(NULL,
4044                            "cannot store a snapshot while leaving cluster");
4045     } else if (raft->left) {
4046         return ovsdb_error(NULL,
4047                            "cannot store a snapshot after leaving cluster");
4048     } else if (raft->failed) {
4049         return ovsdb_error(NULL,
4050                            "cannot store a snapshot following failure");
4051     }
4052
4053     if (raft->last_applied < raft->log_start) {
4054         return ovsdb_error(NULL, "not storing a duplicate snapshot");
4055     }
4056
4057     uint64_t new_log_start = raft->last_applied + 1;
4058     struct raft_entry new_snapshot = {
4059         .term = raft_get_term(raft, new_log_start - 1),
4060         .data = json_clone(new_snapshot_data),
4061         .eid = *raft_get_eid(raft, new_log_start - 1),
4062         .servers = json_clone(raft_servers_for_index(raft, new_log_start - 1)),
4063         .election_timer = raft->election_timer,
4064     };
4065     struct ovsdb_error *error = raft_save_snapshot(raft, new_log_start,
4066                                                    &new_snapshot);
4067     if (error) {
4068         raft_entry_uninit(&new_snapshot);
4069         return error;
4070     }
4071
4072     raft->log_synced = raft->log_end - 1;
4073     raft_entry_uninit(&raft->snap);
4074     raft->snap = new_snapshot;
4075     for (size_t i = 0; i < new_log_start - raft->log_start; i++) {
4076         raft_entry_uninit(&raft->entries[i]);
4077     }
4078     memmove(&raft->entries[0], &raft->entries[new_log_start - raft->log_start],
4079             (raft->log_end - new_log_start) * sizeof *raft->entries);
4080     raft->log_start = new_log_start;
4081     return NULL;
4082 }
4083
4084 static void
4085 raft_handle_become_leader(struct raft *raft,
4086                           const struct raft_become_leader *rq)
4087 {
4088     if (raft->role == RAFT_FOLLOWER) {
4089         char buf[SID_LEN + 1];
4090         VLOG_INFO("received leadership transfer from %s in term %"PRIu64,
4091                   raft_get_nickname(raft, &rq->common.sid, buf, sizeof buf),
4092                   rq->term);
4093         raft_start_election(raft, true);
4094     }
4095 }
4096
4097 static void
4098 raft_send_execute_command_reply(struct raft *raft,
4099                                 const struct uuid *sid,
4100                                 const struct uuid *eid,
4101                                 enum raft_command_status status,
4102                                 uint64_t commit_index)
4103 {
4104     if (failure_test == FT_CRASH_BEFORE_SEND_EXEC_REP) {
4105         ovs_fatal(0, "Raft test: crash before sending execute_command_reply");
4106     }
4107     union raft_rpc rpc = {
4108         .execute_command_reply = {
4109             .common = {
4110                 .type = RAFT_RPC_EXECUTE_COMMAND_REPLY,
4111                 .sid = *sid,
4112             },
4113             .result = *eid,
4114             .status = status,
4115             .commit_index = commit_index,
4116         },
4117     };
4118     raft_send(raft, &rpc);
4119     if (failure_test == FT_CRASH_AFTER_SEND_EXEC_REP) {
4120         ovs_fatal(0, "Raft test: crash after sending execute_command_reply.");
4121     }
4122 }
4123
4124 static enum raft_command_status
4125 raft_handle_execute_command_request__(
4126     struct raft *raft, const struct raft_execute_command_request *rq)
4127 {
4128     if (raft->role != RAFT_LEADER) {
4129         return RAFT_CMD_NOT_LEADER;
4130     }
4131
4132     const struct uuid *current_eid = raft_current_eid(raft);
4133     if (!uuid_equals(&rq->prereq, current_eid)) {
4134         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
4135         VLOG_INFO_RL(&rl, "current entry eid "UUID_FMT" does not match "
4136                      "prerequisite "UUID_FMT" in execute_command_request",
4137                      UUID_ARGS(current_eid), UUID_ARGS(&rq->prereq));
4138         return RAFT_CMD_BAD_PREREQ;
4139     }
4140
4141     struct raft_command *cmd = raft_command_initiate(raft, rq->data,
4142                                                      NULL, 0, &rq->result);
4143     cmd->sid = rq->common.sid;
4144
4145     enum raft_command_status status = cmd->status;
4146     if (status != RAFT_CMD_INCOMPLETE) {
4147         raft_command_unref(cmd);
4148     }
4149     return status;
4150 }
4151
4152 static void
4153 raft_handle_execute_command_request(
4154     struct raft *raft, const struct raft_execute_command_request *rq)
4155 {
4156     enum raft_command_status status
4157         = raft_handle_execute_command_request__(raft, rq);
4158     if (status != RAFT_CMD_INCOMPLETE) {
4159         raft_send_execute_command_reply(raft, &rq->common.sid, &rq->result,
4160                                         status, 0);
4161     }
4162 }
4163
4164 static void
4165 raft_handle_execute_command_reply(
4166     struct raft *raft, const struct raft_execute_command_reply *rpy)
4167 {
4168     struct raft_command *cmd = raft_find_command_by_eid(raft, &rpy->result);
4169     if (!cmd) {
4170         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5);
4171         char buf[SID_LEN + 1];
4172         VLOG_INFO_RL(&rl,
4173                      "%s received \"%s\" reply from %s for unknown command",
4174                      raft->local_nickname,
4175                      raft_command_status_to_string(rpy->status),
4176                      raft_get_nickname(raft, &rpy->common.sid,
4177                                        buf, sizeof buf));
4178         return;
4179     }
4180
4181     if (rpy->status == RAFT_CMD_INCOMPLETE) {
4182         cmd->timestamp = time_msec();
4183     } else {
4184         cmd->index = rpy->commit_index;
4185         raft_command_complete(raft, cmd, rpy->status);
4186     }
4187 }
4188
4189 static void
4190 raft_handle_rpc(struct raft *raft, const union raft_rpc *rpc)
4191 {
4192     uint64_t term = raft_rpc_get_term(rpc);
4193     if (term
4194         && !raft_should_suppress_disruptive_server(raft, rpc)
4195         && !raft_receive_term__(raft, &rpc->common, term)) {
4196         if (rpc->type == RAFT_RPC_APPEND_REQUEST) {
4197             /* Section 3.3: "If a server receives a request with a stale term
4198              * number, it rejects the request." */
4199             raft_send_append_reply(raft, raft_append_request_cast(rpc),
4200                                    RAFT_APPEND_INCONSISTENCY, "stale term");
4201         }
4202         return;
4203     }
4204
4205     switch (rpc->type) {
4206 #define RAFT_RPC(ENUM, NAME)                        \
4207         case ENUM:                                  \
4208             raft_handle_##NAME(raft, &rpc->NAME);   \
4209             break;
4210     RAFT_RPC_TYPES
4211 #undef RAFT_RPC
4212     default:
4213         OVS_NOT_REACHED();
4214     }
4215 }
4216 \f
4217 static bool
4218 raft_rpc_is_heartbeat(const union raft_rpc *rpc)
4219 {
4220     return ((rpc->type == RAFT_RPC_APPEND_REQUEST
4221              || rpc->type == RAFT_RPC_APPEND_REPLY)
4222              && rpc->common.comment
4223              && !strcmp(rpc->common.comment, "heartbeat"));
4224 }
4225
4226 \f
4227 static bool
4228 raft_send_to_conn_at(struct raft *raft, const union raft_rpc *rpc,
4229                      struct raft_conn *conn, int line_number)
4230 {
4231     log_rpc(rpc, "-->", conn, line_number);
4232     return !jsonrpc_session_send(
4233         conn->js, raft_rpc_to_jsonrpc(&raft->cid, &raft->sid, rpc));
4234 }
4235
4236 static bool
4237 raft_is_rpc_synced(const struct raft *raft, const union raft_rpc *rpc)
4238 {
4239     uint64_t term = raft_rpc_get_term(rpc);
4240     uint64_t index = raft_rpc_get_min_sync_index(rpc);
4241     const struct uuid *vote = raft_rpc_get_vote(rpc);
4242
4243     return (term <= raft->synced_term
4244             && index <= raft->log_synced
4245             && (!vote || uuid_equals(vote, &raft->synced_vote)));
4246 }
4247
4248 static bool
4249 raft_send_at(struct raft *raft, const union raft_rpc *rpc, int line_number)
4250 {
4251     const struct uuid *dst = &rpc->common.sid;
4252     if (uuid_equals(dst, &raft->sid)) {
4253         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
4254         VLOG_WARN_RL(&rl, "attempted to send RPC to self from raft.c:%d",
4255                      line_number);
4256         return false;
4257     }
4258
4259     struct raft_conn *conn = raft_find_conn_by_sid(raft, dst);
4260     if (!conn) {
4261         static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
4262         char buf[SID_LEN + 1];
4263         VLOG_DBG_RL(&rl, "%s: no connection to %s, cannot send RPC "
4264                     "from raft.c:%d", raft->local_nickname,
4265                     raft_get_nickname(raft, dst, buf, sizeof buf),
4266                     line_number);
4267         return false;
4268     }
4269
4270     if (!raft_is_rpc_synced(raft, rpc)) {
4271         raft_waiter_create(raft, RAFT_W_RPC, false)->rpc = raft_rpc_clone(rpc);
4272         return true;
4273     }
4274
4275     return raft_send_to_conn_at(raft, rpc, conn, line_number);
4276 }
4277 \f
4278 static struct raft *
4279 raft_lookup_by_name(const char *name)
4280 {
4281     struct raft *raft;
4282
4283     HMAP_FOR_EACH_WITH_HASH (raft, hmap_node, hash_string(name, 0),
4284                              &all_rafts) {
4285         if (!strcmp(raft->name, name)) {
4286             return raft;
4287         }
4288     }
4289     return NULL;
4290 }
4291
4292 static void
4293 raft_unixctl_cid(struct unixctl_conn *conn,
4294                  int argc OVS_UNUSED, const char *argv[],
4295                  void *aux OVS_UNUSED)
4296 {
4297     struct raft *raft = raft_lookup_by_name(argv[1]);
4298     if (!raft) {
4299         unixctl_command_reply_error(conn, "unknown cluster");
4300     } else if (uuid_is_zero(&raft->cid)) {
4301         unixctl_command_reply_error(conn, "cluster id not yet known");
4302     } else {
4303         char *uuid = xasprintf(UUID_FMT, UUID_ARGS(&raft->cid));
4304         unixctl_command_reply(conn, uuid);
4305         free(uuid);
4306     }
4307 }
4308
4309 static void
4310 raft_unixctl_sid(struct unixctl_conn *conn,
4311                  int argc OVS_UNUSED, const char *argv[],
4312                  void *aux OVS_UNUSED)
4313 {
4314     struct raft *raft = raft_lookup_by_name(argv[1]);
4315     if (!raft) {
4316         unixctl_command_reply_error(conn, "unknown cluster");
4317     } else {
4318         char *uuid = xasprintf(UUID_FMT, UUID_ARGS(&raft->sid));
4319         unixctl_command_reply(conn, uuid);
4320         free(uuid);
4321     }
4322 }
4323
4324 static void
4325 raft_put_sid(const char *title, const struct uuid *sid,
4326              const struct raft *raft, struct ds *s)
4327 {
4328     ds_put_format(s, "%s: ", title);
4329     if (uuid_equals(sid, &raft->sid)) {
4330         ds_put_cstr(s, "self");
4331     } else if (uuid_is_zero(sid)) {
4332         ds_put_cstr(s, "unknown");
4333     } else {
4334         char buf[SID_LEN + 1];
4335         ds_put_cstr(s, raft_get_nickname(raft, sid, buf, sizeof buf));
4336     }
4337     ds_put_char(s, '\n');
4338 }
4339
4340 static void
4341 raft_unixctl_status(struct unixctl_conn *conn,
4342                     int argc OVS_UNUSED, const char *argv[],
4343                     void *aux OVS_UNUSED)
4344 {
4345     struct raft *raft = raft_lookup_by_name(argv[1]);
4346     if (!raft) {
4347         unixctl_command_reply_error(conn, "unknown cluster");
4348         return;
4349     }
4350
4351     struct ds s = DS_EMPTY_INITIALIZER;
4352     ds_put_format(&s, "%s\n", raft->local_nickname);
4353     ds_put_format(&s, "Name: %s\n", raft->name);
4354     ds_put_format(&s, "Cluster ID: ");
4355     if (!uuid_is_zero(&raft->cid)) {
4356         ds_put_format(&s, CID_FMT" ("UUID_FMT")\n",
4357                       CID_ARGS(&raft->cid), UUID_ARGS(&raft->cid));
4358     } else {
4359         ds_put_format(&s, "not yet known\n");
4360     }
4361     ds_put_format(&s, "Server ID: "SID_FMT" ("UUID_FMT")\n",
4362                   SID_ARGS(&raft->sid), UUID_ARGS(&raft->sid));
4363     ds_put_format(&s, "Address: %s\n", raft->local_address);
4364     ds_put_format(&s, "Status: %s\n",
4365                   raft->joining ? "joining cluster"
4366                   : raft->leaving ? "leaving cluster"
4367                   : raft->left ? "left cluster"
4368                   : raft->failed ? "failed"
4369                   : "cluster member");
4370     if (raft->joining) {
4371         ds_put_format(&s, "Remotes for joining:");
4372         const char *address;
4373         SSET_FOR_EACH (address, &raft->remote_addresses) {
4374             ds_put_format(&s, " %s", address);
4375         }
4376         ds_put_char(&s, '\n');
4377     }
4378     if (raft->role == RAFT_LEADER) {
4379         struct raft_server *as;
4380         HMAP_FOR_EACH (as, hmap_node, &raft->add_servers) {
4381             ds_put_format(&s, "Adding server %s ("SID_FMT" at %s) (%s)\n",
4382                           as->nickname, SID_ARGS(&as->sid), as->address,
4383                           raft_server_phase_to_string(as->phase));
4384         }
4385
4386         struct raft_server *rs = raft->remove_server;
4387         if (rs) {
4388             ds_put_format(&s, "Removing server %s ("SID_FMT" at %s) (%s)\n",
4389                           rs->nickname, SID_ARGS(&rs->sid), rs->address,
4390                           raft_server_phase_to_string(rs->phase));
4391         }
4392     }
4393
4394     ds_put_format(&s, "Role: %s\n",
4395                   raft->role == RAFT_LEADER ? "leader"
4396                   : raft->role == RAFT_CANDIDATE ? "candidate"
4397                   : raft->role == RAFT_FOLLOWER ? "follower"
4398                   : "<error>");
4399     ds_put_format(&s, "Term: %"PRIu64"\n", raft->term);
4400     raft_put_sid("Leader", &raft->leader_sid, raft, &s);
4401     raft_put_sid("Vote", &raft->vote, raft, &s);
4402     ds_put_char(&s, '\n');
4403
4404     ds_put_format(&s, "Election timer: %"PRIu64, raft->election_timer);
4405     if (raft->role == RAFT_LEADER && raft->election_timer_new) {
4406         ds_put_format(&s, " (changing to %"PRIu64")",
4407                       raft->election_timer_new);
4408     }
4409     ds_put_char(&s, '\n');
4410
4411     ds_put_format(&s, "Log: [%"PRIu64", %"PRIu64"]\n",
4412                   raft->log_start, raft->log_end);
4413
4414     uint64_t n_uncommitted = raft->log_end - raft->commit_index - 1;
4415     ds_put_format(&s, "Entries not yet committed: %"PRIu64"\n", n_uncommitted);
4416
4417     uint64_t n_unapplied = raft->log_end - raft->last_applied - 1;
4418     ds_put_format(&s, "Entries not yet applied: %"PRIu64"\n", n_unapplied);
4419
4420     const struct raft_conn *c;
4421     ds_put_cstr(&s, "Connections:");
4422     LIST_FOR_EACH (c, list_node, &raft->conns) {
4423         bool connected = jsonrpc_session_is_connected(c->js);
4424         ds_put_format(&s, " %s%s%s%s",
4425                       connected ? "" : "(",
4426                       c->incoming ? "<-" : "->", c->nickname,
4427                       connected ? "" : ")");
4428     }
4429     ds_put_char(&s, '\n');
4430
4431     ds_put_cstr(&s, "Servers:\n");
4432     struct raft_server *server;
4433     HMAP_FOR_EACH (server, hmap_node, &raft->servers) {
4434         ds_put_format(&s, "    %s ("SID_FMT" at %s)",
4435                       server->nickname,
4436                       SID_ARGS(&server->sid), server->address);
4437         if (uuid_equals(&server->sid, &raft->sid)) {
4438             ds_put_cstr(&s, " (self)");
4439         }
4440         if (server->phase != RAFT_PHASE_STABLE) {
4441             ds_put_format (&s, " (%s)",
4442                            raft_server_phase_to_string(server->phase));
4443         }
4444         if (raft->role == RAFT_CANDIDATE) {
4445             if (!uuid_is_zero(&server->vote)) {
4446                 char buf[SID_LEN + 1];
4447                 ds_put_format(&s, " (voted for %s)",
4448                               raft_get_nickname(raft, &server->vote,
4449                                                 buf, sizeof buf));
4450             }
4451         } else if (raft->role == RAFT_LEADER) {
4452             ds_put_format(&s, " next_index=%"PRIu64" match_index=%"PRIu64,
4453                           server->next_index, server->match_index);
4454         }
4455         ds_put_char(&s, '\n');
4456     }
4457
4458     unixctl_command_reply(conn, ds_cstr(&s));
4459     ds_destroy(&s);
4460 }
4461
/* Shared implementation of a unixctl request for 'raft' to leave its
 * cluster.  Replies on 'conn' with an error if 'raft' is already leaving, is
 * still joining, or has failed; otherwise calls raft_leave() and sends an
 * empty success reply. */
static void
raft_unixctl_leave__(struct unixctl_conn *conn, struct raft *raft)
{
    const char *refusal = NULL;

    if (raft_is_leaving(raft)) {
        refusal = "already in progress leaving cluster";
    } else if (raft_is_joining(raft)) {
        refusal = "can't leave while join in progress";
    } else if (raft_failed(raft)) {
        refusal = "can't leave after failure";
    }

    if (refusal) {
        unixctl_command_reply_error(conn, refusal);
        return;
    }

    raft_leave(raft);
    unixctl_command_reply(conn, NULL);
}
4479
4480 static void
4481 raft_unixctl_leave(struct unixctl_conn *conn, int argc OVS_UNUSED,
4482                    const char *argv[], void *aux OVS_UNUSED)
4483 {
4484     struct raft *raft = raft_lookup_by_name(argv[1]);
4485     if (!raft) {
4486         unixctl_command_reply_error(conn, "unknown cluster");
4487         return;
4488     }
4489
4490     raft_unixctl_leave__(conn, raft);
4491 }
4492
4493 static struct raft_server *
4494 raft_lookup_server_best_match(struct raft *raft, const char *id)
4495 {
4496     struct raft_server *best = NULL;
4497     int best_score = -1;
4498     int n_best = 0;
4499
4500     struct raft_server *s;
4501     HMAP_FOR_EACH (s, hmap_node, &raft->servers) {
4502         int score = (!strcmp(id, s->address)
4503                      ? INT_MAX
4504                      : uuid_is_partial_match(&s->sid, id));
4505         if (score > best_score) {
4506             best = s;
4507             best_score = score;
4508             n_best = 1;
4509         } else if (score == best_score) {
4510             n_best++;
4511         }
4512     }
4513     return n_best == 1 ? best : NULL;
4514 }
4515
4516 static void
4517 raft_unixctl_kick(struct unixctl_conn *conn, int argc OVS_UNUSED,
4518                   const char *argv[], void *aux OVS_UNUSED)
4519 {
4520     const char *cluster_name = argv[1];
4521     const char *server_name = argv[2];
4522
4523     struct raft *raft = raft_lookup_by_name(cluster_name);
4524     if (!raft) {
4525         unixctl_command_reply_error(conn, "unknown cluster");
4526         return;
4527     }
4528
4529     struct raft_server *server = raft_lookup_server_best_match(raft,
4530                                                                server_name);
4531     if (!server) {
4532         unixctl_command_reply_error(conn, "unknown server");
4533         return;
4534     }
4535
4536     if (uuid_equals(&server->sid, &raft->sid)) {
4537         raft_unixctl_leave__(conn, raft);
4538     } else if (raft->role == RAFT_LEADER) {
4539         const struct raft_remove_server_request rq = {
4540             .sid = server->sid,
4541             .requester_conn = conn,
4542         };
4543         raft_handle_remove_server_request(raft, &rq);
4544     } else {
4545         const union raft_rpc rpc = {
4546             .remove_server_request = {
4547                 .common = {
4548                     .type = RAFT_RPC_REMOVE_SERVER_REQUEST,
4549                     .sid = raft->leader_sid,
4550                     .comment = "via unixctl"
4551                 },
4552                 .sid = server->sid,
4553             }
4554         };
4555         if (raft_send(raft, &rpc)) {
4556             unixctl_command_reply(conn, "sent removal request to leader");
4557         } else {
4558             unixctl_command_reply_error(conn,
4559                                         "failed to send removal request");
4560         }
4561     }
4562 }
4563
4564 static void
4565 raft_get_election_timer_from_log(struct raft *raft)
4566 {
4567     if (raft->snap.election_timer) {
4568         raft->election_timer = raft->snap.election_timer;
4569     }
4570     for (uint64_t index = raft->commit_index; index >= raft->log_start;
4571          index--) {
4572         struct raft_entry *e = &raft->entries[index - raft->log_start];
4573         if (e->election_timer) {
4574             raft->election_timer = e->election_timer;
4575             break;
4576         }
4577     }
4578 }
4579
4580 static void
4581 raft_log_election_timer(struct raft *raft)
4582 {
4583     raft_command_unref(raft_command_execute__(raft, NULL, NULL,
4584                                               raft->election_timer_new, NULL,
4585                                               NULL));
4586 }
4587
4588 static void
4589 raft_unixctl_change_election_timer(struct unixctl_conn *conn,
4590                                    int argc OVS_UNUSED, const char *argv[],
4591                                    void *aux OVS_UNUSED)
4592 {
4593     const char *cluster_name = argv[1];
4594     const char *election_timer_str = argv[2];
4595
4596     struct raft *raft = raft_lookup_by_name(cluster_name);
4597     if (!raft) {
4598         unixctl_command_reply_error(conn, "unknown cluster");
4599         return;
4600     }
4601
4602     if (raft->role != RAFT_LEADER) {
4603         unixctl_command_reply_error(conn, "election timer must be changed"
4604                                    " through leader.");
4605         return;
4606     }
4607
4608     /* If there are pending changes for election timer, reject it. */
4609     if (raft->election_timer_new) {
4610         unixctl_command_reply_error(conn, "election timer change pending.");
4611         return;
4612     }
4613
4614     uint64_t election_timer = atoll(election_timer_str);
4615     if (election_timer == raft->election_timer) {
4616         unixctl_command_reply(conn, "change election timer to current value.");
4617         return;
4618     }
4619
4620     /* Election timer smaller than 100ms or bigger than 10min doesn't make
4621      * sense. */
4622     if (election_timer < 100 || election_timer > 600000) {
4623         unixctl_command_reply_error(conn, "election timer must be between "
4624                                     "100 and 600000, in msec.");
4625         return;
4626     }
4627
4628     /* If election timer is to be enlarged, it should be done gradually so that
4629      * it won't cause timeout when new value is applied on leader but not yet
4630      * applied on some of the followers. */
4631     if (election_timer > raft->election_timer * 2) {
4632         unixctl_command_reply_error(conn, "election timer increase should not "
4633                                     "exceed the current value x 2.");
4634         return;
4635     }
4636
4637     raft->election_timer_new = election_timer;
4638     raft_log_election_timer(raft);
4639     unixctl_command_reply(conn, "change of election timer initiated.");
4640 }
4641
4642 static void
4643 raft_unixctl_failure_test(struct unixctl_conn *conn OVS_UNUSED,
4644                           int argc OVS_UNUSED, const char *argv[],
4645                           void *aux OVS_UNUSED)
4646 {
4647     const char *test = argv[1];
4648     if (!strcmp(test, "crash-before-sending-append-request")) {
4649         failure_test = FT_CRASH_BEFORE_SEND_APPEND_REQ;
4650     } else if (!strcmp(test, "crash-after-sending-append-request")) {
4651         failure_test = FT_CRASH_AFTER_SEND_APPEND_REQ;
4652     } else if (!strcmp(test, "crash-before-sending-execute-command-reply")) {
4653         failure_test = FT_CRASH_BEFORE_SEND_EXEC_REP;
4654     } else if (!strcmp(test, "crash-after-sending-execute-command-reply")) {
4655         failure_test = FT_CRASH_AFTER_SEND_EXEC_REP;
4656     } else if (!strcmp(test, "crash-before-sending-execute-command-request")) {
4657         failure_test = FT_CRASH_BEFORE_SEND_EXEC_REQ;
4658     } else if (!strcmp(test, "crash-after-sending-execute-command-request")) {
4659         failure_test = FT_CRASH_AFTER_SEND_EXEC_REQ;
4660     } else if (!strcmp(test, "crash-after-receiving-append-request-update")) {
4661         failure_test = FT_CRASH_AFTER_RECV_APPEND_REQ_UPDATE;
4662     } else if (!strcmp(test, "delay-election")) {
4663         failure_test = FT_DELAY_ELECTION;
4664         struct raft *raft;
4665         HMAP_FOR_EACH (raft, hmap_node, &all_rafts) {
4666             if (raft->role == RAFT_FOLLOWER) {
4667                 raft_reset_election_timer(raft);
4668             }
4669         }
4670     } else if (!strcmp(test, "clear")) {
4671         failure_test = FT_NO_TEST;
4672         unixctl_command_reply(conn, "test dismissed");
4673         return;
4674     } else {
4675         unixctl_command_reply_error(conn, "unknown test scenario");
4676         return;
4677     }
4678     unixctl_command_reply(conn, "test engaged");
4679 }
4680
4681 static void
4682 raft_init(void)
4683 {
4684     static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
4685     if (!ovsthread_once_start(&once)) {
4686         return;
4687     }
4688     unixctl_command_register("cluster/cid", "DB", 1, 1,
4689                              raft_unixctl_cid, NULL);
4690     unixctl_command_register("cluster/sid", "DB", 1, 1,
4691                              raft_unixctl_sid, NULL);
4692     unixctl_command_register("cluster/status", "DB", 1, 1,
4693                              raft_unixctl_status, NULL);
4694     unixctl_command_register("cluster/leave", "DB", 1, 1,
4695                              raft_unixctl_leave, NULL);
4696     unixctl_command_register("cluster/kick", "DB SERVER", 2, 2,
4697                              raft_unixctl_kick, NULL);
4698     unixctl_command_register("cluster/change-election-timer", "DB TIME", 2, 2,
4699                              raft_unixctl_change_election_timer, NULL);
4700     unixctl_command_register("cluster/failure-test", "FAILURE SCENARIO", 1, 1,
4701                              raft_unixctl_failure_test, NULL);
4702     ovsthread_once_done(&once);
4703 }