]>
Commit | Line | Data |
---|---|---|
1b1d2e6d BP |
1 | /* |
2 | * Copyright (c) 2017, 2018 Nicira, Inc. | |
3 | * | |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | * you may not use this file except in compliance with the License. | |
6 | * You may obtain a copy of the License at: | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | #ifndef RAFT_H | |
18 | #define RAFT_H 1 | |
19 | ||
20 | #include <stddef.h> | |
21 | ||
22 | /* Implementation of the Raft consensus algorithm. | |
23 | * | |
24 | * | |
25 | * References | |
26 | * ========== | |
27 | * | |
28 | * Based on Diego Ongaro's Ph.D. thesis, "Consensus: Bridging Theory and | |
29 | * Practice", available at https://ramcloud.stanford.edu/~ongaro/thesis.pdf. | |
30 | * References to sections, pages, and figures are from this thesis. Quotations | |
31 | * in comments also come from this work, in accordance with its license notice, | |
32 | * reproduced below: | |
33 | * | |
34 | * Copyright 2014 by Diego Andres Ongaro. All Rights Reserved. | |
35 | * | |
36 | * This work is licensed under a Creative Commons Attribution-3.0 United | |
37 | * States License. http://creativecommons.org/licenses/by/3.0/us/ | |
38 | * | |
39 | * | |
40 | * Concepts | |
41 | * ======== | |
42 | * | |
43 | * Raft allows a cluster of servers to maintain a distributed log. At any | |
44 | * given time, at most one of N servers is a leader. The leader can propose | |
45 | * appending a new entry to the log. If ratified by more than N/2 servers | |
46 | * (including the leader), the new entry becomes permanently part of the log. | |
47 | * | |
48 | * This implementation gives each cluster a name, which is the same as the | |
49 | * database schema's name, and a UUID, called the cluster ID. Each server has | |
50 | * its own UUID, called the server ID, and a network address (e.g. an IP | |
51 | * address and a port). | |
52 | * | |
53 | * | |
54 | * Thread-safety | |
55 | * ============= | |
56 | * | |
57 | * The Raft code is not thread-safe. Even if separate threads access different | |
58 | * Raft objects, the implementation can still make unsynchronized cross-thread | |
59 | * accesses (from unixctl handlers). | |
60 | */ | |
61 | ||
62 | #include <stdbool.h> | |
63 | #include <stdint.h> | |
64 | #include "compiler.h" | |
65 | #include "uuid.h" | |
66 | ||
67 | struct json; | |
68 | struct ovsdb_log; | |
69 | struct raft; | |
70 | struct sset; | |
71 | ||
72 | #define RAFT_MAGIC "CLUSTER" | |
73 | ||
74 | /* Setting up a new cluster or adding a new server to a cluster. | |
75 | * | |
76 | * These functions just write an on-disk file and perform no network activity, | |
77 | * so the actual work of setting up or joining the cluster happens later, after | |
78 | * raft_open(). OVS_WARN_UNUSED_RESULT: callers must check the returned error. */ | |
79 | struct ovsdb_error *raft_create_cluster(const char *file_name, | |
80 | const char *name, | |
81 | const char *local_address, | |
82 | const struct json *snapshot) | |
83 | OVS_WARN_UNUSED_RESULT; | |
84 | struct ovsdb_error *raft_join_cluster(const char *file_name, const char *name, | |
85 | const char *local_address, | |
86 | const struct sset *remote_addrs, | |
87 | const struct uuid *cid) | |
88 | OVS_WARN_UNUSED_RESULT; | |
89 | ||
90 | /* Reading metadata from a server log into a struct raft_metadata. */ | |
91 | struct raft_metadata { | |
92 | struct uuid sid; /* Server ID: this server's own UUID. */ | |
93 | struct uuid cid; /* Cluster ID: the cluster's UUID. All-zeros if not yet known. */ | |
94 | char *name; /* Schema name, which is also the cluster's name. */ | |
95 | char *local; /* Local address: this server's network address. */ | |
96 | }; | |
97 | struct ovsdb_error *raft_read_metadata(struct ovsdb_log *, | |
98 | struct raft_metadata *) | |
99 | OVS_WARN_UNUSED_RESULT; /* The returned error must be checked. */ | |
100 | void raft_metadata_destroy(struct raft_metadata *); /* NOTE(review): presumably frees 'name' and 'local'; confirm in the implementation. */ | |
101 | ||
102 | /* Starting up or shutting down a server within a cluster. raft_open() hands back the server through its 'struct raft **' argument (NOTE(review): presumably only on success; confirm) and returns an error that must be checked. */ | |
103 | struct ovsdb_error *raft_open(struct ovsdb_log *, struct raft **) | |
104 | OVS_WARN_UNUSED_RESULT; | |
105 | void raft_close(struct raft *); | |
106 | ||
107 | void raft_run(struct raft *); /* NOTE(review): raft_run()/raft_wait() look like the usual OVS run/wait pair (do pending work, then arm the poll loop); confirm in the implementation. */ | |
108 | void raft_wait(struct raft *); | |
109 | ||
110 | /* Information: queries about a server; each takes a 'const struct raft *'. */ | |
111 | const char *raft_get_name(const struct raft *); | |
112 | const struct uuid *raft_get_cid(const struct raft *); /* Cluster ID. */ | |
113 | const struct uuid *raft_get_sid(const struct raft *); /* Server ID. */ | |
114 | bool raft_is_connected(const struct raft *); | |
115 | bool raft_is_leader(const struct raft *); | |
116 | ||
117 | /* Joining a cluster: query only; takes a 'const struct raft *'. */ | |
118 | bool raft_is_joining(const struct raft *); | |
119 | ||
120 | /* Leaving a cluster: raft_leave() takes a non-const server; the other two are queries. */ | |
121 | void raft_leave(struct raft *); | |
122 | bool raft_is_leaving(const struct raft *); | |
123 | bool raft_left(const struct raft *); | |
124 | ||
125 | /* Failure: query only; takes a 'const struct raft *'. */ | |
126 | bool raft_failed(const struct raft *); | |
127 | ||
128 | /* Reading snapshots and log entries. raft_next_entry() takes a non-const server plus two out-parameters, 'eid' and 'is_snapshot' (NOTE(review): presumably the entry's ID and whether the returned data is a snapshot; confirm in the implementation). */ | |
129 | const struct json *raft_next_entry(struct raft *, struct uuid *eid, | |
130 | bool *is_snapshot); | |
131 | bool raft_has_next_entry(const struct raft *); | |
132 | ||
133 | uint64_t raft_get_applied_index(const struct raft *); | |
134 | uint64_t raft_get_commit_index(const struct raft *); | |
135 | ||
136 | /* Writing log entries (executing commands): status values reported by raft_command_get_status(). */ | |
137 | enum raft_command_status { | |
138 | /* In progress, please wait. */ | |
139 | RAFT_CMD_INCOMPLETE, | |
140 | ||
141 | /* Success. */ | |
142 | RAFT_CMD_SUCCESS, /* The log entry was committed. */ | |
143 | ||
144 | /* Failure. | |
145 | * | |
146 | * A failure status does not always mean that the operation actually | |
147 | * failed. In corner cases, the log entry was committed but the message | |
148 | * reporting success was never received. Thus, this Raft | |
149 | * implementation provides "at-least-once" rather than | |
150 | * "exactly-once" semantics. */ | |
151 | RAFT_CMD_NOT_LEADER, /* Failed because we are not the leader. */ | |
152 | RAFT_CMD_BAD_PREREQ, /* Failed because the prerequisite check failed. */ | |
153 | RAFT_CMD_LOST_LEADERSHIP, /* Leadership lost after command initiation. */ | |
154 | RAFT_CMD_SHUTDOWN, /* Raft server is joining, has left, or has shut down. */ | |
155 | RAFT_CMD_IO_ERROR, /* I/O error. */ | |
156 | RAFT_CMD_TIMEOUT, /* Request to remote leader timed out. */ | |
157 | }; | |
158 | const char *raft_command_status_to_string(enum raft_command_status); | |
159 | bool raft_command_status_from_string(const char *, enum raft_command_status *); /* NOTE(review): the bool presumably reports whether the string was recognized; confirm. */ | |
160 | ||
161 | struct raft_command *raft_command_execute(struct raft *, | |
162 | const struct json *data, | |
163 | const struct uuid *prereq, | |
164 | struct uuid *result) | |
165 | OVS_WARN_UNUSED_RESULT; /* The returned command must not be ignored. */ | |
166 | enum raft_command_status raft_command_get_status(const struct raft_command *); /* One of enum raft_command_status. */ | |
167 | uint64_t raft_command_get_commit_index(const struct raft_command *); | |
168 | void raft_command_unref(struct raft_command *); /* NOTE(review): name suggests commands are reference-counted; confirm. */ | |
169 | void raft_command_wait(const struct raft_command *); | |
170 | ||
171 | /* Replacing the local log by a snapshot. The first three functions are queries on a 'const struct raft *'; raft_store_snapshot() takes a non-const server and returns an error that must be checked. */ | |
172 | bool raft_grew_lots(const struct raft *); | |
173 | uint64_t raft_get_log_length(const struct raft *); | |
174 | bool raft_may_snapshot(const struct raft *); | |
175 | struct ovsdb_error *raft_store_snapshot(struct raft *, | |
176 | const struct json *new_snapshot) | |
177 | OVS_WARN_UNUSED_RESULT; | |
178 | ||
179 | /* Cluster management: requests concerning leadership; both take a non-const server. */ | |
180 | void raft_take_leadership(struct raft *); | |
181 | void raft_transfer_leadership(struct raft *, const char *reason); /* NOTE(review): 'reason' is presumably a human-readable explanation used for logging; confirm. */ | |
182 | ||
183 | #endif /* RAFT_H */ |