/* -*- mode: c; c-basic-offset: 8; -*-
 *
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * Copyright (C) 2005 Oracle. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

/* This quorum hack is only here until we transition to some more rational
 * approach that is driven from userspace. Honest. No foolin'.
 *
 * Imagine two nodes lose network connectivity to each other but they're still
 * up and operating in every other way. Presumably a network timeout indicates
 * that a node is broken and should be recovered. They can't both recover each
 * other and both carry on without serialising their access to the file system.
 * They need to decide who is authoritative. Now extend that problem to
 * arbitrary groups of nodes losing connectivity between each other.
 *
 * So we declare that a node which has given up on connecting to a majority
 * of nodes who are still heartbeating will fence itself.
 *
 * There are huge opportunities for races here. After we give up on a node's
 * connection we need to wait long enough to give heartbeat an opportunity
 * to declare the node as truly dead. We also need to be careful with the
 * race between when we see a node start heartbeating and when we connect
 * to it.
 *
 * So nodes that are in this transition put a hold on the quorum decision
 * with a counter. As they fall out of this transition they drop the count
 * and if they're the last, they fire off the decision.
 */
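/* Concretely, the flow below: heartbeat events land in o2quo_hb_up()/_down(),
 * network events in o2quo_conn_up()/_err(). Whenever the two views of a node
 * disagree, a hold is taken; once the last hold drains with a decision
 * pending, o2quo_make_decision() applies the majority rule and may fence. */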
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/reboot.h>

#include "heartbeat.h"
#include "nodemanager.h"
#define MLOG_MASK_PREFIX ML_QUORUM
#include "masklog.h"
#include "quorum.h"

static struct o2quo_state {
	spinlock_t		qs_lock;
	struct work_struct	qs_work;
	int			qs_pending;
	int			qs_heartbeating;
	unsigned long		qs_hb_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int			qs_connected;
	unsigned long		qs_conn_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
	int			qs_holds;
	unsigned long		qs_hold_bm[BITS_TO_LONGS(O2NM_MAX_NODES)];
} o2quo_state;
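/* qs_hb_bm tracks the nodes currently seen disk-heartbeating, qs_conn_bm the
 * nodes we hold a network connection to, and qs_hold_bm the nodes whose two
 * states are still in transition. qs_holds counts those hold bits; while it
 * is non-zero the quorum decision is deferred, and when it drops to zero with
 * qs_pending set, qs_work (o2quo_make_decision) is scheduled. */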

/* this is horribly heavy-handed. It should instead flip the file
 * system RO and call some userspace script. */
static void o2quo_fence_self(void)
{
	/* panic spins with interrupts enabled. with preempt
	 * threads can still schedule, etc, etc */
	o2hb_stop_all_regions();

	switch (o2nm_single_cluster->cl_fence_method) {
	case O2NM_FENCE_PANIC:
		panic("*** ocfs2 is very sorry to be fencing this system by "
		      "panicing ***\n");
		break;
	default:
		WARN_ON(o2nm_single_cluster->cl_fence_method >=
			O2NM_FENCE_METHODS);
		/* fall through */
	case O2NM_FENCE_RESET:
		printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this "
		       "system by restarting ***\n");
		emergency_restart();
		break;
	}
}
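/* Which branch runs above is a cluster-wide setting: cl_fence_method lives on
 * the o2nm cluster object (nodemanager), and with the stock o2cb tooling it is
 * normally chosen via a configfs attribute, reset being the default and panic
 * the alternative. That detail lives outside this file, so treat this as a
 * pointer rather than a guarantee. */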

/* Indicate that a timeout occurred on a heartbeat region write. The
 * other nodes in the cluster may consider us dead at that time so we
 * want to "fence" ourselves so that we don't scribble on the disk
 * after they think they've recovered us. This can't solve all
 * problems related to writeout after recovery but this hack can at
 * least close some of those gaps. When we have real fencing, this can
 * go away as our node would be fenced externally before other nodes
 * begin recovery. */
void o2quo_disk_timeout(void)
{
	o2quo_fence_self();
}

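/* Worked example of the arithmetic below, for illustration only: with
 * qs_heartbeating == 5 (odd, self included) the quorum is (5 + 1) / 2 = 3, so
 * this node fences unless qs_connected is still at least 3. With
 * qs_heartbeating == 4 (even) the quorum is 4 / 2 = 2; we fence if
 * qs_connected < 2, and also if qs_connected == 2 but the lowest-numbered
 * heartbeating node is not among the nodes we can reach. */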
static void o2quo_make_decision(struct work_struct *work)
{
	int quorum;
	int lowest_hb, lowest_reachable = 0, fence = 0;
	struct o2quo_state *qs = &o2quo_state;

	spin_lock(&qs->qs_lock);

	lowest_hb = find_first_bit(qs->qs_hb_bm, O2NM_MAX_NODES);
	if (lowest_hb != O2NM_MAX_NODES)
		lowest_reachable = test_bit(lowest_hb, qs->qs_conn_bm);

	mlog(0, "heartbeating: %d, connected: %d, "
	     "lowest: %d (%sreachable)\n", qs->qs_heartbeating,
	     qs->qs_connected, lowest_hb, lowest_reachable ? "" : "un");

	if (!test_bit(o2nm_this_node(), qs->qs_hb_bm) ||
	    qs->qs_heartbeating == 1)
		goto out;

	if (qs->qs_heartbeating & 1) {
		/* the odd numbered cluster case is straightforward --
		 * if we can't talk to the majority we're hosed */
		quorum = (qs->qs_heartbeating + 1)/2;
		if (qs->qs_connected < quorum) {
			mlog(ML_ERROR, "fencing this node because it is "
			     "only connected to %u nodes and %u is needed "
			     "to make a quorum out of %u heartbeating nodes\n",
			     qs->qs_connected, quorum,
			     qs->qs_heartbeating);
			fence = 1;
		}
	} else {
		/* the even numbered cluster adds the possibility of each half
		 * of the cluster being able to talk amongst themselves.. in
		 * that case we're hosed if we can't talk to the group that has
		 * the lowest numbered node */
		quorum = qs->qs_heartbeating / 2;
		if (qs->qs_connected < quorum) {
			mlog(ML_ERROR, "fencing this node because it is "
			     "only connected to %u nodes and %u is needed "
			     "to make a quorum out of %u heartbeating nodes\n",
			     qs->qs_connected, quorum,
			     qs->qs_heartbeating);
			fence = 1;
		} else if ((qs->qs_connected == quorum) &&
			   !lowest_reachable) {
			mlog(ML_ERROR, "fencing this node because it is "
			     "connected to a half-quorum of %u out of %u "
			     "nodes which doesn't include the lowest active "
			     "node %u\n", quorum, qs->qs_heartbeating,
			     lowest_hb);
			fence = 1;
		}
	}

out:
	if (fence) {
		spin_unlock(&qs->qs_lock);
		o2quo_fence_self();
	} else {
		mlog(ML_NOTICE, "not fencing this node, heartbeating: %d, "
		     "connected: %d, lowest: %d (%sreachable)\n",
		     qs->qs_heartbeating, qs->qs_connected, lowest_hb,
		     lowest_reachable ? "" : "un");
		spin_unlock(&qs->qs_lock);
	}
}

static void o2quo_set_hold(struct o2quo_state *qs, u8 node)
{
	assert_spin_locked(&qs->qs_lock);

	if (!test_and_set_bit(node, qs->qs_hold_bm)) {
		qs->qs_holds++;
		mlog_bug_on_msg(qs->qs_holds == O2NM_MAX_NODES,
				"node %u\n", node);
		mlog(0, "node %u, %d total\n", node, qs->qs_holds);
	}
}

static void o2quo_clear_hold(struct o2quo_state *qs, u8 node)
{
	assert_spin_locked(&qs->qs_lock);

	if (test_and_clear_bit(node, qs->qs_hold_bm)) {
		mlog(0, "node %u, %d total\n", node, qs->qs_holds - 1);
		if (--qs->qs_holds == 0) {
			if (qs->qs_pending) {
				qs->qs_pending = 0;
				schedule_work(&qs->qs_work);
			}
		}
		mlog_bug_on_msg(qs->qs_holds < 0, "node %u, holds %d\n",
				node, qs->qs_holds);
	}
}
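/* Note that dropping the last hold above is the only place the decision work
 * gets scheduled; o2quo_hb_still_up() merely sets qs_pending so that the next
 * time qs_holds reaches zero the decision actually runs. */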

/* as a node comes up we delay the quorum decision until we know the fate of
 * the connection. the hold will be dropped in conn_up or hb_down. it might be
 * perpetuated by conn_err until hb_down. if we already have a conn, we might
 * be dropping a hold that conn_up got. */
void o2quo_hb_up(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock(&qs->qs_lock);

	qs->qs_heartbeating++;
	mlog_bug_on_msg(qs->qs_heartbeating == O2NM_MAX_NODES,
			"node %u\n", node);
	mlog_bug_on_msg(test_bit(node, qs->qs_hb_bm), "node %u\n", node);
	set_bit(node, qs->qs_hb_bm);

	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);

	if (!test_bit(node, qs->qs_conn_bm))
		o2quo_set_hold(qs, node);
	else
		o2quo_clear_hold(qs, node);

	spin_unlock(&qs->qs_lock);
}

/* hb going down releases any holds we might have had due to this node from
 * conn_up, conn_err, or hb_up */
void o2quo_hb_down(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock(&qs->qs_lock);

	qs->qs_heartbeating--;
	mlog_bug_on_msg(qs->qs_heartbeating < 0,
			"node %u, %d heartbeating\n",
			node, qs->qs_heartbeating);
	mlog_bug_on_msg(!test_bit(node, qs->qs_hb_bm), "node %u\n", node);
	clear_bit(node, qs->qs_hb_bm);

	mlog(0, "node %u, %d total\n", node, qs->qs_heartbeating);

	o2quo_clear_hold(qs, node);

	spin_unlock(&qs->qs_lock);
}

/* this tells us that we've decided that the node is still heartbeating
 * even though we've lost its conn. it must only be called after conn_err
 * and indicates that we must now make a quorum decision in the future,
 * though we might be doing so after waiting for holds to drain. Here
 * we'll be dropping the hold from conn_err. */
void o2quo_hb_still_up(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock(&qs->qs_lock);

	mlog(0, "node %u\n", node);

	qs->qs_pending = 1;
	o2quo_clear_hold(qs, node);

	spin_unlock(&qs->qs_lock);
}

/* This is analogous to hb_up. as a node's connection comes up we delay the
 * quorum decision until we see it heartbeating. the hold will be dropped in
 * hb_up or hb_down. it might be perpetuated by conn_err until hb_down. if
 * it's already heartbeating we might be dropping a hold that conn_up got. */
void o2quo_conn_up(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock(&qs->qs_lock);

	qs->qs_connected++;
	mlog_bug_on_msg(qs->qs_connected == O2NM_MAX_NODES,
			"node %u\n", node);
	mlog_bug_on_msg(test_bit(node, qs->qs_conn_bm), "node %u\n", node);
	set_bit(node, qs->qs_conn_bm);

	mlog(0, "node %u, %d total\n", node, qs->qs_connected);

	if (!test_bit(node, qs->qs_hb_bm))
		o2quo_set_hold(qs, node);
	else
		o2quo_clear_hold(qs, node);

	spin_unlock(&qs->qs_lock);
}

/* we've decided that we won't ever be connecting to the node again. if it's
 * still heartbeating we grab a hold that will delay decisions until either the
 * node stops heartbeating from hb_down or the caller decides that the node is
 * still up and calls still_up */
void o2quo_conn_err(u8 node)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock(&qs->qs_lock);

	if (test_bit(node, qs->qs_conn_bm)) {
		qs->qs_connected--;
		mlog_bug_on_msg(qs->qs_connected < 0,
				"node %u, connected %d\n",
				node, qs->qs_connected);

		clear_bit(node, qs->qs_conn_bm);

		if (test_bit(node, qs->qs_hb_bm))
			o2quo_set_hold(qs, node);
	}

	mlog(0, "node %u, %d total\n", node, qs->qs_connected);

	spin_unlock(&qs->qs_lock);
}
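/* Putting the pieces together for a lost link: conn_err above takes a hold if
 * the node is still disk-heartbeating; that hold is later dropped either by
 * hb_down (the node really died) or by hb_still_up, which also sets qs_pending
 * so that a quorum decision fires once all holds drain. */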

void o2quo_init(void)
{
	struct o2quo_state *qs = &o2quo_state;

	spin_lock_init(&qs->qs_lock);
	INIT_WORK(&qs->qs_work, o2quo_make_decision);
}

void o2quo_exit(void)
{
	struct o2quo_state *qs = &o2quo_state;

	flush_work(&qs->qs_work);
}