]>
Commit | Line | Data |
---|---|---|
8e854e9c GR |
1 | /* |
2 | * Copyright (C) 2015, SUSE | |
3 | * | |
4 | * This program is free software; you can redistribute it and/or modify | |
5 | * it under the terms of the GNU General Public License as published by | |
6 | * the Free Software Foundation; either version 2, or (at your option) | |
7 | * any later version. | |
8 | * | |
9 | */ | |
10 | ||
11 | ||
12 | #include <linux/module.h> | |
47741b7c GR |
13 | #include <linux/dlm.h> |
14 | #include <linux/sched.h> | |
15 | #include "md.h" | |
e94987db | 16 | #include "bitmap.h" |
edb39c9d | 17 | #include "md-cluster.h" |
47741b7c GR |
18 | |
/* Size in bytes of the lock value block (LVB) buffer attached to a lock resource */
#define LVB_SIZE 64
20 | ||
21 | struct dlm_lock_resource { | |
22 | dlm_lockspace_t *ls; | |
23 | struct dlm_lksb lksb; | |
24 | char *name; /* lock name. */ | |
25 | uint32_t flags; /* flags to pass to dlm_lock() */ | |
47741b7c | 26 | struct completion completion; /* completion for synchronized locking */ |
c4ce867f GR |
27 | void (*bast)(void *arg, int mode); /* blocking AST function pointer*/ |
28 | struct mddev *mddev; /* pointing back to mddev. */ | |
29 | }; | |
30 | ||
96ae923a GR |
31 | struct suspend_info { |
32 | int slot; | |
33 | sector_t lo; | |
34 | sector_t hi; | |
35 | struct list_head list; | |
36 | }; | |
37 | ||
38 | struct resync_info { | |
39 | __le64 lo; | |
40 | __le64 hi; | |
41 | }; | |
42 | ||
c4ce867f GR |
43 | struct md_cluster_info { |
44 | /* dlm lock space and resources for clustered raid. */ | |
45 | dlm_lockspace_t *lockspace; | |
cf921cc1 GR |
46 | int slot_number; |
47 | struct completion completion; | |
c4ce867f GR |
48 | struct dlm_lock_resource *sb_lock; |
49 | struct mutex sb_mutex; | |
54519c5f | 50 | struct dlm_lock_resource *bitmap_lockres; |
96ae923a GR |
51 | struct list_head suspend_list; |
52 | spinlock_t suspend_lock; | |
e94987db GR |
53 | struct md_thread *recovery_thread; |
54 | unsigned long recovery_map; | |
47741b7c GR |
55 | }; |
56 | ||
57 | static void sync_ast(void *arg) | |
58 | { | |
59 | struct dlm_lock_resource *res; | |
60 | ||
61 | res = (struct dlm_lock_resource *) arg; | |
62 | complete(&res->completion); | |
63 | } | |
64 | ||
65 | static int dlm_lock_sync(struct dlm_lock_resource *res, int mode) | |
66 | { | |
67 | int ret = 0; | |
68 | ||
69 | init_completion(&res->completion); | |
70 | ret = dlm_lock(res->ls, mode, &res->lksb, | |
71 | res->flags, res->name, strlen(res->name), | |
72 | 0, sync_ast, res, res->bast); | |
73 | if (ret) | |
74 | return ret; | |
75 | wait_for_completion(&res->completion); | |
76 | return res->lksb.sb_status; | |
77 | } | |
78 | ||
79 | static int dlm_unlock_sync(struct dlm_lock_resource *res) | |
80 | { | |
81 | return dlm_lock_sync(res, DLM_LOCK_NL); | |
82 | } | |
83 | ||
c4ce867f | 84 | static struct dlm_lock_resource *lockres_init(struct mddev *mddev, |
47741b7c GR |
85 | char *name, void (*bastfn)(void *arg, int mode), int with_lvb) |
86 | { | |
87 | struct dlm_lock_resource *res = NULL; | |
88 | int ret, namelen; | |
c4ce867f | 89 | struct md_cluster_info *cinfo = mddev->cluster_info; |
47741b7c GR |
90 | |
91 | res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); | |
92 | if (!res) | |
93 | return NULL; | |
c4ce867f GR |
94 | res->ls = cinfo->lockspace; |
95 | res->mddev = mddev; | |
47741b7c GR |
96 | namelen = strlen(name); |
97 | res->name = kzalloc(namelen + 1, GFP_KERNEL); | |
98 | if (!res->name) { | |
99 | pr_err("md-cluster: Unable to allocate resource name for resource %s\n", name); | |
100 | goto out_err; | |
101 | } | |
102 | strlcpy(res->name, name, namelen + 1); | |
103 | if (with_lvb) { | |
104 | res->lksb.sb_lvbptr = kzalloc(LVB_SIZE, GFP_KERNEL); | |
105 | if (!res->lksb.sb_lvbptr) { | |
106 | pr_err("md-cluster: Unable to allocate LVB for resource %s\n", name); | |
107 | goto out_err; | |
108 | } | |
109 | res->flags = DLM_LKF_VALBLK; | |
110 | } | |
111 | ||
112 | if (bastfn) | |
113 | res->bast = bastfn; | |
114 | ||
115 | res->flags |= DLM_LKF_EXPEDITE; | |
116 | ||
117 | ret = dlm_lock_sync(res, DLM_LOCK_NL); | |
118 | if (ret) { | |
119 | pr_err("md-cluster: Unable to lock NL on new lock resource %s\n", name); | |
120 | goto out_err; | |
121 | } | |
122 | res->flags &= ~DLM_LKF_EXPEDITE; | |
123 | res->flags |= DLM_LKF_CONVERT; | |
124 | ||
125 | return res; | |
126 | out_err: | |
127 | kfree(res->lksb.sb_lvbptr); | |
128 | kfree(res->name); | |
129 | kfree(res); | |
130 | return NULL; | |
131 | } | |
132 | ||
133 | static void lockres_free(struct dlm_lock_resource *res) | |
134 | { | |
135 | if (!res) | |
136 | return; | |
137 | ||
138 | init_completion(&res->completion); | |
139 | dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res); | |
140 | wait_for_completion(&res->completion); | |
141 | ||
142 | kfree(res->name); | |
143 | kfree(res->lksb.sb_lvbptr); | |
144 | kfree(res); | |
145 | } | |
8e854e9c | 146 | |
c4ce867f GR |
/*
 * Format the 16 bytes at @src as lowercase hex in 8-4-4-4-12 grouping
 * (e.g. "00112233-4455-6677-8899-aabbccddeeff") into @dest.
 *
 * @dest must have room for 36 characters plus the terminating NUL.
 * Returns @dest.
 */
static char *pretty_uuid(char *dest, char *src)
{
	char *out = dest;
	int idx;

	for (idx = 0; idx < 16; idx++) {
		switch (idx) {
		case 4:
		case 6:
		case 8:
		case 10:
			/* group separator; the sprintf() below re-terminates */
			*out++ = '-';
			break;
		default:
			break;
		}
		out += sprintf(out, "%02x", (unsigned char)src[idx]);
	}
	return dest;
}
158 | ||
96ae923a GR |
159 | static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres, |
160 | sector_t lo, sector_t hi) | |
161 | { | |
162 | struct resync_info *ri; | |
163 | ||
164 | ri = (struct resync_info *)lockres->lksb.sb_lvbptr; | |
165 | ri->lo = cpu_to_le64(lo); | |
166 | ri->hi = cpu_to_le64(hi); | |
167 | } | |
168 | ||
169 | static struct suspend_info *read_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres) | |
170 | { | |
171 | struct resync_info ri; | |
172 | struct suspend_info *s = NULL; | |
173 | sector_t hi = 0; | |
174 | ||
175 | dlm_lock_sync(lockres, DLM_LOCK_CR); | |
176 | memcpy(&ri, lockres->lksb.sb_lvbptr, sizeof(struct resync_info)); | |
177 | hi = le64_to_cpu(ri.hi); | |
178 | if (ri.hi > 0) { | |
179 | s = kzalloc(sizeof(struct suspend_info), GFP_KERNEL); | |
180 | if (!s) | |
181 | goto out; | |
182 | s->hi = hi; | |
183 | s->lo = le64_to_cpu(ri.lo); | |
184 | } | |
185 | dlm_unlock_sync(lockres); | |
186 | out: | |
187 | return s; | |
188 | } | |
189 | ||
e94987db GR |
190 | void recover_bitmaps(struct md_thread *thread) |
191 | { | |
192 | struct mddev *mddev = thread->mddev; | |
193 | struct md_cluster_info *cinfo = mddev->cluster_info; | |
194 | struct dlm_lock_resource *bm_lockres; | |
195 | char str[64]; | |
196 | int slot, ret; | |
197 | struct suspend_info *s, *tmp; | |
198 | sector_t lo, hi; | |
199 | ||
200 | while (cinfo->recovery_map) { | |
201 | slot = fls64((u64)cinfo->recovery_map) - 1; | |
202 | ||
203 | /* Clear suspend_area associated with the bitmap */ | |
204 | spin_lock_irq(&cinfo->suspend_lock); | |
205 | list_for_each_entry_safe(s, tmp, &cinfo->suspend_list, list) | |
206 | if (slot == s->slot) { | |
207 | list_del(&s->list); | |
208 | kfree(s); | |
209 | } | |
210 | spin_unlock_irq(&cinfo->suspend_lock); | |
211 | ||
212 | snprintf(str, 64, "bitmap%04d", slot); | |
213 | bm_lockres = lockres_init(mddev, str, NULL, 1); | |
214 | if (!bm_lockres) { | |
215 | pr_err("md-cluster: Cannot initialize bitmaps\n"); | |
216 | goto clear_bit; | |
217 | } | |
218 | ||
219 | ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); | |
220 | if (ret) { | |
221 | pr_err("md-cluster: Could not DLM lock %s: %d\n", | |
222 | str, ret); | |
223 | goto clear_bit; | |
224 | } | |
225 | ret = bitmap_copy_from_slot(mddev, slot, &lo, &hi); | |
226 | if (ret) | |
227 | pr_err("md-cluster: Could not copy data from bitmap %d\n", slot); | |
228 | dlm_unlock_sync(bm_lockres); | |
229 | clear_bit: | |
230 | clear_bit(slot, &cinfo->recovery_map); | |
231 | } | |
232 | } | |
233 | ||
cf921cc1 GR |
/* dlm_lockspace_ops.recover_prep callback: intentionally does nothing */
static void recover_prep(void *arg)
{
	/* nothing to do */
}
237 | ||
238 | static void recover_slot(void *arg, struct dlm_slot *slot) | |
239 | { | |
240 | struct mddev *mddev = arg; | |
241 | struct md_cluster_info *cinfo = mddev->cluster_info; | |
242 | ||
243 | pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n", | |
244 | mddev->bitmap_info.cluster_name, | |
245 | slot->nodeid, slot->slot, | |
246 | cinfo->slot_number); | |
e94987db GR |
247 | set_bit(slot->slot - 1, &cinfo->recovery_map); |
248 | if (!cinfo->recovery_thread) { | |
249 | cinfo->recovery_thread = md_register_thread(recover_bitmaps, | |
250 | mddev, "recover"); | |
251 | if (!cinfo->recovery_thread) { | |
252 | pr_warn("md-cluster: Could not create recovery thread\n"); | |
253 | return; | |
254 | } | |
255 | } | |
256 | md_wakeup_thread(cinfo->recovery_thread); | |
cf921cc1 GR |
257 | } |
258 | ||
259 | static void recover_done(void *arg, struct dlm_slot *slots, | |
260 | int num_slots, int our_slot, | |
261 | uint32_t generation) | |
262 | { | |
263 | struct mddev *mddev = arg; | |
264 | struct md_cluster_info *cinfo = mddev->cluster_info; | |
265 | ||
266 | cinfo->slot_number = our_slot; | |
267 | complete(&cinfo->completion); | |
268 | } | |
269 | ||
270 | static const struct dlm_lockspace_ops md_ls_ops = { | |
271 | .recover_prep = recover_prep, | |
272 | .recover_slot = recover_slot, | |
273 | .recover_done = recover_done, | |
274 | }; | |
275 | ||
96ae923a GR |
276 | static int gather_all_resync_info(struct mddev *mddev, int total_slots) |
277 | { | |
278 | struct md_cluster_info *cinfo = mddev->cluster_info; | |
279 | int i, ret = 0; | |
280 | struct dlm_lock_resource *bm_lockres; | |
281 | struct suspend_info *s; | |
282 | char str[64]; | |
283 | ||
284 | ||
285 | for (i = 0; i < total_slots; i++) { | |
286 | memset(str, '\0', 64); | |
287 | snprintf(str, 64, "bitmap%04d", i); | |
288 | bm_lockres = lockres_init(mddev, str, NULL, 1); | |
289 | if (!bm_lockres) | |
290 | return -ENOMEM; | |
291 | if (i == (cinfo->slot_number - 1)) | |
292 | continue; | |
293 | ||
294 | bm_lockres->flags |= DLM_LKF_NOQUEUE; | |
295 | ret = dlm_lock_sync(bm_lockres, DLM_LOCK_PW); | |
296 | if (ret == -EAGAIN) { | |
297 | memset(bm_lockres->lksb.sb_lvbptr, '\0', LVB_SIZE); | |
298 | s = read_resync_info(mddev, bm_lockres); | |
299 | if (s) { | |
300 | pr_info("%s:%d Resync[%llu..%llu] in progress on %d\n", | |
301 | __func__, __LINE__, | |
302 | (unsigned long long) s->lo, | |
303 | (unsigned long long) s->hi, i); | |
304 | spin_lock_irq(&cinfo->suspend_lock); | |
305 | s->slot = i; | |
306 | list_add(&s->list, &cinfo->suspend_list); | |
307 | spin_unlock_irq(&cinfo->suspend_lock); | |
308 | } | |
309 | ret = 0; | |
310 | lockres_free(bm_lockres); | |
311 | continue; | |
312 | } | |
313 | if (ret) | |
314 | goto out; | |
315 | /* TODO: Read the disk bitmap sb and check if it needs recovery */ | |
316 | dlm_unlock_sync(bm_lockres); | |
317 | lockres_free(bm_lockres); | |
318 | } | |
319 | out: | |
320 | return ret; | |
321 | } | |
322 | ||
edb39c9d GR |
323 | static int join(struct mddev *mddev, int nodes) |
324 | { | |
c4ce867f | 325 | struct md_cluster_info *cinfo; |
cf921cc1 | 326 | int ret, ops_rv; |
c4ce867f GR |
327 | char str[64]; |
328 | ||
329 | if (!try_module_get(THIS_MODULE)) | |
330 | return -ENOENT; | |
331 | ||
332 | cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL); | |
333 | if (!cinfo) | |
334 | return -ENOMEM; | |
335 | ||
cf921cc1 GR |
336 | init_completion(&cinfo->completion); |
337 | ||
338 | mutex_init(&cinfo->sb_mutex); | |
339 | mddev->cluster_info = cinfo; | |
340 | ||
c4ce867f GR |
341 | memset(str, 0, 64); |
342 | pretty_uuid(str, mddev->uuid); | |
cf921cc1 GR |
343 | ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name, |
344 | DLM_LSFL_FS, LVB_SIZE, | |
345 | &md_ls_ops, mddev, &ops_rv, &cinfo->lockspace); | |
c4ce867f GR |
346 | if (ret) |
347 | goto err; | |
cf921cc1 | 348 | wait_for_completion(&cinfo->completion); |
b97e9257 GR |
349 | if (nodes <= cinfo->slot_number) { |
350 | pr_err("md-cluster: Slot allotted(%d) greater than available slots(%d)", cinfo->slot_number - 1, | |
351 | nodes); | |
352 | ret = -ERANGE; | |
353 | goto err; | |
354 | } | |
c4ce867f GR |
355 | cinfo->sb_lock = lockres_init(mddev, "cmd-super", |
356 | NULL, 0); | |
357 | if (!cinfo->sb_lock) { | |
358 | ret = -ENOMEM; | |
359 | goto err; | |
360 | } | |
54519c5f GR |
361 | |
362 | pr_info("md-cluster: Joined cluster %s slot %d\n", str, cinfo->slot_number); | |
363 | snprintf(str, 64, "bitmap%04d", cinfo->slot_number - 1); | |
364 | cinfo->bitmap_lockres = lockres_init(mddev, str, NULL, 1); | |
365 | if (!cinfo->bitmap_lockres) | |
366 | goto err; | |
367 | if (dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW)) { | |
368 | pr_err("Failed to get bitmap lock\n"); | |
369 | ret = -EINVAL; | |
370 | goto err; | |
371 | } | |
372 | ||
96ae923a GR |
373 | INIT_LIST_HEAD(&cinfo->suspend_list); |
374 | spin_lock_init(&cinfo->suspend_lock); | |
375 | ||
376 | ret = gather_all_resync_info(mddev, nodes); | |
377 | if (ret) | |
378 | goto err; | |
379 | ||
edb39c9d | 380 | return 0; |
c4ce867f | 381 | err: |
96ae923a GR |
382 | lockres_free(cinfo->bitmap_lockres); |
383 | lockres_free(cinfo->sb_lock); | |
c4ce867f GR |
384 | if (cinfo->lockspace) |
385 | dlm_release_lockspace(cinfo->lockspace, 2); | |
cf921cc1 | 386 | mddev->cluster_info = NULL; |
c4ce867f GR |
387 | kfree(cinfo); |
388 | module_put(THIS_MODULE); | |
389 | return ret; | |
edb39c9d GR |
390 | } |
391 | ||
392 | static int leave(struct mddev *mddev) | |
393 | { | |
c4ce867f GR |
394 | struct md_cluster_info *cinfo = mddev->cluster_info; |
395 | ||
396 | if (!cinfo) | |
397 | return 0; | |
e94987db | 398 | md_unregister_thread(&cinfo->recovery_thread); |
c4ce867f | 399 | lockres_free(cinfo->sb_lock); |
54519c5f | 400 | lockres_free(cinfo->bitmap_lockres); |
c4ce867f | 401 | dlm_release_lockspace(cinfo->lockspace, 2); |
edb39c9d GR |
402 | return 0; |
403 | } | |
404 | ||
cf921cc1 GR |
405 | /* slot_number(): Returns the MD slot number to use |
406 | * DLM starts the slot numbers from 1, wheras cluster-md | |
407 | * wants the number to be from zero, so we deduct one | |
408 | */ | |
409 | static int slot_number(struct mddev *mddev) | |
410 | { | |
411 | struct md_cluster_info *cinfo = mddev->cluster_info; | |
412 | ||
413 | return cinfo->slot_number - 1; | |
414 | } | |
415 | ||
96ae923a GR |
416 | static void resync_info_update(struct mddev *mddev, sector_t lo, sector_t hi) |
417 | { | |
418 | struct md_cluster_info *cinfo = mddev->cluster_info; | |
419 | ||
420 | add_resync_info(mddev, cinfo->bitmap_lockres, lo, hi); | |
421 | /* Re-acquire the lock to refresh LVB */ | |
422 | dlm_lock_sync(cinfo->bitmap_lockres, DLM_LOCK_PW); | |
423 | } | |
424 | ||
edb39c9d GR |
425 | static struct md_cluster_operations cluster_ops = { |
426 | .join = join, | |
427 | .leave = leave, | |
cf921cc1 | 428 | .slot_number = slot_number, |
96ae923a | 429 | .resync_info_update = resync_info_update, |
edb39c9d GR |
430 | }; |
431 | ||
8e854e9c GR |
432 | static int __init cluster_init(void) |
433 | { | |
434 | pr_warn("md-cluster: EXPERIMENTAL. Use with caution\n"); | |
435 | pr_info("Registering Cluster MD functions\n"); | |
edb39c9d | 436 | register_md_cluster_operations(&cluster_ops, THIS_MODULE); |
8e854e9c GR |
437 | return 0; |
438 | } | |
439 | ||
/* Module exit point: unregisters the operations installed by cluster_init() */
static void cluster_exit(void)
{
	unregister_md_cluster_operations();
}
444 | ||
445 | module_init(cluster_init); | |
446 | module_exit(cluster_exit); | |
447 | MODULE_LICENSE("GPL"); | |
448 | MODULE_DESCRIPTION("Clustering support for MD"); |