/*
 * Bad block management
 *
 * - Heavily based on MD badblocks code from Neil Brown
 *
 * Copyright (c) 2015, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */

#include <linux/badblocks.h>
#include <linux/seqlock.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/stddef.h>
#include <linux/types.h>
#include <linux/slab.h>

/**
 * badblocks_check() - check a given range for bad sectors
 * @bb: the badblocks structure that holds all badblock information
 * @s: sector (start) at which to check for badblocks
 * @sectors: number of sectors to check for badblocks
 * @first_bad: pointer to store location of the first badblock
 * @bad_sectors: pointer to store number of badblocks after @first_bad
 *
 * We can record which blocks on each device are 'bad' and so just
 * fail those blocks, or that stripe, rather than the whole device.
 * Entries in the bad-block table are 64 bits wide. This comprises:
 * Length of bad-range, in sectors: 0-511 for lengths 1-512
 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
 * A 'shift' can be set so that larger blocks are tracked and
 * consequently larger devices can be covered.
 * 'Acknowledged' flag - 1 bit - the most significant bit.
 *
 * Locking of the bad-block table uses a seqlock so badblocks_check
 * might need to retry if it is very unlucky.
 * We will sometimes want to check for bad blocks in a bi_end_io function,
 * so we use the write_seqlock_irq variant.
 *
 * When looking for a bad block we specify a range and want to
 * know if any block in the range is bad. So we binary-search
 * to the last range that starts at-or-before the given endpoint
 * (or "before the sector after the target range"),
 * then see if it ends after the given start.
 *
 * Return:
 *  0: there are no known bad blocks in the range
 *  1: there are known bad blocks which are all acknowledged
 * -1: there are bad blocks which have not yet been acknowledged in metadata,
 * plus the start/length of the first bad section we overlap.
 */
int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
		    sector_t *first_bad, int *bad_sectors)
{
	int hi;
	int lo;
	u64 *p = bb->page;
	int rv;
	sector_t target = s + sectors;
	unsigned seq;

	if (bb->shift > 0) {
		/* round the start down, and the end up */
		s >>= bb->shift;
		target += (1<<bb->shift) - 1;
		target >>= bb->shift;
		sectors = target - s;
	}
	/* 'target' is now the first block after the bad range */

retry:
	seq = read_seqbegin(&bb->lock);
	lo = 0;
	rv = 0;
	hi = bb->count;

	/* Binary search between lo and hi for 'target'
	 * i.e. for the last range that starts before 'target'
	 */
	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
	 * are known not to be the last range before target.
	 * VARIANT: hi-lo is the number of possible
	 * ranges, and decreases until it reaches 1
	 */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a < target)
			/* This could still be the one, earlier ranges
			 * could not.
			 */
			lo = mid;
		else
			/* This and later ranges are definitely out. */
			hi = mid;
	}
	/* 'lo' might be the last that started before target, but 'hi' isn't */
	if (hi > lo) {
		/* need to check all ranges that end after 's' to see if
		 * any are unacknowledged.
		 */
		while (lo >= 0 &&
		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
			if (BB_OFFSET(p[lo]) < target) {
				/* starts before the end, and finishes after
				 * the start, so they must overlap
				 */
				if (rv != -1 && BB_ACK(p[lo]))
					rv = 1;
				else
					rv = -1;
				*first_bad = BB_OFFSET(p[lo]);
				*bad_sectors = BB_LEN(p[lo]);
			}
			lo--;
		}
	}

	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return rv;
}
EXPORT_SYMBOL_GPL(badblocks_check);
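
/*
 * Editor's illustrative sketch (not part of the original file): how a
 * caller might gate I/O on the table. The helper name is hypothetical;
 * only badblocks_check() and its 0 / 1 / -1 return convention (see the
 * kerneldoc above) come from this file.
 */
static inline bool range_has_badblocks(struct badblocks *bb,
				       sector_t start, int nr_sectors)
{
	sector_t first_bad;
	int bad_sectors;

	/* Any non-zero return means a known bad range overlaps ours */
	return badblocks_check(bb, start, nr_sectors,
			       &first_bad, &bad_sectors) != 0;
}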

/**
 * badblocks_set() - Add a range of bad blocks to the table.
 * @bb: the badblocks structure that holds all badblock information
 * @s: first sector to mark as bad
 * @sectors: number of sectors to mark as bad
 * @acknowledged: whether to mark the bad sectors as acknowledged
 *
 * This might extend the table, or might contract it if two adjacent ranges
 * can be merged. We binary-search to find the 'insertion' point, then
 * decide how best to handle it.
 *
 * Return:
 *  0: success
 *  1: failed to set badblocks (out of space)
 */
int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
		  int acknowledged)
{
	u64 *p;
	int lo, hi;
	int rv = 0;
	unsigned long flags;

	if (bb->shift < 0)
		/* badblocks are disabled */
		return 0;

	if (bb->shift) {
		/* round the start down, and the end up */
		sector_t next = s + sectors;

		s >>= bb->shift;
		next += (1<<bb->shift) - 1;
		next >>= bb->shift;
		sectors = next - s;
	}

	write_seqlock_irqsave(&bb->lock, flags);

	p = bb->page;
	lo = 0;
	hi = bb->count;
	/* Find the last range that starts at-or-before 's' */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a <= s)
			lo = mid;
		else
			hi = mid;
	}
	if (hi > lo && BB_OFFSET(p[lo]) > s)
		hi = lo;

	if (hi > lo) {
		/* we found a range that might merge with the start
		 * of our new range
		 */
		sector_t a = BB_OFFSET(p[lo]);
		sector_t e = a + BB_LEN(p[lo]);
		int ack = BB_ACK(p[lo]);

		if (e >= s) {
			/* Yes, we can merge with a previous range */
			if (s == a && s + sectors >= e)
				/* new range covers old */
				ack = acknowledged;
			else
				ack = ack && acknowledged;

			if (e < s + sectors)
				e = s + sectors;
			if (e - a <= BB_MAX_LEN) {
				p[lo] = BB_MAKE(a, e-a, ack);
				s = e;
			} else {
				/* does not all fit in one range,
				 * make p[lo] maximal
				 */
				if (BB_LEN(p[lo]) != BB_MAX_LEN)
					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
				s = a + BB_MAX_LEN;
			}
			sectors = e - s;
		}
	}
	if (sectors && hi < bb->count) {
		/* 'hi' points to the first range that starts after 's'.
		 * Maybe we can merge with the start of that range
		 */
		sector_t a = BB_OFFSET(p[hi]);
		sector_t e = a + BB_LEN(p[hi]);
		int ack = BB_ACK(p[hi]);

		if (a <= s + sectors) {
			/* merging is possible */
			if (e <= s + sectors) {
				/* full overlap */
				e = s + sectors;
				ack = acknowledged;
			} else
				ack = ack && acknowledged;

			a = s;
			if (e - a <= BB_MAX_LEN) {
				p[hi] = BB_MAKE(a, e-a, ack);
				s = e;
			} else {
				p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
				s = a + BB_MAX_LEN;
			}
			sectors = e - s;
			lo = hi;
			hi++;
		}
	}
	if (sectors == 0 && hi < bb->count) {
		/* we might be able to combine lo and hi */
		/* Note: 's' is at the end of 'lo' */
		sector_t a = BB_OFFSET(p[hi]);
		int lolen = BB_LEN(p[lo]);
		int hilen = BB_LEN(p[hi]);
		int newlen = lolen + hilen - (s - a);

		if (s >= a && newlen < BB_MAX_LEN) {
			/* yes, we can combine them */
			int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);

			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
			memmove(p + hi, p + hi + 1,
				(bb->count - hi - 1) * 8);
			bb->count--;
		}
	}
	while (sectors) {
		/* didn't merge (it all).
		 * Need to add a range just before 'hi'
		 */
		if (bb->count >= MAX_BADBLOCKS) {
			/* No room for more */
			rv = 1;
			break;
		} else {
			int this_sectors = sectors;

			memmove(p + hi + 1, p + hi,
				(bb->count - hi) * 8);
			bb->count++;

			if (this_sectors > BB_MAX_LEN)
				this_sectors = BB_MAX_LEN;
			p[hi] = BB_MAKE(s, this_sectors, acknowledged);
			sectors -= this_sectors;
			s += this_sectors;
		}
	}

	bb->changed = 1;
	if (!acknowledged)
		bb->unacked_exist = 1;
	write_sequnlock_irqrestore(&bb->lock, flags);

	return rv;
}
EXPORT_SYMBOL_GPL(badblocks_set);
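
/*
 * Editor's illustrative sketch (hypothetical helper, not part of the
 * original file): recording a newly discovered bad range. Ranges longer
 * than BB_MAX_LEN (512) sectors are split by badblocks_set() into several
 * maximal entries, so a single call can consume multiple table slots; a
 * return of 1 means the table ran out of space.
 */
static inline void note_media_error(struct badblocks *bb,
				    sector_t s, int sectors)
{
	if (badblocks_set(bb, s, sectors, 0))
		pr_warn("badblocks: out of space, %d sectors at %llu not recorded\n",
			sectors, (unsigned long long)s);
}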

/**
 * badblocks_clear() - Remove a range of bad blocks from the table.
 * @bb: the badblocks structure that holds all badblock information
 * @s: first sector to clear
 * @sectors: number of sectors to clear
 *
 * This may involve extending the table if we split a region,
 * but it must not fail. So if the table becomes full, we just
 * drop the remove request.
 *
 * Return:
 *  0: success
 *  -ENOSPC: failed to clear badblocks (no free slot to split a range)
 */
int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
{
	u64 *p;
	int lo, hi;
	sector_t target = s + sectors;
	int rv = 0;

	if (bb->shift > 0) {
		/* When clearing we round the start up and the end down.
		 * This should not matter as the shift should align with
		 * the block size and no rounding should ever be needed.
		 * However it is better to think a block is bad when it
		 * isn't than to think a block is not bad when it is.
		 */
		s += (1<<bb->shift) - 1;
		s >>= bb->shift;
		target >>= bb->shift;
		sectors = target - s;
	}

	write_seqlock_irq(&bb->lock);

	p = bb->page;
	lo = 0;
	hi = bb->count;
	/* Find the last range that starts before 'target' */
	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;
		sector_t a = BB_OFFSET(p[mid]);

		if (a < target)
			lo = mid;
		else
			hi = mid;
	}
	if (hi > lo) {
		/* p[lo] is the last range that could overlap the
		 * current range. Earlier ranges could also overlap,
		 * but only this one can overlap the end of the range.
		 */
		if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
			/* Partial overlap, leave the tail of this range */
			int ack = BB_ACK(p[lo]);
			sector_t a = BB_OFFSET(p[lo]);
			sector_t end = a + BB_LEN(p[lo]);

			if (a < s) {
				/* we need to split this range */
				if (bb->count >= MAX_BADBLOCKS) {
					rv = -ENOSPC;
					goto out;
				}
				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
				bb->count++;
				p[lo] = BB_MAKE(a, s-a, ack);
				lo++;
			}
			p[lo] = BB_MAKE(target, end - target, ack);
			/* there is no longer an overlap */
			hi = lo;
			lo--;
		}
		while (lo >= 0 &&
		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
			/* This range does overlap */
			if (BB_OFFSET(p[lo]) < s) {
				/* Keep the early parts of this range. */
				int ack = BB_ACK(p[lo]);
				sector_t start = BB_OFFSET(p[lo]);

				p[lo] = BB_MAKE(start, s - start, ack);
				/* now p[lo] doesn't overlap, so we can stop */
				break;
			}
			lo--;
		}
		/* 'lo' is strictly before, 'hi' is strictly after,
		 * anything between needs to be discarded
		 */
		if (hi - lo > 1) {
			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
			bb->count -= (hi - lo - 1);
		}
	}

	bb->changed = 1;
out:
	write_sequnlock_irq(&bb->lock);
	return rv;
}
EXPORT_SYMBOL_GPL(badblocks_clear);
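
/*
 * Editor's illustrative sketch (hypothetical function, not part of the
 * original file): the split behaviour described above, assuming
 * bb->shift == 0 and at least one free table slot.
 */
static inline int demo_clear_split(struct badblocks *bb)
{
	if (badblocks_set(bb, 96, 16, 1))	/* one entry: 96..111 */
		return -ENOSPC;
	/* splits that entry into 96..99 and 104..111 (uses a free slot) */
	return badblocks_clear(bb, 100, 4);
}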

/**
 * ack_all_badblocks() - Acknowledge all bad blocks in a list.
 * @bb: the badblocks structure that holds all badblock information
 *
 * This only succeeds if ->changed is clear. It is used by
 * in-kernel metadata updates.
 */
void ack_all_badblocks(struct badblocks *bb)
{
	if (bb->page == NULL || bb->changed)
		/* no point even trying */
		return;
	write_seqlock_irq(&bb->lock);

	if (bb->changed == 0 && bb->unacked_exist) {
		u64 *p = bb->page;
		int i;

		for (i = 0; i < bb->count ; i++) {
			if (!BB_ACK(p[i])) {
				sector_t start = BB_OFFSET(p[i]);
				int len = BB_LEN(p[i]);

				p[i] = BB_MAKE(start, len, 1);
			}
		}
		bb->unacked_exist = 0;
	}
	write_sequnlock_irq(&bb->lock);
}
EXPORT_SYMBOL_GPL(ack_all_badblocks);
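
/*
 * Editor's illustrative sketch (not part of the original file): the
 * metadata flow this function assumes. The writer persists the table,
 * clears ->changed, and only then acknowledges everything.
 * write_table_to_media() is a hypothetical stand-in for the caller's own
 * durable-write path, and the caller must ensure no badblocks_set() runs
 * concurrently with clearing ->changed.
 */
static inline void sync_and_ack_badblocks(struct badblocks *bb)
{
	if (bb->changed) {
		/* write_table_to_media(bb); */	/* caller-specific */
		bb->changed = 0;
	}
	/* no-op unless ->changed is clear and unacked entries exist */
	ack_all_badblocks(bb);
}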

/**
 * badblocks_show() - sysfs access to bad-blocks list
 * @bb: the badblocks structure that holds all badblock information
 * @page: buffer to fill for sysfs
 * @unack: whether to show only unacknowledged badblocks
 *
 * Return:
 *  Length of returned data
 */
ssize_t badblocks_show(struct badblocks *bb, char *page, int unack)
{
	size_t len;
	int i;
	u64 *p = bb->page;
	unsigned seq;

	if (bb->shift < 0)
		return 0;

retry:
	seq = read_seqbegin(&bb->lock);

	len = 0;
	i = 0;

	while (len < PAGE_SIZE && i < bb->count) {
		sector_t s = BB_OFFSET(p[i]);
		unsigned int length = BB_LEN(p[i]);
		int ack = BB_ACK(p[i]);

		i++;

		if (unack && ack)
			continue;

		len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
				(unsigned long long)s << bb->shift,
				length << bb->shift);
	}
	if (unack && len == 0)
		bb->unacked_exist = 0;

	if (read_seqretry(&bb->lock, seq))
		goto retry;

	return len;
}
EXPORT_SYMBOL_GPL(badblocks_show);

/**
 * badblocks_store() - sysfs access to bad-blocks list
 * @bb: the badblocks structure that holds all badblock information
 * @page: buffer received from sysfs
 * @len: length of data received from sysfs
 * @unack: whether to store the new badblocks as unacknowledged
 *
 * Return:
 *  Length of the buffer processed or -ve error.
 */
ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
			int unack)
{
	unsigned long long sector;
	int length;
	char newline;

	switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
	case 3:
		if (newline != '\n')
			return -EINVAL;
		/* fall through */
	case 2:
		if (length <= 0)
			return -EINVAL;
		break;
	default:
		return -EINVAL;
	}

	if (badblocks_set(bb, sector, length, !unack))
		return -ENOSPC;
	else
		return len;
}
EXPORT_SYMBOL_GPL(badblocks_store);
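
/*
 * Editor's illustrative sketch (hypothetical wrapper, not part of the
 * original file): the sysfs text format in both directions is one
 * "<sector> <length>" pair per line, e.g. writing "1024 8\n" records
 * eight bad sectors starting at sector 1024, and badblocks_show() emits
 * the same layout. Passing unack == 0 stores the range as already
 * acknowledged, since badblocks_set() above is called with !unack.
 */
static inline ssize_t store_acked_badblocks(struct badblocks *bb,
					    const char *buf, size_t count)
{
	return badblocks_store(bb, buf, count, 0);
}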

/**
 * badblocks_init() - initialize the badblocks structure
 * @bb: the badblocks structure that holds all badblock information
 * @enable: whether to enable badblocks accounting
 *
 * Return:
 *  0: success
 *  -ve errno: on error
 */
int badblocks_init(struct badblocks *bb, int enable)
{
	bb->count = 0;
	if (enable)
		bb->shift = 0;
	else
		bb->shift = -1;
	bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL);
	if (!bb->page) {
		bb->shift = -1;
		return -ENOMEM;
	}
	seqlock_init(&bb->lock);

	return 0;
}
EXPORT_SYMBOL_GPL(badblocks_init);

/**
 * badblocks_exit() - free the badblocks structure
 * @bb: the badblocks structure that holds all badblock information
 */
void badblocks_exit(struct badblocks *bb)
{
	if (!bb)
		return;
	kfree(bb->page);
	bb->page = NULL;
}
EXPORT_SYMBOL_GPL(badblocks_exit);
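
/*
 * Editor's illustrative sketch (hypothetical function, not part of the
 * original file): the whole lifecycle on a stack-allocated structure.
 * Callers that own a struct device can use devm_init_badblocks() from
 * <linux/badblocks.h> instead and skip the explicit exit.
 */
static inline int badblocks_lifecycle_demo(void)
{
	struct badblocks bb;
	int rv;

	rv = badblocks_init(&bb, 1);		/* enabled, shift == 0 */
	if (rv)
		return rv;

	badblocks_set(&bb, 2048, 16, 0);	/* 2048..2063, unacknowledged */
	badblocks_clear(&bb, 2048, 8);		/* front half repaired */
	ack_all_badblocks(&bb);			/* no-op here: ->changed is set */

	badblocks_exit(&bb);			/* frees bb->page */
	return 0;
}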