1 /*
2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
4
5 completely rewritten, based on the MD driver code from Marc Zyngier
6
7 Changes:
8
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13 - kmod support by: Cyrus Durgin
14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16
17 - lots of fixes and improvements to the RAID1/RAID5 and generic
18 RAID code (such as request based resynchronization):
19
20 Neil Brown <neilb@cse.unsw.edu.au>.
21
22 - persistent bitmap code
23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24
25 This program is free software; you can redistribute it and/or modify
26 it under the terms of the GNU General Public License as published by
27 the Free Software Foundation; either version 2, or (at your option)
28 any later version.
29
30 You should have received a copy of the GNU General Public License
31 (for example /usr/src/linux/COPYING); if not, write to the Free
32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33
34 Errors, Warnings, etc.
35 Please use:
36 pr_crit() for error conditions that risk data loss
37 pr_err() for error conditions that are unexpected, like an IO error
38 or internal inconsistency
39 pr_warn() for error conditions that could have been predicted, like
40 adding a device to an array when it has incompatible metadata
41 pr_info() for interesting, very rare events, like an array starting
42 or stopping, or resync starting or stopping
43 pr_debug() for everything else.
44
45 */
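/*
 * For example: an unreadable superblock is reported with pr_err() in
 * read_disk_sb() below, while incompatible metadata found when comparing
 * devices during assembly is reported with pr_warn() in super_90_load().
 */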
46
47 #include <linux/sched/signal.h>
48 #include <linux/kthread.h>
49 #include <linux/blkdev.h>
50 #include <linux/badblocks.h>
51 #include <linux/sysctl.h>
52 #include <linux/seq_file.h>
53 #include <linux/fs.h>
54 #include <linux/poll.h>
55 #include <linux/ctype.h>
56 #include <linux/string.h>
57 #include <linux/hdreg.h>
58 #include <linux/proc_fs.h>
59 #include <linux/random.h>
60 #include <linux/module.h>
61 #include <linux/reboot.h>
62 #include <linux/file.h>
63 #include <linux/compat.h>
64 #include <linux/delay.h>
65 #include <linux/raid/md_p.h>
66 #include <linux/raid/md_u.h>
67 #include <linux/slab.h>
68 #include <linux/percpu-refcount.h>
69
70 #include <trace/events/block.h>
71 #include "md.h"
72 #include "md-bitmap.h"
73 #include "md-cluster.h"
74
75 #ifndef MODULE
76 static void autostart_arrays(int part);
77 #endif
78
79 /* pers_list is a list of registered personalities protected
80 * by pers_lock.
81 * pers_lock also protects accesses to
82 * mddev->thread when the reconfig mutex cannot be held.
83 */
84 static LIST_HEAD(pers_list);
85 static DEFINE_SPINLOCK(pers_lock);
86
87 struct md_cluster_operations *md_cluster_ops;
88 EXPORT_SYMBOL(md_cluster_ops);
89 struct module *md_cluster_mod;
90 EXPORT_SYMBOL(md_cluster_mod);
91
92 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
93 static struct workqueue_struct *md_wq;
94 static struct workqueue_struct *md_misc_wq;
95
96 static int remove_and_add_spares(struct mddev *mddev,
97 struct md_rdev *this);
98 static void mddev_detach(struct mddev *mddev);
99
100 /*
101 * Default number of read corrections we'll attempt on an rdev
102 * before ejecting it from the array. We divide the read error
103 * count by 2 for every hour elapsed between read errors.
104 */
105 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
106 /*
107 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
108 * is 1000 KB/sec, so the extra system load does not show up that much.
109 * Increase it if you want to have more _guaranteed_ speed. Note that
110 * the RAID driver will use the maximum available bandwidth if the IO
111 * subsystem is idle. There is also an 'absolute maximum' reconstruction
112 * speed limit - in case reconstruction slows down your system despite
113 * idle IO detection.
114 *
115 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
116 * or /sys/block/mdX/md/sync_speed_{min,max}
117 */
118
119 static int sysctl_speed_limit_min = 1000;
120 static int sysctl_speed_limit_max = 200000;
121 static inline int speed_min(struct mddev *mddev)
122 {
123 return mddev->sync_speed_min ?
124 mddev->sync_speed_min : sysctl_speed_limit_min;
125 }
126
127 static inline int speed_max(struct mddev *mddev)
128 {
129 return mddev->sync_speed_max ?
130 mddev->sync_speed_max : sysctl_speed_limit_max;
131 }
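/*
 * Illustrative sketch (not part of the driver): tuning the minimum resync
 * speed from userspace through the sysctl file named in the comment above.
 * The value is in KB/sec, mirroring sysctl_speed_limit_min; the per-array
 * /sys/block/mdX/md/sync_speed_min knob overrides it when set.
 */
#if 0
#include <stdio.h>

static int set_resync_speed_min(int kb_per_sec)
{
	FILE *f = fopen("/proc/sys/dev/raid/speed_limit_min", "w");

	if (!f)
		return -1;
	fprintf(f, "%d\n", kb_per_sec);
	return fclose(f);
}
#endif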
132
133 static struct ctl_table_header *raid_table_header;
134
135 static struct ctl_table raid_table[] = {
136 {
137 .procname = "speed_limit_min",
138 .data = &sysctl_speed_limit_min,
139 .maxlen = sizeof(int),
140 .mode = S_IRUGO|S_IWUSR,
141 .proc_handler = proc_dointvec,
142 },
143 {
144 .procname = "speed_limit_max",
145 .data = &sysctl_speed_limit_max,
146 .maxlen = sizeof(int),
147 .mode = S_IRUGO|S_IWUSR,
148 .proc_handler = proc_dointvec,
149 },
150 { }
151 };
152
153 static struct ctl_table raid_dir_table[] = {
154 {
155 .procname = "raid",
156 .maxlen = 0,
157 .mode = S_IRUGO|S_IXUGO,
158 .child = raid_table,
159 },
160 { }
161 };
162
163 static struct ctl_table raid_root_table[] = {
164 {
165 .procname = "dev",
166 .maxlen = 0,
167 .mode = 0555,
168 .child = raid_dir_table,
169 },
170 { }
171 };
172
173 static const struct block_device_operations md_fops;
174
175 static int start_readonly;
176
177 /*
178 * The original mechanism for creating an md device is to create
179 * a device node in /dev and to open it. This causes races with device-close.
180 * The preferred method is to write to the "new_array" module parameter.
181 * This can avoid races.
182 * Setting create_on_open to false disables the original mechanism
183 * so all the races disappear.
184 */
185 static bool create_on_open = true;
186
187 /* bio_alloc_mddev
188 * like bio_alloc(), but uses the mddev's local bio set when one is available
189 */
190
191 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
192 struct mddev *mddev)
193 {
194 struct bio *b;
195
196 if (!mddev || !mddev->bio_set)
197 return bio_alloc(gfp_mask, nr_iovecs);
198
199 b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
200 if (!b)
201 return NULL;
202 return b;
203 }
204 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
205
206 static struct bio *md_bio_alloc_sync(struct mddev *mddev)
207 {
208 if (!mddev || !mddev->sync_set)
209 return bio_alloc(GFP_NOIO, 1);
210
211 return bio_alloc_bioset(GFP_NOIO, 1, mddev->sync_set);
212 }
213
214 /*
215 * We have a system wide 'event count' that is incremented
216 * on any 'interesting' event, and readers of /proc/mdstat
217 * can use 'poll' or 'select' to find out when the event
218 * count increases.
219 *
220 * Events are:
221 * start array, stop array, error, add device, remove device,
222 * start build, activate spare
223 */
224 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
225 static atomic_t md_event_count;
226 void md_new_event(struct mddev *mddev)
227 {
228 atomic_inc(&md_event_count);
229 wake_up(&md_event_waiters);
230 }
231 EXPORT_SYMBOL_GPL(md_new_event);
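/*
 * Illustrative userspace sketch (not part of the driver): waiting for the
 * next md event by polling /proc/mdstat, as described above.  Reading the
 * file first and then waiting for an exceptional condition is the usual
 * pattern (mdadm --monitor works this way); treat the details as an
 * assumption rather than a guaranteed ABI.
 */
#if 0
#include <fcntl.h>
#include <poll.h>
#include <unistd.h>

static int wait_for_md_event(void)
{
	char buf[4096];
	struct pollfd pfd = { .events = POLLIN | POLLPRI };

	pfd.fd = open("/proc/mdstat", O_RDONLY);
	if (pfd.fd < 0)
		return -1;
	read(pfd.fd, buf, sizeof(buf));	/* consume the current state */
	poll(&pfd, 1, -1);		/* wakes when the event count changes */
	close(pfd.fd);
	return 0;
}
#endif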
232
233 /*
234 * Enables iteration over all existing md arrays.
235 * all_mddevs_lock protects this list.
236 */
237 static LIST_HEAD(all_mddevs);
238 static DEFINE_SPINLOCK(all_mddevs_lock);
239
240 /*
241 * iterates through all used mddevs in the system.
242 * We take care to grab the all_mddevs_lock whenever navigating
243 * the list, and to always hold a refcount when unlocked.
244 * Any code which breaks out of this loop while owning
245 * a reference to the current mddev must mddev_put() it.
246 */
247 #define for_each_mddev(_mddev,_tmp) \
248 \
249 for (({ spin_lock(&all_mddevs_lock); \
250 _tmp = all_mddevs.next; \
251 _mddev = NULL;}); \
252 ({ if (_tmp != &all_mddevs) \
253 mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
254 spin_unlock(&all_mddevs_lock); \
255 if (_mddev) mddev_put(_mddev); \
256 _mddev = list_entry(_tmp, struct mddev, all_mddevs); \
257 _tmp != &all_mddevs;}); \
258 ({ spin_lock(&all_mddevs_lock); \
259 _tmp = _tmp->next;}) \
260 )
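/*
 * Illustrative sketch of how for_each_mddev() is used elsewhere in this
 * file: 'mddev' holds a reference while the body runs, and code that
 * breaks out early still owns that reference and must mddev_put() it.
 * (Hypothetical helper, shown only to make the contract explicit.)
 */
#if 0
static void example_count_arrays(void)
{
	struct list_head *tmp;
	struct mddev *mddev;
	int n = 0;

	for_each_mddev(mddev, tmp)
		n++;		/* reference is dropped on the next iteration */
	pr_info("md: %d arrays currently registered\n", n);
}
#endif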
261
262 /* Rather than calling directly into the personality make_request function,
263 * IO requests come here first so that we can check if the device is
264 * being suspended pending a reconfiguration.
265 * We hold a refcount over the call to ->make_request. By the time that
266 * call has finished, the bio has been linked into some internal structure
267 * and so is visible to ->quiesce(), so we don't need the refcount any more.
268 */
269 static bool is_suspended(struct mddev *mddev, struct bio *bio)
270 {
271 if (mddev->suspended)
272 return true;
273 if (bio_data_dir(bio) != WRITE)
274 return false;
275 if (mddev->suspend_lo >= mddev->suspend_hi)
276 return false;
277 if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
278 return false;
279 if (bio_end_sector(bio) < mddev->suspend_lo)
280 return false;
281 return true;
282 }
283
284 void md_handle_request(struct mddev *mddev, struct bio *bio)
285 {
286 check_suspended:
287 rcu_read_lock();
288 if (is_suspended(mddev, bio)) {
289 DEFINE_WAIT(__wait);
290 for (;;) {
291 prepare_to_wait(&mddev->sb_wait, &__wait,
292 TASK_UNINTERRUPTIBLE);
293 if (!is_suspended(mddev, bio))
294 break;
295 rcu_read_unlock();
296 schedule();
297 rcu_read_lock();
298 }
299 finish_wait(&mddev->sb_wait, &__wait);
300 }
301 atomic_inc(&mddev->active_io);
302 rcu_read_unlock();
303
304 if (!mddev->pers->make_request(mddev, bio)) {
305 atomic_dec(&mddev->active_io);
306 wake_up(&mddev->sb_wait);
307 goto check_suspended;
308 }
309
310 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
311 wake_up(&mddev->sb_wait);
312 }
313 EXPORT_SYMBOL(md_handle_request);
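/*
 * Illustrative sketch of the ->make_request() contract used above: a
 * personality returns false when it cannot take the bio right now, and
 * md_handle_request() then re-checks the suspend state and retries.
 * (Hypothetical personality method, not one defined in this file.)
 */
#if 0
static bool example_make_request(struct mddev *mddev, struct bio *bio)
{
	if (READ_ONCE(mddev->suspended))
		return false;	/* md_handle_request() will wait and retry */
	/* ...queue the bio so that ->quiesce() can see it... */
	return true;
}
#endif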
314
315 static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
316 {
317 const int rw = bio_data_dir(bio);
318 struct mddev *mddev = q->queuedata;
319 unsigned int sectors;
320 int cpu;
321
322 if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
323 bio_io_error(bio);
324 return BLK_QC_T_NONE;
325 }
326
327 blk_queue_split(q, &bio);
328
329 if (mddev == NULL || mddev->pers == NULL) {
330 bio_io_error(bio);
331 return BLK_QC_T_NONE;
332 }
333 if (mddev->ro == 1 && unlikely(rw == WRITE)) {
334 if (bio_sectors(bio) != 0)
335 bio->bi_status = BLK_STS_IOERR;
336 bio_endio(bio);
337 return BLK_QC_T_NONE;
338 }
339
340 /*
341 * save the sectors now since our bio can
342 * go away inside make_request
343 */
344 sectors = bio_sectors(bio);
345 /* bio could be mergeable after passing to underlayer */
346 bio->bi_opf &= ~REQ_NOMERGE;
347
348 md_handle_request(mddev, bio);
349
350 cpu = part_stat_lock();
351 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
352 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
353 part_stat_unlock();
354
355 return BLK_QC_T_NONE;
356 }
357
358 /* mddev_suspend makes sure no new requests are submitted
359 * to the device, and that any requests that have been submitted
360 * are completely handled.
361 * Once mddev_detach() is called and completes, the module will be
362 * completely unused.
363 */
364 void mddev_suspend(struct mddev *mddev)
365 {
366 WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
367 lockdep_assert_held(&mddev->reconfig_mutex);
368 if (mddev->suspended++)
369 return;
370 synchronize_rcu();
371 wake_up(&mddev->sb_wait);
372 set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
373 smp_mb__after_atomic();
374 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
375 mddev->pers->quiesce(mddev, 1);
376 clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
377 wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));
378
379 del_timer_sync(&mddev->safemode_timer);
380 }
381 EXPORT_SYMBOL_GPL(mddev_suspend);
382
383 void mddev_resume(struct mddev *mddev)
384 {
385 lockdep_assert_held(&mddev->reconfig_mutex);
386 if (--mddev->suspended)
387 return;
388 wake_up(&mddev->sb_wait);
389 mddev->pers->quiesce(mddev, 0);
390
391 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
392 md_wakeup_thread(mddev->thread);
393 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
394 }
395 EXPORT_SYMBOL_GPL(mddev_resume);
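/*
 * Illustrative sketch of the intended pairing: drain and block I/O around
 * a reconfiguration step while reconfig_mutex is held, then restart it.
 * (Hypothetical caller, shown only to make the ordering explicit.)
 */
#if 0
static void example_reconfigure(struct mddev *mddev)
{
	lockdep_assert_held(&mddev->reconfig_mutex);
	mddev_suspend(mddev);	/* waits for active_io to drop to zero */
	/* ...safely change layout or geometry here... */
	mddev_resume(mddev);	/* restarts I/O and kicks off recovery */
}
#endif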
396
397 int mddev_congested(struct mddev *mddev, int bits)
398 {
399 struct md_personality *pers = mddev->pers;
400 int ret = 0;
401
402 rcu_read_lock();
403 if (mddev->suspended)
404 ret = 1;
405 else if (pers && pers->congested)
406 ret = pers->congested(mddev, bits);
407 rcu_read_unlock();
408 return ret;
409 }
410 EXPORT_SYMBOL_GPL(mddev_congested);
411 static int md_congested(void *data, int bits)
412 {
413 struct mddev *mddev = data;
414 return mddev_congested(mddev, bits);
415 }
416
417 /*
418 * Generic flush handling for md
419 */
420
421 static void md_end_flush(struct bio *bio)
422 {
423 struct md_rdev *rdev = bio->bi_private;
424 struct mddev *mddev = rdev->mddev;
425
426 rdev_dec_pending(rdev, mddev);
427
428 if (atomic_dec_and_test(&mddev->flush_pending)) {
429 /* The pre-request flush has finished */
430 queue_work(md_wq, &mddev->flush_work);
431 }
432 bio_put(bio);
433 }
434
435 static void md_submit_flush_data(struct work_struct *ws);
436
437 static void submit_flushes(struct work_struct *ws)
438 {
439 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
440 struct md_rdev *rdev;
441
442 mddev->start_flush = ktime_get_boottime();
443 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
444 atomic_set(&mddev->flush_pending, 1);
445 rcu_read_lock();
446 rdev_for_each_rcu(rdev, mddev)
447 if (rdev->raid_disk >= 0 &&
448 !test_bit(Faulty, &rdev->flags)) {
449 /* Take two references: one is dropped
450 * when the request finishes, the other after
451 * we re-take the rcu_read_lock
452 */
453 struct bio *bi;
454 atomic_inc(&rdev->nr_pending);
455 atomic_inc(&rdev->nr_pending);
456 rcu_read_unlock();
457 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
458 bi->bi_end_io = md_end_flush;
459 bi->bi_private = rdev;
460 bio_set_dev(bi, rdev->bdev);
461 bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
462 atomic_inc(&mddev->flush_pending);
463 submit_bio(bi);
464 rcu_read_lock();
465 rdev_dec_pending(rdev, mddev);
466 }
467 rcu_read_unlock();
468 if (atomic_dec_and_test(&mddev->flush_pending))
469 queue_work(md_wq, &mddev->flush_work);
470 }
471
472 static void md_submit_flush_data(struct work_struct *ws)
473 {
474 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
475 struct bio *bio = mddev->flush_bio;
476
477 /*
478 * flush_bio must be reset before calling into md_handle_request to avoid
479 * a deadlock: other bios that already passed the suspend check could be
480 * waiting for this flush, while the md_handle_request call below could in
481 * turn wait for those bios because of that same suspend check.
482 */
483 mddev->last_flush = mddev->start_flush;
484 mddev->flush_bio = NULL;
485 wake_up(&mddev->sb_wait);
486
487 if (bio->bi_iter.bi_size == 0)
488 /* an empty barrier - all done */
489 bio_endio(bio);
490 else {
491 bio->bi_opf &= ~REQ_PREFLUSH;
492 md_handle_request(mddev, bio);
493 }
494 }
495
496 /*
497 * Manages consolidation of flushes and submitting any flushes needed for
498 * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is
499 * being finished in another context. Returns false if the flushing is
500 * complete but still needs the I/O portion of the bio to be processed.
501 */
502 bool md_flush_request(struct mddev *mddev, struct bio *bio)
503 {
504 ktime_t start = ktime_get_boottime();
505 spin_lock_irq(&mddev->lock);
506 wait_event_lock_irq(mddev->sb_wait,
507 !mddev->flush_bio ||
508 ktime_after(mddev->last_flush, start),
509 mddev->lock);
510 if (!ktime_after(mddev->last_flush, start)) {
511 WARN_ON(mddev->flush_bio);
512 mddev->flush_bio = bio;
513 bio = NULL;
514 }
515 spin_unlock_irq(&mddev->lock);
516
517 if (!bio) {
518 INIT_WORK(&mddev->flush_work, submit_flushes);
519 queue_work(md_wq, &mddev->flush_work);
520 } else {
521 /* flush was performed for some other bio while we waited. */
522 if (bio->bi_iter.bi_size == 0)
523 /* an empty barrier - all done */
524 bio_endio(bio);
525 else {
526 bio->bi_opf &= ~REQ_PREFLUSH;
527 return false;
528 }
529 }
530 return true;
531 }
532 EXPORT_SYMBOL(md_flush_request);
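/*
 * Illustrative sketch of how a personality consumes the return value of
 * md_flush_request(): true means the bio is already owned by the flush
 * machinery, false means the PREFLUSH part is done and the data portion
 * still has to be handled by the caller.  (Hypothetical handler.)
 */
#if 0
static bool example_handle_bio(struct mddev *mddev, struct bio *bio)
{
	if (unlikely(bio->bi_opf & REQ_PREFLUSH) &&
	    md_flush_request(mddev, bio))
		return true;	/* finished, or will be finished, elsewhere */
	/* ...process the data portion of the bio here... */
	return true;
}
#endif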
533
534 static inline struct mddev *mddev_get(struct mddev *mddev)
535 {
536 atomic_inc(&mddev->active);
537 return mddev;
538 }
539
540 static void mddev_delayed_delete(struct work_struct *ws);
541
542 static void mddev_put(struct mddev *mddev)
543 {
544 struct bio_set *bs = NULL, *sync_bs = NULL;
545
546 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
547 return;
548 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
549 mddev->ctime == 0 && !mddev->hold_active) {
550 /* Array is not configured at all, and not held active,
551 * so destroy it */
552 list_del_init(&mddev->all_mddevs);
553 bs = mddev->bio_set;
554 sync_bs = mddev->sync_set;
555 mddev->bio_set = NULL;
556 mddev->sync_set = NULL;
557 if (mddev->gendisk) {
558 /* We did a probe so need to clean up. Call
559 * queue_work inside the spinlock so that
560 * flush_workqueue() after mddev_find will
561 * succeed in waiting for the work to be done.
562 */
563 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
564 queue_work(md_misc_wq, &mddev->del_work);
565 } else
566 kfree(mddev);
567 }
568 spin_unlock(&all_mddevs_lock);
569 if (bs)
570 bioset_free(bs);
571 if (sync_bs)
572 bioset_free(sync_bs);
573 }
574
575 static void md_safemode_timeout(struct timer_list *t);
576
577 void mddev_init(struct mddev *mddev)
578 {
579 mutex_init(&mddev->open_mutex);
580 mutex_init(&mddev->reconfig_mutex);
581 mutex_init(&mddev->bitmap_info.mutex);
582 INIT_LIST_HEAD(&mddev->disks);
583 INIT_LIST_HEAD(&mddev->all_mddevs);
584 timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
585 atomic_set(&mddev->active, 1);
586 atomic_set(&mddev->openers, 0);
587 atomic_set(&mddev->active_io, 0);
588 spin_lock_init(&mddev->lock);
589 atomic_set(&mddev->flush_pending, 0);
590 init_waitqueue_head(&mddev->sb_wait);
591 init_waitqueue_head(&mddev->recovery_wait);
592 mddev->reshape_position = MaxSector;
593 mddev->reshape_backwards = 0;
594 mddev->last_sync_action = "none";
595 mddev->resync_min = 0;
596 mddev->resync_max = MaxSector;
597 mddev->level = LEVEL_NONE;
598 }
599 EXPORT_SYMBOL_GPL(mddev_init);
600
601 static struct mddev *mddev_find(dev_t unit)
602 {
603 struct mddev *mddev, *new = NULL;
604
605 if (unit && MAJOR(unit) != MD_MAJOR)
606 unit &= ~((1<<MdpMinorShift)-1);
607
608 retry:
609 spin_lock(&all_mddevs_lock);
610
611 if (unit) {
612 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
613 if (mddev->unit == unit) {
614 mddev_get(mddev);
615 spin_unlock(&all_mddevs_lock);
616 kfree(new);
617 return mddev;
618 }
619
620 if (new) {
621 list_add(&new->all_mddevs, &all_mddevs);
622 spin_unlock(&all_mddevs_lock);
623 new->hold_active = UNTIL_IOCTL;
624 return new;
625 }
626 } else if (new) {
627 /* find an unused unit number */
628 static int next_minor = 512;
629 int start = next_minor;
630 int is_free = 0;
631 int dev = 0;
632 while (!is_free) {
633 dev = MKDEV(MD_MAJOR, next_minor);
634 next_minor++;
635 if (next_minor > MINORMASK)
636 next_minor = 0;
637 if (next_minor == start) {
638 /* Oh dear, all in use. */
639 spin_unlock(&all_mddevs_lock);
640 kfree(new);
641 return NULL;
642 }
643
644 is_free = 1;
645 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
646 if (mddev->unit == dev) {
647 is_free = 0;
648 break;
649 }
650 }
651 new->unit = dev;
652 new->md_minor = MINOR(dev);
653 new->hold_active = UNTIL_STOP;
654 list_add(&new->all_mddevs, &all_mddevs);
655 spin_unlock(&all_mddevs_lock);
656 return new;
657 }
658 spin_unlock(&all_mddevs_lock);
659
660 new = kzalloc(sizeof(*new), GFP_KERNEL);
661 if (!new)
662 return NULL;
663
664 new->unit = unit;
665 if (MAJOR(unit) == MD_MAJOR)
666 new->md_minor = MINOR(unit);
667 else
668 new->md_minor = MINOR(unit) >> MdpMinorShift;
669
670 mddev_init(new);
671
672 goto retry;
673 }
674
675 static struct attribute_group md_redundancy_group;
676
677 void mddev_unlock(struct mddev *mddev)
678 {
679 if (mddev->to_remove) {
680 /* These cannot be removed under reconfig_mutex as
681 * an access to the files will try to take reconfig_mutex
682 * while holding the file unremovable, which leads to
683 * a deadlock.
684 * So keep sysfs_active set while the removal is happening,
685 * and anything else which might set ->to_remove or
686 * otherwise change the sysfs namespace will fail with
687 * -EBUSY if sysfs_active is still set.
688 * We set sysfs_active under reconfig_mutex and elsewhere
689 * test it under the same mutex to ensure its correct value
690 * is seen.
691 */
692 struct attribute_group *to_remove = mddev->to_remove;
693 mddev->to_remove = NULL;
694 mddev->sysfs_active = 1;
695 mutex_unlock(&mddev->reconfig_mutex);
696
697 if (mddev->kobj.sd) {
698 if (to_remove != &md_redundancy_group)
699 sysfs_remove_group(&mddev->kobj, to_remove);
700 if (mddev->pers == NULL ||
701 mddev->pers->sync_request == NULL) {
702 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
703 if (mddev->sysfs_action)
704 sysfs_put(mddev->sysfs_action);
705 mddev->sysfs_action = NULL;
706 }
707 }
708 mddev->sysfs_active = 0;
709 } else
710 mutex_unlock(&mddev->reconfig_mutex);
711
712 /* As we've dropped the mutex we need a spinlock to
713 * make sure the thread doesn't disappear
714 */
715 spin_lock(&pers_lock);
716 md_wakeup_thread(mddev->thread);
717 wake_up(&mddev->sb_wait);
718 spin_unlock(&pers_lock);
719 }
720 EXPORT_SYMBOL_GPL(mddev_unlock);
721
722 struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
723 {
724 struct md_rdev *rdev;
725
726 rdev_for_each_rcu(rdev, mddev)
727 if (rdev->desc_nr == nr)
728 return rdev;
729
730 return NULL;
731 }
732 EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);
733
734 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
735 {
736 struct md_rdev *rdev;
737
738 rdev_for_each(rdev, mddev)
739 if (rdev->bdev->bd_dev == dev)
740 return rdev;
741
742 return NULL;
743 }
744
745 static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
746 {
747 struct md_rdev *rdev;
748
749 rdev_for_each_rcu(rdev, mddev)
750 if (rdev->bdev->bd_dev == dev)
751 return rdev;
752
753 return NULL;
754 }
755
756 static struct md_personality *find_pers(int level, char *clevel)
757 {
758 struct md_personality *pers;
759 list_for_each_entry(pers, &pers_list, list) {
760 if (level != LEVEL_NONE && pers->level == level)
761 return pers;
762 if (strcmp(pers->name, clevel)==0)
763 return pers;
764 }
765 return NULL;
766 }
767
768 /* return the offset of the super block in 512byte sectors */
769 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
770 {
771 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
772 return MD_NEW_SIZE_SECTORS(num_sectors);
773 }
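/*
 * Worked example (assuming MD_RESERVED_SECTORS is 128, i.e. 64K, as defined
 * in md_p.h): for a 1000000-sector device, MD_NEW_SIZE_SECTORS rounds down
 * to 999936 and subtracts 128, placing the 0.90 superblock at sector 999808.
 */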
774
775 static int alloc_disk_sb(struct md_rdev *rdev)
776 {
777 rdev->sb_page = alloc_page(GFP_KERNEL);
778 if (!rdev->sb_page)
779 return -ENOMEM;
780 return 0;
781 }
782
783 void md_rdev_clear(struct md_rdev *rdev)
784 {
785 if (rdev->sb_page) {
786 put_page(rdev->sb_page);
787 rdev->sb_loaded = 0;
788 rdev->sb_page = NULL;
789 rdev->sb_start = 0;
790 rdev->sectors = 0;
791 }
792 if (rdev->bb_page) {
793 put_page(rdev->bb_page);
794 rdev->bb_page = NULL;
795 }
796 badblocks_exit(&rdev->badblocks);
797 }
798 EXPORT_SYMBOL_GPL(md_rdev_clear);
799
800 static void super_written(struct bio *bio)
801 {
802 struct md_rdev *rdev = bio->bi_private;
803 struct mddev *mddev = rdev->mddev;
804
805 if (bio->bi_status) {
806 pr_err("md: super_written gets error=%d\n", bio->bi_status);
807 md_error(mddev, rdev);
808 if (!test_bit(Faulty, &rdev->flags)
809 && (bio->bi_opf & MD_FAILFAST)) {
810 set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
811 set_bit(LastDev, &rdev->flags);
812 }
813 } else
814 clear_bit(LastDev, &rdev->flags);
815
816 if (atomic_dec_and_test(&mddev->pending_writes))
817 wake_up(&mddev->sb_wait);
818 rdev_dec_pending(rdev, mddev);
819 bio_put(bio);
820 }
821
822 void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
823 sector_t sector, int size, struct page *page)
824 {
825 /* write the first 'size' bytes of 'page' to 'sector' of rdev.
826 * Increment mddev->pending_writes before returning
827 * and decrement it on completion, waking up sb_wait
828 * if zero is reached.
829 * If an error occurred, call md_error
830 */
831 struct bio *bio;
832 int ff = 0;
833
834 if (!page)
835 return;
836
837 if (test_bit(Faulty, &rdev->flags))
838 return;
839
840 bio = md_bio_alloc_sync(mddev);
841
842 atomic_inc(&rdev->nr_pending);
843
844 bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
845 bio->bi_iter.bi_sector = sector;
846 bio_add_page(bio, page, size, 0);
847 bio->bi_private = rdev;
848 bio->bi_end_io = super_written;
849
850 if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
851 test_bit(FailFast, &rdev->flags) &&
852 !test_bit(LastDev, &rdev->flags))
853 ff = MD_FAILFAST;
854 bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;
855
856 atomic_inc(&mddev->pending_writes);
857 submit_bio(bio);
858 }
859
860 int md_super_wait(struct mddev *mddev)
861 {
862 /* wait for all superblock writes that were scheduled to complete */
863 wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
864 if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
865 return -EAGAIN;
866 return 0;
867 }
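/*
 * Illustrative sketch of the pattern md_super_write()/md_super_wait() are
 * built for: queue a superblock write to every device, then wait for all
 * of them to complete.  (Hypothetical loop; the real superblock update
 * logic lives further down in md.c.)
 */
#if 0
static int example_write_all_sbs(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		md_super_write(mddev, rdev, rdev->sb_start,
			       rdev->sb_size, rdev->sb_page);
	return md_super_wait(mddev);	/* -EAGAIN if a rewrite is needed */
}
#endif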
868
869 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
870 struct page *page, int op, int op_flags, bool metadata_op)
871 {
872 struct bio *bio = md_bio_alloc_sync(rdev->mddev);
873 int ret;
874
875 if (metadata_op && rdev->meta_bdev)
876 bio_set_dev(bio, rdev->meta_bdev);
877 else
878 bio_set_dev(bio, rdev->bdev);
879 bio_set_op_attrs(bio, op, op_flags);
880 if (metadata_op)
881 bio->bi_iter.bi_sector = sector + rdev->sb_start;
882 else if (rdev->mddev->reshape_position != MaxSector &&
883 (rdev->mddev->reshape_backwards ==
884 (sector >= rdev->mddev->reshape_position)))
885 bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
886 else
887 bio->bi_iter.bi_sector = sector + rdev->data_offset;
888 bio_add_page(bio, page, size, 0);
889
890 submit_bio_wait(bio);
891
892 ret = !bio->bi_status;
893 bio_put(bio);
894 return ret;
895 }
896 EXPORT_SYMBOL_GPL(sync_page_io);
897
898 static int read_disk_sb(struct md_rdev *rdev, int size)
899 {
900 char b[BDEVNAME_SIZE];
901
902 if (rdev->sb_loaded)
903 return 0;
904
905 if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true))
906 goto fail;
907 rdev->sb_loaded = 1;
908 return 0;
909
910 fail:
911 pr_err("md: disabled device %s, could not read superblock.\n",
912 bdevname(rdev->bdev,b));
913 return -EINVAL;
914 }
915
916 static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
917 {
918 return sb1->set_uuid0 == sb2->set_uuid0 &&
919 sb1->set_uuid1 == sb2->set_uuid1 &&
920 sb1->set_uuid2 == sb2->set_uuid2 &&
921 sb1->set_uuid3 == sb2->set_uuid3;
922 }
923
924 static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
925 {
926 int ret;
927 mdp_super_t *tmp1, *tmp2;
928
929 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
930 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
931
932 if (!tmp1 || !tmp2) {
933 ret = 0;
934 goto abort;
935 }
936
937 *tmp1 = *sb1;
938 *tmp2 = *sb2;
939
940 /*
941 * nr_disks is not constant
942 */
943 tmp1->nr_disks = 0;
944 tmp2->nr_disks = 0;
945
946 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
947 abort:
948 kfree(tmp1);
949 kfree(tmp2);
950 return ret;
951 }
952
953 static u32 md_csum_fold(u32 csum)
954 {
955 csum = (csum & 0xffff) + (csum >> 16);
956 return (csum & 0xffff) + (csum >> 16);
957 }
958
959 static unsigned int calc_sb_csum(mdp_super_t *sb)
960 {
961 u64 newcsum = 0;
962 u32 *sb32 = (u32*)sb;
963 int i;
964 unsigned int disk_csum, csum;
965
966 disk_csum = sb->sb_csum;
967 sb->sb_csum = 0;
968
969 for (i = 0; i < MD_SB_BYTES/4 ; i++)
970 newcsum += sb32[i];
971 csum = (newcsum & 0xffffffff) + (newcsum>>32);
972
973 #ifdef CONFIG_ALPHA
974 /* This used to use csum_partial, which was wrong for several
975 * reasons including that different results are returned on
976 * different architectures. It isn't critical that we get exactly
977 * the same return value as before (we always csum_fold before
978 * testing, and that removes any differences). However as we
979 * know that csum_partial always returned a 16bit value on
980 * alphas, do a fold to maximise conformity to previous behaviour.
981 */
982 sb->sb_csum = md_csum_fold(disk_csum);
983 #else
984 sb->sb_csum = disk_csum;
985 #endif
986 return csum;
987 }
988
989 /*
990 * Handle superblock details.
991 * We want to be able to handle multiple superblock formats
992 * so we have a common interface to them all, and an array of
993 * different handlers.
994 * We rely on user-space to write the initial superblock, and support
995 * reading and updating of superblocks.
996 * Interface methods are:
997 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
998 * loads and validates a superblock on dev.
999 * if refdev != NULL, compare superblocks on both devices
1000 * Return:
1001 * 0 - dev has a superblock that is compatible with refdev
1002 * 1 - dev has a superblock that is compatible and newer than refdev
1003 * so dev should be used as the refdev in future
1004 * -EINVAL superblock incompatible or invalid
1005 * -othererror e.g. -EIO
1006 *
1007 * int validate_super(struct mddev *mddev, struct md_rdev *dev)
1008 * Verify that dev is acceptable into mddev.
1009 * The first time, mddev->raid_disks will be 0, and data from
1010 * dev should be merged in. Subsequent calls check that dev
1011 * is new enough. Return 0 or -EINVAL
1012 *
1013 * void sync_super(struct mddev *mddev, struct md_rdev *dev)
1014 * Update the superblock for rdev with data in mddev
1015 * This does not write to disc.
1016 *
1017 */
1018
1019 struct super_type {
1020 char *name;
1021 struct module *owner;
1022 int (*load_super)(struct md_rdev *rdev,
1023 struct md_rdev *refdev,
1024 int minor_version);
1025 int (*validate_super)(struct mddev *mddev,
1026 struct md_rdev *rdev);
1027 void (*sync_super)(struct mddev *mddev,
1028 struct md_rdev *rdev);
1029 unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
1030 sector_t num_sectors);
1031 int (*allow_new_offset)(struct md_rdev *rdev,
1032 unsigned long long new_offset);
1033 };
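/*
 * Illustrative sketch of how the load_super() return convention described
 * above is typically used during assembly: the newest superblock becomes
 * the reference for validating the remaining devices.  (Hypothetical
 * helper, loosely modelled on the assembly logic later in md.c; the
 * super_types[] table is defined further down in this file.)
 */
#if 0
static struct md_rdev *example_pick_freshest(struct mddev *mddev, int vers)
{
	struct md_rdev *rdev, *freshest = NULL;

	rdev_for_each(rdev, mddev) {
		int ret = super_types[vers].load_super(rdev, freshest, vers);

		if (ret == 1)
			freshest = rdev;	/* newer: new reference */
		else if (ret < 0)
			pr_warn("md: invalid superblock, device ignored\n");
		/* ret == 0: compatible with the current reference */
	}
	return freshest;
}
#endif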
1034
1035 /*
1036 * Check that the given mddev has no bitmap.
1037 *
1038 * This function is called from the run method of all personalities that do not
1039 * support bitmaps. It prints an error message and returns non-zero if mddev
1040 * has a bitmap. Otherwise, it returns 0.
1041 *
1042 */
1043 int md_check_no_bitmap(struct mddev *mddev)
1044 {
1045 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1046 return 0;
1047 pr_warn("%s: bitmaps are not supported for %s\n",
1048 mdname(mddev), mddev->pers->name);
1049 return 1;
1050 }
1051 EXPORT_SYMBOL(md_check_no_bitmap);
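/*
 * Illustrative sketch: a personality without bitmap support calls the
 * helper above at the top of its ->run() method and refuses to start if a
 * bitmap is configured.  (Hypothetical ->run(); raid0, for instance,
 * follows this pattern.)
 */
#if 0
static int example_run(struct mddev *mddev)
{
	if (md_check_no_bitmap(mddev))
		return -EINVAL;
	/* ...normal personality setup continues here... */
	return 0;
}
#endif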
1052
1053 /*
1054 * load_super for 0.90.0
1055 */
1056 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1057 {
1058 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1059 mdp_super_t *sb;
1060 int ret;
1061
1062 /*
1063 * Calculate the position of the superblock (512byte sectors),
1064 * it's at the end of the disk.
1065 *
1066 * It also happens to be a multiple of 4Kb.
1067 */
1068 rdev->sb_start = calc_dev_sboffset(rdev);
1069
1070 ret = read_disk_sb(rdev, MD_SB_BYTES);
1071 if (ret)
1072 return ret;
1073
1074 ret = -EINVAL;
1075
1076 bdevname(rdev->bdev, b);
1077 sb = page_address(rdev->sb_page);
1078
1079 if (sb->md_magic != MD_SB_MAGIC) {
1080 pr_warn("md: invalid raid superblock magic on %s\n", b);
1081 goto abort;
1082 }
1083
1084 if (sb->major_version != 0 ||
1085 sb->minor_version < 90 ||
1086 sb->minor_version > 91) {
1087 pr_warn("Bad version number %d.%d on %s\n",
1088 sb->major_version, sb->minor_version, b);
1089 goto abort;
1090 }
1091
1092 if (sb->raid_disks <= 0)
1093 goto abort;
1094
1095 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1096 pr_warn("md: invalid superblock checksum on %s\n", b);
1097 goto abort;
1098 }
1099
1100 rdev->preferred_minor = sb->md_minor;
1101 rdev->data_offset = 0;
1102 rdev->new_data_offset = 0;
1103 rdev->sb_size = MD_SB_BYTES;
1104 rdev->badblocks.shift = -1;
1105
1106 if (sb->level == LEVEL_MULTIPATH)
1107 rdev->desc_nr = -1;
1108 else
1109 rdev->desc_nr = sb->this_disk.number;
1110
1111 if (!refdev) {
1112 ret = 1;
1113 } else {
1114 __u64 ev1, ev2;
1115 mdp_super_t *refsb = page_address(refdev->sb_page);
1116 if (!md_uuid_equal(refsb, sb)) {
1117 pr_warn("md: %s has different UUID to %s\n",
1118 b, bdevname(refdev->bdev,b2));
1119 goto abort;
1120 }
1121 if (!md_sb_equal(refsb, sb)) {
1122 pr_warn("md: %s has same UUID but different superblock to %s\n",
1123 b, bdevname(refdev->bdev, b2));
1124 goto abort;
1125 }
1126 ev1 = md_event(sb);
1127 ev2 = md_event(refsb);
1128 if (ev1 > ev2)
1129 ret = 1;
1130 else
1131 ret = 0;
1132 }
1133 rdev->sectors = rdev->sb_start;
1134 /* Limit to 4TB as metadata cannot record more than that.
1135 * (not needed for Linear and RAID0 as metadata doesn't
1136 * record this size)
1137 */
1138 if (IS_ENABLED(CONFIG_LBDAF) && (u64)rdev->sectors >= (2ULL << 32) &&
1139 sb->level >= 1)
1140 rdev->sectors = (sector_t)(2ULL << 32) - 2;
1141
1142 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1143 /* "this cannot possibly happen" ... */
1144 ret = -EINVAL;
1145
1146 abort:
1147 return ret;
1148 }
1149
1150 /*
1151 * validate_super for 0.90.0
1152 */
1153 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1154 {
1155 mdp_disk_t *desc;
1156 mdp_super_t *sb = page_address(rdev->sb_page);
1157 __u64 ev1 = md_event(sb);
1158
1159 rdev->raid_disk = -1;
1160 clear_bit(Faulty, &rdev->flags);
1161 clear_bit(In_sync, &rdev->flags);
1162 clear_bit(Bitmap_sync, &rdev->flags);
1163 clear_bit(WriteMostly, &rdev->flags);
1164
1165 if (mddev->raid_disks == 0) {
1166 mddev->major_version = 0;
1167 mddev->minor_version = sb->minor_version;
1168 mddev->patch_version = sb->patch_version;
1169 mddev->external = 0;
1170 mddev->chunk_sectors = sb->chunk_size >> 9;
1171 mddev->ctime = sb->ctime;
1172 mddev->utime = sb->utime;
1173 mddev->level = sb->level;
1174 mddev->clevel[0] = 0;
1175 mddev->layout = sb->layout;
1176 mddev->raid_disks = sb->raid_disks;
1177 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1178 mddev->events = ev1;
1179 mddev->bitmap_info.offset = 0;
1180 mddev->bitmap_info.space = 0;
1181 /* bitmap can use 60 K after the 4K superblocks */
1182 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1183 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1184 mddev->reshape_backwards = 0;
1185
1186 if (mddev->minor_version >= 91) {
1187 mddev->reshape_position = sb->reshape_position;
1188 mddev->delta_disks = sb->delta_disks;
1189 mddev->new_level = sb->new_level;
1190 mddev->new_layout = sb->new_layout;
1191 mddev->new_chunk_sectors = sb->new_chunk >> 9;
1192 if (mddev->delta_disks < 0)
1193 mddev->reshape_backwards = 1;
1194 } else {
1195 mddev->reshape_position = MaxSector;
1196 mddev->delta_disks = 0;
1197 mddev->new_level = mddev->level;
1198 mddev->new_layout = mddev->layout;
1199 mddev->new_chunk_sectors = mddev->chunk_sectors;
1200 }
1201
1202 if (sb->state & (1<<MD_SB_CLEAN))
1203 mddev->recovery_cp = MaxSector;
1204 else {
1205 if (sb->events_hi == sb->cp_events_hi &&
1206 sb->events_lo == sb->cp_events_lo) {
1207 mddev->recovery_cp = sb->recovery_cp;
1208 } else
1209 mddev->recovery_cp = 0;
1210 }
1211
1212 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1213 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1214 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1215 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1216
1217 mddev->max_disks = MD_SB_DISKS;
1218
1219 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1220 mddev->bitmap_info.file == NULL) {
1221 mddev->bitmap_info.offset =
1222 mddev->bitmap_info.default_offset;
1223 mddev->bitmap_info.space =
1224 mddev->bitmap_info.default_space;
1225 }
1226
1227 } else if (mddev->pers == NULL) {
1228 /* Insist on good event counter while assembling, except
1229 * for spares (which don't need an event count) */
1230 ++ev1;
1231 if (sb->disks[rdev->desc_nr].state & (
1232 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1233 if (ev1 < mddev->events)
1234 return -EINVAL;
1235 } else if (mddev->bitmap) {
1236 /* if adding to array with a bitmap, then we can accept an
1237 * older device ... but not too old.
1238 */
1239 if (ev1 < mddev->bitmap->events_cleared)
1240 return 0;
1241 if (ev1 < mddev->events)
1242 set_bit(Bitmap_sync, &rdev->flags);
1243 } else {
1244 if (ev1 < mddev->events)
1245 /* just a hot-add of a new device, leave raid_disk at -1 */
1246 return 0;
1247 }
1248
1249 if (mddev->level != LEVEL_MULTIPATH) {
1250 desc = sb->disks + rdev->desc_nr;
1251
1252 if (desc->state & (1<<MD_DISK_FAULTY))
1253 set_bit(Faulty, &rdev->flags);
1254 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1255 desc->raid_disk < mddev->raid_disks */) {
1256 set_bit(In_sync, &rdev->flags);
1257 rdev->raid_disk = desc->raid_disk;
1258 rdev->saved_raid_disk = desc->raid_disk;
1259 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1260 /* active but not in sync implies recovery up to
1261 * reshape position. We don't know exactly where
1262 * that is, so set to zero for now */
1263 if (mddev->minor_version >= 91) {
1264 rdev->recovery_offset = 0;
1265 rdev->raid_disk = desc->raid_disk;
1266 }
1267 }
1268 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1269 set_bit(WriteMostly, &rdev->flags);
1270 if (desc->state & (1<<MD_DISK_FAILFAST))
1271 set_bit(FailFast, &rdev->flags);
1272 } else /* MULTIPATH are always insync */
1273 set_bit(In_sync, &rdev->flags);
1274 return 0;
1275 }
1276
1277 /*
1278 * sync_super for 0.90.0
1279 */
1280 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1281 {
1282 mdp_super_t *sb;
1283 struct md_rdev *rdev2;
1284 int next_spare = mddev->raid_disks;
1285
1286 /* make rdev->sb match mddev data..
1287 *
1288 * 1/ zero out disks
1289 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1290 * 3/ any empty disks < next_spare become removed
1291 *
1292 * disks[0] gets initialised to REMOVED because
1293 * we cannot be sure from other fields if it has
1294 * been initialised or not.
1295 */
1296 int i;
1297 int active=0, working=0,failed=0,spare=0,nr_disks=0;
1298
1299 rdev->sb_size = MD_SB_BYTES;
1300
1301 sb = page_address(rdev->sb_page);
1302
1303 memset(sb, 0, sizeof(*sb));
1304
1305 sb->md_magic = MD_SB_MAGIC;
1306 sb->major_version = mddev->major_version;
1307 sb->patch_version = mddev->patch_version;
1308 sb->gvalid_words = 0; /* ignored */
1309 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1310 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1311 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1312 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1313
1314 sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1315 sb->level = mddev->level;
1316 sb->size = mddev->dev_sectors / 2;
1317 sb->raid_disks = mddev->raid_disks;
1318 sb->md_minor = mddev->md_minor;
1319 sb->not_persistent = 0;
1320 sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1321 sb->state = 0;
1322 sb->events_hi = (mddev->events>>32);
1323 sb->events_lo = (u32)mddev->events;
1324
1325 if (mddev->reshape_position == MaxSector)
1326 sb->minor_version = 90;
1327 else {
1328 sb->minor_version = 91;
1329 sb->reshape_position = mddev->reshape_position;
1330 sb->new_level = mddev->new_level;
1331 sb->delta_disks = mddev->delta_disks;
1332 sb->new_layout = mddev->new_layout;
1333 sb->new_chunk = mddev->new_chunk_sectors << 9;
1334 }
1335 mddev->minor_version = sb->minor_version;
1336 if (mddev->in_sync)
1337 {
1338 sb->recovery_cp = mddev->recovery_cp;
1339 sb->cp_events_hi = (mddev->events>>32);
1340 sb->cp_events_lo = (u32)mddev->events;
1341 if (mddev->recovery_cp == MaxSector)
1342 sb->state = (1<< MD_SB_CLEAN);
1343 } else
1344 sb->recovery_cp = 0;
1345
1346 sb->layout = mddev->layout;
1347 sb->chunk_size = mddev->chunk_sectors << 9;
1348
1349 if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1350 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1351
1352 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1353 rdev_for_each(rdev2, mddev) {
1354 mdp_disk_t *d;
1355 int desc_nr;
1356 int is_active = test_bit(In_sync, &rdev2->flags);
1357
1358 if (rdev2->raid_disk >= 0 &&
1359 sb->minor_version >= 91)
1360 /* we have nowhere to store the recovery_offset,
1361 * but if it is not below the reshape_position,
1362 * we can piggy-back on that.
1363 */
1364 is_active = 1;
1365 if (rdev2->raid_disk < 0 ||
1366 test_bit(Faulty, &rdev2->flags))
1367 is_active = 0;
1368 if (is_active)
1369 desc_nr = rdev2->raid_disk;
1370 else
1371 desc_nr = next_spare++;
1372 rdev2->desc_nr = desc_nr;
1373 d = &sb->disks[rdev2->desc_nr];
1374 nr_disks++;
1375 d->number = rdev2->desc_nr;
1376 d->major = MAJOR(rdev2->bdev->bd_dev);
1377 d->minor = MINOR(rdev2->bdev->bd_dev);
1378 if (is_active)
1379 d->raid_disk = rdev2->raid_disk;
1380 else
1381 d->raid_disk = rdev2->desc_nr; /* compatibility */
1382 if (test_bit(Faulty, &rdev2->flags))
1383 d->state = (1<<MD_DISK_FAULTY);
1384 else if (is_active) {
1385 d->state = (1<<MD_DISK_ACTIVE);
1386 if (test_bit(In_sync, &rdev2->flags))
1387 d->state |= (1<<MD_DISK_SYNC);
1388 active++;
1389 working++;
1390 } else {
1391 d->state = 0;
1392 spare++;
1393 working++;
1394 }
1395 if (test_bit(WriteMostly, &rdev2->flags))
1396 d->state |= (1<<MD_DISK_WRITEMOSTLY);
1397 if (test_bit(FailFast, &rdev2->flags))
1398 d->state |= (1<<MD_DISK_FAILFAST);
1399 }
1400 /* now set the "removed" and "faulty" bits on any missing devices */
1401 for (i=0 ; i < mddev->raid_disks ; i++) {
1402 mdp_disk_t *d = &sb->disks[i];
1403 if (d->state == 0 && d->number == 0) {
1404 d->number = i;
1405 d->raid_disk = i;
1406 d->state = (1<<MD_DISK_REMOVED);
1407 d->state |= (1<<MD_DISK_FAULTY);
1408 failed++;
1409 }
1410 }
1411 sb->nr_disks = nr_disks;
1412 sb->active_disks = active;
1413 sb->working_disks = working;
1414 sb->failed_disks = failed;
1415 sb->spare_disks = spare;
1416
1417 sb->this_disk = sb->disks[rdev->desc_nr];
1418 sb->sb_csum = calc_sb_csum(sb);
1419 }
1420
1421 /*
1422 * rdev_size_change for 0.90.0
1423 */
1424 static unsigned long long
1425 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1426 {
1427 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1428 return 0; /* component must fit device */
1429 if (rdev->mddev->bitmap_info.offset)
1430 return 0; /* can't move bitmap */
1431 rdev->sb_start = calc_dev_sboffset(rdev);
1432 if (!num_sectors || num_sectors > rdev->sb_start)
1433 num_sectors = rdev->sb_start;
1434 /* Limit to 4TB as metadata cannot record more than that.
1435 * 4TB == 2^32 KB, or 2*2^32 sectors.
1436 */
1437 if (IS_ENABLED(CONFIG_LBDAF) && (u64)num_sectors >= (2ULL << 32) &&
1438 rdev->mddev->level >= 1)
1439 num_sectors = (sector_t)(2ULL << 32) - 2;
1440 do {
1441 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1442 rdev->sb_page);
1443 } while (md_super_wait(rdev->mddev) < 0);
1444 return num_sectors;
1445 }
1446
1447 static int
1448 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1449 {
1450 /* non-zero offset changes not possible with v0.90 */
1451 return new_offset == 0;
1452 }
1453
1454 /*
1455 * version 1 superblock
1456 */
1457
1458 static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
1459 {
1460 __le32 disk_csum;
1461 u32 csum;
1462 unsigned long long newcsum;
1463 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1464 __le32 *isuper = (__le32*)sb;
1465
1466 disk_csum = sb->sb_csum;
1467 sb->sb_csum = 0;
1468 newcsum = 0;
1469 for (; size >= 4; size -= 4)
1470 newcsum += le32_to_cpu(*isuper++);
1471
1472 if (size == 2)
1473 newcsum += le16_to_cpu(*(__le16*) isuper);
1474
1475 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1476 sb->sb_csum = disk_csum;
1477 return cpu_to_le32(csum);
1478 }
1479
1480 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1481 {
1482 struct mdp_superblock_1 *sb;
1483 int ret;
1484 sector_t sb_start;
1485 sector_t sectors;
1486 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1487 int bmask;
1488
1489 /*
1490 * Calculate the position of the superblock in 512byte sectors.
1491 * It is always aligned to a 4K boundary and
1492 * depending on minor_version, it can be:
1493 * 0: At least 8K, but less than 12K, from end of device
1494 * 1: At start of device
1495 * 2: 4K from start of device.
1496 */
1497 switch(minor_version) {
1498 case 0:
1499 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1500 sb_start -= 8*2;
1501 sb_start &= ~(sector_t)(4*2-1);
1502 break;
1503 case 1:
1504 sb_start = 0;
1505 break;
1506 case 2:
1507 sb_start = 8;
1508 break;
1509 default:
1510 return -EINVAL;
1511 }
1512 rdev->sb_start = sb_start;
1513
1514 /* superblock is rarely larger than 1K, but it can be larger,
1515 * and it is safe to read 4k, so we do that
1516 */
1517 ret = read_disk_sb(rdev, 4096);
1518 if (ret) return ret;
1519
1520 sb = page_address(rdev->sb_page);
1521
1522 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1523 sb->major_version != cpu_to_le32(1) ||
1524 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1525 le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1526 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1527 return -EINVAL;
1528
1529 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1530 pr_warn("md: invalid superblock checksum on %s\n",
1531 bdevname(rdev->bdev,b));
1532 return -EINVAL;
1533 }
1534 if (le64_to_cpu(sb->data_size) < 10) {
1535 pr_warn("md: data_size too small on %s\n",
1536 bdevname(rdev->bdev,b));
1537 return -EINVAL;
1538 }
1539 if (sb->pad0 ||
1540 sb->pad3[0] ||
1541 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1542 /* Some padding is non-zero, might be a new feature */
1543 return -EINVAL;
1544
1545 rdev->preferred_minor = 0xffff;
1546 rdev->data_offset = le64_to_cpu(sb->data_offset);
1547 rdev->new_data_offset = rdev->data_offset;
1548 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1549 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1550 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1551 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1552
1553 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1554 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1555 if (rdev->sb_size & bmask)
1556 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1557
1558 if (minor_version
1559 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1560 return -EINVAL;
1561 if (minor_version
1562 && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1563 return -EINVAL;
1564
1565 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1566 rdev->desc_nr = -1;
1567 else
1568 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1569
1570 if (!rdev->bb_page) {
1571 rdev->bb_page = alloc_page(GFP_KERNEL);
1572 if (!rdev->bb_page)
1573 return -ENOMEM;
1574 }
1575 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1576 rdev->badblocks.count == 0) {
1577 /* need to load the bad block list.
1578 * Currently we limit it to one page.
1579 */
1580 s32 offset;
1581 sector_t bb_sector;
1582 u64 *bbp;
1583 int i;
1584 int sectors = le16_to_cpu(sb->bblog_size);
1585 if (sectors > (PAGE_SIZE / 512))
1586 return -EINVAL;
1587 offset = le32_to_cpu(sb->bblog_offset);
1588 if (offset == 0)
1589 return -EINVAL;
1590 bb_sector = (long long)offset;
1591 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1592 rdev->bb_page, REQ_OP_READ, 0, true))
1593 return -EIO;
1594 bbp = (u64 *)page_address(rdev->bb_page);
1595 rdev->badblocks.shift = sb->bblog_shift;
1596 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1597 u64 bb = le64_to_cpu(*bbp);
1598 int count = bb & (0x3ff);
1599 u64 sector = bb >> 10;
1600 sector <<= sb->bblog_shift;
1601 count <<= sb->bblog_shift;
1602 if (bb + 1 == 0)
1603 break;
1604 if (badblocks_set(&rdev->badblocks, sector, count, 1))
1605 return -EINVAL;
1606 }
1607 } else if (sb->bblog_offset != 0)
1608 rdev->badblocks.shift = 0;
1609
1610 if ((le32_to_cpu(sb->feature_map) &
1611 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
1612 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1613 rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1614 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1615 }
1616
1617 if (!refdev) {
1618 ret = 1;
1619 } else {
1620 __u64 ev1, ev2;
1621 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1622
1623 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1624 sb->level != refsb->level ||
1625 sb->layout != refsb->layout ||
1626 sb->chunksize != refsb->chunksize) {
1627 pr_warn("md: %s has strangely different superblock to %s\n",
1628 bdevname(rdev->bdev,b),
1629 bdevname(refdev->bdev,b2));
1630 return -EINVAL;
1631 }
1632 ev1 = le64_to_cpu(sb->events);
1633 ev2 = le64_to_cpu(refsb->events);
1634
1635 if (ev1 > ev2)
1636 ret = 1;
1637 else
1638 ret = 0;
1639 }
1640 if (minor_version) {
1641 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1642 sectors -= rdev->data_offset;
1643 } else
1644 sectors = rdev->sb_start;
1645 if (sectors < le64_to_cpu(sb->data_size))
1646 return -EINVAL;
1647 rdev->sectors = le64_to_cpu(sb->data_size);
1648 return ret;
1649 }
1650
1651 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1652 {
1653 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1654 __u64 ev1 = le64_to_cpu(sb->events);
1655
1656 rdev->raid_disk = -1;
1657 clear_bit(Faulty, &rdev->flags);
1658 clear_bit(In_sync, &rdev->flags);
1659 clear_bit(Bitmap_sync, &rdev->flags);
1660 clear_bit(WriteMostly, &rdev->flags);
1661
1662 if (mddev->raid_disks == 0) {
1663 mddev->major_version = 1;
1664 mddev->patch_version = 0;
1665 mddev->external = 0;
1666 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1667 mddev->ctime = le64_to_cpu(sb->ctime);
1668 mddev->utime = le64_to_cpu(sb->utime);
1669 mddev->level = le32_to_cpu(sb->level);
1670 mddev->clevel[0] = 0;
1671 mddev->layout = le32_to_cpu(sb->layout);
1672 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1673 mddev->dev_sectors = le64_to_cpu(sb->size);
1674 mddev->events = ev1;
1675 mddev->bitmap_info.offset = 0;
1676 mddev->bitmap_info.space = 0;
1677 /* Default location for bitmap is 1K after superblock
1678 * using 3K - total of 4K
1679 */
1680 mddev->bitmap_info.default_offset = 1024 >> 9;
1681 mddev->bitmap_info.default_space = (4096-1024) >> 9;
1682 mddev->reshape_backwards = 0;
1683
1684 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1685 memcpy(mddev->uuid, sb->set_uuid, 16);
1686
1687 mddev->max_disks = (4096-256)/2;
1688
1689 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1690 mddev->bitmap_info.file == NULL) {
1691 mddev->bitmap_info.offset =
1692 (__s32)le32_to_cpu(sb->bitmap_offset);
1693 /* Metadata doesn't record how much space is available.
1694 * For 1.0, we assume we can use up to the superblock
1695 * if the bitmap is before it, else up to 4K beyond the superblock.
1696 * For others, assume no change is possible.
1697 */
1698 if (mddev->minor_version > 0)
1699 mddev->bitmap_info.space = 0;
1700 else if (mddev->bitmap_info.offset > 0)
1701 mddev->bitmap_info.space =
1702 8 - mddev->bitmap_info.offset;
1703 else
1704 mddev->bitmap_info.space =
1705 -mddev->bitmap_info.offset;
1706 }
1707
1708 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1709 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1710 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1711 mddev->new_level = le32_to_cpu(sb->new_level);
1712 mddev->new_layout = le32_to_cpu(sb->new_layout);
1713 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1714 if (mddev->delta_disks < 0 ||
1715 (mddev->delta_disks == 0 &&
1716 (le32_to_cpu(sb->feature_map)
1717 & MD_FEATURE_RESHAPE_BACKWARDS)))
1718 mddev->reshape_backwards = 1;
1719 } else {
1720 mddev->reshape_position = MaxSector;
1721 mddev->delta_disks = 0;
1722 mddev->new_level = mddev->level;
1723 mddev->new_layout = mddev->layout;
1724 mddev->new_chunk_sectors = mddev->chunk_sectors;
1725 }
1726
1727 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
1728 set_bit(MD_HAS_JOURNAL, &mddev->flags);
1729
1730 if (le32_to_cpu(sb->feature_map) &
1731 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
1732 if (le32_to_cpu(sb->feature_map) &
1733 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1734 return -EINVAL;
1735 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1736 (le32_to_cpu(sb->feature_map) &
1737 MD_FEATURE_MULTIPLE_PPLS))
1738 return -EINVAL;
1739 set_bit(MD_HAS_PPL, &mddev->flags);
1740 }
1741 } else if (mddev->pers == NULL) {
1742 /* Insist on a good event counter while assembling, except for
1743 * spares (which don't need an event count) */
1744 ++ev1;
1745 if (rdev->desc_nr >= 0 &&
1746 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1747 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1748 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
1749 if (ev1 < mddev->events)
1750 return -EINVAL;
1751 } else if (mddev->bitmap) {
1752 /* If adding to array with a bitmap, then we can accept an
1753 * older device, but not too old.
1754 */
1755 if (ev1 < mddev->bitmap->events_cleared)
1756 return 0;
1757 if (ev1 < mddev->events)
1758 set_bit(Bitmap_sync, &rdev->flags);
1759 } else {
1760 if (ev1 < mddev->events)
1761 /* just a hot-add of a new device, leave raid_disk at -1 */
1762 return 0;
1763 }
1764 if (mddev->level != LEVEL_MULTIPATH) {
1765 int role;
1766 if (rdev->desc_nr < 0 ||
1767 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1768 role = MD_DISK_ROLE_SPARE;
1769 rdev->desc_nr = -1;
1770 } else
1771 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1772 switch(role) {
1773 case MD_DISK_ROLE_SPARE: /* spare */
1774 break;
1775 case MD_DISK_ROLE_FAULTY: /* faulty */
1776 set_bit(Faulty, &rdev->flags);
1777 break;
1778 case MD_DISK_ROLE_JOURNAL: /* journal device */
1779 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1780 /* journal device without journal feature */
1781 pr_warn("md: journal device provided without journal feature, ignoring the device\n");
1782 return -EINVAL;
1783 }
1784 set_bit(Journal, &rdev->flags);
1785 rdev->journal_tail = le64_to_cpu(sb->journal_tail);
1786 rdev->raid_disk = 0;
1787 break;
1788 default:
1789 rdev->saved_raid_disk = role;
1790 if ((le32_to_cpu(sb->feature_map) &
1791 MD_FEATURE_RECOVERY_OFFSET)) {
1792 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1793 if (!(le32_to_cpu(sb->feature_map) &
1794 MD_FEATURE_RECOVERY_BITMAP))
1795 rdev->saved_raid_disk = -1;
1796 } else {
1797 /*
1798 * If the array is FROZEN, then the device can't
1799 * be in_sync with the rest of the array.
1800 */
1801 if (!test_bit(MD_RECOVERY_FROZEN,
1802 &mddev->recovery))
1803 set_bit(In_sync, &rdev->flags);
1804 }
1805 rdev->raid_disk = role;
1806 break;
1807 }
1808 if (sb->devflags & WriteMostly1)
1809 set_bit(WriteMostly, &rdev->flags);
1810 if (sb->devflags & FailFast1)
1811 set_bit(FailFast, &rdev->flags);
1812 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1813 set_bit(Replacement, &rdev->flags);
1814 } else /* MULTIPATH are always insync */
1815 set_bit(In_sync, &rdev->flags);
1816
1817 return 0;
1818 }
1819
1820 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1821 {
1822 struct mdp_superblock_1 *sb;
1823 struct md_rdev *rdev2;
1824 int max_dev, i;
1825 /* make rdev->sb match mddev and rdev data. */
1826
1827 sb = page_address(rdev->sb_page);
1828
1829 sb->feature_map = 0;
1830 sb->pad0 = 0;
1831 sb->recovery_offset = cpu_to_le64(0);
1832 memset(sb->pad3, 0, sizeof(sb->pad3));
1833
1834 sb->utime = cpu_to_le64((__u64)mddev->utime);
1835 sb->events = cpu_to_le64(mddev->events);
1836 if (mddev->in_sync)
1837 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1838 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
1839 sb->resync_offset = cpu_to_le64(MaxSector);
1840 else
1841 sb->resync_offset = cpu_to_le64(0);
1842
1843 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1844
1845 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1846 sb->size = cpu_to_le64(mddev->dev_sectors);
1847 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1848 sb->level = cpu_to_le32(mddev->level);
1849 sb->layout = cpu_to_le32(mddev->layout);
1850 if (test_bit(FailFast, &rdev->flags))
1851 sb->devflags |= FailFast1;
1852 else
1853 sb->devflags &= ~FailFast1;
1854
1855 if (test_bit(WriteMostly, &rdev->flags))
1856 sb->devflags |= WriteMostly1;
1857 else
1858 sb->devflags &= ~WriteMostly1;
1859 sb->data_offset = cpu_to_le64(rdev->data_offset);
1860 sb->data_size = cpu_to_le64(rdev->sectors);
1861
1862 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1863 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1864 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1865 }
1866
1867 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
1868 !test_bit(In_sync, &rdev->flags)) {
1869 sb->feature_map |=
1870 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1871 sb->recovery_offset =
1872 cpu_to_le64(rdev->recovery_offset);
1873 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1874 sb->feature_map |=
1875 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1876 }
1877 /* Note: recovery_offset and journal_tail share space */
1878 if (test_bit(Journal, &rdev->flags))
1879 sb->journal_tail = cpu_to_le64(rdev->journal_tail);
1880 if (test_bit(Replacement, &rdev->flags))
1881 sb->feature_map |=
1882 cpu_to_le32(MD_FEATURE_REPLACEMENT);
1883
1884 if (mddev->reshape_position != MaxSector) {
1885 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1886 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1887 sb->new_layout = cpu_to_le32(mddev->new_layout);
1888 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1889 sb->new_level = cpu_to_le32(mddev->new_level);
1890 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1891 if (mddev->delta_disks == 0 &&
1892 mddev->reshape_backwards)
1893 sb->feature_map
1894 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1895 if (rdev->new_data_offset != rdev->data_offset) {
1896 sb->feature_map
1897 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1898 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1899 - rdev->data_offset));
1900 }
1901 }
1902
1903 if (mddev_is_clustered(mddev))
1904 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
1905
1906 if (rdev->badblocks.count == 0)
1907 /* Nothing to do for bad blocks */ ;
1908 else if (sb->bblog_offset == 0)
1909 /* Cannot record bad blocks on this device */
1910 md_error(mddev, rdev);
1911 else {
1912 struct badblocks *bb = &rdev->badblocks;
1913 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1914 u64 *p = bb->page;
1915 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1916 if (bb->changed) {
1917 unsigned seq;
1918
1919 retry:
1920 seq = read_seqbegin(&bb->lock);
1921
1922 memset(bbp, 0xff, PAGE_SIZE);
1923
1924 for (i = 0 ; i < bb->count ; i++) {
1925 u64 internal_bb = p[i];
1926 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1927 | BB_LEN(internal_bb));
1928 bbp[i] = cpu_to_le64(store_bb);
1929 }
1930 bb->changed = 0;
1931 if (read_seqretry(&bb->lock, seq))
1932 goto retry;
1933
1934 bb->sector = (rdev->sb_start +
1935 (int)le32_to_cpu(sb->bblog_offset));
1936 bb->size = le16_to_cpu(sb->bblog_size);
1937 }
1938 }
1939
1940 max_dev = 0;
1941 rdev_for_each(rdev2, mddev)
1942 if (rdev2->desc_nr+1 > max_dev)
1943 max_dev = rdev2->desc_nr+1;
1944
1945 if (max_dev > le32_to_cpu(sb->max_dev)) {
1946 int bmask;
1947 sb->max_dev = cpu_to_le32(max_dev);
1948 rdev->sb_size = max_dev * 2 + 256;
1949 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1950 if (rdev->sb_size & bmask)
1951 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1952 } else
1953 max_dev = le32_to_cpu(sb->max_dev);
1954
1955 for (i=0; i<max_dev;i++)
1956 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1957
1958 if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
1959 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
1960
1961 if (test_bit(MD_HAS_PPL, &mddev->flags)) {
1962 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
1963 sb->feature_map |=
1964 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
1965 else
1966 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
1967 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
1968 sb->ppl.size = cpu_to_le16(rdev->ppl.size);
1969 }
1970
1971 rdev_for_each(rdev2, mddev) {
1972 i = rdev2->desc_nr;
1973 if (test_bit(Faulty, &rdev2->flags))
1974 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
1975 else if (test_bit(In_sync, &rdev2->flags))
1976 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1977 else if (test_bit(Journal, &rdev2->flags))
1978 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
1979 else if (rdev2->raid_disk >= 0)
1980 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1981 else
1982 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1983 }
1984
1985 sb->sb_csum = calc_sb_1_csum(sb);
1986 }
1987
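/*
 * Work out the largest usable size (in sectors) for this device, or return 0
 * if the size cannot be changed (the component would no longer fit the array,
 * or the data offset is being moved).  For 1.0 metadata the superblock is
 * relocated to the new end of the device; the updated superblock is written
 * out synchronously before returning.
 */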
1988 static unsigned long long
1989 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1990 {
1991 struct mdp_superblock_1 *sb;
1992 sector_t max_sectors;
1993 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1994 return 0; /* component must fit device */
1995 if (rdev->data_offset != rdev->new_data_offset)
1996 return 0; /* too confusing */
1997 if (rdev->sb_start < rdev->data_offset) {
1998 /* minor versions 1 and 2; superblock before data */
1999 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
2000 max_sectors -= rdev->data_offset;
2001 if (!num_sectors || num_sectors > max_sectors)
2002 num_sectors = max_sectors;
2003 } else if (rdev->mddev->bitmap_info.offset) {
2004 /* minor version 0 with bitmap we can't move */
2005 return 0;
2006 } else {
2007 /* minor version 0; superblock after data */
2008 sector_t sb_start;
2009 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
2010 sb_start &= ~(sector_t)(4*2 - 1);
2011 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
2012 if (!num_sectors || num_sectors > max_sectors)
2013 num_sectors = max_sectors;
2014 rdev->sb_start = sb_start;
2015 }
2016 sb = page_address(rdev->sb_page);
2017 sb->data_size = cpu_to_le64(num_sectors);
2018 sb->super_offset = cpu_to_le64(rdev->sb_start);
2019 sb->sb_csum = calc_sb_1_csum(sb);
2020 do {
2021 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
2022 rdev->sb_page);
2023 } while (md_super_wait(rdev->mddev) < 0);
2024 return num_sectors;
2025
2026 }
2027
2028 static int
2029 super_1_allow_new_offset(struct md_rdev *rdev,
2030 unsigned long long new_offset)
2031 {
2032 /* All necessary checks on new >= old have been done */
2033 struct bitmap *bitmap;
2034 if (new_offset >= rdev->data_offset)
2035 return 1;
2036
2037 /* with 1.0 metadata, there is no metadata to tread on
2038 * so we can always move back */
2039 if (rdev->mddev->minor_version == 0)
2040 return 1;
2041
2042 /* otherwise we must be sure not to step on
2043 * any metadata, so stay:
2044 * 36K beyond start of superblock
2045 * beyond end of badblocks
2046 * beyond write-intent bitmap
2047 */
2048 if (rdev->sb_start + (32+4)*2 > new_offset)
2049 return 0;
2050 bitmap = rdev->mddev->bitmap;
2051 if (bitmap && !rdev->mddev->bitmap_info.file &&
2052 rdev->sb_start + rdev->mddev->bitmap_info.offset +
2053 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
2054 return 0;
2055 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2056 return 0;
2057
2058 return 1;
2059 }
2060
2061 static struct super_type super_types[] = {
2062 [0] = {
2063 .name = "0.90.0",
2064 .owner = THIS_MODULE,
2065 .load_super = super_90_load,
2066 .validate_super = super_90_validate,
2067 .sync_super = super_90_sync,
2068 .rdev_size_change = super_90_rdev_size_change,
2069 .allow_new_offset = super_90_allow_new_offset,
2070 },
2071 [1] = {
2072 .name = "md-1",
2073 .owner = THIS_MODULE,
2074 .load_super = super_1_load,
2075 .validate_super = super_1_validate,
2076 .sync_super = super_1_sync,
2077 .rdev_size_change = super_1_rdev_size_change,
2078 .allow_new_offset = super_1_allow_new_offset,
2079 },
2080 };
2081
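/*
 * Update one rdev's superblock image: use the array's own ->sync_super
 * callback if one is set, otherwise dispatch on the metadata major version.
 */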
2082 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
2083 {
2084 if (mddev->sync_super) {
2085 mddev->sync_super(mddev, rdev);
2086 return;
2087 }
2088
2089 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2090
2091 super_types[mddev->major_version].sync_super(mddev, rdev);
2092 }
2093
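/*
 * Return 1 if any active member of mddev1 lives on the same underlying
 * whole device (bd_contains) as any active member of mddev2, 0 otherwise.
 */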
2094 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
2095 {
2096 struct md_rdev *rdev, *rdev2;
2097
2098 rcu_read_lock();
2099 rdev_for_each_rcu(rdev, mddev1) {
2100 if (test_bit(Faulty, &rdev->flags) ||
2101 test_bit(Journal, &rdev->flags) ||
2102 rdev->raid_disk == -1)
2103 continue;
2104 rdev_for_each_rcu(rdev2, mddev2) {
2105 if (test_bit(Faulty, &rdev2->flags) ||
2106 test_bit(Journal, &rdev2->flags) ||
2107 rdev2->raid_disk == -1)
2108 continue;
2109 if (rdev->bdev->bd_contains ==
2110 rdev2->bdev->bd_contains) {
2111 rcu_read_unlock();
2112 return 1;
2113 }
2114 }
2115 }
2116 rcu_read_unlock();
2117 return 0;
2118 }
2119
2120 static LIST_HEAD(pending_raid_disks);
2121
2122 /*
2123 * Try to register data integrity profile for an mddev
2124 *
2125 * This is called when an array is started and after a disk has been kicked
2126 * from the array. It only succeeds if all working and active component devices
2127 * are integrity capable with matching profiles.
2128 */
2129 int md_integrity_register(struct mddev *mddev)
2130 {
2131 struct md_rdev *rdev, *reference = NULL;
2132
2133 if (list_empty(&mddev->disks))
2134 return 0; /* nothing to do */
2135 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2136 return 0; /* shouldn't register, or already is */
2137 rdev_for_each(rdev, mddev) {
2138 /* skip spares and non-functional disks */
2139 if (test_bit(Faulty, &rdev->flags))
2140 continue;
2141 if (rdev->raid_disk < 0)
2142 continue;
2143 if (!reference) {
2144 /* Use the first rdev as the reference */
2145 reference = rdev;
2146 continue;
2147 }
2148 /* does this rdev's profile match the reference profile? */
2149 if (blk_integrity_compare(reference->bdev->bd_disk,
2150 rdev->bdev->bd_disk) < 0)
2151 return -EINVAL;
2152 }
2153 if (!reference || !bdev_get_integrity(reference->bdev))
2154 return 0;
2155 /*
2156 * All component devices are integrity capable and have matching
2157 * profiles, register the common profile for the md device.
2158 */
2159 blk_integrity_register(mddev->gendisk,
2160 bdev_get_integrity(reference->bdev));
2161
2162 pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
2163 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
2164 pr_err("md: failed to create integrity pool for %s\n",
2165 mdname(mddev));
2166 return -EINVAL;
2167 }
2168 return 0;
2169 }
2170 EXPORT_SYMBOL(md_integrity_register);
2171
2172 /*
2173 * Attempt to add an rdev, but only if it is consistent with the current
2174 * integrity profile
2175 */
2176 int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2177 {
2178 struct blk_integrity *bi_rdev;
2179 struct blk_integrity *bi_mddev;
2180 char name[BDEVNAME_SIZE];
2181
2182 if (!mddev->gendisk)
2183 return 0;
2184
2185 bi_rdev = bdev_get_integrity(rdev->bdev);
2186 bi_mddev = blk_get_integrity(mddev->gendisk);
2187
2188 if (!bi_mddev) /* nothing to do */
2189 return 0;
2190
2191 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
2192 pr_err("%s: incompatible integrity profile for %s\n",
2193 mdname(mddev), bdevname(rdev->bdev, name));
2194 return -ENXIO;
2195 }
2196
2197 return 0;
2198 }
2199 EXPORT_SYMBOL(md_integrity_add_rdev);
2200
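/*
 * Attach an rdev to an array: reject duplicates and undersized devices,
 * pick a unique desc_nr, register the rdev kobject and its "block" symlink
 * in sysfs, and link the device into mddev->disks.
 */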
2201 static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
2202 {
2203 char b[BDEVNAME_SIZE];
2204 struct kobject *ko;
2205 int err;
2206
2207 /* prevent duplicates */
2208 if (find_rdev(mddev, rdev->bdev->bd_dev))
2209 return -EEXIST;
2210
2211 if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
2212 mddev->pers)
2213 return -EROFS;
2214
2215 /* make sure rdev->sectors exceeds mddev->dev_sectors */
2216 if (!test_bit(Journal, &rdev->flags) &&
2217 rdev->sectors &&
2218 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
2219 if (mddev->pers) {
2220 /* Cannot change size, so fail
2221 * If mddev->level <= 0, then we don't care
2222 * about aligning sizes (e.g. linear)
2223 */
2224 if (mddev->level > 0)
2225 return -ENOSPC;
2226 } else
2227 mddev->dev_sectors = rdev->sectors;
2228 }
2229
2230 /* Verify rdev->desc_nr is unique.
2231 * If it is -1, assign a free number, else
2232 * check number is not in use
2233 */
2234 rcu_read_lock();
2235 if (rdev->desc_nr < 0) {
2236 int choice = 0;
2237 if (mddev->pers)
2238 choice = mddev->raid_disks;
2239 while (md_find_rdev_nr_rcu(mddev, choice))
2240 choice++;
2241 rdev->desc_nr = choice;
2242 } else {
2243 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
2244 rcu_read_unlock();
2245 return -EBUSY;
2246 }
2247 }
2248 rcu_read_unlock();
2249 if (!test_bit(Journal, &rdev->flags) &&
2250 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2251 pr_warn("md: %s: array is limited to %d devices\n",
2252 mdname(mddev), mddev->max_disks);
2253 return -EBUSY;
2254 }
2255 bdevname(rdev->bdev,b);
2256 strreplace(b, '/', '!');
2257
2258 rdev->mddev = mddev;
2259 pr_debug("md: bind<%s>\n", b);
2260
2261 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2262 goto fail;
2263
2264 ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2265 if (sysfs_create_link(&rdev->kobj, ko, "block"))
2266 /* failure here is OK */;
2267 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2268
2269 list_add_rcu(&rdev->same_set, &mddev->disks);
2270 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2271
2272 /* May as well allow recovery to be retried once */
2273 mddev->recovery_disabled++;
2274
2275 return 0;
2276
2277 fail:
2278 pr_warn("md: failed to register dev-%s for %s\n",
2279 b, mdname(mddev));
2280 return err;
2281 }
2282
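/*
 * Final teardown of an rdev's kobject, run from md_misc_wq.  See the comment
 * in unbind_rdev_from_array() for why this must be deferred.
 */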
2283 static void md_delayed_delete(struct work_struct *ws)
2284 {
2285 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2286 kobject_del(&rdev->kobj);
2287 kobject_put(&rdev->kobj);
2288 }
2289
2290 static void unbind_rdev_from_array(struct md_rdev *rdev)
2291 {
2292 char b[BDEVNAME_SIZE];
2293
2294 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2295 list_del_rcu(&rdev->same_set);
2296 pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
2297 rdev->mddev = NULL;
2298 sysfs_remove_link(&rdev->kobj, "block");
2299 sysfs_put(rdev->sysfs_state);
2300 rdev->sysfs_state = NULL;
2301 rdev->badblocks.count = 0;
2302 /* We need to delay this, otherwise we can deadlock when
2303 * writing 'remove' to "dev/state". We also need
2304 * to delay it due to rcu usage.
2305 */
2306 synchronize_rcu();
2307 INIT_WORK(&rdev->del_work, md_delayed_delete);
2308 kobject_get(&rdev->kobj);
2309 queue_work(md_misc_wq, &rdev->del_work);
2310 }
2311
2312 /*
2313 * prevent the device from being mounted, repartitioned or
2314 * otherwise reused by a RAID array (or any other kernel
2315 * subsystem), by bd_claiming the device.
2316 */
2317 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2318 {
2319 int err = 0;
2320 struct block_device *bdev;
2321 char b[BDEVNAME_SIZE];
2322
2323 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2324 shared ? (struct md_rdev *)lock_rdev : rdev);
2325 if (IS_ERR(bdev)) {
2326 pr_warn("md: could not open %s.\n", __bdevname(dev, b));
2327 return PTR_ERR(bdev);
2328 }
2329 rdev->bdev = bdev;
2330 return err;
2331 }
2332
2333 static void unlock_rdev(struct md_rdev *rdev)
2334 {
2335 struct block_device *bdev = rdev->bdev;
2336 rdev->bdev = NULL;
2337 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2338 }
2339
2340 void md_autodetect_dev(dev_t dev);
2341
2342 static void export_rdev(struct md_rdev *rdev)
2343 {
2344 char b[BDEVNAME_SIZE];
2345
2346 pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
2347 md_rdev_clear(rdev);
2348 #ifndef MODULE
2349 if (test_bit(AutoDetected, &rdev->flags))
2350 md_autodetect_dev(rdev->bdev->bd_dev);
2351 #endif
2352 unlock_rdev(rdev);
2353 kobject_put(&rdev->kobj);
2354 }
2355
2356 void md_kick_rdev_from_array(struct md_rdev *rdev)
2357 {
2358 unbind_rdev_from_array(rdev);
2359 export_rdev(rdev);
2360 }
2361 EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
2362
2363 static void export_array(struct mddev *mddev)
2364 {
2365 struct md_rdev *rdev;
2366
2367 while (!list_empty(&mddev->disks)) {
2368 rdev = list_first_entry(&mddev->disks, struct md_rdev,
2369 same_set);
2370 md_kick_rdev_from_array(rdev);
2371 }
2372 mddev->raid_disks = 0;
2373 mddev->major_version = 0;
2374 }
2375
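/*
 * Try to mark the array clean.  Called with mddev->lock held; the lock is
 * dropped while waiting for in-flight writes to drain.  Returns the
 * resulting ->in_sync state.
 */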
2376 static bool set_in_sync(struct mddev *mddev)
2377 {
2378 lockdep_assert_held(&mddev->lock);
2379 if (!mddev->in_sync) {
2380 mddev->sync_checkers++;
2381 spin_unlock(&mddev->lock);
2382 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
2383 spin_lock(&mddev->lock);
2384 if (!mddev->in_sync &&
2385 percpu_ref_is_zero(&mddev->writes_pending)) {
2386 mddev->in_sync = 1;
2387 /*
2388 * Ensure ->in_sync is visible before we clear
2389 * ->sync_checkers.
2390 */
2391 smp_mb();
2392 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2393 sysfs_notify_dirent_safe(mddev->sysfs_state);
2394 }
2395 if (--mddev->sync_checkers == 0)
2396 percpu_ref_switch_to_percpu(&mddev->writes_pending);
2397 }
2398 if (mddev->safemode == 1)
2399 mddev->safemode = 0;
2400 return mddev->in_sync;
2401 }
2402
2403 static void sync_sbs(struct mddev *mddev, int nospares)
2404 {
2405 /* Update each superblock (in-memory image), but
2406 * if we are allowed to, skip spares which already
2407 * have the right event counter, or have one earlier
2408 * (which would mean they aren't being marked as dirty
2409 * with the rest of the array)
2410 */
2411 struct md_rdev *rdev;
2412 rdev_for_each(rdev, mddev) {
2413 if (rdev->sb_events == mddev->events ||
2414 (nospares &&
2415 rdev->raid_disk < 0 &&
2416 rdev->sb_events+1 == mddev->events)) {
2417 /* Don't update this superblock */
2418 rdev->sb_loaded = 2;
2419 } else {
2420 sync_super(mddev, rdev);
2421 rdev->sb_loaded = 1;
2422 }
2423 }
2424 }
2425
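/*
 * For clustered arrays: decide whether this node really needs to write the
 * superblock by comparing the on-disk device roles and array geometry with
 * the in-memory state.
 */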
2426 static bool does_sb_need_changing(struct mddev *mddev)
2427 {
2428 struct md_rdev *rdev;
2429 struct mdp_superblock_1 *sb;
2430 int role;
2431
2432 /* Find a good rdev */
2433 rdev_for_each(rdev, mddev)
2434 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
2435 break;
2436
2437 /* No good device found. */
2438 if (!rdev)
2439 return false;
2440
2441 sb = page_address(rdev->sb_page);
2442 /* Check if a device has become faulty or a spare become active */
2443 rdev_for_each(rdev, mddev) {
2444 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2445 /* Device activated? */
2446 if (role == 0xffff && rdev->raid_disk >=0 &&
2447 !test_bit(Faulty, &rdev->flags))
2448 return true;
2449 /* Device turned faulty? */
2450 if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
2451 return true;
2452 }
2453
2454 /* Check if any mddev parameters have changed */
2455 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2456 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
2457 (mddev->layout != le32_to_cpu(sb->layout)) ||
2458 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2459 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2460 return true;
2461
2462 return false;
2463 }
2464
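/*
 * Write the updated superblocks (and bitmap superblock) to every non-faulty
 * member, bumping the event count (or rolling it back for a pure
 * clean<->dirty transition) and retrying until all writes succeed.
 */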
2465 void md_update_sb(struct mddev *mddev, int force_change)
2466 {
2467 struct md_rdev *rdev;
2468 int sync_req;
2469 int nospares = 0;
2470 int any_badblocks_changed = 0;
2471 int ret = -1;
2472
2473 if (mddev->ro) {
2474 if (force_change)
2475 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2476 return;
2477 }
2478
2479 repeat:
2480 if (mddev_is_clustered(mddev)) {
2481 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2482 force_change = 1;
2483 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2484 nospares = 1;
2485 ret = md_cluster_ops->metadata_update_start(mddev);
2486 /* Has someone else already updated the sb? */
2487 if (!does_sb_need_changing(mddev)) {
2488 if (ret == 0)
2489 md_cluster_ops->metadata_update_cancel(mddev);
2490 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2491 BIT(MD_SB_CHANGE_DEVS) |
2492 BIT(MD_SB_CHANGE_CLEAN));
2493 return;
2494 }
2495 }
2496
2497 /*
2498 * First, make sure individual recovery_offsets are correct.
2499 * curr_resync_completed can only be used during recovery;
2500 * during reshape/resync it might use array addresses rather
2501 * than device addresses.
2502 */
2503 rdev_for_each(rdev, mddev) {
2504 if (rdev->raid_disk >= 0 &&
2505 mddev->delta_disks >= 0 &&
2506 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2507 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2508 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
2509 !test_bit(Journal, &rdev->flags) &&
2510 !test_bit(In_sync, &rdev->flags) &&
2511 mddev->curr_resync_completed > rdev->recovery_offset)
2512 rdev->recovery_offset = mddev->curr_resync_completed;
2513
2514 }
2515 if (!mddev->persistent) {
2516 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2517 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2518 if (!mddev->external) {
2519 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
2520 rdev_for_each(rdev, mddev) {
2521 if (rdev->badblocks.changed) {
2522 rdev->badblocks.changed = 0;
2523 ack_all_badblocks(&rdev->badblocks);
2524 md_error(mddev, rdev);
2525 }
2526 clear_bit(Blocked, &rdev->flags);
2527 clear_bit(BlockedBadBlocks, &rdev->flags);
2528 wake_up(&rdev->blocked_wait);
2529 }
2530 }
2531 wake_up(&mddev->sb_wait);
2532 return;
2533 }
2534
2535 spin_lock(&mddev->lock);
2536
2537 mddev->utime = ktime_get_real_seconds();
2538
2539 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2540 force_change = 1;
2541 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
2542 /* just a clean <-> dirty transition, possibly leave spares alone,
2543 * though if the event count doesn't have the right even/odd parity,
2544 * we will have to update the spares after all
2545 */
2546 nospares = 1;
2547 if (force_change)
2548 nospares = 0;
2549 if (mddev->degraded)
2550 /* If the array is degraded, then skipping spares is both
2551 * dangerous and fairly pointless.
2552 * Dangerous because a device that was removed from the array
2553 * might have an event count that still looks up-to-date,
2554 * so it can be re-added without a resync.
2555 * Pointless because if there are any spares to skip,
2556 * then a recovery will happen and soon that array won't
2557 * be degraded any more and the spare can go back to sleep then.
2558 */
2559 nospares = 0;
2560
2561 sync_req = mddev->in_sync;
2562
2563 /* If this is just a dirty<->clean transition, and the array is clean
2564 * and 'events' is odd, we can roll back to the previous clean state */
2565 if (nospares
2566 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2567 && mddev->can_decrease_events
2568 && mddev->events != 1) {
2569 mddev->events--;
2570 mddev->can_decrease_events = 0;
2571 } else {
2572 /* otherwise we have to go forward and ... */
2573 mddev->events ++;
2574 mddev->can_decrease_events = nospares;
2575 }
2576
2577 /*
2578 * This 64-bit counter should never wrap.
2579 * Either we are in around ~1 trillion A.C., assuming
2580 * 1 reboot per second, or we have a bug...
2581 */
2582 WARN_ON(mddev->events == 0);
2583
2584 rdev_for_each(rdev, mddev) {
2585 if (rdev->badblocks.changed)
2586 any_badblocks_changed++;
2587 if (test_bit(Faulty, &rdev->flags))
2588 set_bit(FaultRecorded, &rdev->flags);
2589 }
2590
2591 sync_sbs(mddev, nospares);
2592 spin_unlock(&mddev->lock);
2593
2594 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2595 mdname(mddev), mddev->in_sync);
2596
2597 if (mddev->queue)
2598 blk_add_trace_msg(mddev->queue, "md md_update_sb");
2599 rewrite:
2600 bitmap_update_sb(mddev->bitmap);
2601 rdev_for_each(rdev, mddev) {
2602 char b[BDEVNAME_SIZE];
2603
2604 if (rdev->sb_loaded != 1)
2605 continue; /* no noise on spare devices */
2606
2607 if (!test_bit(Faulty, &rdev->flags)) {
2608 md_super_write(mddev,rdev,
2609 rdev->sb_start, rdev->sb_size,
2610 rdev->sb_page);
2611 pr_debug("md: (write) %s's sb offset: %llu\n",
2612 bdevname(rdev->bdev, b),
2613 (unsigned long long)rdev->sb_start);
2614 rdev->sb_events = mddev->events;
2615 if (rdev->badblocks.size) {
2616 md_super_write(mddev, rdev,
2617 rdev->badblocks.sector,
2618 rdev->badblocks.size << 9,
2619 rdev->bb_page);
2620 rdev->badblocks.size = 0;
2621 }
2622
2623 } else
2624 pr_debug("md: %s (skipping faulty)\n",
2625 bdevname(rdev->bdev, b));
2626
2627 if (mddev->level == LEVEL_MULTIPATH)
2628 /* only need to write one superblock... */
2629 break;
2630 }
2631 if (md_super_wait(mddev) < 0)
2632 goto rewrite;
2633 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
2634
2635 if (mddev_is_clustered(mddev) && ret == 0)
2636 md_cluster_ops->metadata_update_finish(mddev);
2637
2638 if (mddev->in_sync != sync_req ||
2639 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2640 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
2641 /* have to write it out again */
2642 goto repeat;
2643 wake_up(&mddev->sb_wait);
2644 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2645 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2646
2647 rdev_for_each(rdev, mddev) {
2648 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2649 clear_bit(Blocked, &rdev->flags);
2650
2651 if (any_badblocks_changed)
2652 ack_all_badblocks(&rdev->badblocks);
2653 clear_bit(BlockedBadBlocks, &rdev->flags);
2654 wake_up(&rdev->blocked_wait);
2655 }
2656 }
2657 EXPORT_SYMBOL(md_update_sb);
2658
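/*
 * Finish adding an rdev that has already been bound to a running array:
 * validate it against the current metadata, hand it to the personality via
 * ->hot_add_disk where needed, then mark the superblock dirty and trigger
 * recovery.
 */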
2659 static int add_bound_rdev(struct md_rdev *rdev)
2660 {
2661 struct mddev *mddev = rdev->mddev;
2662 int err = 0;
2663 bool add_journal = test_bit(Journal, &rdev->flags);
2664
2665 if (!mddev->pers->hot_remove_disk || add_journal) {
2666 /* If there is hot_add_disk but no hot_remove_disk
2667 * then newly added disks are for geometry changes
2668 * and should be added immediately.
2669 */
2670 super_types[mddev->major_version].
2671 validate_super(mddev, rdev);
2672 if (add_journal)
2673 mddev_suspend(mddev);
2674 err = mddev->pers->hot_add_disk(mddev, rdev);
2675 if (add_journal)
2676 mddev_resume(mddev);
2677 if (err) {
2678 md_kick_rdev_from_array(rdev);
2679 return err;
2680 }
2681 }
2682 sysfs_notify_dirent_safe(rdev->sysfs_state);
2683
2684 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2685 if (mddev->degraded)
2686 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2687 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2688 md_new_event(mddev);
2689 md_wakeup_thread(mddev->thread);
2690 return 0;
2691 }
2692
2693 /* words written to sysfs files may, or may not, be \n terminated.
2694 * We want to accept either case. For this we use cmd_match.
2695 */
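/* e.g. both "faulty" and "faulty\n" match "faulty"; "faulty2" does not. */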
2696 static int cmd_match(const char *cmd, const char *str)
2697 {
2698 /* See if cmd, written into a sysfs file, matches
2699 * str. They must either be the same, or cmd can
2700 * have a trailing newline
2701 */
2702 while (*cmd && *str && *cmd == *str) {
2703 cmd++;
2704 str++;
2705 }
2706 if (*cmd == '\n')
2707 cmd++;
2708 if (*str || *cmd)
2709 return 0;
2710 return 1;
2711 }
2712
2713 struct rdev_sysfs_entry {
2714 struct attribute attr;
2715 ssize_t (*show)(struct md_rdev *, char *);
2716 ssize_t (*store)(struct md_rdev *, const char *, size_t);
2717 };
2718
2719 static ssize_t
2720 state_show(struct md_rdev *rdev, char *page)
2721 {
2722 char *sep = ",";
2723 size_t len = 0;
2724 unsigned long flags = READ_ONCE(rdev->flags);
2725
2726 if (test_bit(Faulty, &flags) ||
2727 (!test_bit(ExternalBbl, &flags) &&
2728 rdev->badblocks.unacked_exist))
2729 len += sprintf(page+len, "faulty%s", sep);
2730 if (test_bit(In_sync, &flags))
2731 len += sprintf(page+len, "in_sync%s", sep);
2732 if (test_bit(Journal, &flags))
2733 len += sprintf(page+len, "journal%s", sep);
2734 if (test_bit(WriteMostly, &flags))
2735 len += sprintf(page+len, "write_mostly%s", sep);
2736 if (test_bit(Blocked, &flags) ||
2737 (rdev->badblocks.unacked_exist
2738 && !test_bit(Faulty, &flags)))
2739 len += sprintf(page+len, "blocked%s", sep);
2740 if (!test_bit(Faulty, &flags) &&
2741 !test_bit(Journal, &flags) &&
2742 !test_bit(In_sync, &flags))
2743 len += sprintf(page+len, "spare%s", sep);
2744 if (test_bit(WriteErrorSeen, &flags))
2745 len += sprintf(page+len, "write_error%s", sep);
2746 if (test_bit(WantReplacement, &flags))
2747 len += sprintf(page+len, "want_replacement%s", sep);
2748 if (test_bit(Replacement, &flags))
2749 len += sprintf(page+len, "replacement%s", sep);
2750 if (test_bit(ExternalBbl, &flags))
2751 len += sprintf(page+len, "external_bbl%s", sep);
2752 if (test_bit(FailFast, &flags))
2753 len += sprintf(page+len, "failfast%s", sep);
2754
2755 if (len)
2756 len -= strlen(sep);
2757
2758 return len+sprintf(page+len, "\n");
2759 }
2760
2761 static ssize_t
2762 state_store(struct md_rdev *rdev, const char *buf, size_t len)
2763 {
2764 /* can write
2765 * faulty - simulates an error
2766 * remove - disconnects the device
2767 * writemostly - sets write_mostly
2768 * -writemostly - clears write_mostly
2769 * blocked - sets the Blocked flag
2770 * -blocked - clears Blocked and possibly simulates an error
2771 * insync - sets In_sync provided the device isn't active
2772 * -insync - clear Insync for a device with a slot assigned,
2773 * so that it gets rebuilt based on bitmap
2774 * write_error - sets WriteErrorSeen
2775 * -write_error - clears WriteErrorSeen
2776 * {,-}failfast - set/clear FailFast
2777 */
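/* For example, writing "want_replacement" to an rdev's state attribute
 * (typically /sys/block/mdX/md/dev-<name>/state) asks md to replace the
 * device with a spare, and "-blocked" releases writers waiting on it.
 */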
2778 int err = -EINVAL;
2779 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2780 md_error(rdev->mddev, rdev);
2781 if (test_bit(Faulty, &rdev->flags))
2782 err = 0;
2783 else
2784 err = -EBUSY;
2785 } else if (cmd_match(buf, "remove")) {
2786 if (rdev->mddev->pers) {
2787 clear_bit(Blocked, &rdev->flags);
2788 remove_and_add_spares(rdev->mddev, rdev);
2789 }
2790 if (rdev->raid_disk >= 0)
2791 err = -EBUSY;
2792 else {
2793 struct mddev *mddev = rdev->mddev;
2794 err = 0;
2795 if (mddev_is_clustered(mddev))
2796 err = md_cluster_ops->remove_disk(mddev, rdev);
2797
2798 if (err == 0) {
2799 md_kick_rdev_from_array(rdev);
2800 if (mddev->pers) {
2801 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
2802 md_wakeup_thread(mddev->thread);
2803 }
2804 md_new_event(mddev);
2805 }
2806 }
2807 } else if (cmd_match(buf, "writemostly")) {
2808 set_bit(WriteMostly, &rdev->flags);
2809 err = 0;
2810 } else if (cmd_match(buf, "-writemostly")) {
2811 clear_bit(WriteMostly, &rdev->flags);
2812 err = 0;
2813 } else if (cmd_match(buf, "blocked")) {
2814 set_bit(Blocked, &rdev->flags);
2815 err = 0;
2816 } else if (cmd_match(buf, "-blocked")) {
2817 if (!test_bit(Faulty, &rdev->flags) &&
2818 !test_bit(ExternalBbl, &rdev->flags) &&
2819 rdev->badblocks.unacked_exist) {
2820 /* metadata handler doesn't understand badblocks,
2821 * so we need to fail the device
2822 */
2823 md_error(rdev->mddev, rdev);
2824 }
2825 clear_bit(Blocked, &rdev->flags);
2826 clear_bit(BlockedBadBlocks, &rdev->flags);
2827 wake_up(&rdev->blocked_wait);
2828 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2829 md_wakeup_thread(rdev->mddev->thread);
2830
2831 err = 0;
2832 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2833 set_bit(In_sync, &rdev->flags);
2834 err = 0;
2835 } else if (cmd_match(buf, "failfast")) {
2836 set_bit(FailFast, &rdev->flags);
2837 err = 0;
2838 } else if (cmd_match(buf, "-failfast")) {
2839 clear_bit(FailFast, &rdev->flags);
2840 err = 0;
2841 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
2842 !test_bit(Journal, &rdev->flags)) {
2843 if (rdev->mddev->pers == NULL) {
2844 clear_bit(In_sync, &rdev->flags);
2845 rdev->saved_raid_disk = rdev->raid_disk;
2846 rdev->raid_disk = -1;
2847 err = 0;
2848 }
2849 } else if (cmd_match(buf, "write_error")) {
2850 set_bit(WriteErrorSeen, &rdev->flags);
2851 err = 0;
2852 } else if (cmd_match(buf, "-write_error")) {
2853 clear_bit(WriteErrorSeen, &rdev->flags);
2854 err = 0;
2855 } else if (cmd_match(buf, "want_replacement")) {
2856 /* Any non-spare device that is not a replacement can
2857 * become want_replacement at any time, but we then need to
2858 * check if recovery is needed.
2859 */
2860 if (rdev->raid_disk >= 0 &&
2861 !test_bit(Journal, &rdev->flags) &&
2862 !test_bit(Replacement, &rdev->flags))
2863 set_bit(WantReplacement, &rdev->flags);
2864 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2865 md_wakeup_thread(rdev->mddev->thread);
2866 err = 0;
2867 } else if (cmd_match(buf, "-want_replacement")) {
2868 /* Clearing 'want_replacement' is always allowed.
2869 * Once replacement starts it is too late, though.
2870 */
2871 err = 0;
2872 clear_bit(WantReplacement, &rdev->flags);
2873 } else if (cmd_match(buf, "replacement")) {
2874 /* Can only set a device as a replacement when array has not
2875 * yet been started. Once running, replacement is automatic
2876 * from spares, or by assigning 'slot'.
2877 */
2878 if (rdev->mddev->pers)
2879 err = -EBUSY;
2880 else {
2881 set_bit(Replacement, &rdev->flags);
2882 err = 0;
2883 }
2884 } else if (cmd_match(buf, "-replacement")) {
2885 /* Similarly, can only clear Replacement before start */
2886 if (rdev->mddev->pers)
2887 err = -EBUSY;
2888 else {
2889 clear_bit(Replacement, &rdev->flags);
2890 err = 0;
2891 }
2892 } else if (cmd_match(buf, "re-add")) {
2893 if (!rdev->mddev->pers)
2894 err = -EINVAL;
2895 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
2896 rdev->saved_raid_disk >= 0) {
2897 /* clear_bit is performed _after_ all the devices
2898 * have their local Faulty bit cleared. If any writes
2899 * happen in the meantime in the local node, they
2900 * will land in the local bitmap, which will be synced
2901 * by this node eventually
2902 */
2903 if (!mddev_is_clustered(rdev->mddev) ||
2904 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
2905 clear_bit(Faulty, &rdev->flags);
2906 err = add_bound_rdev(rdev);
2907 }
2908 } else
2909 err = -EBUSY;
2910 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
2911 set_bit(ExternalBbl, &rdev->flags);
2912 rdev->badblocks.shift = 0;
2913 err = 0;
2914 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
2915 clear_bit(ExternalBbl, &rdev->flags);
2916 err = 0;
2917 }
2918 if (!err)
2919 sysfs_notify_dirent_safe(rdev->sysfs_state);
2920 return err ? err : len;
2921 }
2922 static struct rdev_sysfs_entry rdev_state =
2923 __ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
2924
2925 static ssize_t
2926 errors_show(struct md_rdev *rdev, char *page)
2927 {
2928 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2929 }
2930
2931 static ssize_t
2932 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2933 {
2934 unsigned int n;
2935 int rv;
2936
2937 rv = kstrtouint(buf, 10, &n);
2938 if (rv < 0)
2939 return rv;
2940 atomic_set(&rdev->corrected_errors, n);
2941 return len;
2942 }
2943 static struct rdev_sysfs_entry rdev_errors =
2944 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2945
2946 static ssize_t
2947 slot_show(struct md_rdev *rdev, char *page)
2948 {
2949 if (test_bit(Journal, &rdev->flags))
2950 return sprintf(page, "journal\n");
2951 else if (rdev->raid_disk < 0)
2952 return sprintf(page, "none\n");
2953 else
2954 return sprintf(page, "%d\n", rdev->raid_disk);
2955 }
2956
2957 static ssize_t
2958 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2959 {
2960 int slot;
2961 int err;
2962
2963 if (test_bit(Journal, &rdev->flags))
2964 return -EBUSY;
2965 if (strncmp(buf, "none", 4)==0)
2966 slot = -1;
2967 else {
2968 err = kstrtouint(buf, 10, (unsigned int *)&slot);
2969 if (err < 0)
2970 return err;
2971 }
2972 if (rdev->mddev->pers && slot == -1) {
2973 /* Setting 'slot' on an active array requires also
2974 * updating the 'rd%d' link, and communicating
2975 * with the personality via ->hot_*_disk.
2976 * For now we only support removing
2977 * failed/spare devices. This normally happens automatically,
2978 * but not when the metadata is externally managed.
2979 */
2980 if (rdev->raid_disk == -1)
2981 return -EEXIST;
2982 /* personality does all needed checks */
2983 if (rdev->mddev->pers->hot_remove_disk == NULL)
2984 return -EINVAL;
2985 clear_bit(Blocked, &rdev->flags);
2986 remove_and_add_spares(rdev->mddev, rdev);
2987 if (rdev->raid_disk >= 0)
2988 return -EBUSY;
2989 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2990 md_wakeup_thread(rdev->mddev->thread);
2991 } else if (rdev->mddev->pers) {
2992 /* Activating a spare .. or possibly reactivating
2993 * if we ever get bitmaps working here.
2994 */
2995 int err;
2996
2997 if (rdev->raid_disk != -1)
2998 return -EBUSY;
2999
3000 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
3001 return -EBUSY;
3002
3003 if (rdev->mddev->pers->hot_add_disk == NULL)
3004 return -EINVAL;
3005
3006 if (slot >= rdev->mddev->raid_disks &&
3007 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3008 return -ENOSPC;
3009
3010 rdev->raid_disk = slot;
3011 if (test_bit(In_sync, &rdev->flags))
3012 rdev->saved_raid_disk = slot;
3013 else
3014 rdev->saved_raid_disk = -1;
3015 clear_bit(In_sync, &rdev->flags);
3016 clear_bit(Bitmap_sync, &rdev->flags);
3017 err = rdev->mddev->pers->
3018 hot_add_disk(rdev->mddev, rdev);
3019 if (err) {
3020 rdev->raid_disk = -1;
3021 return err;
3022 } else
3023 sysfs_notify_dirent_safe(rdev->sysfs_state);
3024 if (sysfs_link_rdev(rdev->mddev, rdev))
3025 /* failure here is OK */;
3026 /* don't wakeup anyone, leave that to userspace. */
3027 } else {
3028 if (slot >= rdev->mddev->raid_disks &&
3029 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3030 return -ENOSPC;
3031 rdev->raid_disk = slot;
3032 /* assume it is working */
3033 clear_bit(Faulty, &rdev->flags);
3034 clear_bit(WriteMostly, &rdev->flags);
3035 set_bit(In_sync, &rdev->flags);
3036 sysfs_notify_dirent_safe(rdev->sysfs_state);
3037 }
3038 return len;
3039 }
3040
3041 static struct rdev_sysfs_entry rdev_slot =
3042 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
3043
3044 static ssize_t
3045 offset_show(struct md_rdev *rdev, char *page)
3046 {
3047 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
3048 }
3049
3050 static ssize_t
3051 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
3052 {
3053 unsigned long long offset;
3054 if (kstrtoull(buf, 10, &offset) < 0)
3055 return -EINVAL;
3056 if (rdev->mddev->pers && rdev->raid_disk >= 0)
3057 return -EBUSY;
3058 if (rdev->sectors && rdev->mddev->external)
3059 /* Must set offset before size, so overlap checks
3060 * can be sane */
3061 return -EBUSY;
3062 rdev->data_offset = offset;
3063 rdev->new_data_offset = offset;
3064 return len;
3065 }
3066
3067 static struct rdev_sysfs_entry rdev_offset =
3068 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
3069
3070 static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3071 {
3072 return sprintf(page, "%llu\n",
3073 (unsigned long long)rdev->new_data_offset);
3074 }
3075
3076 static ssize_t new_offset_store(struct md_rdev *rdev,
3077 const char *buf, size_t len)
3078 {
3079 unsigned long long new_offset;
3080 struct mddev *mddev = rdev->mddev;
3081
3082 if (kstrtoull(buf, 10, &new_offset) < 0)
3083 return -EINVAL;
3084
3085 if (mddev->sync_thread ||
3086 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
3087 return -EBUSY;
3088 if (new_offset == rdev->data_offset)
3089 /* reset is always permitted */
3090 ;
3091 else if (new_offset > rdev->data_offset) {
3092 /* must not push array size beyond rdev_sectors */
3093 if (new_offset - rdev->data_offset
3094 + mddev->dev_sectors > rdev->sectors)
3095 return -E2BIG;
3096 }
3097 /* Metadata worries about other space details. */
3098
3099 /* decreasing the offset is inconsistent with a backwards
3100 * reshape.
3101 */
3102 if (new_offset < rdev->data_offset &&
3103 mddev->reshape_backwards)
3104 return -EINVAL;
3105 /* Increasing offset is inconsistent with forwards
3106 * reshape. reshape_direction should be set to
3107 * 'backwards' first.
3108 */
3109 if (new_offset > rdev->data_offset &&
3110 !mddev->reshape_backwards)
3111 return -EINVAL;
3112
3113 if (mddev->pers && mddev->persistent &&
3114 !super_types[mddev->major_version]
3115 .allow_new_offset(rdev, new_offset))
3116 return -E2BIG;
3117 rdev->new_data_offset = new_offset;
3118 if (new_offset > rdev->data_offset)
3119 mddev->reshape_backwards = 1;
3120 else if (new_offset < rdev->data_offset)
3121 mddev->reshape_backwards = 0;
3122
3123 return len;
3124 }
3125 static struct rdev_sysfs_entry rdev_new_offset =
3126 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3127
3128 static ssize_t
3129 rdev_size_show(struct md_rdev *rdev, char *page)
3130 {
3131 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
3132 }
3133
3134 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
3135 {
3136 /* check if two start/length pairs overlap */
3137 if (s1+l1 <= s2)
3138 return 0;
3139 if (s2+l2 <= s1)
3140 return 0;
3141 return 1;
3142 }
3143
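/*
 * Parse a decimal block count (1K units) from sysfs and convert it to
 * 512-byte sectors, rejecting values that would overflow sector_t.
 */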
3144 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3145 {
3146 unsigned long long blocks;
3147 sector_t new;
3148
3149 if (kstrtoull(buf, 10, &blocks) < 0)
3150 return -EINVAL;
3151
3152 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3153 return -EINVAL; /* sector conversion overflow */
3154
3155 new = blocks * 2;
3156 if (new != blocks * 2)
3157 return -EINVAL; /* unsigned long long to sector_t overflow */
3158
3159 *sectors = new;
3160 return 0;
3161 }
3162
3163 static ssize_t
3164 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3165 {
3166 struct mddev *my_mddev = rdev->mddev;
3167 sector_t oldsectors = rdev->sectors;
3168 sector_t sectors;
3169
3170 if (test_bit(Journal, &rdev->flags))
3171 return -EBUSY;
3172 if (strict_blocks_to_sectors(buf, &sectors) < 0)
3173 return -EINVAL;
3174 if (rdev->data_offset != rdev->new_data_offset)
3175 return -EINVAL; /* too confusing */
3176 if (my_mddev->pers && rdev->raid_disk >= 0) {
3177 if (my_mddev->persistent) {
3178 sectors = super_types[my_mddev->major_version].
3179 rdev_size_change(rdev, sectors);
3180 if (!sectors)
3181 return -EBUSY;
3182 } else if (!sectors)
3183 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
3184 rdev->data_offset;
3185 if (!my_mddev->pers->resize)
3186 /* Cannot change size for RAID0 or Linear etc */
3187 return -EINVAL;
3188 }
3189 if (sectors < my_mddev->dev_sectors)
3190 return -EINVAL; /* component must fit device */
3191
3192 rdev->sectors = sectors;
3193 if (sectors > oldsectors && my_mddev->external) {
3194 /* Need to check that all other rdevs with the same
3195 * ->bdev do not overlap. 'rcu' is sufficient to walk
3196 * the rdev lists safely.
3197 * This check does not provide a hard guarantee, it
3198 * just helps avoid dangerous mistakes.
3199 */
3200 struct mddev *mddev;
3201 int overlap = 0;
3202 struct list_head *tmp;
3203
3204 rcu_read_lock();
3205 for_each_mddev(mddev, tmp) {
3206 struct md_rdev *rdev2;
3207
3208 rdev_for_each(rdev2, mddev)
3209 if (rdev->bdev == rdev2->bdev &&
3210 rdev != rdev2 &&
3211 overlaps(rdev->data_offset, rdev->sectors,
3212 rdev2->data_offset,
3213 rdev2->sectors)) {
3214 overlap = 1;
3215 break;
3216 }
3217 if (overlap) {
3218 mddev_put(mddev);
3219 break;
3220 }
3221 }
3222 rcu_read_unlock();
3223 if (overlap) {
3224 /* Someone else could have slipped in a size
3225 * change here, but doing so is just silly.
3226 * We put oldsectors back because we *know* it is
3227 * safe, and trust userspace not to race with
3228 * itself
3229 */
3230 rdev->sectors = oldsectors;
3231 return -EBUSY;
3232 }
3233 }
3234 return len;
3235 }
3236
3237 static struct rdev_sysfs_entry rdev_size =
3238 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3239
3240 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3241 {
3242 unsigned long long recovery_start = rdev->recovery_offset;
3243
3244 if (test_bit(In_sync, &rdev->flags) ||
3245 recovery_start == MaxSector)
3246 return sprintf(page, "none\n");
3247
3248 return sprintf(page, "%llu\n", recovery_start);
3249 }
3250
3251 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3252 {
3253 unsigned long long recovery_start;
3254
3255 if (cmd_match(buf, "none"))
3256 recovery_start = MaxSector;
3257 else if (kstrtoull(buf, 10, &recovery_start))
3258 return -EINVAL;
3259
3260 if (rdev->mddev->pers &&
3261 rdev->raid_disk >= 0)
3262 return -EBUSY;
3263
3264 rdev->recovery_offset = recovery_start;
3265 if (recovery_start == MaxSector)
3266 set_bit(In_sync, &rdev->flags);
3267 else
3268 clear_bit(In_sync, &rdev->flags);
3269 return len;
3270 }
3271
3272 static struct rdev_sysfs_entry rdev_recovery_start =
3273 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3274
3275 /* sysfs access to bad-blocks list.
3276 * We present two files.
3277 * 'bad_blocks' lists sector numbers and lengths of ranges that
3278 * are recorded as bad. The list is truncated to fit within
3279 * the one-page limit of sysfs.
3280 * Writing "sector length" to this file adds an acknowledged
3281 * bad block.
3282 * 'unacknowledged_bad_blocks' lists bad blocks that have not yet
3283 * been acknowledged. Writing to this file adds bad blocks
3284 * without acknowledging them. This is largely for testing.
3285 */
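/* e.g. "echo '2345 8' > bad_blocks" records an acknowledged 8-sector bad
 * range starting at sector 2345; the same write to
 * unacknowledged_bad_blocks records it without the acknowledged flag.
 */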
3286 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3287 {
3288 return badblocks_show(&rdev->badblocks, page, 0);
3289 }
3290 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3291 {
3292 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3293 /* Maybe that ack was all we needed */
3294 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3295 wake_up(&rdev->blocked_wait);
3296 return rv;
3297 }
3298 static struct rdev_sysfs_entry rdev_bad_blocks =
3299 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3300
3301 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3302 {
3303 return badblocks_show(&rdev->badblocks, page, 1);
3304 }
3305 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3306 {
3307 return badblocks_store(&rdev->badblocks, page, len, 1);
3308 }
3309 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3310 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3311
3312 static ssize_t
3313 ppl_sector_show(struct md_rdev *rdev, char *page)
3314 {
3315 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3316 }
3317
3318 static ssize_t
3319 ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3320 {
3321 unsigned long long sector;
3322
3323 if (kstrtoull(buf, 10, &sector) < 0)
3324 return -EINVAL;
3325 if (sector != (sector_t)sector)
3326 return -EINVAL;
3327
3328 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3329 rdev->raid_disk >= 0)
3330 return -EBUSY;
3331
3332 if (rdev->mddev->persistent) {
3333 if (rdev->mddev->major_version == 0)
3334 return -EINVAL;
3335 if ((sector > rdev->sb_start &&
3336 sector - rdev->sb_start > S16_MAX) ||
3337 (sector < rdev->sb_start &&
3338 rdev->sb_start - sector > -S16_MIN))
3339 return -EINVAL;
3340 rdev->ppl.offset = sector - rdev->sb_start;
3341 } else if (!rdev->mddev->external) {
3342 return -EBUSY;
3343 }
3344 rdev->ppl.sector = sector;
3345 return len;
3346 }
3347
3348 static struct rdev_sysfs_entry rdev_ppl_sector =
3349 __ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3350
3351 static ssize_t
3352 ppl_size_show(struct md_rdev *rdev, char *page)
3353 {
3354 return sprintf(page, "%u\n", rdev->ppl.size);
3355 }
3356
3357 static ssize_t
3358 ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3359 {
3360 unsigned int size;
3361
3362 if (kstrtouint(buf, 10, &size) < 0)
3363 return -EINVAL;
3364
3365 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3366 rdev->raid_disk >= 0)
3367 return -EBUSY;
3368
3369 if (rdev->mddev->persistent) {
3370 if (rdev->mddev->major_version == 0)
3371 return -EINVAL;
3372 if (size > U16_MAX)
3373 return -EINVAL;
3374 } else if (!rdev->mddev->external) {
3375 return -EBUSY;
3376 }
3377 rdev->ppl.size = size;
3378 return len;
3379 }
3380
3381 static struct rdev_sysfs_entry rdev_ppl_size =
3382 __ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3383
3384 static struct attribute *rdev_default_attrs[] = {
3385 &rdev_state.attr,
3386 &rdev_errors.attr,
3387 &rdev_slot.attr,
3388 &rdev_offset.attr,
3389 &rdev_new_offset.attr,
3390 &rdev_size.attr,
3391 &rdev_recovery_start.attr,
3392 &rdev_bad_blocks.attr,
3393 &rdev_unack_bad_blocks.attr,
3394 &rdev_ppl_sector.attr,
3395 &rdev_ppl_size.attr,
3396 NULL,
3397 };
3398 static ssize_t
3399 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3400 {
3401 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3402 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3403
3404 if (!entry->show)
3405 return -EIO;
3406 if (!rdev->mddev)
3407 return -EBUSY;
3408 return entry->show(rdev, page);
3409 }
3410
3411 static ssize_t
3412 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3413 const char *page, size_t length)
3414 {
3415 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3416 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3417 ssize_t rv;
3418 struct mddev *mddev = rdev->mddev;
3419
3420 if (!entry->store)
3421 return -EIO;
3422 if (!capable(CAP_SYS_ADMIN))
3423 return -EACCES;
3424 rv = mddev ? mddev_lock(mddev): -EBUSY;
3425 if (!rv) {
3426 if (rdev->mddev == NULL)
3427 rv = -EBUSY;
3428 else
3429 rv = entry->store(rdev, page, length);
3430 mddev_unlock(mddev);
3431 }
3432 return rv;
3433 }
3434
3435 static void rdev_free(struct kobject *ko)
3436 {
3437 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3438 kfree(rdev);
3439 }
3440 static const struct sysfs_ops rdev_sysfs_ops = {
3441 .show = rdev_attr_show,
3442 .store = rdev_attr_store,
3443 };
3444 static struct kobj_type rdev_ktype = {
3445 .release = rdev_free,
3446 .sysfs_ops = &rdev_sysfs_ops,
3447 .default_attrs = rdev_default_attrs,
3448 };
3449
3450 int md_rdev_init(struct md_rdev *rdev)
3451 {
3452 rdev->desc_nr = -1;
3453 rdev->saved_raid_disk = -1;
3454 rdev->raid_disk = -1;
3455 rdev->flags = 0;
3456 rdev->data_offset = 0;
3457 rdev->new_data_offset = 0;
3458 rdev->sb_events = 0;
3459 rdev->last_read_error = 0;
3460 rdev->sb_loaded = 0;
3461 rdev->bb_page = NULL;
3462 atomic_set(&rdev->nr_pending, 0);
3463 atomic_set(&rdev->read_errors, 0);
3464 atomic_set(&rdev->corrected_errors, 0);
3465
3466 INIT_LIST_HEAD(&rdev->same_set);
3467 init_waitqueue_head(&rdev->blocked_wait);
3468
3469 /* Add space to store bad block list.
3470 * This reserves the space even on arrays where it cannot
3471 * be used - I wonder if that matters
3472 */
3473 return badblocks_init(&rdev->badblocks, 0);
3474 }
3475 EXPORT_SYMBOL_GPL(md_rdev_init);
3476 /*
3477 * Import a device. If 'super_format' >= 0, then sanity check the superblock
3478 *
3479 * mark the device faulty if:
3480 *
3481 * - the device is nonexistent (zero size)
3482 * - the device has no valid superblock
3483 *
3484 * a faulty rdev _never_ has rdev->sb set.
3485 */
3486 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3487 {
3488 char b[BDEVNAME_SIZE];
3489 int err;
3490 struct md_rdev *rdev;
3491 sector_t size;
3492
3493 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3494 if (!rdev)
3495 return ERR_PTR(-ENOMEM);
3496
3497 err = md_rdev_init(rdev);
3498 if (err)
3499 goto abort_free;
3500 err = alloc_disk_sb(rdev);
3501 if (err)
3502 goto abort_free;
3503
3504 err = lock_rdev(rdev, newdev, super_format == -2);
3505 if (err)
3506 goto abort_free;
3507
3508 kobject_init(&rdev->kobj, &rdev_ktype);
3509
3510 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3511 if (!size) {
3512 pr_warn("md: %s has zero or unknown size, marking faulty!\n",
3513 bdevname(rdev->bdev,b));
3514 err = -EINVAL;
3515 goto abort_free;
3516 }
3517
3518 if (super_format >= 0) {
3519 err = super_types[super_format].
3520 load_super(rdev, NULL, super_minor);
3521 if (err == -EINVAL) {
3522 pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n",
3523 bdevname(rdev->bdev,b),
3524 super_format, super_minor);
3525 goto abort_free;
3526 }
3527 if (err < 0) {
3528 pr_warn("md: could not read %s's sb, not importing!\n",
3529 bdevname(rdev->bdev,b));
3530 goto abort_free;
3531 }
3532 }
3533
3534 return rdev;
3535
3536 abort_free:
3537 if (rdev->bdev)
3538 unlock_rdev(rdev);
3539 md_rdev_clear(rdev);
3540 kfree(rdev);
3541 return ERR_PTR(err);
3542 }
3543
3544 /*
3545 * Check a full RAID array for plausibility
3546 */
3547
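/*
 * Pick the device with the freshest superblock, validate every other device
 * against it, and kick stale or over-limit devices from the array.
 */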
3548 static void analyze_sbs(struct mddev *mddev)
3549 {
3550 int i;
3551 struct md_rdev *rdev, *freshest, *tmp;
3552 char b[BDEVNAME_SIZE];
3553
3554 freshest = NULL;
3555 rdev_for_each_safe(rdev, tmp, mddev)
3556 switch (super_types[mddev->major_version].
3557 load_super(rdev, freshest, mddev->minor_version)) {
3558 case 1:
3559 freshest = rdev;
3560 break;
3561 case 0:
3562 break;
3563 default:
3564 pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n",
3565 bdevname(rdev->bdev,b));
3566 md_kick_rdev_from_array(rdev);
3567 }
3568
3569 super_types[mddev->major_version].
3570 validate_super(mddev, freshest);
3571
3572 i = 0;
3573 rdev_for_each_safe(rdev, tmp, mddev) {
3574 if (mddev->max_disks &&
3575 (rdev->desc_nr >= mddev->max_disks ||
3576 i > mddev->max_disks)) {
3577 pr_warn("md: %s: %s: only %d devices permitted\n",
3578 mdname(mddev), bdevname(rdev->bdev, b),
3579 mddev->max_disks);
3580 md_kick_rdev_from_array(rdev);
3581 continue;
3582 }
3583 if (rdev != freshest) {
3584 if (super_types[mddev->major_version].
3585 validate_super(mddev, rdev)) {
3586 pr_warn("md: kicking non-fresh %s from array!\n",
3587 bdevname(rdev->bdev,b));
3588 md_kick_rdev_from_array(rdev);
3589 continue;
3590 }
3591 }
3592 if (mddev->level == LEVEL_MULTIPATH) {
3593 rdev->desc_nr = i++;
3594 rdev->raid_disk = rdev->desc_nr;
3595 set_bit(In_sync, &rdev->flags);
3596 } else if (rdev->raid_disk >=
3597 (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3598 !test_bit(Journal, &rdev->flags)) {
3599 rdev->raid_disk = -1;
3600 clear_bit(In_sync, &rdev->flags);
3601 }
3602 }
3603 }
3604
3605 /* Read a fixed-point number.
3606 * Numbers in sysfs attributes should be in "standard" units where
3607 * possible, so time should be in seconds.
3608 * However we internally use a much smaller unit such as
3609 * milliseconds or jiffies.
3610 * This function takes a decimal number with a possible fractional
3611 * component, and produces an integer which is the result of
3612 * multiplying that number by 10^'scale',
3613 * all without any floating-point arithmetic.
3614 */
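/* e.g. strict_strtoul_scaled("1.25", &res, 3) stores 1250 in *res. */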
3615 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3616 {
3617 unsigned long result = 0;
3618 long decimals = -1;
3619 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3620 if (*cp == '.')
3621 decimals = 0;
3622 else if (decimals < scale) {
3623 unsigned int value;
3624 value = *cp - '0';
3625 result = result * 10 + value;
3626 if (decimals >= 0)
3627 decimals++;
3628 }
3629 cp++;
3630 }
3631 if (*cp == '\n')
3632 cp++;
3633 if (*cp)
3634 return -EINVAL;
3635 if (decimals < 0)
3636 decimals = 0;
3637 while (decimals < scale) {
3638 result *= 10;
3639 decimals ++;
3640 }
3641 *res = result;
3642 return 0;
3643 }
3644
3645 static ssize_t
3646 safe_delay_show(struct mddev *mddev, char *page)
3647 {
3648 int msec = (mddev->safemode_delay*1000)/HZ;
3649 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3650 }
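/* The delay is shown and parsed in seconds with millisecond resolution:
 * writing "0.200" selects a 200ms delay (converted to jiffies, minimum one
 * jiffy), and "0" disables the safe-mode delay entirely.
 */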
3651 static ssize_t
3652 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3653 {
3654 unsigned long msec;
3655
3656 if (mddev_is_clustered(mddev)) {
3657 pr_warn("md: Safemode is disabled for clustered mode\n");
3658 return -EINVAL;
3659 }
3660
3661 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3662 return -EINVAL;
3663 if (msec == 0)
3664 mddev->safemode_delay = 0;
3665 else {
3666 unsigned long old_delay = mddev->safemode_delay;
3667 unsigned long new_delay = (msec*HZ)/1000;
3668
3669 if (new_delay == 0)
3670 new_delay = 1;
3671 mddev->safemode_delay = new_delay;
3672 if (new_delay < old_delay || old_delay == 0)
3673 mod_timer(&mddev->safemode_timer, jiffies+1);
3674 }
3675 return len;
3676 }
3677 static struct md_sysfs_entry md_safe_delay =
3678 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3679
3680 static ssize_t
3681 level_show(struct mddev *mddev, char *page)
3682 {
3683 struct md_personality *p;
3684 int ret;
3685 spin_lock(&mddev->lock);
3686 p = mddev->pers;
3687 if (p)
3688 ret = sprintf(page, "%s\n", p->name);
3689 else if (mddev->clevel[0])
3690 ret = sprintf(page, "%s\n", mddev->clevel);
3691 else if (mddev->level != LEVEL_NONE)
3692 ret = sprintf(page, "%d\n", mddev->level);
3693 else
3694 ret = 0;
3695 spin_unlock(&mddev->lock);
3696 return ret;
3697 }
3698
3699 static ssize_t
3700 level_store(struct mddev *mddev, const char *buf, size_t len)
3701 {
3702 char clevel[16];
3703 ssize_t rv;
3704 size_t slen = len;
3705 struct md_personality *pers, *oldpers;
3706 long level;
3707 void *priv, *oldpriv;
3708 struct md_rdev *rdev;
3709
3710 if (slen == 0 || slen >= sizeof(clevel))
3711 return -EINVAL;
3712
3713 rv = mddev_lock(mddev);
3714 if (rv)
3715 return rv;
3716
3717 if (mddev->pers == NULL) {
3718 strncpy(mddev->clevel, buf, slen);
3719 if (mddev->clevel[slen-1] == '\n')
3720 slen--;
3721 mddev->clevel[slen] = 0;
3722 mddev->level = LEVEL_NONE;
3723 rv = len;
3724 goto out_unlock;
3725 }
3726 rv = -EROFS;
3727 if (mddev->ro)
3728 goto out_unlock;
3729
3730 /* Request to change the personality. Need to ensure that:
3731 * - the array is not engaged in resync/recovery/reshape
3732 * - the old personality can be suspended
3733 * - the new personality can take over the existing array.
3734 */
3735
3736 rv = -EBUSY;
3737 if (mddev->sync_thread ||
3738 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3739 mddev->reshape_position != MaxSector ||
3740 mddev->sysfs_active)
3741 goto out_unlock;
3742
3743 rv = -EINVAL;
3744 if (!mddev->pers->quiesce) {
3745 pr_warn("md: %s: %s does not support online personality change\n",
3746 mdname(mddev), mddev->pers->name);
3747 goto out_unlock;
3748 }
3749
3750 /* Now find the new personality */
3751 strncpy(clevel, buf, slen);
3752 if (clevel[slen-1] == '\n')
3753 slen--;
3754 clevel[slen] = 0;
3755 if (kstrtol(clevel, 10, &level))
3756 level = LEVEL_NONE;
3757
3758 if (request_module("md-%s", clevel) != 0)
3759 request_module("md-level-%s", clevel);
3760 spin_lock(&pers_lock);
3761 pers = find_pers(level, clevel);
3762 if (!pers || !try_module_get(pers->owner)) {
3763 spin_unlock(&pers_lock);
3764 pr_warn("md: personality %s not loaded\n", clevel);
3765 rv = -EINVAL;
3766 goto out_unlock;
3767 }
3768 spin_unlock(&pers_lock);
3769
3770 if (pers == mddev->pers) {
3771 /* Nothing to do! */
3772 module_put(pers->owner);
3773 rv = len;
3774 goto out_unlock;
3775 }
3776 if (!pers->takeover) {
3777 module_put(pers->owner);
3778 pr_warn("md: %s: %s does not support personality takeover\n",
3779 mdname(mddev), clevel);
3780 rv = -EINVAL;
3781 goto out_unlock;
3782 }
3783
3784 rdev_for_each(rdev, mddev)
3785 rdev->new_raid_disk = rdev->raid_disk;
3786
3787 /* ->takeover must set new_* and/or delta_disks
3788 * if it succeeds, and may set them when it fails.
3789 */
3790 priv = pers->takeover(mddev);
3791 if (IS_ERR(priv)) {
3792 mddev->new_level = mddev->level;
3793 mddev->new_layout = mddev->layout;
3794 mddev->new_chunk_sectors = mddev->chunk_sectors;
3795 mddev->raid_disks -= mddev->delta_disks;
3796 mddev->delta_disks = 0;
3797 mddev->reshape_backwards = 0;
3798 module_put(pers->owner);
3799 pr_warn("md: %s: %s would not accept array\n",
3800 mdname(mddev), clevel);
3801 rv = PTR_ERR(priv);
3802 goto out_unlock;
3803 }
3804
3805 /* Looks like we have a winner */
3806 mddev_suspend(mddev);
3807 mddev_detach(mddev);
3808
3809 spin_lock(&mddev->lock);
3810 oldpers = mddev->pers;
3811 oldpriv = mddev->private;
3812 mddev->pers = pers;
3813 mddev->private = priv;
3814 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3815 mddev->level = mddev->new_level;
3816 mddev->layout = mddev->new_layout;
3817 mddev->chunk_sectors = mddev->new_chunk_sectors;
3818 mddev->delta_disks = 0;
3819 mddev->reshape_backwards = 0;
3820 mddev->degraded = 0;
3821 spin_unlock(&mddev->lock);
3822
3823 if (oldpers->sync_request == NULL &&
3824 mddev->external) {
3825 /* We are converting from a no-redundancy array
3826 * to a redundancy array and metadata is managed
3827 * externally so we need to be sure that writes
3828 * won't block due to a need to transition
3829 * clean->dirty
3830 * until external management is started.
3831 */
3832 mddev->in_sync = 0;
3833 mddev->safemode_delay = 0;
3834 mddev->safemode = 0;
3835 }
3836
3837 oldpers->free(mddev, oldpriv);
3838
3839 if (oldpers->sync_request == NULL &&
3840 pers->sync_request != NULL) {
3841 /* need to add the md_redundancy_group */
3842 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3843 pr_warn("md: cannot register extra attributes for %s\n",
3844 mdname(mddev));
3845 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3846 }
3847 if (oldpers->sync_request != NULL &&
3848 pers->sync_request == NULL) {
3849 /* need to remove the md_redundancy_group */
3850 if (mddev->to_remove == NULL)
3851 mddev->to_remove = &md_redundancy_group;
3852 }
3853
3854 module_put(oldpers->owner);
3855
3856 rdev_for_each(rdev, mddev) {
3857 if (rdev->raid_disk < 0)
3858 continue;
3859 if (rdev->new_raid_disk >= mddev->raid_disks)
3860 rdev->new_raid_disk = -1;
3861 if (rdev->new_raid_disk == rdev->raid_disk)
3862 continue;
3863 sysfs_unlink_rdev(mddev, rdev);
3864 }
3865 rdev_for_each(rdev, mddev) {
3866 if (rdev->raid_disk < 0)
3867 continue;
3868 if (rdev->new_raid_disk == rdev->raid_disk)
3869 continue;
3870 rdev->raid_disk = rdev->new_raid_disk;
3871 if (rdev->raid_disk < 0)
3872 clear_bit(In_sync, &rdev->flags);
3873 else {
3874 if (sysfs_link_rdev(mddev, rdev))
3875 pr_warn("md: cannot register rd%d for %s after level change\n",
3876 rdev->raid_disk, mdname(mddev));
3877 }
3878 }
3879
3880 if (pers->sync_request == NULL) {
3881 /* this is now an array without redundancy, so
3882 * it must always be in_sync
3883 */
3884 mddev->in_sync = 1;
3885 del_timer_sync(&mddev->safemode_timer);
3886 }
3887 blk_set_stacking_limits(&mddev->queue->limits);
3888 pers->run(mddev);
3889 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
3890 mddev_resume(mddev);
3891 if (!mddev->thread)
3892 md_update_sb(mddev, 1);
3893 sysfs_notify(&mddev->kobj, NULL, "level");
3894 md_new_event(mddev);
3895 rv = len;
3896 out_unlock:
3897 mddev_unlock(mddev);
3898 return rv;
3899 }
3900
3901 static struct md_sysfs_entry md_level =
3902 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
3903
3904 static ssize_t
3905 layout_show(struct mddev *mddev, char *page)
3906 {
3907 /* just a number, not meaningful for all levels */
3908 if (mddev->reshape_position != MaxSector &&
3909 mddev->layout != mddev->new_layout)
3910 return sprintf(page, "%d (%d)\n",
3911 mddev->new_layout, mddev->layout);
3912 return sprintf(page, "%d\n", mddev->layout);
3913 }
3914
3915 static ssize_t
3916 layout_store(struct mddev *mddev, const char *buf, size_t len)
3917 {
3918 unsigned int n;
3919 int err;
3920
3921 err = kstrtouint(buf, 10, &n);
3922 if (err < 0)
3923 return err;
3924 err = mddev_lock(mddev);
3925 if (err)
3926 return err;
3927
3928 if (mddev->pers) {
3929 if (mddev->pers->check_reshape == NULL)
3930 err = -EBUSY;
3931 else if (mddev->ro)
3932 err = -EROFS;
3933 else {
3934 mddev->new_layout = n;
3935 err = mddev->pers->check_reshape(mddev);
3936 if (err)
3937 mddev->new_layout = mddev->layout;
3938 }
3939 } else {
3940 mddev->new_layout = n;
3941 if (mddev->reshape_position == MaxSector)
3942 mddev->layout = n;
3943 }
3944 mddev_unlock(mddev);
3945 return err ?: len;
3946 }
3947 static struct md_sysfs_entry md_layout =
3948 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3949
3950 static ssize_t
3951 raid_disks_show(struct mddev *mddev, char *page)
3952 {
3953 if (mddev->raid_disks == 0)
3954 return 0;
3955 if (mddev->reshape_position != MaxSector &&
3956 mddev->delta_disks != 0)
3957 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3958 mddev->raid_disks - mddev->delta_disks);
3959 return sprintf(page, "%d\n", mddev->raid_disks);
3960 }
3961
3962 static int update_raid_disks(struct mddev *mddev, int raid_disks);
3963
3964 static ssize_t
3965 raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3966 {
3967 unsigned int n;
3968 int err;
3969
3970 err = kstrtouint(buf, 10, &n);
3971 if (err < 0)
3972 return err;
3973
3974 err = mddev_lock(mddev);
3975 if (err)
3976 return err;
3977 if (mddev->pers)
3978 err = update_raid_disks(mddev, n);
3979 else if (mddev->reshape_position != MaxSector) {
3980 struct md_rdev *rdev;
3981 int olddisks = mddev->raid_disks - mddev->delta_disks;
3982
3983 err = -EINVAL;
3984 rdev_for_each(rdev, mddev) {
3985 if (olddisks < n &&
3986 rdev->data_offset < rdev->new_data_offset)
3987 goto out_unlock;
3988 if (olddisks > n &&
3989 rdev->data_offset > rdev->new_data_offset)
3990 goto out_unlock;
3991 }
3992 err = 0;
3993 mddev->delta_disks = n - olddisks;
3994 mddev->raid_disks = n;
3995 mddev->reshape_backwards = (mddev->delta_disks < 0);
3996 } else
3997 mddev->raid_disks = n;
3998 out_unlock:
3999 mddev_unlock(mddev);
4000 return err ? err : len;
4001 }
4002 static struct md_sysfs_entry md_raid_disks =
4003 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
4004
4005 static ssize_t
4006 chunk_size_show(struct mddev *mddev, char *page)
4007 {
4008 if (mddev->reshape_position != MaxSector &&
4009 mddev->chunk_sectors != mddev->new_chunk_sectors)
4010 return sprintf(page, "%d (%d)\n",
4011 mddev->new_chunk_sectors << 9,
4012 mddev->chunk_sectors << 9);
4013 return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
4014 }
4015
4016 static ssize_t
4017 chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
4018 {
4019 unsigned long n;
4020 int err;
4021
4022 err = kstrtoul(buf, 10, &n);
4023 if (err < 0)
4024 return err;
4025
4026 err = mddev_lock(mddev);
4027 if (err)
4028 return err;
4029 if (mddev->pers) {
4030 if (mddev->pers->check_reshape == NULL)
4031 err = -EBUSY;
4032 else if (mddev->ro)
4033 err = -EROFS;
4034 else {
4035 mddev->new_chunk_sectors = n >> 9;
4036 err = mddev->pers->check_reshape(mddev);
4037 if (err)
4038 mddev->new_chunk_sectors = mddev->chunk_sectors;
4039 }
4040 } else {
4041 mddev->new_chunk_sectors = n >> 9;
4042 if (mddev->reshape_position == MaxSector)
4043 mddev->chunk_sectors = n >> 9;
4044 }
4045 mddev_unlock(mddev);
4046 return err ?: len;
4047 }
4048 static struct md_sysfs_entry md_chunk_size =
4049 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
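/*
 * Units note (illustrative, not part of the driver): chunk_size is
 * exposed in bytes but stored internally in 512-byte sectors, hence
 * the "<< 9" / ">> 9" above. For example
 *
 *	echo 524288 > /sys/block/md0/md/chunk_size	(512 KiB)
 *
 * stores new_chunk_sectors == 1024.
 */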
4050
4051 static ssize_t
4052 resync_start_show(struct mddev *mddev, char *page)
4053 {
4054 if (mddev->recovery_cp == MaxSector)
4055 return sprintf(page, "none\n");
4056 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
4057 }
4058
4059 static ssize_t
4060 resync_start_store(struct mddev *mddev, const char *buf, size_t len)
4061 {
4062 unsigned long long n;
4063 int err;
4064
4065 if (cmd_match(buf, "none"))
4066 n = MaxSector;
4067 else {
4068 err = kstrtoull(buf, 10, &n);
4069 if (err < 0)
4070 return err;
4071 if (n != (sector_t)n)
4072 return -EINVAL;
4073 }
4074
4075 err = mddev_lock(mddev);
4076 if (err)
4077 return err;
4078 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4079 err = -EBUSY;
4080
4081 if (!err) {
4082 mddev->recovery_cp = n;
4083 if (mddev->pers)
4084 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
4085 }
4086 mddev_unlock(mddev);
4087 return err ?: len;
4088 }
4089 static struct md_sysfs_entry md_resync_start =
4090 __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4091 resync_start_show, resync_start_store);
4092
4093 /*
4094 * The array state can be:
4095 *
4096 * clear
4097 * No devices, no size, no level
4098 * Equivalent to STOP_ARRAY ioctl
4099 * inactive
4100 * May have some settings, but array is not active
4101 * all IO results in error
4102 * When written, doesn't tear down array, but just stops it
4103 * suspended (not supported yet)
4104 * All IO requests will block. The array can be reconfigured.
4105 * Writing this, if accepted, will block until array is quiescent
4106 * readonly
4107 * no resync can happen. no superblocks get written.
4108 * write requests fail
4109 * read-auto
4110 * like readonly, but behaves like 'clean' on a write request.
4111 *
4112 * clean - no pending writes, but otherwise active.
4113 * When written to inactive array, starts without resync
4114 * If a write request arrives then
4115 * if metadata is known, mark 'dirty' and switch to 'active'.
4116 * if not known, block and switch to write-pending
4117 * If written to an active array that has pending writes, then fails.
4118 * active
4119 * fully active: IO and resync can be happening.
4120 * When written to inactive array, starts with resync
4121 *
4122 * write-pending
4123 * clean, but writes are blocked waiting for 'active' to be written.
4124 *
4125 * active-idle
4126 * like active, but no writes have been seen for a while (100msec).
4127 *
4128 * broken
4129 * RAID0/LINEAR-only: same as clean, but array is missing a member.
4130 * It's useful because mounted RAID0/LINEAR arrays aren't stopped
4131 * when a member is gone, so this state will at least alert the
4132 * user that something is wrong.
4133 */
4134 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
4135 write_pending, active_idle, broken, bad_word};
4136 static char *array_states[] = {
4137 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
4138 "write-pending", "active-idle", "broken", NULL };
4139
4140 static int match_word(const char *word, char **list)
4141 {
4142 int n;
4143 for (n=0; list[n]; n++)
4144 if (cmd_match(word, list[n]))
4145 break;
4146 return n;
4147 }
4148
4149 static ssize_t
4150 array_state_show(struct mddev *mddev, char *page)
4151 {
4152 enum array_state st = inactive;
4153
4154 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
4155 switch(mddev->ro) {
4156 case 1:
4157 st = readonly;
4158 break;
4159 case 2:
4160 st = read_auto;
4161 break;
4162 case 0:
4163 spin_lock(&mddev->lock);
4164 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
4165 st = write_pending;
4166 else if (mddev->in_sync)
4167 st = clean;
4168 else if (mddev->safemode)
4169 st = active_idle;
4170 else
4171 st = active;
4172 spin_unlock(&mddev->lock);
4173 }
4174
4175 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
4176 st = broken;
4177 } else {
4178 if (list_empty(&mddev->disks) &&
4179 mddev->raid_disks == 0 &&
4180 mddev->dev_sectors == 0)
4181 st = clear;
4182 else
4183 st = inactive;
4184 }
4185 return sprintf(page, "%s\n", array_states[st]);
4186 }
4187
4188 static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
4189 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
4190 static int do_md_run(struct mddev *mddev);
4191 static int restart_array(struct mddev *mddev);
4192
4193 static ssize_t
4194 array_state_store(struct mddev *mddev, const char *buf, size_t len)
4195 {
4196 int err = 0;
4197 enum array_state st = match_word(buf, array_states);
4198
4199 if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
4200 /* don't take reconfig_mutex when toggling between
4201 * clean and active
4202 */
4203 spin_lock(&mddev->lock);
4204 if (st == active) {
4205 restart_array(mddev);
4206 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4207 md_wakeup_thread(mddev->thread);
4208 wake_up(&mddev->sb_wait);
4209 } else /* st == clean */ {
4210 restart_array(mddev);
4211 if (!set_in_sync(mddev))
4212 err = -EBUSY;
4213 }
4214 if (!err)
4215 sysfs_notify_dirent_safe(mddev->sysfs_state);
4216 spin_unlock(&mddev->lock);
4217 return err ?: len;
4218 }
4219 err = mddev_lock(mddev);
4220 if (err)
4221 return err;
4222 err = -EINVAL;
4223 switch(st) {
4224 case bad_word:
4225 break;
4226 case clear:
4227 /* stopping an active array */
4228 err = do_md_stop(mddev, 0, NULL);
4229 break;
4230 case inactive:
4231 /* stopping an active array */
4232 if (mddev->pers)
4233 err = do_md_stop(mddev, 2, NULL);
4234 else
4235 err = 0; /* already inactive */
4236 break;
4237 case suspended:
4238 break; /* not supported yet */
4239 case readonly:
4240 if (mddev->pers)
4241 err = md_set_readonly(mddev, NULL);
4242 else {
4243 mddev->ro = 1;
4244 set_disk_ro(mddev->gendisk, 1);
4245 err = do_md_run(mddev);
4246 }
4247 break;
4248 case read_auto:
4249 if (mddev->pers) {
4250 if (mddev->ro == 0)
4251 err = md_set_readonly(mddev, NULL);
4252 else if (mddev->ro == 1)
4253 err = restart_array(mddev);
4254 if (err == 0) {
4255 mddev->ro = 2;
4256 set_disk_ro(mddev->gendisk, 0);
4257 }
4258 } else {
4259 mddev->ro = 2;
4260 err = do_md_run(mddev);
4261 }
4262 break;
4263 case clean:
4264 if (mddev->pers) {
4265 err = restart_array(mddev);
4266 if (err)
4267 break;
4268 spin_lock(&mddev->lock);
4269 if (!set_in_sync(mddev))
4270 err = -EBUSY;
4271 spin_unlock(&mddev->lock);
4272 } else
4273 err = -EINVAL;
4274 break;
4275 case active:
4276 if (mddev->pers) {
4277 err = restart_array(mddev);
4278 if (err)
4279 break;
4280 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
4281 wake_up(&mddev->sb_wait);
4282 err = 0;
4283 } else {
4284 mddev->ro = 0;
4285 set_disk_ro(mddev->gendisk, 0);
4286 err = do_md_run(mddev);
4287 }
4288 break;
4289 case write_pending:
4290 case active_idle:
4291 case broken:
4292 /* these cannot be set */
4293 break;
4294 }
4295
4296 if (!err) {
4297 if (mddev->hold_active == UNTIL_IOCTL)
4298 mddev->hold_active = 0;
4299 sysfs_notify_dirent_safe(mddev->sysfs_state);
4300 }
4301 mddev_unlock(mddev);
4302 return err ?: len;
4303 }
4304 static struct md_sysfs_entry md_array_state =
4305 __ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
4306
4307 static ssize_t
4308 max_corrected_read_errors_show(struct mddev *mddev, char *page) {
4309 return sprintf(page, "%d\n",
4310 atomic_read(&mddev->max_corr_read_errors));
4311 }
4312
4313 static ssize_t
4314 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
4315 {
4316 unsigned int n;
4317 int rv;
4318
4319 rv = kstrtouint(buf, 10, &n);
4320 if (rv < 0)
4321 return rv;
4322 atomic_set(&mddev->max_corr_read_errors, n);
4323 return len;
4324 }
4325
4326 static struct md_sysfs_entry max_corr_read_errors =
4327 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4328 max_corrected_read_errors_store);
4329
4330 static ssize_t
4331 null_show(struct mddev *mddev, char *page)
4332 {
4333 return -EINVAL;
4334 }
4335
4336 static ssize_t
4337 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4338 {
4339 /* buf must be "%d:%d" (optionally followed by a newline), giving major and minor numbers */
4340 /* The new device is added to the array.
4341 * If the array has a persistent superblock, we read the
4342 * superblock to initialise info and check validity.
4343 * Otherwise, the only checking done is that in bind_rdev_to_array,
4344 * which mainly checks size.
4345 */
4346 char *e;
4347 int major = simple_strtoul(buf, &e, 10);
4348 int minor;
4349 dev_t dev;
4350 struct md_rdev *rdev;
4351 int err;
4352
4353 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4354 return -EINVAL;
4355 minor = simple_strtoul(e+1, &e, 10);
4356 if (*e && *e != '\n')
4357 return -EINVAL;
4358 dev = MKDEV(major, minor);
4359 if (major != MAJOR(dev) ||
4360 minor != MINOR(dev))
4361 return -EOVERFLOW;
4362
4363 flush_workqueue(md_misc_wq);
4364
4365 err = mddev_lock(mddev);
4366 if (err)
4367 return err;
4368 if (mddev->persistent) {
4369 rdev = md_import_device(dev, mddev->major_version,
4370 mddev->minor_version);
4371 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4372 struct md_rdev *rdev0
4373 = list_entry(mddev->disks.next,
4374 struct md_rdev, same_set);
4375 err = super_types[mddev->major_version]
4376 .load_super(rdev, rdev0, mddev->minor_version);
4377 if (err < 0)
4378 goto out;
4379 }
4380 } else if (mddev->external)
4381 rdev = md_import_device(dev, -2, -1);
4382 else
4383 rdev = md_import_device(dev, -1, -1);
4384
4385 if (IS_ERR(rdev)) {
4386 mddev_unlock(mddev);
4387 return PTR_ERR(rdev);
4388 }
4389 err = bind_rdev_to_array(rdev, mddev);
4390 out:
4391 if (err)
4392 export_rdev(rdev);
4393 mddev_unlock(mddev);
4394 if (!err)
4395 md_new_event(mddev);
4396 return err ? err : len;
4397 }
4398
4399 static struct md_sysfs_entry md_new_device =
4400 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
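/*
 * Illustrative usage (not part of the driver): new_dev takes a
 * "major:minor" pair naming the block device to add, e.g.
 *
 *	echo 8:16 > /sys/block/md0/md/new_dev
 *
 * adds the device with dev_t 8:16 (typically /dev/sdb).
 */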
4401
4402 static ssize_t
4403 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4404 {
4405 char *end;
4406 unsigned long chunk, end_chunk;
4407 int err;
4408
4409 err = mddev_lock(mddev);
4410 if (err)
4411 return err;
4412 if (!mddev->bitmap)
4413 goto out;
4414 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4415 while (*buf) {
4416 chunk = end_chunk = simple_strtoul(buf, &end, 0);
4417 if (buf == end) break;
4418 if (*end == '-') { /* range */
4419 buf = end + 1;
4420 end_chunk = simple_strtoul(buf, &end, 0);
4421 if (buf == end) break;
4422 }
4423 if (*end && !isspace(*end)) break;
4424 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4425 buf = skip_spaces(end);
4426 }
4427 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4428 out:
4429 mddev_unlock(mddev);
4430 return len;
4431 }
4432
4433 static struct md_sysfs_entry md_bitmap =
4434 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
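/*
 * Illustrative usage (not part of the driver): bitmap_set_bits accepts
 * space-separated chunk numbers and inclusive "start-end" ranges, e.g.
 *
 *	echo "0-15 100" > /sys/block/md0/md/bitmap_set_bits
 *
 * marks chunks 0 through 15 and chunk 100 dirty via bitmap_dirty_bits().
 */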
4435
4436 static ssize_t
4437 size_show(struct mddev *mddev, char *page)
4438 {
4439 return sprintf(page, "%llu\n",
4440 (unsigned long long)mddev->dev_sectors / 2);
4441 }
4442
4443 static int update_size(struct mddev *mddev, sector_t num_sectors);
4444
4445 static ssize_t
4446 size_store(struct mddev *mddev, const char *buf, size_t len)
4447 {
4448 /* If array is inactive, we can reduce the component size, but
4449 * not increase it (except from 0).
4450 * If array is active, we can try an on-line resize
4451 */
4452 sector_t sectors;
4453 int err = strict_blocks_to_sectors(buf, &sectors);
4454
4455 if (err < 0)
4456 return err;
4457 err = mddev_lock(mddev);
4458 if (err)
4459 return err;
4460 if (mddev->pers) {
4461 err = update_size(mddev, sectors);
4462 if (err == 0)
4463 md_update_sb(mddev, 1);
4464 } else {
4465 if (mddev->dev_sectors == 0 ||
4466 mddev->dev_sectors > sectors)
4467 mddev->dev_sectors = sectors;
4468 else
4469 err = -ENOSPC;
4470 }
4471 mddev_unlock(mddev);
4472 return err ? err : len;
4473 }
4474
4475 static struct md_sysfs_entry md_size =
4476 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4477
4478 /* Metadata version.
4479 * This is one of
4480 * 'none' for arrays with no metadata (good luck...)
4481 * 'external' for arrays with externally managed metadata,
4482 * or N.M for internally known formats
4483 */
4484 static ssize_t
4485 metadata_show(struct mddev *mddev, char *page)
4486 {
4487 if (mddev->persistent)
4488 return sprintf(page, "%d.%d\n",
4489 mddev->major_version, mddev->minor_version);
4490 else if (mddev->external)
4491 return sprintf(page, "external:%s\n", mddev->metadata_type);
4492 else
4493 return sprintf(page, "none\n");
4494 }
4495
4496 static ssize_t
4497 metadata_store(struct mddev *mddev, const char *buf, size_t len)
4498 {
4499 int major, minor;
4500 char *e;
4501 int err;
4502 /* Changing the details of 'external' metadata is
4503 * always permitted. Otherwise there must be
4504 * no devices attached to the array.
4505 */
4506
4507 err = mddev_lock(mddev);
4508 if (err)
4509 return err;
4510 err = -EBUSY;
4511 if (mddev->external && strncmp(buf, "external:", 9) == 0)
4512 ;
4513 else if (!list_empty(&mddev->disks))
4514 goto out_unlock;
4515
4516 err = 0;
4517 if (cmd_match(buf, "none")) {
4518 mddev->persistent = 0;
4519 mddev->external = 0;
4520 mddev->major_version = 0;
4521 mddev->minor_version = 90;
4522 goto out_unlock;
4523 }
4524 if (strncmp(buf, "external:", 9) == 0) {
4525 size_t namelen = len-9;
4526 if (namelen >= sizeof(mddev->metadata_type))
4527 namelen = sizeof(mddev->metadata_type)-1;
4528 strncpy(mddev->metadata_type, buf+9, namelen);
4529 mddev->metadata_type[namelen] = 0;
4530 if (namelen && mddev->metadata_type[namelen-1] == '\n')
4531 mddev->metadata_type[--namelen] = 0;
4532 mddev->persistent = 0;
4533 mddev->external = 1;
4534 mddev->major_version = 0;
4535 mddev->minor_version = 90;
4536 goto out_unlock;
4537 }
4538 major = simple_strtoul(buf, &e, 10);
4539 err = -EINVAL;
4540 if (e==buf || *e != '.')
4541 goto out_unlock;
4542 buf = e+1;
4543 minor = simple_strtoul(buf, &e, 10);
4544 if (e==buf || (*e && *e != '\n') )
4545 goto out_unlock;
4546 err = -ENOENT;
4547 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4548 goto out_unlock;
4549 mddev->major_version = major;
4550 mddev->minor_version = minor;
4551 mddev->persistent = 1;
4552 mddev->external = 0;
4553 err = 0;
4554 out_unlock:
4555 mddev_unlock(mddev);
4556 return err ?: len;
4557 }
4558
4559 static struct md_sysfs_entry md_metadata =
4560 __ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
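/*
 * Illustrative values for metadata_version (not part of the driver):
 *
 *	echo 1.2 > /sys/block/md0/md/metadata_version
 *	echo external:imsm > /sys/block/md0/md/metadata_version
 *	echo none > /sys/block/md0/md/metadata_version
 *
 * "N.M" selects an entry of super_types[], "external:<name>" records an
 * externally managed format, and "none" marks the array as having no
 * persistent metadata. Except for updating an existing "external:"
 * string, this is only accepted while no devices are attached.
 */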
4561
4562 static ssize_t
4563 action_show(struct mddev *mddev, char *page)
4564 {
4565 char *type = "idle";
4566 unsigned long recovery = mddev->recovery;
4567 if (test_bit(MD_RECOVERY_FROZEN, &recovery))
4568 type = "frozen";
4569 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
4570 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
4571 if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
4572 type = "reshape";
4573 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4574 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
4575 type = "resync";
4576 else if (test_bit(MD_RECOVERY_CHECK, &recovery))
4577 type = "check";
4578 else
4579 type = "repair";
4580 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
4581 type = "recover";
4582 else if (mddev->reshape_position != MaxSector)
4583 type = "reshape";
4584 }
4585 return sprintf(page, "%s\n", type);
4586 }
4587
4588 static ssize_t
4589 action_store(struct mddev *mddev, const char *page, size_t len)
4590 {
4591 if (!mddev->pers || !mddev->pers->sync_request)
4592 return -EINVAL;
4593
4594
4595 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4596 if (cmd_match(page, "frozen"))
4597 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4598 else
4599 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4600 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
4601 mddev_lock(mddev) == 0) {
4602 flush_workqueue(md_misc_wq);
4603 if (mddev->sync_thread) {
4604 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4605 md_reap_sync_thread(mddev);
4606 }
4607 mddev_unlock(mddev);
4608 }
4609 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4610 return -EBUSY;
4611 else if (cmd_match(page, "resync"))
4612 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4613 else if (cmd_match(page, "recover")) {
4614 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4615 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4616 } else if (cmd_match(page, "reshape")) {
4617 int err;
4618 if (mddev->pers->start_reshape == NULL)
4619 return -EINVAL;
4620 err = mddev_lock(mddev);
4621 if (!err) {
4622 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4623 err = -EBUSY;
4624 else {
4625 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4626 err = mddev->pers->start_reshape(mddev);
4627 }
4628 mddev_unlock(mddev);
4629 }
4630 if (err)
4631 return err;
4632 sysfs_notify(&mddev->kobj, NULL, "degraded");
4633 } else {
4634 if (cmd_match(page, "check"))
4635 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4636 else if (!cmd_match(page, "repair"))
4637 return -EINVAL;
4638 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4639 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4640 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4641 }
4642 if (mddev->ro == 2) {
4643 /* A write to sync_action is enough to justify
4644 * canceling read-auto mode
4645 */
4646 mddev->ro = 0;
4647 md_wakeup_thread(mddev->sync_thread);
4648 }
4649 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4650 md_wakeup_thread(mddev->thread);
4651 sysfs_notify_dirent_safe(mddev->sysfs_action);
4652 return len;
4653 }
4654
4655 static struct md_sysfs_entry md_scan_mode =
4656 __ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
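/*
 * Illustrative usage of sync_action (not part of the driver):
 *
 *	echo check  > /sys/block/md0/md/sync_action	(read and compare only)
 *	echo repair > /sys/block/md0/md/sync_action	(compare and rewrite)
 *	echo idle   > /sys/block/md0/md/sync_action	(interrupt a running sync)
 *
 * "check" sets MD_RECOVERY_CHECK in addition to the REQUESTED and SYNC
 * bits that "repair" also sets, which is how the sync code tells the
 * two apart.
 */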
4657
4658 static ssize_t
4659 last_sync_action_show(struct mddev *mddev, char *page)
4660 {
4661 return sprintf(page, "%s\n", mddev->last_sync_action);
4662 }
4663
4664 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
4665
4666 static ssize_t
4667 mismatch_cnt_show(struct mddev *mddev, char *page)
4668 {
4669 return sprintf(page, "%llu\n",
4670 (unsigned long long)
4671 atomic64_read(&mddev->resync_mismatches));
4672 }
4673
4674 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4675
4676 static ssize_t
4677 sync_min_show(struct mddev *mddev, char *page)
4678 {
4679 return sprintf(page, "%d (%s)\n", speed_min(mddev),
4680 mddev->sync_speed_min ? "local": "system");
4681 }
4682
4683 static ssize_t
4684 sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4685 {
4686 unsigned int min;
4687 int rv;
4688
4689 if (strncmp(buf, "system", 6)==0) {
4690 min = 0;
4691 } else {
4692 rv = kstrtouint(buf, 10, &min);
4693 if (rv < 0)
4694 return rv;
4695 if (min == 0)
4696 return -EINVAL;
4697 }
4698 mddev->sync_speed_min = min;
4699 return len;
4700 }
4701
4702 static struct md_sysfs_entry md_sync_min =
4703 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4704
4705 static ssize_t
4706 sync_max_show(struct mddev *mddev, char *page)
4707 {
4708 return sprintf(page, "%d (%s)\n", speed_max(mddev),
4709 mddev->sync_speed_max ? "local": "system");
4710 }
4711
4712 static ssize_t
4713 sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4714 {
4715 unsigned int max;
4716 int rv;
4717
4718 if (strncmp(buf, "system", 6)==0) {
4719 max = 0;
4720 } else {
4721 rv = kstrtouint(buf, 10, &max);
4722 if (rv < 0)
4723 return rv;
4724 if (max == 0)
4725 return -EINVAL;
4726 }
4727 mddev->sync_speed_max = max;
4728 return len;
4729 }
4730
4731 static struct md_sysfs_entry md_sync_max =
4732 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4733
4734 static ssize_t
4735 degraded_show(struct mddev *mddev, char *page)
4736 {
4737 return sprintf(page, "%d\n", mddev->degraded);
4738 }
4739 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4740
4741 static ssize_t
4742 sync_force_parallel_show(struct mddev *mddev, char *page)
4743 {
4744 return sprintf(page, "%d\n", mddev->parallel_resync);
4745 }
4746
4747 static ssize_t
4748 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4749 {
4750 long n;
4751
4752 if (kstrtol(buf, 10, &n))
4753 return -EINVAL;
4754
4755 if (n != 0 && n != 1)
4756 return -EINVAL;
4757
4758 mddev->parallel_resync = n;
4759
4760 if (mddev->sync_thread)
4761 wake_up(&resync_wait);
4762
4763 return len;
4764 }
4765
4766 /* force parallel resync, even with shared block devices */
4767 static struct md_sysfs_entry md_sync_force_parallel =
4768 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4769 sync_force_parallel_show, sync_force_parallel_store);
4770
4771 static ssize_t
4772 sync_speed_show(struct mddev *mddev, char *page)
4773 {
4774 unsigned long resync, dt, db;
4775 if (mddev->curr_resync == 0)
4776 return sprintf(page, "none\n");
4777 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4778 dt = (jiffies - mddev->resync_mark) / HZ;
4779 if (!dt) dt++;
4780 db = resync - mddev->resync_mark_cnt;
4781 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4782 }
4783
4784 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
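/*
 * Worked example for the rate computed above (not part of the driver):
 * db is the number of 512-byte sectors completed since resync_mark and
 * dt the elapsed time in seconds, so db/dt/2 yields KiB per second.
 * For instance, 409600 sectors in 10 seconds:
 *
 *	409600 / 10 / 2 == 20480 K/sec
 */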
4785
4786 static ssize_t
4787 sync_completed_show(struct mddev *mddev, char *page)
4788 {
4789 unsigned long long max_sectors, resync;
4790
4791 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4792 return sprintf(page, "none\n");
4793
4794 if (mddev->curr_resync == 1 ||
4795 mddev->curr_resync == 2)
4796 return sprintf(page, "delayed\n");
4797
4798 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4799 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4800 max_sectors = mddev->resync_max_sectors;
4801 else
4802 max_sectors = mddev->dev_sectors;
4803
4804 resync = mddev->curr_resync_completed;
4805 return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4806 }
4807
4808 static struct md_sysfs_entry md_sync_completed =
4809 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
4810
4811 static ssize_t
4812 min_sync_show(struct mddev *mddev, char *page)
4813 {
4814 return sprintf(page, "%llu\n",
4815 (unsigned long long)mddev->resync_min);
4816 }
4817 static ssize_t
4818 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4819 {
4820 unsigned long long min;
4821 int err;
4822
4823 if (kstrtoull(buf, 10, &min))
4824 return -EINVAL;
4825
4826 spin_lock(&mddev->lock);
4827 err = -EINVAL;
4828 if (min > mddev->resync_max)
4829 goto out_unlock;
4830
4831 err = -EBUSY;
4832 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4833 goto out_unlock;
4834
4835 /* Round down to multiple of 4K for safety */
4836 mddev->resync_min = round_down(min, 8);
4837 err = 0;
4838
4839 out_unlock:
4840 spin_unlock(&mddev->lock);
4841 return err ?: len;
4842 }
4843
4844 static struct md_sysfs_entry md_min_sync =
4845 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4846
4847 static ssize_t
4848 max_sync_show(struct mddev *mddev, char *page)
4849 {
4850 if (mddev->resync_max == MaxSector)
4851 return sprintf(page, "max\n");
4852 else
4853 return sprintf(page, "%llu\n",
4854 (unsigned long long)mddev->resync_max);
4855 }
4856 static ssize_t
4857 max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4858 {
4859 int err;
4860 spin_lock(&mddev->lock);
4861 if (strncmp(buf, "max", 3) == 0)
4862 mddev->resync_max = MaxSector;
4863 else {
4864 unsigned long long max;
4865 int chunk;
4866
4867 err = -EINVAL;
4868 if (kstrtoull(buf, 10, &max))
4869 goto out_unlock;
4870 if (max < mddev->resync_min)
4871 goto out_unlock;
4872
4873 err = -EBUSY;
4874 if (max < mddev->resync_max &&
4875 mddev->ro == 0 &&
4876 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4877 goto out_unlock;
4878
4879 /* Must be a multiple of chunk_size */
4880 chunk = mddev->chunk_sectors;
4881 if (chunk) {
4882 sector_t temp = max;
4883
4884 err = -EINVAL;
4885 if (sector_div(temp, chunk))
4886 goto out_unlock;
4887 }
4888 mddev->resync_max = max;
4889 }
4890 wake_up(&mddev->recovery_wait);
4891 err = 0;
4892 out_unlock:
4893 spin_unlock(&mddev->lock);
4894 return err ?: len;
4895 }
4896
4897 static struct md_sysfs_entry md_max_sync =
4898 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4899
4900 static ssize_t
4901 suspend_lo_show(struct mddev *mddev, char *page)
4902 {
4903 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4904 }
4905
4906 static ssize_t
4907 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4908 {
4909 unsigned long long new;
4910 int err;
4911
4912 err = kstrtoull(buf, 10, &new);
4913 if (err < 0)
4914 return err;
4915 if (new != (sector_t)new)
4916 return -EINVAL;
4917
4918 err = mddev_lock(mddev);
4919 if (err)
4920 return err;
4921 err = -EINVAL;
4922 if (mddev->pers == NULL ||
4923 mddev->pers->quiesce == NULL)
4924 goto unlock;
4925 mddev_suspend(mddev);
4926 mddev->suspend_lo = new;
4927 mddev_resume(mddev);
4928
4929 err = 0;
4930 unlock:
4931 mddev_unlock(mddev);
4932 return err ?: len;
4933 }
4934 static struct md_sysfs_entry md_suspend_lo =
4935 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4936
4937 static ssize_t
4938 suspend_hi_show(struct mddev *mddev, char *page)
4939 {
4940 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4941 }
4942
4943 static ssize_t
4944 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
4945 {
4946 unsigned long long new;
4947 int err;
4948
4949 err = kstrtoull(buf, 10, &new);
4950 if (err < 0)
4951 return err;
4952 if (new != (sector_t)new)
4953 return -EINVAL;
4954
4955 err = mddev_lock(mddev);
4956 if (err)
4957 return err;
4958 err = -EINVAL;
4959 if (mddev->pers == NULL)
4960 goto unlock;
4961
4962 mddev_suspend(mddev);
4963 mddev->suspend_hi = new;
4964 mddev_resume(mddev);
4965
4966 err = 0;
4967 unlock:
4968 mddev_unlock(mddev);
4969 return err ?: len;
4970 }
4971 static struct md_sysfs_entry md_suspend_hi =
4972 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
4973
4974 static ssize_t
4975 reshape_position_show(struct mddev *mddev, char *page)
4976 {
4977 if (mddev->reshape_position != MaxSector)
4978 return sprintf(page, "%llu\n",
4979 (unsigned long long)mddev->reshape_position);
4980 strcpy(page, "none\n");
4981 return 5;
4982 }
4983
4984 static ssize_t
4985 reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4986 {
4987 struct md_rdev *rdev;
4988 unsigned long long new;
4989 int err;
4990
4991 err = kstrtoull(buf, 10, &new);
4992 if (err < 0)
4993 return err;
4994 if (new != (sector_t)new)
4995 return -EINVAL;
4996 err = mddev_lock(mddev);
4997 if (err)
4998 return err;
4999 err = -EBUSY;
5000 if (mddev->pers)
5001 goto unlock;
5002 mddev->reshape_position = new;
5003 mddev->delta_disks = 0;
5004 mddev->reshape_backwards = 0;
5005 mddev->new_level = mddev->level;
5006 mddev->new_layout = mddev->layout;
5007 mddev->new_chunk_sectors = mddev->chunk_sectors;
5008 rdev_for_each(rdev, mddev)
5009 rdev->new_data_offset = rdev->data_offset;
5010 err = 0;
5011 unlock:
5012 mddev_unlock(mddev);
5013 return err ?: len;
5014 }
5015
5016 static struct md_sysfs_entry md_reshape_position =
5017 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5018 reshape_position_store);
5019
5020 static ssize_t
5021 reshape_direction_show(struct mddev *mddev, char *page)
5022 {
5023 return sprintf(page, "%s\n",
5024 mddev->reshape_backwards ? "backwards" : "forwards");
5025 }
5026
5027 static ssize_t
5028 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5029 {
5030 int backwards = 0;
5031 int err;
5032
5033 if (cmd_match(buf, "forwards"))
5034 backwards = 0;
5035 else if (cmd_match(buf, "backwards"))
5036 backwards = 1;
5037 else
5038 return -EINVAL;
5039 if (mddev->reshape_backwards == backwards)
5040 return len;
5041
5042 err = mddev_lock(mddev);
5043 if (err)
5044 return err;
5045 /* check if we are allowed to change */
5046 if (mddev->delta_disks)
5047 err = -EBUSY;
5048 else if (mddev->persistent &&
5049 mddev->major_version == 0)
5050 err = -EINVAL;
5051 else
5052 mddev->reshape_backwards = backwards;
5053 mddev_unlock(mddev);
5054 return err ?: len;
5055 }
5056
5057 static struct md_sysfs_entry md_reshape_direction =
5058 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5059 reshape_direction_store);
5060
5061 static ssize_t
5062 array_size_show(struct mddev *mddev, char *page)
5063 {
5064 if (mddev->external_size)
5065 return sprintf(page, "%llu\n",
5066 (unsigned long long)mddev->array_sectors/2);
5067 else
5068 return sprintf(page, "default\n");
5069 }
5070
5071 static ssize_t
5072 array_size_store(struct mddev *mddev, const char *buf, size_t len)
5073 {
5074 sector_t sectors;
5075 int err;
5076
5077 err = mddev_lock(mddev);
5078 if (err)
5079 return err;
5080
5081 /* cluster raid doesn't support changing array_sectors */
5082 if (mddev_is_clustered(mddev)) {
5083 mddev_unlock(mddev);
5084 return -EINVAL;
5085 }
5086
5087 if (strncmp(buf, "default", 7) == 0) {
5088 if (mddev->pers)
5089 sectors = mddev->pers->size(mddev, 0, 0);
5090 else
5091 sectors = mddev->array_sectors;
5092
5093 mddev->external_size = 0;
5094 } else {
5095 if (strict_blocks_to_sectors(buf, &sectors) < 0)
5096 err = -EINVAL;
5097 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5098 err = -E2BIG;
5099 else
5100 mddev->external_size = 1;
5101 }
5102
5103 if (!err) {
5104 mddev->array_sectors = sectors;
5105 if (mddev->pers) {
5106 set_capacity(mddev->gendisk, mddev->array_sectors);
5107 revalidate_disk(mddev->gendisk);
5108 }
5109 }
5110 mddev_unlock(mddev);
5111 return err ?: len;
5112 }
5113
5114 static struct md_sysfs_entry md_array_size =
5115 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5116 array_size_store);
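/*
 * Illustrative usage of array_size (not part of the driver). Values are
 * interpreted as KiB by strict_blocks_to_sectors():
 *
 *	echo 1048576 > /sys/block/md0/md/array_size	(export 1 GiB)
 *	echo default > /sys/block/md0/md/array_size	(personality's own size)
 */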
5117
5118 static ssize_t
5119 consistency_policy_show(struct mddev *mddev, char *page)
5120 {
5121 int ret;
5122
5123 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5124 ret = sprintf(page, "journal\n");
5125 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5126 ret = sprintf(page, "ppl\n");
5127 } else if (mddev->bitmap) {
5128 ret = sprintf(page, "bitmap\n");
5129 } else if (mddev->pers) {
5130 if (mddev->pers->sync_request)
5131 ret = sprintf(page, "resync\n");
5132 else
5133 ret = sprintf(page, "none\n");
5134 } else {
5135 ret = sprintf(page, "unknown\n");
5136 }
5137
5138 return ret;
5139 }
5140
5141 static ssize_t
5142 consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5143 {
5144 int err = 0;
5145
5146 if (mddev->pers) {
5147 if (mddev->pers->change_consistency_policy)
5148 err = mddev->pers->change_consistency_policy(mddev, buf);
5149 else
5150 err = -EBUSY;
5151 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5152 set_bit(MD_HAS_PPL, &mddev->flags);
5153 } else {
5154 err = -EINVAL;
5155 }
5156
5157 return err ? err : len;
5158 }
5159
5160 static struct md_sysfs_entry md_consistency_policy =
5161 __ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5162 consistency_policy_store);
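/*
 * Illustrative values for consistency_policy (not part of the driver):
 * reads report "journal", "ppl", "bitmap", "resync", "none" or
 * "unknown" depending on the array's features. For a running array a
 * write is handed to ->change_consistency_policy() if the personality
 * provides it, e.g.
 *
 *	echo ppl > /sys/block/md0/md/consistency_policy
 *
 * while an inactive, externally managed array only accepts "ppl".
 */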
5163
5164 static struct attribute *md_default_attrs[] = {
5165 &md_level.attr,
5166 &md_layout.attr,
5167 &md_raid_disks.attr,
5168 &md_chunk_size.attr,
5169 &md_size.attr,
5170 &md_resync_start.attr,
5171 &md_metadata.attr,
5172 &md_new_device.attr,
5173 &md_safe_delay.attr,
5174 &md_array_state.attr,
5175 &md_reshape_position.attr,
5176 &md_reshape_direction.attr,
5177 &md_array_size.attr,
5178 &max_corr_read_errors.attr,
5179 &md_consistency_policy.attr,
5180 NULL,
5181 };
5182
5183 static struct attribute *md_redundancy_attrs[] = {
5184 &md_scan_mode.attr,
5185 &md_last_scan_mode.attr,
5186 &md_mismatches.attr,
5187 &md_sync_min.attr,
5188 &md_sync_max.attr,
5189 &md_sync_speed.attr,
5190 &md_sync_force_parallel.attr,
5191 &md_sync_completed.attr,
5192 &md_min_sync.attr,
5193 &md_max_sync.attr,
5194 &md_suspend_lo.attr,
5195 &md_suspend_hi.attr,
5196 &md_bitmap.attr,
5197 &md_degraded.attr,
5198 NULL,
5199 };
5200 static struct attribute_group md_redundancy_group = {
5201 .name = NULL,
5202 .attrs = md_redundancy_attrs,
5203 };
5204
5205 static ssize_t
5206 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
5207 {
5208 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5209 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5210 ssize_t rv;
5211
5212 if (!entry->show)
5213 return -EIO;
5214 spin_lock(&all_mddevs_lock);
5215 if (list_empty(&mddev->all_mddevs)) {
5216 spin_unlock(&all_mddevs_lock);
5217 return -EBUSY;
5218 }
5219 mddev_get(mddev);
5220 spin_unlock(&all_mddevs_lock);
5221
5222 rv = entry->show(mddev, page);
5223 mddev_put(mddev);
5224 return rv;
5225 }
5226
5227 static ssize_t
5228 md_attr_store(struct kobject *kobj, struct attribute *attr,
5229 const char *page, size_t length)
5230 {
5231 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
5232 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
5233 ssize_t rv;
5234
5235 if (!entry->store)
5236 return -EIO;
5237 if (!capable(CAP_SYS_ADMIN))
5238 return -EACCES;
5239 spin_lock(&all_mddevs_lock);
5240 if (list_empty(&mddev->all_mddevs)) {
5241 spin_unlock(&all_mddevs_lock);
5242 return -EBUSY;
5243 }
5244 mddev_get(mddev);
5245 spin_unlock(&all_mddevs_lock);
5246 rv = entry->store(mddev, page, length);
5247 mddev_put(mddev);
5248 return rv;
5249 }
5250
5251 static void md_free(struct kobject *ko)
5252 {
5253 struct mddev *mddev = container_of(ko, struct mddev, kobj);
5254
5255 if (mddev->sysfs_state)
5256 sysfs_put(mddev->sysfs_state);
5257
5258 if (mddev->queue)
5259 blk_cleanup_queue(mddev->queue);
5260 if (mddev->gendisk) {
5261 del_gendisk(mddev->gendisk);
5262 put_disk(mddev->gendisk);
5263 }
5264 percpu_ref_exit(&mddev->writes_pending);
5265
5266 kfree(mddev);
5267 }
5268
5269 static const struct sysfs_ops md_sysfs_ops = {
5270 .show = md_attr_show,
5271 .store = md_attr_store,
5272 };
5273 static struct kobj_type md_ktype = {
5274 .release = md_free,
5275 .sysfs_ops = &md_sysfs_ops,
5276 .default_attrs = md_default_attrs,
5277 };
5278
5279 int mdp_major = 0;
5280
5281 static void mddev_delayed_delete(struct work_struct *ws)
5282 {
5283 struct mddev *mddev = container_of(ws, struct mddev, del_work);
5284
5285 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
5286 kobject_del(&mddev->kobj);
5287 kobject_put(&mddev->kobj);
5288 }
5289
5290 static void no_op(struct percpu_ref *r) {}
5291
5292 int mddev_init_writes_pending(struct mddev *mddev)
5293 {
5294 if (mddev->writes_pending.percpu_count_ptr)
5295 return 0;
5296 if (percpu_ref_init(&mddev->writes_pending, no_op, 0, GFP_KERNEL) < 0)
5297 return -ENOMEM;
5298 /* We want to start with the refcount at zero */
5299 percpu_ref_put(&mddev->writes_pending);
5300 return 0;
5301 }
5302 EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
5303
5304 static int md_alloc(dev_t dev, char *name)
5305 {
5306 /*
5307 * If dev is zero, name is the name of a device to allocate with
5308 * an arbitrary minor number. It will be "md_???"
5309 * If dev is non-zero it must be a device number with a MAJOR of
5310 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then
5311 * the device is being created by opening a node in /dev.
5312 * If "name" is not NULL, the device is being created by
5313 * writing to /sys/module/md_mod/parameters/new_array.
5314 */
5315 static DEFINE_MUTEX(disks_mutex);
5316 struct mddev *mddev = mddev_find(dev);
5317 struct gendisk *disk;
5318 int partitioned;
5319 int shift;
5320 int unit;
5321 int error;
5322
5323 if (!mddev)
5324 return -ENODEV;
5325
5326 partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
5327 shift = partitioned ? MdpMinorShift : 0;
5328 unit = MINOR(mddev->unit) >> shift;
5329
5330 /* wait for any previous instance of this device to be
5331 * completely removed (mddev_delayed_delete).
5332 */
5333 flush_workqueue(md_misc_wq);
5334
5335 mutex_lock(&disks_mutex);
5336 error = -EEXIST;
5337 if (mddev->gendisk)
5338 goto abort;
5339
5340 if (name && !dev) {
5341 /* Need to ensure that 'name' is not a duplicate.
5342 */
5343 struct mddev *mddev2;
5344 spin_lock(&all_mddevs_lock);
5345
5346 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
5347 if (mddev2->gendisk &&
5348 strcmp(mddev2->gendisk->disk_name, name) == 0) {
5349 spin_unlock(&all_mddevs_lock);
5350 goto abort;
5351 }
5352 spin_unlock(&all_mddevs_lock);
5353 }
5354 if (name && dev)
5355 /*
5356 * Creating /dev/mdNNN via "new_array", so adjust hold_active.
5357 */
5358 mddev->hold_active = UNTIL_STOP;
5359
5360 error = -ENOMEM;
5361 mddev->queue = blk_alloc_queue(GFP_KERNEL);
5362 if (!mddev->queue)
5363 goto abort;
5364 mddev->queue->queuedata = mddev;
5365
5366 blk_queue_make_request(mddev->queue, md_make_request);
5367 blk_set_stacking_limits(&mddev->queue->limits);
5368
5369 disk = alloc_disk(1 << shift);
5370 if (!disk) {
5371 blk_cleanup_queue(mddev->queue);
5372 mddev->queue = NULL;
5373 goto abort;
5374 }
5375 disk->major = MAJOR(mddev->unit);
5376 disk->first_minor = unit << shift;
5377 if (name)
5378 strcpy(disk->disk_name, name);
5379 else if (partitioned)
5380 sprintf(disk->disk_name, "md_d%d", unit);
5381 else
5382 sprintf(disk->disk_name, "md%d", unit);
5383 disk->fops = &md_fops;
5384 disk->private_data = mddev;
5385 disk->queue = mddev->queue;
5386 blk_queue_write_cache(mddev->queue, true, true);
5387 /* Allow extended partitions. This makes the
5388 * 'mdp' device redundant, but we can't really
5389 * remove it now.
5390 */
5391 disk->flags |= GENHD_FL_EXT_DEVT;
5392 mddev->gendisk = disk;
5393 /* As soon as we call add_disk(), another thread could get
5394 * through to md_open, so make sure it doesn't get too far
5395 */
5396 mutex_lock(&mddev->open_mutex);
5397 add_disk(disk);
5398
5399 error = kobject_init_and_add(&mddev->kobj, &md_ktype,
5400 &disk_to_dev(disk)->kobj, "%s", "md");
5401 if (error) {
5402 /* This isn't possible, but as kobject_init_and_add is marked
5403 * __must_check, we must do something with the result
5404 */
5405 pr_debug("md: cannot register %s/md - name in use\n",
5406 disk->disk_name);
5407 error = 0;
5408 }
5409 if (mddev->kobj.sd &&
5410 sysfs_create_group(&mddev->kobj, &md_bitmap_group))
5411 pr_debug("pointless warning\n");
5412 mutex_unlock(&mddev->open_mutex);
5413 abort:
5414 mutex_unlock(&disks_mutex);
5415 if (!error && mddev->kobj.sd) {
5416 kobject_uevent(&mddev->kobj, KOBJ_ADD);
5417 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
5418 }
5419 mddev_put(mddev);
5420 return error;
5421 }
5422
5423 static struct kobject *md_probe(dev_t dev, int *part, void *data)
5424 {
5425 if (create_on_open)
5426 md_alloc(dev, NULL);
5427 return NULL;
5428 }
5429
5430 static int add_named_array(const char *val, const struct kernel_param *kp)
5431 {
5432 /*
5433 * val must be "md_*" or "mdNNN".
5434 * For "md_*" we allocate an array with a large free minor number, and
5435 * set the name to val. val must not already be an active name.
5436 * For "mdNNN" we allocate an array with the minor number NNN
5437 * which must not already be in use.
5438 */
5439 int len = strlen(val);
5440 char buf[DISK_NAME_LEN];
5441 unsigned long devnum;
5442
5443 while (len && val[len-1] == '\n')
5444 len--;
5445 if (len >= DISK_NAME_LEN)
5446 return -E2BIG;
5447 strlcpy(buf, val, len+1);
5448 if (strncmp(buf, "md_", 3) == 0)
5449 return md_alloc(0, buf);
5450 if (strncmp(buf, "md", 2) == 0 &&
5451 isdigit(buf[2]) &&
5452 kstrtoul(buf+2, 10, &devnum) == 0 &&
5453 devnum <= MINORMASK)
5454 return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);
5455
5456 return -EINVAL;
5457 }
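/*
 * Illustrative usage of the new_array parameter handled above (not part
 * of the driver):
 *
 *	echo md_home > /sys/module/md_mod/parameters/new_array
 *	echo md127   > /sys/module/md_mod/parameters/new_array
 *
 * The first creates an array named "md_home" with an arbitrary free
 * minor; the second creates /dev/md127 with minor 127.
 */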
5458
5459 static void md_safemode_timeout(struct timer_list *t)
5460 {
5461 struct mddev *mddev = from_timer(mddev, t, safemode_timer);
5462
5463 mddev->safemode = 1;
5464 if (mddev->external)
5465 sysfs_notify_dirent_safe(mddev->sysfs_state);
5466
5467 md_wakeup_thread(mddev->thread);
5468 }
5469
5470 static int start_dirty_degraded;
5471
5472 int md_run(struct mddev *mddev)
5473 {
5474 int err;
5475 struct md_rdev *rdev;
5476 struct md_personality *pers;
5477
5478 if (list_empty(&mddev->disks))
5479 /* cannot run an array with no devices. */
5480 return -EINVAL;
5481
5482 if (mddev->pers)
5483 return -EBUSY;
5484 /* Cannot run until previous stop completes properly */
5485 if (mddev->sysfs_active)
5486 return -EBUSY;
5487
5488 /*
5489 * Analyze all RAID superblock(s)
5490 */
5491 if (!mddev->raid_disks) {
5492 if (!mddev->persistent)
5493 return -EINVAL;
5494 analyze_sbs(mddev);
5495 }
5496
5497 if (mddev->level != LEVEL_NONE)
5498 request_module("md-level-%d", mddev->level);
5499 else if (mddev->clevel[0])
5500 request_module("md-%s", mddev->clevel);
5501
5502 /*
5503 * Drop all container device buffers; from now on
5504 * the only valid external interface is through the md
5505 * device.
5506 */
5507 mddev->has_superblocks = false;
5508 rdev_for_each(rdev, mddev) {
5509 if (test_bit(Faulty, &rdev->flags))
5510 continue;
5511 sync_blockdev(rdev->bdev);
5512 invalidate_bdev(rdev->bdev);
5513 if (mddev->ro != 1 &&
5514 (bdev_read_only(rdev->bdev) ||
5515 bdev_read_only(rdev->meta_bdev))) {
5516 mddev->ro = 1;
5517 if (mddev->gendisk)
5518 set_disk_ro(mddev->gendisk, 1);
5519 }
5520
5521 if (rdev->sb_page)
5522 mddev->has_superblocks = true;
5523
5524 /* perform some consistency tests on the device.
5525 * We don't want the data to overlap the metadata.
5526 * Internal bitmap issues have been handled elsewhere.
5527 */
5528 if (rdev->meta_bdev) {
5529 /* Nothing to check */;
5530 } else if (rdev->data_offset < rdev->sb_start) {
5531 if (mddev->dev_sectors &&
5532 rdev->data_offset + mddev->dev_sectors
5533 > rdev->sb_start) {
5534 pr_warn("md: %s: data overlaps metadata\n",
5535 mdname(mddev));
5536 return -EINVAL;
5537 }
5538 } else {
5539 if (rdev->sb_start + rdev->sb_size/512
5540 > rdev->data_offset) {
5541 pr_warn("md: %s: metadata overlaps data\n",
5542 mdname(mddev));
5543 return -EINVAL;
5544 }
5545 }
5546 sysfs_notify_dirent_safe(rdev->sysfs_state);
5547 }
5548
5549 if (mddev->bio_set == NULL) {
5550 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5551 if (!mddev->bio_set)
5552 return -ENOMEM;
5553 }
5554 if (mddev->sync_set == NULL) {
5555 mddev->sync_set = bioset_create(BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5556 if (!mddev->sync_set) {
5557 err = -ENOMEM;
5558 goto abort;
5559 }
5560 }
5561
5562 spin_lock(&pers_lock);
5563 pers = find_pers(mddev->level, mddev->clevel);
5564 if (!pers || !try_module_get(pers->owner)) {
5565 spin_unlock(&pers_lock);
5566 if (mddev->level != LEVEL_NONE)
5567 pr_warn("md: personality for level %d is not loaded!\n",
5568 mddev->level);
5569 else
5570 pr_warn("md: personality for level %s is not loaded!\n",
5571 mddev->clevel);
5572 err = -EINVAL;
5573 goto abort;
5574 }
5575 spin_unlock(&pers_lock);
5576 if (mddev->level != pers->level) {
5577 mddev->level = pers->level;
5578 mddev->new_level = pers->level;
5579 }
5580 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5581
5582 if (mddev->reshape_position != MaxSector &&
5583 pers->start_reshape == NULL) {
5584 /* This personality cannot handle reshaping... */
5585 module_put(pers->owner);
5586 err = -EINVAL;
5587 goto abort;
5588 }
5589
5590 if (pers->sync_request) {
5591 /* Warn if this is a potentially silly
5592 * configuration.
5593 */
5594 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5595 struct md_rdev *rdev2;
5596 int warned = 0;
5597
5598 rdev_for_each(rdev, mddev)
5599 rdev_for_each(rdev2, mddev) {
5600 if (rdev < rdev2 &&
5601 rdev->bdev->bd_contains ==
5602 rdev2->bdev->bd_contains) {
5603 pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
5604 mdname(mddev),
5605 bdevname(rdev->bdev,b),
5606 bdevname(rdev2->bdev,b2));
5607 warned = 1;
5608 }
5609 }
5610
5611 if (warned)
5612 pr_warn("True protection against single-disk failure might be compromised.\n");
5613 }
5614
5615 mddev->recovery = 0;
5616 /* may be overridden by the personality */
5617 mddev->resync_max_sectors = mddev->dev_sectors;
5618
5619 mddev->ok_start_degraded = start_dirty_degraded;
5620
5621 if (start_readonly && mddev->ro == 0)
5622 mddev->ro = 2; /* read-only, but switch on first write */
5623
5624 /*
5625 * NOTE: some pers->run() implementations, for example r5l_recovery_log(), wake
5626 * up mddev->thread. It is important to initialize critical
5627 * resources for mddev->thread BEFORE calling pers->run().
5628 */
5629 err = pers->run(mddev);
5630 if (err)
5631 pr_warn("md: pers->run() failed ...\n");
5632 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
5633 WARN_ONCE(!mddev->external_size,
5634 "%s: default size too small, but 'external_size' not in effect?\n",
5635 __func__);
5636 pr_warn("md: invalid array_size %llu > default size %llu\n",
5637 (unsigned long long)mddev->array_sectors / 2,
5638 (unsigned long long)pers->size(mddev, 0, 0) / 2);
5639 err = -EINVAL;
5640 }
5641 if (err == 0 && pers->sync_request &&
5642 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5643 struct bitmap *bitmap;
5644
5645 bitmap = bitmap_create(mddev, -1);
5646 if (IS_ERR(bitmap)) {
5647 err = PTR_ERR(bitmap);
5648 pr_warn("%s: failed to create bitmap (%d)\n",
5649 mdname(mddev), err);
5650 } else
5651 mddev->bitmap = bitmap;
5652
5653 }
5654 if (err) {
5655 mddev_detach(mddev);
5656 if (mddev->private)
5657 pers->free(mddev, mddev->private);
5658 mddev->private = NULL;
5659 module_put(pers->owner);
5660 bitmap_destroy(mddev);
5661 goto abort;
5662 }
5663 if (mddev->queue) {
5664 bool nonrot = true;
5665
5666 rdev_for_each(rdev, mddev) {
5667 if (rdev->raid_disk >= 0 &&
5668 !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
5669 nonrot = false;
5670 break;
5671 }
5672 }
5673 if (mddev->degraded)
5674 nonrot = false;
5675 if (nonrot)
5676 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
5677 else
5678 queue_flag_clear_unlocked(QUEUE_FLAG_NONROT, mddev->queue);
5679 mddev->queue->backing_dev_info->congested_data = mddev;
5680 mddev->queue->backing_dev_info->congested_fn = md_congested;
5681 }
5682 if (pers->sync_request) {
5683 if (mddev->kobj.sd &&
5684 sysfs_create_group(&mddev->kobj, &md_redundancy_group))
5685 pr_warn("md: cannot register extra attributes for %s\n",
5686 mdname(mddev));
5687 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5688 } else if (mddev->ro == 2) /* auto-readonly not meaningful */
5689 mddev->ro = 0;
5690
5691 atomic_set(&mddev->max_corr_read_errors,
5692 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
5693 mddev->safemode = 0;
5694 if (mddev_is_clustered(mddev))
5695 mddev->safemode_delay = 0;
5696 else
5697 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
5698 mddev->in_sync = 1;
5699 smp_wmb();
5700 spin_lock(&mddev->lock);
5701 mddev->pers = pers;
5702 spin_unlock(&mddev->lock);
5703 rdev_for_each(rdev, mddev)
5704 if (rdev->raid_disk >= 0)
5705 if (sysfs_link_rdev(mddev, rdev))
5706 /* failure here is OK */;
5707
5708 if (mddev->degraded && !mddev->ro)
5709 /* This ensures that recovering status is reported immediately
5710 * via sysfs - until a lack of spares is confirmed.
5711 */
5712 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5713 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5714
5715 if (mddev->sb_flags)
5716 md_update_sb(mddev, 0);
5717
5718 md_new_event(mddev);
5719 return 0;
5720
5721 abort:
5722 if (mddev->bio_set) {
5723 bioset_free(mddev->bio_set);
5724 mddev->bio_set = NULL;
5725 }
5726 if (mddev->sync_set) {
5727 bioset_free(mddev->sync_set);
5728 mddev->sync_set = NULL;
5729 }
5730
5731 return err;
5732 }
5733 EXPORT_SYMBOL_GPL(md_run);
5734
5735 static int do_md_run(struct mddev *mddev)
5736 {
5737 int err;
5738
5739 set_bit(MD_NOT_READY, &mddev->flags);
5740 err = md_run(mddev);
5741 if (err)
5742 goto out;
5743 err = bitmap_load(mddev);
5744 if (err) {
5745 bitmap_destroy(mddev);
5746 goto out;
5747 }
5748
5749 if (mddev_is_clustered(mddev))
5750 md_allow_write(mddev);
5751
5752 md_wakeup_thread(mddev->thread);
5753 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
5754
5755 set_capacity(mddev->gendisk, mddev->array_sectors);
5756 revalidate_disk(mddev->gendisk);
5757 clear_bit(MD_NOT_READY, &mddev->flags);
5758 mddev->changed = 1;
5759 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5760 sysfs_notify_dirent_safe(mddev->sysfs_state);
5761 sysfs_notify_dirent_safe(mddev->sysfs_action);
5762 sysfs_notify(&mddev->kobj, NULL, "degraded");
5763 out:
5764 clear_bit(MD_NOT_READY, &mddev->flags);
5765 return err;
5766 }
5767
5768 static int restart_array(struct mddev *mddev)
5769 {
5770 struct gendisk *disk = mddev->gendisk;
5771 struct md_rdev *rdev;
5772 bool has_journal = false;
5773 bool has_readonly = false;
5774
5775 /* Complain if it has no devices */
5776 if (list_empty(&mddev->disks))
5777 return -ENXIO;
5778 if (!mddev->pers)
5779 return -EINVAL;
5780 if (!mddev->ro)
5781 return -EBUSY;
5782
5783 rcu_read_lock();
5784 rdev_for_each_rcu(rdev, mddev) {
5785 if (test_bit(Journal, &rdev->flags) &&
5786 !test_bit(Faulty, &rdev->flags))
5787 has_journal = true;
5788 if (bdev_read_only(rdev->bdev))
5789 has_readonly = true;
5790 }
5791 rcu_read_unlock();
5792 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
5793 /* Don't restart rw with journal missing/faulty */
5794 return -EINVAL;
5795 if (has_readonly)
5796 return -EROFS;
5797
5798 mddev->safemode = 0;
5799 mddev->ro = 0;
5800 set_disk_ro(disk, 0);
5801 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
5802 /* Kick recovery or resync if necessary */
5803 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5804 md_wakeup_thread(mddev->thread);
5805 md_wakeup_thread(mddev->sync_thread);
5806 sysfs_notify_dirent_safe(mddev->sysfs_state);
5807 return 0;
5808 }
5809
5810 static void md_clean(struct mddev *mddev)
5811 {
5812 mddev->array_sectors = 0;
5813 mddev->external_size = 0;
5814 mddev->dev_sectors = 0;
5815 mddev->raid_disks = 0;
5816 mddev->recovery_cp = 0;
5817 mddev->resync_min = 0;
5818 mddev->resync_max = MaxSector;
5819 mddev->reshape_position = MaxSector;
5820 mddev->external = 0;
5821 mddev->persistent = 0;
5822 mddev->level = LEVEL_NONE;
5823 mddev->clevel[0] = 0;
5824 mddev->flags = 0;
5825 mddev->sb_flags = 0;
5826 mddev->ro = 0;
5827 mddev->metadata_type[0] = 0;
5828 mddev->chunk_sectors = 0;
5829 mddev->ctime = mddev->utime = 0;
5830 mddev->layout = 0;
5831 mddev->max_disks = 0;
5832 mddev->events = 0;
5833 mddev->can_decrease_events = 0;
5834 mddev->delta_disks = 0;
5835 mddev->reshape_backwards = 0;
5836 mddev->new_level = LEVEL_NONE;
5837 mddev->new_layout = 0;
5838 mddev->new_chunk_sectors = 0;
5839 mddev->curr_resync = 0;
5840 atomic64_set(&mddev->resync_mismatches, 0);
5841 mddev->suspend_lo = mddev->suspend_hi = 0;
5842 mddev->sync_speed_min = mddev->sync_speed_max = 0;
5843 mddev->recovery = 0;
5844 mddev->in_sync = 0;
5845 mddev->changed = 0;
5846 mddev->degraded = 0;
5847 mddev->safemode = 0;
5848 mddev->private = NULL;
5849 mddev->cluster_info = NULL;
5850 mddev->bitmap_info.offset = 0;
5851 mddev->bitmap_info.default_offset = 0;
5852 mddev->bitmap_info.default_space = 0;
5853 mddev->bitmap_info.chunksize = 0;
5854 mddev->bitmap_info.daemon_sleep = 0;
5855 mddev->bitmap_info.max_write_behind = 0;
5856 mddev->bitmap_info.nodes = 0;
5857 }
5858
5859 static void __md_stop_writes(struct mddev *mddev)
5860 {
5861 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5862 flush_workqueue(md_misc_wq);
5863 if (mddev->sync_thread) {
5864 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5865 md_reap_sync_thread(mddev);
5866 }
5867
5868 del_timer_sync(&mddev->safemode_timer);
5869
5870 if (mddev->pers && mddev->pers->quiesce) {
5871 mddev->pers->quiesce(mddev, 1);
5872 mddev->pers->quiesce(mddev, 0);
5873 }
5874 bitmap_flush(mddev);
5875
5876 if (mddev->ro == 0 &&
5877 ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
5878 mddev->sb_flags)) {
5879 		/* mark array as shut down cleanly */
5880 if (!mddev_is_clustered(mddev))
5881 mddev->in_sync = 1;
5882 md_update_sb(mddev, 1);
5883 }
5884 }
5885
5886 void md_stop_writes(struct mddev *mddev)
5887 {
5888 mddev_lock_nointr(mddev);
5889 __md_stop_writes(mddev);
5890 mddev_unlock(mddev);
5891 }
5892 EXPORT_SYMBOL_GPL(md_stop_writes);
5893
5894 static void mddev_detach(struct mddev *mddev)
5895 {
5896 bitmap_wait_behind_writes(mddev);
5897 if (mddev->pers && mddev->pers->quiesce) {
5898 mddev->pers->quiesce(mddev, 1);
5899 mddev->pers->quiesce(mddev, 0);
5900 }
5901 md_unregister_thread(&mddev->thread);
5902 if (mddev->queue)
5903 		blk_sync_queue(mddev->queue); /* the unplug fn references 'conf' */
5904 }
5905
5906 static void __md_stop(struct mddev *mddev)
5907 {
5908 struct md_personality *pers = mddev->pers;
5909 bitmap_destroy(mddev);
5910 mddev_detach(mddev);
5911 /* Ensure ->event_work is done */
5912 flush_workqueue(md_misc_wq);
5913 spin_lock(&mddev->lock);
5914 mddev->pers = NULL;
5915 spin_unlock(&mddev->lock);
5916 pers->free(mddev, mddev->private);
5917 mddev->private = NULL;
5918 if (pers->sync_request && mddev->to_remove == NULL)
5919 mddev->to_remove = &md_redundancy_group;
5920 module_put(pers->owner);
5921 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5922 }
5923
5924 void md_stop(struct mddev *mddev)
5925 {
5926 	/* Stop the array and free any attached data structures.
5927 	 * This is called from dm-raid.
5928 */
5929 __md_stop(mddev);
5930 if (mddev->bio_set) {
5931 bioset_free(mddev->bio_set);
5932 mddev->bio_set = NULL;
5933 }
5934 if (mddev->sync_set) {
5935 bioset_free(mddev->sync_set);
5936 mddev->sync_set = NULL;
5937 }
5938 }
5939
5940 EXPORT_SYMBOL_GPL(md_stop);
5941
5942 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5943 {
5944 int err = 0;
5945 int did_freeze = 0;
5946
5947 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5948 did_freeze = 1;
5949 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5950 md_wakeup_thread(mddev->thread);
5951 }
5952 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5953 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5954 if (mddev->sync_thread)
5955 /* Thread might be blocked waiting for metadata update
5956 * which will now never happen */
5957 wake_up_process(mddev->sync_thread->tsk);
5958
5959 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
5960 return -EBUSY;
5961 mddev_unlock(mddev);
5962 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
5963 &mddev->recovery));
5964 wait_event(mddev->sb_wait,
5965 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
5966 mddev_lock_nointr(mddev);
5967
5968 mutex_lock(&mddev->open_mutex);
5969 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
5970 mddev->sync_thread ||
5971 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
5972 pr_warn("md: %s still in use.\n",mdname(mddev));
5973 if (did_freeze) {
5974 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5975 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5976 md_wakeup_thread(mddev->thread);
5977 }
5978 err = -EBUSY;
5979 goto out;
5980 }
5981 if (mddev->pers) {
5982 __md_stop_writes(mddev);
5983
5984 err = -ENXIO;
5985 if (mddev->ro==1)
5986 goto out;
5987 mddev->ro = 1;
5988 set_disk_ro(mddev->gendisk, 1);
5989 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5990 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5991 md_wakeup_thread(mddev->thread);
5992 sysfs_notify_dirent_safe(mddev->sysfs_state);
5993 err = 0;
5994 }
5995 out:
5996 mutex_unlock(&mddev->open_mutex);
5997 return err;
5998 }
5999
6000 /* mode:
6001  *   0 - completely stop and disassemble the array
6002  *   2 - stop but do not disassemble the array
6003 */
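/*
 * For example, the STOP_ARRAY ioctl handled by md_ioctl() below calls
 * do_md_stop(mddev, 0, bdev) and tears the array down completely, whereas
 * mode 2 stops the array but leaves its component devices bound to it.
 */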
6004 static int do_md_stop(struct mddev *mddev, int mode,
6005 struct block_device *bdev)
6006 {
6007 struct gendisk *disk = mddev->gendisk;
6008 struct md_rdev *rdev;
6009 int did_freeze = 0;
6010
6011 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6012 did_freeze = 1;
6013 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6014 md_wakeup_thread(mddev->thread);
6015 }
6016 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
6017 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6018 if (mddev->sync_thread)
6019 /* Thread might be blocked waiting for metadata update
6020 * which will now never happen */
6021 wake_up_process(mddev->sync_thread->tsk);
6022
6023 mddev_unlock(mddev);
6024 wait_event(resync_wait, (mddev->sync_thread == NULL &&
6025 !test_bit(MD_RECOVERY_RUNNING,
6026 &mddev->recovery)));
6027 mddev_lock_nointr(mddev);
6028
6029 mutex_lock(&mddev->open_mutex);
6030 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
6031 mddev->sysfs_active ||
6032 mddev->sync_thread ||
6033 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
6034 pr_warn("md: %s still in use.\n",mdname(mddev));
6035 mutex_unlock(&mddev->open_mutex);
6036 if (did_freeze) {
6037 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6038 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6039 md_wakeup_thread(mddev->thread);
6040 }
6041 return -EBUSY;
6042 }
6043 if (mddev->pers) {
6044 if (mddev->ro)
6045 set_disk_ro(disk, 0);
6046
6047 __md_stop_writes(mddev);
6048 __md_stop(mddev);
6049 mddev->queue->backing_dev_info->congested_fn = NULL;
6050
6051 /* tell userspace to handle 'inactive' */
6052 sysfs_notify_dirent_safe(mddev->sysfs_state);
6053
6054 rdev_for_each(rdev, mddev)
6055 if (rdev->raid_disk >= 0)
6056 sysfs_unlink_rdev(mddev, rdev);
6057
6058 set_capacity(disk, 0);
6059 mutex_unlock(&mddev->open_mutex);
6060 mddev->changed = 1;
6061 revalidate_disk(disk);
6062
6063 if (mddev->ro)
6064 mddev->ro = 0;
6065 } else
6066 mutex_unlock(&mddev->open_mutex);
6067 /*
6068 * Free resources if final stop
6069 */
6070 if (mode == 0) {
6071 pr_info("md: %s stopped.\n", mdname(mddev));
6072
6073 if (mddev->bitmap_info.file) {
6074 struct file *f = mddev->bitmap_info.file;
6075 spin_lock(&mddev->lock);
6076 mddev->bitmap_info.file = NULL;
6077 spin_unlock(&mddev->lock);
6078 fput(f);
6079 }
6080 mddev->bitmap_info.offset = 0;
6081
6082 export_array(mddev);
6083
6084 md_clean(mddev);
6085 if (mddev->hold_active == UNTIL_STOP)
6086 mddev->hold_active = 0;
6087 }
6088 md_new_event(mddev);
6089 sysfs_notify_dirent_safe(mddev->sysfs_state);
6090 return 0;
6091 }
6092
6093 #ifndef MODULE
6094 static void autorun_array(struct mddev *mddev)
6095 {
6096 struct md_rdev *rdev;
6097 int err;
6098
6099 if (list_empty(&mddev->disks))
6100 return;
6101
6102 pr_info("md: running: ");
6103
6104 rdev_for_each(rdev, mddev) {
6105 char b[BDEVNAME_SIZE];
6106 pr_cont("<%s>", bdevname(rdev->bdev,b));
6107 }
6108 pr_cont("\n");
6109
6110 err = do_md_run(mddev);
6111 if (err) {
6112 pr_warn("md: do_md_run() returned %d\n", err);
6113 do_md_stop(mddev, 0, NULL);
6114 }
6115 }
6116
6117 /*
6118  * let's try to run arrays based on all disks that have arrived
6119 * until now. (those are in pending_raid_disks)
6120 *
6121 * the method: pick the first pending disk, collect all disks with
6122 * the same UUID, remove all from the pending list and put them into
6123 * the 'same_array' list. Then order this list based on superblock
6124 * update time (freshest comes first), kick out 'old' disks and
6125 * compare superblocks. If everything's fine then run it.
6126 *
6127 * If "unit" is allocated, then bump its reference count
6128 */
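/*
 * For instance, if /dev/sda1 and /dev/sdb1 both carry 0.90 superblocks with
 * the same UUID and preferred minor 0, they end up on 'candidates' together
 * and are assembled into md0 (typically md_d0 when 'part' is set).
 */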
6129 static void autorun_devices(int part)
6130 {
6131 struct md_rdev *rdev0, *rdev, *tmp;
6132 struct mddev *mddev;
6133 char b[BDEVNAME_SIZE];
6134
6135 pr_info("md: autorun ...\n");
6136 while (!list_empty(&pending_raid_disks)) {
6137 int unit;
6138 dev_t dev;
6139 LIST_HEAD(candidates);
6140 rdev0 = list_entry(pending_raid_disks.next,
6141 struct md_rdev, same_set);
6142
6143 pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b));
6144 INIT_LIST_HEAD(&candidates);
6145 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
6146 if (super_90_load(rdev, rdev0, 0) >= 0) {
6147 pr_debug("md: adding %s ...\n",
6148 bdevname(rdev->bdev,b));
6149 list_move(&rdev->same_set, &candidates);
6150 }
6151 /*
6152 * now we have a set of devices, with all of them having
6153 * mostly sane superblocks. It's time to allocate the
6154 * mddev.
6155 */
6156 if (part) {
6157 dev = MKDEV(mdp_major,
6158 rdev0->preferred_minor << MdpMinorShift);
6159 unit = MINOR(dev) >> MdpMinorShift;
6160 } else {
6161 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
6162 unit = MINOR(dev);
6163 }
6164 if (rdev0->preferred_minor != unit) {
6165 pr_warn("md: unit number in %s is bad: %d\n",
6166 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
6167 break;
6168 }
6169
6170 md_probe(dev, NULL, NULL);
6171 mddev = mddev_find(dev);
6172 if (!mddev || !mddev->gendisk) {
6173 if (mddev)
6174 mddev_put(mddev);
6175 break;
6176 }
6177 if (mddev_lock(mddev))
6178 pr_warn("md: %s locked, cannot run\n", mdname(mddev));
6179 else if (mddev->raid_disks || mddev->major_version
6180 || !list_empty(&mddev->disks)) {
6181 pr_warn("md: %s already running, cannot run %s\n",
6182 mdname(mddev), bdevname(rdev0->bdev,b));
6183 mddev_unlock(mddev);
6184 } else {
6185 pr_debug("md: created %s\n", mdname(mddev));
6186 mddev->persistent = 1;
6187 rdev_for_each_list(rdev, tmp, &candidates) {
6188 list_del_init(&rdev->same_set);
6189 if (bind_rdev_to_array(rdev, mddev))
6190 export_rdev(rdev);
6191 }
6192 autorun_array(mddev);
6193 mddev_unlock(mddev);
6194 }
6195 		/* On success, candidates will be empty; on error
6196 		 * it won't be...
6197 */
6198 rdev_for_each_list(rdev, tmp, &candidates) {
6199 list_del_init(&rdev->same_set);
6200 export_rdev(rdev);
6201 }
6202 mddev_put(mddev);
6203 }
6204 pr_info("md: ... autorun DONE.\n");
6205 }
6206 #endif /* !MODULE */
6207
6208 static int get_version(void __user *arg)
6209 {
6210 mdu_version_t ver;
6211
6212 ver.major = MD_MAJOR_VERSION;
6213 ver.minor = MD_MINOR_VERSION;
6214 ver.patchlevel = MD_PATCHLEVEL_VERSION;
6215
6216 if (copy_to_user(arg, &ver, sizeof(ver)))
6217 return -EFAULT;
6218
6219 return 0;
6220 }
6221
6222 static int get_array_info(struct mddev *mddev, void __user *arg)
6223 {
6224 mdu_array_info_t info;
6225 int nr,working,insync,failed,spare;
6226 struct md_rdev *rdev;
6227
6228 nr = working = insync = failed = spare = 0;
6229 rcu_read_lock();
6230 rdev_for_each_rcu(rdev, mddev) {
6231 nr++;
6232 if (test_bit(Faulty, &rdev->flags))
6233 failed++;
6234 else {
6235 working++;
6236 if (test_bit(In_sync, &rdev->flags))
6237 insync++;
6238 else if (test_bit(Journal, &rdev->flags))
6239 /* TODO: add journal count to md_u.h */
6240 ;
6241 else
6242 spare++;
6243 }
6244 }
6245 rcu_read_unlock();
6246
6247 info.major_version = mddev->major_version;
6248 info.minor_version = mddev->minor_version;
6249 info.patch_version = MD_PATCHLEVEL_VERSION;
6250 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
6251 info.level = mddev->level;
6252 info.size = mddev->dev_sectors / 2;
6253 if (info.size != mddev->dev_sectors / 2) /* overflow */
6254 info.size = -1;
6255 info.nr_disks = nr;
6256 info.raid_disks = mddev->raid_disks;
6257 info.md_minor = mddev->md_minor;
6258 info.not_persistent= !mddev->persistent;
6259
6260 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
6261 info.state = 0;
6262 if (mddev->in_sync)
6263 info.state = (1<<MD_SB_CLEAN);
6264 if (mddev->bitmap && mddev->bitmap_info.offset)
6265 info.state |= (1<<MD_SB_BITMAP_PRESENT);
6266 if (mddev_is_clustered(mddev))
6267 info.state |= (1<<MD_SB_CLUSTERED);
6268 info.active_disks = insync;
6269 info.working_disks = working;
6270 info.failed_disks = failed;
6271 info.spare_disks = spare;
6272
6273 info.layout = mddev->layout;
6274 info.chunk_size = mddev->chunk_sectors << 9;
6275
6276 if (copy_to_user(arg, &info, sizeof(info)))
6277 return -EFAULT;
6278
6279 return 0;
6280 }
6281
6282 static int get_bitmap_file(struct mddev *mddev, void __user * arg)
6283 {
6284 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
6285 char *ptr;
6286 int err;
6287
6288 file = kzalloc(sizeof(*file), GFP_NOIO);
6289 if (!file)
6290 return -ENOMEM;
6291
6292 err = 0;
6293 spin_lock(&mddev->lock);
6294 /* bitmap enabled */
6295 if (mddev->bitmap_info.file) {
6296 ptr = file_path(mddev->bitmap_info.file, file->pathname,
6297 sizeof(file->pathname));
6298 if (IS_ERR(ptr))
6299 err = PTR_ERR(ptr);
6300 else
6301 memmove(file->pathname, ptr,
6302 sizeof(file->pathname)-(ptr-file->pathname));
6303 }
6304 spin_unlock(&mddev->lock);
6305
6306 if (err == 0 &&
6307 copy_to_user(arg, file, sizeof(*file)))
6308 err = -EFAULT;
6309
6310 kfree(file);
6311 return err;
6312 }
6313
6314 static int get_disk_info(struct mddev *mddev, void __user * arg)
6315 {
6316 mdu_disk_info_t info;
6317 struct md_rdev *rdev;
6318
6319 if (copy_from_user(&info, arg, sizeof(info)))
6320 return -EFAULT;
6321
6322 rcu_read_lock();
6323 rdev = md_find_rdev_nr_rcu(mddev, info.number);
6324 if (rdev) {
6325 info.major = MAJOR(rdev->bdev->bd_dev);
6326 info.minor = MINOR(rdev->bdev->bd_dev);
6327 info.raid_disk = rdev->raid_disk;
6328 info.state = 0;
6329 if (test_bit(Faulty, &rdev->flags))
6330 info.state |= (1<<MD_DISK_FAULTY);
6331 else if (test_bit(In_sync, &rdev->flags)) {
6332 info.state |= (1<<MD_DISK_ACTIVE);
6333 info.state |= (1<<MD_DISK_SYNC);
6334 }
6335 if (test_bit(Journal, &rdev->flags))
6336 info.state |= (1<<MD_DISK_JOURNAL);
6337 if (test_bit(WriteMostly, &rdev->flags))
6338 info.state |= (1<<MD_DISK_WRITEMOSTLY);
6339 if (test_bit(FailFast, &rdev->flags))
6340 info.state |= (1<<MD_DISK_FAILFAST);
6341 } else {
6342 info.major = info.minor = 0;
6343 info.raid_disk = -1;
6344 info.state = (1<<MD_DISK_REMOVED);
6345 }
6346 rcu_read_unlock();
6347
6348 if (copy_to_user(arg, &info, sizeof(info)))
6349 return -EFAULT;
6350
6351 return 0;
6352 }
6353
6354 static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
6355 {
6356 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
6357 struct md_rdev *rdev;
6358 dev_t dev = MKDEV(info->major,info->minor);
6359
6360 if (mddev_is_clustered(mddev) &&
6361 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
6362 pr_warn("%s: Cannot add to clustered mddev.\n",
6363 mdname(mddev));
6364 return -EINVAL;
6365 }
6366
6367 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
6368 return -EOVERFLOW;
6369
6370 if (!mddev->raid_disks) {
6371 int err;
6372 /* expecting a device which has a superblock */
6373 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
6374 if (IS_ERR(rdev)) {
6375 pr_warn("md: md_import_device returned %ld\n",
6376 PTR_ERR(rdev));
6377 return PTR_ERR(rdev);
6378 }
6379 if (!list_empty(&mddev->disks)) {
6380 struct md_rdev *rdev0
6381 = list_entry(mddev->disks.next,
6382 struct md_rdev, same_set);
6383 err = super_types[mddev->major_version]
6384 .load_super(rdev, rdev0, mddev->minor_version);
6385 if (err < 0) {
6386 pr_warn("md: %s has different UUID to %s\n",
6387 bdevname(rdev->bdev,b),
6388 bdevname(rdev0->bdev,b2));
6389 export_rdev(rdev);
6390 return -EINVAL;
6391 }
6392 }
6393 err = bind_rdev_to_array(rdev, mddev);
6394 if (err)
6395 export_rdev(rdev);
6396 return err;
6397 }
6398
6399 /*
6400 * add_new_disk can be used once the array is assembled
6401 * to add "hot spares". They must already have a superblock
6402 	 * written.
6403 */
6404 if (mddev->pers) {
6405 int err;
6406 if (!mddev->pers->hot_add_disk) {
6407 pr_warn("%s: personality does not support diskops!\n",
6408 mdname(mddev));
6409 return -EINVAL;
6410 }
6411 if (mddev->persistent)
6412 rdev = md_import_device(dev, mddev->major_version,
6413 mddev->minor_version);
6414 else
6415 rdev = md_import_device(dev, -1, -1);
6416 if (IS_ERR(rdev)) {
6417 pr_warn("md: md_import_device returned %ld\n",
6418 PTR_ERR(rdev));
6419 return PTR_ERR(rdev);
6420 }
6421 /* set saved_raid_disk if appropriate */
6422 if (!mddev->persistent) {
6423 if (info->state & (1<<MD_DISK_SYNC) &&
6424 info->raid_disk < mddev->raid_disks) {
6425 rdev->raid_disk = info->raid_disk;
6426 set_bit(In_sync, &rdev->flags);
6427 clear_bit(Bitmap_sync, &rdev->flags);
6428 } else
6429 rdev->raid_disk = -1;
6430 rdev->saved_raid_disk = rdev->raid_disk;
6431 } else
6432 super_types[mddev->major_version].
6433 validate_super(mddev, rdev);
6434 if ((info->state & (1<<MD_DISK_SYNC)) &&
6435 rdev->raid_disk != info->raid_disk) {
6436 			/* This was a hot-add request, but the events don't
6437 			 * match, so reject it.
6438 */
6439 export_rdev(rdev);
6440 return -EINVAL;
6441 }
6442
6443 clear_bit(In_sync, &rdev->flags); /* just to be sure */
6444 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6445 set_bit(WriteMostly, &rdev->flags);
6446 else
6447 clear_bit(WriteMostly, &rdev->flags);
6448 if (info->state & (1<<MD_DISK_FAILFAST))
6449 set_bit(FailFast, &rdev->flags);
6450 else
6451 clear_bit(FailFast, &rdev->flags);
6452
6453 if (info->state & (1<<MD_DISK_JOURNAL)) {
6454 struct md_rdev *rdev2;
6455 bool has_journal = false;
6456
6457 			/* make sure there is no existing journal disk */
6458 rdev_for_each(rdev2, mddev) {
6459 if (test_bit(Journal, &rdev2->flags)) {
6460 has_journal = true;
6461 break;
6462 }
6463 }
6464 if (has_journal || mddev->bitmap) {
6465 export_rdev(rdev);
6466 return -EBUSY;
6467 }
6468 set_bit(Journal, &rdev->flags);
6469 }
6470 /*
6471 		 * check whether the device shows up on other nodes
6472 */
6473 if (mddev_is_clustered(mddev)) {
6474 if (info->state & (1 << MD_DISK_CANDIDATE))
6475 set_bit(Candidate, &rdev->flags);
6476 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
6477 /* --add initiated by this node */
6478 err = md_cluster_ops->add_new_disk(mddev, rdev);
6479 if (err) {
6480 export_rdev(rdev);
6481 return err;
6482 }
6483 }
6484 }
6485
6486 rdev->raid_disk = -1;
6487 err = bind_rdev_to_array(rdev, mddev);
6488
6489 if (err)
6490 export_rdev(rdev);
6491
6492 if (mddev_is_clustered(mddev)) {
6493 if (info->state & (1 << MD_DISK_CANDIDATE)) {
6494 if (!err) {
6495 err = md_cluster_ops->new_disk_ack(mddev,
6496 err == 0);
6497 if (err)
6498 md_kick_rdev_from_array(rdev);
6499 }
6500 } else {
6501 if (err)
6502 md_cluster_ops->add_new_disk_cancel(mddev);
6503 else
6504 err = add_bound_rdev(rdev);
6505 }
6506
6507 } else if (!err)
6508 err = add_bound_rdev(rdev);
6509
6510 return err;
6511 }
6512
6513 /* otherwise, add_new_disk is only allowed
6514 * for major_version==0 superblocks
6515 */
6516 if (mddev->major_version != 0) {
6517 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
6518 return -EINVAL;
6519 }
6520
6521 if (!(info->state & (1<<MD_DISK_FAULTY))) {
6522 int err;
6523 rdev = md_import_device(dev, -1, 0);
6524 if (IS_ERR(rdev)) {
6525 pr_warn("md: error, md_import_device() returned %ld\n",
6526 PTR_ERR(rdev));
6527 return PTR_ERR(rdev);
6528 }
6529 rdev->desc_nr = info->number;
6530 if (info->raid_disk < mddev->raid_disks)
6531 rdev->raid_disk = info->raid_disk;
6532 else
6533 rdev->raid_disk = -1;
6534
6535 if (rdev->raid_disk < mddev->raid_disks)
6536 if (info->state & (1<<MD_DISK_SYNC))
6537 set_bit(In_sync, &rdev->flags);
6538
6539 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6540 set_bit(WriteMostly, &rdev->flags);
6541 if (info->state & (1<<MD_DISK_FAILFAST))
6542 set_bit(FailFast, &rdev->flags);
6543
6544 if (!mddev->persistent) {
6545 pr_debug("md: nonpersistent superblock ...\n");
6546 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6547 } else
6548 rdev->sb_start = calc_dev_sboffset(rdev);
6549 rdev->sectors = rdev->sb_start;
6550
6551 err = bind_rdev_to_array(rdev, mddev);
6552 if (err) {
6553 export_rdev(rdev);
6554 return err;
6555 }
6556 }
6557
6558 return 0;
6559 }
6560
6561 static int hot_remove_disk(struct mddev *mddev, dev_t dev)
6562 {
6563 char b[BDEVNAME_SIZE];
6564 struct md_rdev *rdev;
6565
6566 if (!mddev->pers)
6567 return -ENODEV;
6568
6569 rdev = find_rdev(mddev, dev);
6570 if (!rdev)
6571 return -ENXIO;
6572
6573 if (rdev->raid_disk < 0)
6574 goto kick_rdev;
6575
6576 clear_bit(Blocked, &rdev->flags);
6577 remove_and_add_spares(mddev, rdev);
6578
6579 if (rdev->raid_disk >= 0)
6580 goto busy;
6581
6582 kick_rdev:
6583 if (mddev_is_clustered(mddev))
6584 md_cluster_ops->remove_disk(mddev, rdev);
6585
6586 md_kick_rdev_from_array(rdev);
6587 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6588 if (mddev->thread)
6589 md_wakeup_thread(mddev->thread);
6590 else
6591 md_update_sb(mddev, 1);
6592 md_new_event(mddev);
6593
6594 return 0;
6595 busy:
6596 pr_debug("md: cannot remove active disk %s from %s ...\n",
6597 bdevname(rdev->bdev,b), mdname(mddev));
6598 return -EBUSY;
6599 }
6600
6601 static int hot_add_disk(struct mddev *mddev, dev_t dev)
6602 {
6603 char b[BDEVNAME_SIZE];
6604 int err;
6605 struct md_rdev *rdev;
6606
6607 if (!mddev->pers)
6608 return -ENODEV;
6609
6610 if (mddev->major_version != 0) {
6611 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
6612 mdname(mddev));
6613 return -EINVAL;
6614 }
6615 if (!mddev->pers->hot_add_disk) {
6616 pr_warn("%s: personality does not support diskops!\n",
6617 mdname(mddev));
6618 return -EINVAL;
6619 }
6620
6621 rdev = md_import_device(dev, -1, 0);
6622 if (IS_ERR(rdev)) {
6623 pr_warn("md: error, md_import_device() returned %ld\n",
6624 PTR_ERR(rdev));
6625 return -EINVAL;
6626 }
6627
6628 if (mddev->persistent)
6629 rdev->sb_start = calc_dev_sboffset(rdev);
6630 else
6631 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6632
6633 rdev->sectors = rdev->sb_start;
6634
6635 if (test_bit(Faulty, &rdev->flags)) {
6636 pr_warn("md: can not hot-add faulty %s disk to %s!\n",
6637 bdevname(rdev->bdev,b), mdname(mddev));
6638 err = -EINVAL;
6639 goto abort_export;
6640 }
6641
6642 clear_bit(In_sync, &rdev->flags);
6643 rdev->desc_nr = -1;
6644 rdev->saved_raid_disk = -1;
6645 err = bind_rdev_to_array(rdev, mddev);
6646 if (err)
6647 goto abort_export;
6648
6649 /*
6650 	 * The rest had better be atomic; disk failures can be
6651 	 * noticed in interrupt context ...
6652 */
6653
6654 rdev->raid_disk = -1;
6655
6656 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6657 if (!mddev->thread)
6658 md_update_sb(mddev, 1);
6659 /*
6660 * Kick recovery, maybe this spare has to be added to the
6661 * array immediately.
6662 */
6663 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6664 md_wakeup_thread(mddev->thread);
6665 md_new_event(mddev);
6666 return 0;
6667
6668 abort_export:
6669 export_rdev(rdev);
6670 return err;
6671 }
6672
6673 static int set_bitmap_file(struct mddev *mddev, int fd)
6674 {
6675 int err = 0;
6676
6677 if (mddev->pers) {
6678 if (!mddev->pers->quiesce || !mddev->thread)
6679 return -EBUSY;
6680 if (mddev->recovery || mddev->sync_thread)
6681 return -EBUSY;
6682 		/* we should be able to change the bitmap. */
6683 }
6684
6685 if (fd >= 0) {
6686 struct inode *inode;
6687 struct file *f;
6688
6689 if (mddev->bitmap || mddev->bitmap_info.file)
6690 return -EEXIST; /* cannot add when bitmap is present */
6691 f = fget(fd);
6692
6693 if (f == NULL) {
6694 pr_warn("%s: error: failed to get bitmap file\n",
6695 mdname(mddev));
6696 return -EBADF;
6697 }
6698
6699 inode = f->f_mapping->host;
6700 if (!S_ISREG(inode->i_mode)) {
6701 pr_warn("%s: error: bitmap file must be a regular file\n",
6702 mdname(mddev));
6703 err = -EBADF;
6704 } else if (!(f->f_mode & FMODE_WRITE)) {
6705 pr_warn("%s: error: bitmap file must open for write\n",
6706 mdname(mddev));
6707 err = -EBADF;
6708 } else if (atomic_read(&inode->i_writecount) != 1) {
6709 pr_warn("%s: error: bitmap file is already in use\n",
6710 mdname(mddev));
6711 err = -EBUSY;
6712 }
6713 if (err) {
6714 fput(f);
6715 return err;
6716 }
6717 mddev->bitmap_info.file = f;
6718 mddev->bitmap_info.offset = 0; /* file overrides offset */
6719 } else if (mddev->bitmap == NULL)
6720 return -ENOENT; /* cannot remove what isn't there */
6721 err = 0;
6722 if (mddev->pers) {
6723 if (fd >= 0) {
6724 struct bitmap *bitmap;
6725
6726 bitmap = bitmap_create(mddev, -1);
6727 mddev_suspend(mddev);
6728 if (!IS_ERR(bitmap)) {
6729 mddev->bitmap = bitmap;
6730 err = bitmap_load(mddev);
6731 } else
6732 err = PTR_ERR(bitmap);
6733 if (err) {
6734 bitmap_destroy(mddev);
6735 fd = -1;
6736 }
6737 mddev_resume(mddev);
6738 } else if (fd < 0) {
6739 mddev_suspend(mddev);
6740 bitmap_destroy(mddev);
6741 mddev_resume(mddev);
6742 }
6743 }
6744 if (fd < 0) {
6745 struct file *f = mddev->bitmap_info.file;
6746 if (f) {
6747 spin_lock(&mddev->lock);
6748 mddev->bitmap_info.file = NULL;
6749 spin_unlock(&mddev->lock);
6750 fput(f);
6751 }
6752 }
6753
6754 return err;
6755 }
6756
6757 /*
6758  * set_array_info is used in two different ways.
6759  * The original usage is when creating a new array.
6760  * In this usage, raid_disks is > 0 and it, together with
6761  * level, size, not_persistent, layout and chunksize, determines the
6762  * shape of the array.
6763  * This will always create an array with a type-0.90.0 superblock.
6764  * The newer usage is when assembling an array.
6765  * In this case raid_disks will be 0, and the major_version field is
6766  * used to determine which style of superblock is to be found on the devices.
6767  * The minor_version and patch_version numbers are also kept in case the
6768  * super_block handler wishes to interpret them.
6769 */
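/*
 * Put differently: a caller creating a brand-new (0.90) array fills in the
 * full geometry below, while a caller assembling an existing array only uses
 * SET_ARRAY_INFO to record the superblock version before going on to issue
 * ADD_NEW_DISK and RUN_ARRAY.
 */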
6770 static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
6771 {
6772
6773 if (info->raid_disks == 0) {
6774 /* just setting version number for superblock loading */
6775 if (info->major_version < 0 ||
6776 info->major_version >= ARRAY_SIZE(super_types) ||
6777 super_types[info->major_version].name == NULL) {
6778 /* maybe try to auto-load a module? */
6779 pr_warn("md: superblock version %d not known\n",
6780 info->major_version);
6781 return -EINVAL;
6782 }
6783 mddev->major_version = info->major_version;
6784 mddev->minor_version = info->minor_version;
6785 mddev->patch_version = info->patch_version;
6786 mddev->persistent = !info->not_persistent;
6787 /* ensure mddev_put doesn't delete this now that there
6788 * is some minimal configuration.
6789 */
6790 mddev->ctime = ktime_get_real_seconds();
6791 return 0;
6792 }
6793 mddev->major_version = MD_MAJOR_VERSION;
6794 mddev->minor_version = MD_MINOR_VERSION;
6795 mddev->patch_version = MD_PATCHLEVEL_VERSION;
6796 mddev->ctime = ktime_get_real_seconds();
6797
6798 mddev->level = info->level;
6799 mddev->clevel[0] = 0;
6800 mddev->dev_sectors = 2 * (sector_t)info->size;
6801 mddev->raid_disks = info->raid_disks;
6802 /* don't set md_minor, it is determined by which /dev/md* was
6803 	 * opened
6804 */
6805 if (info->state & (1<<MD_SB_CLEAN))
6806 mddev->recovery_cp = MaxSector;
6807 else
6808 mddev->recovery_cp = 0;
6809 mddev->persistent = ! info->not_persistent;
6810 mddev->external = 0;
6811
6812 mddev->layout = info->layout;
6813 mddev->chunk_sectors = info->chunk_size >> 9;
6814
6815 if (mddev->persistent) {
6816 mddev->max_disks = MD_SB_DISKS;
6817 mddev->flags = 0;
6818 mddev->sb_flags = 0;
6819 }
6820 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
6821
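	/*
	 * MD_SB_BYTES is 4096, so the default bitmap sits 8 sectors past the
	 * superblock with 64*2 - 8 = 120 sectors (60K) reserved for it.
	 */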
6822 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6823 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
6824 mddev->bitmap_info.offset = 0;
6825
6826 mddev->reshape_position = MaxSector;
6827
6828 /*
6829 	 * Generate a 128-bit UUID
6830 */
6831 get_random_bytes(mddev->uuid, 16);
6832
6833 mddev->new_level = mddev->level;
6834 mddev->new_chunk_sectors = mddev->chunk_sectors;
6835 mddev->new_layout = mddev->layout;
6836 mddev->delta_disks = 0;
6837 mddev->reshape_backwards = 0;
6838
6839 return 0;
6840 }
6841
6842 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
6843 {
6844 lockdep_assert_held(&mddev->reconfig_mutex);
6845
6846 if (mddev->external_size)
6847 return;
6848
6849 mddev->array_sectors = array_sectors;
6850 }
6851 EXPORT_SYMBOL(md_set_array_sectors);
6852
6853 static int update_size(struct mddev *mddev, sector_t num_sectors)
6854 {
6855 struct md_rdev *rdev;
6856 int rv;
6857 int fit = (num_sectors == 0);
6858 sector_t old_dev_sectors = mddev->dev_sectors;
6859
6860 if (mddev->pers->resize == NULL)
6861 return -EINVAL;
6862 /* The "num_sectors" is the number of sectors of each device that
6863 * is used. This can only make sense for arrays with redundancy.
6864 * linear and raid0 always use whatever space is available. We can only
6865 * consider changing this number if no resync or reconstruction is
6866 * happening, and if the new size is acceptable. It must fit before the
6867 * sb_start or, if that is <data_offset, it must fit before the size
6868 * of each device. If num_sectors is zero, we find the largest size
6869 * that fits.
6870 */
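/*
 * e.g. passing num_sectors == 0 ("fit") makes the loop below settle on the
 * smallest rdev->sectors of any member, so the array is resized to whatever
 * all of its (possibly enlarged) component devices can accommodate.
 */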
6871 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6872 mddev->sync_thread)
6873 return -EBUSY;
6874 if (mddev->ro)
6875 return -EROFS;
6876
6877 rdev_for_each(rdev, mddev) {
6878 sector_t avail = rdev->sectors;
6879
6880 if (fit && (num_sectors == 0 || num_sectors > avail))
6881 num_sectors = avail;
6882 if (avail < num_sectors)
6883 return -ENOSPC;
6884 }
6885 rv = mddev->pers->resize(mddev, num_sectors);
6886 if (!rv) {
6887 if (mddev_is_clustered(mddev))
6888 md_cluster_ops->update_size(mddev, old_dev_sectors);
6889 else if (mddev->queue) {
6890 set_capacity(mddev->gendisk, mddev->array_sectors);
6891 revalidate_disk(mddev->gendisk);
6892 }
6893 }
6894 return rv;
6895 }
6896
6897 static int update_raid_disks(struct mddev *mddev, int raid_disks)
6898 {
6899 int rv;
6900 struct md_rdev *rdev;
6901 /* change the number of raid disks */
6902 if (mddev->pers->check_reshape == NULL)
6903 return -EINVAL;
6904 if (mddev->ro)
6905 return -EROFS;
6906 if (raid_disks <= 0 ||
6907 (mddev->max_disks && raid_disks >= mddev->max_disks))
6908 return -EINVAL;
6909 if (mddev->sync_thread ||
6910 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
6911 mddev->reshape_position != MaxSector)
6912 return -EBUSY;
6913
6914 rdev_for_each(rdev, mddev) {
6915 if (mddev->raid_disks < raid_disks &&
6916 rdev->data_offset < rdev->new_data_offset)
6917 return -EINVAL;
6918 if (mddev->raid_disks > raid_disks &&
6919 rdev->data_offset > rdev->new_data_offset)
6920 return -EINVAL;
6921 }
6922
6923 mddev->delta_disks = raid_disks - mddev->raid_disks;
6924 if (mddev->delta_disks < 0)
6925 mddev->reshape_backwards = 1;
6926 else if (mddev->delta_disks > 0)
6927 mddev->reshape_backwards = 0;
6928
6929 rv = mddev->pers->check_reshape(mddev);
6930 if (rv < 0) {
6931 mddev->delta_disks = 0;
6932 mddev->reshape_backwards = 0;
6933 }
6934 return rv;
6935 }
6936
6937 /*
6938 * update_array_info is used to change the configuration of an
6939 * on-line array.
6940  * The version, ctime, level, size, raid_disks, not_persistent, layout and chunk_size
6941 * fields in the info are checked against the array.
6942 * Any differences that cannot be handled will cause an error.
6943 * Normally, only one change can be managed at a time.
6944 */
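/*
 * e.g. a single call that changes both 'layout' and 'raid_disks' trips the
 * cnt > 1 check below and is rejected with -EINVAL; the two changes have to
 * be applied one at a time.
 */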
6945 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6946 {
6947 int rv = 0;
6948 int cnt = 0;
6949 int state = 0;
6950
6951 	/* calculate expected state, ignoring low bits */
6952 if (mddev->bitmap && mddev->bitmap_info.offset)
6953 state |= (1 << MD_SB_BITMAP_PRESENT);
6954
6955 if (mddev->major_version != info->major_version ||
6956 mddev->minor_version != info->minor_version ||
6957 /* mddev->patch_version != info->patch_version || */
6958 mddev->ctime != info->ctime ||
6959 mddev->level != info->level ||
6960 /* mddev->layout != info->layout || */
6961 mddev->persistent != !info->not_persistent ||
6962 mddev->chunk_sectors != info->chunk_size >> 9 ||
6963 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
6964 ((state^info->state) & 0xfffffe00)
6965 )
6966 return -EINVAL;
6967 /* Check there is only one change */
6968 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6969 cnt++;
6970 if (mddev->raid_disks != info->raid_disks)
6971 cnt++;
6972 if (mddev->layout != info->layout)
6973 cnt++;
6974 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
6975 cnt++;
6976 if (cnt == 0)
6977 return 0;
6978 if (cnt > 1)
6979 return -EINVAL;
6980
6981 if (mddev->layout != info->layout) {
6982 		/* Change layout:
6983 		 * we don't need to do anything at the md level; the
6984 * personality will take care of it all.
6985 */
6986 if (mddev->pers->check_reshape == NULL)
6987 return -EINVAL;
6988 else {
6989 mddev->new_layout = info->layout;
6990 rv = mddev->pers->check_reshape(mddev);
6991 if (rv)
6992 mddev->new_layout = mddev->layout;
6993 return rv;
6994 }
6995 }
6996 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6997 rv = update_size(mddev, (sector_t)info->size * 2);
6998
6999 if (mddev->raid_disks != info->raid_disks)
7000 rv = update_raid_disks(mddev, info->raid_disks);
7001
7002 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
7003 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
7004 rv = -EINVAL;
7005 goto err;
7006 }
7007 if (mddev->recovery || mddev->sync_thread) {
7008 rv = -EBUSY;
7009 goto err;
7010 }
7011 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
7012 struct bitmap *bitmap;
7013 /* add the bitmap */
7014 if (mddev->bitmap) {
7015 rv = -EEXIST;
7016 goto err;
7017 }
7018 if (mddev->bitmap_info.default_offset == 0) {
7019 rv = -EINVAL;
7020 goto err;
7021 }
7022 mddev->bitmap_info.offset =
7023 mddev->bitmap_info.default_offset;
7024 mddev->bitmap_info.space =
7025 mddev->bitmap_info.default_space;
7026 bitmap = bitmap_create(mddev, -1);
7027 mddev_suspend(mddev);
7028 if (!IS_ERR(bitmap)) {
7029 mddev->bitmap = bitmap;
7030 rv = bitmap_load(mddev);
7031 } else
7032 rv = PTR_ERR(bitmap);
7033 if (rv)
7034 bitmap_destroy(mddev);
7035 mddev_resume(mddev);
7036 } else {
7037 /* remove the bitmap */
7038 if (!mddev->bitmap) {
7039 rv = -ENOENT;
7040 goto err;
7041 }
7042 if (mddev->bitmap->storage.file) {
7043 rv = -EINVAL;
7044 goto err;
7045 }
7046 if (mddev->bitmap_info.nodes) {
7047 				/* hold PW on all the bitmap locks */
7048 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
7049 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
7050 rv = -EPERM;
7051 md_cluster_ops->unlock_all_bitmaps(mddev);
7052 goto err;
7053 }
7054
7055 mddev->bitmap_info.nodes = 0;
7056 md_cluster_ops->leave(mddev);
7057 }
7058 mddev_suspend(mddev);
7059 bitmap_destroy(mddev);
7060 mddev_resume(mddev);
7061 mddev->bitmap_info.offset = 0;
7062 }
7063 }
7064 md_update_sb(mddev, 1);
7065 return rv;
7066 err:
7067 return rv;
7068 }
7069
7070 static int set_disk_faulty(struct mddev *mddev, dev_t dev)
7071 {
7072 struct md_rdev *rdev;
7073 int err = 0;
7074
7075 if (mddev->pers == NULL)
7076 return -ENODEV;
7077
7078 rcu_read_lock();
7079 rdev = find_rdev_rcu(mddev, dev);
7080 if (!rdev)
7081 err = -ENODEV;
7082 else {
7083 md_error(mddev, rdev);
7084 if (!test_bit(Faulty, &rdev->flags))
7085 err = -EBUSY;
7086 }
7087 rcu_read_unlock();
7088 return err;
7089 }
7090
7091 /*
7092  * We have a problem here: there is no easy way to give a CHS
7093  * virtual geometry. We currently pretend that we have a 2-head,
7094  * 4-sector geometry (with a BIG number of cylinders...). This drives
7095 * dosfs just mad... ;-)
7096 */
7097 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
7098 {
7099 struct mddev *mddev = bdev->bd_disk->private_data;
7100
7101 geo->heads = 2;
7102 geo->sectors = 4;
7103 geo->cylinders = mddev->array_sectors / 8;
7104 return 0;
7105 }
7106
7107 static inline bool md_ioctl_valid(unsigned int cmd)
7108 {
7109 switch (cmd) {
7110 case ADD_NEW_DISK:
7111 case BLKROSET:
7112 case GET_ARRAY_INFO:
7113 case GET_BITMAP_FILE:
7114 case GET_DISK_INFO:
7115 case HOT_ADD_DISK:
7116 case HOT_REMOVE_DISK:
7117 case RAID_AUTORUN:
7118 case RAID_VERSION:
7119 case RESTART_ARRAY_RW:
7120 case RUN_ARRAY:
7121 case SET_ARRAY_INFO:
7122 case SET_BITMAP_FILE:
7123 case SET_DISK_FAULTY:
7124 case STOP_ARRAY:
7125 case STOP_ARRAY_RO:
7126 case CLUSTERED_DISK_NACK:
7127 return true;
7128 default:
7129 return false;
7130 }
7131 }
7132
7133 static int md_ioctl(struct block_device *bdev, fmode_t mode,
7134 unsigned int cmd, unsigned long arg)
7135 {
7136 int err = 0;
7137 void __user *argp = (void __user *)arg;
7138 struct mddev *mddev = NULL;
7139 int ro;
7140 bool did_set_md_closing = false;
7141
7142 if (!md_ioctl_valid(cmd))
7143 return -ENOTTY;
7144
7145 switch (cmd) {
7146 case RAID_VERSION:
7147 case GET_ARRAY_INFO:
7148 case GET_DISK_INFO:
7149 break;
7150 default:
7151 if (!capable(CAP_SYS_ADMIN))
7152 return -EACCES;
7153 }
7154
7155 /*
7156 * Commands dealing with the RAID driver but not any
7157 * particular array:
7158 */
7159 switch (cmd) {
7160 case RAID_VERSION:
7161 err = get_version(argp);
7162 goto out;
7163
7164 #ifndef MODULE
7165 case RAID_AUTORUN:
7166 err = 0;
7167 autostart_arrays(arg);
7168 goto out;
7169 #endif
7170 default:;
7171 }
7172
7173 /*
7174 * Commands creating/starting a new array:
7175 */
7176
7177 mddev = bdev->bd_disk->private_data;
7178
7179 if (!mddev) {
7180 BUG();
7181 goto out;
7182 }
7183
7184 	/* Some actions do not require the mutex */
7185 switch (cmd) {
7186 case GET_ARRAY_INFO:
7187 if (!mddev->raid_disks && !mddev->external)
7188 err = -ENODEV;
7189 else
7190 err = get_array_info(mddev, argp);
7191 goto out;
7192
7193 case GET_DISK_INFO:
7194 if (!mddev->raid_disks && !mddev->external)
7195 err = -ENODEV;
7196 else
7197 err = get_disk_info(mddev, argp);
7198 goto out;
7199
7200 case SET_DISK_FAULTY:
7201 err = set_disk_faulty(mddev, new_decode_dev(arg));
7202 goto out;
7203
7204 case GET_BITMAP_FILE:
7205 err = get_bitmap_file(mddev, argp);
7206 goto out;
7207
7208 }
7209
7210 if (cmd == ADD_NEW_DISK)
7211 /* need to ensure md_delayed_delete() has completed */
7212 flush_workqueue(md_misc_wq);
7213
7214 if (cmd == HOT_REMOVE_DISK)
7215 /* need to ensure recovery thread has run */
7216 wait_event_interruptible_timeout(mddev->sb_wait,
7217 !test_bit(MD_RECOVERY_NEEDED,
7218 &mddev->recovery),
7219 msecs_to_jiffies(5000));
7220 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
7221 		/* Need to flush the page cache, and ensure no one else opens
7222 		 * the device and writes to it
7223 */
7224 mutex_lock(&mddev->open_mutex);
7225 if (mddev->pers && atomic_read(&mddev->openers) > 1) {
7226 mutex_unlock(&mddev->open_mutex);
7227 err = -EBUSY;
7228 goto out;
7229 }
7230 WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
7231 set_bit(MD_CLOSING, &mddev->flags);
7232 did_set_md_closing = true;
7233 mutex_unlock(&mddev->open_mutex);
7234 sync_blockdev(bdev);
7235 }
7236 err = mddev_lock(mddev);
7237 if (err) {
7238 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
7239 err, cmd);
7240 goto out;
7241 }
7242
7243 if (cmd == SET_ARRAY_INFO) {
7244 mdu_array_info_t info;
7245 if (!arg)
7246 memset(&info, 0, sizeof(info));
7247 else if (copy_from_user(&info, argp, sizeof(info))) {
7248 err = -EFAULT;
7249 goto unlock;
7250 }
7251 if (mddev->pers) {
7252 err = update_array_info(mddev, &info);
7253 if (err) {
7254 pr_warn("md: couldn't update array info. %d\n", err);
7255 goto unlock;
7256 }
7257 goto unlock;
7258 }
7259 if (!list_empty(&mddev->disks)) {
7260 pr_warn("md: array %s already has disks!\n", mdname(mddev));
7261 err = -EBUSY;
7262 goto unlock;
7263 }
7264 if (mddev->raid_disks) {
7265 pr_warn("md: array %s already initialised!\n", mdname(mddev));
7266 err = -EBUSY;
7267 goto unlock;
7268 }
7269 err = set_array_info(mddev, &info);
7270 if (err) {
7271 pr_warn("md: couldn't set array info. %d\n", err);
7272 goto unlock;
7273 }
7274 goto unlock;
7275 }
7276
7277 /*
7278 * Commands querying/configuring an existing array:
7279 */
7280 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
7281 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
7282 if ((!mddev->raid_disks && !mddev->external)
7283 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
7284 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
7285 && cmd != GET_BITMAP_FILE) {
7286 err = -ENODEV;
7287 goto unlock;
7288 }
7289
7290 /*
7291 * Commands even a read-only array can execute:
7292 */
7293 switch (cmd) {
7294 case RESTART_ARRAY_RW:
7295 err = restart_array(mddev);
7296 goto unlock;
7297
7298 case STOP_ARRAY:
7299 err = do_md_stop(mddev, 0, bdev);
7300 goto unlock;
7301
7302 case STOP_ARRAY_RO:
7303 err = md_set_readonly(mddev, bdev);
7304 goto unlock;
7305
7306 case HOT_REMOVE_DISK:
7307 err = hot_remove_disk(mddev, new_decode_dev(arg));
7308 goto unlock;
7309
7310 case ADD_NEW_DISK:
7311 /* We can support ADD_NEW_DISK on read-only arrays
7312 * only if we are re-adding a preexisting device.
7313 * So require mddev->pers and MD_DISK_SYNC.
7314 */
7315 if (mddev->pers) {
7316 mdu_disk_info_t info;
7317 if (copy_from_user(&info, argp, sizeof(info)))
7318 err = -EFAULT;
7319 else if (!(info.state & (1<<MD_DISK_SYNC)))
7320 /* Need to clear read-only for this */
7321 break;
7322 else
7323 err = add_new_disk(mddev, &info);
7324 goto unlock;
7325 }
7326 break;
7327
7328 case BLKROSET:
7329 if (get_user(ro, (int __user *)(arg))) {
7330 err = -EFAULT;
7331 goto unlock;
7332 }
7333 err = -EINVAL;
7334
7335 		/* if the bdev is going read-only, the value of mddev->ro
7336 		 * does not matter; no writes are coming
7337 */
7338 if (ro)
7339 goto unlock;
7340
7341 		/* are we already prepared for writes? */
7342 if (mddev->ro != 1)
7343 goto unlock;
7344
7345 		/* transitioning to read-auto need only happen for
7346 * arrays that call md_write_start
7347 */
7348 if (mddev->pers) {
7349 err = restart_array(mddev);
7350 if (err == 0) {
7351 mddev->ro = 2;
7352 set_disk_ro(mddev->gendisk, 0);
7353 }
7354 }
7355 goto unlock;
7356 }
7357
7358 /*
7359 * The remaining ioctls are changing the state of the
7360 * superblock, so we do not allow them on read-only arrays.
7361 */
7362 if (mddev->ro && mddev->pers) {
7363 if (mddev->ro == 2) {
7364 mddev->ro = 0;
7365 sysfs_notify_dirent_safe(mddev->sysfs_state);
7366 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7367 /* mddev_unlock will wake thread */
7368 /* If a device failed while we were read-only, we
7369 * need to make sure the metadata is updated now.
7370 */
7371 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
7372 mddev_unlock(mddev);
7373 wait_event(mddev->sb_wait,
7374 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
7375 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
7376 mddev_lock_nointr(mddev);
7377 }
7378 } else {
7379 err = -EROFS;
7380 goto unlock;
7381 }
7382 }
7383
7384 switch (cmd) {
7385 case ADD_NEW_DISK:
7386 {
7387 mdu_disk_info_t info;
7388 if (copy_from_user(&info, argp, sizeof(info)))
7389 err = -EFAULT;
7390 else
7391 err = add_new_disk(mddev, &info);
7392 goto unlock;
7393 }
7394
7395 case CLUSTERED_DISK_NACK:
7396 if (mddev_is_clustered(mddev))
7397 md_cluster_ops->new_disk_ack(mddev, false);
7398 else
7399 err = -EINVAL;
7400 goto unlock;
7401
7402 case HOT_ADD_DISK:
7403 err = hot_add_disk(mddev, new_decode_dev(arg));
7404 goto unlock;
7405
7406 case RUN_ARRAY:
7407 err = do_md_run(mddev);
7408 goto unlock;
7409
7410 case SET_BITMAP_FILE:
7411 err = set_bitmap_file(mddev, (int)arg);
7412 goto unlock;
7413
7414 default:
7415 err = -EINVAL;
7416 goto unlock;
7417 }
7418
7419 unlock:
7420 if (mddev->hold_active == UNTIL_IOCTL &&
7421 err != -EINVAL)
7422 mddev->hold_active = 0;
7423 mddev_unlock(mddev);
7424 out:
7425 	if (did_set_md_closing)
7426 clear_bit(MD_CLOSING, &mddev->flags);
7427 return err;
7428 }
7429 #ifdef CONFIG_COMPAT
7430 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
7431 unsigned int cmd, unsigned long arg)
7432 {
7433 switch (cmd) {
7434 case HOT_REMOVE_DISK:
7435 case HOT_ADD_DISK:
7436 case SET_DISK_FAULTY:
7437 case SET_BITMAP_FILE:
7438 		/* These take an integer arg, do not convert */
7439 break;
7440 default:
7441 arg = (unsigned long)compat_ptr(arg);
7442 break;
7443 }
7444
7445 return md_ioctl(bdev, mode, cmd, arg);
7446 }
7447 #endif /* CONFIG_COMPAT */
7448
7449 static int md_open(struct block_device *bdev, fmode_t mode)
7450 {
7451 /*
7452 * Succeed if we can lock the mddev, which confirms that
7453 * it isn't being stopped right now.
7454 */
7455 struct mddev *mddev = mddev_find(bdev->bd_dev);
7456 int err;
7457
7458 if (!mddev)
7459 return -ENODEV;
7460
7461 if (mddev->gendisk != bdev->bd_disk) {
7462 /* we are racing with mddev_put which is discarding this
7463 * bd_disk.
7464 */
7465 mddev_put(mddev);
7466 /* Wait until bdev->bd_disk is definitely gone */
7467 flush_workqueue(md_misc_wq);
7468 /* Then retry the open from the top */
7469 return -ERESTARTSYS;
7470 }
7471 BUG_ON(mddev != bdev->bd_disk->private_data);
7472
7473 if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
7474 goto out;
7475
7476 if (test_bit(MD_CLOSING, &mddev->flags)) {
7477 mutex_unlock(&mddev->open_mutex);
7478 err = -ENODEV;
7479 goto out;
7480 }
7481
7482 err = 0;
7483 atomic_inc(&mddev->openers);
7484 mutex_unlock(&mddev->open_mutex);
7485
7486 check_disk_change(bdev);
7487 out:
7488 if (err)
7489 mddev_put(mddev);
7490 return err;
7491 }
7492
7493 static void md_release(struct gendisk *disk, fmode_t mode)
7494 {
7495 struct mddev *mddev = disk->private_data;
7496
7497 BUG_ON(!mddev);
7498 atomic_dec(&mddev->openers);
7499 mddev_put(mddev);
7500 }
7501
7502 static int md_media_changed(struct gendisk *disk)
7503 {
7504 struct mddev *mddev = disk->private_data;
7505
7506 return mddev->changed;
7507 }
7508
7509 static int md_revalidate(struct gendisk *disk)
7510 {
7511 struct mddev *mddev = disk->private_data;
7512
7513 mddev->changed = 0;
7514 return 0;
7515 }
7516 static const struct block_device_operations md_fops =
7517 {
7518 .owner = THIS_MODULE,
7519 .open = md_open,
7520 .release = md_release,
7521 .ioctl = md_ioctl,
7522 #ifdef CONFIG_COMPAT
7523 .compat_ioctl = md_compat_ioctl,
7524 #endif
7525 .getgeo = md_getgeo,
7526 .media_changed = md_media_changed,
7527 .revalidate_disk= md_revalidate,
7528 };
7529
7530 static int md_thread(void *arg)
7531 {
7532 struct md_thread *thread = arg;
7533
7534 /*
7535 	 * md_thread is a 'system-thread'; its priority should be very
7536 * high. We avoid resource deadlocks individually in each
7537 * raid personality. (RAID5 does preallocation) We also use RR and
7538 * the very same RT priority as kswapd, thus we will never get
7539 * into a priority inversion deadlock.
7540 *
7541 * we definitely have to have equal or higher priority than
7542 * bdflush, otherwise bdflush will deadlock if there are too
7543 * many dirty RAID5 blocks.
7544 */
7545
7546 allow_signal(SIGKILL);
7547 while (!kthread_should_stop()) {
7548
7549 /* We need to wait INTERRUPTIBLE so that
7550 * we don't add to the load-average.
7551 * That means we need to be sure no signals are
7552 * pending
7553 */
7554 if (signal_pending(current))
7555 flush_signals(current);
7556
7557 wait_event_interruptible_timeout
7558 (thread->wqueue,
7559 test_bit(THREAD_WAKEUP, &thread->flags)
7560 || kthread_should_stop() || kthread_should_park(),
7561 thread->timeout);
7562
7563 clear_bit(THREAD_WAKEUP, &thread->flags);
7564 if (kthread_should_park())
7565 kthread_parkme();
7566 if (!kthread_should_stop())
7567 thread->run(thread);
7568 }
7569
7570 return 0;
7571 }
7572
7573 void md_wakeup_thread(struct md_thread *thread)
7574 {
7575 if (thread) {
7576 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
7577 set_bit(THREAD_WAKEUP, &thread->flags);
7578 wake_up(&thread->wqueue);
7579 }
7580 }
7581 EXPORT_SYMBOL(md_wakeup_thread);
7582
7583 struct md_thread *md_register_thread(void (*run) (struct md_thread *),
7584 struct mddev *mddev, const char *name)
7585 {
7586 struct md_thread *thread;
7587
7588 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
7589 if (!thread)
7590 return NULL;
7591
7592 init_waitqueue_head(&thread->wqueue);
7593
7594 thread->run = run;
7595 thread->mddev = mddev;
7596 thread->timeout = MAX_SCHEDULE_TIMEOUT;
7597 thread->tsk = kthread_run(md_thread, thread,
7598 "%s_%s",
7599 mdname(thread->mddev),
7600 name);
7601 if (IS_ERR(thread->tsk)) {
7602 kfree(thread);
7603 return NULL;
7604 }
7605 return thread;
7606 }
7607 EXPORT_SYMBOL(md_register_thread);
7608
7609 void md_unregister_thread(struct md_thread **threadp)
7610 {
7611 struct md_thread *thread = *threadp;
7612 if (!thread)
7613 return;
7614 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
7615 /* Locking ensures that mddev_unlock does not wake_up a
7616 * non-existent thread
7617 */
7618 spin_lock(&pers_lock);
7619 *threadp = NULL;
7620 spin_unlock(&pers_lock);
7621
7622 kthread_stop(thread->tsk);
7623 kfree(thread);
7624 }
7625 EXPORT_SYMBOL(md_unregister_thread);
7626
7627 void md_error(struct mddev *mddev, struct md_rdev *rdev)
7628 {
7629 if (!rdev || test_bit(Faulty, &rdev->flags))
7630 return;
7631
7632 if (!mddev->pers || !mddev->pers->error_handler)
7633 return;
7634 mddev->pers->error_handler(mddev,rdev);
7635 if (mddev->degraded)
7636 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7637 sysfs_notify_dirent_safe(rdev->sysfs_state);
7638 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7639 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7640 md_wakeup_thread(mddev->thread);
7641 if (mddev->event_work.func)
7642 queue_work(md_misc_wq, &mddev->event_work);
7643 md_new_event(mddev);
7644 }
7645 EXPORT_SYMBOL(md_error);
7646
7647 /* seq_file implementation for /proc/mdstat */
7648
7649 static void status_unused(struct seq_file *seq)
7650 {
7651 int i = 0;
7652 struct md_rdev *rdev;
7653
7654 seq_printf(seq, "unused devices: ");
7655
7656 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
7657 char b[BDEVNAME_SIZE];
7658 i++;
7659 seq_printf(seq, "%s ",
7660 bdevname(rdev->bdev,b));
7661 }
7662 if (!i)
7663 seq_printf(seq, "<none>");
7664
7665 seq_printf(seq, "\n");
7666 }
7667
7668 static int status_resync(struct seq_file *seq, struct mddev *mddev)
7669 {
7670 sector_t max_sectors, resync, res;
7671 unsigned long dt, db = 0;
7672 sector_t rt, curr_mark_cnt, resync_mark_cnt;
7673 int scale, recovery_active;
7674 unsigned int per_milli;
7675
7676 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
7677 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7678 max_sectors = mddev->resync_max_sectors;
7679 else
7680 max_sectors = mddev->dev_sectors;
7681
7682 resync = mddev->curr_resync;
7683 if (resync <= 3) {
7684 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7685 /* Still cleaning up */
7686 resync = max_sectors;
7687 } else if (resync > max_sectors)
7688 resync = max_sectors;
7689 else
7690 resync -= atomic_read(&mddev->recovery_active);
7691
7692 if (resync == 0) {
7693 if (mddev->recovery_cp < MaxSector) {
7694 seq_printf(seq, "\tresync=PENDING");
7695 return 1;
7696 }
7697 return 0;
7698 }
7699 if (resync < 3) {
7700 seq_printf(seq, "\tresync=DELAYED");
7701 return 1;
7702 }
7703
7704 WARN_ON(max_sectors == 0);
7705 /* Pick 'scale' such that (resync>>scale)*1000 will fit
7706 * in a sector_t, and (max_sectors>>scale) will fit in a
7707 * u32, as those are the requirements for sector_div.
7708 * Thus 'scale' must be at least 10
7709 */
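	/*
	 * e.g. even a 16 TiB array is only 2^35 sectors, and 2^34 is well
	 * below 1ULL << 42, so 'scale' normally stays at 10 and per_milli
	 * lands in the 0..1000 range.
	 */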
7710 scale = 10;
7711 if (sizeof(sector_t) > sizeof(unsigned long)) {
7712 while ( max_sectors/2 > (1ULL<<(scale+32)))
7713 scale++;
7714 }
7715 res = (resync>>scale)*1000;
7716 sector_div(res, (u32)((max_sectors>>scale)+1));
7717
7718 per_milli = res;
7719 {
7720 int i, x = per_milli/50, y = 20-x;
7721 seq_printf(seq, "[");
7722 for (i = 0; i < x; i++)
7723 seq_printf(seq, "=");
7724 seq_printf(seq, ">");
7725 for (i = 0; i < y; i++)
7726 seq_printf(seq, ".");
7727 seq_printf(seq, "] ");
7728 }
7729 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
7730 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
7731 "reshape" :
7732 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
7733 "check" :
7734 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
7735 "resync" : "recovery"))),
7736 per_milli/10, per_milli % 10,
7737 (unsigned long long) resync/2,
7738 (unsigned long long) max_sectors/2);
7739
7740 /*
7741 * dt: time from mark until now
7742 * db: blocks written from mark until now
7743 * rt: remaining time
7744 *
7745 * rt is a sector_t, which is always 64bit now. We are keeping
7746 * the original algorithm, but it is not really necessary.
7747 *
7748 * Original algorithm:
7749 * So we divide before multiply in case it is 32bit and close
7750 * to the limit.
7751 * We scale the divisor (db) by 32 to avoid losing precision
7752 * near the end of resync when the number of remaining sectors
7753 * is close to 'db'.
7754 * We then divide rt by 32 after multiplying by db to compensate.
7755 * The '+1' avoids division by zero if db is very small.
7756 */
7757 dt = ((jiffies - mddev->resync_mark) / HZ);
7758 if (!dt) dt++;
7759
7760 curr_mark_cnt = mddev->curr_mark_cnt;
7761 recovery_active = atomic_read(&mddev->recovery_active);
7762 resync_mark_cnt = mddev->resync_mark_cnt;
7763
7764 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
7765 db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
7766
7767 rt = max_sectors - resync; /* number of remaining sectors */
7768 rt = div64_u64(rt, db/32+1);
7769 rt *= dt;
7770 rt >>= 5;
7771
7772 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
7773 ((unsigned long)rt % 60)/6);
7774
7775 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
7776 return 1;
7777 }
7778
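/*
 * seq_file iterator for /proc/mdstat: (void *)1 stands for the header
 * line, (void *)2 for the trailing "unused devices" line, and anything
 * else is an mddev taken from all_mddevs with a reference held.
 */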
7779 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
7780 {
7781 struct list_head *tmp;
7782 loff_t l = *pos;
7783 struct mddev *mddev;
7784
7785 if (l >= 0x10000)
7786 return NULL;
7787 if (!l--)
7788 /* header */
7789 return (void*)1;
7790
7791 spin_lock(&all_mddevs_lock);
7792 list_for_each(tmp,&all_mddevs)
7793 if (!l--) {
7794 mddev = list_entry(tmp, struct mddev, all_mddevs);
7795 mddev_get(mddev);
7796 spin_unlock(&all_mddevs_lock);
7797 return mddev;
7798 }
7799 spin_unlock(&all_mddevs_lock);
7800 if (!l--)
7801 return (void*)2;/* tail */
7802 return NULL;
7803 }
7804
7805 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
7806 {
7807 struct list_head *tmp;
7808 struct mddev *next_mddev, *mddev = v;
7809
7810 ++*pos;
7811 if (v == (void*)2)
7812 return NULL;
7813
7814 spin_lock(&all_mddevs_lock);
7815 if (v == (void*)1)
7816 tmp = all_mddevs.next;
7817 else
7818 tmp = mddev->all_mddevs.next;
7819 if (tmp != &all_mddevs)
7820 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
7821 else {
7822 next_mddev = (void*)2;
7823 *pos = 0x10000;
7824 }
7825 spin_unlock(&all_mddevs_lock);
7826
7827 if (v != (void*)1)
7828 mddev_put(mddev);
7829 return next_mddev;
7830
7831 }
7832
7833 static void md_seq_stop(struct seq_file *seq, void *v)
7834 {
7835 struct mddev *mddev = v;
7836
7837 if (mddev && v != (void*)1 && v != (void*)2)
7838 mddev_put(mddev);
7839 }
7840
7841 static int md_seq_show(struct seq_file *seq, void *v)
7842 {
7843 struct mddev *mddev = v;
7844 sector_t sectors;
7845 struct md_rdev *rdev;
7846
7847 if (v == (void*)1) {
7848 struct md_personality *pers;
7849 seq_printf(seq, "Personalities : ");
7850 spin_lock(&pers_lock);
7851 list_for_each_entry(pers, &pers_list, list)
7852 seq_printf(seq, "[%s] ", pers->name);
7853
7854 spin_unlock(&pers_lock);
7855 seq_printf(seq, "\n");
7856 seq->poll_event = atomic_read(&md_event_count);
7857 return 0;
7858 }
7859 if (v == (void*)2) {
7860 status_unused(seq);
7861 return 0;
7862 }
7863
7864 spin_lock(&mddev->lock);
7865 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
7866 seq_printf(seq, "%s : %sactive", mdname(mddev),
7867 mddev->pers ? "" : "in");
7868 if (mddev->pers) {
7869 if (mddev->ro==1)
7870 seq_printf(seq, " (read-only)");
7871 if (mddev->ro==2)
7872 seq_printf(seq, " (auto-read-only)");
7873 seq_printf(seq, " %s", mddev->pers->name);
7874 }
7875
7876 sectors = 0;
7877 rcu_read_lock();
7878 rdev_for_each_rcu(rdev, mddev) {
7879 char b[BDEVNAME_SIZE];
7880 seq_printf(seq, " %s[%d]",
7881 bdevname(rdev->bdev,b), rdev->desc_nr);
7882 if (test_bit(WriteMostly, &rdev->flags))
7883 seq_printf(seq, "(W)");
7884 if (test_bit(Journal, &rdev->flags))
7885 seq_printf(seq, "(J)");
7886 if (test_bit(Faulty, &rdev->flags)) {
7887 seq_printf(seq, "(F)");
7888 continue;
7889 }
7890 if (rdev->raid_disk < 0)
7891 seq_printf(seq, "(S)"); /* spare */
7892 if (test_bit(Replacement, &rdev->flags))
7893 seq_printf(seq, "(R)");
7894 sectors += rdev->sectors;
7895 }
7896 rcu_read_unlock();
7897
7898 if (!list_empty(&mddev->disks)) {
7899 if (mddev->pers)
7900 seq_printf(seq, "\n %llu blocks",
7901 (unsigned long long)
7902 mddev->array_sectors / 2);
7903 else
7904 seq_printf(seq, "\n %llu blocks",
7905 (unsigned long long)sectors / 2);
7906 }
7907 if (mddev->persistent) {
7908 if (mddev->major_version != 0 ||
7909 mddev->minor_version != 90) {
7910 seq_printf(seq," super %d.%d",
7911 mddev->major_version,
7912 mddev->minor_version);
7913 }
7914 } else if (mddev->external)
7915 seq_printf(seq, " super external:%s",
7916 mddev->metadata_type);
7917 else
7918 seq_printf(seq, " super non-persistent");
7919
7920 if (mddev->pers) {
7921 mddev->pers->status(seq, mddev);
7922 seq_printf(seq, "\n ");
7923 if (mddev->pers->sync_request) {
7924 if (status_resync(seq, mddev))
7925 seq_printf(seq, "\n ");
7926 }
7927 } else
7928 seq_printf(seq, "\n ");
7929
7930 bitmap_status(seq, mddev->bitmap);
7931
7932 seq_printf(seq, "\n");
7933 }
7934 spin_unlock(&mddev->lock);
7935
7936 return 0;
7937 }
7938
7939 static const struct seq_operations md_seq_ops = {
7940 .start = md_seq_start,
7941 .next = md_seq_next,
7942 .stop = md_seq_stop,
7943 .show = md_seq_show,
7944 };
7945
7946 static int md_seq_open(struct inode *inode, struct file *file)
7947 {
7948 struct seq_file *seq;
7949 int error;
7950
7951 error = seq_open(file, &md_seq_ops);
7952 if (error)
7953 return error;
7954
7955 seq = file->private_data;
7956 seq->poll_event = atomic_read(&md_event_count);
7957 return error;
7958 }
7959
7960 static int md_unloading;
7961 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
7962 {
7963 struct seq_file *seq = filp->private_data;
7964 int mask;
7965
7966 if (md_unloading)
7967 return POLLIN|POLLRDNORM|POLLERR|POLLPRI;
7968 poll_wait(filp, &md_event_waiters, wait);
7969
7970 /* always allow read */
7971 mask = POLLIN | POLLRDNORM;
7972
7973 if (seq->poll_event != atomic_read(&md_event_count))
7974 mask |= POLLERR | POLLPRI;
7975 return mask;
7976 }
7977
7978 static const struct file_operations md_seq_fops = {
7979 .owner = THIS_MODULE,
7980 .open = md_seq_open,
7981 .read = seq_read,
7982 .llseek = seq_lseek,
7983 .release = seq_release,
7984 .poll = mdstat_poll,
7985 };
7986
7987 int register_md_personality(struct md_personality *p)
7988 {
7989 pr_debug("md: %s personality registered for level %d\n",
7990 p->name, p->level);
7991 spin_lock(&pers_lock);
7992 list_add_tail(&p->list, &pers_list);
7993 spin_unlock(&pers_lock);
7994 return 0;
7995 }
7996 EXPORT_SYMBOL(register_md_personality);
7997
7998 int unregister_md_personality(struct md_personality *p)
7999 {
8000 pr_debug("md: %s personality unregistered\n", p->name);
8001 spin_lock(&pers_lock);
8002 list_del_init(&p->list);
8003 spin_unlock(&pers_lock);
8004 return 0;
8005 }
8006 EXPORT_SYMBOL(unregister_md_personality);
8007
8008 int register_md_cluster_operations(struct md_cluster_operations *ops,
8009 struct module *module)
8010 {
8011 int ret = 0;
8012 spin_lock(&pers_lock);
8013 if (md_cluster_ops != NULL)
8014 ret = -EALREADY;
8015 else {
8016 md_cluster_ops = ops;
8017 md_cluster_mod = module;
8018 }
8019 spin_unlock(&pers_lock);
8020 return ret;
8021 }
8022 EXPORT_SYMBOL(register_md_cluster_operations);
8023
8024 int unregister_md_cluster_operations(void)
8025 {
8026 spin_lock(&pers_lock);
8027 md_cluster_ops = NULL;
8028 spin_unlock(&pers_lock);
8029 return 0;
8030 }
8031 EXPORT_SYMBOL(unregister_md_cluster_operations);
8032
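/*
 * Load the md-cluster module if it is not already registered, pin it with
 * try_module_get() and join the cluster with the requested number of nodes.
 */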
8033 int md_setup_cluster(struct mddev *mddev, int nodes)
8034 {
8035 if (!md_cluster_ops)
8036 request_module("md-cluster");
8037 spin_lock(&pers_lock);
8038 /* ensure module won't be unloaded */
8039 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
8040 pr_warn("can't find md-cluster module or get its reference.\n");
8041 spin_unlock(&pers_lock);
8042 return -ENOENT;
8043 }
8044 spin_unlock(&pers_lock);
8045
8046 return md_cluster_ops->join(mddev, nodes);
8047 }
8048
8049 void md_cluster_stop(struct mddev *mddev)
8050 {
8051 if (!md_cluster_ops)
8052 return;
8053 md_cluster_ops->leave(mddev);
8054 module_put(md_cluster_mod);
8055 }
8056
8057 static int is_mddev_idle(struct mddev *mddev, int init)
8058 {
8059 struct md_rdev *rdev;
8060 int idle;
8061 int curr_events;
8062
8063 idle = 1;
8064 rcu_read_lock();
8065 rdev_for_each_rcu(rdev, mddev) {
8066 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
8067 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
8068 (int)part_stat_read(&disk->part0, sectors[1]) -
8069 atomic_read(&disk->sync_io);
8070 /* sync IO will cause sync_io to increase before the disk_stats
8071 * as sync_io is counted when a request starts, and
8072 * disk_stats is counted when it completes.
8073 * So resync activity will cause curr_events to be smaller than
8074 * when there was no such activity.
8075 * non-sync IO will cause disk_stat to increase without
8076 * increasing sync_io so curr_events will (eventually)
8077 * be larger than it was before. Once it becomes
8078 * substantially larger, the test below will cause
8079 * the array to appear non-idle, and resync will slow
8080 * down.
8081 * If there is a lot of outstanding resync activity when
8082 * we set last_event to curr_events, then all that activity
8083 * completing might cause the array to appear non-idle
8084 * and resync will be slowed down even though there might
8085 * not have been non-resync activity. This will only
8086 * happen once though. 'last_events' will soon reflect
8087 * the state where there is little or no outstanding
8088 * resync requests, and further resync activity will
8089 * always make curr_events less than last_events.
8090 *
8091 */
8092 if (init || curr_events - rdev->last_events > 64) {
8093 rdev->last_events = curr_events;
8094 idle = 0;
8095 }
8096 }
8097 rcu_read_unlock();
8098 return idle;
8099 }
8100
8101 void md_done_sync(struct mddev *mddev, int blocks, int ok)
8102 {
8103 /* another "blocks" (512byte) blocks have been synced */
8104 atomic_sub(blocks, &mddev->recovery_active);
8105 wake_up(&mddev->recovery_wait);
8106 if (!ok) {
8107 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8108 set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
8109 md_wakeup_thread(mddev->thread);
8110 // stop recovery, signal do_sync ....
8111 }
8112 }
8113 EXPORT_SYMBOL(md_done_sync);
8114
8115 /* md_write_start(mddev, bi)
8116 * If we need to update some array metadata (e.g. 'active' flag
8117 * in superblock) before writing, schedule a superblock update
8118 * and wait for it to complete.
8119 * A return value of 'false' means that the write wasn't recorded
8120 * and cannot proceed as the array is being suspended.
8121 */
8122 bool md_write_start(struct mddev *mddev, struct bio *bi)
8123 {
8124 int did_change = 0;
8125
8126 if (bio_data_dir(bi) != WRITE)
8127 return true;
8128
8129 BUG_ON(mddev->ro == 1);
8130 if (mddev->ro == 2) {
8131 /* need to switch to read/write */
8132 mddev->ro = 0;
8133 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8134 md_wakeup_thread(mddev->thread);
8135 md_wakeup_thread(mddev->sync_thread);
8136 did_change = 1;
8137 }
8138 rcu_read_lock();
8139 percpu_ref_get(&mddev->writes_pending);
8140 smp_mb(); /* Match smp_mb in set_in_sync() */
8141 if (mddev->safemode == 1)
8142 mddev->safemode = 0;
8143 /* sync_checkers is always 0 when writes_pending is in per-cpu mode */
8144 if (mddev->in_sync || mddev->sync_checkers) {
8145 spin_lock(&mddev->lock);
8146 if (mddev->in_sync) {
8147 mddev->in_sync = 0;
8148 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8149 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8150 md_wakeup_thread(mddev->thread);
8151 did_change = 1;
8152 }
8153 spin_unlock(&mddev->lock);
8154 }
8155 rcu_read_unlock();
8156 if (did_change)
8157 sysfs_notify_dirent_safe(mddev->sysfs_state);
8158 if (!mddev->has_superblocks)
8159 return true;
8160 wait_event(mddev->sb_wait,
8161 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
8162 mddev->suspended);
8163 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
8164 percpu_ref_put(&mddev->writes_pending);
8165 return false;
8166 }
8167 return true;
8168 }
8169 EXPORT_SYMBOL(md_write_start);
8170
8171 /* md_write_inc can only be called when md_write_start() has
8172 * already been called at least once for the current request.
8173 * It increments the counter and is useful when a single request
8174 * is split into several parts. Each part causes an increment and
8175 * so needs a matching md_write_end().
8176 * Unlike md_write_start(), it is safe to call md_write_inc() inside
8177 * a spinlocked region.
8178 */
8179 void md_write_inc(struct mddev *mddev, struct bio *bi)
8180 {
8181 if (bio_data_dir(bi) != WRITE)
8182 return;
8183 WARN_ON_ONCE(mddev->in_sync || mddev->ro);
8184 percpu_ref_get(&mddev->writes_pending);
8185 }
8186 EXPORT_SYMBOL(md_write_inc);
8187
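/*
 * Drop the writes_pending reference taken by md_write_start() or
 * md_write_inc() and, if safemode is in use, (re)arm the safemode timer
 * so the array can be marked clean once writes go idle.
 */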
8188 void md_write_end(struct mddev *mddev)
8189 {
8190 percpu_ref_put(&mddev->writes_pending);
8191
8192 if (mddev->safemode == 2)
8193 md_wakeup_thread(mddev->thread);
8194 else if (mddev->safemode_delay)
8195 /* The roundup() ensures this only performs locking once
8196 * every ->safemode_delay jiffies
8197 */
8198 mod_timer(&mddev->safemode_timer,
8199 roundup(jiffies, mddev->safemode_delay) +
8200 mddev->safemode_delay);
8201 }
8202
8203 EXPORT_SYMBOL(md_write_end);
8204
8205 /* md_allow_write(mddev)
8206 * Calling this ensures that the array is marked 'active' so that writes
8207 * may proceed without blocking. It is important to call this before
8208 * attempting a GFP_KERNEL allocation while holding the mddev lock.
8209 * Must be called with mddev_lock held.
8210 */
8211 void md_allow_write(struct mddev *mddev)
8212 {
8213 if (!mddev->pers)
8214 return;
8215 if (mddev->ro)
8216 return;
8217 if (!mddev->pers->sync_request)
8218 return;
8219
8220 spin_lock(&mddev->lock);
8221 if (mddev->in_sync) {
8222 mddev->in_sync = 0;
8223 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8224 set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8225 if (mddev->safemode_delay &&
8226 mddev->safemode == 0)
8227 mddev->safemode = 1;
8228 spin_unlock(&mddev->lock);
8229 md_update_sb(mddev, 0);
8230 sysfs_notify_dirent_safe(mddev->sysfs_state);
8231 /* wait for the dirty state to be recorded in the metadata */
8232 wait_event(mddev->sb_wait,
8233 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
8234 } else
8235 spin_unlock(&mddev->lock);
8236 }
8237 EXPORT_SYMBOL_GPL(md_allow_write);
8238
8239 #define SYNC_MARKS 10
8240 #define SYNC_MARK_STEP (3*HZ)
8241 #define UPDATE_FREQUENCY (5*60*HZ)
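/*
 * Main body of the resync/recovery thread: serialise against other arrays
 * that share devices, repeatedly call the personality's ->sync_request(),
 * throttle between speed_min and speed_max, and keep curr_resync and
 * sysfs/procfs progress information up to date.
 */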
8242 void md_do_sync(struct md_thread *thread)
8243 {
8244 struct mddev *mddev = thread->mddev;
8245 struct mddev *mddev2;
8246 unsigned int currspeed = 0,
8247 window;
8248 sector_t max_sectors,j, io_sectors, recovery_done;
8249 unsigned long mark[SYNC_MARKS];
8250 unsigned long update_time;
8251 sector_t mark_cnt[SYNC_MARKS];
8252 int last_mark,m;
8253 struct list_head *tmp;
8254 sector_t last_check;
8255 int skipped = 0;
8256 struct md_rdev *rdev;
8257 char *desc, *action = NULL;
8258 struct blk_plug plug;
8259 int ret;
8260
8261 /* just in case the thread restarts... */
8262 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
8263 return;
8264 if (mddev->ro) {/* never try to sync a read-only array */
8265 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8266 return;
8267 }
8268
8269 if (mddev_is_clustered(mddev)) {
8270 ret = md_cluster_ops->resync_start(mddev);
8271 if (ret)
8272 goto skip;
8273
8274 set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
8275 if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
8276 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
8277 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
8278 && ((unsigned long long)mddev->curr_resync_completed
8279 < (unsigned long long)mddev->resync_max_sectors))
8280 goto skip;
8281 }
8282
8283 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8284 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
8285 desc = "data-check";
8286 action = "check";
8287 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
8288 desc = "requested-resync";
8289 action = "repair";
8290 } else
8291 desc = "resync";
8292 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8293 desc = "reshape";
8294 else
8295 desc = "recovery";
8296
8297 mddev->last_sync_action = action ?: desc;
8298
8299 /* we overload curr_resync somewhat here.
8300 * 0 == not engaged in resync at all
8301 * 2 == checking that there is no conflict with another sync
8302 * 1 == like 2, but have yielded to allow conflicting resync to
8303 * commence
8304 * other == active in resync - this many blocks
8305 *
8306 * Before starting a resync we must have set curr_resync to
8307 * 2, and then checked that every "conflicting" array has curr_resync
8308 * less than ours. When we find one that is the same or higher
8309 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
8310 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
8311 * This will mean we have to start checking from the beginning again.
8312 *
8313 */
8314
8315 do {
8316 int mddev2_minor = -1;
8317 mddev->curr_resync = 2;
8318
8319 try_again:
8320 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8321 goto skip;
8322 for_each_mddev(mddev2, tmp) {
8323 if (mddev2 == mddev)
8324 continue;
8325 if (!mddev->parallel_resync
8326 && mddev2->curr_resync
8327 && match_mddev_units(mddev, mddev2)) {
8328 DEFINE_WAIT(wq);
8329 if (mddev < mddev2 && mddev->curr_resync == 2) {
8330 /* arbitrarily yield */
8331 mddev->curr_resync = 1;
8332 wake_up(&resync_wait);
8333 }
8334 if (mddev > mddev2 && mddev->curr_resync == 1)
8335 /* no need to wait here, we can wait the next
8336 * time 'round when curr_resync == 2
8337 */
8338 continue;
8339 /* We need to wait 'interruptible' so as not to
8340 * contribute to the load average, and not to
8341 * be caught by 'softlockup'
8342 */
8343 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
8344 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8345 mddev2->curr_resync >= mddev->curr_resync) {
8346 if (mddev2_minor != mddev2->md_minor) {
8347 mddev2_minor = mddev2->md_minor;
8348 pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
8349 desc, mdname(mddev),
8350 mdname(mddev2));
8351 }
8352 mddev_put(mddev2);
8353 if (signal_pending(current))
8354 flush_signals(current);
8355 schedule();
8356 finish_wait(&resync_wait, &wq);
8357 goto try_again;
8358 }
8359 finish_wait(&resync_wait, &wq);
8360 }
8361 }
8362 } while (mddev->curr_resync < 2);
8363
8364 j = 0;
8365 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8366 /* resync follows the size requested by the personality,
8367 * which defaults to physical size, but can be virtual size
8368 */
8369 max_sectors = mddev->resync_max_sectors;
8370 atomic64_set(&mddev->resync_mismatches, 0);
8371 /* we don't use the checkpoint if there's a bitmap */
8372 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8373 j = mddev->resync_min;
8374 else if (!mddev->bitmap)
8375 j = mddev->recovery_cp;
8376
8377 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
8378 max_sectors = mddev->resync_max_sectors;
8379 else {
8380 /* recovery follows the physical size of devices */
8381 max_sectors = mddev->dev_sectors;
8382 j = MaxSector;
8383 rcu_read_lock();
8384 rdev_for_each_rcu(rdev, mddev)
8385 if (rdev->raid_disk >= 0 &&
8386 !test_bit(Journal, &rdev->flags) &&
8387 !test_bit(Faulty, &rdev->flags) &&
8388 !test_bit(In_sync, &rdev->flags) &&
8389 rdev->recovery_offset < j)
8390 j = rdev->recovery_offset;
8391 rcu_read_unlock();
8392
8393 /* If there is a bitmap, we need to make sure all
8394 * writes that started before we added a spare
8395 * complete before we start doing a recovery.
8396 * Otherwise the write might complete and (via
8397 * bitmap_endwrite) set a bit in the bitmap after the
8398 * recovery has checked that bit and skipped that
8399 * region.
8400 */
8401 if (mddev->bitmap) {
8402 mddev->pers->quiesce(mddev, 1);
8403 mddev->pers->quiesce(mddev, 0);
8404 }
8405 }
8406
8407 pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
8408 pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev));
8409 pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
8410 speed_max(mddev), desc);
8411
8412 is_mddev_idle(mddev, 1); /* this initializes IO event counters */
8413
8414 io_sectors = 0;
8415 for (m = 0; m < SYNC_MARKS; m++) {
8416 mark[m] = jiffies;
8417 mark_cnt[m] = io_sectors;
8418 }
8419 last_mark = 0;
8420 mddev->resync_mark = mark[last_mark];
8421 mddev->resync_mark_cnt = mark_cnt[last_mark];
8422
8423 /*
8424 * Tune reconstruction:
8425 */
8426 window = 32*(PAGE_SIZE/512);
8427 pr_debug("md: using %dk window, over a total of %lluk.\n",
8428 window/2, (unsigned long long)max_sectors/2);
8429
8430 atomic_set(&mddev->recovery_active, 0);
8431 last_check = 0;
8432
8433 if (j>2) {
8434 pr_debug("md: resuming %s of %s from checkpoint.\n",
8435 desc, mdname(mddev));
8436 mddev->curr_resync = j;
8437 } else
8438 mddev->curr_resync = 3; /* no longer delayed */
8439 mddev->curr_resync_completed = j;
8440 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8441 md_new_event(mddev);
8442 update_time = jiffies;
8443
8444 blk_start_plug(&plug);
8445 while (j < max_sectors) {
8446 sector_t sectors;
8447
8448 skipped = 0;
8449
8450 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8451 ((mddev->curr_resync > mddev->curr_resync_completed &&
8452 (mddev->curr_resync - mddev->curr_resync_completed)
8453 > (max_sectors >> 4)) ||
8454 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
8455 (j - mddev->curr_resync_completed)*2
8456 >= mddev->resync_max - mddev->curr_resync_completed ||
8457 mddev->curr_resync_completed > mddev->resync_max
8458 )) {
8459 /* time to update curr_resync_completed */
8460 wait_event(mddev->recovery_wait,
8461 atomic_read(&mddev->recovery_active) == 0);
8462 mddev->curr_resync_completed = j;
8463 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
8464 j > mddev->recovery_cp)
8465 mddev->recovery_cp = j;
8466 update_time = jiffies;
8467 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
8468 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8469 }
8470
8471 while (j >= mddev->resync_max &&
8472 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8473 /* As this condition is controlled by user-space,
8474 * we can block indefinitely, so use '_interruptible'
8475 * to avoid triggering warnings.
8476 */
8477 flush_signals(current); /* just in case */
8478 wait_event_interruptible(mddev->recovery_wait,
8479 mddev->resync_max > j
8480 || test_bit(MD_RECOVERY_INTR,
8481 &mddev->recovery));
8482 }
8483
8484 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8485 break;
8486
8487 sectors = mddev->pers->sync_request(mddev, j, &skipped);
8488 if (sectors == 0) {
8489 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8490 break;
8491 }
8492
8493 if (!skipped) { /* actual IO requested */
8494 io_sectors += sectors;
8495 atomic_add(sectors, &mddev->recovery_active);
8496 }
8497
8498 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8499 break;
8500
8501 j += sectors;
8502 if (j > max_sectors)
8503 /* when skipping, extra large numbers can be returned. */
8504 j = max_sectors;
8505 if (j > 2)
8506 mddev->curr_resync = j;
8507 mddev->curr_mark_cnt = io_sectors;
8508 if (last_check == 0)
8509 /* this is the earliest that rebuild will be
8510 * visible in /proc/mdstat
8511 */
8512 md_new_event(mddev);
8513
8514 if (last_check + window > io_sectors || j == max_sectors)
8515 continue;
8516
8517 last_check = io_sectors;
8518 repeat:
8519 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
8520 /* step marks */
8521 int next = (last_mark+1) % SYNC_MARKS;
8522
8523 mddev->resync_mark = mark[next];
8524 mddev->resync_mark_cnt = mark_cnt[next];
8525 mark[next] = jiffies;
8526 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
8527 last_mark = next;
8528 }
8529
8530 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8531 break;
8532
8533 /*
8534 * this loop exits only when either we are slower than
8535 * the 'hard' speed limit, or the system was IO-idle for
8536 * a jiffy.
8537 * the system might be non-idle CPU-wise, but we only care
8538 * about not overloading the IO subsystem. (things like an
8539 * e2fsck being done on the RAID array should execute fast)
8540 */
8541 cond_resched();
8542
8543 recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
8544 currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
8545 /((jiffies-mddev->resync_mark)/HZ +1) +1;
8546
8547 if (currspeed > speed_min(mddev)) {
8548 if (currspeed > speed_max(mddev)) {
8549 msleep(500);
8550 goto repeat;
8551 }
8552 if (!is_mddev_idle(mddev, 0)) {
8553 /*
8554 * Give other IO more of a chance.
8555 * The faster the devices, the less we wait.
8556 */
8557 wait_event(mddev->recovery_wait,
8558 !atomic_read(&mddev->recovery_active));
8559 }
8560 }
8561 }
8562 pr_info("md: %s: %s %s.\n",mdname(mddev), desc,
8563 test_bit(MD_RECOVERY_INTR, &mddev->recovery)
8564 ? "interrupted" : "done");
8565 /*
8566 * this also signals 'finished resyncing' to md_stop
8567 */
8568 blk_finish_plug(&plug);
8569 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
8570
8571 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8572 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8573 mddev->curr_resync > 3) {
8574 mddev->curr_resync_completed = mddev->curr_resync;
8575 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
8576 }
8577 mddev->pers->sync_request(mddev, max_sectors, &skipped);
8578
8579 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
8580 mddev->curr_resync > 3) {
8581 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
8582 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8583 if (mddev->curr_resync >= mddev->recovery_cp) {
8584 pr_debug("md: checkpointing %s of %s.\n",
8585 desc, mdname(mddev));
8586 if (test_bit(MD_RECOVERY_ERROR,
8587 &mddev->recovery))
8588 mddev->recovery_cp =
8589 mddev->curr_resync_completed;
8590 else
8591 mddev->recovery_cp =
8592 mddev->curr_resync;
8593 }
8594 } else
8595 mddev->recovery_cp = MaxSector;
8596 } else {
8597 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
8598 mddev->curr_resync = MaxSector;
8599 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8600 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
8601 rcu_read_lock();
8602 rdev_for_each_rcu(rdev, mddev)
8603 if (rdev->raid_disk >= 0 &&
8604 mddev->delta_disks >= 0 &&
8605 !test_bit(Journal, &rdev->flags) &&
8606 !test_bit(Faulty, &rdev->flags) &&
8607 !test_bit(In_sync, &rdev->flags) &&
8608 rdev->recovery_offset < mddev->curr_resync)
8609 rdev->recovery_offset = mddev->curr_resync;
8610 rcu_read_unlock();
8611 }
8612 }
8613 }
8614 skip:
8615 /* set CHANGE_PENDING here since another update may still be needed
8616 * so that other nodes are informed. It should be harmless for normal
8617 * raid */
8618 set_mask_bits(&mddev->sb_flags, 0,
8619 BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));
8620
8621 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8622 !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8623 mddev->delta_disks > 0 &&
8624 mddev->pers->finish_reshape &&
8625 mddev->pers->size &&
8626 mddev->queue) {
8627 mddev_lock_nointr(mddev);
8628 md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
8629 mddev_unlock(mddev);
8630 set_capacity(mddev->gendisk, mddev->array_sectors);
8631 revalidate_disk(mddev->gendisk);
8632 }
8633
8634 spin_lock(&mddev->lock);
8635 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
8636 /* We completed so min/max setting can be forgotten if used. */
8637 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8638 mddev->resync_min = 0;
8639 mddev->resync_max = MaxSector;
8640 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
8641 mddev->resync_min = mddev->curr_resync_completed;
8642 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
8643 mddev->curr_resync = 0;
8644 spin_unlock(&mddev->lock);
8645
8646 wake_up(&resync_wait);
8647 md_wakeup_thread(mddev->thread);
8648 return;
8649 }
8650 EXPORT_SYMBOL_GPL(md_do_sync);
8651
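/*
 * Remove Faulty, non-Blocked devices that have no pending IO, then try to
 * hot-add suitable spares.  Returns the number of devices that still need
 * recovery, which the caller uses when deciding whether to start a
 * recovery thread.
 */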
8652 static int remove_and_add_spares(struct mddev *mddev,
8653 struct md_rdev *this)
8654 {
8655 struct md_rdev *rdev;
8656 int spares = 0;
8657 int removed = 0;
8658 bool remove_some = false;
8659
8660 if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
8661 /* Mustn't remove devices when resync thread is running */
8662 return 0;
8663
8664 rdev_for_each(rdev, mddev) {
8665 if ((this == NULL || rdev == this) &&
8666 rdev->raid_disk >= 0 &&
8667 !test_bit(Blocked, &rdev->flags) &&
8668 test_bit(Faulty, &rdev->flags) &&
8669 atomic_read(&rdev->nr_pending)==0) {
8670 /* Faulty non-Blocked devices with nr_pending == 0
8671 * never get nr_pending incremented,
8672 * never get Faulty cleared, and never get Blocked set.
8673 * So we can synchronize_rcu now rather than once per device
8674 */
8675 remove_some = true;
8676 set_bit(RemoveSynchronized, &rdev->flags);
8677 }
8678 }
8679
8680 if (remove_some)
8681 synchronize_rcu();
8682 rdev_for_each(rdev, mddev) {
8683 if ((this == NULL || rdev == this) &&
8684 rdev->raid_disk >= 0 &&
8685 !test_bit(Blocked, &rdev->flags) &&
8686 ((test_bit(RemoveSynchronized, &rdev->flags) ||
8687 (!test_bit(In_sync, &rdev->flags) &&
8688 !test_bit(Journal, &rdev->flags))) &&
8689 atomic_read(&rdev->nr_pending)==0)) {
8690 if (mddev->pers->hot_remove_disk(
8691 mddev, rdev) == 0) {
8692 sysfs_unlink_rdev(mddev, rdev);
8693 rdev->saved_raid_disk = rdev->raid_disk;
8694 rdev->raid_disk = -1;
8695 removed++;
8696 }
8697 }
8698 if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
8699 clear_bit(RemoveSynchronized, &rdev->flags);
8700 }
8701
8702 if (removed && mddev->kobj.sd)
8703 sysfs_notify(&mddev->kobj, NULL, "degraded");
8704
8705 if (this && removed)
8706 goto no_add;
8707
8708 rdev_for_each(rdev, mddev) {
8709 if (this && this != rdev)
8710 continue;
8711 if (test_bit(Candidate, &rdev->flags))
8712 continue;
8713 if (rdev->raid_disk >= 0 &&
8714 !test_bit(In_sync, &rdev->flags) &&
8715 !test_bit(Journal, &rdev->flags) &&
8716 !test_bit(Faulty, &rdev->flags))
8717 spares++;
8718 if (rdev->raid_disk >= 0)
8719 continue;
8720 if (test_bit(Faulty, &rdev->flags))
8721 continue;
8722 if (!test_bit(Journal, &rdev->flags)) {
8723 if (mddev->ro &&
8724 ! (rdev->saved_raid_disk >= 0 &&
8725 !test_bit(Bitmap_sync, &rdev->flags)))
8726 continue;
8727
8728 rdev->recovery_offset = 0;
8729 }
8730 if (mddev->pers->
8731 hot_add_disk(mddev, rdev) == 0) {
8732 if (sysfs_link_rdev(mddev, rdev))
8733 /* failure here is OK */;
8734 if (!test_bit(Journal, &rdev->flags))
8735 spares++;
8736 md_new_event(mddev);
8737 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8738 }
8739 }
8740 no_add:
8741 if (removed)
8742 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8743 return spares;
8744 }
8745
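/*
 * Work item queued from md_check_recovery(): create the resync thread, or
 * roll the MD_RECOVERY_* state back if the thread cannot be started.
 */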
8746 static void md_start_sync(struct work_struct *ws)
8747 {
8748 struct mddev *mddev = container_of(ws, struct mddev, del_work);
8749
8750 mddev->sync_thread = md_register_thread(md_do_sync,
8751 mddev,
8752 "resync");
8753 if (!mddev->sync_thread) {
8754 pr_warn("%s: could not start resync thread...\n",
8755 mdname(mddev));
8756 /* leave the spares where they are, it shouldn't hurt */
8757 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8758 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8759 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8760 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8761 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8762 wake_up(&resync_wait);
8763 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
8764 &mddev->recovery))
8765 if (mddev->sysfs_action)
8766 sysfs_notify_dirent_safe(mddev->sysfs_action);
8767 } else
8768 md_wakeup_thread(mddev->sync_thread);
8769 sysfs_notify_dirent_safe(mddev->sysfs_action);
8770 md_new_event(mddev);
8771 }
8772
8773 /*
8774 * This routine is regularly called by all per-raid-array threads to
8775 * deal with generic issues like resync and super-block update.
8776 * Raid personalities that don't have a thread (linear/raid0) do not
8777 * need this as they never do any recovery or update the superblock.
8778 *
8779 * It does not do any resync itself, but rather "forks" off other threads
8780 * to do that as needed.
8781 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
8782 * "->recovery" and create a thread at ->sync_thread.
8783 * When the thread finishes it sets MD_RECOVERY_DONE
8784 * and wakes up this thread which will reap the thread and finish up.
8785 * This thread also removes any faulty devices (with nr_pending == 0).
8786 *
8787 * The overall approach is:
8788 * 1/ if the superblock needs updating, update it.
8789 * 2/ If a recovery thread is running, don't do anything else.
8790 * 3/ If recovery has finished, clean up, possibly marking spares active.
8791 * 4/ If there are any faulty devices, remove them.
8792 * 5/ If array is degraded, try to add spare devices
8793 * 6/ If array has spares or is not in-sync, start a resync thread.
8794 */
8795 void md_check_recovery(struct mddev *mddev)
8796 {
8797 if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
8798 /* Write superblock - thread that called mddev_suspend()
8799 * holds reconfig_mutex for us.
8800 */
8801 set_bit(MD_UPDATING_SB, &mddev->flags);
8802 smp_mb__after_atomic();
8803 if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
8804 md_update_sb(mddev, 0);
8805 clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
8806 wake_up(&mddev->sb_wait);
8807 }
8808
8809 if (mddev->suspended)
8810 return;
8811
8812 if (mddev->bitmap)
8813 bitmap_daemon_work(mddev);
8814
8815 if (signal_pending(current)) {
8816 if (mddev->pers->sync_request && !mddev->external) {
8817 pr_debug("md: %s in immediate safe mode\n",
8818 mdname(mddev));
8819 mddev->safemode = 2;
8820 }
8821 flush_signals(current);
8822 }
8823
8824 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
8825 return;
8826 if ( ! (
8827 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
8828 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8829 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
8830 (mddev->external == 0 && mddev->safemode == 1) ||
8831 (mddev->safemode == 2
8832 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
8833 ))
8834 return;
8835
8836 if (mddev_trylock(mddev)) {
8837 int spares = 0;
8838 bool try_set_sync = mddev->safemode != 0;
8839
8840 if (!mddev->external && mddev->safemode == 1)
8841 mddev->safemode = 0;
8842
8843 if (mddev->ro) {
8844 struct md_rdev *rdev;
8845 if (!mddev->external && mddev->in_sync)
8846 /* 'Blocked' flag not needed as failed devices
8847 * will be recorded if array switched to read/write.
8848 * Leaving it set will prevent the device
8849 * from being removed.
8850 */
8851 rdev_for_each(rdev, mddev)
8852 clear_bit(Blocked, &rdev->flags);
8853 /* On a read-only array we can:
8854 * - remove failed devices
8855 * - add already-in_sync devices if the array itself
8856 * is in-sync.
8857 * As we only add devices that are already in-sync,
8858 * we can activate the spares immediately.
8859 */
8860 remove_and_add_spares(mddev, NULL);
8861 /* There is no thread, but we need to call
8862 * ->spare_active and clear saved_raid_disk
8863 */
8864 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8865 md_reap_sync_thread(mddev);
8866 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8867 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8868 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
8869 goto unlock;
8870 }
8871
8872 if (mddev_is_clustered(mddev)) {
8873 struct md_rdev *rdev;
8874 /* kick the device out if another node issued a
8875 * remove-disk request.
8876 */
8877 rdev_for_each(rdev, mddev) {
8878 if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
8879 rdev->raid_disk < 0)
8880 md_kick_rdev_from_array(rdev);
8881 }
8882 }
8883
8884 if (try_set_sync && !mddev->external && !mddev->in_sync) {
8885 spin_lock(&mddev->lock);
8886 set_in_sync(mddev);
8887 spin_unlock(&mddev->lock);
8888 }
8889
8890 if (mddev->sb_flags)
8891 md_update_sb(mddev, 0);
8892
8893 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
8894 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
8895 /* resync/recovery still happening */
8896 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
8897 goto unlock;
8898 }
8899 if (mddev->sync_thread) {
8900 md_reap_sync_thread(mddev);
8901 goto unlock;
8902 }
8903 /* Set RUNNING before clearing NEEDED to avoid
8904 * any transients in the value of "sync_action".
8905 */
8906 mddev->curr_resync_completed = 0;
8907 spin_lock(&mddev->lock);
8908 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8909 spin_unlock(&mddev->lock);
8910 /* Clear some bits that don't mean anything, but
8911 * might be left set
8912 */
8913 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
8914 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
8915
8916 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
8917 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
8918 goto not_running;
8919 /* no recovery is running.
8920 * remove any failed drives, then
8921 * add spares if possible.
8922 * Spares are also removed and re-added, to allow
8923 * the personality to fail the re-add.
8924 */
8925
8926 if (mddev->reshape_position != MaxSector) {
8927 if (mddev->pers->check_reshape == NULL ||
8928 mddev->pers->check_reshape(mddev) != 0)
8929 /* Cannot proceed */
8930 goto not_running;
8931 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
8932 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8933 } else if ((spares = remove_and_add_spares(mddev, NULL))) {
8934 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8935 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
8936 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
8937 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8938 } else if (mddev->recovery_cp < MaxSector) {
8939 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
8940 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8941 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
8942 /* nothing to be done ... */
8943 goto not_running;
8944
8945 if (mddev->pers->sync_request) {
8946 if (spares) {
8947 /* We are adding a device or devices to an array
8948 * which has the bitmap stored on all devices.
8949 * So make sure all bitmap pages get written
8950 */
8951 bitmap_write_all(mddev->bitmap);
8952 }
8953 INIT_WORK(&mddev->del_work, md_start_sync);
8954 queue_work(md_misc_wq, &mddev->del_work);
8955 goto unlock;
8956 }
8957 not_running:
8958 if (!mddev->sync_thread) {
8959 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
8960 wake_up(&resync_wait);
8961 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
8962 &mddev->recovery))
8963 if (mddev->sysfs_action)
8964 sysfs_notify_dirent_safe(mddev->sysfs_action);
8965 }
8966 unlock:
8967 wake_up(&mddev->sb_wait);
8968 mddev_unlock(mddev);
8969 }
8970 }
8971 EXPORT_SYMBOL(md_check_recovery);
8972
8973 void md_reap_sync_thread(struct mddev *mddev)
8974 {
8975 struct md_rdev *rdev;
8976
8977 /* resync has finished, collect result */
8978 md_unregister_thread(&mddev->sync_thread);
8979 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
8980 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
8981 mddev->degraded != mddev->raid_disks) {
8982 /* success...*/
8983 /* activate any spares */
8984 if (mddev->pers->spare_active(mddev)) {
8985 sysfs_notify(&mddev->kobj, NULL,
8986 "degraded");
8987 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
8988 }
8989 }
8990 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
8991 mddev->pers->finish_reshape)
8992 mddev->pers->finish_reshape(mddev);
8993
8994 /* If array is no longer degraded, then any saved_raid_disk
8995 * information must be scrapped.
8996 */
8997 if (!mddev->degraded)
8998 rdev_for_each(rdev, mddev)
8999 rdev->saved_raid_disk = -1;
9000
9001 md_update_sb(mddev, 1);
9002 /* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
9003 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
9004 * clustered raid */
9005 if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
9006 md_cluster_ops->resync_finish(mddev);
9007 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
9008 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
9009 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9010 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
9011 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
9012 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
9013 wake_up(&resync_wait);
9014 /* flag recovery needed just to double check */
9015 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9016 sysfs_notify_dirent_safe(mddev->sysfs_action);
9017 md_new_event(mddev);
9018 if (mddev->event_work.func)
9019 queue_work(md_misc_wq, &mddev->event_work);
9020 }
9021 EXPORT_SYMBOL(md_reap_sync_thread);
9022
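/*
 * Wait (up to five seconds) for the Blocked/BlockedBadBlocks flags on an
 * rdev to clear, then drop the pending reference held by the caller.
 */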
9023 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
9024 {
9025 sysfs_notify_dirent_safe(rdev->sysfs_state);
9026 wait_event_timeout(rdev->blocked_wait,
9027 !test_bit(Blocked, &rdev->flags) &&
9028 !test_bit(BlockedBadBlocks, &rdev->flags),
9029 msecs_to_jiffies(5000));
9030 rdev_dec_pending(rdev, mddev);
9031 }
9032 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
9033
9034 void md_finish_reshape(struct mddev *mddev)
9035 {
9036 /* called by the personality module when reshape completes. */
9037 struct md_rdev *rdev;
9038
9039 rdev_for_each(rdev, mddev) {
9040 if (rdev->data_offset > rdev->new_data_offset)
9041 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
9042 else
9043 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
9044 rdev->data_offset = rdev->new_data_offset;
9045 }
9046 }
9047 EXPORT_SYMBOL(md_finish_reshape);
9048
9049 /* Bad block management */
9050
9051 /* Returns 1 on success, 0 on failure */
9052 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9053 int is_new)
9054 {
9055 struct mddev *mddev = rdev->mddev;
9056 int rv;
9057 if (is_new)
9058 s += rdev->new_data_offset;
9059 else
9060 s += rdev->data_offset;
9061 rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
9062 if (rv == 0) {
9063 /* Make sure they get written out promptly */
9064 if (test_bit(ExternalBbl, &rdev->flags))
9065 sysfs_notify(&rdev->kobj, NULL,
9066 "unacknowledged_bad_blocks");
9067 sysfs_notify_dirent_safe(rdev->sysfs_state);
9068 set_mask_bits(&mddev->sb_flags, 0,
9069 BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
9070 md_wakeup_thread(rdev->mddev->thread);
9071 return 1;
9072 } else
9073 return 0;
9074 }
9075 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
9076
9077 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
9078 int is_new)
9079 {
9080 int rv;
9081 if (is_new)
9082 s += rdev->new_data_offset;
9083 else
9084 s += rdev->data_offset;
9085 rv = badblocks_clear(&rdev->badblocks, s, sectors);
9086 if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
9087 sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
9088 return rv;
9089 }
9090 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
9091
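/*
 * Reboot notifier: stop writes on every array we can lock and, if there
 * are any arrays, delay briefly so devices can settle before the reboot
 * proceeds.
 */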
9092 static int md_notify_reboot(struct notifier_block *this,
9093 unsigned long code, void *x)
9094 {
9095 struct list_head *tmp;
9096 struct mddev *mddev;
9097 int need_delay = 0;
9098
9099 for_each_mddev(mddev, tmp) {
9100 if (mddev_trylock(mddev)) {
9101 if (mddev->pers)
9102 __md_stop_writes(mddev);
9103 if (mddev->persistent)
9104 mddev->safemode = 2;
9105 mddev_unlock(mddev);
9106 }
9107 need_delay = 1;
9108 }
9109 /*
9110 * certain more exotic SCSI devices are known to be
9111 * volatile wrt too early system reboots. While the
9112 * right place to handle this issue is the given
9113 * driver, we do want to have a safe RAID driver ...
9114 */
9115 if (need_delay)
9116 mdelay(1000*1);
9117
9118 return NOTIFY_DONE;
9119 }
9120
9121 static struct notifier_block md_notifier = {
9122 .notifier_call = md_notify_reboot,
9123 .next = NULL,
9124 .priority = INT_MAX, /* before any real devices */
9125 };
9126
9127 static void md_geninit(void)
9128 {
9129 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
9130
9131 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
9132 }
9133
9134 static int __init md_init(void)
9135 {
9136 int ret = -ENOMEM;
9137
9138 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
9139 if (!md_wq)
9140 goto err_wq;
9141
9142 md_misc_wq = alloc_workqueue("md_misc", 0, 0);
9143 if (!md_misc_wq)
9144 goto err_misc_wq;
9145
9146 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
9147 goto err_md;
9148
9149 if ((ret = register_blkdev(0, "mdp")) < 0)
9150 goto err_mdp;
9151 mdp_major = ret;
9152
9153 blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
9154 md_probe, NULL, NULL);
9155 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
9156 md_probe, NULL, NULL);
9157
9158 register_reboot_notifier(&md_notifier);
9159 raid_table_header = register_sysctl_table(raid_root_table);
9160
9161 md_geninit();
9162 return 0;
9163
9164 err_mdp:
9165 unregister_blkdev(MD_MAJOR, "md");
9166 err_md:
9167 destroy_workqueue(md_misc_wq);
9168 err_misc_wq:
9169 destroy_workqueue(md_wq);
9170 err_wq:
9171 return ret;
9172 }
9173
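/*
 * Apply metadata changes made by another cluster node: resize if the size
 * changed, handle role changes (newly activated spares, faulty or removed
 * devices) and bring the local event count up to date.
 */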
9174 static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
9175 {
9176 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
9177 struct md_rdev *rdev2;
9178 int role, ret;
9179 char b[BDEVNAME_SIZE];
9180
9181 /*
9182 * If the size was changed on another node then we need
9183 * to resize as well.
9184 */
9185 if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
9186 ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
9187 if (ret)
9188 pr_info("md-cluster: resize failed\n");
9189 else
9190 bitmap_update_sb(mddev->bitmap);
9191 }
9192
9193 /* Check for change of roles in the active devices */
9194 rdev_for_each(rdev2, mddev) {
9195 if (test_bit(Faulty, &rdev2->flags))
9196 continue;
9197
9198 /* Check if the roles changed */
9199 role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);
9200
9201 if (test_bit(Candidate, &rdev2->flags)) {
9202 if (role == 0xfffe) {
9203 pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
9204 md_kick_rdev_from_array(rdev2);
9205 continue;
9206 }
9207 else
9208 clear_bit(Candidate, &rdev2->flags);
9209 }
9210
9211 if (role != rdev2->raid_disk) {
9212 /* got activated */
9213 if (rdev2->raid_disk == -1 && role != 0xffff) {
9214 rdev2->saved_raid_disk = role;
9215 ret = remove_and_add_spares(mddev, rdev2);
9216 pr_info("Activated spare: %s\n",
9217 bdevname(rdev2->bdev,b));
9218 /* wake up mddev->thread here, so the array can
9219 * perform a resync with the newly activated disk */
9220 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9221 md_wakeup_thread(mddev->thread);
9222
9223 }
9224 /* device faulty
9225 * We just want to do the minimum to mark the disk
9226 * as faulty. The recovery is performed by the
9227 * node that initiated the error.
9228 */
9229 if ((role == 0xfffe) || (role == 0xfffd)) {
9230 md_error(mddev, rdev2);
9231 clear_bit(Blocked, &rdev2->flags);
9232 }
9233 }
9234 }
9235
9236 if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
9237 update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));
9238
9239 /* Finally set the event to be up to date */
9240 mddev->events = le64_to_cpu(sb->events);
9241 }
9242
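/*
 * Re-read the superblock of a single rdev from disk, restoring the old
 * in-memory copy if the reload fails.
 */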
9243 static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
9244 {
9245 int err;
9246 struct page *swapout = rdev->sb_page;
9247 struct mdp_superblock_1 *sb;
9248
9249 /* Store the sb page of the rdev in the swapout temporary
9250 * variable in case we err in the future
9251 */
9252 rdev->sb_page = NULL;
9253 err = alloc_disk_sb(rdev);
9254 if (err == 0) {
9255 ClearPageUptodate(rdev->sb_page);
9256 rdev->sb_loaded = 0;
9257 err = super_types[mddev->major_version].
9258 load_super(rdev, NULL, mddev->minor_version);
9259 }
9260 if (err < 0) {
9261 pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
9262 __func__, __LINE__, rdev->desc_nr, err);
9263 if (rdev->sb_page)
9264 put_page(rdev->sb_page);
9265 rdev->sb_page = swapout;
9266 rdev->sb_loaded = 1;
9267 return err;
9268 }
9269
9270 sb = page_address(rdev->sb_page);
9271 /* The recovery offset is only valid when MD_FEATURE_RECOVERY_OFFSET
9272 * is set in the feature map
9273 */
9274
9275 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
9276 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
9277
9278 /* The other node finished recovery, call spare_active to mark
9279 * the device In_sync and update mddev->degraded
9280 */
9281 if (rdev->recovery_offset == MaxSector &&
9282 !test_bit(In_sync, &rdev->flags) &&
9283 mddev->pers->spare_active(mddev))
9284 sysfs_notify(&mddev->kobj, NULL, "degraded");
9285
9286 put_page(swapout);
9287 return 0;
9288 }
9289
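/*
 * Re-read the superblock of the rdev with descriptor number 'nr', apply
 * any changes made by another cluster node, then refresh recovery_offset
 * information from every member device.
 */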
9290 void md_reload_sb(struct mddev *mddev, int nr)
9291 {
9292 struct md_rdev *rdev;
9293 int err;
9294
9295 /* Find the rdev */
9296 rdev_for_each_rcu(rdev, mddev) {
9297 if (rdev->desc_nr == nr)
9298 break;
9299 }
9300
9301 if (!rdev || rdev->desc_nr != nr) {
9302 pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
9303 return;
9304 }
9305
9306 err = read_rdev(mddev, rdev);
9307 if (err < 0)
9308 return;
9309
9310 check_sb_changes(mddev, rdev);
9311
9312 /* Read all rdev's to update recovery_offset */
9313 rdev_for_each_rcu(rdev, mddev)
9314 read_rdev(mddev, rdev);
9315 }
9316 EXPORT_SYMBOL(md_reload_sb);
9317
9318 #ifndef MODULE
9319
9320 /*
9321 * Searches all registered partitions for autorun RAID arrays
9322 * at boot time.
9323 */
9324
9325 static DEFINE_MUTEX(detected_devices_mutex);
9326 static LIST_HEAD(all_detected_devices);
9327 struct detected_devices_node {
9328 struct list_head list;
9329 dev_t dev;
9330 };
9331
9332 void md_autodetect_dev(dev_t dev)
9333 {
9334 struct detected_devices_node *node_detected_dev;
9335
9336 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
9337 if (node_detected_dev) {
9338 node_detected_dev->dev = dev;
9339 mutex_lock(&detected_devices_mutex);
9340 list_add_tail(&node_detected_dev->list, &all_detected_devices);
9341 mutex_unlock(&detected_devices_mutex);
9342 }
9343 }
9344
9345 static void autostart_arrays(int part)
9346 {
9347 struct md_rdev *rdev;
9348 struct detected_devices_node *node_detected_dev;
9349 dev_t dev;
9350 int i_scanned, i_passed;
9351
9352 i_scanned = 0;
9353 i_passed = 0;
9354
9355 pr_info("md: Autodetecting RAID arrays.\n");
9356
9357 mutex_lock(&detected_devices_mutex);
9358 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
9359 i_scanned++;
9360 node_detected_dev = list_entry(all_detected_devices.next,
9361 struct detected_devices_node, list);
9362 list_del(&node_detected_dev->list);
9363 dev = node_detected_dev->dev;
9364 kfree(node_detected_dev);
9365 mutex_unlock(&detected_devices_mutex);
9366 rdev = md_import_device(dev,0, 90);
9367 mutex_lock(&detected_devices_mutex);
9368 if (IS_ERR(rdev))
9369 continue;
9370
9371 if (test_bit(Faulty, &rdev->flags))
9372 continue;
9373
9374 set_bit(AutoDetected, &rdev->flags);
9375 list_add(&rdev->same_set, &pending_raid_disks);
9376 i_passed++;
9377 }
9378 mutex_unlock(&detected_devices_mutex);
9379
9380 pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);
9381
9382 autorun_devices(part);
9383 }
9384
9385 #endif /* !MODULE */
9386
9387 static __exit void md_exit(void)
9388 {
9389 struct mddev *mddev;
9390 struct list_head *tmp;
9391 int delay = 1;
9392
9393 blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
9394 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
9395
9396 unregister_blkdev(MD_MAJOR,"md");
9397 unregister_blkdev(mdp_major, "mdp");
9398 unregister_reboot_notifier(&md_notifier);
9399 unregister_sysctl_table(raid_table_header);
9400
9401 /* We cannot unload the modules while some process is
9402 * waiting for us in select() or poll() - wake them up
9403 */
9404 md_unloading = 1;
9405 while (waitqueue_active(&md_event_waiters)) {
9406 /* not safe to leave yet */
9407 wake_up(&md_event_waiters);
9408 msleep(delay);
9409 delay += delay;
9410 }
9411 remove_proc_entry("mdstat", NULL);
9412
9413 for_each_mddev(mddev, tmp) {
9414 export_array(mddev);
9415 mddev->ctime = 0;
9416 mddev->hold_active = 0;
9417 /*
9418 * for_each_mddev() will call mddev_put() at the end of each
9419 * iteration. As the mddev is now fully clear, this will
9420 * schedule the mddev for destruction by a workqueue, and the
9421 * destroy_workqueue() below will wait for that to complete.
9422 */
9423 }
9424 destroy_workqueue(md_misc_wq);
9425 destroy_workqueue(md_wq);
9426 }
9427
9428 subsys_initcall(md_init);
9429 module_exit(md_exit)
9430
9431 static int get_ro(char *buffer, const struct kernel_param *kp)
9432 {
9433 return sprintf(buffer, "%d", start_readonly);
9434 }
9435 static int set_ro(const char *val, const struct kernel_param *kp)
9436 {
9437 return kstrtouint(val, 10, (unsigned int *)&start_readonly);
9438 }
9439
9440 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
9441 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
9442 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
9443 module_param(create_on_open, bool, S_IRUSR|S_IWUSR);
9444
9445 MODULE_LICENSE("GPL");
9446 MODULE_DESCRIPTION("MD RAID framework");
9447 MODULE_ALIAS("md");
9448 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);