// SPDX-License-Identifier: GPL-2.0-or-later
/*
   md.c : Multiple Devices driver for Linux
     Copyright (C) 1998, 1999, 2000 Ingo Molnar

     completely rewritten, based on the MD driver code from Marc Zyngier

   Changes:

   - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
   - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
   - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
   - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
   - kmod support by: Cyrus Durgin
   - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
   - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>

   - lots of fixes and improvements to the RAID1/RAID5 and generic
     RAID code (such as request based resynchronization):

     Neil Brown <neilb@cse.unsw.edu.au>.

   - persistent bitmap code
     Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.


   Errors, Warnings, etc.
   Please use:
     pr_crit() for error conditions that risk data loss
     pr_err() for error conditions that are unexpected, like an IO error
         or internal inconsistency
     pr_warn() for error conditions that could have been predicted, like
         adding a device to an array when it has incompatible metadata
     pr_info() for every interesting, very rare event, like an array starting
         or stopping, or resync starting or stopping
     pr_debug() for everything else.

*/

#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/kthread.h>
#include <linux/blkdev.h>
#include <linux/badblocks.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/proc_fs.h>
#include <linux/random.h>
#include <linux/module.h>
#include <linux/reboot.h>
#include <linux/file.h>
#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>
#include <linux/raid/detect.h>
#include <linux/slab.h>
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>

#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"

/* pers_list is a list of registered personalities protected
 * by pers_lock.
 * pers_lock does extra service to protect accesses to
 * mddev->thread when the mutex cannot be held.
 */
static LIST_HEAD(pers_list);
static DEFINE_SPINLOCK(pers_lock);

static struct kobj_type md_ktype;

struct md_cluster_operations *md_cluster_ops;
EXPORT_SYMBOL(md_cluster_ops);
static struct module *md_cluster_mod;

static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
static struct workqueue_struct *md_wq;
static struct workqueue_struct *md_misc_wq;
static struct workqueue_struct *md_rdev_misc_wq;

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);

/*
 * Default number of read corrections we'll attempt on an rdev
 * before ejecting it from the array. We divide the read error
 * count by 2 for every hour elapsed between read errors.
 */
#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
 * the RAID driver will use the maximum available bandwidth if the IO
 * subsystem is idle. There is also an 'absolute maximum' reconstruction
 * speed limit - in case reconstruction slows down your system despite
 * idle IO detection.
 *
 * You can change it via /proc/sys/dev/raid/speed_limit_{min,max}
 * or /sys/block/mdX/md/sync_speed_{min,max}.
 */
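
/*
 * For example, on a hypothetical array md0 (a usage sketch, not part of
 * this driver; values are in KB/sec):
 *
 *	echo 5000   > /proc/sys/dev/raid/speed_limit_min
 *	echo 100000 > /sys/block/md0/md/sync_speed_max
 */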

static int sysctl_speed_limit_min = 1000;
static int sysctl_speed_limit_max = 200000;
static inline int speed_min(struct mddev *mddev)
{
	return mddev->sync_speed_min ?
		mddev->sync_speed_min : sysctl_speed_limit_min;
}

static inline int speed_max(struct mddev *mddev)
{
	return mddev->sync_speed_max ?
		mddev->sync_speed_max : sysctl_speed_limit_max;
}

static void rdev_uninit_serial(struct md_rdev *rdev)
{
	if (!test_and_clear_bit(CollisionCheck, &rdev->flags))
		return;

	kvfree(rdev->serial);
	rdev->serial = NULL;
}

static void rdevs_uninit_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		rdev_uninit_serial(rdev);
}

static int rdev_init_serial(struct md_rdev *rdev)
{
	/* serial_nums equals BARRIER_BUCKETS_NR */
	int i, serial_nums = 1 << ((PAGE_SHIFT - ilog2(sizeof(atomic_t))));
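	/* e.g. with 4K pages and 4-byte atomic_t: 1 << (12 - 2) = 1024 buckets */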
	struct serial_in_rdev *serial = NULL;

	if (test_bit(CollisionCheck, &rdev->flags))
		return 0;

	serial = kvmalloc(sizeof(struct serial_in_rdev) * serial_nums,
			  GFP_KERNEL);
	if (!serial)
		return -ENOMEM;

	for (i = 0; i < serial_nums; i++) {
		struct serial_in_rdev *serial_tmp = &serial[i];

		spin_lock_init(&serial_tmp->serial_lock);
		serial_tmp->serial_rb = RB_ROOT_CACHED;
		init_waitqueue_head(&serial_tmp->serial_io_wait);
	}

	rdev->serial = serial;
	set_bit(CollisionCheck, &rdev->flags);

	return 0;
}

static int rdevs_init_serial(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int ret = 0;

	rdev_for_each(rdev, mddev) {
		ret = rdev_init_serial(rdev);
		if (ret)
			break;
	}

	/* Free all resources if the pool does not exist */
	if (ret && !mddev->serial_info_pool)
		rdevs_uninit_serial(mddev);

	return ret;
}

/*
 * rdev needs to enable serial stuff if it meets these conditions:
 * 1. it is a multi-queue device flagged with writemostly.
 * 2. the write-behind mode is enabled.
 */
static int rdev_need_serial(struct md_rdev *rdev)
{
	return (rdev && rdev->mddev->bitmap_info.max_write_behind > 0 &&
		rdev->bdev->bd_queue->nr_hw_queues != 1 &&
		test_bit(WriteMostly, &rdev->flags));
}

/*
 * Init resource for rdev(s), then create serial_info_pool if:
 * 1. rdev is the first device which returns true from rdev_enable_serial.
 * 2. rdev is NULL, meaning we want to enable serialization for all rdevs.
 */
void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
			      bool is_suspend)
{
	int ret = 0;

	if (rdev && !rdev_need_serial(rdev) &&
	    !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (!is_suspend)
		mddev_suspend(mddev);

	if (!rdev)
		ret = rdevs_init_serial(mddev);
	else
		ret = rdev_init_serial(rdev);
	if (ret)
		goto abort;

	if (mddev->serial_info_pool == NULL) {
		/*
		 * already in memalloc noio context by
		 * mddev_suspend()
		 */
		mddev->serial_info_pool =
			mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
						sizeof(struct serial_info));
		if (!mddev->serial_info_pool) {
			rdevs_uninit_serial(mddev);
			pr_err("can't alloc memory pool for serialization\n");
		}
	}

abort:
	if (!is_suspend)
		mddev_resume(mddev);
}

/*
 * Free resource from rdev(s), and destroy serial_info_pool under conditions:
 * 1. rdev is the last device flagged with CollisionCheck.
 * 2. when bitmap is destroyed while policy is not enabled.
 * 3. for disable policy, the pool is destroyed only when no rdev needs it.
 */
void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
			       bool is_suspend)
{
	if (rdev && !test_bit(CollisionCheck, &rdev->flags))
		return;

	if (mddev->serial_info_pool) {
		struct md_rdev *temp;
		int num = 0; /* used to track if other rdevs need the pool */

		if (!is_suspend)
			mddev_suspend(mddev);
		rdev_for_each(temp, mddev) {
			if (!rdev) {
				if (!mddev->serialize_policy ||
				    !rdev_need_serial(temp))
					rdev_uninit_serial(temp);
				else
					num++;
			} else if (temp != rdev &&
				   test_bit(CollisionCheck, &temp->flags))
				num++;
		}

		if (rdev)
			rdev_uninit_serial(rdev);

		if (num)
			pr_info("The mempool could be used by other devices\n");
		else {
			mempool_destroy(mddev->serial_info_pool);
			mddev->serial_info_pool = NULL;
		}
		if (!is_suspend)
			mddev_resume(mddev);
	}
}

static struct ctl_table_header *raid_table_header;

static struct ctl_table raid_table[] = {
	{
		.procname	= "speed_limit_min",
		.data		= &sysctl_speed_limit_min,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "speed_limit_max",
		.data		= &sysctl_speed_limit_max,
		.maxlen		= sizeof(int),
		.mode		= S_IRUGO|S_IWUSR,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table raid_dir_table[] = {
	{
		.procname	= "raid",
		.maxlen		= 0,
		.mode		= S_IRUGO|S_IXUGO,
		.child		= raid_table,
	},
	{ }
};

static struct ctl_table raid_root_table[] = {
	{
		.procname	= "dev",
		.maxlen		= 0,
		.mode		= 0555,
		.child		= raid_dir_table,
	},
	{ }
};

static const struct block_device_operations md_fops;

static int start_readonly;

/*
 * The original mechanism for creating an md device is to create
 * a device node in /dev and to open it. This causes races with device-close.
 * The preferred method is to write to the "new_array" module parameter.
 * This can avoid races.
 * Setting create_on_open to false disables the original mechanism
 * so all the races disappear.
 */
static bool create_on_open = true;

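/*
 * Usage sketch (assumes the usual md_mod parameter path; "md5" is just an
 * example name):
 *
 *	echo md5 > /sys/module/md_mod/parameters/new_array
 */
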
struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
			    struct mddev *mddev)
{
	if (!mddev || !bioset_initialized(&mddev->bio_set))
		return bio_alloc(gfp_mask, nr_iovecs);

	return bio_alloc_bioset(gfp_mask, nr_iovecs, &mddev->bio_set);
}
EXPORT_SYMBOL_GPL(bio_alloc_mddev);

static struct bio *md_bio_alloc_sync(struct mddev *mddev)
{
	if (!mddev || !bioset_initialized(&mddev->sync_set))
		return bio_alloc(GFP_NOIO, 1);

	return bio_alloc_bioset(GFP_NOIO, 1, &mddev->sync_set);
}

/*
 * We have a system wide 'event count' that is incremented
 * on any 'interesting' event, and readers of /proc/mdstat
 * can use 'poll' or 'select' to find out when the event
 * count increases.
 *
 * Events are:
 *  start array, stop array, error, add device, remove device,
 *  start build, activate spare
 */
static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
static atomic_t md_event_count;
void md_new_event(struct mddev *mddev)
{
	atomic_inc(&md_event_count);
	wake_up(&md_event_waiters);
}
EXPORT_SYMBOL_GPL(md_new_event);

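/*
 * Userspace sketch of waiting on these events (illustrative only; mdadm
 * does something similar): open /proc/mdstat, read it once, then
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLPRI };
 *	poll(&pfd, 1, -1);
 *
 * which returns once md_event_count has increased since the last read.
 */
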
/*
 * Enables iteration over all existing md arrays.
 * all_mddevs_lock protects this list.
 */
static LIST_HEAD(all_mddevs);
static DEFINE_SPINLOCK(all_mddevs_lock);

/*
 * iterates through all used mddevs in the system.
 * We take care to grab the all_mddevs_lock whenever navigating
 * the list, and to always hold a refcount when unlocked.
 * Any code which breaks out of this loop while owning
 * a reference to the current mddev must mddev_put it.
 */
#define for_each_mddev(_mddev,_tmp)					\
									\
	for (({ spin_lock(&all_mddevs_lock);				\
		_tmp = all_mddevs.next;					\
		_mddev = NULL;});					\
	     ({ if (_tmp != &all_mddevs)				\
			mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
		spin_unlock(&all_mddevs_lock);				\
		if (_mddev) mddev_put(_mddev);				\
		_mddev = list_entry(_tmp, struct mddev, all_mddevs);	\
		_tmp != &all_mddevs;});					\
	     ({ spin_lock(&all_mddevs_lock);				\
		_tmp = _tmp->next;})					\
		)

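/*
 * Typical use (a minimal sketch; md_do_sync() iterates arrays this way):
 *
 *	struct mddev *mddev;
 *	struct list_head *tmp;
 *
 *	for_each_mddev(mddev, tmp)
 *		pr_debug("considering %s\n", mdname(mddev));
 */
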
/* Rather than calling directly into the personality make_request function,
 * IO requests come here first so that we can check if the device is
 * being suspended pending a reconfiguration.
 * We hold a refcount over the call to ->make_request. By the time that
 * call has finished, the bio has been linked into some internal structure
 * and so is visible to ->quiesce(), so we don't need the refcount any more.
 */
static bool is_suspended(struct mddev *mddev, struct bio *bio)
{
	if (mddev->suspended)
		return true;
	if (bio_data_dir(bio) != WRITE)
		return false;
	if (mddev->suspend_lo >= mddev->suspend_hi)
		return false;
	if (bio->bi_iter.bi_sector >= mddev->suspend_hi)
		return false;
	if (bio_end_sector(bio) < mddev->suspend_lo)
		return false;
	return true;
}

void md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
	rcu_read_lock();
	if (is_suspended(mddev, bio)) {
		DEFINE_WAIT(__wait);
		for (;;) {
			prepare_to_wait(&mddev->sb_wait, &__wait,
					TASK_UNINTERRUPTIBLE);
			if (!is_suspended(mddev, bio))
				break;
			rcu_read_unlock();
			schedule();
			rcu_read_lock();
		}
		finish_wait(&mddev->sb_wait, &__wait);
	}
	atomic_inc(&mddev->active_io);
	rcu_read_unlock();

	if (!mddev->pers->make_request(mddev, bio)) {
		atomic_dec(&mddev->active_io);
		wake_up(&mddev->sb_wait);
		goto check_suspended;
	}

	if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
		wake_up(&mddev->sb_wait);
}
EXPORT_SYMBOL(md_handle_request);

static blk_qc_t md_make_request(struct request_queue *q, struct bio *bio)
{
	const int rw = bio_data_dir(bio);
	const int sgrp = op_stat_group(bio_op(bio));
	struct mddev *mddev = bio->bi_disk->private_data;
	unsigned int sectors;

	if (unlikely(test_bit(MD_BROKEN, &mddev->flags)) && (rw == WRITE)) {
		bio_io_error(bio);
		return BLK_QC_T_NONE;
	}

	blk_queue_split(q, &bio);

	if (mddev == NULL || mddev->pers == NULL) {
		bio_io_error(bio);
		return BLK_QC_T_NONE;
	}
	if (mddev->ro == 1 && unlikely(rw == WRITE)) {
		if (bio_sectors(bio) != 0)
			bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
		return BLK_QC_T_NONE;
	}

	/*
	 * save the sectors now since our bio can
	 * go away inside make_request
	 */
	sectors = bio_sectors(bio);
	/* bio could be mergeable after passing to underlayer */
	bio->bi_opf &= ~REQ_NOMERGE;

	md_handle_request(mddev, bio);

	part_stat_lock();
	part_stat_inc(&mddev->gendisk->part0, ios[sgrp]);
	part_stat_add(&mddev->gendisk->part0, sectors[sgrp], sectors);
	part_stat_unlock();

	return BLK_QC_T_NONE;
}

/* mddev_suspend makes sure no new requests are submitted
 * to the device, and that any requests that have been submitted
 * are completely handled.
 * Once mddev_detach() is called and completes, the module will be
 * completely unused.
 */
void mddev_suspend(struct mddev *mddev)
{
	WARN_ON_ONCE(mddev->thread && current == mddev->thread->tsk);
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (mddev->suspended++)
		return;
	synchronize_rcu();
	wake_up(&mddev->sb_wait);
	set_bit(MD_ALLOW_SB_UPDATE, &mddev->flags);
	smp_mb__after_atomic();
	wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
	mddev->pers->quiesce(mddev, 1);
	clear_bit_unlock(MD_ALLOW_SB_UPDATE, &mddev->flags);
	wait_event(mddev->sb_wait, !test_bit(MD_UPDATING_SB, &mddev->flags));

	del_timer_sync(&mddev->safemode_timer);
	/* restrict memory reclaim I/O while the raid array is suspended */
	mddev->noio_flag = memalloc_noio_save();
}
EXPORT_SYMBOL_GPL(mddev_suspend);

void mddev_resume(struct mddev *mddev)
{
	/* leave the memalloc scope entered in mddev_suspend() */
	memalloc_noio_restore(mddev->noio_flag);
	lockdep_assert_held(&mddev->reconfig_mutex);
	if (--mddev->suspended)
		return;
	wake_up(&mddev->sb_wait);
	mddev->pers->quiesce(mddev, 0);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
}
EXPORT_SYMBOL_GPL(mddev_resume);

int mddev_congested(struct mddev *mddev, int bits)
{
	struct md_personality *pers = mddev->pers;
	int ret = 0;

	rcu_read_lock();
	if (mddev->suspended)
		ret = 1;
	else if (pers && pers->congested)
		ret = pers->congested(mddev, bits);
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(mddev_congested);
static int md_congested(void *data, int bits)
{
	struct mddev *mddev = data;
	return mddev_congested(mddev, bits);
}

/*
 * Generic flush handling for md
 */

static void md_end_flush(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	rdev_dec_pending(rdev, mddev);

	if (atomic_dec_and_test(&mddev->flush_pending)) {
		/* The pre-request flush has finished */
		queue_work(md_wq, &mddev->flush_work);
	}
	bio_put(bio);
}

static void md_submit_flush_data(struct work_struct *ws);

static void submit_flushes(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct md_rdev *rdev;

	mddev->start_flush = ktime_get_boottime();
	INIT_WORK(&mddev->flush_work, md_submit_flush_data);
	atomic_set(&mddev->flush_pending, 1);
	rcu_read_lock();
	rdev_for_each_rcu(rdev, mddev)
		if (rdev->raid_disk >= 0 &&
		    !test_bit(Faulty, &rdev->flags)) {
			/* Take two references, one is dropped
			 * when request finishes, one after
			 * we reclaim rcu_read_lock
			 */
			struct bio *bi;
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			rcu_read_unlock();
			bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
			bi->bi_end_io = md_end_flush;
			bi->bi_private = rdev;
			bio_set_dev(bi, rdev->bdev);
			bi->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
			atomic_inc(&mddev->flush_pending);
			submit_bio(bi);
			rcu_read_lock();
			rdev_dec_pending(rdev, mddev);
		}
	rcu_read_unlock();
	if (atomic_dec_and_test(&mddev->flush_pending))
		queue_work(md_wq, &mddev->flush_work);
}

static void md_submit_flush_data(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, flush_work);
	struct bio *bio = mddev->flush_bio;

	/*
	 * must reset flush_bio before calling into md_handle_request to
	 * avoid a deadlock: other bios that already passed the
	 * md_handle_request suspend check could wait on this one, while the
	 * md_handle_request below could in turn wait on those bios because
	 * of the same suspend check.
	 */
	mddev->last_flush = mddev->start_flush;
	mddev->flush_bio = NULL;
	wake_up(&mddev->sb_wait);

	if (bio->bi_iter.bi_size == 0) {
		/* an empty barrier - all done */
		bio_endio(bio);
	} else {
		bio->bi_opf &= ~REQ_PREFLUSH;
		md_handle_request(mddev, bio);
	}
}

/*
 * Manages consolidation of flushes and submitting any flushes needed for
 * a bio with REQ_PREFLUSH. Returns true if the bio is finished or is
 * being finished in another context. Returns false if the flushing is
 * complete but still needs the I/O portion of the bio to be processed.
 */
bool md_flush_request(struct mddev *mddev, struct bio *bio)
{
	ktime_t start = ktime_get_boottime();
	spin_lock_irq(&mddev->lock);
	wait_event_lock_irq(mddev->sb_wait,
			    !mddev->flush_bio ||
			    ktime_after(mddev->last_flush, start),
			    mddev->lock);
	if (!ktime_after(mddev->last_flush, start)) {
		WARN_ON(mddev->flush_bio);
		mddev->flush_bio = bio;
		bio = NULL;
	}
	spin_unlock_irq(&mddev->lock);

	if (!bio) {
		INIT_WORK(&mddev->flush_work, submit_flushes);
		queue_work(md_wq, &mddev->flush_work);
	} else {
		/* flush was performed for some other bio while we waited. */
		if (bio->bi_iter.bi_size == 0)
			/* an empty barrier - all done */
			bio_endio(bio);
		else {
			bio->bi_opf &= ~REQ_PREFLUSH;
			return false;
		}
	}
	return true;
}
EXPORT_SYMBOL(md_flush_request);

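/*
 * Caller pattern (a sketch of how personalities use this from their
 * make_request methods, e.g. raid1):
 *
 *	if (unlikely(bio->bi_opf & REQ_PREFLUSH)
 *	    && md_flush_request(mddev, bio))
 *		return true;
 *
 * A true return means the bio was finished elsewhere; on false the caller
 * must still process the data portion of the bio itself.
 */
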
static inline struct mddev *mddev_get(struct mddev *mddev)
{
	atomic_inc(&mddev->active);
	return mddev;
}

static void mddev_delayed_delete(struct work_struct *ws);

static void mddev_put(struct mddev *mddev)
{
	if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
		return;
	if (!mddev->raid_disks && list_empty(&mddev->disks) &&
	    mddev->ctime == 0 && !mddev->hold_active) {
		/* Array is not configured at all, and not held active,
		 * so destroy it */
		list_del_init(&mddev->all_mddevs);

		/*
		 * Call queue_work inside the spinlock so that
		 * flush_workqueue() after mddev_find will succeed in waiting
		 * for the work to be done.
		 */
		INIT_WORK(&mddev->del_work, mddev_delayed_delete);
		queue_work(md_misc_wq, &mddev->del_work);
	}
	spin_unlock(&all_mddevs_lock);
}

static void md_safemode_timeout(struct timer_list *t);

void mddev_init(struct mddev *mddev)
{
	kobject_init(&mddev->kobj, &md_ktype);
	mutex_init(&mddev->open_mutex);
	mutex_init(&mddev->reconfig_mutex);
	mutex_init(&mddev->bitmap_info.mutex);
	INIT_LIST_HEAD(&mddev->disks);
	INIT_LIST_HEAD(&mddev->all_mddevs);
	timer_setup(&mddev->safemode_timer, md_safemode_timeout, 0);
	atomic_set(&mddev->active, 1);
	atomic_set(&mddev->openers, 0);
	atomic_set(&mddev->active_io, 0);
	spin_lock_init(&mddev->lock);
	atomic_set(&mddev->flush_pending, 0);
	init_waitqueue_head(&mddev->sb_wait);
	init_waitqueue_head(&mddev->recovery_wait);
	mddev->reshape_position = MaxSector;
	mddev->reshape_backwards = 0;
	mddev->last_sync_action = "none";
	mddev->resync_min = 0;
	mddev->resync_max = MaxSector;
	mddev->level = LEVEL_NONE;
}
EXPORT_SYMBOL_GPL(mddev_init);

static struct mddev *mddev_find(dev_t unit)
{
	struct mddev *mddev, *new = NULL;

	if (unit && MAJOR(unit) != MD_MAJOR)
		unit &= ~((1<<MdpMinorShift)-1);

 retry:
	spin_lock(&all_mddevs_lock);

	if (unit) {
		list_for_each_entry(mddev, &all_mddevs, all_mddevs)
			if (mddev->unit == unit) {
				mddev_get(mddev);
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return mddev;
			}

		if (new) {
			list_add(&new->all_mddevs, &all_mddevs);
			spin_unlock(&all_mddevs_lock);
			new->hold_active = UNTIL_IOCTL;
			return new;
		}
	} else if (new) {
		/* find an unused unit number */
		static int next_minor = 512;
		int start = next_minor;
		int is_free = 0;
		int dev = 0;
		while (!is_free) {
			dev = MKDEV(MD_MAJOR, next_minor);
			next_minor++;
			if (next_minor > MINORMASK)
				next_minor = 0;
			if (next_minor == start) {
				/* Oh dear, all in use. */
				spin_unlock(&all_mddevs_lock);
				kfree(new);
				return NULL;
			}

			is_free = 1;
			list_for_each_entry(mddev, &all_mddevs, all_mddevs)
				if (mddev->unit == dev) {
					is_free = 0;
					break;
				}
		}
		new->unit = dev;
		new->md_minor = MINOR(dev);
		new->hold_active = UNTIL_STOP;
		list_add(&new->all_mddevs, &all_mddevs);
		spin_unlock(&all_mddevs_lock);
		return new;
	}
	spin_unlock(&all_mddevs_lock);

	new = kzalloc(sizeof(*new), GFP_KERNEL);
	if (!new)
		return NULL;

	new->unit = unit;
	if (MAJOR(unit) == MD_MAJOR)
		new->md_minor = MINOR(unit);
	else
		new->md_minor = MINOR(unit) >> MdpMinorShift;

	mddev_init(new);

	goto retry;
}

static struct attribute_group md_redundancy_group;

void mddev_unlock(struct mddev *mddev)
{
	if (mddev->to_remove) {
		/* These cannot be removed under reconfig_mutex as
		 * an access to the files will try to take reconfig_mutex
		 * while holding the file unremovable, which leads to
		 * a deadlock.
		 * So we set sysfs_active while the remove is happening,
		 * and anything else which might set ->to_remove or may
		 * otherwise change the sysfs namespace will fail with
		 * -EBUSY if sysfs_active is still set.
		 * We set sysfs_active under reconfig_mutex and elsewhere
		 * test it under the same mutex to ensure its correct value
		 * is seen.
		 */
		struct attribute_group *to_remove = mddev->to_remove;
		mddev->to_remove = NULL;
		mddev->sysfs_active = 1;
		mutex_unlock(&mddev->reconfig_mutex);

		if (mddev->kobj.sd) {
			if (to_remove != &md_redundancy_group)
				sysfs_remove_group(&mddev->kobj, to_remove);
			if (mddev->pers == NULL ||
			    mddev->pers->sync_request == NULL) {
				sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
				if (mddev->sysfs_action)
					sysfs_put(mddev->sysfs_action);
				mddev->sysfs_action = NULL;
			}
		}
		mddev->sysfs_active = 0;
	} else
		mutex_unlock(&mddev->reconfig_mutex);

	/* As we've dropped the mutex we need a spinlock to
	 * make sure the thread doesn't disappear
	 */
	spin_lock(&pers_lock);
	md_wakeup_thread(mddev->thread);
	wake_up(&mddev->sb_wait);
	spin_unlock(&pers_lock);
}
EXPORT_SYMBOL_GPL(mddev_unlock);

struct md_rdev *md_find_rdev_nr_rcu(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->desc_nr == nr)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_nr_rcu);

static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}

struct md_rdev *md_find_rdev_rcu(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;

	rdev_for_each_rcu(rdev, mddev)
		if (rdev->bdev->bd_dev == dev)
			return rdev;

	return NULL;
}
EXPORT_SYMBOL_GPL(md_find_rdev_rcu);

static struct md_personality *find_pers(int level, char *clevel)
{
	struct md_personality *pers;
	list_for_each_entry(pers, &pers_list, list) {
		if (level != LEVEL_NONE && pers->level == level)
			return pers;
		if (strcmp(pers->name, clevel)==0)
			return pers;
	}
	return NULL;
}

/* return the offset of the super block in 512byte sectors */
static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
{
	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
	return MD_NEW_SIZE_SECTORS(num_sectors);
}

static int alloc_disk_sb(struct md_rdev *rdev)
{
	rdev->sb_page = alloc_page(GFP_KERNEL);
	if (!rdev->sb_page)
		return -ENOMEM;
	return 0;
}

void md_rdev_clear(struct md_rdev *rdev)
{
	if (rdev->sb_page) {
		put_page(rdev->sb_page);
		rdev->sb_loaded = 0;
		rdev->sb_page = NULL;
		rdev->sb_start = 0;
		rdev->sectors = 0;
	}
	if (rdev->bb_page) {
		put_page(rdev->bb_page);
		rdev->bb_page = NULL;
	}
	badblocks_exit(&rdev->badblocks);
}
EXPORT_SYMBOL_GPL(md_rdev_clear);

static void super_written(struct bio *bio)
{
	struct md_rdev *rdev = bio->bi_private;
	struct mddev *mddev = rdev->mddev;

	if (bio->bi_status) {
		pr_err("md: super_written gets error=%d\n", bio->bi_status);
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags)
		    && (bio->bi_opf & MD_FAILFAST)) {
			set_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags);
			set_bit(LastDev, &rdev->flags);
		}
	} else
		clear_bit(LastDev, &rdev->flags);

	if (atomic_dec_and_test(&mddev->pending_writes))
		wake_up(&mddev->sb_wait);
	rdev_dec_pending(rdev, mddev);
	bio_put(bio);
}

void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
		    sector_t sector, int size, struct page *page)
{
	/* write first size bytes of page to sector of rdev
	 * Increment mddev->pending_writes before returning
	 * and decrement it on completion, waking up sb_wait
	 * if zero is reached.
	 * If an error occurred, call md_error
	 */
	struct bio *bio;
	int ff = 0;

	if (!page)
		return;

	if (test_bit(Faulty, &rdev->flags))
		return;

	bio = md_bio_alloc_sync(mddev);

	atomic_inc(&rdev->nr_pending);

	bio_set_dev(bio, rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev);
	bio->bi_iter.bi_sector = sector;
	bio_add_page(bio, page, size, 0);
	bio->bi_private = rdev;
	bio->bi_end_io = super_written;

	if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) &&
	    test_bit(FailFast, &rdev->flags) &&
	    !test_bit(LastDev, &rdev->flags))
		ff = MD_FAILFAST;
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA | ff;

	atomic_inc(&mddev->pending_writes);
	submit_bio(bio);
}

int md_super_wait(struct mddev *mddev)
{
	/* wait for all superblock writes that were scheduled to complete */
	wait_event(mddev->sb_wait, atomic_read(&mddev->pending_writes)==0);
	if (test_and_clear_bit(MD_SB_NEED_REWRITE, &mddev->sb_flags))
		return -EAGAIN;
	return 0;
}

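/*
 * Writers pair these two (a sketch; see super_90_rdev_size_change() below
 * for the retry-until-success form):
 *
 *	md_super_write(mddev, rdev, rdev->sb_start, rdev->sb_size,
 *		       rdev->sb_page);
 *	if (md_super_wait(mddev) < 0)
 *		... a rewrite was requested, submit again ...
 */
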
int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
		 struct page *page, int op, int op_flags, bool metadata_op)
{
	struct bio *bio = md_bio_alloc_sync(rdev->mddev);
	int ret;

	if (metadata_op && rdev->meta_bdev)
		bio_set_dev(bio, rdev->meta_bdev);
	else
		bio_set_dev(bio, rdev->bdev);
	bio_set_op_attrs(bio, op, op_flags);
	if (metadata_op)
		bio->bi_iter.bi_sector = sector + rdev->sb_start;
	else if (rdev->mddev->reshape_position != MaxSector &&
		 (rdev->mddev->reshape_backwards ==
		  (sector >= rdev->mddev->reshape_position)))
		bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
	else
		bio->bi_iter.bi_sector = sector + rdev->data_offset;
	bio_add_page(bio, page, size, 0);

	submit_bio_wait(bio);

	ret = !bio->bi_status;
	bio_put(bio);
	return ret;
}
EXPORT_SYMBOL_GPL(sync_page_io);

static int read_disk_sb(struct md_rdev *rdev, int size)
{
	char b[BDEVNAME_SIZE];

	if (rdev->sb_loaded)
		return 0;

	if (!sync_page_io(rdev, 0, size, rdev->sb_page, REQ_OP_READ, 0, true))
		goto fail;
	rdev->sb_loaded = 1;
	return 0;

fail:
	pr_err("md: disabled device %s, could not read superblock.\n",
	       bdevname(rdev->bdev,b));
	return -EINVAL;
}

static int md_uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	return	sb1->set_uuid0 == sb2->set_uuid0 &&
		sb1->set_uuid1 == sb2->set_uuid1 &&
		sb1->set_uuid2 == sb2->set_uuid2 &&
		sb1->set_uuid3 == sb2->set_uuid3;
}

static int md_sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
{
	int ret;
	mdp_super_t *tmp1, *tmp2;

	tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
	tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);

	if (!tmp1 || !tmp2) {
		ret = 0;
		goto abort;
	}

	*tmp1 = *sb1;
	*tmp2 = *sb2;

	/*
	 * nr_disks is not constant
	 */
	tmp1->nr_disks = 0;
	tmp2->nr_disks = 0;

	ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
abort:
	kfree(tmp1);
	kfree(tmp2);
	return ret;
}

static u32 md_csum_fold(u32 csum)
{
	csum = (csum & 0xffff) + (csum >> 16);
	return (csum & 0xffff) + (csum >> 16);
}

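/*
 * e.g. md_csum_fold(0x00012345):
 *	first fold:  0x2345 + 0x0001 = 0x2346
 *	second fold: 0x2346 + 0x0000 = 0x2346
 */
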
static unsigned int calc_sb_csum(mdp_super_t *sb)
{
	u64 newcsum = 0;
	u32 *sb32 = (u32*)sb;
	int i;
	unsigned int disk_csum, csum;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;

	for (i = 0; i < MD_SB_BYTES/4 ; i++)
		newcsum += sb32[i];
	csum = (newcsum & 0xffffffff) + (newcsum>>32);

#ifdef CONFIG_ALPHA
	/* This used to use csum_partial, which was wrong for several
	 * reasons including that different results are returned on
	 * different architectures. It isn't critical that we get exactly
	 * the same return value as before (we always csum_fold before
	 * testing, and that removes any differences). However as we
	 * know that csum_partial always returned a 16bit value on
	 * alphas, do a fold to maximise conformity to previous behaviour.
	 */
	sb->sb_csum = md_csum_fold(disk_csum);
#else
	sb->sb_csum = disk_csum;
#endif
	return csum;
}

/*
 * Handle superblock details.
 * We want to be able to handle multiple superblock formats
 * so we have a common interface to them all, and an array of
 * different handlers.
 * We rely on user-space to write the initial superblock, and support
 * reading and updating of superblocks.
 * Interface methods are:
 *   int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
 *      loads and validates a superblock on dev.
 *      if refdev != NULL, compare superblocks on both devices
 *    Return:
 *      0 - dev has a superblock that is compatible with refdev
 *      1 - dev has a superblock that is compatible and newer than refdev
 *          so dev should be used as the refdev in future
 *     -EINVAL superblock incompatible or invalid
 *     -othererror e.g. -EIO
 *
 *   int validate_super(struct mddev *mddev, struct md_rdev *dev)
 *      Verify that dev is acceptable into mddev.
 *       The first time, mddev->raid_disks will be 0, and data from
 *       dev should be merged in.  Subsequent calls check that dev
 *       is new enough.  Return 0 or -EINVAL
 *
 *   void sync_super(struct mddev *mddev, struct md_rdev *dev)
 *     Update the superblock for rdev with data in mddev
 *     This does not write to disc.
 *
 */

struct super_type  {
	char		    *name;
	struct module	    *owner;
	int		    (*load_super)(struct md_rdev *rdev,
					  struct md_rdev *refdev,
					  int minor_version);
	int		    (*validate_super)(struct mddev *mddev,
					      struct md_rdev *rdev);
	void		    (*sync_super)(struct mddev *mddev,
					  struct md_rdev *rdev);
	unsigned long long  (*rdev_size_change)(struct md_rdev *rdev,
						sector_t num_sectors);
	int		    (*allow_new_offset)(struct md_rdev *rdev,
						unsigned long long new_offset);
};

/*
 * Check that the given mddev has no bitmap.
 *
 * This function is called from the run method of all personalities that do not
 * support bitmaps. It prints an error message and returns non-zero if mddev
 * has a bitmap. Otherwise, it returns 0.
 *
 */
int md_check_no_bitmap(struct mddev *mddev)
{
	if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
		return 0;
	pr_warn("%s: bitmaps are not supported for %s\n",
		mdname(mddev), mddev->pers->name);
	return 1;
}
EXPORT_SYMBOL(md_check_no_bitmap);

/*
 * load_super for 0.90.0
 */
static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	mdp_super_t *sb;
	int ret;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock (512byte sectors),
	 * it's at the end of the disk.
	 *
	 * It also happens to be a multiple of 4Kb.
	 */
	rdev->sb_start = calc_dev_sboffset(rdev);

	ret = read_disk_sb(rdev, MD_SB_BYTES);
	if (ret)
		return ret;

	ret = -EINVAL;

	bdevname(rdev->bdev, b);
	sb = page_address(rdev->sb_page);

	if (sb->md_magic != MD_SB_MAGIC) {
		pr_warn("md: invalid raid superblock magic on %s\n", b);
		goto abort;
	}

	if (sb->major_version != 0 ||
	    sb->minor_version < 90 ||
	    sb->minor_version > 91) {
		pr_warn("Bad version number %d.%d on %s\n",
			sb->major_version, sb->minor_version, b);
		goto abort;
	}

	if (sb->raid_disks <= 0)
		goto abort;

	if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
		pr_warn("md: invalid superblock checksum on %s\n", b);
		goto abort;
	}

	rdev->preferred_minor = sb->md_minor;
	rdev->data_offset = 0;
	rdev->new_data_offset = 0;
	rdev->sb_size = MD_SB_BYTES;
	rdev->badblocks.shift = -1;

	if (sb->level == LEVEL_MULTIPATH)
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = sb->this_disk.number;

	/* not spare disk, or LEVEL_MULTIPATH */
	if (sb->level == LEVEL_MULTIPATH ||
		(rdev->desc_nr >= 0 &&
		 rdev->desc_nr < MD_SB_DISKS &&
		 sb->disks[rdev->desc_nr].state &
		 ((1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))))
		spare_disk = false;

	if (!refdev) {
		if (!spare_disk)
			ret = 1;
		else
			ret = 0;
	} else {
		__u64 ev1, ev2;
		mdp_super_t *refsb = page_address(refdev->sb_page);
		if (!md_uuid_equal(refsb, sb)) {
			pr_warn("md: %s has different UUID to %s\n",
				b, bdevname(refdev->bdev,b2));
			goto abort;
		}
		if (!md_sb_equal(refsb, sb)) {
			pr_warn("md: %s has same UUID but different superblock to %s\n",
				b, bdevname(refdev->bdev, b2));
			goto abort;
		}
		ev1 = md_event(sb);
		ev2 = md_event(refsb);

		if (!spare_disk && ev1 > ev2)
			ret = 1;
		else
			ret = 0;
	}
	rdev->sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * (not needed for Linear and RAID0 as metadata doesn't
	 * record this size)
	 */
	if ((u64)rdev->sectors >= (2ULL << 32) && sb->level >= 1)
		rdev->sectors = (sector_t)(2ULL << 32) - 2;

	if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
		/* "this cannot possibly happen" ... */
		ret = -EINVAL;

 abort:
	return ret;
}

/*
 * validate_super for 0.90.0
 */
static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_disk_t *desc;
	mdp_super_t *sb = page_address(rdev->sb_page);
	__u64 ev1 = md_event(sb);

	rdev->raid_disk = -1;
	clear_bit(Faulty, &rdev->flags);
	clear_bit(In_sync, &rdev->flags);
	clear_bit(Bitmap_sync, &rdev->flags);
	clear_bit(WriteMostly, &rdev->flags);

	if (mddev->raid_disks == 0) {
		mddev->major_version = 0;
		mddev->minor_version = sb->minor_version;
		mddev->patch_version = sb->patch_version;
		mddev->external = 0;
		mddev->chunk_sectors = sb->chunk_size >> 9;
		mddev->ctime = sb->ctime;
		mddev->utime = sb->utime;
		mddev->level = sb->level;
		mddev->clevel[0] = 0;
		mddev->layout = sb->layout;
		mddev->raid_disks = sb->raid_disks;
		mddev->dev_sectors = ((sector_t)sb->size) * 2;
		mddev->events = ev1;
		mddev->bitmap_info.offset = 0;
		mddev->bitmap_info.space = 0;
		/* bitmap can use 60 K after the 4K superblocks */
		mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
		mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
		mddev->reshape_backwards = 0;

		if (mddev->minor_version >= 91) {
			mddev->reshape_position = sb->reshape_position;
			mddev->delta_disks = sb->delta_disks;
			mddev->new_level = sb->new_level;
			mddev->new_layout = sb->new_layout;
			mddev->new_chunk_sectors = sb->new_chunk >> 9;
			if (mddev->delta_disks < 0)
				mddev->reshape_backwards = 1;
		} else {
			mddev->reshape_position = MaxSector;
			mddev->delta_disks = 0;
			mddev->new_level = mddev->level;
			mddev->new_layout = mddev->layout;
			mddev->new_chunk_sectors = mddev->chunk_sectors;
		}
		if (mddev->level == 0)
			mddev->layout = -1;

		if (sb->state & (1<<MD_SB_CLEAN))
			mddev->recovery_cp = MaxSector;
		else {
			if (sb->events_hi == sb->cp_events_hi &&
			    sb->events_lo == sb->cp_events_lo) {
				mddev->recovery_cp = sb->recovery_cp;
			} else
				mddev->recovery_cp = 0;
		}

		memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
		memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
		memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
		memcpy(mddev->uuid+12,&sb->set_uuid3, 4);

		mddev->max_disks = MD_SB_DISKS;

		if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
		    mddev->bitmap_info.file == NULL) {
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
		}

	} else if (mddev->pers == NULL) {
		/* Insist on good event counter while assembling, except
		 * for spares (which don't need an event count) */
		++ev1;
		if (sb->disks[rdev->desc_nr].state & (
			(1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
			if (ev1 < mddev->events)
				return -EINVAL;
	} else if (mddev->bitmap) {
		/* if adding to array with a bitmap, then we can accept an
		 * older device ... but not too old.
		 */
		if (ev1 < mddev->bitmap->events_cleared)
			return 0;
		if (ev1 < mddev->events)
			set_bit(Bitmap_sync, &rdev->flags);
	} else {
		if (ev1 < mddev->events)
			/* just a hot-add of a new device, leave raid_disk at -1 */
			return 0;
	}

	if (mddev->level != LEVEL_MULTIPATH) {
		desc = sb->disks + rdev->desc_nr;

		if (desc->state & (1<<MD_DISK_FAULTY))
			set_bit(Faulty, &rdev->flags);
		else if (desc->state & (1<<MD_DISK_SYNC) /* &&
			    desc->raid_disk < mddev->raid_disks */) {
			set_bit(In_sync, &rdev->flags);
			rdev->raid_disk = desc->raid_disk;
			rdev->saved_raid_disk = desc->raid_disk;
		} else if (desc->state & (1<<MD_DISK_ACTIVE)) {
			/* active but not in sync implies recovery up to
			 * reshape position.  We don't know exactly where
			 * that is, so set to zero for now */
			if (mddev->minor_version >= 91) {
				rdev->recovery_offset = 0;
				rdev->raid_disk = desc->raid_disk;
			}
		}
		if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		if (desc->state & (1<<MD_DISK_FAILFAST))
			set_bit(FailFast, &rdev->flags);
	} else /* MULTIPATH are always insync */
		set_bit(In_sync, &rdev->flags);
	return 0;
}

/*
 * sync_super for 0.90.0
 */
static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
{
	mdp_super_t *sb;
	struct md_rdev *rdev2;
	int next_spare = mddev->raid_disks;

	/* make rdev->sb match mddev data..
	 *
	 * 1/ zero out disks
	 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
	 * 3/ any empty disks < next_spare become removed
	 *
	 * disks[0] gets initialised to REMOVED because
	 * we cannot be sure from other fields if it has
	 * been initialised or not.
	 */
	int i;
	int active=0, working=0,failed=0,spare=0,nr_disks=0;

	rdev->sb_size = MD_SB_BYTES;

	sb = page_address(rdev->sb_page);

	memset(sb, 0, sizeof(*sb));

	sb->md_magic = MD_SB_MAGIC;
	sb->major_version = mddev->major_version;
	sb->patch_version = mddev->patch_version;
	sb->gvalid_words  = 0; /* ignored */
	memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
	memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
	memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
	memcpy(&sb->set_uuid3, mddev->uuid+12,4);

	sb->ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
	sb->level = mddev->level;
	sb->size = mddev->dev_sectors / 2;
	sb->raid_disks = mddev->raid_disks;
	sb->md_minor = mddev->md_minor;
	sb->not_persistent = 0;
	sb->utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
	sb->state = 0;
	sb->events_hi = (mddev->events>>32);
	sb->events_lo = (u32)mddev->events;

	if (mddev->reshape_position == MaxSector)
		sb->minor_version = 90;
	else {
		sb->minor_version = 91;
		sb->reshape_position = mddev->reshape_position;
		sb->new_level = mddev->new_level;
		sb->delta_disks = mddev->delta_disks;
		sb->new_layout = mddev->new_layout;
		sb->new_chunk = mddev->new_chunk_sectors << 9;
	}
	mddev->minor_version = sb->minor_version;
	if (mddev->in_sync)
	{
		sb->recovery_cp = mddev->recovery_cp;
		sb->cp_events_hi = (mddev->events>>32);
		sb->cp_events_lo = (u32)mddev->events;
		if (mddev->recovery_cp == MaxSector)
			sb->state = (1<< MD_SB_CLEAN);
	} else
		sb->recovery_cp = 0;

	sb->layout = mddev->layout;
	sb->chunk_size = mddev->chunk_sectors << 9;

	if (mddev->bitmap && mddev->bitmap_info.file == NULL)
		sb->state |= (1<<MD_SB_BITMAP_PRESENT);

	sb->disks[0].state = (1<<MD_DISK_REMOVED);
	rdev_for_each(rdev2, mddev) {
		mdp_disk_t *d;
		int desc_nr;
		int is_active = test_bit(In_sync, &rdev2->flags);

		if (rdev2->raid_disk >= 0 &&
		    sb->minor_version >= 91)
			/* we have nowhere to store the recovery_offset,
			 * but if it is not below the reshape_position,
			 * we can piggy-back on that.
			 */
			is_active = 1;
		if (rdev2->raid_disk < 0 ||
		    test_bit(Faulty, &rdev2->flags))
			is_active = 0;
		if (is_active)
			desc_nr = rdev2->raid_disk;
		else
			desc_nr = next_spare++;
		rdev2->desc_nr = desc_nr;
		d = &sb->disks[rdev2->desc_nr];
		nr_disks++;
		d->number = rdev2->desc_nr;
		d->major = MAJOR(rdev2->bdev->bd_dev);
		d->minor = MINOR(rdev2->bdev->bd_dev);
		if (is_active)
			d->raid_disk = rdev2->raid_disk;
		else
			d->raid_disk = rdev2->desc_nr; /* compatibility */
		if (test_bit(Faulty, &rdev2->flags))
			d->state = (1<<MD_DISK_FAULTY);
		else if (is_active) {
			d->state = (1<<MD_DISK_ACTIVE);
			if (test_bit(In_sync, &rdev2->flags))
				d->state |= (1<<MD_DISK_SYNC);
			active++;
			working++;
		} else {
			d->state = 0;
			spare++;
			working++;
		}
		if (test_bit(WriteMostly, &rdev2->flags))
			d->state |= (1<<MD_DISK_WRITEMOSTLY);
		if (test_bit(FailFast, &rdev2->flags))
			d->state |= (1<<MD_DISK_FAILFAST);
	}
	/* now set the "removed" and "faulty" bits on any missing devices */
	for (i=0 ; i < mddev->raid_disks ; i++) {
		mdp_disk_t *d = &sb->disks[i];
		if (d->state == 0 && d->number == 0) {
			d->number = i;
			d->raid_disk = i;
			d->state = (1<<MD_DISK_REMOVED);
			d->state |= (1<<MD_DISK_FAULTY);
			failed++;
		}
	}
	sb->nr_disks = nr_disks;
	sb->active_disks = active;
	sb->working_disks = working;
	sb->failed_disks = failed;
	sb->spare_disks = spare;

	sb->this_disk = sb->disks[rdev->desc_nr];
	sb->sb_csum = calc_sb_csum(sb);
}

/*
 * rdev_size_change for 0.90.0
 */
static unsigned long long
super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
{
	if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
		return 0; /* component must fit device */
	if (rdev->mddev->bitmap_info.offset)
		return 0; /* can't move bitmap */
	rdev->sb_start = calc_dev_sboffset(rdev);
	if (!num_sectors || num_sectors > rdev->sb_start)
		num_sectors = rdev->sb_start;
	/* Limit to 4TB as metadata cannot record more than that.
	 * 4TB == 2^32 KB, or 2*2^32 sectors.
	 */
	if ((u64)num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
		num_sectors = (sector_t)(2ULL << 32) - 2;
	do {
		md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
			       rdev->sb_page);
	} while (md_super_wait(rdev->mddev) < 0);
	return num_sectors;
}

static int
super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
{
	/* non-zero offset changes not possible with v0.90 */
	return new_offset == 0;
}

/*
 * version 1 superblock
 */

static __le32 calc_sb_1_csum(struct mdp_superblock_1 *sb)
{
	__le32 disk_csum;
	u32 csum;
	unsigned long long newcsum;
	int size = 256 + le32_to_cpu(sb->max_dev)*2;
	__le32 *isuper = (__le32*)sb;

	disk_csum = sb->sb_csum;
	sb->sb_csum = 0;
	newcsum = 0;
	for (; size >= 4; size -= 4)
		newcsum += le32_to_cpu(*isuper++);

	if (size == 2)
		newcsum += le16_to_cpu(*(__le16*) isuper);

	csum = (newcsum & 0xffffffff) + (newcsum >> 32);
	sb->sb_csum = disk_csum;
	return cpu_to_le32(csum);
}

static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
{
	struct mdp_superblock_1 *sb;
	int ret;
	sector_t sb_start;
	sector_t sectors;
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	int bmask;
	bool spare_disk = true;

	/*
	 * Calculate the position of the superblock in 512byte sectors.
	 * It is always aligned to a 4K boundary and
	 * depending on minor_version, it can be:
	 * 0: At least 8K, but less than 12K, from end of device
	 * 1: At start of device
	 * 2: 4K from start of device.
	 */
	switch(minor_version) {
	case 0:
		sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
		sb_start -= 8*2;
		sb_start &= ~(sector_t)(4*2-1);
		break;
	case 1:
		sb_start = 0;
		break;
	case 2:
		sb_start = 8;
		break;
	default:
		return -EINVAL;
	}
	rdev->sb_start = sb_start;

	/* superblock is rarely larger than 1K, but it can be larger,
	 * and it is safe to read 4k, so we do that
	 */
	ret = read_disk_sb(rdev, 4096);
	if (ret) return ret;

	sb = page_address(rdev->sb_page);

	if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
	    sb->major_version != cpu_to_le32(1) ||
	    le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
	    le64_to_cpu(sb->super_offset) != rdev->sb_start ||
	    (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
		return -EINVAL;

	if (calc_sb_1_csum(sb) != sb->sb_csum) {
		pr_warn("md: invalid superblock checksum on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (le64_to_cpu(sb->data_size) < 10) {
		pr_warn("md: data_size too small on %s\n",
			bdevname(rdev->bdev,b));
		return -EINVAL;
	}
	if (sb->pad0 ||
	    sb->pad3[0] ||
	    memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
		/* Some padding is non-zero, might be a new feature */
		return -EINVAL;

	rdev->preferred_minor = 0xffff;
	rdev->data_offset = le64_to_cpu(sb->data_offset);
	rdev->new_data_offset = rdev->data_offset;
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
		rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));

	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
	if (rdev->sb_size & bmask)
		rdev->sb_size = (rdev->sb_size | bmask) + 1;

	if (minor_version
	    && rdev->data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;
	if (minor_version
	    && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
		return -EINVAL;

	if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
		rdev->desc_nr = -1;
	else
		rdev->desc_nr = le32_to_cpu(sb->dev_number);

	if (!rdev->bb_page) {
		rdev->bb_page = alloc_page(GFP_KERNEL);
		if (!rdev->bb_page)
			return -ENOMEM;
	}
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
	    rdev->badblocks.count == 0) {
		/* need to load the bad block list.
		 * Currently we limit it to one page.
		 */
		s32 offset;
		sector_t bb_sector;
		__le64 *bbp;
		int i;
		int sectors = le16_to_cpu(sb->bblog_size);
		if (sectors > (PAGE_SIZE / 512))
			return -EINVAL;
		offset = le32_to_cpu(sb->bblog_offset);
		if (offset == 0)
			return -EINVAL;
		bb_sector = (long long)offset;
		if (!sync_page_io(rdev, bb_sector, sectors << 9,
				  rdev->bb_page, REQ_OP_READ, 0, true))
			return -EIO;
		bbp = (__le64 *)page_address(rdev->bb_page);
		rdev->badblocks.shift = sb->bblog_shift;
1751 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1752 u64 bb = le64_to_cpu(*bbp);
1753 int count = bb & (0x3ff);
1754 u64 sector = bb >> 10;
1755 sector <<= sb->bblog_shift;
1756 count <<= sb->bblog_shift;
1757 if (bb + 1 == 0)
1758 break;
fc974ee2 1759 if (badblocks_set(&rdev->badblocks, sector, count, 1))
2699b672
N
1760 return -EINVAL;
1761 }
486adf72
N
1762 } else if (sb->bblog_offset != 0)
1763 rdev->badblocks.shift = 0;
2699b672 1764
ddc08823
PB
1765 if ((le32_to_cpu(sb->feature_map) &
1766 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS))) {
ea0213e0
AP
1767 rdev->ppl.offset = (__s16)le16_to_cpu(sb->ppl.offset);
1768 rdev->ppl.size = le16_to_cpu(sb->ppl.size);
1769 rdev->ppl.sector = rdev->sb_start + rdev->ppl.offset;
1770 }
1771
33f2c35a
N
1772 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT) &&
1773 sb->level != 0)
1774 return -EINVAL;
1775
228fc7d7
YY
1776 /* not spare disk, or LEVEL_MULTIPATH */
1777 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH) ||
1778 (rdev->desc_nr >= 0 &&
1779 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1780 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1781 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL)))
1782 spare_disk = false;
6a5cb53a 1783
9a7b2b0f 1784 if (!refdev) {
228fc7d7 1785 if (!spare_disk)
6a5cb53a
YY
1786 ret = 1;
1787 else
1788 ret = 0;
9a7b2b0f 1789 } else {
1da177e4 1790 __u64 ev1, ev2;
65a06f06 1791 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1da177e4
LT
1792
1793 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1794 sb->level != refsb->level ||
1795 sb->layout != refsb->layout ||
1796 sb->chunksize != refsb->chunksize) {
9d48739e 1797 pr_warn("md: %s has strangely different superblock to %s\n",
1da177e4
LT
1798 bdevname(rdev->bdev,b),
1799 bdevname(refdev->bdev,b2));
1800 return -EINVAL;
1801 }
1802 ev1 = le64_to_cpu(sb->events);
1803 ev2 = le64_to_cpu(refsb->events);
1804
228fc7d7 1805 if (!spare_disk && ev1 > ev2)
8ed75463
N
1806 ret = 1;
1807 else
1808 ret = 0;
1da177e4 1809 }
c6563a8c
N
1810 if (minor_version) {
1811 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1812 sectors -= rdev->data_offset;
1813 } else
1814 sectors = rdev->sb_start;
1815 if (sectors < le64_to_cpu(sb->data_size))
1da177e4 1816 return -EINVAL;
dd8ac336 1817 rdev->sectors = le64_to_cpu(sb->data_size);
8ed75463 1818 return ret;
1da177e4
LT
1819}
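/* Editor's sketch (not part of the driver source): the bad-block log
 * decoded above packs each entry into one little-endian u64 as
 * (sector << 10) | count - 54 bits of start sector and 10 bits of
 * length, both scaled up by bblog_shift. With bblog_shift == 0, the
 * entry 0x1403 means "3 sectors bad starting at sector 5". An all-ones
 * entry (bb + 1 == 0) terminates the log, which is why the write side
 * in super_1_sync() memsets the page to 0xff before filling it.
 */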
1820
fd01b88c 1821static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1da177e4 1822{
65a06f06 1823 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
07d84d10 1824 __u64 ev1 = le64_to_cpu(sb->events);
1da177e4 1825
41158c7e 1826 rdev->raid_disk = -1;
c5d79adb
N
1827 clear_bit(Faulty, &rdev->flags);
1828 clear_bit(In_sync, &rdev->flags);
8313b8e5 1829 clear_bit(Bitmap_sync, &rdev->flags);
c5d79adb 1830 clear_bit(WriteMostly, &rdev->flags);
c5d79adb 1831
1da177e4
LT
1832 if (mddev->raid_disks == 0) {
1833 mddev->major_version = 1;
1834 mddev->patch_version = 0;
e691063a 1835 mddev->external = 0;
9d8f0363 1836 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
9ebc6ef1
DD
1837 mddev->ctime = le64_to_cpu(sb->ctime);
1838 mddev->utime = le64_to_cpu(sb->utime);
1da177e4 1839 mddev->level = le32_to_cpu(sb->level);
d9d166c2 1840 mddev->clevel[0] = 0;
1da177e4
LT
1841 mddev->layout = le32_to_cpu(sb->layout);
1842 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
58c0fed4 1843 mddev->dev_sectors = le64_to_cpu(sb->size);
07d84d10 1844 mddev->events = ev1;
c3d9714e 1845 mddev->bitmap_info.offset = 0;
6409bb05
N
1846 mddev->bitmap_info.space = 0;
1847 /* Default location for the bitmap is 1K after the superblock,
1848 * using 3K of space - 4K in total
1849 */
c3d9714e 1850 mddev->bitmap_info.default_offset = 1024 >> 9;
6409bb05 1851 mddev->bitmap_info.default_space = (4096-1024) >> 9;
2c810cdd
N
1852 mddev->reshape_backwards = 0;
1853
1da177e4
LT
1854 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1855 memcpy(mddev->uuid, sb->set_uuid, 16);
1856
1857 mddev->max_disks = (4096-256)/2;
a654b9d8 1858
71c0805c 1859 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
6409bb05 1860 mddev->bitmap_info.file == NULL) {
c3d9714e
N
1861 mddev->bitmap_info.offset =
1862 (__s32)le32_to_cpu(sb->bitmap_offset);
6409bb05
N
1863 /* Metadata doesn't record how much space is available.
1864 * For 1.0, we assume we can use up to the superblock
1865 * if the bitmap is before it, else up to 4K beyond the superblock.
1866 * For others, assume no change is possible.
1867 */
1868 if (mddev->minor_version > 0)
1869 mddev->bitmap_info.space = 0;
1870 else if (mddev->bitmap_info.offset > 0)
1871 mddev->bitmap_info.space =
1872 8 - mddev->bitmap_info.offset;
1873 else
1874 mddev->bitmap_info.space =
1875 -mddev->bitmap_info.offset;
1876 }
e11e93fa 1877
f6705578
N
1878 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1879 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1880 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1881 mddev->new_level = le32_to_cpu(sb->new_level);
1882 mddev->new_layout = le32_to_cpu(sb->new_layout);
664e7c41 1883 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
2c810cdd
N
1884 if (mddev->delta_disks < 0 ||
1885 (mddev->delta_disks == 0 &&
1886 (le32_to_cpu(sb->feature_map)
1887 & MD_FEATURE_RESHAPE_BACKWARDS)))
1888 mddev->reshape_backwards = 1;
f6705578
N
1889 } else {
1890 mddev->reshape_position = MaxSector;
1891 mddev->delta_disks = 0;
1892 mddev->new_level = mddev->level;
1893 mddev->new_layout = mddev->layout;
664e7c41 1894 mddev->new_chunk_sectors = mddev->chunk_sectors;
f6705578
N
1895 }
1896
33f2c35a
N
1897 if (mddev->level == 0 &&
1898 !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RAID0_LAYOUT))
1899 mddev->layout = -1;
1900
486b0f7b 1901 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)
a62ab49e 1902 set_bit(MD_HAS_JOURNAL, &mddev->flags);
ea0213e0 1903
ddc08823
PB
1904 if (le32_to_cpu(sb->feature_map) &
1905 (MD_FEATURE_PPL | MD_FEATURE_MULTIPLE_PPLS)) {
ea0213e0
AP
1906 if (le32_to_cpu(sb->feature_map) &
1907 (MD_FEATURE_BITMAP_OFFSET | MD_FEATURE_JOURNAL))
1908 return -EINVAL;
ddc08823
PB
1909 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_PPL) &&
1910 (le32_to_cpu(sb->feature_map) &
1911 MD_FEATURE_MULTIPLE_PPLS))
1912 return -EINVAL;
ea0213e0
AP
1913 set_bit(MD_HAS_PPL, &mddev->flags);
1914 }
41158c7e 1915 } else if (mddev->pers == NULL) {
be6800a7
N
1916 /* Insist on a good event counter while assembling, except for
1917 * spares (which don't need an event count) */
1da177e4 1918 ++ev1;
be6800a7
N
1919 if (rdev->desc_nr >= 0 &&
1920 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
a3dfbdaa
SL
1921 (le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < MD_DISK_ROLE_MAX ||
1922 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) == MD_DISK_ROLE_JOURNAL))
be6800a7
N
1923 if (ev1 < mddev->events)
1924 return -EINVAL;
41158c7e
N
1925 } else if (mddev->bitmap) {
1926 /* If adding to array with a bitmap, then we can accept an
1927 * older device, but not too old.
1928 */
41158c7e
N
1929 if (ev1 < mddev->bitmap->events_cleared)
1930 return 0;
8313b8e5
N
1931 if (ev1 < mddev->events)
1932 set_bit(Bitmap_sync, &rdev->flags);
07d84d10
N
1933 } else {
1934 if (ev1 < mddev->events)
1935 /* just a hot-add of a new device, leave raid_disk at -1 */
1936 return 0;
1937 }
1da177e4
LT
1938 if (mddev->level != LEVEL_MULTIPATH) {
1939 int role;
3673f305
N
1940 if (rdev->desc_nr < 0 ||
1941 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
c4d4c91b 1942 role = MD_DISK_ROLE_SPARE;
3673f305
N
1943 rdev->desc_nr = -1;
1944 } else
1945 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1da177e4 1946 switch(role) {
c4d4c91b 1947 case MD_DISK_ROLE_SPARE: /* spare */
1da177e4 1948 break;
c4d4c91b 1949 case MD_DISK_ROLE_FAULTY: /* faulty */
b2d444d7 1950 set_bit(Faulty, &rdev->flags);
1da177e4 1951 break;
bac624f3
SL
1952 case MD_DISK_ROLE_JOURNAL: /* journal device */
1953 if (!(le32_to_cpu(sb->feature_map) & MD_FEATURE_JOURNAL)) {
1954 /* journal device without journal feature */
9d48739e 1955 pr_warn("md: journal device provided without journal feature, ignoring the device\n");
bac624f3
SL
1956 return -EINVAL;
1957 }
1958 set_bit(Journal, &rdev->flags);
3069aa8d 1959 rdev->journal_tail = le64_to_cpu(sb->journal_tail);
9b15603d 1960 rdev->raid_disk = 0;
bac624f3 1961 break;
1da177e4 1962 default:
f466722c 1963 rdev->saved_raid_disk = role;
5fd6c1dc 1964 if ((le32_to_cpu(sb->feature_map) &
f466722c 1965 MD_FEATURE_RECOVERY_OFFSET)) {
5fd6c1dc 1966 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
f466722c
N
1967 if (!(le32_to_cpu(sb->feature_map) &
1968 MD_FEATURE_RECOVERY_BITMAP))
1969 rdev->saved_raid_disk = -1;
062f5b2a
GJ
1970 } else {
1971 /*
1972 * If the array is FROZEN, then the device can't
1973 * be in_sync with rest of array.
1974 */
1975 if (!test_bit(MD_RECOVERY_FROZEN,
1976 &mddev->recovery))
1977 set_bit(In_sync, &rdev->flags);
1978 }
1da177e4
LT
1979 rdev->raid_disk = role;
1980 break;
1981 }
8ddf9efe
N
1982 if (sb->devflags & WriteMostly1)
1983 set_bit(WriteMostly, &rdev->flags);
688834e6
N
1984 if (sb->devflags & FailFast1)
1985 set_bit(FailFast, &rdev->flags);
2d78f8c4
N
1986 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1987 set_bit(Replacement, &rdev->flags);
41158c7e 1988 } else /* MULTIPATH are always insync */
b2d444d7 1989 set_bit(In_sync, &rdev->flags);
41158c7e 1990
1da177e4
LT
1991 return 0;
1992}
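/* Editor's note (not part of the driver source): the dev_roles[] values
 * interpreted above come from md_p.h - 0xffff (MD_DISK_ROLE_SPARE),
 * 0xfffe (MD_DISK_ROLE_FAULTY) and 0xfffd (MD_DISK_ROLE_JOURNAL) are
 * reserved, while anything below MD_DISK_ROLE_MAX is taken as the
 * device's raid_disk slot.
 */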
1993
fd01b88c 1994static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1da177e4
LT
1995{
1996 struct mdp_superblock_1 *sb;
3cb03002 1997 struct md_rdev *rdev2;
1da177e4
LT
1998 int max_dev, i;
1999 /* make rdev->sb match mddev and rdev data. */
2000
65a06f06 2001 sb = page_address(rdev->sb_page);
1da177e4
LT
2002
2003 sb->feature_map = 0;
2004 sb->pad0 = 0;
5fd6c1dc 2005 sb->recovery_offset = cpu_to_le64(0);
1da177e4
LT
2006 memset(sb->pad3, 0, sizeof(sb->pad3));
2007
2008 sb->utime = cpu_to_le64((__u64)mddev->utime);
2009 sb->events = cpu_to_le64(mddev->events);
2010 if (mddev->in_sync)
2011 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
bd18f646
SL
2012 else if (test_bit(MD_JOURNAL_CLEAN, &mddev->flags))
2013 sb->resync_offset = cpu_to_le64(MaxSector);
1da177e4
LT
2014 else
2015 sb->resync_offset = cpu_to_le64(0);
2016
1c05b4bc 2017 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
4dbcdc75 2018
f0ca340c 2019 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
58c0fed4 2020 sb->size = cpu_to_le64(mddev->dev_sectors);
9d8f0363 2021 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
62e1e389
N
2022 sb->level = cpu_to_le32(mddev->level);
2023 sb->layout = cpu_to_le32(mddev->layout);
688834e6
N
2024 if (test_bit(FailFast, &rdev->flags))
2025 sb->devflags |= FailFast1;
2026 else
2027 sb->devflags &= ~FailFast1;
f0ca340c 2028
aeb9b211
N
2029 if (test_bit(WriteMostly, &rdev->flags))
2030 sb->devflags |= WriteMostly1;
2031 else
2032 sb->devflags &= ~WriteMostly1;
c6563a8c
N
2033 sb->data_offset = cpu_to_le64(rdev->data_offset);
2034 sb->data_size = cpu_to_le64(rdev->sectors);
aeb9b211 2035
c3d9714e
N
2036 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
2037 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
71c0805c 2038 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
a654b9d8 2039 }
5fd6c1dc 2040
f2076e7d 2041 if (rdev->raid_disk >= 0 && !test_bit(Journal, &rdev->flags) &&
97e4f42d 2042 !test_bit(In_sync, &rdev->flags)) {
93be75ff
N
2043 sb->feature_map |=
2044 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
2045 sb->recovery_offset =
2046 cpu_to_le64(rdev->recovery_offset);
f466722c
N
2047 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
2048 sb->feature_map |=
2049 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
5fd6c1dc 2050 }
3069aa8d
SL
2051 /* Note: recovery_offset and journal_tail share space */
2052 if (test_bit(Journal, &rdev->flags))
2053 sb->journal_tail = cpu_to_le64(rdev->journal_tail);
2d78f8c4
N
2054 if (test_bit(Replacement, &rdev->flags))
2055 sb->feature_map |=
2056 cpu_to_le32(MD_FEATURE_REPLACEMENT);
5fd6c1dc 2057
f6705578
N
2058 if (mddev->reshape_position != MaxSector) {
2059 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
2060 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
2061 sb->new_layout = cpu_to_le32(mddev->new_layout);
2062 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
2063 sb->new_level = cpu_to_le32(mddev->new_level);
664e7c41 2064 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
2c810cdd
N
2065 if (mddev->delta_disks == 0 &&
2066 mddev->reshape_backwards)
2067 sb->feature_map
2068 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
c6563a8c
N
2069 if (rdev->new_data_offset != rdev->data_offset) {
2070 sb->feature_map
2071 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
2072 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
2073 - rdev->data_offset));
2074 }
f6705578 2075 }
a654b9d8 2076
3c462c88
GR
2077 if (mddev_is_clustered(mddev))
2078 sb->feature_map |= cpu_to_le32(MD_FEATURE_CLUSTERED);
2079
2699b672
N
2080 if (rdev->badblocks.count == 0)
2081 /* Nothing to do for bad blocks */ ;
2082 else if (sb->bblog_offset == 0)
2083 /* Cannot record bad blocks on this device */
2084 md_error(mddev, rdev);
2085 else {
2086 struct badblocks *bb = &rdev->badblocks;
ae50640b 2087 __le64 *bbp = (__le64 *)page_address(rdev->bb_page);
2699b672
N
2088 u64 *p = bb->page;
2089 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
2090 if (bb->changed) {
2091 unsigned seq;
2092
2093retry:
2094 seq = read_seqbegin(&bb->lock);
2095
2096 memset(bbp, 0xff, PAGE_SIZE);
2097
2098 for (i = 0 ; i < bb->count ; i++) {
35f9ac2d 2099 u64 internal_bb = p[i];
2699b672
N
2100 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
2101 | BB_LEN(internal_bb));
35f9ac2d 2102 bbp[i] = cpu_to_le64(store_bb);
2699b672 2103 }
d0962936 2104 bb->changed = 0;
2699b672
N
2105 if (read_seqretry(&bb->lock, seq))
2106 goto retry;
2107
2108 bb->sector = (rdev->sb_start +
2109 (int)le32_to_cpu(sb->bblog_offset));
2110 bb->size = le16_to_cpu(sb->bblog_size);
2699b672
N
2111 }
2112 }
2113
1da177e4 2114 max_dev = 0;
dafb20fa 2115 rdev_for_each(rdev2, mddev)
1da177e4
LT
2116 if (rdev2->desc_nr+1 > max_dev)
2117 max_dev = rdev2->desc_nr+1;
a778b73f 2118
70471daf
N
2119 if (max_dev > le32_to_cpu(sb->max_dev)) {
2120 int bmask;
a778b73f 2121 sb->max_dev = cpu_to_le32(max_dev);
70471daf
N
2122 rdev->sb_size = max_dev * 2 + 256;
2123 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
2124 if (rdev->sb_size & bmask)
2125 rdev->sb_size = (rdev->sb_size | bmask) + 1;
ddcf3522
N
2126 } else
2127 max_dev = le32_to_cpu(sb->max_dev);
2128
1da177e4 2129 for (i=0; i<max_dev;i++)
8df72024 2130 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
f72ffdd6 2131
a97b7896
SL
2132 if (test_bit(MD_HAS_JOURNAL, &mddev->flags))
2133 sb->feature_map |= cpu_to_le32(MD_FEATURE_JOURNAL);
f72ffdd6 2134
ea0213e0 2135 if (test_bit(MD_HAS_PPL, &mddev->flags)) {
ddc08823
PB
2136 if (test_bit(MD_HAS_MULTIPLE_PPLS, &mddev->flags))
2137 sb->feature_map |=
2138 cpu_to_le32(MD_FEATURE_MULTIPLE_PPLS);
2139 else
2140 sb->feature_map |= cpu_to_le32(MD_FEATURE_PPL);
ea0213e0
AP
2141 sb->ppl.offset = cpu_to_le16(rdev->ppl.offset);
2142 sb->ppl.size = cpu_to_le16(rdev->ppl.size);
2143 }
2144
dafb20fa 2145 rdev_for_each(rdev2, mddev) {
1da177e4 2146 i = rdev2->desc_nr;
b2d444d7 2147 if (test_bit(Faulty, &rdev2->flags))
c4d4c91b 2148 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_FAULTY);
b2d444d7 2149 else if (test_bit(In_sync, &rdev2->flags))
1da177e4 2150 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
a97b7896 2151 else if (test_bit(Journal, &rdev2->flags))
bac624f3 2152 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_JOURNAL);
93be75ff 2153 else if (rdev2->raid_disk >= 0)
5fd6c1dc 2154 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1da177e4 2155 else
c4d4c91b 2156 sb->dev_roles[i] = cpu_to_le16(MD_DISK_ROLE_SPARE);
1da177e4
LT
2157 }
2158
1da177e4
LT
2159 sb->sb_csum = calc_sb_1_csum(sb);
2160}
2161
0cd17fec 2162static unsigned long long
3cb03002 2163super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
0cd17fec
CW
2164{
2165 struct mdp_superblock_1 *sb;
15f4a5fd 2166 sector_t max_sectors;
58c0fed4 2167 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
0cd17fec 2168 return 0; /* component must fit device */
c6563a8c
N
2169 if (rdev->data_offset != rdev->new_data_offset)
2170 return 0; /* too confusing */
0f420358 2171 if (rdev->sb_start < rdev->data_offset) {
0cd17fec 2172 /* minor versions 1 and 2; superblock before data */
77304d2a 2173 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
15f4a5fd
AN
2174 max_sectors -= rdev->data_offset;
2175 if (!num_sectors || num_sectors > max_sectors)
2176 num_sectors = max_sectors;
c3d9714e 2177 } else if (rdev->mddev->bitmap_info.offset) {
0cd17fec
CW
2178 /* minor version 0 with bitmap we can't move */
2179 return 0;
2180 } else {
2181 /* minor version 0; superblock after data */
0f420358 2182 sector_t sb_start;
77304d2a 2183 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
0f420358 2184 sb_start &= ~(sector_t)(4*2 - 1);
dd8ac336 2185 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
15f4a5fd
AN
2186 if (!num_sectors || num_sectors > max_sectors)
2187 num_sectors = max_sectors;
0f420358 2188 rdev->sb_start = sb_start;
0cd17fec 2189 }
65a06f06 2190 sb = page_address(rdev->sb_page);
15f4a5fd 2191 sb->data_size = cpu_to_le64(num_sectors);
3fb632e4 2192 sb->super_offset = cpu_to_le64(rdev->sb_start);
0cd17fec 2193 sb->sb_csum = calc_sb_1_csum(sb);
46533ff7
N
2194 do {
2195 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
2196 rdev->sb_page);
2197 } while (md_super_wait(rdev->mddev) < 0);
c26a44ed 2198 return num_sectors;
c6563a8c
N
2199
2200}
2201
2202static int
2203super_1_allow_new_offset(struct md_rdev *rdev,
2204 unsigned long long new_offset)
2205{
2206 /* All necessary checks on new >= old have been done */
2207 struct bitmap *bitmap;
2208 if (new_offset >= rdev->data_offset)
2209 return 1;
2210
2211 /* with 1.0 metadata, there is no metadata to tread on
2212 * so we can always move back */
2213 if (rdev->mddev->minor_version == 0)
2214 return 1;
2215
2216 /* otherwise we must be sure not to step on
2217 * any metadata, so stay:
2218 * 36K beyond start of superblock
2219 * beyond end of badblocks
2220 * beyond write-intent bitmap
2221 */
2222 if (rdev->sb_start + (32+4)*2 > new_offset)
2223 return 0;
2224 bitmap = rdev->mddev->bitmap;
2225 if (bitmap && !rdev->mddev->bitmap_info.file &&
2226 rdev->sb_start + rdev->mddev->bitmap_info.offset +
1ec885cd 2227 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
c6563a8c
N
2228 return 0;
2229 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
2230 return 0;
2231
2232 return 1;
0cd17fec 2233}
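/* Editor's note (not part of the driver source): "(32+4)*2" above is in
 * 512-byte sectors - 72 sectors, i.e. the 36K named in the comment. This
 * editor reads the constant as 4K of superblock plus 32K kept clear for
 * the bad-block log, but the split is an interpretation; only the 36K
 * total is stated in the source.
 */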
1da177e4 2234
75c96f85 2235static struct super_type super_types[] = {
1da177e4
LT
2236 [0] = {
2237 .name = "0.90.0",
2238 .owner = THIS_MODULE,
0cd17fec
CW
2239 .load_super = super_90_load,
2240 .validate_super = super_90_validate,
2241 .sync_super = super_90_sync,
2242 .rdev_size_change = super_90_rdev_size_change,
c6563a8c 2243 .allow_new_offset = super_90_allow_new_offset,
1da177e4
LT
2244 },
2245 [1] = {
2246 .name = "md-1",
2247 .owner = THIS_MODULE,
0cd17fec
CW
2248 .load_super = super_1_load,
2249 .validate_super = super_1_validate,
2250 .sync_super = super_1_sync,
2251 .rdev_size_change = super_1_rdev_size_change,
c6563a8c 2252 .allow_new_offset = super_1_allow_new_offset,
1da177e4
LT
2253 },
2254};
1da177e4 2255
fd01b88c 2256static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
076f968b
JB
2257{
2258 if (mddev->sync_super) {
2259 mddev->sync_super(mddev, rdev);
2260 return;
2261 }
2262
2263 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
2264
2265 super_types[mddev->major_version].sync_super(mddev, rdev);
2266}
2267
fd01b88c 2268static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
1da177e4 2269{
3cb03002 2270 struct md_rdev *rdev, *rdev2;
1da177e4 2271
4b80991c 2272 rcu_read_lock();
0b020e85
SL
2273 rdev_for_each_rcu(rdev, mddev1) {
2274 if (test_bit(Faulty, &rdev->flags) ||
2275 test_bit(Journal, &rdev->flags) ||
2276 rdev->raid_disk == -1)
2277 continue;
2278 rdev_for_each_rcu(rdev2, mddev2) {
2279 if (test_bit(Faulty, &rdev2->flags) ||
2280 test_bit(Journal, &rdev2->flags) ||
2281 rdev2->raid_disk == -1)
2282 continue;
7dd5e7c3 2283 if (rdev->bdev->bd_contains ==
4b80991c
N
2284 rdev2->bdev->bd_contains) {
2285 rcu_read_unlock();
7dd5e7c3 2286 return 1;
4b80991c 2287 }
0b020e85
SL
2288 }
2289 }
4b80991c 2290 rcu_read_unlock();
1da177e4
LT
2291 return 0;
2292}
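/* Editor's note (not part of the driver source): match_mddev_units()
 * reports a conflict when two arrays have members on the same underlying
 * whole disk (bd_contains), not merely on the same partition; faulty,
 * journal and unassigned (raid_disk == -1) members are skipped on both
 * sides.
 */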
2293
2294static LIST_HEAD(pending_raid_disks);
2295
ac5e7113
AN
2296/*
2297 * Try to register data integrity profile for an mddev
2298 *
2299 * This is called when an array is started and after a disk has been kicked
2300 * from the array. It only succeeds if all working and active component devices
2301 * are integrity capable with matching profiles.
2302 */
fd01b88c 2303int md_integrity_register(struct mddev *mddev)
ac5e7113 2304{
3cb03002 2305 struct md_rdev *rdev, *reference = NULL;
ac5e7113
AN
2306
2307 if (list_empty(&mddev->disks))
2308 return 0; /* nothing to do */
629acb6a
JB
2309 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2310 return 0; /* shouldn't register, or already is */
dafb20fa 2311 rdev_for_each(rdev, mddev) {
ac5e7113
AN
2312 /* skip spares and non-functional disks */
2313 if (test_bit(Faulty, &rdev->flags))
2314 continue;
2315 if (rdev->raid_disk < 0)
2316 continue;
ac5e7113
AN
2317 if (!reference) {
2318 /* Use the first rdev as the reference */
2319 reference = rdev;
2320 continue;
2321 }
2322 /* does this rdev's profile match the reference profile? */
2323 if (blk_integrity_compare(reference->bdev->bd_disk,
2324 rdev->bdev->bd_disk) < 0)
2325 return -EINVAL;
2326 }
89078d57
MP
2327 if (!reference || !bdev_get_integrity(reference->bdev))
2328 return 0;
ac5e7113
AN
2329 /*
2330 * All component devices are integrity capable and have matching
2331 * profiles, register the common profile for the md device.
2332 */
25520d55
MP
2333 blk_integrity_register(mddev->gendisk,
2334 bdev_get_integrity(reference->bdev));
2335
9d48739e 2336 pr_debug("md: data integrity enabled on %s\n", mdname(mddev));
afeee514 2337 if (bioset_integrity_create(&mddev->bio_set, BIO_POOL_SIZE)) {
9d48739e 2338 pr_err("md: failed to create integrity pool for %s\n",
a91a2785
MP
2339 mdname(mddev));
2340 return -EINVAL;
2341 }
ac5e7113
AN
2342 return 0;
2343}
2344EXPORT_SYMBOL(md_integrity_register);
2345
1501efad
DW
2346/*
2347 * Attempt to add an rdev, but only if it is consistent with the current
2348 * integrity profile
2349 */
2350int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
3f9d99c1 2351{
2863b9eb 2352 struct blk_integrity *bi_mddev;
1501efad 2353 char name[BDEVNAME_SIZE];
2863b9eb
JB
2354
2355 if (!mddev->gendisk)
1501efad 2356 return 0;
2863b9eb 2357
2863b9eb 2358 bi_mddev = blk_get_integrity(mddev->gendisk);
3f9d99c1 2359
ac5e7113 2360 if (!bi_mddev) /* nothing to do */
1501efad
DW
2361 return 0;
2362
2363 if (blk_integrity_compare(mddev->gendisk, rdev->bdev->bd_disk) != 0) {
9d48739e
N
2364 pr_err("%s: incompatible integrity profile for %s\n",
2365 mdname(mddev), bdevname(rdev->bdev, name));
1501efad
DW
2366 return -ENXIO;
2367 }
2368
2369 return 0;
3f9d99c1 2370}
ac5e7113 2371EXPORT_SYMBOL(md_integrity_add_rdev);
3f9d99c1 2372
f72ffdd6 2373static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
1da177e4 2374{
7dd5e7c3 2375 char b[BDEVNAME_SIZE];
f637b9f9 2376 struct kobject *ko;
5e55e2f5 2377 int err;
1da177e4 2378
11e2ede0
DW
2379 /* prevent duplicates */
2380 if (find_rdev(mddev, rdev->bdev->bd_dev))
2381 return -EEXIST;
2382
97b20ef7
N
2383 if ((bdev_read_only(rdev->bdev) || bdev_read_only(rdev->meta_bdev)) &&
2384 mddev->pers)
2385 return -EROFS;
2386
dd8ac336 2387 /* make sure rdev->sectors exceeds mddev->dev_sectors */
f6b6ec5c
SL
2388 if (!test_bit(Journal, &rdev->flags) &&
2389 rdev->sectors &&
2390 (mddev->dev_sectors == 0 || rdev->sectors < mddev->dev_sectors)) {
a778b73f
N
2391 if (mddev->pers) {
2392 /* Cannot change size, so fail
2393 * If mddev->level <= 0, then we don't care
2394 * about aligning sizes (e.g. linear)
2395 */
2396 if (mddev->level > 0)
2397 return -ENOSPC;
2398 } else
dd8ac336 2399 mddev->dev_sectors = rdev->sectors;
2bf071bf 2400 }
1da177e4
LT
2401
2402 /* Verify rdev->desc_nr is unique.
2403 * If it is -1, assign a free number, else
2404 * check number is not in use
2405 */
4878e9eb 2406 rcu_read_lock();
1da177e4
LT
2407 if (rdev->desc_nr < 0) {
2408 int choice = 0;
4878e9eb
N
2409 if (mddev->pers)
2410 choice = mddev->raid_disks;
57d051dc 2411 while (md_find_rdev_nr_rcu(mddev, choice))
1da177e4
LT
2412 choice++;
2413 rdev->desc_nr = choice;
2414 } else {
57d051dc 2415 if (md_find_rdev_nr_rcu(mddev, rdev->desc_nr)) {
4878e9eb 2416 rcu_read_unlock();
1da177e4 2417 return -EBUSY;
4878e9eb 2418 }
1da177e4 2419 }
4878e9eb 2420 rcu_read_unlock();
f6b6ec5c
SL
2421 if (!test_bit(Journal, &rdev->flags) &&
2422 mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
9d48739e
N
2423 pr_warn("md: %s: array is limited to %d devices\n",
2424 mdname(mddev), mddev->max_disks);
de01dfad
N
2425 return -EBUSY;
2426 }
19133a42 2427 bdevname(rdev->bdev,b);
90a9befb 2428 strreplace(b, '/', '!');
649316b2 2429
1da177e4 2430 rdev->mddev = mddev;
9d48739e 2431 pr_debug("md: bind<%s>\n", b);
86e6ffdd 2432
963c555e 2433 if (mddev->raid_disks)
404659cf 2434 mddev_create_serial_pool(mddev, rdev, false);
963c555e 2435
b2d6db58 2436 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
5e55e2f5 2437 goto fail;
86e6ffdd 2438
0762b8bd 2439 ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
00bcb4ac
N
2440 if (sysfs_create_link(&rdev->kobj, ko, "block"))
2441 /* failure here is OK */;
2442 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
3c0ee63a 2443
4b80991c 2444 list_add_rcu(&rdev->same_set, &mddev->disks);
e09b457b 2445 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
4044ba58
N
2446
2447 /* May as well allow recovery to be retried once */
5389042f 2448 mddev->recovery_disabled++;
3f9d99c1 2449
1da177e4 2450 return 0;
5e55e2f5
N
2451
2452 fail:
9d48739e
N
2453 pr_warn("md: failed to register dev-%s for %s\n",
2454 b, mdname(mddev));
5e55e2f5 2455 return err;
1da177e4
LT
2456}
2457
cc1ffe61 2458static void rdev_delayed_delete(struct work_struct *ws)
5792a285 2459{
3cb03002 2460 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
5792a285 2461 kobject_del(&rdev->kobj);
177a99b2 2462 kobject_put(&rdev->kobj);
5792a285
N
2463}
2464
f72ffdd6 2465static void unbind_rdev_from_array(struct md_rdev *rdev)
1da177e4
LT
2466{
2467 char b[BDEVNAME_SIZE];
403df478 2468
49731baa 2469 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
4b80991c 2470 list_del_rcu(&rdev->same_set);
9d48739e 2471 pr_debug("md: unbind<%s>\n", bdevname(rdev->bdev,b));
11d3a9f6 2472 mddev_destroy_serial_pool(rdev->mddev, rdev, false);
1da177e4 2473 rdev->mddev = NULL;
86e6ffdd 2474 sysfs_remove_link(&rdev->kobj, "block");
3c0ee63a
N
2475 sysfs_put(rdev->sysfs_state);
2476 rdev->sysfs_state = NULL;
2230dfe4 2477 rdev->badblocks.count = 0;
5792a285 2478 /* We need to delay this, otherwise we can deadlock when
4b80991c
N
2479 * writing 'remove' to "dev/state". We also need
2480 * to delay it due to rcu usage.
5792a285 2481 */
4b80991c 2482 synchronize_rcu();
cc1ffe61 2483 INIT_WORK(&rdev->del_work, rdev_delayed_delete);
177a99b2 2484 kobject_get(&rdev->kobj);
cc1ffe61 2485 queue_work(md_rdev_misc_wq, &rdev->del_work);
1da177e4
LT
2486}
2487
2488/*
2489 * prevent the device from being mounted, repartitioned or
2490 * otherwise reused by a RAID array (or any other kernel
2491 * subsystem), by bd_claiming the device.
2492 */
3cb03002 2493static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
1da177e4
LT
2494{
2495 int err = 0;
2496 struct block_device *bdev;
1da177e4 2497
d4d77629 2498 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
3cb03002 2499 shared ? (struct md_rdev *)lock_rdev : rdev);
1da177e4 2500 if (IS_ERR(bdev)) {
ea3edd4d
CH
2501 pr_warn("md: could not open device unknown-block(%u,%u).\n",
2502 MAJOR(dev), MINOR(dev));
1da177e4
LT
2503 return PTR_ERR(bdev);
2504 }
1da177e4
LT
2505 rdev->bdev = bdev;
2506 return err;
2507}
2508
3cb03002 2509static void unlock_rdev(struct md_rdev *rdev)
1da177e4
LT
2510{
2511 struct block_device *bdev = rdev->bdev;
2512 rdev->bdev = NULL;
e525fd89 2513 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
1da177e4
LT
2514}
2515
2516void md_autodetect_dev(dev_t dev);
2517
f72ffdd6 2518static void export_rdev(struct md_rdev *rdev)
1da177e4
LT
2519{
2520 char b[BDEVNAME_SIZE];
403df478 2521
9d48739e 2522 pr_debug("md: export_rdev(%s)\n", bdevname(rdev->bdev,b));
545c8795 2523 md_rdev_clear(rdev);
1da177e4 2524#ifndef MODULE
d0fae18f
N
2525 if (test_bit(AutoDetected, &rdev->flags))
2526 md_autodetect_dev(rdev->bdev->bd_dev);
1da177e4
LT
2527#endif
2528 unlock_rdev(rdev);
86e6ffdd 2529 kobject_put(&rdev->kobj);
1da177e4
LT
2530}
2531
fb56dfef 2532void md_kick_rdev_from_array(struct md_rdev *rdev)
1da177e4
LT
2533{
2534 unbind_rdev_from_array(rdev);
2535 export_rdev(rdev);
2536}
fb56dfef 2537EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
1da177e4 2538
fd01b88c 2539static void export_array(struct mddev *mddev)
1da177e4 2540{
0638bb0e 2541 struct md_rdev *rdev;
1da177e4 2542
0638bb0e
N
2543 while (!list_empty(&mddev->disks)) {
2544 rdev = list_first_entry(&mddev->disks, struct md_rdev,
2545 same_set);
fb56dfef 2546 md_kick_rdev_from_array(rdev);
1da177e4 2547 }
1da177e4
LT
2548 mddev->raid_disks = 0;
2549 mddev->major_version = 0;
2550}
2551
6497709b
N
2552static bool set_in_sync(struct mddev *mddev)
2553{
efa4b77b 2554 lockdep_assert_held(&mddev->lock);
4ad23a97
N
2555 if (!mddev->in_sync) {
2556 mddev->sync_checkers++;
2557 spin_unlock(&mddev->lock);
2558 percpu_ref_switch_to_atomic_sync(&mddev->writes_pending);
2559 spin_lock(&mddev->lock);
2560 if (!mddev->in_sync &&
2561 percpu_ref_is_zero(&mddev->writes_pending)) {
6497709b 2562 mddev->in_sync = 1;
4ad23a97
N
2563 /*
2564 * Ensure ->in_sync is visible before we clear
2565 * ->sync_checkers.
2566 */
55cc39f3 2567 smp_mb();
6497709b
N
2568 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2569 sysfs_notify_dirent_safe(mddev->sysfs_state);
2570 }
4ad23a97
N
2571 if (--mddev->sync_checkers == 0)
2572 percpu_ref_switch_to_percpu(&mddev->writes_pending);
6497709b
N
2573 }
2574 if (mddev->safemode == 1)
2575 mddev->safemode = 0;
2576 return mddev->in_sync;
2577}
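/* Editor's note (not part of the driver source): set_in_sync() may drop
 * and retake mddev->lock around percpu_ref_switch_to_atomic_sync(), so
 * callers cannot rely on state sampled before the call; sync_checkers
 * counts nested callers so that only the last one out switches
 * writes_pending back to percpu mode.
 */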
2578
f72ffdd6 2579static void sync_sbs(struct mddev *mddev, int nospares)
1da177e4 2580{
42543769
N
2581 /* Update each superblock (in-memory image), but
2582 * if we are allowed to, skip spares which already
2583 * have the right event counter, or have one earlier
2584 * (which would mean they aren't being marked as dirty
2585 * with the rest of the array)
2586 */
3cb03002 2587 struct md_rdev *rdev;
dafb20fa 2588 rdev_for_each(rdev, mddev) {
42543769
N
2589 if (rdev->sb_events == mddev->events ||
2590 (nospares &&
2591 rdev->raid_disk < 0 &&
42543769
N
2592 rdev->sb_events+1 == mddev->events)) {
2593 /* Don't update this superblock */
2594 rdev->sb_loaded = 2;
2595 } else {
076f968b 2596 sync_super(mddev, rdev);
42543769
N
2597 rdev->sb_loaded = 1;
2598 }
1da177e4
LT
2599 }
2600}
2601
2aa82191
GR
2602static bool does_sb_need_changing(struct mddev *mddev)
2603{
2604 struct md_rdev *rdev;
2605 struct mdp_superblock_1 *sb;
2606 int role;
2607
2608 /* Find a good rdev */
2609 rdev_for_each(rdev, mddev)
2610 if ((rdev->raid_disk >= 0) && !test_bit(Faulty, &rdev->flags))
2611 break;
2612
2613 /* No good device found. */
2614 if (!rdev)
2615 return false;
2616
2617 sb = page_address(rdev->sb_page);
2618 /* Check if a device has become faulty or a spare has become active */
2619 rdev_for_each(rdev, mddev) {
2620 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
2621 /* Device activated? */
2622 if (role == 0xffff && rdev->raid_disk >=0 &&
2623 !test_bit(Faulty, &rdev->flags))
2624 return true;
2625 /* Device turned faulty? */
2626 if (test_bit(Faulty, &rdev->flags) && (role < 0xfffd))
2627 return true;
2628 }
2629
2630 /* Check if any mddev parameters have changed */
2631 if ((mddev->dev_sectors != le64_to_cpu(sb->size)) ||
2632 (mddev->reshape_position != le64_to_cpu(sb->reshape_position)) ||
13459213 2633 (mddev->layout != le32_to_cpu(sb->layout)) ||
2aa82191
GR
2634 (mddev->raid_disks != le32_to_cpu(sb->raid_disks)) ||
2635 (mddev->chunk_sectors != le32_to_cpu(sb->chunksize)))
2636 return true;
2637
2638 return false;
2639}
2640
1aee41f6 2641void md_update_sb(struct mddev *mddev, int force_change)
1da177e4 2642{
3cb03002 2643 struct md_rdev *rdev;
06d91a5f 2644 int sync_req;
42543769 2645 int nospares = 0;
2699b672 2646 int any_badblocks_changed = 0;
23b63f9f 2647 int ret = -1;
1da177e4 2648
d87f064f
N
2649 if (mddev->ro) {
2650 if (force_change)
2953079c 2651 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
d87f064f
N
2652 return;
2653 }
2aa82191 2654
2c97cf13 2655repeat:
2aa82191 2656 if (mddev_is_clustered(mddev)) {
2953079c 2657 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
2aa82191 2658 force_change = 1;
2953079c 2659 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
85ad1d13 2660 nospares = 1;
23b63f9f 2661 ret = md_cluster_ops->metadata_update_start(mddev);
2aa82191
GR
2662 /* Has someone else updated the sb? */
2663 if (!does_sb_need_changing(mddev)) {
23b63f9f
GJ
2664 if (ret == 0)
2665 md_cluster_ops->metadata_update_cancel(mddev);
2953079c
SL
2666 bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2667 BIT(MD_SB_CHANGE_DEVS) |
2668 BIT(MD_SB_CHANGE_CLEAN));
2aa82191
GR
2669 return;
2670 }
2671 }
2c97cf13 2672
db0505d3
N
2673 /*
2674 * First make sure individual recovery_offsets are correct.
2675 * curr_resync_completed can only be used during recovery.
2676 * During reshape/resync it might use array addresses rather
2677 * than device addresses.
2678 */
dafb20fa 2679 rdev_for_each(rdev, mddev) {
3a3a5ddb
N
2680 if (rdev->raid_disk >= 0 &&
2681 mddev->delta_disks >= 0 &&
db0505d3
N
2682 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
2683 test_bit(MD_RECOVERY_RECOVER, &mddev->recovery) &&
2684 !test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
f2076e7d 2685 !test_bit(Journal, &rdev->flags) &&
3a3a5ddb
N
2686 !test_bit(In_sync, &rdev->flags) &&
2687 mddev->curr_resync_completed > rdev->recovery_offset)
2688 rdev->recovery_offset = mddev->curr_resync_completed;
2689
f72ffdd6 2690 }
bd52b746 2691 if (!mddev->persistent) {
2953079c
SL
2692 clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
2693 clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
de393cde 2694 if (!mddev->external) {
2953079c 2695 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
dafb20fa 2696 rdev_for_each(rdev, mddev) {
de393cde 2697 if (rdev->badblocks.changed) {
d0962936 2698 rdev->badblocks.changed = 0;
fc974ee2 2699 ack_all_badblocks(&rdev->badblocks);
de393cde
N
2700 md_error(mddev, rdev);
2701 }
2702 clear_bit(Blocked, &rdev->flags);
2703 clear_bit(BlockedBadBlocks, &rdev->flags);
2704 wake_up(&rdev->blocked_wait);
2705 }
2706 }
3a3a5ddb
N
2707 wake_up(&mddev->sb_wait);
2708 return;
2709 }
2710
85572d7c 2711 spin_lock(&mddev->lock);
84692195 2712
9ebc6ef1 2713 mddev->utime = ktime_get_real_seconds();
3a3a5ddb 2714
2953079c 2715 if (test_and_clear_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags))
850b2b42 2716 force_change = 1;
2953079c 2717 if (test_and_clear_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags))
850b2b42
N
2718 /* just a clean <-> dirty transition; possibly leave spares alone,
2719 * though if 'events' isn't the right even/odd, we will have to do
2720 * the spares after all
2721 */
2722 nospares = 1;
2723 if (force_change)
2724 nospares = 0;
2725 if (mddev->degraded)
84692195
N
2726 /* If the array is degraded, then skipping spares is both
2727 * dangerous and fairly pointless.
2728 * Dangerous because a device that was removed from the array
2729 * might have an event_count that still looks up-to-date,
2730 * so it can be re-added without a resync.
2731 * Pointless because if there are any spares to skip,
2732 * then a recovery will happen and soon that array won't
2733 * be degraded any more and the spare can go back to sleep then.
2734 */
850b2b42 2735 nospares = 0;
84692195 2736
06d91a5f 2737 sync_req = mddev->in_sync;
42543769
N
2738
2739 /* If this is just a dirty<->clean transition, and the array is clean
2740 * and 'events' is odd, we can roll back to the previous clean state */
850b2b42 2741 if (nospares
42543769 2742 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
a8707c08
N
2743 && mddev->can_decrease_events
2744 && mddev->events != 1) {
42543769 2745 mddev->events--;
a8707c08
N
2746 mddev->can_decrease_events = 0;
2747 } else {
42543769
N
2748 /* otherwise we have to go forward and ... */
2749 mddev->events ++;
a8707c08 2750 mddev->can_decrease_events = nospares;
42543769 2751 }
1da177e4 2752
403df478
N
2753 /*
2754 * This 64-bit counter should never wrap.
2755 * Either we are in around ~1 trillion A.C., assuming
2756 * 1 reboot per second, or we have a bug...
2757 */
2758 WARN_ON(mddev->events == 0);
2699b672 2759
dafb20fa 2760 rdev_for_each(rdev, mddev) {
2699b672
N
2761 if (rdev->badblocks.changed)
2762 any_badblocks_changed++;
de393cde
N
2763 if (test_bit(Faulty, &rdev->flags))
2764 set_bit(FaultRecorded, &rdev->flags);
2765 }
2699b672 2766
e691063a 2767 sync_sbs(mddev, nospares);
85572d7c 2768 spin_unlock(&mddev->lock);
1da177e4 2769
36a4e1fe
N
2770 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2771 mdname(mddev), mddev->in_sync);
1da177e4 2772
504634f6
SL
2773 if (mddev->queue)
2774 blk_add_trace_msg(mddev->queue, "md md_update_sb");
46533ff7 2775rewrite:
e64e4018 2776 md_bitmap_update_sb(mddev->bitmap);
dafb20fa 2777 rdev_for_each(rdev, mddev) {
1da177e4 2778 char b[BDEVNAME_SIZE];
36a4e1fe 2779
42543769
N
2780 if (rdev->sb_loaded != 1)
2781 continue; /* no noise on spare devices */
1da177e4 2782
f466722c 2783 if (!test_bit(Faulty, &rdev->flags)) {
7bfa19f2 2784 md_super_write(mddev,rdev,
0f420358 2785 rdev->sb_start, rdev->sb_size,
7bfa19f2 2786 rdev->sb_page);
36a4e1fe
N
2787 pr_debug("md: (write) %s's sb offset: %llu\n",
2788 bdevname(rdev->bdev, b),
2789 (unsigned long long)rdev->sb_start);
42543769 2790 rdev->sb_events = mddev->events;
2699b672
N
2791 if (rdev->badblocks.size) {
2792 md_super_write(mddev, rdev,
2793 rdev->badblocks.sector,
2794 rdev->badblocks.size << 9,
2795 rdev->bb_page);
2796 rdev->badblocks.size = 0;
2797 }
7bfa19f2 2798
f466722c 2799 } else
36a4e1fe
N
2800 pr_debug("md: %s (skipping faulty)\n",
2801 bdevname(rdev->bdev, b));
d70ed2e4 2802
7bfa19f2 2803 if (mddev->level == LEVEL_MULTIPATH)
1da177e4
LT
2804 /* only need to write one superblock... */
2805 break;
2806 }
46533ff7
N
2807 if (md_super_wait(mddev) < 0)
2808 goto rewrite;
2953079c 2809 /* if there was a failure, MD_SB_CHANGE_DEVS was set, and we re-write super */
7bfa19f2 2810
2c97cf13
GJ
2811 if (mddev_is_clustered(mddev) && ret == 0)
2812 md_cluster_ops->metadata_update_finish(mddev);
2813
850b2b42 2814 if (mddev->in_sync != sync_req ||
2953079c
SL
2815 !bit_clear_unless(&mddev->sb_flags, BIT(MD_SB_CHANGE_PENDING),
2816 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_CLEAN)))
06d91a5f 2817 /* have to write it out again */
06d91a5f 2818 goto repeat;
3d310eb7 2819 wake_up(&mddev->sb_wait);
acb180b0
N
2820 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2821 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
06d91a5f 2822
dafb20fa 2823 rdev_for_each(rdev, mddev) {
de393cde
N
2824 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2825 clear_bit(Blocked, &rdev->flags);
2826
2827 if (any_badblocks_changed)
fc974ee2 2828 ack_all_badblocks(&rdev->badblocks);
de393cde
N
2829 clear_bit(BlockedBadBlocks, &rdev->flags);
2830 wake_up(&rdev->blocked_wait);
2831 }
1da177e4 2832}
1aee41f6 2833EXPORT_SYMBOL(md_update_sb);
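/* Editor's note (not part of the driver source): md_update_sb() retries
 * at two levels - "rewrite" loops until md_super_wait() reports that the
 * superblock writes completed without error, while "repeat" restarts the
 * whole update if the clean/dirty state or the MD_SB_CHANGE_* flags
 * changed while the superblocks were being written out.
 */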
1da177e4 2834
a6da4ef8
GR
2835static int add_bound_rdev(struct md_rdev *rdev)
2836{
2837 struct mddev *mddev = rdev->mddev;
2838 int err = 0;
87d4d916 2839 bool add_journal = test_bit(Journal, &rdev->flags);
a6da4ef8 2840
87d4d916 2841 if (!mddev->pers->hot_remove_disk || add_journal) {
a6da4ef8
GR
2842 /* If there is hot_add_disk but no hot_remove_disk
2843 * then added disks are for geometry changes,
2844 * and should be added immediately.
2845 */
2846 super_types[mddev->major_version].
2847 validate_super(mddev, rdev);
87d4d916
SL
2848 if (add_journal)
2849 mddev_suspend(mddev);
a6da4ef8 2850 err = mddev->pers->hot_add_disk(mddev, rdev);
87d4d916
SL
2851 if (add_journal)
2852 mddev_resume(mddev);
a6da4ef8 2853 if (err) {
db767672 2854 md_kick_rdev_from_array(rdev);
a6da4ef8
GR
2855 return err;
2856 }
2857 }
2858 sysfs_notify_dirent_safe(rdev->sysfs_state);
2859
2953079c 2860 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
a6da4ef8
GR
2861 if (mddev->degraded)
2862 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
2863 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2864 md_new_event(mddev);
2865 md_wakeup_thread(mddev->thread);
2866 return 0;
2867}
1da177e4 2868
7f6ce769 2869/* words written to sysfs files may, or may not, be \n terminated.
bce74dac
N
2870 * We want to accept either case. For this we use cmd_match.
2871 */
2872static int cmd_match(const char *cmd, const char *str)
2873{
2874 /* See if cmd, written into a sysfs file, matches
2875 * str. They must either be the same, or cmd can
2876 * have a trailing newline
2877 */
2878 while (*cmd && *str && *cmd == *str) {
2879 cmd++;
2880 str++;
2881 }
2882 if (*cmd == '\n')
2883 cmd++;
2884 if (*str || *cmd)
2885 return 0;
2886 return 1;
2887}
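/* Editor's sketch (not part of the driver source): cmd_match() accepts
 * an exact match with or without a single trailing newline:
 *
 *   cmd_match("faulty\n", "faulty")  returns 1
 *   cmd_match("faulty",   "faulty")  returns 1
 *   cmd_match("faultyX",  "faulty")  returns 0  (trailing garbage)
 *   cmd_match("fault",    "faulty")  returns 0  (str not fully matched)
 */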
2888
86e6ffdd
N
2889struct rdev_sysfs_entry {
2890 struct attribute attr;
3cb03002
N
2891 ssize_t (*show)(struct md_rdev *, char *);
2892 ssize_t (*store)(struct md_rdev *, const char *, size_t);
86e6ffdd
N
2893};
2894
2895static ssize_t
3cb03002 2896state_show(struct md_rdev *rdev, char *page)
86e6ffdd 2897{
35b785f7 2898 char *sep = ",";
20a49ff6 2899 size_t len = 0;
6aa7de05 2900 unsigned long flags = READ_ONCE(rdev->flags);
86e6ffdd 2901
758bfc8a 2902 if (test_bit(Faulty, &flags) ||
dcbcb486
TM
2903 (!test_bit(ExternalBbl, &flags) &&
2904 rdev->badblocks.unacked_exist))
35b785f7
TM
2905 len += sprintf(page+len, "faulty%s", sep);
2906 if (test_bit(In_sync, &flags))
2907 len += sprintf(page+len, "in_sync%s", sep);
2908 if (test_bit(Journal, &flags))
2909 len += sprintf(page+len, "journal%s", sep);
2910 if (test_bit(WriteMostly, &flags))
2911 len += sprintf(page+len, "write_mostly%s", sep);
758bfc8a 2912 if (test_bit(Blocked, &flags) ||
52c64152 2913 (rdev->badblocks.unacked_exist
35b785f7
TM
2914 && !test_bit(Faulty, &flags)))
2915 len += sprintf(page+len, "blocked%s", sep);
758bfc8a 2916 if (!test_bit(Faulty, &flags) &&
f2076e7d 2917 !test_bit(Journal, &flags) &&
35b785f7
TM
2918 !test_bit(In_sync, &flags))
2919 len += sprintf(page+len, "spare%s", sep);
2920 if (test_bit(WriteErrorSeen, &flags))
2921 len += sprintf(page+len, "write_error%s", sep);
2922 if (test_bit(WantReplacement, &flags))
2923 len += sprintf(page+len, "want_replacement%s", sep);
2924 if (test_bit(Replacement, &flags))
2925 len += sprintf(page+len, "replacement%s", sep);
2926 if (test_bit(ExternalBbl, &flags))
2927 len += sprintf(page+len, "external_bbl%s", sep);
688834e6
N
2928 if (test_bit(FailFast, &flags))
2929 len += sprintf(page+len, "failfast%s", sep);
35b785f7
TM
2930
2931 if (len)
2932 len -= strlen(sep);
2d78f8c4 2933
86e6ffdd
N
2934 return len+sprintf(page+len, "\n");
2935}
2936
45dc2de1 2937static ssize_t
3cb03002 2938state_store(struct md_rdev *rdev, const char *buf, size_t len)
45dc2de1
N
2939{
2940 /* can write
de393cde 2941 * faulty - simulates an error
45dc2de1 2942 * remove - disconnects the device
f655675b
N
2943 * writemostly - sets write_mostly
2944 * -writemostly - clears write_mostly
de393cde
N
2945 * blocked - sets the Blocked flag
2946 * -blocked - clears the Blocked flag and possibly simulates an error
6d56e278 2947 * insync - sets Insync provided the device isn't active
f466722c
N
2948 * -insync - clear Insync for a device with a slot assigned,
2949 * so that it gets rebuilt based on bitmap
d7a9d443
N
2950 * write_error - sets WriteErrorSeen
2951 * -write_error - clears WriteErrorSeen
688834e6 2952 * {,-}failfast - set/clear FailFast
45dc2de1
N
2953 */
2954 int err = -EINVAL;
2955 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2956 md_error(rdev->mddev, rdev);
5ef56c8f
N
2957 if (test_bit(Faulty, &rdev->flags))
2958 err = 0;
2959 else
2960 err = -EBUSY;
45dc2de1 2961 } else if (cmd_match(buf, "remove")) {
5d881783
SL
2962 if (rdev->mddev->pers) {
2963 clear_bit(Blocked, &rdev->flags);
2964 remove_and_add_spares(rdev->mddev, rdev);
2965 }
45dc2de1
N
2966 if (rdev->raid_disk >= 0)
2967 err = -EBUSY;
2968 else {
fd01b88c 2969 struct mddev *mddev = rdev->mddev;
45dc2de1 2970 err = 0;
a9720903
GJ
2971 if (mddev_is_clustered(mddev))
2972 err = md_cluster_ops->remove_disk(mddev, rdev);
2973
2974 if (err == 0) {
2975 md_kick_rdev_from_array(rdev);
060b0689 2976 if (mddev->pers) {
2953079c 2977 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
060b0689
N
2978 md_wakeup_thread(mddev->thread);
2979 }
a9720903
GJ
2980 md_new_event(mddev);
2981 }
45dc2de1 2982 }
f655675b
N
2983 } else if (cmd_match(buf, "writemostly")) {
2984 set_bit(WriteMostly, &rdev->flags);
404659cf 2985 mddev_create_serial_pool(rdev->mddev, rdev, false);
f655675b
N
2986 err = 0;
2987 } else if (cmd_match(buf, "-writemostly")) {
11d3a9f6 2988 mddev_destroy_serial_pool(rdev->mddev, rdev, false);
f655675b 2989 clear_bit(WriteMostly, &rdev->flags);
6bfe0b49
DW
2990 err = 0;
2991 } else if (cmd_match(buf, "blocked")) {
2992 set_bit(Blocked, &rdev->flags);
2993 err = 0;
2994 } else if (cmd_match(buf, "-blocked")) {
de393cde 2995 if (!test_bit(Faulty, &rdev->flags) &&
dcbcb486 2996 !test_bit(ExternalBbl, &rdev->flags) &&
7da64a0a 2997 rdev->badblocks.unacked_exist) {
de393cde
N
2998 /* metadata handler doesn't understand badblocks,
2999 * so we need to fail the device
3000 */
3001 md_error(rdev->mddev, rdev);
3002 }
6bfe0b49 3003 clear_bit(Blocked, &rdev->flags);
de393cde 3004 clear_bit(BlockedBadBlocks, &rdev->flags);
6bfe0b49
DW
3005 wake_up(&rdev->blocked_wait);
3006 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3007 md_wakeup_thread(rdev->mddev->thread);
3008
6d56e278
N
3009 err = 0;
3010 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
3011 set_bit(In_sync, &rdev->flags);
f655675b 3012 err = 0;
688834e6
N
3013 } else if (cmd_match(buf, "failfast")) {
3014 set_bit(FailFast, &rdev->flags);
3015 err = 0;
3016 } else if (cmd_match(buf, "-failfast")) {
3017 clear_bit(FailFast, &rdev->flags);
3018 err = 0;
f2076e7d
SL
3019 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0 &&
3020 !test_bit(Journal, &rdev->flags)) {
e1960f8c
N
3021 if (rdev->mddev->pers == NULL) {
3022 clear_bit(In_sync, &rdev->flags);
3023 rdev->saved_raid_disk = rdev->raid_disk;
3024 rdev->raid_disk = -1;
3025 err = 0;
3026 }
d7a9d443
N
3027 } else if (cmd_match(buf, "write_error")) {
3028 set_bit(WriteErrorSeen, &rdev->flags);
3029 err = 0;
3030 } else if (cmd_match(buf, "-write_error")) {
3031 clear_bit(WriteErrorSeen, &rdev->flags);
3032 err = 0;
2d78f8c4
N
3033 } else if (cmd_match(buf, "want_replacement")) {
3034 /* Any non-spare device that is not a replacement can
3035 * become want_replacement at any time, but we then need to
3036 * check if recovery is needed.
3037 */
3038 if (rdev->raid_disk >= 0 &&
f2076e7d 3039 !test_bit(Journal, &rdev->flags) &&
2d78f8c4
N
3040 !test_bit(Replacement, &rdev->flags))
3041 set_bit(WantReplacement, &rdev->flags);
3042 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3043 md_wakeup_thread(rdev->mddev->thread);
3044 err = 0;
3045 } else if (cmd_match(buf, "-want_replacement")) {
3046 /* Clearing 'want_replacement' is always allowed.
3047 * Once a replacement starts it is too late though.
3048 */
3049 err = 0;
3050 clear_bit(WantReplacement, &rdev->flags);
3051 } else if (cmd_match(buf, "replacement")) {
3052 /* Can only set a device as a replacement when array has not
3053 * yet been started. Once running, replacement is automatic
3054 * from spares, or by assigning 'slot'.
3055 */
3056 if (rdev->mddev->pers)
3057 err = -EBUSY;
3058 else {
3059 set_bit(Replacement, &rdev->flags);
3060 err = 0;
3061 }
3062 } else if (cmd_match(buf, "-replacement")) {
3063 /* Similarly, can only clear Replacement before start */
3064 if (rdev->mddev->pers)
3065 err = -EBUSY;
3066 else {
3067 clear_bit(Replacement, &rdev->flags);
3068 err = 0;
3069 }
a6da4ef8 3070 } else if (cmd_match(buf, "re-add")) {
ee37e621
YY
3071 if (!rdev->mddev->pers)
3072 err = -EINVAL;
3073 else if (test_bit(Faulty, &rdev->flags) && (rdev->raid_disk == -1) &&
3074 rdev->saved_raid_disk >= 0) {
97f6cd39
GR
3075 /* clear_bit is performed _after_ all the devices
3076 * have their local Faulty bit cleared. If any writes
3077 * happen in the meantime in the local node, they
3078 * will land in the local bitmap, which will be synced
3079 * by this node eventually
3080 */
3081 if (!mddev_is_clustered(rdev->mddev) ||
3082 (err = md_cluster_ops->gather_bitmaps(rdev)) == 0) {
3083 clear_bit(Faulty, &rdev->flags);
3084 err = add_bound_rdev(rdev);
3085 }
a6da4ef8
GR
3086 } else
3087 err = -EBUSY;
35b785f7
TM
3088 } else if (cmd_match(buf, "external_bbl") && (rdev->mddev->external)) {
3089 set_bit(ExternalBbl, &rdev->flags);
3090 rdev->badblocks.shift = 0;
3091 err = 0;
3092 } else if (cmd_match(buf, "-external_bbl") && (rdev->mddev->external)) {
3093 clear_bit(ExternalBbl, &rdev->flags);
3094 err = 0;
45dc2de1 3095 }
00bcb4ac
N
3096 if (!err)
3097 sysfs_notify_dirent_safe(rdev->sysfs_state);
45dc2de1
N
3098 return err ? err : len;
3099}
80ca3a44 3100static struct rdev_sysfs_entry rdev_state =
750f199e 3101__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store);
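/* Editor's sketch (not part of the driver source): the state tokens
 * above arrive as sysfs writes from userspace, e.g. to fail and then
 * remove a member device (names are illustrative):
 *
 *   echo faulty > /sys/block/md0/md/dev-sdb1/state
 *   echo remove > /sys/block/md0/md/dev-sdb1/state
 */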
86e6ffdd 3102
4dbcdc75 3103static ssize_t
3cb03002 3104errors_show(struct md_rdev *rdev, char *page)
4dbcdc75
N
3105{
3106 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
3107}
3108
3109static ssize_t
3cb03002 3110errors_store(struct md_rdev *rdev, const char *buf, size_t len)
4dbcdc75 3111{
4c9309c0
AD
3112 unsigned int n;
3113 int rv;
3114
3115 rv = kstrtouint(buf, 10, &n);
3116 if (rv < 0)
3117 return rv;
3118 atomic_set(&rdev->corrected_errors, n);
3119 return len;
4dbcdc75
N
3120}
3121static struct rdev_sysfs_entry rdev_errors =
80ca3a44 3122__ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
4dbcdc75 3123
014236d2 3124static ssize_t
3cb03002 3125slot_show(struct md_rdev *rdev, char *page)
014236d2 3126{
f2076e7d
SL
3127 if (test_bit(Journal, &rdev->flags))
3128 return sprintf(page, "journal\n");
3129 else if (rdev->raid_disk < 0)
014236d2
N
3130 return sprintf(page, "none\n");
3131 else
3132 return sprintf(page, "%d\n", rdev->raid_disk);
3133}
3134
3135static ssize_t
3cb03002 3136slot_store(struct md_rdev *rdev, const char *buf, size_t len)
014236d2 3137{
4c9309c0 3138 int slot;
c303da6d 3139 int err;
4c9309c0 3140
f2076e7d
SL
3141 if (test_bit(Journal, &rdev->flags))
3142 return -EBUSY;
014236d2
N
3143 if (strncmp(buf, "none", 4)==0)
3144 slot = -1;
4c9309c0
AD
3145 else {
3146 err = kstrtouint(buf, 10, (unsigned int *)&slot);
3147 if (err < 0)
3148 return err;
3149 }
6c2fce2e 3150 if (rdev->mddev->pers && slot == -1) {
c303da6d
N
3151 /* Setting 'slot' on an active array requires also
3152 * updating the 'rd%d' link, and communicating
3153 * with the personality with ->hot_*_disk.
3154 * For now we only support removing
3155 * failed/spare devices. This normally happens automatically,
3156 * but not when the metadata is externally managed.
3157 */
c303da6d
N
3158 if (rdev->raid_disk == -1)
3159 return -EEXIST;
3160 /* personality does all needed checks */
01393f3d 3161 if (rdev->mddev->pers->hot_remove_disk == NULL)
c303da6d 3162 return -EINVAL;
746d3207
N
3163 clear_bit(Blocked, &rdev->flags);
3164 remove_and_add_spares(rdev->mddev, rdev);
3165 if (rdev->raid_disk >= 0)
3166 return -EBUSY;
c303da6d
N
3167 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
3168 md_wakeup_thread(rdev->mddev->thread);
6c2fce2e 3169 } else if (rdev->mddev->pers) {
6c2fce2e 3170 /* Activating a spare .. or possibly reactivating
6d56e278 3171 * if we ever get bitmaps working here.
6c2fce2e 3172 */
cb01c549 3173 int err;
6c2fce2e
NB
3174
3175 if (rdev->raid_disk != -1)
3176 return -EBUSY;
3177
c6751b2b
N
3178 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
3179 return -EBUSY;
3180
6c2fce2e
NB
3181 if (rdev->mddev->pers->hot_add_disk == NULL)
3182 return -EINVAL;
3183
ba1b41b6
N
3184 if (slot >= rdev->mddev->raid_disks &&
3185 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
3186 return -ENOSPC;
3187
6c2fce2e
NB
3188 rdev->raid_disk = slot;
3189 if (test_bit(In_sync, &rdev->flags))
3190 rdev->saved_raid_disk = slot;
3191 else
3192 rdev->saved_raid_disk = -1;
d30519fc 3193 clear_bit(In_sync, &rdev->flags);
8313b8e5 3194 clear_bit(Bitmap_sync, &rdev->flags);
3f79cc22 3195 err = rdev->mddev->pers->hot_add_disk(rdev->mddev, rdev);
cb01c549
GR
3196 if (err) {
3197 rdev->raid_disk = -1;
3198 return err;
3199 } else
3200 sysfs_notify_dirent_safe(rdev->sysfs_state);
3201 if (sysfs_link_rdev(rdev->mddev, rdev))
3202 /* failure here is OK */;
6c2fce2e 3203 /* don't wakeup anyone, leave that to userspace. */
c303da6d 3204 } else {
ba1b41b6
N
3205 if (slot >= rdev->mddev->raid_disks &&
3206 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
c303da6d
N
3207 return -ENOSPC;
3208 rdev->raid_disk = slot;
3209 /* assume it is working */
c5d79adb
N
3210 clear_bit(Faulty, &rdev->flags);
3211 clear_bit(WriteMostly, &rdev->flags);
c303da6d 3212 set_bit(In_sync, &rdev->flags);
00bcb4ac 3213 sysfs_notify_dirent_safe(rdev->sysfs_state);
c303da6d 3214 }
014236d2
N
3215 return len;
3216}
3217
014236d2 3218static struct rdev_sysfs_entry rdev_slot =
80ca3a44 3219__ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
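/* Editorial example (not part of the original source): on an array
 * with externally managed metadata, userspace detaches a failed or
 * spare member by writing "none" to its slot file (hypothetical
 * path, depends on array/member names):
 *
 *   echo none > /sys/block/md0/md/dev-sda1/slot
 *
 * which takes the "mddev->pers && slot == -1" branch in
 * slot_store() above.
 */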
014236d2 3220
93c8cad0 3221static ssize_t
3cb03002 3222offset_show(struct md_rdev *rdev, char *page)
93c8cad0 3223{
6961ece4 3224 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
93c8cad0
N
3225}
3226
3227static ssize_t
3cb03002 3228offset_store(struct md_rdev *rdev, const char *buf, size_t len)
93c8cad0 3229{
c6563a8c 3230 unsigned long long offset;
b29bebd6 3231 if (kstrtoull(buf, 10, &offset) < 0)
93c8cad0 3232 return -EINVAL;
8ed0a521 3233 if (rdev->mddev->pers && rdev->raid_disk >= 0)
93c8cad0 3234 return -EBUSY;
dd8ac336 3235 if (rdev->sectors && rdev->mddev->external)
c5d79adb
N
3236 /* Must set offset before size, so overlap checks
3237 * can be sane */
3238 return -EBUSY;
93c8cad0 3239 rdev->data_offset = offset;
25f7fd47 3240 rdev->new_data_offset = offset;
93c8cad0
N
3241 return len;
3242}
3243
3244static struct rdev_sysfs_entry rdev_offset =
80ca3a44 3245__ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
93c8cad0 3246
c6563a8c
N
3247static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
3248{
3249 return sprintf(page, "%llu\n",
3250 (unsigned long long)rdev->new_data_offset);
3251}
3252
3253static ssize_t new_offset_store(struct md_rdev *rdev,
3254 const char *buf, size_t len)
3255{
3256 unsigned long long new_offset;
3257 struct mddev *mddev = rdev->mddev;
3258
b29bebd6 3259 if (kstrtoull(buf, 10, &new_offset) < 0)
c6563a8c
N
3260 return -EINVAL;
3261
f851b60d
N
3262 if (mddev->sync_thread ||
3263 test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
c6563a8c
N
3264 return -EBUSY;
3265 if (new_offset == rdev->data_offset)
3266 /* reset is always permitted */
3267 ;
3268 else if (new_offset > rdev->data_offset) {
3269 /* must not push array size beyond rdev_sectors */
3270 if (new_offset - rdev->data_offset
3271 + mddev->dev_sectors > rdev->sectors)
3272 return -E2BIG;
3273 }
3274 /* Metadata worries about other space details. */
3275
3276 /* decreasing the offset is inconsistent with a backwards
3277 * reshape.
3278 */
3279 if (new_offset < rdev->data_offset &&
3280 mddev->reshape_backwards)
3281 return -EINVAL;
3282 /* Increasing offset is inconsistent with forwards
3283 * reshape. reshape_direction should be set to
3284 * 'backwards' first.
3285 */
3286 if (new_offset > rdev->data_offset &&
3287 !mddev->reshape_backwards)
3288 return -EINVAL;
3289
3290 if (mddev->pers && mddev->persistent &&
3291 !super_types[mddev->major_version]
3292 .allow_new_offset(rdev, new_offset))
3293 return -E2BIG;
3294 rdev->new_data_offset = new_offset;
3295 if (new_offset > rdev->data_offset)
3296 mddev->reshape_backwards = 1;
3297 else if (new_offset < rdev->data_offset)
3298 mddev->reshape_backwards = 0;
3299
3300 return len;
3301}
3302static struct rdev_sysfs_entry rdev_new_offset =
3303__ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
3304
83303b61 3305static ssize_t
3cb03002 3306rdev_size_show(struct md_rdev *rdev, char *page)
83303b61 3307{
dd8ac336 3308 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
83303b61
N
3309}
3310
c5d79adb
N
3311static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
3312{
3313 /* check if two start/length pairs overlap */
3314 if (s1+l1 <= s2)
3315 return 0;
3316 if (s2+l2 <= s1)
3317 return 0;
3318 return 1;
3319}
3320
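/* Editorial sketch (not part of the original source): a minimal
 * stand-alone restatement of the overlap rule above, with sector_t
 * stood in by unsigned long long.  Ranges that merely touch
 * end-to-start do not count as overlapping.
 */
#if 0	/* illustration only, never compiled into md */
#include <assert.h>

static int overlaps_example(unsigned long long s1, unsigned long long l1,
			    unsigned long long s2, unsigned long long l2)
{
	return !(s1 + l1 <= s2 || s2 + l2 <= s1);
}

static void overlaps_selftest(void)
{
	assert(!overlaps_example(0, 100, 100, 50));	/* touching: no overlap */
	assert(overlaps_example(0, 100, 50, 100));	/* [50,100) is shared */
	assert(overlaps_example(50, 10, 0, 100));	/* containment overlaps */
}
#endif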
b522adcd
DW
3321static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
3322{
3323 unsigned long long blocks;
3324 sector_t new;
3325
b29bebd6 3326 if (kstrtoull(buf, 10, &blocks) < 0)
b522adcd
DW
3327 return -EINVAL;
3328
3329 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
3330 return -EINVAL; /* sector conversion overflow */
3331
3332 new = blocks * 2;
3333 if (new != blocks * 2)
3334 return -EINVAL; /* unsigned long long to sector_t overflow */
3335
3336 *sectors = new;
3337 return 0;
3338}
3339
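/* Editorial worked example (not part of the original source):
 * sizes in sysfs are in 1K blocks while md works in 512-byte
 * sectors, so the helper above doubles the value and rejects any
 * input whose top bit is set (it would overflow when doubled):
 *
 *   "1024"                -> 2048 sectors (1 MiB)
 *   "9223372036854775808" -> -EINVAL (bit 63 set)
 */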
83303b61 3340static ssize_t
3cb03002 3341rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
83303b61 3342{
fd01b88c 3343 struct mddev *my_mddev = rdev->mddev;
dd8ac336 3344 sector_t oldsectors = rdev->sectors;
b522adcd 3345 sector_t sectors;
27c529bb 3346
f2076e7d
SL
3347 if (test_bit(Journal, &rdev->flags))
3348 return -EBUSY;
b522adcd 3349 if (strict_blocks_to_sectors(buf, &sectors) < 0)
d7027458 3350 return -EINVAL;
c6563a8c
N
3351 if (rdev->data_offset != rdev->new_data_offset)
3352 return -EINVAL; /* too confusing */
0cd17fec 3353 if (my_mddev->pers && rdev->raid_disk >= 0) {
d7027458 3354 if (my_mddev->persistent) {
dd8ac336
AN
3355 sectors = super_types[my_mddev->major_version].
3356 rdev_size_change(rdev, sectors);
3357 if (!sectors)
0cd17fec 3358 return -EBUSY;
dd8ac336 3359 } else if (!sectors)
77304d2a 3360 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
dd8ac336 3361 rdev->data_offset;
a6468539
N
3362 if (!my_mddev->pers->resize)
3363 /* Cannot change size for RAID0 or Linear etc */
3364 return -EINVAL;
0cd17fec 3365 }
dd8ac336 3366 if (sectors < my_mddev->dev_sectors)
7d3c6f87 3367 return -EINVAL; /* component must fit device */
0cd17fec 3368
dd8ac336
AN
3369 rdev->sectors = sectors;
3370 if (sectors > oldsectors && my_mddev->external) {
8b1afc3d
N
3371 /* Need to check that all other rdevs with the same
3372 * ->bdev do not overlap. 'rcu' is sufficient to walk
3373 * the rdev lists safely.
3374 * This check does not provide a hard guarantee, it
3375 * just helps avoid dangerous mistakes.
c5d79adb 3376 */
fd01b88c 3377 struct mddev *mddev;
c5d79adb 3378 int overlap = 0;
159ec1fc 3379 struct list_head *tmp;
c5d79adb 3380
8b1afc3d 3381 rcu_read_lock();
29ac4aa3 3382 for_each_mddev(mddev, tmp) {
3cb03002 3383 struct md_rdev *rdev2;
c5d79adb 3384
dafb20fa 3385 rdev_for_each(rdev2, mddev)
f21e9ff7
N
3386 if (rdev->bdev == rdev2->bdev &&
3387 rdev != rdev2 &&
3388 overlaps(rdev->data_offset, rdev->sectors,
3389 rdev2->data_offset,
3390 rdev2->sectors)) {
c5d79adb
N
3391 overlap = 1;
3392 break;
3393 }
c5d79adb
N
3394 if (overlap) {
3395 mddev_put(mddev);
3396 break;
3397 }
3398 }
8b1afc3d 3399 rcu_read_unlock();
c5d79adb
N
3400 if (overlap) {
3401 /* Someone else could have slipped in a size
3402 * change here, but doing so is just silly.
dd8ac336 3403 * We put oldsectors back because we *know* it is
c5d79adb
N
3404 * safe, and trust userspace not to race with
3405 * itself
3406 */
dd8ac336 3407 rdev->sectors = oldsectors;
c5d79adb
N
3408 return -EBUSY;
3409 }
3410 }
83303b61
N
3411 return len;
3412}
3413
3414static struct rdev_sysfs_entry rdev_size =
80ca3a44 3415__ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
83303b61 3416
3cb03002 3417static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
06e3c817
DW
3418{
3419 unsigned long long recovery_start = rdev->recovery_offset;
3420
3421 if (test_bit(In_sync, &rdev->flags) ||
3422 recovery_start == MaxSector)
3423 return sprintf(page, "none\n");
3424
3425 return sprintf(page, "%llu\n", recovery_start);
3426}
3427
3cb03002 3428static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
06e3c817
DW
3429{
3430 unsigned long long recovery_start;
3431
3432 if (cmd_match(buf, "none"))
3433 recovery_start = MaxSector;
b29bebd6 3434 else if (kstrtoull(buf, 10, &recovery_start))
06e3c817
DW
3435 return -EINVAL;
3436
3437 if (rdev->mddev->pers &&
3438 rdev->raid_disk >= 0)
3439 return -EBUSY;
3440
3441 rdev->recovery_offset = recovery_start;
3442 if (recovery_start == MaxSector)
3443 set_bit(In_sync, &rdev->flags);
3444 else
3445 clear_bit(In_sync, &rdev->flags);
3446 return len;
3447}
3448
3449static struct rdev_sysfs_entry rdev_recovery_start =
3450__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3451
fc974ee2
VV
3452/* sysfs access to bad-blocks list.
3453 * We present two files.
3454 * 'bad-blocks' lists sector numbers and lengths of ranges that
3455 * are recorded as bad. The list is truncated to fit within
3456 * the one-page limit of sysfs.
3457 * Writing "sector length" to this file adds an acknowledged
3458 * bad block to the list.
3459 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
3460 * been acknowledged. Writing to this file adds bad blocks
3461 * without acknowledging them. This is largely for testing.
3462 */
3cb03002 3463static ssize_t bb_show(struct md_rdev *rdev, char *page)
16c791a5
N
3464{
3465 return badblocks_show(&rdev->badblocks, page, 0);
3466}
3cb03002 3467static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
16c791a5 3468{
de393cde
N
3469 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3470 /* Maybe that ack was all we needed */
3471 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3472 wake_up(&rdev->blocked_wait);
3473 return rv;
16c791a5
N
3474}
3475static struct rdev_sysfs_entry rdev_bad_blocks =
3476__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3477
3cb03002 3478static ssize_t ubb_show(struct md_rdev *rdev, char *page)
16c791a5
N
3479{
3480 return badblocks_show(&rdev->badblocks, page, 1);
3481}
3cb03002 3482static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
16c791a5
N
3483{
3484 return badblocks_store(&rdev->badblocks, page, len, 1);
3485}
3486static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3487__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
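/* Editorial sketch (not part of the original source): recording a
 * bad range from userspace by writing "sector length" to the
 * per-rdev bad_blocks file.  The path below is hypothetical; it
 * depends on the array and member names.
 */
#if 0	/* illustration only */
#include <stdio.h>

static int mark_bad_example(void)
{
	FILE *f = fopen("/sys/block/md0/md/dev-sda1/bad_blocks", "w");

	if (!f)
		return -1;
	fprintf(f, "2048 8\n");	/* 8 sectors bad, starting at sector 2048 */
	return fclose(f);
}
#endif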
3488
664aed04
AP
3489static ssize_t
3490ppl_sector_show(struct md_rdev *rdev, char *page)
3491{
3492 return sprintf(page, "%llu\n", (unsigned long long)rdev->ppl.sector);
3493}
3494
3495static ssize_t
3496ppl_sector_store(struct md_rdev *rdev, const char *buf, size_t len)
3497{
3498 unsigned long long sector;
3499
3500 if (kstrtoull(buf, 10, &sector) < 0)
3501 return -EINVAL;
3502 if (sector != (sector_t)sector)
3503 return -EINVAL;
3504
3505 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3506 rdev->raid_disk >= 0)
3507 return -EBUSY;
3508
3509 if (rdev->mddev->persistent) {
3510 if (rdev->mddev->major_version == 0)
3511 return -EINVAL;
3512 if ((sector > rdev->sb_start &&
3513 sector - rdev->sb_start > S16_MAX) ||
3514 (sector < rdev->sb_start &&
3515 rdev->sb_start - sector > -S16_MIN))
3516 return -EINVAL;
3517 rdev->ppl.offset = sector - rdev->sb_start;
3518 } else if (!rdev->mddev->external) {
3519 return -EBUSY;
3520 }
3521 rdev->ppl.sector = sector;
3522 return len;
3523}
3524
3525static struct rdev_sysfs_entry rdev_ppl_sector =
3526__ATTR(ppl_sector, S_IRUGO|S_IWUSR, ppl_sector_show, ppl_sector_store);
3527
3528static ssize_t
3529ppl_size_show(struct md_rdev *rdev, char *page)
3530{
3531 return sprintf(page, "%u\n", rdev->ppl.size);
3532}
3533
3534static ssize_t
3535ppl_size_store(struct md_rdev *rdev, const char *buf, size_t len)
3536{
3537 unsigned int size;
3538
3539 if (kstrtouint(buf, 10, &size) < 0)
3540 return -EINVAL;
3541
3542 if (rdev->mddev->pers && test_bit(MD_HAS_PPL, &rdev->mddev->flags) &&
3543 rdev->raid_disk >= 0)
3544 return -EBUSY;
3545
3546 if (rdev->mddev->persistent) {
3547 if (rdev->mddev->major_version == 0)
3548 return -EINVAL;
3549 if (size > U16_MAX)
3550 return -EINVAL;
3551 } else if (!rdev->mddev->external) {
3552 return -EBUSY;
3553 }
3554 rdev->ppl.size = size;
3555 return len;
3556}
3557
3558static struct rdev_sysfs_entry rdev_ppl_size =
3559__ATTR(ppl_size, S_IRUGO|S_IWUSR, ppl_size_show, ppl_size_store);
3560
86e6ffdd
N
3561static struct attribute *rdev_default_attrs[] = {
3562 &rdev_state.attr,
4dbcdc75 3563 &rdev_errors.attr,
014236d2 3564 &rdev_slot.attr,
93c8cad0 3565 &rdev_offset.attr,
c6563a8c 3566 &rdev_new_offset.attr,
83303b61 3567 &rdev_size.attr,
06e3c817 3568 &rdev_recovery_start.attr,
16c791a5
N
3569 &rdev_bad_blocks.attr,
3570 &rdev_unack_bad_blocks.attr,
664aed04
AP
3571 &rdev_ppl_sector.attr,
3572 &rdev_ppl_size.attr,
86e6ffdd
N
3573 NULL,
3574};
3575static ssize_t
3576rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3577{
3578 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3cb03002 3579 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
86e6ffdd
N
3580
3581 if (!entry->show)
3582 return -EIO;
758bfc8a 3583 if (!rdev->mddev)
168b305b 3584 return -ENODEV;
758bfc8a 3585 return entry->show(rdev, page);
86e6ffdd
N
3586}
3587
3588static ssize_t
3589rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3590 const char *page, size_t length)
3591{
3592 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3cb03002 3593 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
27c529bb 3594 ssize_t rv;
fd01b88c 3595 struct mddev *mddev = rdev->mddev;
86e6ffdd
N
3596
3597 if (!entry->store)
3598 return -EIO;
67463acb
N
3599 if (!capable(CAP_SYS_ADMIN))
3600 return -EACCES;
c42d3240 3601 rv = mddev ? mddev_lock(mddev) : -ENODEV;
ca388059 3602 if (!rv) {
27c529bb 3603 if (rdev->mddev == NULL)
c42d3240 3604 rv = -ENODEV;
27c529bb
N
3605 else
3606 rv = entry->store(rdev, page, length);
6a51830e 3607 mddev_unlock(mddev);
ca388059
N
3608 }
3609 return rv;
86e6ffdd
N
3610}
3611
3612static void rdev_free(struct kobject *ko)
3613{
3cb03002 3614 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
86e6ffdd
N
3615 kfree(rdev);
3616}
52cf25d0 3617static const struct sysfs_ops rdev_sysfs_ops = {
86e6ffdd
N
3618 .show = rdev_attr_show,
3619 .store = rdev_attr_store,
3620};
3621static struct kobj_type rdev_ktype = {
3622 .release = rdev_free,
3623 .sysfs_ops = &rdev_sysfs_ops,
3624 .default_attrs = rdev_default_attrs,
3625};
3626
3cb03002 3627int md_rdev_init(struct md_rdev *rdev)
e8bb9a83
N
3628{
3629 rdev->desc_nr = -1;
3630 rdev->saved_raid_disk = -1;
3631 rdev->raid_disk = -1;
3632 rdev->flags = 0;
3633 rdev->data_offset = 0;
c6563a8c 3634 rdev->new_data_offset = 0;
e8bb9a83 3635 rdev->sb_events = 0;
0e3ef49e 3636 rdev->last_read_error = 0;
2699b672
N
3637 rdev->sb_loaded = 0;
3638 rdev->bb_page = NULL;
e8bb9a83
N
3639 atomic_set(&rdev->nr_pending, 0);
3640 atomic_set(&rdev->read_errors, 0);
3641 atomic_set(&rdev->corrected_errors, 0);
3642
3643 INIT_LIST_HEAD(&rdev->same_set);
3644 init_waitqueue_head(&rdev->blocked_wait);
2230dfe4
N
3645
3646 /* Add space to store bad block list.
3647 * This reserves the space even on arrays where it cannot
3648 * be used - I wonder if that matters
3649 */
fc974ee2 3650 return badblocks_init(&rdev->badblocks, 0);
e8bb9a83
N
3651}
3652EXPORT_SYMBOL_GPL(md_rdev_init);
1da177e4
LT
3653/*
3654 * Import a device. If 'super_format' >= 0, then sanity check the superblock
3655 *
3656 * mark the device faulty if:
3657 *
3658 * - the device is nonexistent (zero size)
3659 * - the device has no valid superblock
3660 *
3661 * a faulty rdev _never_ has rdev->sb set.
3662 */
3cb03002 3663static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
1da177e4
LT
3664{
3665 char b[BDEVNAME_SIZE];
3666 int err;
3cb03002 3667 struct md_rdev *rdev;
1da177e4
LT
3668 sector_t size;
3669
9ffae0cf 3670 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
9d48739e 3671 if (!rdev)
1da177e4 3672 return ERR_PTR(-ENOMEM);
1da177e4 3673
2230dfe4
N
3674 err = md_rdev_init(rdev);
3675 if (err)
3676 goto abort_free;
3677 err = alloc_disk_sb(rdev);
3678 if (err)
1da177e4
LT
3679 goto abort_free;
3680
c5d79adb 3681 err = lock_rdev(rdev, newdev, super_format == -2);
1da177e4
LT
3682 if (err)
3683 goto abort_free;
3684
f9cb074b 3685 kobject_init(&rdev->kobj, &rdev_ktype);
86e6ffdd 3686
77304d2a 3687 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
1da177e4 3688 if (!size) {
9d48739e 3689 pr_warn("md: %s has zero or unknown size, marking faulty!\n",
1da177e4
LT
3690 bdevname(rdev->bdev,b));
3691 err = -EINVAL;
3692 goto abort_free;
3693 }
3694
3695 if (super_format >= 0) {
3696 err = super_types[super_format].
3697 load_super(rdev, NULL, super_minor);
3698 if (err == -EINVAL) {
9d48739e 3699 pr_warn("md: %s does not have a valid v%d.%d superblock, not importing!\n",
df968c4e 3700 bdevname(rdev->bdev,b),
9d48739e 3701 super_format, super_minor);
1da177e4
LT
3702 goto abort_free;
3703 }
3704 if (err < 0) {
9d48739e 3705 pr_warn("md: could not read %s's sb, not importing!\n",
1da177e4
LT
3706 bdevname(rdev->bdev,b));
3707 goto abort_free;
3708 }
3709 }
6bfe0b49 3710
1da177e4
LT
3711 return rdev;
3712
3713abort_free:
2699b672
N
3714 if (rdev->bdev)
3715 unlock_rdev(rdev);
545c8795 3716 md_rdev_clear(rdev);
1da177e4
LT
3717 kfree(rdev);
3718 return ERR_PTR(err);
3719}
3720
3721/*
3722 * Check a full RAID array for plausibility
3723 */
3724
6a5cb53a 3725static int analyze_sbs(struct mddev *mddev)
1da177e4
LT
3726{
3727 int i;
3cb03002 3728 struct md_rdev *rdev, *freshest, *tmp;
1da177e4
LT
3729 char b[BDEVNAME_SIZE];
3730
3731 freshest = NULL;
dafb20fa 3732 rdev_for_each_safe(rdev, tmp, mddev)
1da177e4
LT
3733 switch (super_types[mddev->major_version].
3734 load_super(rdev, freshest, mddev->minor_version)) {
3735 case 1:
3736 freshest = rdev;
3737 break;
3738 case 0:
3739 break;
3740 default:
9d48739e 3741 pr_warn("md: fatal superblock inconsistency in %s -- removing from array\n",
1da177e4 3742 bdevname(rdev->bdev,b));
fb56dfef 3743 md_kick_rdev_from_array(rdev);
1da177e4
LT
3744 }
3745
6a5cb53a
YY
3746 /* Cannot find a valid fresh disk */
3747 if (!freshest) {
3748 pr_warn("md: cannot find a valid disk\n");
3749 return -EINVAL;
3750 }
3751
1da177e4
LT
3752 super_types[mddev->major_version].
3753 validate_super(mddev, freshest);
3754
3755 i = 0;
dafb20fa 3756 rdev_for_each_safe(rdev, tmp, mddev) {
233fca36
N
3757 if (mddev->max_disks &&
3758 (rdev->desc_nr >= mddev->max_disks ||
3759 i > mddev->max_disks)) {
9d48739e
N
3760 pr_warn("md: %s: %s: only %d devices permitted\n",
3761 mdname(mddev), bdevname(rdev->bdev, b),
3762 mddev->max_disks);
fb56dfef 3763 md_kick_rdev_from_array(rdev);
de01dfad
N
3764 continue;
3765 }
1aee41f6 3766 if (rdev != freshest) {
1da177e4
LT
3767 if (super_types[mddev->major_version].
3768 validate_super(mddev, rdev)) {
9d48739e 3769 pr_warn("md: kicking non-fresh %s from array!\n",
1da177e4 3770 bdevname(rdev->bdev,b));
fb56dfef 3771 md_kick_rdev_from_array(rdev);
1da177e4
LT
3772 continue;
3773 }
1aee41f6 3774 }
1da177e4
LT
3775 if (mddev->level == LEVEL_MULTIPATH) {
3776 rdev->desc_nr = i++;
3777 rdev->raid_disk = rdev->desc_nr;
b2d444d7 3778 set_bit(In_sync, &rdev->flags);
f2076e7d
SL
3779 } else if (rdev->raid_disk >=
3780 (mddev->raid_disks - min(0, mddev->delta_disks)) &&
3781 !test_bit(Journal, &rdev->flags)) {
a778b73f
N
3782 rdev->raid_disk = -1;
3783 clear_bit(In_sync, &rdev->flags);
1da177e4
LT
3784 }
3785 }
6a5cb53a
YY
3786
3787 return 0;
1da177e4
LT
3788}
3789
72e02075
N
3790/* Read a fixed-point number.
3791 * Numbers in sysfs attributes should be in "standard" units where
3792 * possible, so time should be in seconds.
f72ffdd6 3793 * However we internally use a much smaller unit such as
72e02075
N
3794 * milliseconds or jiffies.
3795 * This function takes a decimal number with a possible fractional
3796 * component, and produces an integer which is the result of
3797 * multiplying that number by 10^'scale'.
3798 * all without any floating-point arithmetic.
3799 */
3800int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3801{
3802 unsigned long result = 0;
3803 long decimals = -1;
3804 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3805 if (*cp == '.')
3806 decimals = 0;
3807 else if (decimals < scale) {
3808 unsigned int value;
3809 value = *cp - '0';
3810 result = result * 10 + value;
3811 if (decimals >= 0)
3812 decimals++;
3813 }
3814 cp++;
3815 }
3816 if (*cp == '\n')
3817 cp++;
3818 if (*cp)
3819 return -EINVAL;
3820 if (decimals < 0)
3821 decimals = 0;
cf891607 3822 *res = result * int_pow(10, scale - decimals);
72e02075
N
3823 return 0;
3824}
3825
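/* Editorial worked example (not part of the original source),
 * with scale = 3 as safe_delay_store() below uses to turn seconds
 * into milliseconds:
 *
 *   "12.34" -> 1234 * 10^(3-2) = 12340
 *   "0.2"   ->    2 * 10^(3-1) =   200
 *   "5"     ->    5 * 10^3     =  5000
 *
 * Digits beyond the scale are dropped, not rounded, so "0.20015"
 * also yields 200.
 */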
16f17b39 3826static ssize_t
fd01b88c 3827safe_delay_show(struct mddev *mddev, char *page)
16f17b39
N
3828{
3829 int msec = (mddev->safemode_delay*1000)/HZ;
3830 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3831}
3832static ssize_t
fd01b88c 3833safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
16f17b39 3834{
16f17b39 3835 unsigned long msec;
97ce0a7f 3836
28c1b9fd 3837 if (mddev_is_clustered(mddev)) {
9d48739e 3838 pr_warn("md: Safemode is disabled for clustered mode\n");
28c1b9fd
GR
3839 return -EINVAL;
3840 }
3841
72e02075 3842 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
16f17b39 3843 return -EINVAL;
16f17b39
N
3844 if (msec == 0)
3845 mddev->safemode_delay = 0;
3846 else {
19052c0e 3847 unsigned long old_delay = mddev->safemode_delay;
1b30e66f
N
3848 unsigned long new_delay = (msec*HZ)/1000;
3849
3850 if (new_delay == 0)
3851 new_delay = 1;
3852 mddev->safemode_delay = new_delay;
3853 if (new_delay < old_delay || old_delay == 0)
3854 mod_timer(&mddev->safemode_timer, jiffies+1);
16f17b39
N
3855 }
3856 return len;
3857}
3858static struct md_sysfs_entry md_safe_delay =
80ca3a44 3859__ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
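/* Editorial worked example (not part of the original source):
 * safe_mode_delay is written in (fractional) seconds but stored in
 * jiffies.  Assuming HZ=250, writing "0.2" parses to msec = 200,
 * stores new_delay = 200*250/1000 = 50 jiffies, and reads back as
 * "0.200".
 */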
16f17b39 3860
eae1701f 3861static ssize_t
fd01b88c 3862level_show(struct mddev *mddev, char *page)
eae1701f 3863{
36d091f4
N
3864 struct md_personality *p;
3865 int ret;
3866 spin_lock(&mddev->lock);
3867 p = mddev->pers;
d9d166c2 3868 if (p)
36d091f4 3869 ret = sprintf(page, "%s\n", p->name);
d9d166c2 3870 else if (mddev->clevel[0])
36d091f4 3871 ret = sprintf(page, "%s\n", mddev->clevel);
d9d166c2 3872 else if (mddev->level != LEVEL_NONE)
36d091f4 3873 ret = sprintf(page, "%d\n", mddev->level);
d9d166c2 3874 else
36d091f4
N
3875 ret = 0;
3876 spin_unlock(&mddev->lock);
3877 return ret;
eae1701f
N
3878}
3879
d9d166c2 3880static ssize_t
fd01b88c 3881level_store(struct mddev *mddev, const char *buf, size_t len)
d9d166c2 3882{
f2859af6 3883 char clevel[16];
6791875e
N
3884 ssize_t rv;
3885 size_t slen = len;
db721d32 3886 struct md_personality *pers, *oldpers;
f2859af6 3887 long level;
db721d32 3888 void *priv, *oldpriv;
3cb03002 3889 struct md_rdev *rdev;
245f46c2 3890
6791875e
N
3891 if (slen == 0 || slen >= sizeof(clevel))
3892 return -EINVAL;
3893
3894 rv = mddev_lock(mddev);
3895 if (rv)
3896 return rv;
3897
245f46c2 3898 if (mddev->pers == NULL) {
6791875e
N
3899 strncpy(mddev->clevel, buf, slen);
3900 if (mddev->clevel[slen-1] == '\n')
3901 slen--;
3902 mddev->clevel[slen] = 0;
245f46c2 3903 mddev->level = LEVEL_NONE;
6791875e
N
3904 rv = len;
3905 goto out_unlock;
245f46c2 3906 }
6791875e 3907 rv = -EROFS;
bd8839e0 3908 if (mddev->ro)
6791875e 3909 goto out_unlock;
245f46c2
N
3910
3911 /* request to change the personality. Need to ensure:
3912 * - array is not engaged in resync/recovery/reshape
3913 * - old personality can be suspended
3914 * - new personality can take over the existing array.
3915 */
3916
6791875e 3917 rv = -EBUSY;
bb4f1e9d 3918 if (mddev->sync_thread ||
f851b60d 3919 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
bb4f1e9d
N
3920 mddev->reshape_position != MaxSector ||
3921 mddev->sysfs_active)
6791875e 3922 goto out_unlock;
245f46c2 3923
6791875e 3924 rv = -EINVAL;
245f46c2 3925 if (!mddev->pers->quiesce) {
9d48739e
N
3926 pr_warn("md: %s: %s does not support online personality change\n",
3927 mdname(mddev), mddev->pers->name);
6791875e 3928 goto out_unlock;
245f46c2
N
3929 }
3930
3931 /* Now find the new personality */
6791875e
N
3932 strncpy(clevel, buf, slen);
3933 if (clevel[slen-1] == '\n')
3934 slen--;
3935 clevel[slen] = 0;
b29bebd6 3936 if (kstrtol(clevel, 10, &level))
f2859af6 3937 level = LEVEL_NONE;
245f46c2 3938
f2859af6
DW
3939 if (request_module("md-%s", clevel) != 0)
3940 request_module("md-level-%s", clevel);
245f46c2 3941 spin_lock(&pers_lock);
f2859af6 3942 pers = find_pers(level, clevel);
245f46c2
N
3943 if (!pers || !try_module_get(pers->owner)) {
3944 spin_unlock(&pers_lock);
9d48739e 3945 pr_warn("md: personality %s not loaded\n", clevel);
6791875e
N
3946 rv = -EINVAL;
3947 goto out_unlock;
245f46c2
N
3948 }
3949 spin_unlock(&pers_lock);
3950
3951 if (pers == mddev->pers) {
3952 /* Nothing to do! */
3953 module_put(pers->owner);
6791875e
N
3954 rv = len;
3955 goto out_unlock;
245f46c2
N
3956 }
3957 if (!pers->takeover) {
3958 module_put(pers->owner);
9d48739e
N
3959 pr_warn("md: %s: %s does not support personality takeover\n",
3960 mdname(mddev), clevel);
6791875e
N
3961 rv = -EINVAL;
3962 goto out_unlock;
245f46c2
N
3963 }
3964
dafb20fa 3965 rdev_for_each(rdev, mddev)
e93f68a1
N
3966 rdev->new_raid_disk = rdev->raid_disk;
3967
245f46c2
N
3968 /* ->takeover must set new_* and/or delta_disks
3969 * if it succeeds, and may set them when it fails.
3970 */
3971 priv = pers->takeover(mddev);
3972 if (IS_ERR(priv)) {
3973 mddev->new_level = mddev->level;
3974 mddev->new_layout = mddev->layout;
664e7c41 3975 mddev->new_chunk_sectors = mddev->chunk_sectors;
245f46c2
N
3976 mddev->raid_disks -= mddev->delta_disks;
3977 mddev->delta_disks = 0;
2c810cdd 3978 mddev->reshape_backwards = 0;
245f46c2 3979 module_put(pers->owner);
9d48739e
N
3980 pr_warn("md: %s: %s would not accept array\n",
3981 mdname(mddev), clevel);
6791875e
N
3982 rv = PTR_ERR(priv);
3983 goto out_unlock;
245f46c2
N
3984 }
3985
3986 /* Looks like we have a winner */
3987 mddev_suspend(mddev);
5aa61f42 3988 mddev_detach(mddev);
36d091f4
N
3989
3990 spin_lock(&mddev->lock);
db721d32
N
3991 oldpers = mddev->pers;
3992 oldpriv = mddev->private;
3993 mddev->pers = pers;
3994 mddev->private = priv;
3995 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3996 mddev->level = mddev->new_level;
3997 mddev->layout = mddev->new_layout;
3998 mddev->chunk_sectors = mddev->new_chunk_sectors;
3999 mddev->delta_disks = 0;
4000 mddev->reshape_backwards = 0;
4001 mddev->degraded = 0;
36d091f4 4002 spin_unlock(&mddev->lock);
db721d32
N
4003
4004 if (oldpers->sync_request == NULL &&
4005 mddev->external) {
4006 /* We are converting from a no-redundancy array
4007 * to a redundancy array and metadata is managed
4008 * externally so we need to be sure that writes
4009 * won't block due to a need to transition
4010 * clean->dirty
4011 * until external management is started.
4012 */
4013 mddev->in_sync = 0;
4014 mddev->safemode_delay = 0;
4015 mddev->safemode = 0;
4016 }
f72ffdd6 4017
db721d32
N
4018 oldpers->free(mddev, oldpriv);
4019
4020 if (oldpers->sync_request == NULL &&
a64c876f
N
4021 pers->sync_request != NULL) {
4022 /* need to add the md_redundancy_group */
4023 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
9d48739e
N
4024 pr_warn("md: cannot register extra attributes for %s\n",
4025 mdname(mddev));
388975cc 4026 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
f72ffdd6 4027 }
db721d32 4028 if (oldpers->sync_request != NULL &&
a64c876f
N
4029 pers->sync_request == NULL) {
4030 /* need to remove the md_redundancy_group */
4031 if (mddev->to_remove == NULL)
4032 mddev->to_remove = &md_redundancy_group;
4033 }
4034
4cb9da7d
AO
4035 module_put(oldpers->owner);
4036
dafb20fa 4037 rdev_for_each(rdev, mddev) {
e93f68a1
N
4038 if (rdev->raid_disk < 0)
4039 continue;
bf2cb0da 4040 if (rdev->new_raid_disk >= mddev->raid_disks)
e93f68a1
N
4041 rdev->new_raid_disk = -1;
4042 if (rdev->new_raid_disk == rdev->raid_disk)
4043 continue;
36fad858 4044 sysfs_unlink_rdev(mddev, rdev);
e93f68a1 4045 }
dafb20fa 4046 rdev_for_each(rdev, mddev) {
e93f68a1
N
4047 if (rdev->raid_disk < 0)
4048 continue;
4049 if (rdev->new_raid_disk == rdev->raid_disk)
4050 continue;
4051 rdev->raid_disk = rdev->new_raid_disk;
4052 if (rdev->raid_disk < 0)
3a981b03 4053 clear_bit(In_sync, &rdev->flags);
e93f68a1 4054 else {
36fad858 4055 if (sysfs_link_rdev(mddev, rdev))
9d48739e
N
4056 pr_warn("md: cannot register rd%d for %s after level change\n",
4057 rdev->raid_disk, mdname(mddev));
3a981b03 4058 }
e93f68a1
N
4059 }
4060
db721d32 4061 if (pers->sync_request == NULL) {
9af204cf
TM
4062 /* this is now an array without redundancy, so
4063 * it must always be in_sync
4064 */
4065 mddev->in_sync = 1;
4066 del_timer_sync(&mddev->safemode_timer);
4067 }
02e5f5c0 4068 blk_set_stacking_limits(&mddev->queue->limits);
245f46c2 4069 pers->run(mddev);
2953079c 4070 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
47525e59 4071 mddev_resume(mddev);
830778a1
N
4072 if (!mddev->thread)
4073 md_update_sb(mddev, 1);
5cac7861 4074 sysfs_notify(&mddev->kobj, NULL, "level");
bb7f8d22 4075 md_new_event(mddev);
6791875e
N
4076 rv = len;
4077out_unlock:
4078 mddev_unlock(mddev);
d9d166c2
N
4079 return rv;
4080}
4081
4082static struct md_sysfs_entry md_level =
80ca3a44 4083__ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
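/* Editorial note (not part of the original source): this is the
 * attribute mdadm's "--grow --level=..." writes to.  The change
 * only succeeds when the target personality implements ->takeover
 * for the current geometry; otherwise the store fails with -EINVAL
 * as above.
 */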
eae1701f 4084
d4dbd025 4085static ssize_t
fd01b88c 4086layout_show(struct mddev *mddev, char *page)
d4dbd025
N
4087{
4088 /* just a number, not meaningful for all levels */
08a02ecd
N
4089 if (mddev->reshape_position != MaxSector &&
4090 mddev->layout != mddev->new_layout)
4091 return sprintf(page, "%d (%d)\n",
4092 mddev->new_layout, mddev->layout);
d4dbd025
N
4093 return sprintf(page, "%d\n", mddev->layout);
4094}
4095
4096static ssize_t
fd01b88c 4097layout_store(struct mddev *mddev, const char *buf, size_t len)
d4dbd025 4098{
4c9309c0 4099 unsigned int n;
6791875e 4100 int err;
d4dbd025 4101
4c9309c0
AD
4102 err = kstrtouint(buf, 10, &n);
4103 if (err < 0)
4104 return err;
6791875e
N
4105 err = mddev_lock(mddev);
4106 if (err)
4107 return err;
d4dbd025 4108
b3546035 4109 if (mddev->pers) {
50ac168a 4110 if (mddev->pers->check_reshape == NULL)
6791875e
N
4111 err = -EBUSY;
4112 else if (mddev->ro)
4113 err = -EROFS;
4114 else {
4115 mddev->new_layout = n;
4116 err = mddev->pers->check_reshape(mddev);
4117 if (err)
4118 mddev->new_layout = mddev->layout;
597a711b 4119 }
b3546035 4120 } else {
08a02ecd 4121 mddev->new_layout = n;
b3546035
N
4122 if (mddev->reshape_position == MaxSector)
4123 mddev->layout = n;
4124 }
6791875e
N
4125 mddev_unlock(mddev);
4126 return err ?: len;
d4dbd025
N
4127}
4128static struct md_sysfs_entry md_layout =
80ca3a44 4129__ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
d4dbd025 4130
eae1701f 4131static ssize_t
fd01b88c 4132raid_disks_show(struct mddev *mddev, char *page)
eae1701f 4133{
bb636547
N
4134 if (mddev->raid_disks == 0)
4135 return 0;
08a02ecd
N
4136 if (mddev->reshape_position != MaxSector &&
4137 mddev->delta_disks != 0)
4138 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
4139 mddev->raid_disks - mddev->delta_disks);
eae1701f
N
4140 return sprintf(page, "%d\n", mddev->raid_disks);
4141}
4142
fd01b88c 4143static int update_raid_disks(struct mddev *mddev, int raid_disks);
da943b99
N
4144
4145static ssize_t
fd01b88c 4146raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
da943b99 4147{
4c9309c0 4148 unsigned int n;
6791875e 4149 int err;
da943b99 4150
4c9309c0
AD
4151 err = kstrtouint(buf, 10, &n);
4152 if (err < 0)
4153 return err;
da943b99 4154
6791875e
N
4155 err = mddev_lock(mddev);
4156 if (err)
4157 return err;
da943b99 4158 if (mddev->pers)
6791875e 4159 err = update_raid_disks(mddev, n);
08a02ecd 4160 else if (mddev->reshape_position != MaxSector) {
c6563a8c 4161 struct md_rdev *rdev;
08a02ecd 4162 int olddisks = mddev->raid_disks - mddev->delta_disks;
c6563a8c 4163
6791875e 4164 err = -EINVAL;
c6563a8c
N
4165 rdev_for_each(rdev, mddev) {
4166 if (olddisks < n &&
4167 rdev->data_offset < rdev->new_data_offset)
6791875e 4168 goto out_unlock;
c6563a8c
N
4169 if (olddisks > n &&
4170 rdev->data_offset > rdev->new_data_offset)
6791875e 4171 goto out_unlock;
c6563a8c 4172 }
6791875e 4173 err = 0;
08a02ecd
N
4174 mddev->delta_disks = n - olddisks;
4175 mddev->raid_disks = n;
2c810cdd 4176 mddev->reshape_backwards = (mddev->delta_disks < 0);
08a02ecd 4177 } else
da943b99 4178 mddev->raid_disks = n;
6791875e
N
4179out_unlock:
4180 mddev_unlock(mddev);
4181 return err ? err : len;
da943b99
N
4182}
4183static struct md_sysfs_entry md_raid_disks =
80ca3a44 4184__ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
eae1701f 4185
3b34380a 4186static ssize_t
fd01b88c 4187chunk_size_show(struct mddev *mddev, char *page)
3b34380a 4188{
08a02ecd 4189 if (mddev->reshape_position != MaxSector &&
664e7c41
AN
4190 mddev->chunk_sectors != mddev->new_chunk_sectors)
4191 return sprintf(page, "%d (%d)\n",
4192 mddev->new_chunk_sectors << 9,
9d8f0363
AN
4193 mddev->chunk_sectors << 9);
4194 return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
3b34380a
N
4195}
4196
4197static ssize_t
fd01b88c 4198chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
3b34380a 4199{
4c9309c0 4200 unsigned long n;
6791875e 4201 int err;
3b34380a 4202
4c9309c0
AD
4203 err = kstrtoul(buf, 10, &n);
4204 if (err < 0)
4205 return err;
3b34380a 4206
6791875e
N
4207 err = mddev_lock(mddev);
4208 if (err)
4209 return err;
b3546035 4210 if (mddev->pers) {
50ac168a 4211 if (mddev->pers->check_reshape == NULL)
6791875e
N
4212 err = -EBUSY;
4213 else if (mddev->ro)
4214 err = -EROFS;
4215 else {
4216 mddev->new_chunk_sectors = n >> 9;
4217 err = mddev->pers->check_reshape(mddev);
4218 if (err)
4219 mddev->new_chunk_sectors = mddev->chunk_sectors;
597a711b 4220 }
b3546035 4221 } else {
664e7c41 4222 mddev->new_chunk_sectors = n >> 9;
b3546035 4223 if (mddev->reshape_position == MaxSector)
9d8f0363 4224 mddev->chunk_sectors = n >> 9;
b3546035 4225 }
6791875e
N
4226 mddev_unlock(mddev);
4227 return err ?: len;
3b34380a
N
4228}
4229static struct md_sysfs_entry md_chunk_size =
80ca3a44 4230__ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
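/* Editorial worked example (not part of the original source):
 * chunk_size is exchanged in bytes but kept in 512-byte sectors,
 * so writing "524288" stores new_chunk_sectors = 524288 >> 9 =
 * 1024 and reads back as 524288 (a 512K chunk).
 */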
3b34380a 4231
a94213b1 4232static ssize_t
fd01b88c 4233resync_start_show(struct mddev *mddev, char *page)
a94213b1 4234{
d1a7c503
N
4235 if (mddev->recovery_cp == MaxSector)
4236 return sprintf(page, "none\n");
a94213b1
N
4237 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
4238}
4239
4240static ssize_t
fd01b88c 4241resync_start_store(struct mddev *mddev, const char *buf, size_t len)
a94213b1 4242{
4c9309c0 4243 unsigned long long n;
6791875e 4244 int err;
4c9309c0
AD
4245
4246 if (cmd_match(buf, "none"))
4247 n = MaxSector;
4248 else {
4249 err = kstrtoull(buf, 10, &n);
4250 if (err < 0)
4251 return err;
4252 if (n != (sector_t)n)
4253 return -EINVAL;
4254 }
a94213b1 4255
6791875e
N
4256 err = mddev_lock(mddev);
4257 if (err)
4258 return err;
b098636c 4259 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
6791875e 4260 err = -EBUSY;
a94213b1 4261
6791875e
N
4262 if (!err) {
4263 mddev->recovery_cp = n;
4264 if (mddev->pers)
2953079c 4265 set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
6791875e
N
4266 }
4267 mddev_unlock(mddev);
4268 return err ?: len;
a94213b1
N
4269}
4270static struct md_sysfs_entry md_resync_start =
750f199e
N
4271__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
4272 resync_start_show, resync_start_store);
a94213b1 4273
9e653b63
N
4274/*
4275 * The array state can be:
4276 *
4277 * clear
4278 * No devices, no size, no level
4279 * Equivalent to STOP_ARRAY ioctl
4280 * inactive
4281 * May have some settings, but array is not active
4282 * all IO results in error
4283 * When written, doesn't tear down array, but just stops it
4284 * suspended (not supported yet)
4285 * All IO requests will block. The array can be reconfigured.
910d8cb3 4286 * Writing this, if accepted, will block until array is quiescent
9e653b63
N
4287 * readonly
4288 * no resync can happen. no superblocks get written.
4289 * write requests fail
4290 * read-auto
4291 * like readonly, but behaves like 'clean' on a write request.
4292 *
4293 * clean - no pending writes, but otherwise active.
4294 * When written to inactive array, starts without resync
4295 * If a write request arrives then
4296 * if metadata is known, mark 'dirty' and switch to 'active'.
4297 * if not known, block and switch to write-pending
4298 * If written to an active array that has pending writes, then fails.
4299 * active
4300 * fully active: IO and resync can be happening.
4301 * When written to inactive array, starts with resync
4302 *
4303 * write-pending
4304 * clean, but writes are blocked waiting for 'active' to be written.
4305 *
4306 * active-idle
4307 * like active, but no writes have been seen for a while (100msec).
4308 *
62f7b198
GP
4309 * broken
4310 * RAID0/LINEAR-only: same as clean, but array is missing a member.
4311 * It's useful because RAID0/LINEAR mounted-arrays aren't stopped
4312 * when a member is gone, so this state will at least alert the
4313 * user that something is wrong.
9e653b63
N
4314 */
4315enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
62f7b198 4316 write_pending, active_idle, broken, bad_word};
05381954 4317static char *array_states[] = {
9e653b63 4318 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
62f7b198 4319 "write-pending", "active-idle", "broken", NULL };
9e653b63
N
4320
4321static int match_word(const char *word, char **list)
4322{
4323 int n;
4324 for (n=0; list[n]; n++)
4325 if (cmd_match(word, list[n]))
4326 break;
4327 return n;
4328}
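/* Editorial worked example (not part of the original source):
 * with the array_states[] table above,
 *
 *   match_word("clean\n", array_states) == 5  (clean)
 *   match_word("bogus\n", array_states) == 10 (bad_word)
 *
 * since the loop runs off the end and stops at the terminating
 * NULL when nothing matches.
 */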
4329
4330static ssize_t
fd01b88c 4331array_state_show(struct mddev *mddev, char *page)
9e653b63
N
4332{
4333 enum array_state st = inactive;
4334
62f7b198 4335 if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
9e653b63
N
4336 switch(mddev->ro) {
4337 case 1:
4338 st = readonly;
4339 break;
4340 case 2:
4341 st = read_auto;
4342 break;
4343 case 0:
55cc39f3 4344 spin_lock(&mddev->lock);
2953079c 4345 if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
e691063a 4346 st = write_pending;
16f88949
TM
4347 else if (mddev->in_sync)
4348 st = clean;
9e653b63
N
4349 else if (mddev->safemode)
4350 st = active_idle;
4351 else
4352 st = active;
55cc39f3 4353 spin_unlock(&mddev->lock);
9e653b63 4354 }
62f7b198
GP
4355
4356 if (test_bit(MD_BROKEN, &mddev->flags) && st == clean)
4357 st = broken;
4358 } else {
9e653b63
N
4359 if (list_empty(&mddev->disks) &&
4360 mddev->raid_disks == 0 &&
58c0fed4 4361 mddev->dev_sectors == 0)
9e653b63
N
4362 st = clear;
4363 else
4364 st = inactive;
4365 }
4366 return sprintf(page, "%s\n", array_states[st]);
4367}
4368
f72ffdd6
N
4369static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
4370static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
4371static int do_md_run(struct mddev *mddev);
fd01b88c 4372static int restart_array(struct mddev *mddev);
9e653b63
N
4373
4374static ssize_t
fd01b88c 4375array_state_store(struct mddev *mddev, const char *buf, size_t len)
9e653b63 4376{
6497709b 4377 int err = 0;
9e653b63 4378 enum array_state st = match_word(buf, array_states);
6791875e
N
4379
4380 if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
4381 /* don't take reconfig_mutex when toggling between
4382 * clean and active
4383 */
4384 spin_lock(&mddev->lock);
4385 if (st == active) {
4386 restart_array(mddev);
2953079c 4387 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
91a6c4ad 4388 md_wakeup_thread(mddev->thread);
6791875e 4389 wake_up(&mddev->sb_wait);
6791875e
N
4390 } else /* st == clean */ {
4391 restart_array(mddev);
6497709b 4392 if (!set_in_sync(mddev))
6791875e
N
4393 err = -EBUSY;
4394 }
573275b5
TM
4395 if (!err)
4396 sysfs_notify_dirent_safe(mddev->sysfs_state);
6791875e 4397 spin_unlock(&mddev->lock);
c008f1d3 4398 return err ?: len;
6791875e
N
4399 }
4400 err = mddev_lock(mddev);
4401 if (err)
4402 return err;
4403 err = -EINVAL;
9e653b63
N
4404 switch(st) {
4405 case bad_word:
4406 break;
4407 case clear:
4408 /* stopping an active array */
a05b7ea0 4409 err = do_md_stop(mddev, 0, NULL);
9e653b63
N
4410 break;
4411 case inactive:
4412 /* stopping an active array */
90cf195d 4413 if (mddev->pers)
a05b7ea0 4414 err = do_md_stop(mddev, 2, NULL);
90cf195d 4415 else
e691063a 4416 err = 0; /* already inactive */
9e653b63
N
4417 break;
4418 case suspended:
4419 break; /* not supported yet */
4420 case readonly:
4421 if (mddev->pers)
a05b7ea0 4422 err = md_set_readonly(mddev, NULL);
9e653b63
N
4423 else {
4424 mddev->ro = 1;
648b629e 4425 set_disk_ro(mddev->gendisk, 1);
9e653b63
N
4426 err = do_md_run(mddev);
4427 }
4428 break;
4429 case read_auto:
9e653b63 4430 if (mddev->pers) {
80268ee9 4431 if (mddev->ro == 0)
a05b7ea0 4432 err = md_set_readonly(mddev, NULL);
80268ee9 4433 else if (mddev->ro == 1)
648b629e
N
4434 err = restart_array(mddev);
4435 if (err == 0) {
4436 mddev->ro = 2;
4437 set_disk_ro(mddev->gendisk, 0);
4438 }
9e653b63
N
4439 } else {
4440 mddev->ro = 2;
4441 err = do_md_run(mddev);
4442 }
4443 break;
4444 case clean:
4445 if (mddev->pers) {
339421de
SL
4446 err = restart_array(mddev);
4447 if (err)
4448 break;
85572d7c 4449 spin_lock(&mddev->lock);
6497709b 4450 if (!set_in_sync(mddev))
e691063a 4451 err = -EBUSY;
85572d7c 4452 spin_unlock(&mddev->lock);
5bf29597
N
4453 } else
4454 err = -EINVAL;
9e653b63
N
4455 break;
4456 case active:
4457 if (mddev->pers) {
339421de
SL
4458 err = restart_array(mddev);
4459 if (err)
4460 break;
2953079c 4461 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
9e653b63
N
4462 wake_up(&mddev->sb_wait);
4463 err = 0;
4464 } else {
4465 mddev->ro = 0;
648b629e 4466 set_disk_ro(mddev->gendisk, 0);
9e653b63
N
4467 err = do_md_run(mddev);
4468 }
4469 break;
4470 case write_pending:
4471 case active_idle:
62f7b198 4472 case broken:
9e653b63
N
4473 /* these cannot be set */
4474 break;
4475 }
6791875e
N
4476
4477 if (!err) {
1d23f178
N
4478 if (mddev->hold_active == UNTIL_IOCTL)
4479 mddev->hold_active = 0;
00bcb4ac 4480 sysfs_notify_dirent_safe(mddev->sysfs_state);
0fd62b86 4481 }
6791875e
N
4482 mddev_unlock(mddev);
4483 return err ?: len;
9e653b63 4484}
80ca3a44 4485static struct md_sysfs_entry md_array_state =
750f199e 4486__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
9e653b63 4487
1e50915f 4488static ssize_t
fd01b88c 4489max_corrected_read_errors_show(struct mddev *mddev, char *page) {
1e50915f
RB
4490 return sprintf(page, "%d\n",
4491 atomic_read(&mddev->max_corr_read_errors));
4492}
4493
4494static ssize_t
fd01b88c 4495max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
1e50915f 4496{
4c9309c0
AD
4497 unsigned int n;
4498 int rv;
1e50915f 4499
4c9309c0
AD
4500 rv = kstrtouint(buf, 10, &n);
4501 if (rv < 0)
4502 return rv;
4503 atomic_set(&mddev->max_corr_read_errors, n);
4504 return len;
1e50915f
RB
4505}
4506
4507static struct md_sysfs_entry max_corr_read_errors =
4508__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4509 max_corrected_read_errors_store);
4510
6d7ff738 4511static ssize_t
fd01b88c 4512null_show(struct mddev *mddev, char *page)
6d7ff738
N
4513{
4514 return -EINVAL;
4515}
4516
cc1ffe61
GJ
4517/* need to ensure rdev_delayed_delete() has completed */
4518static void flush_rdev_wq(struct mddev *mddev)
4519{
4520 struct md_rdev *rdev;
4521
4522 rcu_read_lock();
4523 rdev_for_each_rcu(rdev, mddev)
4524 if (work_pending(&rdev->del_work)) {
4525 flush_workqueue(md_rdev_misc_wq);
4526 break;
4527 }
4528 rcu_read_unlock();
4529}
4530
6d7ff738 4531static ssize_t
fd01b88c 4532new_dev_store(struct mddev *mddev, const char *buf, size_t len)
6d7ff738
N
4533{
4534 /* buf must be "%d:%d\n", giving major and minor numbers */
4535 /* The new device is added to the array.
4536 * If the array has a persistent superblock, we read the
4537 * superblock to initialise info and check validity.
4538 * Otherwise, the only checking done is that in bind_rdev_to_array,
4539 * which mainly checks size.
4540 */
4541 char *e;
4542 int major = simple_strtoul(buf, &e, 10);
4543 int minor;
4544 dev_t dev;
3cb03002 4545 struct md_rdev *rdev;
6d7ff738
N
4546 int err;
4547
4548 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4549 return -EINVAL;
4550 minor = simple_strtoul(e+1, &e, 10);
4551 if (*e && *e != '\n')
4552 return -EINVAL;
4553 dev = MKDEV(major, minor);
4554 if (major != MAJOR(dev) ||
4555 minor != MINOR(dev))
4556 return -EOVERFLOW;
4557
cc1ffe61 4558 flush_rdev_wq(mddev);
6791875e
N
4559 err = mddev_lock(mddev);
4560 if (err)
4561 return err;
6d7ff738
N
4562 if (mddev->persistent) {
4563 rdev = md_import_device(dev, mddev->major_version,
4564 mddev->minor_version);
4565 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
3cb03002
N
4566 struct md_rdev *rdev0
4567 = list_entry(mddev->disks.next,
4568 struct md_rdev, same_set);
6d7ff738
N
4569 err = super_types[mddev->major_version]
4570 .load_super(rdev, rdev0, mddev->minor_version);
4571 if (err < 0)
4572 goto out;
4573 }
c5d79adb
N
4574 } else if (mddev->external)
4575 rdev = md_import_device(dev, -2, -1);
4576 else
6d7ff738
N
4577 rdev = md_import_device(dev, -1, -1);
4578
9a8c0fa8
N
4579 if (IS_ERR(rdev)) {
4580 mddev_unlock(mddev);
6d7ff738 4581 return PTR_ERR(rdev);
9a8c0fa8 4582 }
6d7ff738
N
4583 err = bind_rdev_to_array(rdev, mddev);
4584 out:
4585 if (err)
4586 export_rdev(rdev);
6791875e 4587 mddev_unlock(mddev);
5492c46e
AO
4588 if (!err)
4589 md_new_event(mddev);
6d7ff738
N
4590 return err ? err : len;
4591}
4592
4593static struct md_sysfs_entry md_new_device =
80ca3a44 4594__ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
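/* Editorial sketch (not part of the original source): hot-adding a
 * member by writing "major:minor" to new_dev.  The md0 path and
 * device numbers are illustrative only.
 */
#if 0	/* illustration only */
#include <fcntl.h>
#include <unistd.h>

static int new_dev_example(void)
{
	int fd = open("/sys/block/md0/md/new_dev", O_WRONLY);
	ssize_t rv;

	if (fd < 0)
		return -1;
	rv = write(fd, "8:16\n", 5);	/* major 8, minor 16: typically /dev/sdb */
	close(fd);
	return rv < 0 ? -1 : 0;
}
#endif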
3b34380a 4595
9b1d1dac 4596static ssize_t
fd01b88c 4597bitmap_store(struct mddev *mddev, const char *buf, size_t len)
9b1d1dac
PC
4598{
4599 char *end;
4600 unsigned long chunk, end_chunk;
6791875e 4601 int err;
9b1d1dac 4602
6791875e
N
4603 err = mddev_lock(mddev);
4604 if (err)
4605 return err;
9b1d1dac
PC
4606 if (!mddev->bitmap)
4607 goto out;
4608 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4609 while (*buf) {
4610 chunk = end_chunk = simple_strtoul(buf, &end, 0);
4611 if (buf == end) break;
4612 if (*end == '-') { /* range */
4613 buf = end + 1;
4614 end_chunk = simple_strtoul(buf, &end, 0);
4615 if (buf == end) break;
4616 }
4617 if (*end && !isspace(*end)) break;
e64e4018 4618 md_bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
e7d2860b 4619 buf = skip_spaces(end);
9b1d1dac 4620 }
e64e4018 4621 md_bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
9b1d1dac 4622out:
6791875e 4623 mddev_unlock(mddev);
9b1d1dac
PC
4624 return len;
4625}
4626
4627static struct md_sysfs_entry md_bitmap =
4628__ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
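/* Editorial worked example (not part of the original source):
 * bitmap_set_bits takes space-separated chunk numbers and
 * inclusive ranges, so writing "100 200-205\n" dirties chunk 100
 * and chunks 200 through 205, forcing those regions to be
 * resynced.
 */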
4629
a35b0d69 4630static ssize_t
fd01b88c 4631size_show(struct mddev *mddev, char *page)
a35b0d69 4632{
58c0fed4
AN
4633 return sprintf(page, "%llu\n",
4634 (unsigned long long)mddev->dev_sectors / 2);
a35b0d69
N
4635}
4636
fd01b88c 4637static int update_size(struct mddev *mddev, sector_t num_sectors);
a35b0d69
N
4638
4639static ssize_t
fd01b88c 4640size_store(struct mddev *mddev, const char *buf, size_t len)
a35b0d69
N
4641{
4642 /* If array is inactive, we can reduce the component size, but
4643 * not increase it (except from 0).
4644 * If array is active, we can try an on-line resize
4645 */
b522adcd
DW
4646 sector_t sectors;
4647 int err = strict_blocks_to_sectors(buf, &sectors);
a35b0d69 4648
58c0fed4
AN
4649 if (err < 0)
4650 return err;
6791875e
N
4651 err = mddev_lock(mddev);
4652 if (err)
4653 return err;
a35b0d69 4654 if (mddev->pers) {
58c0fed4 4655 err = update_size(mddev, sectors);
4ba1e788
XN
4656 if (err == 0)
4657 md_update_sb(mddev, 1);
a35b0d69 4658 } else {
58c0fed4
AN
4659 if (mddev->dev_sectors == 0 ||
4660 mddev->dev_sectors > sectors)
4661 mddev->dev_sectors = sectors;
a35b0d69
N
4662 else
4663 err = -ENOSPC;
4664 }
6791875e 4665 mddev_unlock(mddev);
a35b0d69
N
4666 return err ? err : len;
4667}
4668
4669static struct md_sysfs_entry md_size =
80ca3a44 4670__ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
a35b0d69 4671
83f0d77a 4672/* Metadata version.
e691063a
N
4673 * This is one of
4674 * 'none' for arrays with no metadata (good luck...)
4675 * 'external' for arrays with externally managed metadata,
8bb93aac
N
4676 * or N.M for internally known formats
4677 */
4678static ssize_t
fd01b88c 4679metadata_show(struct mddev *mddev, char *page)
8bb93aac
N
4680{
4681 if (mddev->persistent)
4682 return sprintf(page, "%d.%d\n",
4683 mddev->major_version, mddev->minor_version);
e691063a
N
4684 else if (mddev->external)
4685 return sprintf(page, "external:%s\n", mddev->metadata_type);
8bb93aac
N
4686 else
4687 return sprintf(page, "none\n");
4688}
4689
4690static ssize_t
fd01b88c 4691metadata_store(struct mddev *mddev, const char *buf, size_t len)
8bb93aac
N
4692{
4693 int major, minor;
4694 char *e;
6791875e 4695 int err;
ea43ddd8
N
4696 /* Changing the details of 'external' metadata is
4697 * always permitted. Otherwise there must be
4698 * no devices attached to the array.
4699 */
6791875e
N
4700
4701 err = mddev_lock(mddev);
4702 if (err)
4703 return err;
4704 err = -EBUSY;
ea43ddd8
N
4705 if (mddev->external && strncmp(buf, "external:", 9) == 0)
4706 ;
4707 else if (!list_empty(&mddev->disks))
6791875e 4708 goto out_unlock;
8bb93aac 4709
6791875e 4710 err = 0;
8bb93aac
N
4711 if (cmd_match(buf, "none")) {
4712 mddev->persistent = 0;
e691063a
N
4713 mddev->external = 0;
4714 mddev->major_version = 0;
4715 mddev->minor_version = 90;
6791875e 4716 goto out_unlock;
e691063a
N
4717 }
4718 if (strncmp(buf, "external:", 9) == 0) {
20a49ff6 4719 size_t namelen = len-9;
e691063a
N
4720 if (namelen >= sizeof(mddev->metadata_type))
4721 namelen = sizeof(mddev->metadata_type)-1;
4722 strncpy(mddev->metadata_type, buf+9, namelen);
4723 mddev->metadata_type[namelen] = 0;
4724 if (namelen && mddev->metadata_type[namelen-1] == '\n')
4725 mddev->metadata_type[--namelen] = 0;
4726 mddev->persistent = 0;
4727 mddev->external = 1;
8bb93aac
N
4728 mddev->major_version = 0;
4729 mddev->minor_version = 90;
6791875e 4730 goto out_unlock;
8bb93aac
N
4731 }
4732 major = simple_strtoul(buf, &e, 10);
6791875e 4733 err = -EINVAL;
8bb93aac 4734 if (e==buf || *e != '.')
6791875e 4735 goto out_unlock;
8bb93aac
N
4736 buf = e+1;
4737 minor = simple_strtoul(buf, &e, 10);
3f9d7b0d 4738 if (e==buf || (*e && *e != '\n') )
6791875e
N
4739 goto out_unlock;
4740 err = -ENOENT;
50511da3 4741 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
6791875e 4742 goto out_unlock;
8bb93aac
N
4743 mddev->major_version = major;
4744 mddev->minor_version = minor;
4745 mddev->persistent = 1;
e691063a 4746 mddev->external = 0;
6791875e
N
4747 err = 0;
4748out_unlock:
4749 mddev_unlock(mddev);
4750 return err ?: len;
8bb93aac
N
4751}
4752
4753static struct md_sysfs_entry md_metadata =
750f199e 4754__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
8bb93aac 4755
24dd469d 4756static ssize_t
fd01b88c 4757action_show(struct mddev *mddev, char *page)
24dd469d 4758{
7eec314d 4759 char *type = "idle";
b7b17c9b
N
4760 unsigned long recovery = mddev->recovery;
4761 if (test_bit(MD_RECOVERY_FROZEN, &recovery))
b6a9ce68 4762 type = "frozen";
b7b17c9b
N
4763 else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
4764 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
4765 if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
ccfcc3c1 4766 type = "reshape";
b7b17c9b
N
4767 else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
4768 if (!test_bit(MD_RECOVERY_REQUESTED, &recovery))
24dd469d 4769 type = "resync";
b7b17c9b 4770 else if (test_bit(MD_RECOVERY_CHECK, &recovery))
24dd469d
N
4771 type = "check";
4772 else
4773 type = "repair";
b7b17c9b 4774 } else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
24dd469d 4775 type = "recover";
985ca973
N
4776 else if (mddev->reshape_position != MaxSector)
4777 type = "reshape";
24dd469d
N
4778 }
4779 return sprintf(page, "%s\n", type);
4780}
4781
4782static ssize_t
fd01b88c 4783action_store(struct mddev *mddev, const char *page, size_t len)
24dd469d 4784{
7eec314d
N
4785 if (!mddev->pers || !mddev->pers->sync_request)
4786 return -EINVAL;
4787
b6a9ce68
N
4788
4789 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
56ccc112
N
4790 if (cmd_match(page, "frozen"))
4791 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4792 else
4793 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
8e8e2518
N
4794 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
4795 mddev_lock(mddev) == 0) {
cc1ffe61
GJ
4796 if (work_pending(&mddev->del_work))
4797 flush_workqueue(md_misc_wq);
8e8e2518
N
4798 if (mddev->sync_thread) {
4799 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6791875e 4800 md_reap_sync_thread(mddev);
6791875e 4801 }
8e8e2518 4802 mddev_unlock(mddev);
7eec314d 4803 }
312045ee 4804 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
24dd469d 4805 return -EBUSY;
72a23c21 4806 else if (cmd_match(page, "resync"))
56ccc112 4807 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
72a23c21 4808 else if (cmd_match(page, "recover")) {
56ccc112 4809 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
72a23c21 4810 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
72a23c21 4811 } else if (cmd_match(page, "reshape")) {
16484bf5
N
4812 int err;
4813 if (mddev->pers->start_reshape == NULL)
4814 return -EINVAL;
6791875e
N
4815 err = mddev_lock(mddev);
4816 if (!err) {
312045ee
N
4817 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4818 err = -EBUSY;
4819 else {
4820 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4821 err = mddev->pers->start_reshape(mddev);
4822 }
6791875e
N
4823 mddev_unlock(mddev);
4824 }
16484bf5
N
4825 if (err)
4826 return err;
a99ac971 4827 sysfs_notify(&mddev->kobj, NULL, "degraded");
16484bf5 4828 } else {
bce74dac 4829 if (cmd_match(page, "check"))
7eec314d 4830 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
2adc7d47 4831 else if (!cmd_match(page, "repair"))
7eec314d 4832 return -EINVAL;
56ccc112 4833 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
7eec314d
N
4834 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4835 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7eec314d 4836 }
48c26ddc
N
4837 if (mddev->ro == 2) {
4838 /* A write to sync_action is enough to justify
4839 * canceling read-auto mode
4840 */
4841 mddev->ro = 0;
4842 md_wakeup_thread(mddev->sync_thread);
4843 }
03c902e1 4844 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
24dd469d 4845 md_wakeup_thread(mddev->thread);
00bcb4ac 4846 sysfs_notify_dirent_safe(mddev->sysfs_action);
24dd469d
N
4847 return len;
4848}
4849
c4a39551 4850static struct md_sysfs_entry md_scan_mode =
750f199e 4851__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
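/* Editorial note (not part of the original source): typical writes
 * to sync_action are "check" (read-only scrub), "repair" (scrub
 * that also rewrites mismatches), "idle" (interrupt the running
 * action) and "frozen" (block new ones), matching the branches in
 * action_store() above.
 */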
c4a39551
JB
4852
4853static ssize_t
4854last_sync_action_show(struct mddev *mddev, char *page)
4855{
4856 return sprintf(page, "%s\n", mddev->last_sync_action);
4857}
4858
4859static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
4860
9d88883e 4861static ssize_t
fd01b88c 4862mismatch_cnt_show(struct mddev *mddev, char *page)
9d88883e
N
4863{
4864 return sprintf(page, "%llu\n",
7f7583d4
JM
4865 (unsigned long long)
4866 atomic64_read(&mddev->resync_mismatches));
9d88883e
N
4867}
4868
80ca3a44 4869static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
9d88883e 4870
88202a0c 4871static ssize_t
fd01b88c 4872sync_min_show(struct mddev *mddev, char *page)
88202a0c
N
4873{
4874 return sprintf(page, "%d (%s)\n", speed_min(mddev),
4875 mddev->sync_speed_min ? "local": "system");
4876}
4877
4878static ssize_t
fd01b88c 4879sync_min_store(struct mddev *mddev, const char *buf, size_t len)
88202a0c 4880{
4c9309c0
AD
4881 unsigned int min;
4882 int rv;
4883
88202a0c 4884 if (strncmp(buf, "system", 6)==0) {
4c9309c0
AD
4885 min = 0;
4886 } else {
4887 rv = kstrtouint(buf, 10, &min);
4888 if (rv < 0)
4889 return rv;
4890 if (min == 0)
4891 return -EINVAL;
88202a0c 4892 }
88202a0c
N
4893 mddev->sync_speed_min = min;
4894 return len;
4895}
4896
4897static struct md_sysfs_entry md_sync_min =
4898__ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4899
4900static ssize_t
fd01b88c 4901sync_max_show(struct mddev *mddev, char *page)
88202a0c
N
4902{
4903 return sprintf(page, "%d (%s)\n", speed_max(mddev),
4904 mddev->sync_speed_max ? "local": "system");
4905}
4906
4907static ssize_t
fd01b88c 4908sync_max_store(struct mddev *mddev, const char *buf, size_t len)
88202a0c 4909{
4c9309c0
AD
4910 unsigned int max;
4911 int rv;
4912
88202a0c 4913 if (strncmp(buf, "system", 6)==0) {
4c9309c0
AD
4914 max = 0;
4915 } else {
4916 rv = kstrtouint(buf, 10, &max);
4917 if (rv < 0)
4918 return rv;
4919 if (max == 0)
4920 return -EINVAL;
88202a0c 4921 }
88202a0c
N
4922 mddev->sync_speed_max = max;
4923 return len;
4924}
4925
4926static struct md_sysfs_entry md_sync_max =
4927__ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4928
d7f3d291 4929static ssize_t
fd01b88c 4930degraded_show(struct mddev *mddev, char *page)
d7f3d291
IP
4931{
4932 return sprintf(page, "%d\n", mddev->degraded);
4933}
4934static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
88202a0c 4935
90b08710 4936static ssize_t
fd01b88c 4937sync_force_parallel_show(struct mddev *mddev, char *page)
90b08710
BS
4938{
4939 return sprintf(page, "%d\n", mddev->parallel_resync);
4940}
4941
4942static ssize_t
fd01b88c 4943sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
90b08710
BS
4944{
4945 long n;
4946
b29bebd6 4947 if (kstrtol(buf, 10, &n))
90b08710
BS
4948 return -EINVAL;
4949
4950 if (n != 0 && n != 1)
4951 return -EINVAL;
4952
4953 mddev->parallel_resync = n;
4954
4955 if (mddev->sync_thread)
4956 wake_up(&resync_wait);
4957
4958 return len;
4959}
4960
4961/* force parallel resync, even with shared block devices */
4962static struct md_sysfs_entry md_sync_force_parallel =
4963__ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4964 sync_force_parallel_show, sync_force_parallel_store);
4965
88202a0c 4966static ssize_t
fd01b88c 4967sync_speed_show(struct mddev *mddev, char *page)
88202a0c
N
4968{
4969 unsigned long resync, dt, db;
d1a7c503
N
4970 if (mddev->curr_resync == 0)
4971 return sprintf(page, "none\n");
9687a60c
AN
4972 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4973 dt = (jiffies - mddev->resync_mark) / HZ;
88202a0c 4974 if (!dt) dt++;
9687a60c
AN
4975 db = resync - mddev->resync_mark_cnt;
4976 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
88202a0c
N
4977}
4978
80ca3a44 4979static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
88202a0c
N
4980
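/*
 * Editorial note on the arithmetic above: db is the number of sectors
 * completed since the last rate mark (in-flight sectors counted in
 * recovery_active are excluded), dt is the elapsed seconds since that
 * mark, and the final /2 converts 512-byte sectors into the KiB/s
 * figure that sync_speed reports.
 */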
4981static ssize_t
fd01b88c 4982sync_completed_show(struct mddev *mddev, char *page)
88202a0c 4983{
13ae864b 4984 unsigned long long max_sectors, resync;
88202a0c 4985
acb180b0
N
4986 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4987 return sprintf(page, "none\n");
4988
72f36d59
N
4989 if (mddev->curr_resync == 1 ||
4990 mddev->curr_resync == 2)
4991 return sprintf(page, "delayed\n");
4992
c804cdec
N
4993 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4994 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
58c0fed4 4995 max_sectors = mddev->resync_max_sectors;
88202a0c 4996 else
58c0fed4 4997 max_sectors = mddev->dev_sectors;
88202a0c 4998
acb180b0 4999 resync = mddev->curr_resync_completed;
13ae864b 5000 return sprintf(page, "%llu / %llu\n", resync, max_sectors);
88202a0c
N
5001}
5002
750f199e
N
5003static struct md_sysfs_entry md_sync_completed =
5004 __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL);
88202a0c 5005
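/*
 * Example output (illustrative): sync_completed reads as
 * "<done> / <total>" in 512-byte sectors, e.g. "1048576 / 41943040";
 * it reads "none" when no recovery is running and "delayed" while this
 * array waits for another resync to finish.
 */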
5e96ee65 5006static ssize_t
fd01b88c 5007min_sync_show(struct mddev *mddev, char *page)
5e96ee65
NB
5008{
5009 return sprintf(page, "%llu\n",
5010 (unsigned long long)mddev->resync_min);
5011}
5012static ssize_t
fd01b88c 5013min_sync_store(struct mddev *mddev, const char *buf, size_t len)
5e96ee65
NB
5014{
5015 unsigned long long min;
23da422b 5016 int err;
23da422b 5017
b29bebd6 5018 if (kstrtoull(buf, 10, &min))
5e96ee65 5019 return -EINVAL;
23da422b
N
5020
5021 spin_lock(&mddev->lock);
5022 err = -EINVAL;
5e96ee65 5023 if (min > mddev->resync_max)
23da422b
N
5024 goto out_unlock;
5025
5026 err = -EBUSY;
5e96ee65 5027 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
23da422b 5028 goto out_unlock;
5e96ee65 5029
50c37b13
N
5030 /* Round down to multiple of 4K for safety */
5031 mddev->resync_min = round_down(min, 8);
23da422b 5032 err = 0;
5e96ee65 5033
23da422b
N
5034out_unlock:
5035 spin_unlock(&mddev->lock);
5036 return err ?: len;
5e96ee65
NB
5037}
5038
5039static struct md_sysfs_entry md_min_sync =
5040__ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
5041
c6207277 5042static ssize_t
fd01b88c 5043max_sync_show(struct mddev *mddev, char *page)
c6207277
N
5044{
5045 if (mddev->resync_max == MaxSector)
5046 return sprintf(page, "max\n");
5047 else
5048 return sprintf(page, "%llu\n",
5049 (unsigned long long)mddev->resync_max);
5050}
5051static ssize_t
fd01b88c 5052max_sync_store(struct mddev *mddev, const char *buf, size_t len)
c6207277 5053{
23da422b
N
5054 int err;
5055 spin_lock(&mddev->lock);
c6207277
N
5056 if (strncmp(buf, "max", 3) == 0)
5057 mddev->resync_max = MaxSector;
5058 else {
5e96ee65 5059 unsigned long long max;
23da422b
N
5060 int chunk;
5061
5062 err = -EINVAL;
b29bebd6 5063 if (kstrtoull(buf, 10, &max))
23da422b 5064 goto out_unlock;
5e96ee65 5065 if (max < mddev->resync_min)
23da422b
N
5066 goto out_unlock;
5067
5068 err = -EBUSY;
c6207277 5069 if (max < mddev->resync_max &&
4d484a4a 5070 mddev->ro == 0 &&
c6207277 5071 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
23da422b 5072 goto out_unlock;
c6207277
N
5073
5074 /* Must be a multiple of chunk_size */
23da422b
N
5075 chunk = mddev->chunk_sectors;
5076 if (chunk) {
2ac06c33 5077 sector_t temp = max;
23da422b
N
5078
5079 err = -EINVAL;
5080 if (sector_div(temp, chunk))
5081 goto out_unlock;
c6207277
N
5082 }
5083 mddev->resync_max = max;
5084 }
5085 wake_up(&mddev->recovery_wait);
23da422b
N
5086 err = 0;
5087out_unlock:
5088 spin_unlock(&mddev->lock);
5089 return err ?: len;
c6207277
N
5090}
5091
5092static struct md_sysfs_entry md_max_sync =
5093__ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
5094
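/*
 * Illustrative only: sync_min/sync_max clamp the sector range that the
 * next check or repair pass will cover, which is how a scrub of just a
 * slice of a large array is done, e.g.
 *
 *	echo 0         > /sys/block/md0/md/sync_min
 *	echo 104857600 > /sys/block/md0/md/sync_max	# first ~50 GiB
 *
 * As enforced above, sync_min is rounded down to a 4K multiple, and
 * sync_max must be "max" or a multiple of the chunk size.
 */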
e464eafd 5095static ssize_t
fd01b88c 5096suspend_lo_show(struct mddev *mddev, char *page)
e464eafd
N
5097{
5098 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
5099}
5100
5101static ssize_t
fd01b88c 5102suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
e464eafd 5103{
b03e0ccb 5104 unsigned long long new;
6791875e 5105 int err;
e464eafd 5106
4c9309c0
AD
5107 err = kstrtoull(buf, 10, &new);
5108 if (err < 0)
5109 return err;
5110 if (new != (sector_t)new)
e464eafd 5111 return -EINVAL;
23ddff37 5112
6791875e
N
5113 err = mddev_lock(mddev);
5114 if (err)
5115 return err;
5116 err = -EINVAL;
5117 if (mddev->pers == NULL ||
5118 mddev->pers->quiesce == NULL)
5119 goto unlock;
b03e0ccb 5120 mddev_suspend(mddev);
23ddff37 5121 mddev->suspend_lo = new;
b03e0ccb
N
5122 mddev_resume(mddev);
5123
6791875e
N
5124 err = 0;
5125unlock:
5126 mddev_unlock(mddev);
5127 return err ?: len;
e464eafd
N
5128}
5129static struct md_sysfs_entry md_suspend_lo =
5130__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
5131
e464eafd 5132static ssize_t
fd01b88c 5133suspend_hi_show(struct mddev *mddev, char *page)
e464eafd
N
5134{
5135 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
5136}
5137
5138static ssize_t
fd01b88c 5139suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
e464eafd 5140{
b03e0ccb 5141 unsigned long long new;
6791875e 5142 int err;
e464eafd 5143
4c9309c0
AD
5144 err = kstrtoull(buf, 10, &new);
5145 if (err < 0)
5146 return err;
5147 if (new != (sector_t)new)
e464eafd 5148 return -EINVAL;
23ddff37 5149
6791875e
N
5150 err = mddev_lock(mddev);
5151 if (err)
5152 return err;
5153 err = -EINVAL;
b03e0ccb 5154 if (mddev->pers == NULL)
6791875e 5155 goto unlock;
b03e0ccb
N
5156
5157 mddev_suspend(mddev);
23ddff37 5158 mddev->suspend_hi = new;
b03e0ccb
N
5159 mddev_resume(mddev);
5160
6791875e
N
5161 err = 0;
5162unlock:
5163 mddev_unlock(mddev);
5164 return err ?: len;
e464eafd
N
5165}
5166static struct md_sysfs_entry md_suspend_hi =
5167__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
5168
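/*
 * Illustrative only: suspend_lo/suspend_hi describe a suspended sector
 * window, used e.g. by userspace-driven reshape with external
 * metadata:
 *
 *	echo 0        > /sys/block/md0/md/suspend_lo
 *	echo 16777216 > /sys/block/md0/md/suspend_hi
 *
 * Both store handlers above bracket the update with
 * mddev_suspend()/mddev_resume(), so the new window takes effect
 * atomically with respect to in-flight IO.
 */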
08a02ecd 5169static ssize_t
fd01b88c 5170reshape_position_show(struct mddev *mddev, char *page)
08a02ecd
N
5171{
5172 if (mddev->reshape_position != MaxSector)
5173 return sprintf(page, "%llu\n",
5174 (unsigned long long)mddev->reshape_position);
5175 strcpy(page, "none\n");
5176 return 5;
5177}
5178
5179static ssize_t
fd01b88c 5180reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
08a02ecd 5181{
c6563a8c 5182 struct md_rdev *rdev;
4c9309c0 5183 unsigned long long new;
6791875e 5184 int err;
6791875e 5185
4c9309c0
AD
5186 err = kstrtoull(buf, 10, &new);
5187 if (err < 0)
5188 return err;
5189 if (new != (sector_t)new)
08a02ecd 5190 return -EINVAL;
6791875e
N
5191 err = mddev_lock(mddev);
5192 if (err)
5193 return err;
5194 err = -EBUSY;
5195 if (mddev->pers)
5196 goto unlock;
08a02ecd
N
5197 mddev->reshape_position = new;
5198 mddev->delta_disks = 0;
2c810cdd 5199 mddev->reshape_backwards = 0;
08a02ecd
N
5200 mddev->new_level = mddev->level;
5201 mddev->new_layout = mddev->layout;
664e7c41 5202 mddev->new_chunk_sectors = mddev->chunk_sectors;
c6563a8c
N
5203 rdev_for_each(rdev, mddev)
5204 rdev->new_data_offset = rdev->data_offset;
6791875e
N
5205 err = 0;
5206unlock:
5207 mddev_unlock(mddev);
5208 return err ?: len;
08a02ecd
N
5209}
5210
5211static struct md_sysfs_entry md_reshape_position =
5212__ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
5213 reshape_position_store);
5214
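/*
 * Illustrative only: reshape_position can only be seeded while the
 * array is inactive (the handler returns -EBUSY once a personality is
 * attached), e.g.
 *
 *	echo 0 > /sys/block/md0/md/reshape_position
 *
 * Writing it also rebaselines delta_disks, layout, chunk size and each
 * rdev's new_data_offset, so the coming reshape starts from a clean
 * state.
 */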
2c810cdd
N
5215static ssize_t
5216reshape_direction_show(struct mddev *mddev, char *page)
5217{
5218 return sprintf(page, "%s\n",
5219 mddev->reshape_backwards ? "backwards" : "forwards");
5220}
5221
5222static ssize_t
5223reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
5224{
5225 int backwards = 0;
6791875e
N
5226 int err;
5227
2c810cdd
N
5228 if (cmd_match(buf, "forwards"))
5229 backwards = 0;
5230 else if (cmd_match(buf, "backwards"))
5231 backwards = 1;
5232 else
5233 return -EINVAL;
5234 if (mddev->reshape_backwards == backwards)
5235 return len;
5236
6791875e
N
5237 err = mddev_lock(mddev);
5238 if (err)
5239 return err;
2c810cdd
N
5240 /* check if we are allowed to change */
5241 if (mddev->delta_disks)
6791875e
N
5242 err = -EBUSY;
5243 else if (mddev->persistent &&
2c810cdd 5244 mddev->major_version == 0)
6791875e
N
5245 err = -EINVAL;
5246 else
5247 mddev->reshape_backwards = backwards;
5248 mddev_unlock(mddev);
5249 return err ?: len;
2c810cdd
N
5250}
5251
5252static struct md_sysfs_entry md_reshape_direction =
5253__ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
5254 reshape_direction_store);
5255
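/*
 * Illustrative only: a reshape that reduces the device count typically
 * runs "backwards", repacking data from the end of the array, e.g.
 *
 *	echo backwards > /sys/block/md0/md/reshape_direction
 *
 * As checked above, the direction cannot change once delta_disks is
 * set, and 0.90 metadata cannot record it at all.
 */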
b522adcd 5256static ssize_t
fd01b88c 5257array_size_show(struct mddev *mddev, char *page)
b522adcd
DW
5258{
5259 if (mddev->external_size)
5260 return sprintf(page, "%llu\n",
5261 (unsigned long long)mddev->array_sectors/2);
5262 else
5263 return sprintf(page, "default\n");
5264}
5265
5266static ssize_t
fd01b88c 5267array_size_store(struct mddev *mddev, const char *buf, size_t len)
b522adcd
DW
5268{
5269 sector_t sectors;
6791875e
N
5270 int err;
5271
5272 err = mddev_lock(mddev);
5273 if (err)
5274 return err;
b522adcd 5275
ab5a98b1 5276 /* cluster raid doesn't support changing array_sectors */
b670883b
ZL
5277 if (mddev_is_clustered(mddev)) {
5278 mddev_unlock(mddev);
ab5a98b1 5279 return -EINVAL;
b670883b 5280 }
ab5a98b1 5281
b522adcd
DW
5282 if (strncmp(buf, "default", 7) == 0) {
5283 if (mddev->pers)
5284 sectors = mddev->pers->size(mddev, 0, 0);
5285 else
5286 sectors = mddev->array_sectors;
5287
5288 mddev->external_size = 0;
5289 } else {
5290 if (strict_blocks_to_sectors(buf, &sectors) < 0)
6791875e
N
5291 err = -EINVAL;
5292 else if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
5293 err = -E2BIG;
5294 else
5295 mddev->external_size = 1;
b522adcd
DW
5296 }
5297
6791875e
N
5298 if (!err) {
5299 mddev->array_sectors = sectors;
5300 if (mddev->pers) {
5301 set_capacity(mddev->gendisk, mddev->array_sectors);
5302 revalidate_disk(mddev->gendisk);
5303 }
cbe6ef1d 5304 }
6791875e
N
5305 mddev_unlock(mddev);
5306 return err ?: len;
b522adcd
DW
5307}
5308
5309static struct md_sysfs_entry md_array_size =
5310__ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
5311 array_size_store);
e464eafd 5312
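/*
 * Illustrative only: array_size caps the visible size below what the
 * personality computes; values are parsed and reported in KiB, e.g.
 *
 *	echo 1048576 > /sys/block/md0/md/array_size	# clamp to 1 GiB
 *	echo default > /sys/block/md0/md/array_size	# personality's size
 */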
664aed04
AP
5313static ssize_t
5314consistency_policy_show(struct mddev *mddev, char *page)
5315{
5316 int ret;
5317
5318 if (test_bit(MD_HAS_JOURNAL, &mddev->flags)) {
5319 ret = sprintf(page, "journal\n");
5320 } else if (test_bit(MD_HAS_PPL, &mddev->flags)) {
5321 ret = sprintf(page, "ppl\n");
5322 } else if (mddev->bitmap) {
5323 ret = sprintf(page, "bitmap\n");
5324 } else if (mddev->pers) {
5325 if (mddev->pers->sync_request)
5326 ret = sprintf(page, "resync\n");
5327 else
5328 ret = sprintf(page, "none\n");
5329 } else {
5330 ret = sprintf(page, "unknown\n");
5331 }
5332
5333 return ret;
5334}
5335
5336static ssize_t
5337consistency_policy_store(struct mddev *mddev, const char *buf, size_t len)
5338{
ba903a3e
AP
5339 int err = 0;
5340
664aed04 5341 if (mddev->pers) {
ba903a3e
AP
5342 if (mddev->pers->change_consistency_policy)
5343 err = mddev->pers->change_consistency_policy(mddev, buf);
5344 else
5345 err = -EBUSY;
664aed04
AP
5346 } else if (mddev->external && strncmp(buf, "ppl", 3) == 0) {
5347 set_bit(MD_HAS_PPL, &mddev->flags);
664aed04 5348 } else {
ba903a3e 5349 err = -EINVAL;
664aed04 5350 }
ba903a3e
AP
5351
5352 return err ? err : len;
664aed04
AP
5353}
5354
5355static struct md_sysfs_entry md_consistency_policy =
5356__ATTR(consistency_policy, S_IRUGO | S_IWUSR, consistency_policy_show,
5357 consistency_policy_store);
5358
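/*
 * Illustrative only: on an inactive array with external metadata, PPL
 * can be requested up front, while on a running array the request is
 * forwarded to the personality's change_consistency_policy hook, e.g.
 *
 *	echo ppl > /sys/block/md0/md/consistency_policy
 */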
9a567843
GJ
5359static ssize_t fail_last_dev_show(struct mddev *mddev, char *page)
5360{
5361 return sprintf(page, "%d\n", mddev->fail_last_dev);
5362}
5363
5364/*
 5365 * Setting fail_last_dev to true allows the last device to be forcibly
 5366 * removed from RAID1/RAID10.
5367 */
5368static ssize_t
5369fail_last_dev_store(struct mddev *mddev, const char *buf, size_t len)
5370{
5371 int ret;
5372 bool value;
5373
5374 ret = kstrtobool(buf, &value);
5375 if (ret)
5376 return ret;
5377
5378 if (value != mddev->fail_last_dev)
5379 mddev->fail_last_dev = value;
5380
5381 return len;
5382}
5383static struct md_sysfs_entry md_fail_last_dev =
5384__ATTR(fail_last_dev, S_IRUGO | S_IWUSR, fail_last_dev_show,
5385 fail_last_dev_store);
5386
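/*
 * Illustrative only:
 *
 *	echo 1 > /sys/block/md0/md/fail_last_dev
 *
 * allows the last working member of a RAID1/RAID10 to be marked Faulty
 * instead of being pinned in the array; useful where a stale last leg
 * is worse than a failed array.
 */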
3938f5fb
GJ
5387static ssize_t serialize_policy_show(struct mddev *mddev, char *page)
5388{
5389 if (mddev->pers == NULL || (mddev->pers->level != 1))
5390 return sprintf(page, "n/a\n");
5391 else
5392 return sprintf(page, "%d\n", mddev->serialize_policy);
5393}
5394
5395/*
 5396 * Setting serialize_policy to true enforces that write IO is not
 5397 * reordered for raid1.
5398 */
5399static ssize_t
5400serialize_policy_store(struct mddev *mddev, const char *buf, size_t len)
5401{
5402 int err;
5403 bool value;
5404
5405 err = kstrtobool(buf, &value);
5406 if (err)
5407 return err;
5408
5409 if (value == mddev->serialize_policy)
5410 return len;
5411
5412 err = mddev_lock(mddev);
5413 if (err)
5414 return err;
5415 if (mddev->pers == NULL || (mddev->pers->level != 1)) {
5416 pr_err("md: serialize_policy is only effective for raid1\n");
5417 err = -EINVAL;
5418 goto unlock;
5419 }
5420
5421 mddev_suspend(mddev);
5422 if (value)
5423 mddev_create_serial_pool(mddev, NULL, true);
5424 else
5425 mddev_destroy_serial_pool(mddev, NULL, true);
5426 mddev->serialize_policy = value;
5427 mddev_resume(mddev);
5428unlock:
5429 mddev_unlock(mddev);
5430 return err ?: len;
5431}
5432
5433static struct md_sysfs_entry md_serialize_policy =
5434__ATTR(serialize_policy, S_IRUGO | S_IWUSR, serialize_policy_show,
5435 serialize_policy_store);
5436
5437
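/*
 * Illustrative only:
 *
 *	echo 1 > /sys/block/md0/md/serialize_policy
 *
 * makes raid1 funnel write IO through the serial_info pool created
 * above so that overlapping writes cannot be reordered, at some cost
 * in throughput.
 */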
eae1701f
N
5438static struct attribute *md_default_attrs[] = {
5439 &md_level.attr,
d4dbd025 5440 &md_layout.attr,
eae1701f 5441 &md_raid_disks.attr,
3b34380a 5442 &md_chunk_size.attr,
a35b0d69 5443 &md_size.attr,
a94213b1 5444 &md_resync_start.attr,
8bb93aac 5445 &md_metadata.attr,
6d7ff738 5446 &md_new_device.attr,
16f17b39 5447 &md_safe_delay.attr,
9e653b63 5448 &md_array_state.attr,
08a02ecd 5449 &md_reshape_position.attr,
2c810cdd 5450 &md_reshape_direction.attr,
b522adcd 5451 &md_array_size.attr,
1e50915f 5452 &max_corr_read_errors.attr,
664aed04 5453 &md_consistency_policy.attr,
9a567843 5454 &md_fail_last_dev.attr,
3938f5fb 5455 &md_serialize_policy.attr,
411036fa
N
5456 NULL,
5457};
5458
5459static struct attribute *md_redundancy_attrs[] = {
24dd469d 5460 &md_scan_mode.attr,
c4a39551 5461 &md_last_scan_mode.attr,
9d88883e 5462 &md_mismatches.attr,
88202a0c
N
5463 &md_sync_min.attr,
5464 &md_sync_max.attr,
5465 &md_sync_speed.attr,
90b08710 5466 &md_sync_force_parallel.attr,
88202a0c 5467 &md_sync_completed.attr,
5e96ee65 5468 &md_min_sync.attr,
c6207277 5469 &md_max_sync.attr,
e464eafd
N
5470 &md_suspend_lo.attr,
5471 &md_suspend_hi.attr,
9b1d1dac 5472 &md_bitmap.attr,
d7f3d291 5473 &md_degraded.attr,
eae1701f
N
5474 NULL,
5475};
411036fa
N
5476static struct attribute_group md_redundancy_group = {
5477 .name = NULL,
5478 .attrs = md_redundancy_attrs,
5479};
5480
eae1701f
N
5481static ssize_t
5482md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
5483{
5484 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
fd01b88c 5485 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
96de1e66 5486 ssize_t rv;
eae1701f
N
5487
5488 if (!entry->show)
5489 return -EIO;
af8a2434
N
5490 spin_lock(&all_mddevs_lock);
5491 if (list_empty(&mddev->all_mddevs)) {
5492 spin_unlock(&all_mddevs_lock);
5493 return -EBUSY;
5494 }
5495 mddev_get(mddev);
5496 spin_unlock(&all_mddevs_lock);
5497
b7b17c9b 5498 rv = entry->show(mddev, page);
af8a2434 5499 mddev_put(mddev);
96de1e66 5500 return rv;
eae1701f
N
5501}
5502
5503static ssize_t
5504md_attr_store(struct kobject *kobj, struct attribute *attr,
5505 const char *page, size_t length)
5506{
5507 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
fd01b88c 5508 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
96de1e66 5509 ssize_t rv;
eae1701f
N
5510
5511 if (!entry->store)
5512 return -EIO;
67463acb
N
5513 if (!capable(CAP_SYS_ADMIN))
5514 return -EACCES;
af8a2434
N
5515 spin_lock(&all_mddevs_lock);
5516 if (list_empty(&mddev->all_mddevs)) {
5517 spin_unlock(&all_mddevs_lock);
5518 return -EBUSY;
5519 }
5520 mddev_get(mddev);
5521 spin_unlock(&all_mddevs_lock);
6791875e 5522 rv = entry->store(mddev, page, length);
af8a2434 5523 mddev_put(mddev);
96de1e66 5524 return rv;
eae1701f
N
5525}
5526
5527static void md_free(struct kobject *ko)
5528{
fd01b88c 5529 struct mddev *mddev = container_of(ko, struct mddev, kobj);
a21d1504
N
5530
5531 if (mddev->sysfs_state)
5532 sysfs_put(mddev->sysfs_state);
5533
d8115c35
BVA
5534 if (mddev->gendisk)
5535 del_gendisk(mddev->gendisk);
6cd18e71
N
5536 if (mddev->queue)
5537 blk_cleanup_queue(mddev->queue);
d8115c35 5538 if (mddev->gendisk)
a21d1504 5539 put_disk(mddev->gendisk);
4ad23a97 5540 percpu_ref_exit(&mddev->writes_pending);
a21d1504 5541
28dec870
KO
5542 bioset_exit(&mddev->bio_set);
5543 bioset_exit(&mddev->sync_set);
eae1701f
N
5544 kfree(mddev);
5545}
5546
52cf25d0 5547static const struct sysfs_ops md_sysfs_ops = {
eae1701f
N
5548 .show = md_attr_show,
5549 .store = md_attr_store,
5550};
5551static struct kobj_type md_ktype = {
5552 .release = md_free,
5553 .sysfs_ops = &md_sysfs_ops,
5554 .default_attrs = md_default_attrs,
5555};
5556
1da177e4
LT
5557int mdp_major = 0;
5558
5fd3a17e
DW
5559static void mddev_delayed_delete(struct work_struct *ws)
5560{
fd01b88c 5561 struct mddev *mddev = container_of(ws, struct mddev, del_work);
5fd3a17e 5562
43a70507 5563 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
5fd3a17e
DW
5564 kobject_del(&mddev->kobj);
5565 kobject_put(&mddev->kobj);
5566}
5567
4ad23a97
N
5568static void no_op(struct percpu_ref *r) {}
5569
a415c0f1
N
5570int mddev_init_writes_pending(struct mddev *mddev)
5571{
5572 if (mddev->writes_pending.percpu_count_ptr)
5573 return 0;
ddde2af7
RG
5574 if (percpu_ref_init(&mddev->writes_pending, no_op,
5575 PERCPU_REF_ALLOW_REINIT, GFP_KERNEL) < 0)
a415c0f1
N
5576 return -ENOMEM;
5577 /* We want to start with the refcount at zero */
5578 percpu_ref_put(&mddev->writes_pending);
5579 return 0;
5580}
5581EXPORT_SYMBOL_GPL(mddev_init_writes_pending);
5582
efeb53c0 5583static int md_alloc(dev_t dev, char *name)
1da177e4 5584{
039b7225
N
5585 /*
5586 * If dev is zero, name is the name of a device to allocate with
5587 * an arbitrary minor number. It will be "md_???"
5588 * If dev is non-zero it must be a device number with a MAJOR of
5589 * MD_MAJOR or mdp_major. In this case, if "name" is NULL, then
5590 * the device is being created by opening a node in /dev.
5591 * If "name" is not NULL, the device is being created by
5592 * writing to /sys/module/md_mod/parameters/new_array.
5593 */
48c9c27b 5594 static DEFINE_MUTEX(disks_mutex);
fd01b88c 5595 struct mddev *mddev = mddev_find(dev);
1da177e4 5596 struct gendisk *disk;
efeb53c0
N
5597 int partitioned;
5598 int shift;
5599 int unit;
3830c62f 5600 int error;
1da177e4
LT
5601
5602 if (!mddev)
efeb53c0
N
5603 return -ENODEV;
5604
5605 partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
5606 shift = partitioned ? MdpMinorShift : 0;
5607 unit = MINOR(mddev->unit) >> shift;
1da177e4 5608
e804ac78
TH
5609 /* wait for any previous instance of this device to be
5610 * completely removed (mddev_delayed_delete).
d3374825 5611 */
e804ac78 5612 flush_workqueue(md_misc_wq);
d3374825 5613
48c9c27b 5614 mutex_lock(&disks_mutex);
0909dc44
N
5615 error = -EEXIST;
5616 if (mddev->gendisk)
5617 goto abort;
efeb53c0 5618
039b7225 5619 if (name && !dev) {
efeb53c0
N
5620 /* Need to ensure that 'name' is not a duplicate.
5621 */
fd01b88c 5622 struct mddev *mddev2;
efeb53c0
N
5623 spin_lock(&all_mddevs_lock);
5624
5625 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
5626 if (mddev2->gendisk &&
5627 strcmp(mddev2->gendisk->disk_name, name) == 0) {
5628 spin_unlock(&all_mddevs_lock);
0909dc44 5629 goto abort;
efeb53c0
N
5630 }
5631 spin_unlock(&all_mddevs_lock);
1da177e4 5632 }
039b7225
N
5633 if (name && dev)
5634 /*
 5635 * Creating /dev/mdNNN via "new_array", so adjust hold_active.
5636 */
5637 mddev->hold_active = UNTIL_STOP;
8b765398 5638
0909dc44 5639 error = -ENOMEM;
3d745ea5 5640 mddev->queue = blk_alloc_queue(md_make_request, NUMA_NO_NODE);
0909dc44
N
5641 if (!mddev->queue)
5642 goto abort;
409c57f3 5643
b1bd055d 5644 blk_set_stacking_limits(&mddev->queue->limits);
8b765398 5645
1da177e4
LT
5646 disk = alloc_disk(1 << shift);
5647 if (!disk) {
8b765398
N
5648 blk_cleanup_queue(mddev->queue);
5649 mddev->queue = NULL;
0909dc44 5650 goto abort;
1da177e4 5651 }
efeb53c0 5652 disk->major = MAJOR(mddev->unit);
1da177e4 5653 disk->first_minor = unit << shift;
efeb53c0
N
5654 if (name)
5655 strcpy(disk->disk_name, name);
5656 else if (partitioned)
1da177e4 5657 sprintf(disk->disk_name, "md_d%d", unit);
ce7b0f46 5658 else
1da177e4 5659 sprintf(disk->disk_name, "md%d", unit);
1da177e4
LT
5660 disk->fops = &md_fops;
5661 disk->private_data = mddev;
5662 disk->queue = mddev->queue;
56883a7e 5663 blk_queue_write_cache(mddev->queue, true, true);
92850bbd 5664 /* Allow extended partitions. This makes the
d3374825 5665 * 'mdp' device redundant, but we can't really
92850bbd
N
5666 * remove it now.
5667 */
5668 disk->flags |= GENHD_FL_EXT_DEVT;
1da177e4 5669 mddev->gendisk = disk;
b0140891
N
5670 /* As soon as we call add_disk(), another thread could get
5671 * through to md_open, so make sure it doesn't get too far
5672 */
5673 mutex_lock(&mddev->open_mutex);
5674 add_disk(disk);
5675
28dec870 5676 error = kobject_add(&mddev->kobj, &disk_to_dev(disk)->kobj, "%s", "md");
0909dc44
N
5677 if (error) {
5678 /* This isn't possible, but as kobject_init_and_add is marked
5679 * __must_check, we must do something with the result
5680 */
9d48739e
N
5681 pr_debug("md: cannot register %s/md - name in use\n",
5682 disk->disk_name);
0909dc44
N
5683 error = 0;
5684 }
00bcb4ac
N
5685 if (mddev->kobj.sd &&
5686 sysfs_create_group(&mddev->kobj, &md_bitmap_group))
9d48739e 5687 pr_debug("pointless warning\n");
b0140891 5688 mutex_unlock(&mddev->open_mutex);
0909dc44
N
5689 abort:
5690 mutex_unlock(&disks_mutex);
00bcb4ac 5691 if (!error && mddev->kobj.sd) {
3830c62f 5692 kobject_uevent(&mddev->kobj, KOBJ_ADD);
00bcb4ac 5693 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
b62b7590 5694 }
d3374825 5695 mddev_put(mddev);
0909dc44 5696 return error;
efeb53c0
N
5697}
5698
5699static struct kobject *md_probe(dev_t dev, int *part, void *data)
5700{
78b6350d
N
5701 if (create_on_open)
5702 md_alloc(dev, NULL);
1da177e4
LT
5703 return NULL;
5704}
5705
e4dca7b7 5706static int add_named_array(const char *val, const struct kernel_param *kp)
efeb53c0 5707{
039b7225
N
5708 /*
5709 * val must be "md_*" or "mdNNN".
5710 * For "md_*" we allocate an array with a large free minor number, and
efeb53c0 5711 * set the name to val. val must not already be an active name.
039b7225
N
5712 * For "mdNNN" we allocate an array with the minor number NNN
5713 * which must not already be in use.
efeb53c0
N
5714 */
5715 int len = strlen(val);
5716 char buf[DISK_NAME_LEN];
039b7225 5717 unsigned long devnum;
efeb53c0
N
5718
5719 while (len && val[len-1] == '\n')
5720 len--;
5721 if (len >= DISK_NAME_LEN)
5722 return -E2BIG;
5723 strlcpy(buf, val, len+1);
039b7225
N
5724 if (strncmp(buf, "md_", 3) == 0)
5725 return md_alloc(0, buf);
5726 if (strncmp(buf, "md", 2) == 0 &&
5727 isdigit(buf[2]) &&
5728 kstrtoul(buf+2, 10, &devnum) == 0 &&
5729 devnum <= MINORMASK)
5730 return md_alloc(MKDEV(MD_MAJOR, devnum), NULL);
5731
5732 return -EINVAL;
efeb53c0
N
5733}
5734
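/*
 * Illustrative only: this handler is what userspace (e.g. mdadm) pokes
 * when pre-creating a named or numbered array:
 *
 *	echo md_home > /sys/module/md_mod/parameters/new_array
 *	echo md127   > /sys/module/md_mod/parameters/new_array
 *
 * The first form picks a large free minor for "md_home"; the second
 * claims minor 127 explicitly.
 */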
8376d3c1 5735static void md_safemode_timeout(struct timer_list *t)
1da177e4 5736{
8376d3c1 5737 struct mddev *mddev = from_timer(mddev, t, safemode_timer);
1da177e4 5738
4ad23a97
N
5739 mddev->safemode = 1;
5740 if (mddev->external)
5741 sysfs_notify_dirent_safe(mddev->sysfs_state);
5742
1da177e4
LT
5743 md_wakeup_thread(mddev->thread);
5744}
5745
6ff8d8ec 5746static int start_dirty_degraded;
1da177e4 5747
fd01b88c 5748int md_run(struct mddev *mddev)
1da177e4 5749{
2604b703 5750 int err;
3cb03002 5751 struct md_rdev *rdev;
84fc4b56 5752 struct md_personality *pers;
1da177e4 5753
a757e64c
N
5754 if (list_empty(&mddev->disks))
 5755 /* cannot run an array with no devices */
1da177e4 5756 return -EINVAL;
1da177e4
LT
5757
5758 if (mddev->pers)
5759 return -EBUSY;
bb4f1e9d
N
5760 /* Cannot run until previous stop completes properly */
5761 if (mddev->sysfs_active)
5762 return -EBUSY;
b6eb127d 5763
1da177e4
LT
5764 /*
5765 * Analyze all RAID superblock(s)
5766 */
1ec4a939
N
5767 if (!mddev->raid_disks) {
5768 if (!mddev->persistent)
5769 return -EINVAL;
6a5cb53a
YY
5770 err = analyze_sbs(mddev);
5771 if (err)
5772 return -EINVAL;
1ec4a939 5773 }
1da177e4 5774
d9d166c2
N
5775 if (mddev->level != LEVEL_NONE)
5776 request_module("md-level-%d", mddev->level);
5777 else if (mddev->clevel[0])
5778 request_module("md-%s", mddev->clevel);
1da177e4
LT
5779
5780 /*
5781 * Drop all container device buffers, from now on
5782 * the only valid external interface is through the md
5783 * device.
1da177e4 5784 */
4b6c1060 5785 mddev->has_superblocks = false;
dafb20fa 5786 rdev_for_each(rdev, mddev) {
b2d444d7 5787 if (test_bit(Faulty, &rdev->flags))
1da177e4
LT
5788 continue;
5789 sync_blockdev(rdev->bdev);
f98393a6 5790 invalidate_bdev(rdev->bdev);
97b20ef7
N
5791 if (mddev->ro != 1 &&
5792 (bdev_read_only(rdev->bdev) ||
5793 bdev_read_only(rdev->meta_bdev))) {
5794 mddev->ro = 1;
5795 if (mddev->gendisk)
5796 set_disk_ro(mddev->gendisk, 1);
5797 }
f0d76d70 5798
4b6c1060
HM
5799 if (rdev->sb_page)
5800 mddev->has_superblocks = true;
5801
f0d76d70
N
5802 /* perform some consistency tests on the device.
 5803 * We don't want the data to overlap the metadata;
 58c0fed4 5804 * internal bitmap issues have been handled elsewhere.
f0d76d70 5805 */
a6ff7e08
JB
5806 if (rdev->meta_bdev) {
5807 /* Nothing to check */;
5808 } else if (rdev->data_offset < rdev->sb_start) {
58c0fed4
AN
5809 if (mddev->dev_sectors &&
5810 rdev->data_offset + mddev->dev_sectors
0f420358 5811 > rdev->sb_start) {
9d48739e
N
5812 pr_warn("md: %s: data overlaps metadata\n",
5813 mdname(mddev));
f0d76d70
N
5814 return -EINVAL;
5815 }
5816 } else {
0f420358 5817 if (rdev->sb_start + rdev->sb_size/512
f0d76d70 5818 > rdev->data_offset) {
9d48739e
N
5819 pr_warn("md: %s: metadata overlaps data\n",
5820 mdname(mddev));
f0d76d70
N
5821 return -EINVAL;
5822 }
5823 }
00bcb4ac 5824 sysfs_notify_dirent_safe(rdev->sysfs_state);
1da177e4
LT
5825 }
5826
afeee514
KO
5827 if (!bioset_initialized(&mddev->bio_set)) {
5828 err = bioset_init(&mddev->bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5829 if (err)
5830 return err;
10273170 5831 }
afeee514
KO
5832 if (!bioset_initialized(&mddev->sync_set)) {
5833 err = bioset_init(&mddev->sync_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
5834 if (err)
28dec870 5835 return err;
5a85071c 5836 }
a167f663 5837
1da177e4 5838 spin_lock(&pers_lock);
d9d166c2 5839 pers = find_pers(mddev->level, mddev->clevel);
2604b703 5840 if (!pers || !try_module_get(pers->owner)) {
1da177e4 5841 spin_unlock(&pers_lock);
d9d166c2 5842 if (mddev->level != LEVEL_NONE)
9d48739e
N
5843 pr_warn("md: personality for level %d is not loaded!\n",
5844 mddev->level);
d9d166c2 5845 else
9d48739e
N
5846 pr_warn("md: personality for level %s is not loaded!\n",
5847 mddev->clevel);
bfc9dfdc
SL
5848 err = -EINVAL;
5849 goto abort;
1da177e4 5850 }
1da177e4 5851 spin_unlock(&pers_lock);
34817e8c
N
5852 if (mddev->level != pers->level) {
5853 mddev->level = pers->level;
5854 mddev->new_level = pers->level;
5855 }
d9d166c2 5856 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
1da177e4 5857
f6705578 5858 if (mddev->reshape_position != MaxSector &&
63c70c4f 5859 pers->start_reshape == NULL) {
f6705578 5860 /* This personality cannot handle reshaping... */
f6705578 5861 module_put(pers->owner);
bfc9dfdc
SL
5862 err = -EINVAL;
5863 goto abort;
f6705578
N
5864 }
5865
7dd5e7c3
N
5866 if (pers->sync_request) {
5867 /* Warn if this is a potentially silly
5868 * configuration.
5869 */
5870 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3cb03002 5871 struct md_rdev *rdev2;
7dd5e7c3 5872 int warned = 0;
159ec1fc 5873
dafb20fa
N
5874 rdev_for_each(rdev, mddev)
5875 rdev_for_each(rdev2, mddev) {
7dd5e7c3
N
5876 if (rdev < rdev2 &&
5877 rdev->bdev->bd_contains ==
5878 rdev2->bdev->bd_contains) {
9d48739e
N
5879 pr_warn("%s: WARNING: %s appears to be on the same physical disk as %s.\n",
5880 mdname(mddev),
5881 bdevname(rdev->bdev,b),
5882 bdevname(rdev2->bdev,b2));
7dd5e7c3
N
5883 warned = 1;
5884 }
5885 }
159ec1fc 5886
7dd5e7c3 5887 if (warned)
9d48739e 5888 pr_warn("True protection against single-disk failure might be compromised.\n");
7dd5e7c3
N
5889 }
5890
657390d2 5891 mddev->recovery = 0;
58c0fed4
AN
 5892 /* may be overridden by personality */
5893 mddev->resync_max_sectors = mddev->dev_sectors;
5894
6ff8d8ec 5895 mddev->ok_start_degraded = start_dirty_degraded;
1da177e4 5896
0f9552b5 5897 if (start_readonly && mddev->ro == 0)
f91de92e
N
5898 mddev->ro = 2; /* read-only, but switch on first write */
5899
36d091f4 5900 err = pers->run(mddev);
13e53df3 5901 if (err)
9d48739e 5902 pr_warn("md: pers->run() failed ...\n");
36d091f4 5903 else if (pers->size(mddev, 0, 0) < mddev->array_sectors) {
9d48739e
N
5904 WARN_ONCE(!mddev->external_size,
5905 "%s: default size too small, but 'external_size' not in effect?\n",
5906 __func__);
5907 pr_warn("md: invalid array_size %llu > default size %llu\n",
5908 (unsigned long long)mddev->array_sectors / 2,
5909 (unsigned long long)pers->size(mddev, 0, 0) / 2);
b522adcd 5910 err = -EINVAL;
b522adcd 5911 }
36d091f4 5912 if (err == 0 && pers->sync_request &&
ef99bf48 5913 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
f9209a32
GR
5914 struct bitmap *bitmap;
5915
e64e4018 5916 bitmap = md_bitmap_create(mddev, -1);
f9209a32
GR
5917 if (IS_ERR(bitmap)) {
5918 err = PTR_ERR(bitmap);
9d48739e
N
5919 pr_warn("%s: failed to create bitmap (%d)\n",
5920 mdname(mddev), err);
f9209a32
GR
5921 } else
5922 mddev->bitmap = bitmap;
5923
b15c2e57 5924 }
d494549a
GJ
5925 if (err)
5926 goto bitmap_abort;
3e148a32
GJ
5927
5928 if (mddev->bitmap_info.max_write_behind > 0) {
3e173ab5 5929 bool create_pool = false;
3e148a32
GJ
5930
5931 rdev_for_each(rdev, mddev) {
5932 if (test_bit(WriteMostly, &rdev->flags) &&
404659cf 5933 rdev_init_serial(rdev))
3e173ab5 5934 create_pool = true;
3e148a32 5935 }
3e173ab5 5936 if (create_pool && mddev->serial_info_pool == NULL) {
404659cf
GJ
5937 mddev->serial_info_pool =
5938 mempool_create_kmalloc_pool(NR_SERIAL_INFOS,
5939 sizeof(struct serial_info));
5940 if (!mddev->serial_info_pool) {
3e148a32 5941 err = -ENOMEM;
d494549a 5942 goto bitmap_abort;
3e148a32
GJ
5943 }
5944 }
5945 }
5946
5c675f83 5947 if (mddev->queue) {
bb086a89
SL
5948 bool nonrot = true;
5949
5950 rdev_for_each(rdev, mddev) {
5951 if (rdev->raid_disk >= 0 &&
5952 !blk_queue_nonrot(bdev_get_queue(rdev->bdev))) {
5953 nonrot = false;
5954 break;
5955 }
5956 }
5957 if (mddev->degraded)
5958 nonrot = false;
5959 if (nonrot)
8b904b5b 5960 blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
bb086a89 5961 else
8b904b5b 5962 blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
dc3b17cc
JK
5963 mddev->queue->backing_dev_info->congested_data = mddev;
5964 mddev->queue->backing_dev_info->congested_fn = md_congested;
5c675f83 5965 }
36d091f4 5966 if (pers->sync_request) {
00bcb4ac
N
5967 if (mddev->kobj.sd &&
5968 sysfs_create_group(&mddev->kobj, &md_redundancy_group))
9d48739e
N
5969 pr_warn("md: cannot register extra attributes for %s\n",
5970 mdname(mddev));
00bcb4ac 5971 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5e55e2f5 5972 } else if (mddev->ro == 2) /* auto-readonly not meaningful */
fd9d49ca
N
5973 mddev->ro = 0;
5974
1e50915f
RB
5975 atomic_set(&mddev->max_corr_read_errors,
5976 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
1da177e4 5977 mddev->safemode = 0;
28c1b9fd
GR
5978 if (mddev_is_clustered(mddev))
5979 mddev->safemode_delay = 0;
5980 else
5981 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
1da177e4 5982 mddev->in_sync = 1;
0ca69886 5983 smp_wmb();
36d091f4
N
5984 spin_lock(&mddev->lock);
5985 mddev->pers = pers;
36d091f4 5986 spin_unlock(&mddev->lock);
dafb20fa 5987 rdev_for_each(rdev, mddev)
36fad858 5988 if (rdev->raid_disk >= 0)
e5b521ee 5989 sysfs_link_rdev(mddev, rdev); /* failure here is OK */
f72ffdd6 5990
a4a3d26d
N
5991 if (mddev->degraded && !mddev->ro)
5992 /* This ensures that recovering status is reported immediately
5993 * via sysfs - until a lack of spares is confirmed.
5994 */
5995 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
1da177e4 5996 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
f72ffdd6 5997
2953079c 5998 if (mddev->sb_flags)
850b2b42 5999 md_update_sb(mddev, 0);
1da177e4 6000
d7603b7e 6001 md_new_event(mddev);
1da177e4 6002 return 0;
b126194c 6003
d494549a
GJ
6004bitmap_abort:
6005 mddev_detach(mddev);
6006 if (mddev->private)
6007 pers->free(mddev, mddev->private);
6008 mddev->private = NULL;
6009 module_put(pers->owner);
6010 md_bitmap_destroy(mddev);
b126194c 6011abort:
4bc034d3
N
6012 bioset_exit(&mddev->bio_set);
6013 bioset_exit(&mddev->sync_set);
b126194c 6014 return err;
1da177e4 6015}
390ee602 6016EXPORT_SYMBOL_GPL(md_run);
1da177e4 6017
fd01b88c 6018static int do_md_run(struct mddev *mddev)
fe60b014
N
6019{
6020 int err;
6021
9d4b45d6 6022 set_bit(MD_NOT_READY, &mddev->flags);
fe60b014
N
6023 err = md_run(mddev);
6024 if (err)
6025 goto out;
e64e4018 6026 err = md_bitmap_load(mddev);
69e51b44 6027 if (err) {
e64e4018 6028 md_bitmap_destroy(mddev);
69e51b44
N
6029 goto out;
6030 }
0fd018af 6031
28c1b9fd
GR
6032 if (mddev_is_clustered(mddev))
6033 md_allow_write(mddev);
6034
d5d885fd
SL
6035 /* run start up tasks that require md_thread */
6036 md_start(mddev);
6037
0fd018af
JB
6038 md_wakeup_thread(mddev->thread);
6039 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
6040
fe60b014
N
6041 set_capacity(mddev->gendisk, mddev->array_sectors);
6042 revalidate_disk(mddev->gendisk);
9d4b45d6 6043 clear_bit(MD_NOT_READY, &mddev->flags);
f0b4f7e2 6044 mddev->changed = 1;
fe60b014 6045 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
9d4b45d6
N
6046 sysfs_notify_dirent_safe(mddev->sysfs_state);
6047 sysfs_notify_dirent_safe(mddev->sysfs_action);
6048 sysfs_notify(&mddev->kobj, NULL, "degraded");
fe60b014 6049out:
9d4b45d6 6050 clear_bit(MD_NOT_READY, &mddev->flags);
fe60b014
N
6051 return err;
6052}
6053
d5d885fd
SL
6054int md_start(struct mddev *mddev)
6055{
6056 int ret = 0;
6057
6058 if (mddev->pers->start) {
6059 set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6060 md_wakeup_thread(mddev->thread);
6061 ret = mddev->pers->start(mddev);
6062 clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
6063 md_wakeup_thread(mddev->sync_thread);
6064 }
6065 return ret;
6066}
6067EXPORT_SYMBOL_GPL(md_start);
6068
fd01b88c 6069static int restart_array(struct mddev *mddev)
1da177e4
LT
6070{
6071 struct gendisk *disk = mddev->gendisk;
97b20ef7
N
6072 struct md_rdev *rdev;
6073 bool has_journal = false;
6074 bool has_readonly = false;
1da177e4 6075
80fab1d7 6076 /* Complain if it has no devices */
1da177e4 6077 if (list_empty(&mddev->disks))
80fab1d7
AN
6078 return -ENXIO;
6079 if (!mddev->pers)
6080 return -EINVAL;
6081 if (!mddev->ro)
6082 return -EBUSY;
339421de 6083
97b20ef7
N
6084 rcu_read_lock();
6085 rdev_for_each_rcu(rdev, mddev) {
6086 if (test_bit(Journal, &rdev->flags) &&
6087 !test_bit(Faulty, &rdev->flags))
6088 has_journal = true;
6089 if (bdev_read_only(rdev->bdev))
6090 has_readonly = true;
6091 }
6092 rcu_read_unlock();
6093 if (test_bit(MD_HAS_JOURNAL, &mddev->flags) && !has_journal)
339421de 6094 /* Don't restart rw with journal missing/faulty */
339421de 6095 return -EINVAL;
97b20ef7
N
6096 if (has_readonly)
6097 return -EROFS;
339421de 6098
80fab1d7
AN
6099 mddev->safemode = 0;
6100 mddev->ro = 0;
6101 set_disk_ro(disk, 0);
9d48739e 6102 pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
80fab1d7
AN
6103 /* Kick recovery or resync if necessary */
6104 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6105 md_wakeup_thread(mddev->thread);
6106 md_wakeup_thread(mddev->sync_thread);
00bcb4ac 6107 sysfs_notify_dirent_safe(mddev->sysfs_state);
80fab1d7 6108 return 0;
1da177e4
LT
6109}
6110
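/*
 * Illustrative only: this read-only to read-write transition is what
 * e.g. "mdadm --readwrite /dev/md0" ultimately requests, and it only
 * succeeds once no member device is itself read-only and any required
 * journal device is present.
 */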
fd01b88c 6111static void md_clean(struct mddev *mddev)
6177b472
N
6112{
6113 mddev->array_sectors = 0;
6114 mddev->external_size = 0;
6115 mddev->dev_sectors = 0;
6116 mddev->raid_disks = 0;
6117 mddev->recovery_cp = 0;
6118 mddev->resync_min = 0;
6119 mddev->resync_max = MaxSector;
6120 mddev->reshape_position = MaxSector;
6121 mddev->external = 0;
6122 mddev->persistent = 0;
6123 mddev->level = LEVEL_NONE;
6124 mddev->clevel[0] = 0;
6125 mddev->flags = 0;
2953079c 6126 mddev->sb_flags = 0;
6177b472
N
6127 mddev->ro = 0;
6128 mddev->metadata_type[0] = 0;
6129 mddev->chunk_sectors = 0;
6130 mddev->ctime = mddev->utime = 0;
6131 mddev->layout = 0;
6132 mddev->max_disks = 0;
6133 mddev->events = 0;
a8707c08 6134 mddev->can_decrease_events = 0;
6177b472 6135 mddev->delta_disks = 0;
2c810cdd 6136 mddev->reshape_backwards = 0;
6177b472
N
6137 mddev->new_level = LEVEL_NONE;
6138 mddev->new_layout = 0;
6139 mddev->new_chunk_sectors = 0;
6140 mddev->curr_resync = 0;
7f7583d4 6141 atomic64_set(&mddev->resync_mismatches, 0);
6177b472
N
6142 mddev->suspend_lo = mddev->suspend_hi = 0;
6143 mddev->sync_speed_min = mddev->sync_speed_max = 0;
6144 mddev->recovery = 0;
6145 mddev->in_sync = 0;
f0b4f7e2 6146 mddev->changed = 0;
6177b472 6147 mddev->degraded = 0;
6177b472 6148 mddev->safemode = 0;
bd691922 6149 mddev->private = NULL;
c20c33f0 6150 mddev->cluster_info = NULL;
6177b472
N
6151 mddev->bitmap_info.offset = 0;
6152 mddev->bitmap_info.default_offset = 0;
6409bb05 6153 mddev->bitmap_info.default_space = 0;
6177b472
N
6154 mddev->bitmap_info.chunksize = 0;
6155 mddev->bitmap_info.daemon_sleep = 0;
6156 mddev->bitmap_info.max_write_behind = 0;
c20c33f0 6157 mddev->bitmap_info.nodes = 0;
6177b472
N
6158}
6159
fd01b88c 6160static void __md_stop_writes(struct mddev *mddev)
a047e125 6161{
6b6204ee 6162 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
21e0958e
GJ
6163 if (work_pending(&mddev->del_work))
6164 flush_workqueue(md_misc_wq);
a047e125 6165 if (mddev->sync_thread) {
a047e125 6166 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
a91d5ac0 6167 md_reap_sync_thread(mddev);
a047e125
N
6168 }
6169
6170 del_timer_sync(&mddev->safemode_timer);
6171
034e33f5
SL
6172 if (mddev->pers && mddev->pers->quiesce) {
6173 mddev->pers->quiesce(mddev, 1);
6174 mddev->pers->quiesce(mddev, 0);
6175 }
e64e4018 6176 md_bitmap_flush(mddev);
a047e125 6177
b6d428c6 6178 if (mddev->ro == 0 &&
28c1b9fd 6179 ((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
2953079c 6180 mddev->sb_flags)) {
a047e125 6181 /* mark array as shutdown cleanly */
28c1b9fd
GR
6182 if (!mddev_is_clustered(mddev))
6183 mddev->in_sync = 1;
a047e125
N
6184 md_update_sb(mddev, 1);
6185 }
69b00b5b
GJ
 6186 /* disable the policy so rdevs can free their serialization resources */
6187 mddev->serialize_policy = 0;
6188 mddev_destroy_serial_pool(mddev, NULL, true);
a047e125 6189}
defad61a 6190
fd01b88c 6191void md_stop_writes(struct mddev *mddev)
defad61a 6192{
29f097c4 6193 mddev_lock_nointr(mddev);
defad61a
N
6194 __md_stop_writes(mddev);
6195 mddev_unlock(mddev);
6196}
390ee602 6197EXPORT_SYMBOL_GPL(md_stop_writes);
a047e125 6198
5aa61f42
N
6199static void mddev_detach(struct mddev *mddev)
6200{
e64e4018 6201 md_bitmap_wait_behind_writes(mddev);
6b40bec3 6202 if (mddev->pers && mddev->pers->quiesce && !mddev->suspended) {
5aa61f42
N
6203 mddev->pers->quiesce(mddev, 1);
6204 mddev->pers->quiesce(mddev, 0);
6205 }
6206 md_unregister_thread(&mddev->thread);
6207 if (mddev->queue)
 6208 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf' */
6209}
6210
5eff3c43 6211static void __md_stop(struct mddev *mddev)
6177b472 6212{
36d091f4 6213 struct md_personality *pers = mddev->pers;
e64e4018 6214 md_bitmap_destroy(mddev);
5aa61f42 6215 mddev_detach(mddev);
ee5d004f 6216 /* Ensure ->event_work is done */
21e0958e
GJ
6217 if (mddev->event_work.func)
6218 flush_workqueue(md_misc_wq);
36d091f4 6219 spin_lock(&mddev->lock);
6177b472 6220 mddev->pers = NULL;
36d091f4
N
6221 spin_unlock(&mddev->lock);
6222 pers->free(mddev, mddev->private);
bd691922 6223 mddev->private = NULL;
36d091f4
N
6224 if (pers->sync_request && mddev->to_remove == NULL)
6225 mddev->to_remove = &md_redundancy_group;
6226 module_put(pers->owner);
cca9cf90 6227 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6aaa58c9
JW
6228}
6229
6230void md_stop(struct mddev *mddev)
6231{
 6232 /* stop the array and free any attached data structures.
6233 * This is called from dm-raid
6234 */
6235 __md_stop(mddev);
afeee514
KO
6236 bioset_exit(&mddev->bio_set);
6237 bioset_exit(&mddev->sync_set);
5eff3c43
N
6238}
6239
390ee602 6240EXPORT_SYMBOL_GPL(md_stop);
6177b472 6241
a05b7ea0 6242static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
a4bd82d0
N
6243{
6244 int err = 0;
30b8feb7
N
6245 int did_freeze = 0;
6246
6247 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6248 did_freeze = 1;
6249 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6250 md_wakeup_thread(mddev->thread);
6251 }
f851b60d 6252 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
30b8feb7 6253 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
f851b60d 6254 if (mddev->sync_thread)
30b8feb7
N
6255 /* Thread might be blocked waiting for metadata update
6256 * which will now never happen */
6257 wake_up_process(mddev->sync_thread->tsk);
f851b60d 6258
2953079c 6259 if (mddev->external && test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
88724bfa 6260 return -EBUSY;
30b8feb7 6261 mddev_unlock(mddev);
f851b60d
N
6262 wait_event(resync_wait, !test_bit(MD_RECOVERY_RUNNING,
6263 &mddev->recovery));
88724bfa 6264 wait_event(mddev->sb_wait,
2953079c 6265 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
30b8feb7
N
6266 mddev_lock_nointr(mddev);
6267
a4bd82d0 6268 mutex_lock(&mddev->open_mutex);
9ba3b7f5 6269 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
30b8feb7 6270 mddev->sync_thread ||
af8d8e6f 6271 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
9d48739e 6272 pr_warn("md: %s still in use.\n",mdname(mddev));
30b8feb7
N
6273 if (did_freeze) {
6274 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
45eaf45d 6275 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
30b8feb7
N
6276 md_wakeup_thread(mddev->thread);
6277 }
a4bd82d0
N
6278 err = -EBUSY;
6279 goto out;
6280 }
6281 if (mddev->pers) {
defad61a 6282 __md_stop_writes(mddev);
a4bd82d0
N
6283
6284 err = -ENXIO;
6285 if (mddev->ro==1)
6286 goto out;
6287 mddev->ro = 1;
6288 set_disk_ro(mddev->gendisk, 1);
6289 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
45eaf45d
N
6290 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6291 md_wakeup_thread(mddev->thread);
00bcb4ac 6292 sysfs_notify_dirent_safe(mddev->sysfs_state);
30b8feb7 6293 err = 0;
a4bd82d0
N
6294 }
6295out:
6296 mutex_unlock(&mddev->open_mutex);
6297 return err;
6298}
6299
9e653b63
N
6300/* mode:
 6301 * 0 - completely stop and disassemble array
9e653b63
N
6302 * 2 - stop but do not disassemble array
6303 */
f72ffdd6 6304static int do_md_stop(struct mddev *mddev, int mode,
a05b7ea0 6305 struct block_device *bdev)
1da177e4 6306{
1da177e4 6307 struct gendisk *disk = mddev->gendisk;
3cb03002 6308 struct md_rdev *rdev;
30b8feb7
N
6309 int did_freeze = 0;
6310
6311 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
6312 did_freeze = 1;
6313 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
6314 md_wakeup_thread(mddev->thread);
6315 }
f851b60d 6316 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
30b8feb7 6317 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
f851b60d 6318 if (mddev->sync_thread)
30b8feb7
N
6319 /* Thread might be blocked waiting for metadata update
6320 * which will now never happen */
6321 wake_up_process(mddev->sync_thread->tsk);
f851b60d 6322
30b8feb7 6323 mddev_unlock(mddev);
f851b60d
N
6324 wait_event(resync_wait, (mddev->sync_thread == NULL &&
6325 !test_bit(MD_RECOVERY_RUNNING,
6326 &mddev->recovery)));
30b8feb7 6327 mddev_lock_nointr(mddev);
1da177e4 6328
c8c00a69 6329 mutex_lock(&mddev->open_mutex);
9ba3b7f5 6330 if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
30b8feb7
N
6331 mddev->sysfs_active ||
6332 mddev->sync_thread ||
af8d8e6f 6333 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
9d48739e 6334 pr_warn("md: %s still in use.\n",mdname(mddev));
6e17b027 6335 mutex_unlock(&mddev->open_mutex);
30b8feb7
N
6336 if (did_freeze) {
6337 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
45eaf45d 6338 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
30b8feb7
N
6339 md_wakeup_thread(mddev->thread);
6340 }
260fa034
N
6341 return -EBUSY;
6342 }
6e17b027 6343 if (mddev->pers) {
a4bd82d0
N
6344 if (mddev->ro)
6345 set_disk_ro(disk, 0);
409c57f3 6346
defad61a 6347 __md_stop_writes(mddev);
5eff3c43 6348 __md_stop(mddev);
dc3b17cc 6349 mddev->queue->backing_dev_info->congested_fn = NULL;
6177b472 6350
a4bd82d0 6351 /* tell userspace to handle 'inactive' */
00bcb4ac 6352 sysfs_notify_dirent_safe(mddev->sysfs_state);
0d4ca600 6353
dafb20fa 6354 rdev_for_each(rdev, mddev)
36fad858
NK
6355 if (rdev->raid_disk >= 0)
6356 sysfs_unlink_rdev(mddev, rdev);
c4647292 6357
a4bd82d0 6358 set_capacity(disk, 0);
6e17b027 6359 mutex_unlock(&mddev->open_mutex);
f0b4f7e2 6360 mddev->changed = 1;
a4bd82d0 6361 revalidate_disk(disk);
0d4ca600 6362
a4bd82d0
N
6363 if (mddev->ro)
6364 mddev->ro = 0;
6e17b027
N
6365 } else
6366 mutex_unlock(&mddev->open_mutex);
1da177e4
LT
6367 /*
6368 * Free resources if final stop
6369 */
9e653b63 6370 if (mode == 0) {
9d48739e 6371 pr_info("md: %s stopped.\n", mdname(mddev));
1da177e4 6372
c3d9714e 6373 if (mddev->bitmap_info.file) {
4af1a041
N
6374 struct file *f = mddev->bitmap_info.file;
6375 spin_lock(&mddev->lock);
c3d9714e 6376 mddev->bitmap_info.file = NULL;
4af1a041
N
6377 spin_unlock(&mddev->lock);
6378 fput(f);
978f946b 6379 }
c3d9714e 6380 mddev->bitmap_info.offset = 0;
978f946b 6381
1da177e4
LT
6382 export_array(mddev);
6383
6177b472 6384 md_clean(mddev);
efeb53c0
N
6385 if (mddev->hold_active == UNTIL_STOP)
6386 mddev->hold_active = 0;
a4bd82d0 6387 }
d7603b7e 6388 md_new_event(mddev);
00bcb4ac 6389 sysfs_notify_dirent_safe(mddev->sysfs_state);
6e17b027 6390 return 0;
1da177e4
LT
6391}
6392
fdee8ae4 6393#ifndef MODULE
fd01b88c 6394static void autorun_array(struct mddev *mddev)
1da177e4 6395{
3cb03002 6396 struct md_rdev *rdev;
1da177e4
LT
6397 int err;
6398
a757e64c 6399 if (list_empty(&mddev->disks))
1da177e4 6400 return;
1da177e4 6401
9d48739e 6402 pr_info("md: running: ");
1da177e4 6403
dafb20fa 6404 rdev_for_each(rdev, mddev) {
1da177e4 6405 char b[BDEVNAME_SIZE];
9d48739e 6406 pr_cont("<%s>", bdevname(rdev->bdev,b));
1da177e4 6407 }
9d48739e 6408 pr_cont("\n");
1da177e4 6409
d710e138 6410 err = do_md_run(mddev);
1da177e4 6411 if (err) {
9d48739e 6412 pr_warn("md: do_md_run() returned %d\n", err);
a05b7ea0 6413 do_md_stop(mddev, 0, NULL);
1da177e4
LT
6414 }
6415}
6416
6417/*
 6418 * let's try to run arrays based on all disks that have arrived
6419 * until now. (those are in pending_raid_disks)
6420 *
6421 * the method: pick the first pending disk, collect all disks with
6422 * the same UUID, remove all from the pending list and put them into
6423 * the 'same_array' list. Then order this list based on superblock
6424 * update time (freshest comes first), kick out 'old' disks and
6425 * compare superblocks. If everything's fine then run it.
6426 *
6427 * If "unit" is allocated, then bump its reference count
6428 */
6429static void autorun_devices(int part)
6430{
3cb03002 6431 struct md_rdev *rdev0, *rdev, *tmp;
fd01b88c 6432 struct mddev *mddev;
1da177e4
LT
6433 char b[BDEVNAME_SIZE];
6434
9d48739e 6435 pr_info("md: autorun ...\n");
1da177e4 6436 while (!list_empty(&pending_raid_disks)) {
e8703fe1 6437 int unit;
1da177e4 6438 dev_t dev;
ad01c9e3 6439 LIST_HEAD(candidates);
1da177e4 6440 rdev0 = list_entry(pending_raid_disks.next,
3cb03002 6441 struct md_rdev, same_set);
1da177e4 6442
9d48739e 6443 pr_debug("md: considering %s ...\n", bdevname(rdev0->bdev,b));
1da177e4 6444 INIT_LIST_HEAD(&candidates);
159ec1fc 6445 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
1da177e4 6446 if (super_90_load(rdev, rdev0, 0) >= 0) {
9d48739e
N
6447 pr_debug("md: adding %s ...\n",
6448 bdevname(rdev->bdev,b));
1da177e4
LT
6449 list_move(&rdev->same_set, &candidates);
6450 }
6451 /*
6452 * now we have a set of devices, with all of them having
6453 * mostly sane superblocks. It's time to allocate the
6454 * mddev.
6455 */
e8703fe1
N
6456 if (part) {
6457 dev = MKDEV(mdp_major,
6458 rdev0->preferred_minor << MdpMinorShift);
6459 unit = MINOR(dev) >> MdpMinorShift;
6460 } else {
6461 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
6462 unit = MINOR(dev);
6463 }
6464 if (rdev0->preferred_minor != unit) {
9d48739e
N
6465 pr_warn("md: unit number in %s is bad: %d\n",
6466 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
1da177e4
LT
6467 break;
6468 }
1da177e4
LT
6469
6470 md_probe(dev, NULL, NULL);
6471 mddev = mddev_find(dev);
9bbbca3a
NB
6472 if (!mddev || !mddev->gendisk) {
6473 if (mddev)
6474 mddev_put(mddev);
1da177e4
LT
6475 break;
6476 }
f72ffdd6 6477 if (mddev_lock(mddev))
9d48739e 6478 pr_warn("md: %s locked, cannot run\n", mdname(mddev));
1da177e4
LT
6479 else if (mddev->raid_disks || mddev->major_version
6480 || !list_empty(&mddev->disks)) {
9d48739e 6481 pr_warn("md: %s already running, cannot run %s\n",
1da177e4
LT
6482 mdname(mddev), bdevname(rdev0->bdev,b));
6483 mddev_unlock(mddev);
6484 } else {
9d48739e 6485 pr_debug("md: created %s\n", mdname(mddev));
1ec4a939 6486 mddev->persistent = 1;
159ec1fc 6487 rdev_for_each_list(rdev, tmp, &candidates) {
1da177e4
LT
6488 list_del_init(&rdev->same_set);
6489 if (bind_rdev_to_array(rdev, mddev))
6490 export_rdev(rdev);
6491 }
6492 autorun_array(mddev);
6493 mddev_unlock(mddev);
6494 }
6495 /* on success, candidates will be empty, on error
6496 * it won't...
6497 */
159ec1fc 6498 rdev_for_each_list(rdev, tmp, &candidates) {
4b80991c 6499 list_del_init(&rdev->same_set);
1da177e4 6500 export_rdev(rdev);
4b80991c 6501 }
1da177e4
LT
6502 mddev_put(mddev);
6503 }
9d48739e 6504 pr_info("md: ... autorun DONE.\n");
1da177e4 6505}
fdee8ae4 6506#endif /* !MODULE */
1da177e4 6507
f72ffdd6 6508static int get_version(void __user *arg)
1da177e4
LT
6509{
6510 mdu_version_t ver;
6511
6512 ver.major = MD_MAJOR_VERSION;
6513 ver.minor = MD_MINOR_VERSION;
6514 ver.patchlevel = MD_PATCHLEVEL_VERSION;
6515
6516 if (copy_to_user(arg, &ver, sizeof(ver)))
6517 return -EFAULT;
6518
6519 return 0;
6520}
6521
f72ffdd6 6522static int get_array_info(struct mddev *mddev, void __user *arg)
1da177e4
LT
6523{
6524 mdu_array_info_t info;
a9f326eb 6525 int nr,working,insync,failed,spare;
3cb03002 6526 struct md_rdev *rdev;
1da177e4 6527
1ca69c4b
N
6528 nr = working = insync = failed = spare = 0;
6529 rcu_read_lock();
6530 rdev_for_each_rcu(rdev, mddev) {
1da177e4 6531 nr++;
b2d444d7 6532 if (test_bit(Faulty, &rdev->flags))
1da177e4
LT
6533 failed++;
6534 else {
6535 working++;
b2d444d7 6536 if (test_bit(In_sync, &rdev->flags))
f72ffdd6 6537 insync++;
b347af81
SL
6538 else if (test_bit(Journal, &rdev->flags))
6539 /* TODO: add journal count to md_u.h */
6540 ;
1da177e4
LT
6541 else
6542 spare++;
6543 }
6544 }
1ca69c4b 6545 rcu_read_unlock();
1da177e4
LT
6546
6547 info.major_version = mddev->major_version;
6548 info.minor_version = mddev->minor_version;
6549 info.patch_version = MD_PATCHLEVEL_VERSION;
9ebc6ef1 6550 info.ctime = clamp_t(time64_t, mddev->ctime, 0, U32_MAX);
1da177e4 6551 info.level = mddev->level;
58c0fed4
AN
6552 info.size = mddev->dev_sectors / 2;
6553 if (info.size != mddev->dev_sectors / 2) /* overflow */
284ae7ca 6554 info.size = -1;
1da177e4
LT
6555 info.nr_disks = nr;
6556 info.raid_disks = mddev->raid_disks;
6557 info.md_minor = mddev->md_minor;
6558 info.not_persistent= !mddev->persistent;
6559
9ebc6ef1 6560 info.utime = clamp_t(time64_t, mddev->utime, 0, U32_MAX);
1da177e4
LT
6561 info.state = 0;
6562 if (mddev->in_sync)
6563 info.state = (1<<MD_SB_CLEAN);
c3d9714e 6564 if (mddev->bitmap && mddev->bitmap_info.offset)
9bd35920 6565 info.state |= (1<<MD_SB_BITMAP_PRESENT);
ca8895d9
GR
6566 if (mddev_is_clustered(mddev))
6567 info.state |= (1<<MD_SB_CLUSTERED);
a9f326eb 6568 info.active_disks = insync;
1da177e4
LT
6569 info.working_disks = working;
6570 info.failed_disks = failed;
6571 info.spare_disks = spare;
6572
6573 info.layout = mddev->layout;
9d8f0363 6574 info.chunk_size = mddev->chunk_sectors << 9;
1da177e4
LT
6575
6576 if (copy_to_user(arg, &info, sizeof(info)))
6577 return -EFAULT;
6578
6579 return 0;
6580}
6581
f72ffdd6 6582static int get_bitmap_file(struct mddev *mddev, void __user * arg)
32a7627c
N
6583{
6584 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
f4ad3d38 6585 char *ptr;
4af1a041 6586 int err;
32a7627c 6587
b6878d9e 6588 file = kzalloc(sizeof(*file), GFP_NOIO);
32a7627c 6589 if (!file)
4af1a041 6590 return -ENOMEM;
32a7627c 6591
4af1a041
N
6592 err = 0;
6593 spin_lock(&mddev->lock);
25eafe1a
BR
6594 /* bitmap enabled */
6595 if (mddev->bitmap_info.file) {
6596 ptr = file_path(mddev->bitmap_info.file, file->pathname,
6597 sizeof(file->pathname));
6598 if (IS_ERR(ptr))
6599 err = PTR_ERR(ptr);
6600 else
6601 memmove(file->pathname, ptr,
6602 sizeof(file->pathname)-(ptr-file->pathname));
6603 }
4af1a041 6604 spin_unlock(&mddev->lock);
32a7627c 6605
4af1a041
N
6606 if (err == 0 &&
6607 copy_to_user(arg, file, sizeof(*file)))
32a7627c 6608 err = -EFAULT;
4af1a041 6609
32a7627c
N
6610 kfree(file);
6611 return err;
6612}
6613
f72ffdd6 6614static int get_disk_info(struct mddev *mddev, void __user * arg)
1da177e4
LT
6615{
6616 mdu_disk_info_t info;
3cb03002 6617 struct md_rdev *rdev;
1da177e4
LT
6618
6619 if (copy_from_user(&info, arg, sizeof(info)))
6620 return -EFAULT;
6621
1ca69c4b 6622 rcu_read_lock();
57d051dc 6623 rdev = md_find_rdev_nr_rcu(mddev, info.number);
1da177e4
LT
6624 if (rdev) {
6625 info.major = MAJOR(rdev->bdev->bd_dev);
6626 info.minor = MINOR(rdev->bdev->bd_dev);
6627 info.raid_disk = rdev->raid_disk;
6628 info.state = 0;
b2d444d7 6629 if (test_bit(Faulty, &rdev->flags))
1da177e4 6630 info.state |= (1<<MD_DISK_FAULTY);
b2d444d7 6631 else if (test_bit(In_sync, &rdev->flags)) {
1da177e4
LT
6632 info.state |= (1<<MD_DISK_ACTIVE);
6633 info.state |= (1<<MD_DISK_SYNC);
6634 }
9efdca16 6635 if (test_bit(Journal, &rdev->flags))
bac624f3 6636 info.state |= (1<<MD_DISK_JOURNAL);
8ddf9efe
N
6637 if (test_bit(WriteMostly, &rdev->flags))
6638 info.state |= (1<<MD_DISK_WRITEMOSTLY);
688834e6
N
6639 if (test_bit(FailFast, &rdev->flags))
6640 info.state |= (1<<MD_DISK_FAILFAST);
1da177e4
LT
6641 } else {
6642 info.major = info.minor = 0;
6643 info.raid_disk = -1;
6644 info.state = (1<<MD_DISK_REMOVED);
6645 }
1ca69c4b 6646 rcu_read_unlock();
1da177e4
LT
6647
6648 if (copy_to_user(arg, &info, sizeof(info)))
6649 return -EFAULT;
6650
6651 return 0;
6652}
6653
f72ffdd6 6654static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
1da177e4
LT
6655{
6656 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
3cb03002 6657 struct md_rdev *rdev;
1da177e4
LT
6658 dev_t dev = MKDEV(info->major,info->minor);
6659
1aee41f6
GR
6660 if (mddev_is_clustered(mddev) &&
6661 !(info->state & ((1 << MD_DISK_CLUSTER_ADD) | (1 << MD_DISK_CANDIDATE)))) {
9d48739e
N
6662 pr_warn("%s: Cannot add to clustered mddev.\n",
6663 mdname(mddev));
1aee41f6
GR
6664 return -EINVAL;
6665 }
6666
1da177e4
LT
6667 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
6668 return -EOVERFLOW;
6669
6670 if (!mddev->raid_disks) {
6671 int err;
6672 /* expecting a device which has a superblock */
6673 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
6674 if (IS_ERR(rdev)) {
9d48739e 6675 pr_warn("md: md_import_device returned %ld\n",
1da177e4
LT
6676 PTR_ERR(rdev));
6677 return PTR_ERR(rdev);
6678 }
6679 if (!list_empty(&mddev->disks)) {
3cb03002
N
6680 struct md_rdev *rdev0
6681 = list_entry(mddev->disks.next,
6682 struct md_rdev, same_set);
a9f326eb 6683 err = super_types[mddev->major_version]
1da177e4
LT
6684 .load_super(rdev, rdev0, mddev->minor_version);
6685 if (err < 0) {
9d48739e 6686 pr_warn("md: %s has different UUID to %s\n",
f72ffdd6 6687 bdevname(rdev->bdev,b),
1da177e4
LT
6688 bdevname(rdev0->bdev,b2));
6689 export_rdev(rdev);
6690 return -EINVAL;
6691 }
6692 }
6693 err = bind_rdev_to_array(rdev, mddev);
6694 if (err)
6695 export_rdev(rdev);
6696 return err;
6697 }
6698
6699 /*
6700 * add_new_disk can be used once the array is assembled
6701 * to add "hot spares". They must already have a superblock
6702 * written
6703 */
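/*
 * Illustrative user-space sketch (assumptions: "md_fd" and the device
 * path are hypothetical): hot-adding a spare to a running, persistent
 * array goes through ADD_NEW_DISK with a description of the new member.
 *
 *	struct stat st;
 *	mdu_disk_info_t dinfo = { 0 };
 *
 *	stat("/dev/sdc1", &st);
 *	dinfo.major = major(st.st_rdev);
 *	dinfo.minor = minor(st.st_rdev);
 *	dinfo.raid_disk = -1;		(spare: no fixed slot yet)
 *	dinfo.state = 0;		(neither ACTIVE, SYNC nor FAULTY)
 *	ioctl(md_fd, ADD_NEW_DISK, &dinfo);
 */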
6704 if (mddev->pers) {
6705 int err;
6706 if (!mddev->pers->hot_add_disk) {
9d48739e
N
6707 pr_warn("%s: personality does not support diskops!\n",
6708 mdname(mddev));
1da177e4
LT
6709 return -EINVAL;
6710 }
7b1e35f6
N
6711 if (mddev->persistent)
6712 rdev = md_import_device(dev, mddev->major_version,
6713 mddev->minor_version);
6714 else
6715 rdev = md_import_device(dev, -1, -1);
1da177e4 6716 if (IS_ERR(rdev)) {
9d48739e 6717 pr_warn("md: md_import_device returned %ld\n",
1da177e4
LT
6718 PTR_ERR(rdev));
6719 return PTR_ERR(rdev);
6720 }
1a855a06 6721 /* set saved_raid_disk if appropriate */
41158c7e
N
6722 if (!mddev->persistent) {
6723 if (info->state & (1<<MD_DISK_SYNC) &&
bf572541 6724 info->raid_disk < mddev->raid_disks) {
41158c7e 6725 rdev->raid_disk = info->raid_disk;
bf572541 6726 set_bit(In_sync, &rdev->flags);
8313b8e5 6727 clear_bit(Bitmap_sync, &rdev->flags);
bf572541 6728 } else
41158c7e 6729 rdev->raid_disk = -1;
f466722c 6730 rdev->saved_raid_disk = rdev->raid_disk;
41158c7e
N
6731 } else
6732 super_types[mddev->major_version].
6733 validate_super(mddev, rdev);
bedd86b7 6734 if ((info->state & (1<<MD_DISK_SYNC)) &&
f4563091 6735 rdev->raid_disk != info->raid_disk) {
bedd86b7
N
6736			/* This was a hot-add request, but the events don't
6737			 * match, so reject it.
6738 */
6739 export_rdev(rdev);
6740 return -EINVAL;
6741 }
6742
b2d444d7 6743 clear_bit(In_sync, &rdev->flags); /* just to be sure */
8ddf9efe
N
6744 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6745 set_bit(WriteMostly, &rdev->flags);
575a80fa
N
6746 else
6747 clear_bit(WriteMostly, &rdev->flags);
688834e6
N
6748 if (info->state & (1<<MD_DISK_FAILFAST))
6749 set_bit(FailFast, &rdev->flags);
6750 else
6751 clear_bit(FailFast, &rdev->flags);
8ddf9efe 6752
f6b6ec5c
SL
6753 if (info->state & (1<<MD_DISK_JOURNAL)) {
6754 struct md_rdev *rdev2;
6755 bool has_journal = false;
6756
6757 /* make sure no existing journal disk */
6758 rdev_for_each(rdev2, mddev) {
6759 if (test_bit(Journal, &rdev2->flags)) {
6760 has_journal = true;
6761 break;
6762 }
6763 }
230b55fa 6764 if (has_journal || mddev->bitmap) {
f6b6ec5c
SL
6765 export_rdev(rdev);
6766 return -EBUSY;
6767 }
bac624f3 6768 set_bit(Journal, &rdev->flags);
f6b6ec5c 6769 }
1aee41f6
GR
6770 /*
6771 * check whether the device shows up in other nodes
6772 */
6773 if (mddev_is_clustered(mddev)) {
dbb64f86 6774 if (info->state & (1 << MD_DISK_CANDIDATE))
1aee41f6 6775 set_bit(Candidate, &rdev->flags);
dbb64f86 6776 else if (info->state & (1 << MD_DISK_CLUSTER_ADD)) {
1aee41f6 6777 /* --add initiated by this node */
dbb64f86 6778 err = md_cluster_ops->add_new_disk(mddev, rdev);
1aee41f6 6779 if (err) {
1aee41f6
GR
6780 export_rdev(rdev);
6781 return err;
6782 }
6783 }
6784 }
6785
1da177e4
LT
6786 rdev->raid_disk = -1;
6787 err = bind_rdev_to_array(rdev, mddev);
dbb64f86 6788
1da177e4
LT
6789 if (err)
6790 export_rdev(rdev);
dbb64f86
GR
6791
6792 if (mddev_is_clustered(mddev)) {
e566aef1
GJ
6793 if (info->state & (1 << MD_DISK_CANDIDATE)) {
6794 if (!err) {
6795 err = md_cluster_ops->new_disk_ack(mddev,
6796 err == 0);
6797 if (err)
6798 md_kick_rdev_from_array(rdev);
6799 }
6800 } else {
dbb64f86
GR
6801 if (err)
6802 md_cluster_ops->add_new_disk_cancel(mddev);
6803 else
6804 err = add_bound_rdev(rdev);
6805 }
6806
6807 } else if (!err)
a6da4ef8 6808 err = add_bound_rdev(rdev);
dbb64f86 6809
1da177e4
LT
6810 return err;
6811 }
6812
6813 /* otherwise, add_new_disk is only allowed
6814 * for major_version==0 superblocks
6815 */
6816 if (mddev->major_version != 0) {
9d48739e 6817 pr_warn("%s: ADD_NEW_DISK not supported\n", mdname(mddev));
1da177e4
LT
6818 return -EINVAL;
6819 }
6820
6821 if (!(info->state & (1<<MD_DISK_FAULTY))) {
6822 int err;
d710e138 6823 rdev = md_import_device(dev, -1, 0);
1da177e4 6824 if (IS_ERR(rdev)) {
9d48739e 6825 pr_warn("md: error, md_import_device() returned %ld\n",
1da177e4
LT
6826 PTR_ERR(rdev));
6827 return PTR_ERR(rdev);
6828 }
6829 rdev->desc_nr = info->number;
6830 if (info->raid_disk < mddev->raid_disks)
6831 rdev->raid_disk = info->raid_disk;
6832 else
6833 rdev->raid_disk = -1;
6834
1da177e4 6835 if (rdev->raid_disk < mddev->raid_disks)
b2d444d7
N
6836 if (info->state & (1<<MD_DISK_SYNC))
6837 set_bit(In_sync, &rdev->flags);
1da177e4 6838
8ddf9efe
N
6839 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
6840 set_bit(WriteMostly, &rdev->flags);
688834e6
N
6841 if (info->state & (1<<MD_DISK_FAILFAST))
6842 set_bit(FailFast, &rdev->flags);
8ddf9efe 6843
1da177e4 6844 if (!mddev->persistent) {
9d48739e 6845 pr_debug("md: nonpersistent superblock ...\n");
77304d2a
MS
6846 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
6847 } else
57b2caa3 6848 rdev->sb_start = calc_dev_sboffset(rdev);
8190e754 6849 rdev->sectors = rdev->sb_start;
1da177e4 6850
2bf071bf
N
6851 err = bind_rdev_to_array(rdev, mddev);
6852 if (err) {
6853 export_rdev(rdev);
6854 return err;
6855 }
1da177e4
LT
6856 }
6857
6858 return 0;
6859}
6860
f72ffdd6 6861static int hot_remove_disk(struct mddev *mddev, dev_t dev)
1da177e4
LT
6862{
6863 char b[BDEVNAME_SIZE];
3cb03002 6864 struct md_rdev *rdev;
1da177e4 6865
c42a0e26
YY
6866 if (!mddev->pers)
6867 return -ENODEV;
6868
1da177e4
LT
6869 rdev = find_rdev(mddev, dev);
6870 if (!rdev)
6871 return -ENXIO;
6872
2910ff17
GR
6873 if (rdev->raid_disk < 0)
6874 goto kick_rdev;
293467aa 6875
3ea8929d
N
6876 clear_bit(Blocked, &rdev->flags);
6877 remove_and_add_spares(mddev, rdev);
6878
1da177e4
LT
6879 if (rdev->raid_disk >= 0)
6880 goto busy;
6881
2910ff17 6882kick_rdev:
54a88392 6883 if (mddev_is_clustered(mddev))
88bcfef7
GR
6884 md_cluster_ops->remove_disk(mddev, rdev);
6885
fb56dfef 6886 md_kick_rdev_from_array(rdev);
2953079c 6887 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
060b0689
N
6888 if (mddev->thread)
6889 md_wakeup_thread(mddev->thread);
6890 else
6891 md_update_sb(mddev, 1);
d7603b7e 6892 md_new_event(mddev);
1da177e4
LT
6893
6894 return 0;
6895busy:
9d48739e
N
6896 pr_debug("md: cannot remove active disk %s from %s ...\n",
6897 bdevname(rdev->bdev,b), mdname(mddev));
1da177e4
LT
6898 return -EBUSY;
6899}
6900
f72ffdd6 6901static int hot_add_disk(struct mddev *mddev, dev_t dev)
1da177e4
LT
6902{
6903 char b[BDEVNAME_SIZE];
6904 int err;
3cb03002 6905 struct md_rdev *rdev;
1da177e4
LT
6906
6907 if (!mddev->pers)
6908 return -ENODEV;
6909
6910 if (mddev->major_version != 0) {
9d48739e 6911 pr_warn("%s: HOT_ADD may only be used with version-0 superblocks.\n",
1da177e4
LT
6912 mdname(mddev));
6913 return -EINVAL;
6914 }
6915 if (!mddev->pers->hot_add_disk) {
9d48739e 6916 pr_warn("%s: personality does not support diskops!\n",
1da177e4
LT
6917 mdname(mddev));
6918 return -EINVAL;
6919 }
6920
d710e138 6921 rdev = md_import_device(dev, -1, 0);
1da177e4 6922 if (IS_ERR(rdev)) {
9d48739e 6923 pr_warn("md: error, md_import_device() returned %ld\n",
1da177e4
LT
6924 PTR_ERR(rdev));
6925 return -EINVAL;
6926 }
6927
6928 if (mddev->persistent)
57b2caa3 6929 rdev->sb_start = calc_dev_sboffset(rdev);
1da177e4 6930 else
77304d2a 6931 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
1da177e4 6932
8190e754 6933 rdev->sectors = rdev->sb_start;
1da177e4 6934
b2d444d7 6935 if (test_bit(Faulty, &rdev->flags)) {
9d48739e 6936 pr_warn("md: can not hot-add faulty %s disk to %s!\n",
1da177e4
LT
6937 bdevname(rdev->bdev,b), mdname(mddev));
6938 err = -EINVAL;
6939 goto abort_export;
6940 }
293467aa 6941
b2d444d7 6942 clear_bit(In_sync, &rdev->flags);
1da177e4 6943 rdev->desc_nr = -1;
5842730d 6944 rdev->saved_raid_disk = -1;
2bf071bf
N
6945 err = bind_rdev_to_array(rdev, mddev);
6946 if (err)
2aa82191 6947 goto abort_export;
1da177e4
LT
6948
6949 /*
6950	 * The rest had better be atomic; disk failures can be
6951	 * noticed in interrupt context ...
6952 */
6953
1da177e4
LT
6954 rdev->raid_disk = -1;
6955
2953079c 6956 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
060b0689
N
6957 if (!mddev->thread)
6958 md_update_sb(mddev, 1);
1da177e4
LT
6959 /*
6960 * Kick recovery, maybe this spare has to be added to the
6961 * array immediately.
6962 */
6963 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6964 md_wakeup_thread(mddev->thread);
d7603b7e 6965 md_new_event(mddev);
1da177e4
LT
6966 return 0;
6967
1da177e4
LT
6968abort_export:
6969 export_rdev(rdev);
6970 return err;
6971}
6972
fd01b88c 6973static int set_bitmap_file(struct mddev *mddev, int fd)
32a7627c 6974{
035328c2 6975 int err = 0;
32a7627c 6976
36fa3063 6977 if (mddev->pers) {
d66b1b39 6978 if (!mddev->pers->quiesce || !mddev->thread)
36fa3063
N
6979 return -EBUSY;
6980 if (mddev->recovery || mddev->sync_thread)
6981 return -EBUSY;
6982 /* we should be able to change the bitmap.. */
6983 }
32a7627c 6984
36fa3063 6985 if (fd >= 0) {
035328c2 6986 struct inode *inode;
1e594bb2
N
6987 struct file *f;
6988
6989 if (mddev->bitmap || mddev->bitmap_info.file)
36fa3063 6990 return -EEXIST; /* cannot add when bitmap is present */
1e594bb2 6991 f = fget(fd);
32a7627c 6992
1e594bb2 6993 if (f == NULL) {
9d48739e
N
6994 pr_warn("%s: error: failed to get bitmap file\n",
6995 mdname(mddev));
36fa3063
N
6996 return -EBADF;
6997 }
6998
1e594bb2 6999 inode = f->f_mapping->host;
035328c2 7000 if (!S_ISREG(inode->i_mode)) {
9d48739e
N
7001 pr_warn("%s: error: bitmap file must be a regular file\n",
7002 mdname(mddev));
035328c2 7003 err = -EBADF;
1e594bb2 7004 } else if (!(f->f_mode & FMODE_WRITE)) {
9d48739e
N
7005 pr_warn("%s: error: bitmap file must open for write\n",
7006 mdname(mddev));
035328c2
N
7007 err = -EBADF;
7008 } else if (atomic_read(&inode->i_writecount) != 1) {
9d48739e
N
7009 pr_warn("%s: error: bitmap file is already in use\n",
7010 mdname(mddev));
035328c2
N
7011 err = -EBUSY;
7012 }
7013 if (err) {
1e594bb2 7014 fput(f);
36fa3063
N
7015 return err;
7016 }
1e594bb2 7017 mddev->bitmap_info.file = f;
c3d9714e 7018 mddev->bitmap_info.offset = 0; /* file overrides offset */
36fa3063
N
7019 } else if (mddev->bitmap == NULL)
7020 return -ENOENT; /* cannot remove what isn't there */
7021 err = 0;
7022 if (mddev->pers) {
69e51b44 7023 if (fd >= 0) {
f9209a32
GR
7024 struct bitmap *bitmap;
7025
e64e4018 7026 bitmap = md_bitmap_create(mddev, -1);
9e1cc0a5 7027 mddev_suspend(mddev);
f9209a32
GR
7028 if (!IS_ERR(bitmap)) {
7029 mddev->bitmap = bitmap;
e64e4018 7030 err = md_bitmap_load(mddev);
ba599aca
N
7031 } else
7032 err = PTR_ERR(bitmap);
52a0d49d 7033 if (err) {
e64e4018 7034 md_bitmap_destroy(mddev);
52a0d49d
N
7035 fd = -1;
7036 }
9e1cc0a5 7037 mddev_resume(mddev);
52a0d49d 7038 } else if (fd < 0) {
9e1cc0a5 7039 mddev_suspend(mddev);
e64e4018 7040 md_bitmap_destroy(mddev);
9e1cc0a5 7041 mddev_resume(mddev);
d7375ab3 7042 }
d7375ab3
N
7043 }
7044 if (fd < 0) {
4af1a041
N
7045 struct file *f = mddev->bitmap_info.file;
7046 if (f) {
7047 spin_lock(&mddev->lock);
7048 mddev->bitmap_info.file = NULL;
7049 spin_unlock(&mddev->lock);
7050 fput(f);
7051 }
36fa3063
N
7052 }
7053
32a7627c
N
7054 return err;
7055}
7056
1da177e4
LT
7057/*
7058 * set_array_info is used in two different ways.
7059 * The original usage is when creating a new array.
7060 * In this usage, raid_disks is > 0 and it, together with
7061 * level, size, not_persistent, layout and chunk_size, determines the
7062 * shape of the array.
7063 * This will always create an array with a type-0.90.0 superblock.
7064 * The newer usage is when assembling an array.
7065 * In this case raid_disks will be 0, and the major_version field is
7066 * used to determine which style of super-blocks is to be found on the devices.
7067 * The minor and patch _version numbers are also kept in case the
7068 * super_block handler wishes to interpret them.
7069 */
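/*
 * Illustrative user-space sketch of the two usages described above
 * (field values are examples only, "md_fd" is hypothetical):
 *
 *	mdu_array_info_t ainfo = { 0 };
 *
 *	(a) assembly: raid_disks == 0, only the version fields matter
 *	ainfo.major_version = 0;
 *	ainfo.minor_version = 90;
 *	ioctl(md_fd, SET_ARRAY_INFO, &ainfo);
 *
 *	(b) creation of a 0.90 array: the shape fields matter
 *	ainfo.level = 5;
 *	ainfo.raid_disks = 3;
 *	ainfo.layout = 2;			(left-symmetric)
 *	ainfo.chunk_size = 64 * 1024;		(bytes)
 *	ainfo.size = 1048576;			(per-device size in 1K blocks)
 *	ioctl(md_fd, SET_ARRAY_INFO, &ainfo);
 */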
f72ffdd6 7070static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
1da177e4
LT
7071{
7072
7073 if (info->raid_disks == 0) {
7074 /* just setting version number for superblock loading */
7075 if (info->major_version < 0 ||
50511da3 7076 info->major_version >= ARRAY_SIZE(super_types) ||
1da177e4
LT
7077 super_types[info->major_version].name == NULL) {
7078 /* maybe try to auto-load a module? */
9d48739e 7079 pr_warn("md: superblock version %d not known\n",
1da177e4
LT
7080 info->major_version);
7081 return -EINVAL;
7082 }
7083 mddev->major_version = info->major_version;
7084 mddev->minor_version = info->minor_version;
7085 mddev->patch_version = info->patch_version;
3f9d7b0d 7086 mddev->persistent = !info->not_persistent;
cbd19983
N
7087 /* ensure mddev_put doesn't delete this now that there
7088 * is some minimal configuration.
7089 */
9ebc6ef1 7090 mddev->ctime = ktime_get_real_seconds();
1da177e4
LT
7091 return 0;
7092 }
7093 mddev->major_version = MD_MAJOR_VERSION;
7094 mddev->minor_version = MD_MINOR_VERSION;
7095 mddev->patch_version = MD_PATCHLEVEL_VERSION;
9ebc6ef1 7096 mddev->ctime = ktime_get_real_seconds();
1da177e4
LT
7097
7098 mddev->level = info->level;
17115e03 7099 mddev->clevel[0] = 0;
58c0fed4 7100 mddev->dev_sectors = 2 * (sector_t)info->size;
1da177e4
LT
7101 mddev->raid_disks = info->raid_disks;
7102 /* don't set md_minor, it is determined by which /dev/md* was
7103	 * opened
7104 */
7105 if (info->state & (1<<MD_SB_CLEAN))
7106 mddev->recovery_cp = MaxSector;
7107 else
7108 mddev->recovery_cp = 0;
7109	mddev->persistent = !info->not_persistent;
e691063a 7110 mddev->external = 0;
1da177e4
LT
7111
7112 mddev->layout = info->layout;
33f2c35a
N
7113 if (mddev->level == 0)
7114 /* Cannot trust RAID0 layout info here */
7115 mddev->layout = -1;
9d8f0363 7116 mddev->chunk_sectors = info->chunk_size >> 9;
1da177e4 7117
2953079c 7118 if (mddev->persistent) {
1b3bae49
N
7119 mddev->max_disks = MD_SB_DISKS;
7120 mddev->flags = 0;
7121 mddev->sb_flags = 0;
2953079c
SL
7122 }
7123 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
1da177e4 7124
c3d9714e 7125 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6409bb05 7126 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
c3d9714e 7127 mddev->bitmap_info.offset = 0;
b2a2703c 7128
f6705578
N
7129 mddev->reshape_position = MaxSector;
7130
1da177e4
LT
7131 /*
7132 * Generate a 128 bit UUID
7133 */
7134 get_random_bytes(mddev->uuid, 16);
7135
f6705578 7136 mddev->new_level = mddev->level;
664e7c41 7137 mddev->new_chunk_sectors = mddev->chunk_sectors;
f6705578
N
7138 mddev->new_layout = mddev->layout;
7139 mddev->delta_disks = 0;
2c810cdd 7140 mddev->reshape_backwards = 0;
f6705578 7141
1da177e4
LT
7142 return 0;
7143}
7144
fd01b88c 7145void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
1f403624 7146{
efa4b77b 7147 lockdep_assert_held(&mddev->reconfig_mutex);
b522adcd
DW
7148
7149 if (mddev->external_size)
7150 return;
7151
1f403624
DW
7152 mddev->array_sectors = array_sectors;
7153}
7154EXPORT_SYMBOL(md_set_array_sectors);
7155
fd01b88c 7156static int update_size(struct mddev *mddev, sector_t num_sectors)
a35b0d69 7157{
3cb03002 7158 struct md_rdev *rdev;
a35b0d69 7159 int rv;
d71f9f88 7160 int fit = (num_sectors == 0);
818da59f 7161 sector_t old_dev_sectors = mddev->dev_sectors;
ab5a98b1 7162
a35b0d69
N
7163 if (mddev->pers->resize == NULL)
7164 return -EINVAL;
d71f9f88
AN
7165 /* The "num_sectors" is the number of sectors of each device that
7166 * is used. This can only make sense for arrays with redundancy.
7167 * linear and raid0 always use whatever space is available. We can only
7168 * consider changing this number if no resync or reconstruction is
7169 * happening, and if the new size is acceptable. It must fit before the
0f420358 7170	 * sb_start or, if that is < data_offset, it must fit before the size
d71f9f88
AN
7171 * of each device. If num_sectors is zero, we find the largest size
7172 * that fits.
a35b0d69 7173 */
f851b60d
N
7174 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7175 mddev->sync_thread)
a35b0d69 7176 return -EBUSY;
bd8839e0
N
7177 if (mddev->ro)
7178 return -EROFS;
a4a6125a 7179
dafb20fa 7180 rdev_for_each(rdev, mddev) {
dd8ac336 7181 sector_t avail = rdev->sectors;
01ab5662 7182
d71f9f88
AN
7183 if (fit && (num_sectors == 0 || num_sectors > avail))
7184 num_sectors = avail;
7185 if (avail < num_sectors)
a35b0d69
N
7186 return -ENOSPC;
7187 }
d71f9f88 7188 rv = mddev->pers->resize(mddev, num_sectors);
c9483634 7189 if (!rv) {
818da59f
GJ
7190 if (mddev_is_clustered(mddev))
7191 md_cluster_ops->update_size(mddev, old_dev_sectors);
7192 else if (mddev->queue) {
c9483634
GJ
7193 set_capacity(mddev->gendisk, mddev->array_sectors);
7194 revalidate_disk(mddev->gendisk);
7195 }
7196 }
a35b0d69
N
7197 return rv;
7198}
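/*
 * Worked example: with num_sectors == 0 ("fit"), the loop above lowers
 * num_sectors to the smallest rdev->sectors among the members, so the
 * resize proceeds at the largest size every device can honour; an
 * explicit num_sectors larger than some member instead fails the
 * "avail < num_sectors" test with -ENOSPC before ->resize() is tried.
 */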
7199
fd01b88c 7200static int update_raid_disks(struct mddev *mddev, int raid_disks)
da943b99
N
7201{
7202 int rv;
c6563a8c 7203 struct md_rdev *rdev;
da943b99 7204 /* change the number of raid disks */
63c70c4f 7205 if (mddev->pers->check_reshape == NULL)
da943b99 7206 return -EINVAL;
bd8839e0
N
7207 if (mddev->ro)
7208 return -EROFS;
da943b99 7209 if (raid_disks <= 0 ||
233fca36 7210 (mddev->max_disks && raid_disks >= mddev->max_disks))
da943b99 7211 return -EINVAL;
f851b60d
N
7212 if (mddev->sync_thread ||
7213 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
7214 mddev->reshape_position != MaxSector)
da943b99 7215 return -EBUSY;
c6563a8c
N
7216
7217 rdev_for_each(rdev, mddev) {
7218 if (mddev->raid_disks < raid_disks &&
7219 rdev->data_offset < rdev->new_data_offset)
7220 return -EINVAL;
7221 if (mddev->raid_disks > raid_disks &&
7222 rdev->data_offset > rdev->new_data_offset)
7223 return -EINVAL;
7224 }
7225
63c70c4f 7226 mddev->delta_disks = raid_disks - mddev->raid_disks;
2c810cdd
N
7227 if (mddev->delta_disks < 0)
7228 mddev->reshape_backwards = 1;
7229 else if (mddev->delta_disks > 0)
7230 mddev->reshape_backwards = 0;
63c70c4f
N
7231
7232 rv = mddev->pers->check_reshape(mddev);
2c810cdd 7233 if (rv < 0) {
de171cb9 7234 mddev->delta_disks = 0;
2c810cdd
N
7235 mddev->reshape_backwards = 0;
7236 }
da943b99
N
7237 return rv;
7238}
7239
1da177e4
LT
7240/*
7241 * update_array_info is used to change the configuration of an
7242 * on-line array.
7243 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
7244 * fields in the info are checked against the array.
7245 * Any differences that cannot be handled will cause an error.
7246 * Normally, only one change can be managed at a time.
7247 */
fd01b88c 7248static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
1da177e4
LT
7249{
7250 int rv = 0;
7251 int cnt = 0;
36fa3063
N
7252 int state = 0;
7253
7254 /* calculate expected state,ignoring low bits */
c3d9714e 7255 if (mddev->bitmap && mddev->bitmap_info.offset)
36fa3063 7256 state |= (1 << MD_SB_BITMAP_PRESENT);
1da177e4
LT
7257
7258 if (mddev->major_version != info->major_version ||
7259 mddev->minor_version != info->minor_version ||
7260/* mddev->patch_version != info->patch_version || */
7261 mddev->ctime != info->ctime ||
7262 mddev->level != info->level ||
7263/* mddev->layout != info->layout || */
4e023612 7264 mddev->persistent != !info->not_persistent ||
9d8f0363 7265 mddev->chunk_sectors != info->chunk_size >> 9 ||
36fa3063
N
7266 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
7267 ((state^info->state) & 0xfffffe00)
7268 )
1da177e4
LT
7269 return -EINVAL;
7270 /* Check there is only one change */
58c0fed4
AN
7271 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
7272 cnt++;
7273 if (mddev->raid_disks != info->raid_disks)
7274 cnt++;
7275 if (mddev->layout != info->layout)
7276 cnt++;
7277 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
7278 cnt++;
7279 if (cnt == 0)
7280 return 0;
7281 if (cnt > 1)
7282 return -EINVAL;
1da177e4
LT
7283
7284 if (mddev->layout != info->layout) {
7285 /* Change layout
7286 * we don't need to do anything at the md level, the
7287 * personality will take care of it all.
7288 */
50ac168a 7289 if (mddev->pers->check_reshape == NULL)
1da177e4 7290 return -EINVAL;
597a711b
N
7291 else {
7292 mddev->new_layout = info->layout;
50ac168a 7293 rv = mddev->pers->check_reshape(mddev);
597a711b
N
7294 if (rv)
7295 mddev->new_layout = mddev->layout;
7296 return rv;
7297 }
1da177e4 7298 }
58c0fed4 7299 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
d71f9f88 7300 rv = update_size(mddev, (sector_t)info->size * 2);
a35b0d69 7301
da943b99
N
7302 if (mddev->raid_disks != info->raid_disks)
7303 rv = update_raid_disks(mddev, info->raid_disks);
7304
36fa3063 7305 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
293467aa
GR
7306 if (mddev->pers->quiesce == NULL || mddev->thread == NULL) {
7307 rv = -EINVAL;
7308 goto err;
7309 }
7310 if (mddev->recovery || mddev->sync_thread) {
7311 rv = -EBUSY;
7312 goto err;
7313 }
36fa3063 7314 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
f9209a32 7315 struct bitmap *bitmap;
36fa3063 7316 /* add the bitmap */
293467aa
GR
7317 if (mddev->bitmap) {
7318 rv = -EEXIST;
7319 goto err;
7320 }
7321 if (mddev->bitmap_info.default_offset == 0) {
7322 rv = -EINVAL;
7323 goto err;
7324 }
c3d9714e
N
7325 mddev->bitmap_info.offset =
7326 mddev->bitmap_info.default_offset;
6409bb05
N
7327 mddev->bitmap_info.space =
7328 mddev->bitmap_info.default_space;
e64e4018 7329 bitmap = md_bitmap_create(mddev, -1);
9e1cc0a5 7330 mddev_suspend(mddev);
f9209a32
GR
7331 if (!IS_ERR(bitmap)) {
7332 mddev->bitmap = bitmap;
e64e4018 7333 rv = md_bitmap_load(mddev);
ba599aca
N
7334 } else
7335 rv = PTR_ERR(bitmap);
36fa3063 7336 if (rv)
e64e4018 7337 md_bitmap_destroy(mddev);
9e1cc0a5 7338 mddev_resume(mddev);
36fa3063
N
7339 } else {
7340 /* remove the bitmap */
293467aa
GR
7341 if (!mddev->bitmap) {
7342 rv = -ENOENT;
7343 goto err;
7344 }
7345 if (mddev->bitmap->storage.file) {
7346 rv = -EINVAL;
7347 goto err;
7348 }
f6a2dc64
GJ
7349 if (mddev->bitmap_info.nodes) {
7350 /* hold PW on all the bitmap lock */
7351 if (md_cluster_ops->lock_all_bitmaps(mddev) <= 0) {
9d48739e 7352 pr_warn("md: can't change bitmap to none since the array is in use by more than one node\n");
f6a2dc64
GJ
7353 rv = -EPERM;
7354 md_cluster_ops->unlock_all_bitmaps(mddev);
7355 goto err;
7356 }
7357
7358 mddev->bitmap_info.nodes = 0;
7359 md_cluster_ops->leave(mddev);
7360 }
9e1cc0a5 7361 mddev_suspend(mddev);
e64e4018 7362 md_bitmap_destroy(mddev);
9e1cc0a5 7363 mddev_resume(mddev);
c3d9714e 7364 mddev->bitmap_info.offset = 0;
36fa3063
N
7365 }
7366 }
850b2b42 7367 md_update_sb(mddev, 1);
293467aa
GR
7368 return rv;
7369err:
1da177e4
LT
7370 return rv;
7371}
7372
fd01b88c 7373static int set_disk_faulty(struct mddev *mddev, dev_t dev)
1da177e4 7374{
3cb03002 7375 struct md_rdev *rdev;
1ca69c4b 7376 int err = 0;
1da177e4
LT
7377
7378 if (mddev->pers == NULL)
7379 return -ENODEV;
7380
1ca69c4b 7381 rcu_read_lock();
1532d9e8 7382 rdev = md_find_rdev_rcu(mddev, dev);
1da177e4 7383 if (!rdev)
1ca69c4b
N
7384 err = -ENODEV;
7385 else {
7386 md_error(mddev, rdev);
7387 if (!test_bit(Faulty, &rdev->flags))
7388 err = -EBUSY;
7389 }
7390 rcu_read_unlock();
7391 return err;
1da177e4
LT
7392}
7393
2f9618ce
AN
7394/*
7395 * We have a problem here: there is no easy way to give a CHS
7396 * virtual geometry. We currently pretend that we have 2 heads and
7397 * 4 sectors (with a BIG number of cylinders...). This drives
7398 * dosfs just mad... ;-)
7399 */
a885c8c4
CH
7400static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
7401{
fd01b88c 7402 struct mddev *mddev = bdev->bd_disk->private_data;
a885c8c4
CH
7403
7404 geo->heads = 2;
7405 geo->sectors = 4;
49ce6cea 7406 geo->cylinders = mddev->array_sectors / 8;
a885c8c4
CH
7407 return 0;
7408}
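/*
 * Worked example: a 1 TiB array has 2^31 512-byte sectors, so
 * cylinders = 2^31 / 8 = 268435456, and the fake geometry multiplies
 * back out as 2 heads * 4 sectors * 268435456 cylinders * 512 bytes
 * = 2^40 bytes, i.e. the full capacity.
 */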
7409
cb335f88
NS
7410static inline bool md_ioctl_valid(unsigned int cmd)
7411{
7412 switch (cmd) {
7413 case ADD_NEW_DISK:
7414 case BLKROSET:
7415 case GET_ARRAY_INFO:
7416 case GET_BITMAP_FILE:
7417 case GET_DISK_INFO:
7418 case HOT_ADD_DISK:
7419 case HOT_REMOVE_DISK:
cb335f88
NS
7420 case RAID_VERSION:
7421 case RESTART_ARRAY_RW:
7422 case RUN_ARRAY:
7423 case SET_ARRAY_INFO:
7424 case SET_BITMAP_FILE:
7425 case SET_DISK_FAULTY:
7426 case STOP_ARRAY:
7427 case STOP_ARRAY_RO:
1aee41f6 7428 case CLUSTERED_DISK_NACK:
cb335f88
NS
7429 return true;
7430 default:
7431 return false;
7432 }
7433}
7434
a39907fa 7435static int md_ioctl(struct block_device *bdev, fmode_t mode,
1da177e4
LT
7436 unsigned int cmd, unsigned long arg)
7437{
7438 int err = 0;
7439 void __user *argp = (void __user *)arg;
fd01b88c 7440 struct mddev *mddev = NULL;
e2218350 7441 int ro;
065e519e 7442 bool did_set_md_closing = false;
1da177e4 7443
cb335f88
NS
7444 if (!md_ioctl_valid(cmd))
7445 return -ENOTTY;
7446
506c9e44
N
7447 switch (cmd) {
7448 case RAID_VERSION:
7449 case GET_ARRAY_INFO:
7450 case GET_DISK_INFO:
7451 break;
7452 default:
7453 if (!capable(CAP_SYS_ADMIN))
7454 return -EACCES;
7455 }
1da177e4
LT
7456
7457 /*
7458 * Commands dealing with the RAID driver but not any
7459 * particular array:
7460 */
c02c0aeb
N
7461 switch (cmd) {
7462 case RAID_VERSION:
7463 err = get_version(argp);
3adc28d8 7464 goto out;
c02c0aeb 7465 default:;
1da177e4
LT
7466 }
7467
7468 /*
7469 * Commands creating/starting a new array:
7470 */
7471
a39907fa 7472 mddev = bdev->bd_disk->private_data;
1da177e4
LT
7473
7474 if (!mddev) {
7475 BUG();
3adc28d8 7476 goto out;
1da177e4
LT
7477 }
7478
1ca69c4b
N
7479	/* Some actions do not require the mutex */
7480 switch (cmd) {
7481 case GET_ARRAY_INFO:
7482 if (!mddev->raid_disks && !mddev->external)
7483 err = -ENODEV;
7484 else
7485 err = get_array_info(mddev, argp);
3adc28d8 7486 goto out;
1ca69c4b
N
7487
7488 case GET_DISK_INFO:
7489 if (!mddev->raid_disks && !mddev->external)
7490 err = -ENODEV;
7491 else
7492 err = get_disk_info(mddev, argp);
3adc28d8 7493 goto out;
1ca69c4b
N
7494
7495 case SET_DISK_FAULTY:
7496 err = set_disk_faulty(mddev, new_decode_dev(arg));
3adc28d8 7497 goto out;
4af1a041
N
7498
7499 case GET_BITMAP_FILE:
7500 err = get_bitmap_file(mddev, argp);
7501 goto out;
7502
1ca69c4b
N
7503 }
7504
78b990cf 7505 if (cmd == ADD_NEW_DISK || cmd == HOT_ADD_DISK)
cc1ffe61 7506 flush_rdev_wq(mddev);
a7a3f08d 7507
90f5f7ad
HR
7508 if (cmd == HOT_REMOVE_DISK)
7509 /* need to ensure recovery thread has run */
7510 wait_event_interruptible_timeout(mddev->sb_wait,
7511 !test_bit(MD_RECOVERY_NEEDED,
82a301cb 7512 &mddev->recovery),
90f5f7ad 7513 msecs_to_jiffies(5000));
260fa034
N
7514 if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
7515 /* Need to flush page cache, and ensure no-one else opens
7516 * and writes
7517 */
7518 mutex_lock(&mddev->open_mutex);
9ba3b7f5 7519 if (mddev->pers && atomic_read(&mddev->openers) > 1) {
260fa034
N
7520 mutex_unlock(&mddev->open_mutex);
7521 err = -EBUSY;
3adc28d8 7522 goto out;
260fa034 7523 }
065e519e 7524 WARN_ON_ONCE(test_bit(MD_CLOSING, &mddev->flags));
af8d8e6f 7525 set_bit(MD_CLOSING, &mddev->flags);
065e519e 7526 did_set_md_closing = true;
260fa034
N
7527 mutex_unlock(&mddev->open_mutex);
7528 sync_blockdev(bdev);
7529 }
1da177e4
LT
7530 err = mddev_lock(mddev);
7531 if (err) {
9d48739e
N
7532 pr_debug("md: ioctl lock interrupted, reason %d, cmd %d\n",
7533 err, cmd);
3adc28d8 7534 goto out;
1da177e4
LT
7535 }
7536
c02c0aeb
N
7537 if (cmd == SET_ARRAY_INFO) {
7538 mdu_array_info_t info;
7539 if (!arg)
7540 memset(&info, 0, sizeof(info));
7541 else if (copy_from_user(&info, argp, sizeof(info))) {
7542 err = -EFAULT;
3adc28d8 7543 goto unlock;
c02c0aeb
N
7544 }
7545 if (mddev->pers) {
7546 err = update_array_info(mddev, &info);
7547 if (err) {
9d48739e 7548 pr_warn("md: couldn't update array info. %d\n", err);
3adc28d8 7549 goto unlock;
1da177e4 7550 }
3adc28d8 7551 goto unlock;
c02c0aeb
N
7552 }
7553 if (!list_empty(&mddev->disks)) {
9d48739e 7554 pr_warn("md: array %s already has disks!\n", mdname(mddev));
c02c0aeb 7555 err = -EBUSY;
3adc28d8 7556 goto unlock;
c02c0aeb
N
7557 }
7558 if (mddev->raid_disks) {
9d48739e 7559 pr_warn("md: array %s already initialised!\n", mdname(mddev));
c02c0aeb 7560 err = -EBUSY;
3adc28d8 7561 goto unlock;
c02c0aeb
N
7562 }
7563 err = set_array_info(mddev, &info);
7564 if (err) {
9d48739e 7565 pr_warn("md: couldn't set array info. %d\n", err);
3adc28d8 7566 goto unlock;
c02c0aeb 7567 }
3adc28d8 7568 goto unlock;
1da177e4
LT
7569 }
7570
7571 /*
7572 * Commands querying/configuring an existing array:
7573 */
32a7627c 7574 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
3f9d7b0d 7575 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
a17184a9
N
7576 if ((!mddev->raid_disks && !mddev->external)
7577 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
7578 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
7579 && cmd != GET_BITMAP_FILE) {
1da177e4 7580 err = -ENODEV;
3adc28d8 7581 goto unlock;
1da177e4
LT
7582 }
7583
7584 /*
7585 * Commands even a read-only array can execute:
7586 */
c02c0aeb 7587 switch (cmd) {
c02c0aeb
N
7588 case RESTART_ARRAY_RW:
7589 err = restart_array(mddev);
3adc28d8 7590 goto unlock;
1da177e4 7591
c02c0aeb
N
7592 case STOP_ARRAY:
7593 err = do_md_stop(mddev, 0, bdev);
3adc28d8 7594 goto unlock;
1da177e4 7595
c02c0aeb
N
7596 case STOP_ARRAY_RO:
7597 err = md_set_readonly(mddev, bdev);
3adc28d8 7598 goto unlock;
1da177e4 7599
3ea8929d
N
7600 case HOT_REMOVE_DISK:
7601 err = hot_remove_disk(mddev, new_decode_dev(arg));
3adc28d8 7602 goto unlock;
3ea8929d 7603
7ceb17e8
N
7604 case ADD_NEW_DISK:
7605 /* We can support ADD_NEW_DISK on read-only arrays
466ad292 7606 * only if we are re-adding a preexisting device.
7ceb17e8
N
7607 * So require mddev->pers and MD_DISK_SYNC.
7608 */
7609 if (mddev->pers) {
7610 mdu_disk_info_t info;
7611 if (copy_from_user(&info, argp, sizeof(info)))
7612 err = -EFAULT;
7613 else if (!(info.state & (1<<MD_DISK_SYNC)))
7614 /* Need to clear read-only for this */
7615 break;
7616 else
7617 err = add_new_disk(mddev, &info);
3adc28d8 7618 goto unlock;
7ceb17e8
N
7619 }
7620 break;
7621
c02c0aeb
N
7622 case BLKROSET:
7623 if (get_user(ro, (int __user *)(arg))) {
7624 err = -EFAULT;
3adc28d8 7625 goto unlock;
c02c0aeb
N
7626 }
7627 err = -EINVAL;
e2218350 7628
c02c0aeb
N
7629 /* if the bdev is going readonly the value of mddev->ro
7630 * does not matter, no writes are coming
7631 */
7632 if (ro)
3adc28d8 7633 goto unlock;
e2218350 7634
c02c0aeb
N
7635		/* are we already prepared for writes? */
7636 if (mddev->ro != 1)
3adc28d8 7637 goto unlock;
e2218350 7638
c02c0aeb
N
7639 /* transitioning to readauto need only happen for
7640 * arrays that call md_write_start
7641 */
7642 if (mddev->pers) {
7643 err = restart_array(mddev);
7644 if (err == 0) {
7645 mddev->ro = 2;
7646 set_disk_ro(mddev->gendisk, 0);
e2218350 7647 }
c02c0aeb 7648 }
3adc28d8 7649 goto unlock;
1da177e4
LT
7650 }
7651
7652 /*
7653 * The remaining ioctls are changing the state of the
f91de92e 7654 * superblock, so we do not allow them on read-only arrays.
1da177e4 7655 */
326eb17d 7656 if (mddev->ro && mddev->pers) {
f91de92e
N
7657 if (mddev->ro == 2) {
7658 mddev->ro = 0;
00bcb4ac 7659 sysfs_notify_dirent_safe(mddev->sysfs_state);
0fd62b86 7660 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
f3378b48
N
7661 /* mddev_unlock will wake thread */
7662 /* If a device failed while we were read-only, we
7663 * need to make sure the metadata is updated now.
7664 */
2953079c 7665 if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
f3378b48
N
7666 mddev_unlock(mddev);
7667 wait_event(mddev->sb_wait,
2953079c
SL
7668 !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
7669 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
29f097c4 7670 mddev_lock_nointr(mddev);
f3378b48 7671 }
f91de92e
N
7672 } else {
7673 err = -EROFS;
3adc28d8 7674 goto unlock;
f91de92e 7675 }
1da177e4
LT
7676 }
7677
c02c0aeb
N
7678 switch (cmd) {
7679 case ADD_NEW_DISK:
1da177e4 7680 {
c02c0aeb
N
7681 mdu_disk_info_t info;
7682 if (copy_from_user(&info, argp, sizeof(info)))
7683 err = -EFAULT;
7684 else
7685 err = add_new_disk(mddev, &info);
3adc28d8 7686 goto unlock;
c02c0aeb 7687 }
1da177e4 7688
1aee41f6
GR
7689 case CLUSTERED_DISK_NACK:
7690 if (mddev_is_clustered(mddev))
7691 md_cluster_ops->new_disk_ack(mddev, false);
7692 else
7693 err = -EINVAL;
7694 goto unlock;
7695
c02c0aeb
N
7696 case HOT_ADD_DISK:
7697 err = hot_add_disk(mddev, new_decode_dev(arg));
3adc28d8 7698 goto unlock;
1da177e4 7699
c02c0aeb
N
7700 case RUN_ARRAY:
7701 err = do_md_run(mddev);
3adc28d8 7702 goto unlock;
1da177e4 7703
c02c0aeb
N
7704 case SET_BITMAP_FILE:
7705 err = set_bitmap_file(mddev, (int)arg);
3adc28d8 7706 goto unlock;
32a7627c 7707
c02c0aeb
N
7708 default:
7709 err = -EINVAL;
3adc28d8 7710 goto unlock;
1da177e4
LT
7711 }
7712
3adc28d8 7713unlock:
d3374825
N
7714 if (mddev->hold_active == UNTIL_IOCTL &&
7715 err != -EINVAL)
7716 mddev->hold_active = 0;
1da177e4 7717 mddev_unlock(mddev);
3adc28d8 7718out:
065e519e
N
7719 if(did_set_md_closing)
7720 clear_bit(MD_CLOSING, &mddev->flags);
1da177e4
LT
7721 return err;
7722}
aa98aa31
AB
7723#ifdef CONFIG_COMPAT
7724static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
7725 unsigned int cmd, unsigned long arg)
7726{
7727 switch (cmd) {
7728 case HOT_REMOVE_DISK:
7729 case HOT_ADD_DISK:
7730 case SET_DISK_FAULTY:
7731 case SET_BITMAP_FILE:
7732		/* These take an integer arg, do not convert */
7733 break;
7734 default:
7735 arg = (unsigned long)compat_ptr(arg);
7736 break;
7737 }
7738
7739 return md_ioctl(bdev, mode, cmd, arg);
7740}
7741#endif /* CONFIG_COMPAT */
1da177e4 7742
a39907fa 7743static int md_open(struct block_device *bdev, fmode_t mode)
1da177e4
LT
7744{
7745 /*
7746 * Succeed if we can lock the mddev, which confirms that
7747 * it isn't being stopped right now.
7748 */
fd01b88c 7749 struct mddev *mddev = mddev_find(bdev->bd_dev);
1da177e4
LT
7750 int err;
7751
0c098220
YL
7752 if (!mddev)
7753 return -ENODEV;
7754
d3374825
N
7755 if (mddev->gendisk != bdev->bd_disk) {
7756 /* we are racing with mddev_put which is discarding this
7757 * bd_disk.
7758 */
7759 mddev_put(mddev);
7760 /* Wait until bdev->bd_disk is definitely gone */
f6766ff6
GJ
7761 if (work_pending(&mddev->del_work))
7762 flush_workqueue(md_misc_wq);
d3374825
N
7763 /* Then retry the open from the top */
7764 return -ERESTARTSYS;
7765 }
7766 BUG_ON(mddev != bdev->bd_disk->private_data);
7767
c8c00a69 7768 if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
1da177e4
LT
7769 goto out;
7770
af8d8e6f
GJ
7771 if (test_bit(MD_CLOSING, &mddev->flags)) {
7772 mutex_unlock(&mddev->open_mutex);
e2342ca8
N
7773 err = -ENODEV;
7774 goto out;
af8d8e6f
GJ
7775 }
7776
1da177e4 7777 err = 0;
f2ea68cf 7778 atomic_inc(&mddev->openers);
c8c00a69 7779 mutex_unlock(&mddev->open_mutex);
1da177e4 7780
f0b4f7e2 7781 check_disk_change(bdev);
1da177e4 7782 out:
e2342ca8
N
7783 if (err)
7784 mddev_put(mddev);
1da177e4
LT
7785 return err;
7786}
7787
db2a144b 7788static void md_release(struct gendisk *disk, fmode_t mode)
1da177e4 7789{
f72ffdd6 7790 struct mddev *mddev = disk->private_data;
1da177e4 7791
52e5f9d1 7792 BUG_ON(!mddev);
f2ea68cf 7793 atomic_dec(&mddev->openers);
1da177e4 7794 mddev_put(mddev);
1da177e4 7795}
f0b4f7e2
N
7796
7797static int md_media_changed(struct gendisk *disk)
7798{
fd01b88c 7799 struct mddev *mddev = disk->private_data;
f0b4f7e2
N
7800
7801 return mddev->changed;
7802}
7803
7804static int md_revalidate(struct gendisk *disk)
7805{
fd01b88c 7806 struct mddev *mddev = disk->private_data;
f0b4f7e2
N
7807
7808 mddev->changed = 0;
7809 return 0;
7810}
83d5cde4 7811static const struct block_device_operations md_fops =
1da177e4
LT
7812{
7813 .owner = THIS_MODULE,
a39907fa
AV
7814 .open = md_open,
7815 .release = md_release,
b492b852 7816 .ioctl = md_ioctl,
aa98aa31
AB
7817#ifdef CONFIG_COMPAT
7818 .compat_ioctl = md_compat_ioctl,
7819#endif
a885c8c4 7820 .getgeo = md_getgeo,
f0b4f7e2
N
7821 .media_changed = md_media_changed,
7822 .revalidate_disk= md_revalidate,
1da177e4
LT
7823};
7824
f72ffdd6 7825static int md_thread(void *arg)
1da177e4 7826{
2b8bf345 7827 struct md_thread *thread = arg;
1da177e4 7828
1da177e4
LT
7829 /*
7830	 * md_thread is a 'system-thread', its priority should be very
7831 * high. We avoid resource deadlocks individually in each
7832 * raid personality. (RAID5 does preallocation) We also use RR and
7833 * the very same RT priority as kswapd, thus we will never get
7834 * into a priority inversion deadlock.
7835 *
7836 * we definitely have to have equal or higher priority than
7837 * bdflush, otherwise bdflush will deadlock if there are too
7838 * many dirty RAID5 blocks.
7839 */
1da177e4 7840
6985c43f 7841 allow_signal(SIGKILL);
a6fb0934 7842 while (!kthread_should_stop()) {
1da177e4 7843
93588e22
N
7844 /* We need to wait INTERRUPTIBLE so that
7845 * we don't add to the load-average.
7846 * That means we need to be sure no signals are
7847 * pending
7848 */
7849 if (signal_pending(current))
7850 flush_signals(current);
7851
7852 wait_event_interruptible_timeout
7853 (thread->wqueue,
7854 test_bit(THREAD_WAKEUP, &thread->flags)
ce1ccd07 7855 || kthread_should_stop() || kthread_should_park(),
93588e22 7856 thread->timeout);
1da177e4 7857
6c987910 7858 clear_bit(THREAD_WAKEUP, &thread->flags);
ce1ccd07
SL
7859 if (kthread_should_park())
7860 kthread_parkme();
6c987910 7861 if (!kthread_should_stop())
4ed8731d 7862 thread->run(thread);
1da177e4 7863 }
a6fb0934 7864
1da177e4
LT
7865 return 0;
7866}
7867
2b8bf345 7868void md_wakeup_thread(struct md_thread *thread)
1da177e4
LT
7869{
7870 if (thread) {
36a4e1fe 7871 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
d1d90147
GJ
7872 set_bit(THREAD_WAKEUP, &thread->flags);
7873 wake_up(&thread->wqueue);
1da177e4
LT
7874 }
7875}
6c144d31 7876EXPORT_SYMBOL(md_wakeup_thread);
1da177e4 7877
4ed8731d
SL
7878struct md_thread *md_register_thread(void (*run) (struct md_thread *),
7879 struct mddev *mddev, const char *name)
1da177e4 7880{
2b8bf345 7881 struct md_thread *thread;
1da177e4 7882
2b8bf345 7883 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
1da177e4
LT
7884 if (!thread)
7885 return NULL;
7886
1da177e4
LT
7887 init_waitqueue_head(&thread->wqueue);
7888
1da177e4
LT
7889 thread->run = run;
7890 thread->mddev = mddev;
32a7627c 7891 thread->timeout = MAX_SCHEDULE_TIMEOUT;
0da3c619
N
7892 thread->tsk = kthread_run(md_thread, thread,
7893 "%s_%s",
7894 mdname(thread->mddev),
0232605d 7895 name);
a6fb0934 7896 if (IS_ERR(thread->tsk)) {
1da177e4
LT
7897 kfree(thread);
7898 return NULL;
7899 }
1da177e4
LT
7900 return thread;
7901}
6c144d31 7902EXPORT_SYMBOL(md_register_thread);
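/*
 * Usage sketch (illustrative; the exact call sites vary by
 * personality): the main service thread is started while the array is
 * set up and stopped again on shutdown, along these lines:
 *
 *	mddev->thread = md_register_thread(raid1d, mddev, "raid1");
 *	if (!mddev->thread)
 *		goto abort;
 *	...
 *	md_unregister_thread(&mddev->thread);
 */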
1da177e4 7903
2b8bf345 7904void md_unregister_thread(struct md_thread **threadp)
1da177e4 7905{
2b8bf345 7906 struct md_thread *thread = *threadp;
e0cf8f04
N
7907 if (!thread)
7908 return;
36a4e1fe 7909 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
01f96c0a
N
7910 /* Locking ensures that mddev_unlock does not wake_up a
7911 * non-existent thread
7912 */
7913 spin_lock(&pers_lock);
7914 *threadp = NULL;
7915 spin_unlock(&pers_lock);
a6fb0934
N
7916
7917 kthread_stop(thread->tsk);
1da177e4
LT
7918 kfree(thread);
7919}
6c144d31 7920EXPORT_SYMBOL(md_unregister_thread);
1da177e4 7921
fd01b88c 7922void md_error(struct mddev *mddev, struct md_rdev *rdev)
1da177e4 7923{
b2d444d7 7924 if (!rdev || test_bit(Faulty, &rdev->flags))
1da177e4 7925 return;
6bfe0b49 7926
de393cde 7927 if (!mddev->pers || !mddev->pers->error_handler)
1da177e4
LT
7928 return;
7929 mddev->pers->error_handler(mddev,rdev);
72a23c21
NB
7930 if (mddev->degraded)
7931 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
00bcb4ac 7932 sysfs_notify_dirent_safe(rdev->sysfs_state);
1da177e4
LT
7933 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7934 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7935 md_wakeup_thread(mddev->thread);
768a418d 7936 if (mddev->event_work.func)
e804ac78 7937 queue_work(md_misc_wq, &mddev->event_work);
bb9ef716 7938 md_new_event(mddev);
1da177e4 7939}
6c144d31 7940EXPORT_SYMBOL(md_error);
1da177e4
LT
7941
7942/* seq_file implementation /proc/mdstat */
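/*
 * Illustrative output: for a healthy two-disk RAID1 the handlers below
 * render /proc/mdstat along these lines (exact fields depend on the
 * metadata and array state):
 *
 *	Personalities : [raid1]
 *	md0 : active raid1 sdb1[1] sda1[0]
 *	      1048512 blocks [2/2] [UU]
 *
 *	unused devices: <none>
 */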
7943
7944static void status_unused(struct seq_file *seq)
7945{
7946 int i = 0;
3cb03002 7947 struct md_rdev *rdev;
1da177e4
LT
7948
7949 seq_printf(seq, "unused devices: ");
7950
159ec1fc 7951 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
1da177e4
LT
7952 char b[BDEVNAME_SIZE];
7953 i++;
7954 seq_printf(seq, "%s ",
7955 bdevname(rdev->bdev,b));
7956 }
7957 if (!i)
7958 seq_printf(seq, "<none>");
7959
7960 seq_printf(seq, "\n");
7961}
7962
f7851be7 7963static int status_resync(struct seq_file *seq, struct mddev *mddev)
1da177e4 7964{
dd71cf6b 7965 sector_t max_sectors, resync, res;
9642fa73
MT
7966 unsigned long dt, db = 0;
7967 sector_t rt, curr_mark_cnt, resync_mark_cnt;
7968 int scale, recovery_active;
4588b42e 7969 unsigned int per_milli;
1da177e4 7970
c804cdec
N
7971 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
7972 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
dd71cf6b 7973 max_sectors = mddev->resync_max_sectors;
1da177e4 7974 else
dd71cf6b 7975 max_sectors = mddev->dev_sectors;
1da177e4 7976
f7851be7
N
7977 resync = mddev->curr_resync;
7978 if (resync <= 3) {
7979 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7980 /* Still cleaning up */
7981 resync = max_sectors;
d2e2ec82
ND
7982 } else if (resync > max_sectors)
7983 resync = max_sectors;
7984 else
f7851be7
N
7985 resync -= atomic_read(&mddev->recovery_active);
7986
7987 if (resync == 0) {
0357ba27
GJ
7988 if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery)) {
7989 struct md_rdev *rdev;
7990
7991 rdev_for_each(rdev, mddev)
7992 if (rdev->raid_disk >= 0 &&
7993 !test_bit(Faulty, &rdev->flags) &&
7994 rdev->recovery_offset != MaxSector &&
7995 rdev->recovery_offset) {
7996 seq_printf(seq, "\trecover=REMOTE");
7997 return 1;
7998 }
7999 if (mddev->reshape_position != MaxSector)
8000 seq_printf(seq, "\treshape=REMOTE");
8001 else
8002 seq_printf(seq, "\tresync=REMOTE");
8003 return 1;
8004 }
f7851be7
N
8005 if (mddev->recovery_cp < MaxSector) {
8006 seq_printf(seq, "\tresync=PENDING");
8007 return 1;
8008 }
8009 return 0;
8010 }
8011 if (resync < 3) {
8012 seq_printf(seq, "\tresync=DELAYED");
8013 return 1;
8014 }
8015
403df478 8016 WARN_ON(max_sectors == 0);
4588b42e 8017 /* Pick 'scale' such that (resync>>scale)*1000 will fit
dd71cf6b 8018 * in a sector_t, and (max_sectors>>scale) will fit in a
4588b42e
N
8019 * u32, as those are the requirements for sector_div.
8020 * Thus 'scale' must be at least 10
8021 */
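	/*
	 * Worked example: the widening loop below only runs when sector_t
	 * is wider than unsigned long (e.g. a 32-bit kernel with 64-bit
	 * sector_t). For max_sectors = 2^40, max_sectors/2 = 2^39 never
	 * exceeds 1ULL << 42, so scale stays 10 and
	 * (max_sectors >> 10) + 1 = 2^30 + 1 comfortably fits the u32
	 * divisor that sector_div() requires.
	 */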
8022 scale = 10;
8023 if (sizeof(sector_t) > sizeof(unsigned long)) {
dd71cf6b 8024 while ( max_sectors/2 > (1ULL<<(scale+32)))
4588b42e
N
8025 scale++;
8026 }
8027 res = (resync>>scale)*1000;
dd71cf6b 8028 sector_div(res, (u32)((max_sectors>>scale)+1));
4588b42e
N
8029
8030 per_milli = res;
1da177e4 8031 {
4588b42e 8032 int i, x = per_milli/50, y = 20-x;
1da177e4
LT
8033 seq_printf(seq, "[");
8034 for (i = 0; i < x; i++)
8035 seq_printf(seq, "=");
8036 seq_printf(seq, ">");
8037 for (i = 0; i < y; i++)
8038 seq_printf(seq, ".");
8039 seq_printf(seq, "] ");
8040 }
4588b42e 8041 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
ccfcc3c1
N
8042 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
8043 "reshape" :
61df9d91
N
8044 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
8045 "check" :
8046 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
8047 "resync" : "recovery"))),
8048 per_milli/10, per_milli % 10,
dd71cf6b
N
8049 (unsigned long long) resync/2,
8050 (unsigned long long) max_sectors/2);
1da177e4
LT
8051
8052 /*
1da177e4
LT
8053 * dt: time from mark until now
8054 * db: blocks written from mark until now
8055 * rt: remaining time
dd71cf6b 8056 *
9642fa73
MT
8057 * rt is a sector_t, which is always 64bit now. We are keeping
8058 * the original algorithm, but it is not really necessary.
8059 *
8060 * Original algorithm:
8061 * So we divide before multiply in case it is 32bit and close
8062 * to the limit.
8063 * We scale the divisor (db) by 32 to avoid losing precision
8064 * near the end of resync when the number of remaining sectors
8065 * is close to 'db'.
8066 * We then divide rt by 32 after multiplying by db to compensate.
8067 * The '+1' avoids division by zero if db is very small.
1da177e4
LT
8068 */
8069 dt = ((jiffies - mddev->resync_mark) / HZ);
8070 if (!dt) dt++;
9642fa73
MT
8071
8072 curr_mark_cnt = mddev->curr_mark_cnt;
8073 recovery_active = atomic_read(&mddev->recovery_active);
8074 resync_mark_cnt = mddev->resync_mark_cnt;
8075
8076 if (curr_mark_cnt >= (recovery_active + resync_mark_cnt))
8077 db = curr_mark_cnt - (recovery_active + resync_mark_cnt);
1da177e4 8078
dd71cf6b 8079 rt = max_sectors - resync; /* number of remaining sectors */
9642fa73 8080 rt = div64_u64(rt, db/32+1);
dd71cf6b
N
8081 rt *= dt;
8082 rt >>= 5;
8083
8084 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
8085 ((unsigned long)rt % 60)/6);
1da177e4 8086
ff4e8d9a 8087 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
f7851be7 8088 return 1;
1da177e4
LT
8089}
8090
8091static void *md_seq_start(struct seq_file *seq, loff_t *pos)
8092{
8093 struct list_head *tmp;
8094 loff_t l = *pos;
fd01b88c 8095 struct mddev *mddev;
1da177e4
LT
8096
8097 if (l >= 0x10000)
8098 return NULL;
8099 if (!l--)
8100 /* header */
8101 return (void*)1;
8102
8103 spin_lock(&all_mddevs_lock);
8104 list_for_each(tmp,&all_mddevs)
8105 if (!l--) {
fd01b88c 8106 mddev = list_entry(tmp, struct mddev, all_mddevs);
1da177e4
LT
8107 mddev_get(mddev);
8108 spin_unlock(&all_mddevs_lock);
8109 return mddev;
8110 }
8111 spin_unlock(&all_mddevs_lock);
8112 if (!l--)
8113 return (void*)2;/* tail */
8114 return NULL;
8115}
8116
8117static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
8118{
8119 struct list_head *tmp;
fd01b88c 8120 struct mddev *next_mddev, *mddev = v;
f72ffdd6 8121
1da177e4
LT
8122 ++*pos;
8123 if (v == (void*)2)
8124 return NULL;
8125
8126 spin_lock(&all_mddevs_lock);
8127 if (v == (void*)1)
8128 tmp = all_mddevs.next;
8129 else
8130 tmp = mddev->all_mddevs.next;
8131 if (tmp != &all_mddevs)
fd01b88c 8132 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
1da177e4
LT
8133 else {
8134 next_mddev = (void*)2;
8135 *pos = 0x10000;
f72ffdd6 8136 }
1da177e4
LT
8137 spin_unlock(&all_mddevs_lock);
8138
8139 if (v != (void*)1)
8140 mddev_put(mddev);
8141 return next_mddev;
8142
8143}
8144
8145static void md_seq_stop(struct seq_file *seq, void *v)
8146{
fd01b88c 8147 struct mddev *mddev = v;
1da177e4
LT
8148
8149 if (mddev && v != (void*)1 && v != (void*)2)
8150 mddev_put(mddev);
8151}
8152
8153static int md_seq_show(struct seq_file *seq, void *v)
8154{
fd01b88c 8155 struct mddev *mddev = v;
dd8ac336 8156 sector_t sectors;
3cb03002 8157 struct md_rdev *rdev;
1da177e4
LT
8158
8159 if (v == (void*)1) {
84fc4b56 8160 struct md_personality *pers;
1da177e4
LT
8161 seq_printf(seq, "Personalities : ");
8162 spin_lock(&pers_lock);
2604b703
N
8163 list_for_each_entry(pers, &pers_list, list)
8164 seq_printf(seq, "[%s] ", pers->name);
1da177e4
LT
8165
8166 spin_unlock(&pers_lock);
8167 seq_printf(seq, "\n");
f1514638 8168 seq->poll_event = atomic_read(&md_event_count);
1da177e4
LT
8169 return 0;
8170 }
8171 if (v == (void*)2) {
8172 status_unused(seq);
8173 return 0;
8174 }
8175
36d091f4 8176 spin_lock(&mddev->lock);
1da177e4
LT
8177 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
8178 seq_printf(seq, "%s : %sactive", mdname(mddev),
8179 mddev->pers ? "" : "in");
8180 if (mddev->pers) {
f91de92e 8181 if (mddev->ro==1)
1da177e4 8182 seq_printf(seq, " (read-only)");
f91de92e 8183 if (mddev->ro==2)
52720ae7 8184 seq_printf(seq, " (auto-read-only)");
1da177e4
LT
8185 seq_printf(seq, " %s", mddev->pers->name);
8186 }
8187
dd8ac336 8188 sectors = 0;
f97fcad3
N
8189 rcu_read_lock();
8190 rdev_for_each_rcu(rdev, mddev) {
1da177e4
LT
8191 char b[BDEVNAME_SIZE];
8192 seq_printf(seq, " %s[%d]",
8193 bdevname(rdev->bdev,b), rdev->desc_nr);
8ddf9efe
N
8194 if (test_bit(WriteMostly, &rdev->flags))
8195 seq_printf(seq, "(W)");
9efdca16
SL
8196 if (test_bit(Journal, &rdev->flags))
8197 seq_printf(seq, "(J)");
b2d444d7 8198 if (test_bit(Faulty, &rdev->flags)) {
1da177e4
LT
8199 seq_printf(seq, "(F)");
8200 continue;
2d78f8c4
N
8201 }
8202 if (rdev->raid_disk < 0)
b325a32e 8203 seq_printf(seq, "(S)"); /* spare */
2d78f8c4
N
8204 if (test_bit(Replacement, &rdev->flags))
8205 seq_printf(seq, "(R)");
dd8ac336 8206 sectors += rdev->sectors;
1da177e4 8207 }
f97fcad3 8208 rcu_read_unlock();
1da177e4
LT
8209
8210 if (!list_empty(&mddev->disks)) {
8211 if (mddev->pers)
8212 seq_printf(seq, "\n %llu blocks",
f233ea5c
AN
8213 (unsigned long long)
8214 mddev->array_sectors / 2);
1da177e4
LT
8215 else
8216 seq_printf(seq, "\n %llu blocks",
dd8ac336 8217 (unsigned long long)sectors / 2);
1da177e4 8218 }
1cd6bf19
N
8219 if (mddev->persistent) {
8220 if (mddev->major_version != 0 ||
8221 mddev->minor_version != 90) {
8222 seq_printf(seq," super %d.%d",
8223 mddev->major_version,
8224 mddev->minor_version);
8225 }
e691063a
N
8226 } else if (mddev->external)
8227 seq_printf(seq, " super external:%s",
8228 mddev->metadata_type);
8229 else
1cd6bf19 8230 seq_printf(seq, " super non-persistent");
1da177e4
LT
8231
8232 if (mddev->pers) {
d710e138 8233 mddev->pers->status(seq, mddev);
f72ffdd6 8234 seq_printf(seq, "\n ");
8e1b39d6 8235 if (mddev->pers->sync_request) {
f7851be7 8236 if (status_resync(seq, mddev))
8e1b39d6 8237 seq_printf(seq, "\n ");
8e1b39d6 8238 }
32a7627c
N
8239 } else
8240 seq_printf(seq, "\n ");
8241
e64e4018 8242 md_bitmap_status(seq, mddev->bitmap);
1da177e4
LT
8243
8244 seq_printf(seq, "\n");
8245 }
36d091f4 8246 spin_unlock(&mddev->lock);
f72ffdd6 8247
1da177e4
LT
8248 return 0;
8249}
8250
110518bc 8251static const struct seq_operations md_seq_ops = {
1da177e4
LT
8252 .start = md_seq_start,
8253 .next = md_seq_next,
8254 .stop = md_seq_stop,
8255 .show = md_seq_show,
8256};
8257
8258static int md_seq_open(struct inode *inode, struct file *file)
8259{
f1514638 8260 struct seq_file *seq;
1da177e4
LT
8261 int error;
8262
8263 error = seq_open(file, &md_seq_ops);
d7603b7e 8264 if (error)
f1514638
KS
8265 return error;
8266
8267 seq = file->private_data;
8268 seq->poll_event = atomic_read(&md_event_count);
1da177e4
LT
8269 return error;
8270}
8271
e2f23b60 8272static int md_unloading;
afc9a42b 8273static __poll_t mdstat_poll(struct file *filp, poll_table *wait)
d7603b7e 8274{
f1514638 8275 struct seq_file *seq = filp->private_data;
afc9a42b 8276 __poll_t mask;
d7603b7e 8277
e2f23b60 8278 if (md_unloading)
a9a08845 8279 return EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI;
d7603b7e
N
8280 poll_wait(filp, &md_event_waiters, wait);
8281
8282 /* always allow read */
a9a08845 8283 mask = EPOLLIN | EPOLLRDNORM;
d7603b7e 8284
f1514638 8285 if (seq->poll_event != atomic_read(&md_event_count))
a9a08845 8286 mask |= EPOLLERR | EPOLLPRI;
d7603b7e
N
8287 return mask;
8288}
8289
97a32539
AD
8290static const struct proc_ops mdstat_proc_ops = {
8291 .proc_open = md_seq_open,
8292 .proc_read = seq_read,
8293 .proc_lseek = seq_lseek,
8294 .proc_release = seq_release,
8295 .proc_poll = mdstat_poll,
1da177e4
LT
8296};
8297
84fc4b56 8298int register_md_personality(struct md_personality *p)
1da177e4 8299{
9d48739e
N
8300 pr_debug("md: %s personality registered for level %d\n",
8301 p->name, p->level);
1da177e4 8302 spin_lock(&pers_lock);
2604b703 8303 list_add_tail(&p->list, &pers_list);
1da177e4
LT
8304 spin_unlock(&pers_lock);
8305 return 0;
8306}
6c144d31 8307EXPORT_SYMBOL(register_md_personality);
1da177e4 8308
84fc4b56 8309int unregister_md_personality(struct md_personality *p)
1da177e4 8310{
9d48739e 8311 pr_debug("md: %s personality unregistered\n", p->name);
1da177e4 8312 spin_lock(&pers_lock);
2604b703 8313 list_del_init(&p->list);
1da177e4
LT
8314 spin_unlock(&pers_lock);
8315 return 0;
8316}
6c144d31 8317EXPORT_SYMBOL(unregister_md_personality);
1da177e4 8318
6022e75b
N
8319int register_md_cluster_operations(struct md_cluster_operations *ops,
8320 struct module *module)
edb39c9d 8321{
6022e75b 8322 int ret = 0;
edb39c9d 8323 spin_lock(&pers_lock);
6022e75b
N
8324 if (md_cluster_ops != NULL)
8325 ret = -EALREADY;
8326 else {
8327 md_cluster_ops = ops;
8328 md_cluster_mod = module;
8329 }
edb39c9d 8330 spin_unlock(&pers_lock);
6022e75b 8331 return ret;
edb39c9d
GR
8332}
8333EXPORT_SYMBOL(register_md_cluster_operations);
8334
8335int unregister_md_cluster_operations(void)
8336{
8337 spin_lock(&pers_lock);
8338 md_cluster_ops = NULL;
8339 spin_unlock(&pers_lock);
8340 return 0;
8341}
8342EXPORT_SYMBOL(unregister_md_cluster_operations);
8343
8344int md_setup_cluster(struct mddev *mddev, int nodes)
8345{
47a7b0d8
GJ
8346 if (!md_cluster_ops)
8347 request_module("md-cluster");
edb39c9d 8348 spin_lock(&pers_lock);
47a7b0d8 8349 /* ensure module won't be unloaded */
edb39c9d 8350 if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
9d48739e 8351 pr_warn("can't find md-cluster module or get it's reference.\n");
edb39c9d
GR
8352 spin_unlock(&pers_lock);
8353 return -ENOENT;
8354 }
8355 spin_unlock(&pers_lock);
8356
cf921cc1 8357 return md_cluster_ops->join(mddev, nodes);
edb39c9d
GR
8358}
8359
8360void md_cluster_stop(struct mddev *mddev)
8361{
c4ce867f
GR
8362 if (!md_cluster_ops)
8363 return;
edb39c9d
GR
8364 md_cluster_ops->leave(mddev);
8365 module_put(md_cluster_mod);
8366}
8367
fd01b88c 8368static int is_mddev_idle(struct mddev *mddev, int init)
1da177e4 8369{
f72ffdd6 8370 struct md_rdev *rdev;
1da177e4 8371 int idle;
eea1bf38 8372 int curr_events;
1da177e4
LT
8373
8374 idle = 1;
4b80991c
N
8375 rcu_read_lock();
8376 rdev_for_each_rcu(rdev, mddev) {
1da177e4 8377 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
59767fbd 8378 curr_events = (int)part_stat_read_accum(&disk->part0, sectors) -
eea1bf38 8379 atomic_read(&disk->sync_io);
713f6ab1
N
8380 /* sync IO will cause sync_io to increase before the disk_stats
8381 * as sync_io is counted when a request starts, and
8382 * disk_stats is counted when it completes.
8383 * So resync activity will cause curr_events to be smaller than
8384 * when there was no such activity.
8385 * non-sync IO will cause disk_stat to increase without
8386 * increasing sync_io so curr_events will (eventually)
8387 * be larger than it was before. Once it becomes
8388 * substantially larger, the test below will cause
8389 * the array to appear non-idle, and resync will slow
8390 * down.
8391 * If there is a lot of outstanding resync activity when
8392 * we set last_event to curr_events, then all that activity
8393 * completing might cause the array to appear non-idle
8394 * and resync will be slowed down even though there might
8395 * not have been non-resync activity. This will only
8396 * happen once though. 'last_events' will soon reflect
8397 * the state where there is little or no outstanding
8398 * resync requests, and further resync activity will
8399 * always make curr_events less than last_events.
c0e48521 8400 *
1da177e4 8401 */
eea1bf38 8402 if (init || curr_events - rdev->last_events > 64) {
1da177e4
LT
8403 rdev->last_events = curr_events;
8404 idle = 0;
8405 }
8406 }
4b80991c 8407 rcu_read_unlock();
1da177e4
LT
8408 return idle;
8409}
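/*
 * Worked example: if a member's part_stat sector count advanced by
 * 10000 since the last check and resync itself accounted for 9990 of
 * them, curr_events differs from rdev->last_events by only 10, inside
 * the 64-sector fuzz, and the array still counts as idle; another
 * hundred sectors of application I/O would tip the test and slow
 * resync down.
 */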
8410
fd01b88c 8411void md_done_sync(struct mddev *mddev, int blocks, int ok)
1da177e4
LT
8412{
8413 /* another "blocks" (512byte) blocks have been synced */
8414 atomic_sub(blocks, &mddev->recovery_active);
8415 wake_up(&mddev->recovery_wait);
8416 if (!ok) {
dfc70645 8417 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
0a19caab 8418 set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
1da177e4
LT
8419 md_wakeup_thread(mddev->thread);
8420 // stop recovery, signal do_sync ....
8421 }
8422}
6c144d31 8423EXPORT_SYMBOL(md_done_sync);
1da177e4 8424
/* md_write_start(mddev, bi)
 * If we need to update some array metadata (e.g. 'active' flag
 * in superblock) before writing, schedule a superblock update
 * and wait for it to complete.
 * A return value of 'false' means that the write wasn't recorded
 * and cannot proceed as the array is being suspended.
 */
bool md_write_start(struct mddev *mddev, struct bio *bi)
{
	int did_change = 0;

	if (bio_data_dir(bi) != WRITE)
		return true;

	BUG_ON(mddev->ro == 1);
	if (mddev->ro == 2) {
		/* need to switch to read/write */
		mddev->ro = 0;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
		md_wakeup_thread(mddev->sync_thread);
		did_change = 1;
	}
	rcu_read_lock();
	percpu_ref_get(&mddev->writes_pending);
	smp_mb(); /* Match smp_mb in set_in_sync() */
	if (mddev->safemode == 1)
		mddev->safemode = 0;
	/* sync_checkers is always 0 when writes_pending is in per-cpu mode */
	if (mddev->in_sync || mddev->sync_checkers) {
		spin_lock(&mddev->lock);
		if (mddev->in_sync) {
			mddev->in_sync = 0;
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
			md_wakeup_thread(mddev->thread);
			did_change = 1;
		}
		spin_unlock(&mddev->lock);
	}
	rcu_read_unlock();
	if (did_change)
		sysfs_notify_dirent_safe(mddev->sysfs_state);
	if (!mddev->has_superblocks)
		return true;
	wait_event(mddev->sb_wait,
		   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags) ||
		   mddev->suspended);
	if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
		percpu_ref_put(&mddev->writes_pending);
		return false;
	}
	return true;
}
EXPORT_SYMBOL(md_write_start);

/* md_write_inc can only be called when md_write_start() has
 * already been called at least once for the current request.
 * It increments the counter and is useful when a single request
 * is split into several parts.  Each part causes an increment and
 * so needs a matching md_write_end().
 * Unlike md_write_start(), it is safe to call md_write_inc() inside
 * a spinlocked region.
 */
void md_write_inc(struct mddev *mddev, struct bio *bi)
{
	if (bio_data_dir(bi) != WRITE)
		return;
	WARN_ON_ONCE(mddev->in_sync || mddev->ro);
	percpu_ref_get(&mddev->writes_pending);
}
EXPORT_SYMBOL(md_write_inc);
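
/*
 * Illustrative sketch of one possible split path (assumed shape, not taken
 * from a specific personality): each extra fragment takes its own
 * reference so every completion can call md_write_end():
 *
 *	split = bio_split(bio, sectors, GFP_NOIO, &mddev->bio_set);
 *	md_write_inc(mddev, bio);	// one md_write_end() per fragment
 *	bio_chain(split, bio);
 *
 * md_write_start() must already have succeeded for the original bio.
 */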

void md_write_end(struct mddev *mddev)
{
	percpu_ref_put(&mddev->writes_pending);

	if (mddev->safemode == 2)
		md_wakeup_thread(mddev->thread);
	else if (mddev->safemode_delay)
		/* The roundup() ensures this only performs locking once
		 * every ->safemode_delay jiffies
		 */
		mod_timer(&mddev->safemode_timer,
			  roundup(jiffies, mddev->safemode_delay) +
			  mddev->safemode_delay);
}
EXPORT_SYMBOL(md_write_end);

/* md_allow_write(mddev)
 * Calling this ensures that the array is marked 'active' so that writes
 * may proceed without blocking.  It is important to call this before
 * attempting a GFP_KERNEL allocation while holding the mddev lock.
 * Must be called with mddev_lock held.
 */
void md_allow_write(struct mddev *mddev)
{
	if (!mddev->pers)
		return;
	if (mddev->ro)
		return;
	if (!mddev->pers->sync_request)
		return;

	spin_lock(&mddev->lock);
	if (mddev->in_sync) {
		mddev->in_sync = 0;
		set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
		set_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
		if (mddev->safemode_delay &&
		    mddev->safemode == 0)
			mddev->safemode = 1;
		spin_unlock(&mddev->lock);
		md_update_sb(mddev, 0);
		sysfs_notify_dirent_safe(mddev->sysfs_state);
		/* wait for the dirty state to be recorded in the metadata */
		wait_event(mddev->sb_wait,
			   !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
	} else
		spin_unlock(&mddev->lock);
}
EXPORT_SYMBOL_GPL(md_allow_write);
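
/*
 * Illustrative usage sketch (hypothetical caller): a personality that must
 * allocate memory while holding reconfig_mutex first marks the array
 * active so the allocation cannot deadlock against a superblock write-out:
 *
 *	md_allow_write(mddev);			// may block on sb_wait
 *	new = kcalloc(n, sizeof(*new), GFP_KERNEL);
 *	if (!new)
 *		return -ENOMEM;
 *
 * Compare raid5's resize_stripes(), which follows this pattern.
 */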

#define SYNC_MARKS	10
#define	SYNC_MARK_STEP	(3*HZ)
#define UPDATE_FREQUENCY (5*60*HZ)
void md_do_sync(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;
	struct mddev *mddev2;
	unsigned int currspeed = 0, window;
	sector_t max_sectors, j, io_sectors, recovery_done;
	unsigned long mark[SYNC_MARKS];
	unsigned long update_time;
	sector_t mark_cnt[SYNC_MARKS];
	int last_mark, m;
	struct list_head *tmp;
	sector_t last_check;
	int skipped = 0;
	struct md_rdev *rdev;
	char *desc, *action = NULL;
	struct blk_plug plug;
	int ret;

	/* just in case thread restarts... */
	if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
	    test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
		return;
	if (mddev->ro) { /* never try to sync a read-only array */
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		return;
	}

	if (mddev_is_clustered(mddev)) {
		ret = md_cluster_ops->resync_start(mddev);
		if (ret)
			goto skip;

		set_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags);
		if (!(test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) ||
			test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
		     && ((unsigned long long)mddev->curr_resync_completed
			 < (unsigned long long)mddev->resync_max_sectors))
			goto skip;
	}

	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
			desc = "data-check";
			action = "check";
		} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
			desc = "requested-resync";
			action = "repair";
		} else
			desc = "resync";
	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		desc = "reshape";
	else
		desc = "recovery";

	mddev->last_sync_action = action ?: desc;

	/* we overload curr_resync somewhat here.
	 * 0 == not engaged in resync at all
	 * 2 == checking that there is no conflict with another sync
	 * 1 == like 2, but have yielded to allow conflicting resync to
	 *		commence
	 * other == active in resync - this many blocks
	 *
	 * Before starting a resync we must have set curr_resync to
	 * 2, and then checked that every "conflicting" array has curr_resync
	 * less than ours.  When we find one that is the same or higher
	 * we wait on resync_wait.  To avoid deadlock, we reduce curr_resync
	 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
	 * This will mean we have to start checking from the beginning again.
	 */

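	/*
	 * Illustrative walk-through (not code): if arrays A and B share a
	 * disk and both reach curr_resync == 2, the one with the lower
	 * mddev address (say A < B) drops to curr_resync = 1 and yields;
	 * B keeps 2, wins the comparison below, and proceeds.  A then
	 * sleeps on resync_wait (B's 2 >= A's 1) and restarts its conflict
	 * check from scratch once B finishes and wakes the waiters.
	 */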
	do {
		int mddev2_minor = -1;
		mddev->curr_resync = 2;

	try_again:
		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			goto skip;
		for_each_mddev(mddev2, tmp) {
			if (mddev2 == mddev)
				continue;
			if (!mddev->parallel_resync
			&&  mddev2->curr_resync
			&&  match_mddev_units(mddev, mddev2)) {
				DEFINE_WAIT(wq);
				if (mddev < mddev2 && mddev->curr_resync == 2) {
					/* arbitrarily yield */
					mddev->curr_resync = 1;
					wake_up(&resync_wait);
				}
				if (mddev > mddev2 && mddev->curr_resync == 1)
					/* no need to wait here, we can wait the next
					 * time 'round when curr_resync == 2
					 */
					continue;
				/* We need to wait 'interruptible' so as not to
				 * contribute to the load average, and not to
				 * be caught by 'softlockup'
				 */
				prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
				if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
				    mddev2->curr_resync >= mddev->curr_resync) {
					if (mddev2_minor != mddev2->md_minor) {
						mddev2_minor = mddev2->md_minor;
						pr_info("md: delaying %s of %s until %s has finished (they share one or more physical units)\n",
							desc, mdname(mddev),
							mdname(mddev2));
					}
					mddev_put(mddev2);
					if (signal_pending(current))
						flush_signals(current);
					schedule();
					finish_wait(&resync_wait, &wq);
					goto try_again;
				}
				finish_wait(&resync_wait, &wq);
			}
		}
	} while (mddev->curr_resync < 2);

	j = 0;
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
		/* resync follows the size requested by the personality,
		 * which defaults to physical size, but can be virtual size
		 */
		max_sectors = mddev->resync_max_sectors;
		atomic64_set(&mddev->resync_mismatches, 0);
		/* we don't use the checkpoint if there's a bitmap */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			j = mddev->resync_min;
		else if (!mddev->bitmap)
			j = mddev->recovery_cp;

	} else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
		max_sectors = mddev->resync_max_sectors;
		/*
		 * If the original node aborts reshaping then we continue the
		 * reshaping, so set j again to avoid restarting the reshape
		 * from the very beginning
		 */
		if (mddev_is_clustered(mddev) &&
		    mddev->reshape_position != MaxSector)
			j = mddev->reshape_position;
	} else {
		/* recovery follows the physical size of devices */
		max_sectors = mddev->dev_sectors;
		j = MaxSector;
		rcu_read_lock();
		rdev_for_each_rcu(rdev, mddev)
			if (rdev->raid_disk >= 0 &&
			    !test_bit(Journal, &rdev->flags) &&
			    !test_bit(Faulty, &rdev->flags) &&
			    !test_bit(In_sync, &rdev->flags) &&
			    rdev->recovery_offset < j)
				j = rdev->recovery_offset;
		rcu_read_unlock();

		/* If there is a bitmap, we need to make sure all
		 * writes that started before we added a spare
		 * complete before we start doing a recovery.
		 * Otherwise the write might complete and (via
		 * bitmap_endwrite) set a bit in the bitmap after the
		 * recovery has checked that bit and skipped that
		 * region.
		 */
		if (mddev->bitmap) {
			mddev->pers->quiesce(mddev, 1);
			mddev->pers->quiesce(mddev, 0);
		}
	}

	pr_info("md: %s of RAID array %s\n", desc, mdname(mddev));
	pr_debug("md: minimum _guaranteed_ speed: %d KB/sec/disk.\n", speed_min(mddev));
	pr_debug("md: using maximum available idle IO bandwidth (but not more than %d KB/sec) for %s.\n",
		 speed_max(mddev), desc);

	is_mddev_idle(mddev, 1); /* this initializes IO event counters */

	io_sectors = 0;
	for (m = 0; m < SYNC_MARKS; m++) {
		mark[m] = jiffies;
		mark_cnt[m] = io_sectors;
	}
	last_mark = 0;
	mddev->resync_mark = mark[last_mark];
	mddev->resync_mark_cnt = mark_cnt[last_mark];

	/*
	 * Tune reconstruction:
	 */
	window = 32 * (PAGE_SIZE / 512);
	pr_debug("md: using %dk window, over a total of %lluk.\n",
		 window/2, (unsigned long long)max_sectors/2);

	atomic_set(&mddev->recovery_active, 0);
	last_check = 0;

	if (j > 2) {
		pr_debug("md: resuming %s of %s from checkpoint.\n",
			 desc, mdname(mddev));
		mddev->curr_resync = j;
	} else
		mddev->curr_resync = 3; /* no longer delayed */
	mddev->curr_resync_completed = j;
	sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	md_new_event(mddev);
	update_time = jiffies;

	blk_start_plug(&plug);
	while (j < max_sectors) {
		sector_t sectors;

		skipped = 0;

		if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
		    ((mddev->curr_resync > mddev->curr_resync_completed &&
		      (mddev->curr_resync - mddev->curr_resync_completed)
		      > (max_sectors >> 4)) ||
		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
		     (j - mddev->curr_resync_completed)*2
		     >= mddev->resync_max - mddev->curr_resync_completed ||
		     mddev->curr_resync_completed > mddev->resync_max
			    )) {
			/* time to update curr_resync_completed */
			wait_event(mddev->recovery_wait,
				   atomic_read(&mddev->recovery_active) == 0);
			mddev->curr_resync_completed = j;
			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
			    j > mddev->recovery_cp)
				mddev->recovery_cp = j;
			update_time = jiffies;
			set_bit(MD_SB_CHANGE_CLEAN, &mddev->sb_flags);
			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
		}

		while (j >= mddev->resync_max &&
		       !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
			/* As this condition is controlled by user-space,
			 * we can block indefinitely, so use '_interruptible'
			 * to avoid triggering warnings.
			 */
			flush_signals(current); /* just in case */
			wait_event_interruptible(mddev->recovery_wait,
						 mddev->resync_max > j
						 || test_bit(MD_RECOVERY_INTR,
							     &mddev->recovery));
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		sectors = mddev->pers->sync_request(mddev, j, &skipped);
		if (sectors == 0) {
			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
			break;
		}

		if (!skipped) { /* actual IO requested */
			io_sectors += sectors;
			atomic_add(sectors, &mddev->recovery_active);
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		j += sectors;
		if (j > max_sectors)
			/* when skipping, extra large numbers can be returned. */
			j = max_sectors;
		if (j > 2)
			mddev->curr_resync = j;
		mddev->curr_mark_cnt = io_sectors;
		if (last_check == 0)
			/* this is the earliest that rebuild will be
			 * visible in /proc/mdstat
			 */
			md_new_event(mddev);

		if (last_check + window > io_sectors || j == max_sectors)
			continue;

		last_check = io_sectors;
	repeat:
		if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP)) {
			/* step marks */
			int next = (last_mark+1) % SYNC_MARKS;

			mddev->resync_mark = mark[next];
			mddev->resync_mark_cnt = mark_cnt[next];
			mark[next] = jiffies;
			mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
			last_mark = next;
		}

		if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
			break;

		/*
		 * this loop exits only if we are slower than
		 * the 'hard' speed limit, or the system was IO-idle for
		 * a jiffy.
		 * the system might be non-idle CPU-wise, but we only care
		 * about not overloading the IO subsystem. (things like an
		 * e2fsck being done on the RAID array should execute fast)
		 */
		cond_resched();

		recovery_done = io_sectors - atomic_read(&mddev->recovery_active);
		currspeed = ((unsigned long)(recovery_done - mddev->resync_mark_cnt))/2
			/((jiffies-mddev->resync_mark)/HZ +1) +1;

		if (currspeed > speed_min(mddev)) {
			if (currspeed > speed_max(mddev)) {
				msleep(500);
				goto repeat;
			}
			if (!is_mddev_idle(mddev, 0)) {
				/*
				 * Give other IO more of a chance.
				 * The faster the devices, the less we wait.
				 */
				wait_event(mddev->recovery_wait,
					   !atomic_read(&mddev->recovery_active));
			}
		}
	}
	pr_info("md: %s: %s %s.\n", mdname(mddev), desc,
		test_bit(MD_RECOVERY_INTR, &mddev->recovery)
		? "interrupted" : "done");
	/*
	 * this also signals 'finished resyncing' to md_stop
	 */
	blk_finish_plug(&plug);
	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));

	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    mddev->curr_resync > 3) {
		mddev->curr_resync_completed = mddev->curr_resync;
		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
	}
	mddev->pers->sync_request(mddev, max_sectors, &skipped);

	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
	    mddev->curr_resync > 3) {
		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
			if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
				if (mddev->curr_resync >= mddev->recovery_cp) {
					pr_debug("md: checkpointing %s of %s.\n",
						 desc, mdname(mddev));
					if (test_bit(MD_RECOVERY_ERROR,
						     &mddev->recovery))
						mddev->recovery_cp =
							mddev->curr_resync_completed;
					else
						mddev->recovery_cp =
							mddev->curr_resync;
				}
			} else
				mddev->recovery_cp = MaxSector;
		} else {
			if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
				mddev->curr_resync = MaxSector;
			if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
			    test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) {
				rcu_read_lock();
				rdev_for_each_rcu(rdev, mddev)
					if (rdev->raid_disk >= 0 &&
					    mddev->delta_disks >= 0 &&
					    !test_bit(Journal, &rdev->flags) &&
					    !test_bit(Faulty, &rdev->flags) &&
					    !test_bit(In_sync, &rdev->flags) &&
					    rdev->recovery_offset < mddev->curr_resync)
						rdev->recovery_offset = mddev->curr_resync;
				rcu_read_unlock();
			}
		}
	}
 skip:
	/* set CHANGE_PENDING here since maybe another update is needed,
	 * so other nodes are informed. It should be harmless for normal
	 * raid */
	set_mask_bits(&mddev->sb_flags, 0,
		      BIT(MD_SB_CHANGE_PENDING) | BIT(MD_SB_CHANGE_DEVS));

	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    mddev->delta_disks > 0 &&
	    mddev->pers->finish_reshape &&
	    mddev->pers->size &&
	    mddev->queue) {
		mddev_lock_nointr(mddev);
		md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
		mddev_unlock(mddev);
		if (!mddev_is_clustered(mddev)) {
			set_capacity(mddev->gendisk, mddev->array_sectors);
			revalidate_disk(mddev->gendisk);
		}
	}

	spin_lock(&mddev->lock);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
		/* We completed so min/max setting can be forgotten if used. */
		if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
			mddev->resync_min = 0;
		mddev->resync_max = MaxSector;
	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
		mddev->resync_min = mddev->curr_resync_completed;
	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
	mddev->curr_resync = 0;
	spin_unlock(&mddev->lock);

	wake_up(&resync_wait);
	md_wakeup_thread(mddev->thread);
	return;
}
EXPORT_SYMBOL_GPL(md_do_sync);

static int remove_and_add_spares(struct mddev *mddev,
				 struct md_rdev *this)
{
	struct md_rdev *rdev;
	int spares = 0;
	int removed = 0;
	bool remove_some = false;

	if (this && test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		/* Mustn't remove devices when resync thread is running */
		return 0;

	rdev_for_each(rdev, mddev) {
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    test_bit(Faulty, &rdev->flags) &&
		    atomic_read(&rdev->nr_pending) == 0) {
			/* Faulty non-Blocked devices with nr_pending == 0
			 * never get nr_pending incremented,
			 * never get Faulty cleared, and never get Blocked set.
			 * So we can synchronize_rcu now rather than once per device
			 */
			remove_some = true;
			set_bit(RemoveSynchronized, &rdev->flags);
		}
	}

	if (remove_some)
		synchronize_rcu();
	rdev_for_each(rdev, mddev) {
		if ((this == NULL || rdev == this) &&
		    rdev->raid_disk >= 0 &&
		    !test_bit(Blocked, &rdev->flags) &&
		    ((test_bit(RemoveSynchronized, &rdev->flags) ||
		      (!test_bit(In_sync, &rdev->flags) &&
		       !test_bit(Journal, &rdev->flags))) &&
		     atomic_read(&rdev->nr_pending) == 0)) {
			if (mddev->pers->hot_remove_disk(
				    mddev, rdev) == 0) {
				sysfs_unlink_rdev(mddev, rdev);
				rdev->saved_raid_disk = rdev->raid_disk;
				rdev->raid_disk = -1;
				removed++;
			}
		}
		if (remove_some && test_bit(RemoveSynchronized, &rdev->flags))
			clear_bit(RemoveSynchronized, &rdev->flags);
	}

	if (removed && mddev->kobj.sd)
		sysfs_notify(&mddev->kobj, NULL, "degraded");

	if (this && removed)
		goto no_add;

	rdev_for_each(rdev, mddev) {
		if (this && this != rdev)
			continue;
		if (test_bit(Candidate, &rdev->flags))
			continue;
		if (rdev->raid_disk >= 0 &&
		    !test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Journal, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			spares++;
		if (rdev->raid_disk >= 0)
			continue;
		if (test_bit(Faulty, &rdev->flags))
			continue;
		if (!test_bit(Journal, &rdev->flags)) {
			if (mddev->ro &&
			    ! (rdev->saved_raid_disk >= 0 &&
			       !test_bit(Bitmap_sync, &rdev->flags)))
				continue;

			rdev->recovery_offset = 0;
		}
		if (mddev->pers->hot_add_disk(mddev, rdev) == 0) {
			if (sysfs_link_rdev(mddev, rdev))
				/* failure here is OK */;
			if (!test_bit(Journal, &rdev->flags))
				spares++;
			md_new_event(mddev);
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}
no_add:
	if (removed)
		set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
	return spares;
}
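
/*
 * Note on the two-pass structure above (explanatory sketch, generic names):
 * marking every removable device with RemoveSynchronized first and then
 * issuing a single synchronize_rcu() batches what would otherwise be one
 * grace-period wait per device.  The general shape of the pattern is:
 *
 *	for_each(obj)			// pass 1: mark, no waiting
 *		if (removable(obj))
 *			mark(obj);
 *	synchronize_rcu();		// one grace period covers them all
 *	for_each(obj)			// pass 2: now safe to tear down
 *		if (marked(obj))
 *			tear_down(obj);
 */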

static void md_start_sync(struct work_struct *ws)
{
	struct mddev *mddev = container_of(ws, struct mddev, del_work);

	mddev->sync_thread = md_register_thread(md_do_sync,
						mddev,
						"resync");
	if (!mddev->sync_thread) {
		pr_warn("%s: could not start resync thread...\n",
			mdname(mddev));
		/* leave the spares where they are, it shouldn't hurt */
		clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
		clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
		clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
		clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
		clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
		wake_up(&resync_wait);
		if (test_and_clear_bit(MD_RECOVERY_RECOVER,
				       &mddev->recovery))
			if (mddev->sysfs_action)
				sysfs_notify_dirent_safe(mddev->sysfs_action);
	} else
		md_wakeup_thread(mddev->sync_thread);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
}

/*
 * This routine is regularly called by all per-raid-array threads to
 * deal with generic issues like resync and super-block update.
 * Raid personalities that don't have a thread (linear/raid0) do not
 * need this as they never do any recovery or update the superblock.
 *
 * It does not do any resync itself, but rather "forks" off other threads
 * to do that as needed.
 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
 * "->recovery" and create a thread at ->sync_thread.
 * When the thread finishes it sets MD_RECOVERY_DONE
 * and wakes up this thread which will reap the thread and finish up.
 * This thread also removes any faulty devices (with nr_pending == 0).
 *
 * The overall approach is:
 *  1/ if the superblock needs updating, update it.
 *  2/ If a recovery thread is running, don't do anything else.
 *  3/ If recovery has finished, clean up, possibly marking spares active.
 *  4/ If there are any faulty devices, remove them.
 *  5/ If array is degraded, try to add spare devices.
 *  6/ If array has spares or is not in-sync, start a resync thread.
 */
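/*
 * Illustrative caller sketch ('exampled' is a hypothetical name; see
 * raid1d() and raid5d() for the real thing): a personality daemon
 * typically calls this routine early in its loop, so the housekeeping
 * below runs whenever the array thread wakes up:
 *
 *	static void exampled(struct md_thread *thread)
 *	{
 *		struct mddev *mddev = thread->mddev;
 *
 *		md_check_recovery(mddev);
 *		// ... personality-specific retry and flush work ...
 *	}
 */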
fd01b88c 9114void md_check_recovery(struct mddev *mddev)
1da177e4 9115{
059421e0
N
9116 if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags) && mddev->sb_flags) {
9117 /* Write superblock - thread that called mddev_suspend()
9118 * holds reconfig_mutex for us.
9119 */
9120 set_bit(MD_UPDATING_SB, &mddev->flags);
9121 smp_mb__after_atomic();
9122 if (test_bit(MD_ALLOW_SB_UPDATE, &mddev->flags))
9123 md_update_sb(mddev, 0);
9124 clear_bit_unlock(MD_UPDATING_SB, &mddev->flags);
9125 wake_up(&mddev->sb_wait);
9126 }
9127
68866e42
JB
9128 if (mddev->suspended)
9129 return;
9130
5f40402d 9131 if (mddev->bitmap)
e64e4018 9132 md_bitmap_daemon_work(mddev);
1da177e4 9133
fca4d848 9134 if (signal_pending(current)) {
31a59e34 9135 if (mddev->pers->sync_request && !mddev->external) {
9d48739e
N
9136 pr_debug("md: %s in immediate safe mode\n",
9137 mdname(mddev));
fca4d848
N
9138 mddev->safemode = 2;
9139 }
9140 flush_signals(current);
9141 }
9142
c89a8eee
N
9143 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
9144 return;
1da177e4 9145 if ( ! (
2953079c 9146 (mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
1da177e4 9147 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
fca4d848 9148 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
31a59e34 9149 (mddev->external == 0 && mddev->safemode == 1) ||
4ad23a97 9150 (mddev->safemode == 2
fca4d848 9151 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
1da177e4
LT
9152 ))
9153 return;
fca4d848 9154
df5b89b3 9155 if (mddev_trylock(mddev)) {
b4c4c7b8 9156 int spares = 0;
480523fe 9157 bool try_set_sync = mddev->safemode != 0;
fca4d848 9158
afc1f55c 9159 if (!mddev->external && mddev->safemode == 1)
33182d15
N
9160 mddev->safemode = 0;
9161
c89a8eee 9162 if (mddev->ro) {
ab16bfc7
NB
9163 struct md_rdev *rdev;
9164 if (!mddev->external && mddev->in_sync)
9165 /* 'Blocked' flag not needed as failed devices
9166 * will be recorded if array switched to read/write.
9167 * Leaving it set will prevent the device
9168 * from being removed.
9169 */
9170 rdev_for_each(rdev, mddev)
9171 clear_bit(Blocked, &rdev->flags);
7ceb17e8
N
9172 /* On a read-only array we can:
9173 * - remove failed devices
9174 * - add already-in_sync devices if the array itself
9175 * is in-sync.
9176 * As we only add devices that are already in-sync,
9177 * we can activate the spares immediately.
c89a8eee 9178 */
7ceb17e8 9179 remove_and_add_spares(mddev, NULL);
8313b8e5
N
9180 /* There is no thread, but we need to call
9181 * ->spare_active and clear saved_raid_disk
9182 */
2ac295a5 9183 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
8313b8e5 9184 md_reap_sync_thread(mddev);
a4a3d26d 9185 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
8313b8e5 9186 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2953079c 9187 clear_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags);
c89a8eee
N
9188 goto unlock;
9189 }
9190
659b254f
GJ
9191 if (mddev_is_clustered(mddev)) {
9192 struct md_rdev *rdev;
9193 /* kick the device if another node issued a
9194 * remove disk.
9195 */
9196 rdev_for_each(rdev, mddev) {
9197 if (test_and_clear_bit(ClusterRemove, &rdev->flags) &&
9198 rdev->raid_disk < 0)
9199 md_kick_rdev_from_array(rdev);
9200 }
9201 }
9202
480523fe 9203 if (try_set_sync && !mddev->external && !mddev->in_sync) {
85572d7c 9204 spin_lock(&mddev->lock);
6497709b 9205 set_in_sync(mddev);
85572d7c 9206 spin_unlock(&mddev->lock);
fca4d848 9207 }
fca4d848 9208
2953079c 9209 if (mddev->sb_flags)
850b2b42 9210 md_update_sb(mddev, 0);
06d91a5f 9211
1da177e4
LT
9212 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
9213 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
9214 /* resync/recovery still happening */
9215 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
9216 goto unlock;
9217 }
9218 if (mddev->sync_thread) {
a91d5ac0 9219 md_reap_sync_thread(mddev);
1da177e4
LT
9220 goto unlock;
9221 }
72a23c21
NB
9222 /* Set RUNNING before clearing NEEDED to avoid
9223 * any transients in the value of "sync_action".
9224 */
72f36d59 9225 mddev->curr_resync_completed = 0;
23da422b 9226 spin_lock(&mddev->lock);
72a23c21 9227 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
23da422b 9228 spin_unlock(&mddev->lock);
24dd469d
N
9229 /* Clear some bits that don't mean anything, but
9230 * might be left set
9231 */
24dd469d
N
9232 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
9233 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
1da177e4 9234
ed209584
N
9235 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
9236 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
ac05f256 9237 goto not_running;
1da177e4
LT
9238 /* no recovery is running.
9239 * remove any failed drives, then
9240 * add spares if possible.
72f36d59 9241 * Spares are also removed and re-added, to allow
1da177e4
LT
9242 * the personality to fail the re-add.
9243 */
1da177e4 9244
b4c4c7b8 9245 if (mddev->reshape_position != MaxSector) {
50ac168a
N
9246 if (mddev->pers->check_reshape == NULL ||
9247 mddev->pers->check_reshape(mddev) != 0)
b4c4c7b8 9248 /* Cannot proceed */
ac05f256 9249 goto not_running;
b4c4c7b8 9250 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
72a23c21 9251 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
746d3207 9252 } else if ((spares = remove_and_add_spares(mddev, NULL))) {
24dd469d
N
9253 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
9254 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
56ac36d7 9255 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
72a23c21 9256 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
24dd469d
N
9257 } else if (mddev->recovery_cp < MaxSector) {
9258 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
72a23c21 9259 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
24dd469d
N
9260 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
9261 /* nothing to be done ... */
ac05f256 9262 goto not_running;
24dd469d 9263
1da177e4 9264 if (mddev->pers->sync_request) {
ef99bf48 9265 if (spares) {
a654b9d8
N
9266 /* We are adding a device or devices to an array
9267 * which has the bitmap stored on all devices.
9268 * So make sure all bitmap pages get written
9269 */
e64e4018 9270 md_bitmap_write_all(mddev->bitmap);
a654b9d8 9271 }
ac05f256
N
9272 INIT_WORK(&mddev->del_work, md_start_sync);
9273 queue_work(md_misc_wq, &mddev->del_work);
9274 goto unlock;
1da177e4 9275 }
ac05f256 9276 not_running:
72a23c21
NB
9277 if (!mddev->sync_thread) {
9278 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
f851b60d 9279 wake_up(&resync_wait);
72a23c21
NB
9280 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
9281 &mddev->recovery))
0c3573f1 9282 if (mddev->sysfs_action)
00bcb4ac 9283 sysfs_notify_dirent_safe(mddev->sysfs_action);
72a23c21 9284 }
ac05f256
N
9285 unlock:
9286 wake_up(&mddev->sb_wait);
1da177e4
LT
9287 mddev_unlock(mddev);
9288 }
9289}
6c144d31 9290EXPORT_SYMBOL(md_check_recovery);

void md_reap_sync_thread(struct mddev *mddev)
{
	struct md_rdev *rdev;
	sector_t old_dev_sectors = mddev->dev_sectors;
	bool is_reshaped = false;

	/* resync has finished, collect result */
	md_unregister_thread(&mddev->sync_thread);
	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
	    !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
	    mddev->degraded != mddev->raid_disks) {
		/* success...*/
		/* activate any spares */
		if (mddev->pers->spare_active(mddev)) {
			sysfs_notify(&mddev->kobj, NULL,
				     "degraded");
			set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
		}
	}
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
	    mddev->pers->finish_reshape) {
		mddev->pers->finish_reshape(mddev);
		if (mddev_is_clustered(mddev))
			is_reshaped = true;
	}

	/* If array is no-longer degraded, then any saved_raid_disk
	 * information must be scrapped.
	 */
	if (!mddev->degraded)
		rdev_for_each(rdev, mddev)
			rdev->saved_raid_disk = -1;

	md_update_sb(mddev, 1);
	/* MD_SB_CHANGE_PENDING should be cleared by md_update_sb, so we can
	 * call resync_finish here if MD_CLUSTER_RESYNC_LOCKED is set by
	 * clustered raid */
	if (test_and_clear_bit(MD_CLUSTER_RESYNC_LOCKED, &mddev->flags))
		md_cluster_ops->resync_finish(mddev);
	clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
	clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
	clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
	clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
	clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
	clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
	/*
	 * We call md_cluster_ops->update_size here because sync_size could
	 * be changed by md_update_sb, and MD_RECOVERY_RESHAPE is cleared,
	 * so it is time to update size across cluster.
	 */
	if (mddev_is_clustered(mddev) && is_reshaped
				      && !test_bit(MD_CLOSING, &mddev->flags))
		md_cluster_ops->update_size(mddev, old_dev_sectors);
	wake_up(&resync_wait);
	/* flag recovery needed just to double check */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	sysfs_notify_dirent_safe(mddev->sysfs_action);
	md_new_event(mddev);
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
}
EXPORT_SYMBOL(md_reap_sync_thread);

void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	wait_event_timeout(rdev->blocked_wait,
			   !test_bit(Blocked, &rdev->flags) &&
			   !test_bit(BlockedBadBlocks, &rdev->flags),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);
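
/*
 * Illustrative usage sketch (assumed caller shape; the raid1/raid5 error
 * paths follow it): a personality that must stall on a Blocked device
 * holds a reference while it waits:
 *
 *	atomic_inc(&rdev->nr_pending);		// keep the rdev alive
 *	md_wait_for_blocked_rdev(rdev, mddev);	// drops nr_pending itself
 *
 * Note the asymmetry: the reference taken above is released inside
 * md_wait_for_blocked_rdev() via rdev_dec_pending().
 */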

void md_finish_reshape(struct mddev *mddev)
{
	/* called by the personality module when reshape completes. */
	struct md_rdev *rdev;

	rdev_for_each(rdev, mddev) {
		if (rdev->data_offset > rdev->new_data_offset)
			rdev->sectors += rdev->data_offset - rdev->new_data_offset;
		else
			rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
		rdev->data_offset = rdev->new_data_offset;
	}
}
EXPORT_SYMBOL(md_finish_reshape);

/* Bad block management */

/* Returns 1 on success, 0 on failure */
int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
		       int is_new)
{
	struct mddev *mddev = rdev->mddev;
	int rv;
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
	rv = badblocks_set(&rdev->badblocks, s, sectors, 0);
	if (rv == 0) {
		/* Make sure they get written out promptly */
		if (test_bit(ExternalBbl, &rdev->flags))
			sysfs_notify(&rdev->kobj, NULL,
				     "unacknowledged_bad_blocks");
		sysfs_notify_dirent_safe(rdev->sysfs_state);
		set_mask_bits(&mddev->sb_flags, 0,
			      BIT(MD_SB_CHANGE_CLEAN) | BIT(MD_SB_CHANGE_PENDING));
		md_wakeup_thread(rdev->mddev->thread);
		return 1;
	} else
		return 0;
}
EXPORT_SYMBOL_GPL(rdev_set_badblocks);
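
/*
 * Illustrative usage sketch (assumed error-path shape; compare the raid1
 * narrow_write_error path): a personality that fails I/O on one leg
 * records the range instead of immediately failing the device:
 *
 *	if (!rdev_set_badblocks(rdev, sector, nr_sectors, 0))
 *		md_error(mddev, rdev);	// table full: fail the whole device
 *
 * Note the sense of the return value: 1 means the bad range was recorded,
 * 0 means the bad-block table could not accept it.
 */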

int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
			 int is_new)
{
	int rv;
	if (is_new)
		s += rdev->new_data_offset;
	else
		s += rdev->data_offset;
	rv = badblocks_clear(&rdev->badblocks, s, sectors);
	if ((rv == 0) && test_bit(ExternalBbl, &rdev->flags))
		sysfs_notify(&rdev->kobj, NULL, "bad_blocks");
	return rv;
}
EXPORT_SYMBOL_GPL(rdev_clear_badblocks);

static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
{
	struct list_head *tmp;
	struct mddev *mddev;
	int need_delay = 0;

	for_each_mddev(mddev, tmp) {
		if (mddev_trylock(mddev)) {
			if (mddev->pers)
				__md_stop_writes(mddev);
			if (mddev->persistent)
				mddev->safemode = 2;
			mddev_unlock(mddev);
		}
		need_delay = 1;
	}
	/*
	 * certain more exotic SCSI devices are known to be
	 * volatile wrt too early system reboots.  While the
	 * right place to handle this issue is the given
	 * driver, we do want to have a safe RAID driver ...
	 */
	if (need_delay)
		mdelay(1000*1);

	return NOTIFY_DONE;
}

static struct notifier_block md_notifier = {
	.notifier_call	= md_notify_reboot,
	.next		= NULL,
	.priority	= INT_MAX, /* before any real devices */
};

static void md_geninit(void)
{
	pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));

	proc_create("mdstat", S_IRUGO, NULL, &mdstat_proc_ops);
}

static int __init md_init(void)
{
	int ret = -ENOMEM;

	md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
	if (!md_wq)
		goto err_wq;

	md_misc_wq = alloc_workqueue("md_misc", 0, 0);
	if (!md_misc_wq)
		goto err_misc_wq;

	md_rdev_misc_wq = alloc_workqueue("md_rdev_misc", 0, 0);
	if (!md_rdev_misc_wq)
		goto err_rdev_misc_wq;

	if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
		goto err_md;

	if ((ret = register_blkdev(0, "mdp")) < 0)
		goto err_mdp;
	mdp_major = ret;

	blk_register_region(MKDEV(MD_MAJOR, 0), 512, THIS_MODULE,
			    md_probe, NULL, NULL);
	blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
			    md_probe, NULL, NULL);

	register_reboot_notifier(&md_notifier);
	raid_table_header = register_sysctl_table(raid_root_table);

	md_geninit();
	return 0;

err_mdp:
	unregister_blkdev(MD_MAJOR, "md");
err_md:
	destroy_workqueue(md_rdev_misc_wq);
err_rdev_misc_wq:
	destroy_workqueue(md_misc_wq);
err_misc_wq:
	destroy_workqueue(md_wq);
err_wq:
	return ret;
}

static void check_sb_changes(struct mddev *mddev, struct md_rdev *rdev)
{
	struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
	struct md_rdev *rdev2;
	int role, ret;
	char b[BDEVNAME_SIZE];

	/*
	 * If size is changed in another node then we need to
	 * do resize as well.
	 */
	if (mddev->dev_sectors != le64_to_cpu(sb->size)) {
		ret = mddev->pers->resize(mddev, le64_to_cpu(sb->size));
		if (ret)
			pr_info("md-cluster: resize failed\n");
		else
			md_bitmap_update_sb(mddev->bitmap);
	}

	/* Check for change of roles in the active devices */
	rdev_for_each(rdev2, mddev) {
		if (test_bit(Faulty, &rdev2->flags))
			continue;

		/* Check if the roles changed */
		role = le16_to_cpu(sb->dev_roles[rdev2->desc_nr]);

		if (test_bit(Candidate, &rdev2->flags)) {
			if (role == 0xfffe) {
				pr_info("md: Removing Candidate device %s because add failed\n", bdevname(rdev2->bdev,b));
				md_kick_rdev_from_array(rdev2);
				continue;
			}
			else
				clear_bit(Candidate, &rdev2->flags);
		}

		if (role != rdev2->raid_disk) {
			/*
			 * got activated, unless a reshape is happening.
			 */
			if (rdev2->raid_disk == -1 && role != 0xffff &&
			    !(le32_to_cpu(sb->feature_map) &
			      MD_FEATURE_RESHAPE_ACTIVE)) {
				rdev2->saved_raid_disk = role;
				ret = remove_and_add_spares(mddev, rdev2);
				pr_info("Activated spare: %s\n",
					bdevname(rdev2->bdev,b));
				/* wakeup mddev->thread here, so array could
				 * perform resync with the new activated disk */
				set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
				md_wakeup_thread(mddev->thread);
			}
			/* device faulty
			 * We just want to do the minimum to mark the disk
			 * as faulty. The recovery is performed by the
			 * one who initiated the error.
			 */
			if ((role == 0xfffe) || (role == 0xfffd)) {
				md_error(mddev, rdev2);
				clear_bit(Blocked, &rdev2->flags);
			}
		}
	}

	if (mddev->raid_disks != le32_to_cpu(sb->raid_disks))
		update_raid_disks(mddev, le32_to_cpu(sb->raid_disks));

	/*
	 * Since mddev->delta_disks has already been updated in
	 * update_raid_disks, it is time to check reshape.
	 */
	if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
	    (le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/*
		 * reshape is happening in the remote node, we need to
		 * update reshape_position and call start_reshape.
		 */
		mddev->reshape_position = le64_to_cpu(sb->reshape_position);
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
		if (mddev->pers->start_reshape)
			mddev->pers->start_reshape(mddev);
	} else if (test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) &&
		   mddev->reshape_position != MaxSector &&
		   !(le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
		/* reshape is just done in another node. */
		mddev->reshape_position = MaxSector;
		if (mddev->pers->update_reshape_pos)
			mddev->pers->update_reshape_pos(mddev);
	}

	/* Finally set the event to be up to date */
	mddev->events = le64_to_cpu(sb->events);
}

static int read_rdev(struct mddev *mddev, struct md_rdev *rdev)
{
	int err;
	struct page *swapout = rdev->sb_page;
	struct mdp_superblock_1 *sb;

	/* Store the sb page of the rdev in the swapout temporary
	 * variable in case we err in the future
	 */
	rdev->sb_page = NULL;
	err = alloc_disk_sb(rdev);
	if (err == 0) {
		ClearPageUptodate(rdev->sb_page);
		rdev->sb_loaded = 0;
		err = super_types[mddev->major_version].
			load_super(rdev, NULL, mddev->minor_version);
	}
	if (err < 0) {
		pr_warn("%s: %d Could not reload rdev(%d) err: %d. Restoring old values\n",
			__func__, __LINE__, rdev->desc_nr, err);
		if (rdev->sb_page)
			put_page(rdev->sb_page);
		rdev->sb_page = swapout;
		rdev->sb_loaded = 1;
		return err;
	}

	sb = page_address(rdev->sb_page);
	/* Read the offset unconditionally, even if MD_FEATURE_RECOVERY_OFFSET
	 * is not set
	 */
	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET))
		rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);

	/* The other node finished recovery, call spare_active to set
	 * device In_sync and mddev->degraded
	 */
	if (rdev->recovery_offset == MaxSector &&
	    !test_bit(In_sync, &rdev->flags) &&
	    mddev->pers->spare_active(mddev))
		sysfs_notify(&mddev->kobj, NULL, "degraded");

	put_page(swapout);
	return 0;
}

void md_reload_sb(struct mddev *mddev, int nr)
{
	struct md_rdev *rdev;
	int err;

	/* Find the rdev */
	rdev_for_each_rcu(rdev, mddev) {
		if (rdev->desc_nr == nr)
			break;
	}

	if (!rdev || rdev->desc_nr != nr) {
		pr_warn("%s: %d Could not find rdev with nr %d\n", __func__, __LINE__, nr);
		return;
	}

	err = read_rdev(mddev, rdev);
	if (err < 0)
		return;

	check_sb_changes(mddev, rdev);

	/* Read all rdev's to update recovery_offset */
	rdev_for_each_rcu(rdev, mddev) {
		if (!test_bit(Faulty, &rdev->flags))
			read_rdev(mddev, rdev);
	}
}
EXPORT_SYMBOL(md_reload_sb);

#ifndef MODULE

/*
 * Searches all registered partitions for autorun RAID arrays
 * at boot time.
 */

static DEFINE_MUTEX(detected_devices_mutex);
static LIST_HEAD(all_detected_devices);
struct detected_devices_node {
	struct list_head list;
	dev_t dev;
};

void md_autodetect_dev(dev_t dev)
{
	struct detected_devices_node *node_detected_dev;

	node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
	if (node_detected_dev) {
		node_detected_dev->dev = dev;
		mutex_lock(&detected_devices_mutex);
		list_add_tail(&node_detected_dev->list, &all_detected_devices);
		mutex_unlock(&detected_devices_mutex);
	}
}

void md_autostart_arrays(int part)
{
	struct md_rdev *rdev;
	struct detected_devices_node *node_detected_dev;
	dev_t dev;
	int i_scanned, i_passed;

	i_scanned = 0;
	i_passed = 0;

	pr_info("md: Autodetecting RAID arrays.\n");

	mutex_lock(&detected_devices_mutex);
	while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
		i_scanned++;
		node_detected_dev = list_entry(all_detected_devices.next,
					struct detected_devices_node, list);
		list_del(&node_detected_dev->list);
		dev = node_detected_dev->dev;
		kfree(node_detected_dev);
		mutex_unlock(&detected_devices_mutex);
		rdev = md_import_device(dev, 0, 90);
		mutex_lock(&detected_devices_mutex);
		if (IS_ERR(rdev))
			continue;

		if (test_bit(Faulty, &rdev->flags))
			continue;

		set_bit(AutoDetected, &rdev->flags);
		list_add(&rdev->same_set, &pending_raid_disks);
		i_passed++;
	}
	mutex_unlock(&detected_devices_mutex);

	pr_debug("md: Scanned %d and added %d devices.\n", i_scanned, i_passed);

	autorun_devices(part);
}

#endif /* !MODULE */

static __exit void md_exit(void)
{
	struct mddev *mddev;
	struct list_head *tmp;
	int delay = 1;

	blk_unregister_region(MKDEV(MD_MAJOR,0), 512);
	blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);

	unregister_blkdev(MD_MAJOR,"md");
	unregister_blkdev(mdp_major, "mdp");
	unregister_reboot_notifier(&md_notifier);
	unregister_sysctl_table(raid_table_header);

	/* We cannot unload the modules while some process is
	 * waiting for us in select() or poll() - wake them up
	 */
	md_unloading = 1;
	while (waitqueue_active(&md_event_waiters)) {
		/* not safe to leave yet */
		wake_up(&md_event_waiters);
		msleep(delay);
		delay += delay;
	}
	remove_proc_entry("mdstat", NULL);

	for_each_mddev(mddev, tmp) {
		export_array(mddev);
		mddev->ctime = 0;
		mddev->hold_active = 0;
		/*
		 * for_each_mddev() will call mddev_put() at the end of each
		 * iteration.  As the mddev is now fully clear, this will
		 * schedule the mddev for destruction by a workqueue, and the
		 * destroy_workqueue() below will wait for that to complete.
		 */
	}
	destroy_workqueue(md_rdev_misc_wq);
	destroy_workqueue(md_misc_wq);
	destroy_workqueue(md_wq);
}

subsys_initcall(md_init);
module_exit(md_exit)

static int get_ro(char *buffer, const struct kernel_param *kp)
{
	return sprintf(buffer, "%d\n", start_readonly);
}
static int set_ro(const char *val, const struct kernel_param *kp)
{
	return kstrtouint(val, 10, (unsigned int *)&start_readonly);
}

module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
module_param(create_on_open, bool, S_IRUSR|S_IWUSR);

MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MD RAID framework");
MODULE_ALIAS("md");
MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);