1da177e4
LT
1/*
2 * raid1.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
5 *
6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7 *
8 * RAID-1 management functions.
9 *
10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11 *
96de0e25 12 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
1da177e4
LT
13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14 *
191ea9b2
N
15 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
16 * bitmapped intelligence in resync:
17 *
18 * - bitmap marked during normal i/o
19 * - bitmap used to skip nondirty blocks during sync
20 *
21 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
22 * - persistent bitmap code
23 *
1da177e4
LT
24 * This program is free software; you can redistribute it and/or modify
25 * it under the terms of the GNU General Public License as published by
26 * the Free Software Foundation; either version 2, or (at your option)
27 * any later version.
28 *
29 * You should have received a copy of the GNU General Public License
30 * (for example /usr/src/linux/COPYING); if not, write to the Free
31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
32 */
33
5a0e3ad6 34#include <linux/slab.h>
25570727 35#include <linux/delay.h>
bff61975 36#include <linux/blkdev.h>
056075c7 37#include <linux/module.h>
bff61975 38#include <linux/seq_file.h>
8bda470e 39#include <linux/ratelimit.h>
3f07c014
IM
40#include <linux/sched/signal.h>
41
109e3765 42#include <trace/events/block.h>
3f07c014 43
43b2e5d8 44#include "md.h"
ef740c37
CH
45#include "raid1.h"
46#include "bitmap.h"
191ea9b2 47
394ed8e4
SL
48#define UNSUPPORTED_MDDEV_FLAGS \
49 ((1L << MD_HAS_JOURNAL) | \
ea0213e0
AP
50 (1L << MD_JOURNAL_CLEAN) | \
51 (1L << MD_HAS_PPL))
394ed8e4 52
1da177e4
LT
53/*
54 * Number of guaranteed r1bios in case of extreme VM load:
55 */
56#define NR_RAID1_BIOS 256
57
473e87ce
JB
58/* when we get a read error on a read-only array, we redirect to another
59 * device without failing the first device, or trying to over-write to
60 * correct the read error. To keep track of bad blocks on a per-bio
61 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
62 */
63#define IO_BLOCKED ((struct bio *)1)
64/* When we successfully write to a known bad-block, we need to remove the
65 * bad-block marking which must be done from process context. So we record
66 * the success by setting devs[n].bio to IO_MADE_GOOD
67 */
68#define IO_MADE_GOOD ((struct bio *)2)
69
70#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
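/*
 * Note that BIO_SPECIAL(bio) is true for NULL (0), IO_BLOCKED (1) and
 * IO_MADE_GOOD (2), i.e. for every bios[n] value that is not a pointer
 * to a real struct bio; put_all_bios() below relies on this to skip
 * bio_put() for the marker values.
 */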
71
34db0cd6
N
72/* When there are this many requests queued to be written by
73 * the raid1 thread, we become 'congested' to provide back-pressure
74 * for writeback.
75 */
76static int max_queued_requests = 1024;
1da177e4 77
fd76863e 78static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
79static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
1da177e4 80
578b54ad
N
81#define raid1_log(md, fmt, args...) \
82 do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
83
dd0fc66f 84static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
1da177e4
LT
85{
86 struct pool_info *pi = data;
9f2c9d12 87 int size = offsetof(struct r1bio, bios[pi->raid_disks]);
1da177e4
LT
88
89 /* allocate a r1bio with room for raid_disks entries in the bios array */
7eaceacc 90 return kzalloc(size, gfp_flags);
1da177e4
LT
91}
92
93static void r1bio_pool_free(void *r1_bio, void *data)
94{
95 kfree(r1_bio);
96}
97
8e005f7c 98#define RESYNC_DEPTH 32
1da177e4 99#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
8e005f7c 100#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
101#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
c40f341f
GR
102#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
103#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9)
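/*
 * Worked sizes for the windows above, assuming RESYNC_BLOCK_SIZE is
 * 64KiB (RESYNC_BLOCK_SIZE and RESYNC_PAGES now live in md.h):
 *
 *	RESYNC_SECTORS			128 sectors	(64KiB >> 9)
 *	RESYNC_WINDOW			2MiB		(64KiB * RESYNC_DEPTH)
 *	RESYNC_WINDOW_SECTORS		4096 sectors
 *	CLUSTER_RESYNC_WINDOW		32MiB		(16 * RESYNC_WINDOW)
 *	CLUSTER_RESYNC_WINDOW_SECTORS	65536 sectors
 */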
1da177e4 104
dd0fc66f 105static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
1da177e4
LT
106{
107 struct pool_info *pi = data;
9f2c9d12 108 struct r1bio *r1_bio;
1da177e4 109 struct bio *bio;
da1aab3d 110 int need_pages;
1da177e4
LT
111 int i, j;
112
113 r1_bio = r1bio_pool_alloc(gfp_flags, pi);
7eaceacc 114 if (!r1_bio)
1da177e4 115 return NULL;
1da177e4
LT
116
117 /*
118 * Allocate bios : 1 for reading, n-1 for writing
119 */
120 for (j = pi->raid_disks ; j-- ; ) {
6746557f 121 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
1da177e4
LT
122 if (!bio)
123 goto out_free_bio;
124 r1_bio->bios[j] = bio;
125 }
126 /*
127 * Allocate RESYNC_PAGES data pages and attach them to
d11c171e
N
128 * the first bio.
129 * If this is a user-requested check/repair, allocate
130 * RESYNC_PAGES for each bio.
1da177e4 131 */
d11c171e 132 if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
da1aab3d 133 need_pages = pi->raid_disks;
d11c171e 134 else
da1aab3d
N
135 need_pages = 1;
136 for (j = 0; j < need_pages; j++) {
d11c171e 137 bio = r1_bio->bios[j];
a0787606 138 bio->bi_vcnt = RESYNC_PAGES;
d11c171e 139
a0787606 140 if (bio_alloc_pages(bio, gfp_flags))
da1aab3d 141 goto out_free_pages;
d11c171e
N
142 }
143 /* If not user-requests, copy the page pointers to all bios */
144 if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
145 for (i=0; i<RESYNC_PAGES ; i++)
146 for (j=1; j<pi->raid_disks; j++)
147 r1_bio->bios[j]->bi_io_vec[i].bv_page =
148 r1_bio->bios[0]->bi_io_vec[i].bv_page;
1da177e4
LT
149 }
150
151 r1_bio->master_bio = NULL;
152
153 return r1_bio;
154
da1aab3d 155out_free_pages:
491221f8
GJ
156 while (--j >= 0)
157 bio_free_pages(r1_bio->bios[j]);
da1aab3d 158
1da177e4 159out_free_bio:
8f19ccb2 160 while (++j < pi->raid_disks)
1da177e4
LT
161 bio_put(r1_bio->bios[j]);
162 r1bio_pool_free(r1_bio, data);
163 return NULL;
164}
165
166static void r1buf_pool_free(void *__r1_bio, void *data)
167{
168 struct pool_info *pi = data;
d11c171e 169 int i,j;
9f2c9d12 170 struct r1bio *r1bio = __r1_bio;
1da177e4 171
d11c171e
N
172 for (i = 0; i < RESYNC_PAGES; i++)
173 for (j = pi->raid_disks; j-- ;) {
174 if (j == 0 ||
175 r1bio->bios[j]->bi_io_vec[i].bv_page !=
176 r1bio->bios[0]->bi_io_vec[i].bv_page)
1345b1d8 177 safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
d11c171e 178 }
1da177e4
LT
179 for (i=0 ; i < pi->raid_disks; i++)
180 bio_put(r1bio->bios[i]);
181
182 r1bio_pool_free(r1bio, data);
183}
184
e8096360 185static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
1da177e4
LT
186{
187 int i;
188
8f19ccb2 189 for (i = 0; i < conf->raid_disks * 2; i++) {
1da177e4 190 struct bio **bio = r1_bio->bios + i;
4367af55 191 if (!BIO_SPECIAL(*bio))
1da177e4
LT
192 bio_put(*bio);
193 *bio = NULL;
194 }
195}
196
9f2c9d12 197static void free_r1bio(struct r1bio *r1_bio)
1da177e4 198{
e8096360 199 struct r1conf *conf = r1_bio->mddev->private;
1da177e4 200
1da177e4
LT
201 put_all_bios(conf, r1_bio);
202 mempool_free(r1_bio, conf->r1bio_pool);
203}
204
9f2c9d12 205static void put_buf(struct r1bio *r1_bio)
1da177e4 206{
e8096360 207 struct r1conf *conf = r1_bio->mddev->private;
af5f42a7 208 sector_t sect = r1_bio->sector;
3e198f78
N
209 int i;
210
8f19ccb2 211 for (i = 0; i < conf->raid_disks * 2; i++) {
3e198f78
N
212 struct bio *bio = r1_bio->bios[i];
213 if (bio->bi_end_io)
214 rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
215 }
1da177e4
LT
216
217 mempool_free(r1_bio, conf->r1buf_pool);
218
af5f42a7 219 lower_barrier(conf, sect);
1da177e4
LT
220}
221
9f2c9d12 222static void reschedule_retry(struct r1bio *r1_bio)
1da177e4
LT
223{
224 unsigned long flags;
fd01b88c 225 struct mddev *mddev = r1_bio->mddev;
e8096360 226 struct r1conf *conf = mddev->private;
fd76863e 227 int idx;
1da177e4 228
fd76863e 229 idx = sector_to_idx(r1_bio->sector);
1da177e4
LT
230 spin_lock_irqsave(&conf->device_lock, flags);
231 list_add(&r1_bio->retry_list, &conf->retry_list);
824e47da 232 atomic_inc(&conf->nr_queued[idx]);
1da177e4
LT
233 spin_unlock_irqrestore(&conf->device_lock, flags);
234
17999be4 235 wake_up(&conf->wait_barrier);
1da177e4
LT
236 md_wakeup_thread(mddev->thread);
237}
238
239/*
240 * raid_end_bio_io() is called when we have finished servicing a mirrored
241 * operation and are ready to return a success/failure code to the buffer
242 * cache layer.
243 */
9f2c9d12 244static void call_bio_endio(struct r1bio *r1_bio)
d2eb35ac
N
245{
246 struct bio *bio = r1_bio->master_bio;
e8096360 247 struct r1conf *conf = r1_bio->mddev->private;
d2eb35ac
N
248
249 if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
4246a0b6
CH
250 bio->bi_error = -EIO;
251
37011e3a
N
252 bio_endio(bio);
253 /*
254 * Wake up any possible resync thread that waits for the device
255 * to go idle.
256 */
257 allow_barrier(conf, r1_bio->sector);
d2eb35ac
N
258}
259
9f2c9d12 260static void raid_end_bio_io(struct r1bio *r1_bio)
1da177e4
LT
261{
262 struct bio *bio = r1_bio->master_bio;
263
4b6d287f
N
264 /* if nobody has done the final endio yet, do it now */
265 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
36a4e1fe
N
266 pr_debug("raid1: sync end %s on sectors %llu-%llu\n",
267 (bio_data_dir(bio) == WRITE) ? "write" : "read",
4f024f37
KO
268 (unsigned long long) bio->bi_iter.bi_sector,
269 (unsigned long long) bio_end_sector(bio) - 1);
4b6d287f 270
d2eb35ac 271 call_bio_endio(r1_bio);
4b6d287f 272 }
1da177e4
LT
273 free_r1bio(r1_bio);
274}
275
276/*
277 * Update disk head position estimator based on IRQ completion info.
278 */
9f2c9d12 279static inline void update_head_pos(int disk, struct r1bio *r1_bio)
1da177e4 280{
e8096360 281 struct r1conf *conf = r1_bio->mddev->private;
1da177e4
LT
282
283 conf->mirrors[disk].head_position =
284 r1_bio->sector + (r1_bio->sectors);
285}
286
ba3ae3be
NK
287/*
288 * Find the disk number which triggered given bio
289 */
9f2c9d12 290static int find_bio_disk(struct r1bio *r1_bio, struct bio *bio)
ba3ae3be
NK
291{
292 int mirror;
30194636
N
293 struct r1conf *conf = r1_bio->mddev->private;
294 int raid_disks = conf->raid_disks;
ba3ae3be 295
8f19ccb2 296 for (mirror = 0; mirror < raid_disks * 2; mirror++)
ba3ae3be
NK
297 if (r1_bio->bios[mirror] == bio)
298 break;
299
8f19ccb2 300 BUG_ON(mirror == raid_disks * 2);
ba3ae3be
NK
301 update_head_pos(mirror, r1_bio);
302
303 return mirror;
304}
305
4246a0b6 306static void raid1_end_read_request(struct bio *bio)
1da177e4 307{
4246a0b6 308 int uptodate = !bio->bi_error;
9f2c9d12 309 struct r1bio *r1_bio = bio->bi_private;
e8096360 310 struct r1conf *conf = r1_bio->mddev->private;
e5872d58 311 struct md_rdev *rdev = conf->mirrors[r1_bio->read_disk].rdev;
1da177e4 312
1da177e4
LT
313 /*
314 * this branch is our 'one mirror IO has finished' event handler:
315 */
e5872d58 316 update_head_pos(r1_bio->read_disk, r1_bio);
ddaf22ab 317
dd00a99e
N
318 if (uptodate)
319 set_bit(R1BIO_Uptodate, &r1_bio->state);
2e52d449
N
320 else if (test_bit(FailFast, &rdev->flags) &&
321 test_bit(R1BIO_FailFast, &r1_bio->state))
322 /* This was a fail-fast read so we definitely
323 * want to retry */
324 ;
dd00a99e
N
325 else {
326 /* If all other devices have failed, we want to return
327 * the error upwards rather than fail the last device.
328 * Here we redefine "uptodate" to mean "Don't want to retry"
1da177e4 329 */
dd00a99e
N
330 unsigned long flags;
331 spin_lock_irqsave(&conf->device_lock, flags);
332 if (r1_bio->mddev->degraded == conf->raid_disks ||
333 (r1_bio->mddev->degraded == conf->raid_disks-1 &&
e5872d58 334 test_bit(In_sync, &rdev->flags)))
dd00a99e
N
335 uptodate = 1;
336 spin_unlock_irqrestore(&conf->device_lock, flags);
337 }
1da177e4 338
7ad4d4a6 339 if (uptodate) {
1da177e4 340 raid_end_bio_io(r1_bio);
e5872d58 341 rdev_dec_pending(rdev, conf->mddev);
7ad4d4a6 342 } else {
1da177e4
LT
343 /*
344 * oops, read error:
345 */
346 char b[BDEVNAME_SIZE];
1d41c216
N
347 pr_err_ratelimited("md/raid1:%s: %s: rescheduling sector %llu\n",
348 mdname(conf->mddev),
349 bdevname(rdev->bdev, b),
350 (unsigned long long)r1_bio->sector);
d2eb35ac 351 set_bit(R1BIO_ReadError, &r1_bio->state);
1da177e4 352 reschedule_retry(r1_bio);
7ad4d4a6 353 /* don't drop the reference on read_disk yet */
1da177e4 354 }
1da177e4
LT
355}
356
9f2c9d12 357static void close_write(struct r1bio *r1_bio)
cd5ff9a1
N
358{
359 /* it really is the end of this request */
360 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
361 /* free extra copy of the data pages */
362 int i = r1_bio->behind_page_count;
363 while (i--)
364 safe_put_page(r1_bio->behind_bvecs[i].bv_page);
365 kfree(r1_bio->behind_bvecs);
366 r1_bio->behind_bvecs = NULL;
367 }
368 /* clear the bitmap if all writes complete successfully */
369 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
370 r1_bio->sectors,
371 !test_bit(R1BIO_Degraded, &r1_bio->state),
372 test_bit(R1BIO_BehindIO, &r1_bio->state));
373 md_write_end(r1_bio->mddev);
374}
375
9f2c9d12 376static void r1_bio_write_done(struct r1bio *r1_bio)
4e78064f 377{
cd5ff9a1
N
378 if (!atomic_dec_and_test(&r1_bio->remaining))
379 return;
380
381 if (test_bit(R1BIO_WriteError, &r1_bio->state))
382 reschedule_retry(r1_bio);
383 else {
384 close_write(r1_bio);
4367af55
N
385 if (test_bit(R1BIO_MadeGood, &r1_bio->state))
386 reschedule_retry(r1_bio);
387 else
388 raid_end_bio_io(r1_bio);
4e78064f
N
389 }
390}
391
4246a0b6 392static void raid1_end_write_request(struct bio *bio)
1da177e4 393{
9f2c9d12 394 struct r1bio *r1_bio = bio->bi_private;
e5872d58 395 int behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
e8096360 396 struct r1conf *conf = r1_bio->mddev->private;
04b857f7 397 struct bio *to_put = NULL;
e5872d58
N
398 int mirror = find_bio_disk(r1_bio, bio);
399 struct md_rdev *rdev = conf->mirrors[mirror].rdev;
e3f948cd
SL
400 bool discard_error;
401
402 discard_error = bio->bi_error && bio_op(bio) == REQ_OP_DISCARD;
1da177e4 403
e9c7469b
TH
404 /*
405 * 'one mirror IO has finished' event handler:
406 */
e3f948cd 407 if (bio->bi_error && !discard_error) {
e5872d58
N
408 set_bit(WriteErrorSeen, &rdev->flags);
409 if (!test_and_set_bit(WantReplacement, &rdev->flags))
19d67169
N
410 set_bit(MD_RECOVERY_NEEDED, &
411 conf->mddev->recovery);
412
212e7eb7
N
413 if (test_bit(FailFast, &rdev->flags) &&
414 (bio->bi_opf & MD_FAILFAST) &&
415 /* We never try FailFast to WriteMostly devices */
416 !test_bit(WriteMostly, &rdev->flags)) {
417 md_error(r1_bio->mddev, rdev);
418 if (!test_bit(Faulty, &rdev->flags))
419 /* This is the only remaining device,
420 * We need to retry the write without
421 * FailFast
422 */
423 set_bit(R1BIO_WriteError, &r1_bio->state);
424 else {
425 /* Finished with this branch */
426 r1_bio->bios[mirror] = NULL;
427 to_put = bio;
428 }
429 } else
430 set_bit(R1BIO_WriteError, &r1_bio->state);
4367af55 431 } else {
1da177e4 432 /*
e9c7469b
TH
433 * Set R1BIO_Uptodate in our master bio, so that we
434 * will return a good error code to the higher
435 * levels even if IO on some other mirrored buffer
436 * fails.
437 *
438 * The 'master' represents the composite IO operation
439 * to user-side. So if something waits for IO, then it
440 * will wait for the 'master' bio.
1da177e4 441 */
4367af55
N
442 sector_t first_bad;
443 int bad_sectors;
444
cd5ff9a1
N
445 r1_bio->bios[mirror] = NULL;
446 to_put = bio;
3056e3ae
AL
447 /*
448 * Do not set R1BIO_Uptodate if the current device is
449 * rebuilding or Faulty. This is because we cannot use
450 * such a device for properly reading the data back (we could
451 * potentially use it if the current write fell before
452 * rdev->recovery_offset, but for simplicity we don't check
453 * this here).
454 */
e5872d58
N
455 if (test_bit(In_sync, &rdev->flags) &&
456 !test_bit(Faulty, &rdev->flags))
3056e3ae 457 set_bit(R1BIO_Uptodate, &r1_bio->state);
e9c7469b 458
4367af55 459 /* Maybe we can clear some bad blocks. */
e5872d58 460 if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
e3f948cd 461 &first_bad, &bad_sectors) && !discard_error) {
4367af55
N
462 r1_bio->bios[mirror] = IO_MADE_GOOD;
463 set_bit(R1BIO_MadeGood, &r1_bio->state);
464 }
465 }
466
e9c7469b 467 if (behind) {
e5872d58 468 if (test_bit(WriteMostly, &rdev->flags))
e9c7469b
TH
469 atomic_dec(&r1_bio->behind_remaining);
470
471 /*
472 * In behind mode, we ACK the master bio once the I/O
473 * has safely reached all non-writemostly
474 * disks. Setting the Returned bit ensures that this
475 * gets done only once -- we don't ever want to return
476 * -EIO here, instead we'll wait
477 */
478 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
479 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
480 /* Maybe we can return now */
481 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
482 struct bio *mbio = r1_bio->master_bio;
36a4e1fe
N
483 pr_debug("raid1: behind end write sectors"
484 " %llu-%llu\n",
4f024f37
KO
485 (unsigned long long) mbio->bi_iter.bi_sector,
486 (unsigned long long) bio_end_sector(mbio) - 1);
d2eb35ac 487 call_bio_endio(r1_bio);
4b6d287f
N
488 }
489 }
490 }
4367af55 491 if (r1_bio->bios[mirror] == NULL)
e5872d58 492 rdev_dec_pending(rdev, conf->mddev);
e9c7469b 493
1da177e4 494 /*
1da177e4
LT
495 * Let's see if all mirrored write operations have finished
496 * already.
497 */
af6d7b76 498 r1_bio_write_done(r1_bio);
c70810b3 499
04b857f7
N
500 if (to_put)
501 bio_put(to_put);
1da177e4
LT
502}
503
fd76863e 504static sector_t align_to_barrier_unit_end(sector_t start_sector,
505 sector_t sectors)
506{
507 sector_t len;
508
509 WARN_ON(sectors == 0);
510 /*
511 * len is the number of sectors from start_sector to end of the
512 * barrier unit which start_sector belongs to.
513 */
514 len = round_up(start_sector + 1, BARRIER_UNIT_SECTOR_SIZE) -
515 start_sector;
516
517 if (len > sectors)
518 len = sectors;
519
520 return len;
521}
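/*
 * Worked example, assuming BARRIER_UNIT_SECTOR_SIZE is 1 << 17 sectors
 * (64MiB) as defined in raid1.h:
 *
 *	start_sector = 131000, sectors = 1024
 *	round_up(131001, 131072) = 131072
 *	len = 131072 - 131000 = 72
 *
 * 72 < 1024, so the request is clipped to 72 sectors; the remainder is
 * submitted as a separate bio in the next barrier unit, which is what
 * raid1_make_request() arranges via bio_split().
 */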
522
1da177e4
LT
523/*
524 * This routine returns the disk from which the requested read should
525 * be done. There is a per-array 'next expected sequential IO' sector
526 * number - if this matches on the next IO then we use the last disk.
527 * There is also a per-disk 'last known head position' sector that is
528 * maintained from IRQ contexts; both the normal and the resync IO
529 * completion handlers update this position correctly. If there is no
530 * perfect sequential match then we pick the disk whose head is closest.
531 *
532 * If there are 2 mirrors in the same 2 devices, performance degrades
533 * because position is tracked per mirror, not per device.
534 *
535 * The rdev for the device selected will have nr_pending incremented.
536 */
e8096360 537static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
1da177e4 538{
af3a2cd6 539 const sector_t this_sector = r1_bio->sector;
d2eb35ac
N
540 int sectors;
541 int best_good_sectors;
9dedf603
SL
542 int best_disk, best_dist_disk, best_pending_disk;
543 int has_nonrot_disk;
be4d3280 544 int disk;
76073054 545 sector_t best_dist;
9dedf603 546 unsigned int min_pending;
3cb03002 547 struct md_rdev *rdev;
f3ac8bf7 548 int choose_first;
12cee5a8 549 int choose_next_idle;
1da177e4
LT
550
551 rcu_read_lock();
552 /*
8ddf9efe 553 * Check if we can balance. We can balance on the whole
1da177e4
LT
554 * device if no resync is going on, or below the resync window.
555 * We take the first readable disk when above the resync window.
556 */
557 retry:
d2eb35ac 558 sectors = r1_bio->sectors;
76073054 559 best_disk = -1;
9dedf603 560 best_dist_disk = -1;
76073054 561 best_dist = MaxSector;
9dedf603
SL
562 best_pending_disk = -1;
563 min_pending = UINT_MAX;
d2eb35ac 564 best_good_sectors = 0;
9dedf603 565 has_nonrot_disk = 0;
12cee5a8 566 choose_next_idle = 0;
2e52d449 567 clear_bit(R1BIO_FailFast, &r1_bio->state);
d2eb35ac 568
7d49ffcf
GR
569 if ((conf->mddev->recovery_cp < this_sector + sectors) ||
570 (mddev_is_clustered(conf->mddev) &&
90382ed9 571 md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
7d49ffcf
GR
572 this_sector + sectors)))
573 choose_first = 1;
574 else
575 choose_first = 0;
1da177e4 576
be4d3280 577 for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
76073054 578 sector_t dist;
d2eb35ac
N
579 sector_t first_bad;
580 int bad_sectors;
9dedf603 581 unsigned int pending;
12cee5a8 582 bool nonrot;
d2eb35ac 583
f3ac8bf7
N
584 rdev = rcu_dereference(conf->mirrors[disk].rdev);
585 if (r1_bio->bios[disk] == IO_BLOCKED
586 || rdev == NULL
76073054 587 || test_bit(Faulty, &rdev->flags))
f3ac8bf7 588 continue;
76073054
N
589 if (!test_bit(In_sync, &rdev->flags) &&
590 rdev->recovery_offset < this_sector + sectors)
1da177e4 591 continue;
76073054
N
592 if (test_bit(WriteMostly, &rdev->flags)) {
593 /* Don't balance among write-mostly, just
594 * use the first as a last resort */
d1901ef0 595 if (best_dist_disk < 0) {
307729c8
N
596 if (is_badblock(rdev, this_sector, sectors,
597 &first_bad, &bad_sectors)) {
816b0acf 598 if (first_bad <= this_sector)
307729c8
N
599 /* Cannot use this */
600 continue;
601 best_good_sectors = first_bad - this_sector;
602 } else
603 best_good_sectors = sectors;
d1901ef0
TH
604 best_dist_disk = disk;
605 best_pending_disk = disk;
307729c8 606 }
76073054
N
607 continue;
608 }
609 /* This is a reasonable device to use. It might
610 * even be best.
611 */
d2eb35ac
N
612 if (is_badblock(rdev, this_sector, sectors,
613 &first_bad, &bad_sectors)) {
614 if (best_dist < MaxSector)
615 /* already have a better device */
616 continue;
617 if (first_bad <= this_sector) {
618 /* cannot read here. If this is the 'primary'
619 * device, then we must not read beyond
620 * bad_sectors from another device..
621 */
622 bad_sectors -= (this_sector - first_bad);
623 if (choose_first && sectors > bad_sectors)
624 sectors = bad_sectors;
625 if (best_good_sectors > sectors)
626 best_good_sectors = sectors;
627
628 } else {
629 sector_t good_sectors = first_bad - this_sector;
630 if (good_sectors > best_good_sectors) {
631 best_good_sectors = good_sectors;
632 best_disk = disk;
633 }
634 if (choose_first)
635 break;
636 }
637 continue;
638 } else
639 best_good_sectors = sectors;
640
2e52d449
N
641 if (best_disk >= 0)
642 /* At least two disks to choose from so failfast is OK */
643 set_bit(R1BIO_FailFast, &r1_bio->state);
644
12cee5a8
SL
645 nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
646 has_nonrot_disk |= nonrot;
9dedf603 647 pending = atomic_read(&rdev->nr_pending);
76073054 648 dist = abs(this_sector - conf->mirrors[disk].head_position);
12cee5a8 649 if (choose_first) {
76073054 650 best_disk = disk;
1da177e4
LT
651 break;
652 }
12cee5a8
SL
653 /* Don't change to another disk for sequential reads */
654 if (conf->mirrors[disk].next_seq_sect == this_sector
655 || dist == 0) {
656 int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
657 struct raid1_info *mirror = &conf->mirrors[disk];
658
659 best_disk = disk;
660 /*
661 * If buffered sequential IO size exceeds optimal
662 * iosize, check if there is idle disk. If yes, choose
663 * the idle disk. read_balance could already choose an
664 * idle disk before noticing it's a sequential IO in
665 * this disk. This doesn't matter because this disk
666 * will idle; next time it will be utilized after the
667 * first disk's IO size exceeds the optimal iosize. In
668 * this way, the iosize of the first disk will be at least
669 * the optimal iosize. The iosize of the second disk might be
670 * small, but not a big deal since when the second disk
671 * starts IO, the first disk is likely still busy.
672 */
673 if (nonrot && opt_iosize > 0 &&
674 mirror->seq_start != MaxSector &&
675 mirror->next_seq_sect > opt_iosize &&
676 mirror->next_seq_sect - opt_iosize >=
677 mirror->seq_start) {
678 choose_next_idle = 1;
679 continue;
680 }
681 break;
682 }
12cee5a8
SL
683
684 if (choose_next_idle)
685 continue;
9dedf603
SL
686
687 if (min_pending > pending) {
688 min_pending = pending;
689 best_pending_disk = disk;
690 }
691
76073054
N
692 if (dist < best_dist) {
693 best_dist = dist;
9dedf603 694 best_dist_disk = disk;
1da177e4 695 }
f3ac8bf7 696 }
1da177e4 697
9dedf603
SL
698 /*
699 * If all disks are rotational, choose the closest disk. If any disk is
700 * non-rotational, choose the disk with fewer pending requests even if the
701 * disk is rotational, which might or might not be optimal for raids with
702 * mixed rotational/non-rotational disks depending on workload.
703 */
704 if (best_disk == -1) {
2e52d449 705 if (has_nonrot_disk || min_pending == 0)
9dedf603
SL
706 best_disk = best_pending_disk;
707 else
708 best_disk = best_dist_disk;
709 }
710
76073054
N
711 if (best_disk >= 0) {
712 rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
8ddf9efe
N
713 if (!rdev)
714 goto retry;
715 atomic_inc(&rdev->nr_pending);
d2eb35ac 716 sectors = best_good_sectors;
12cee5a8
SL
717
718 if (conf->mirrors[best_disk].next_seq_sect != this_sector)
719 conf->mirrors[best_disk].seq_start = this_sector;
720
be4d3280 721 conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
1da177e4
LT
722 }
723 rcu_read_unlock();
d2eb35ac 724 *max_sectors = sectors;
1da177e4 725
76073054 726 return best_disk;
1da177e4
LT
727}
728
5c675f83 729static int raid1_congested(struct mddev *mddev, int bits)
0d129228 730{
e8096360 731 struct r1conf *conf = mddev->private;
0d129228
N
732 int i, ret = 0;
733
4452226e 734 if ((bits & (1 << WB_async_congested)) &&
34db0cd6
N
735 conf->pending_count >= max_queued_requests)
736 return 1;
737
0d129228 738 rcu_read_lock();
f53e29fc 739 for (i = 0; i < conf->raid_disks * 2; i++) {
3cb03002 740 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
0d129228 741 if (rdev && !test_bit(Faulty, &rdev->flags)) {
165125e1 742 struct request_queue *q = bdev_get_queue(rdev->bdev);
0d129228 743
1ed7242e
JB
744 BUG_ON(!q);
745
0d129228
N
746 /* Note the '|| 1' - when read_balance prefers
747 * non-congested targets, it can be removed
748 */
4452226e 749 if ((bits & (1 << WB_async_congested)) || 1)
dc3b17cc 750 ret |= bdi_congested(q->backing_dev_info, bits);
0d129228 751 else
dc3b17cc 752 ret &= bdi_congested(q->backing_dev_info, bits);
0d129228
N
753 }
754 }
755 rcu_read_unlock();
756 return ret;
757}
0d129228 758
e8096360 759static void flush_pending_writes(struct r1conf *conf)
a35e63ef
N
760{
761 /* Any writes that have been queued but are awaiting
762 * bitmap updates get flushed here.
a35e63ef 763 */
a35e63ef
N
764 spin_lock_irq(&conf->device_lock);
765
766 if (conf->pending_bio_list.head) {
767 struct bio *bio;
768 bio = bio_list_get(&conf->pending_bio_list);
34db0cd6 769 conf->pending_count = 0;
a35e63ef
N
770 spin_unlock_irq(&conf->device_lock);
771 /* flush any pending bitmap writes to
772 * disk before proceeding w/ I/O */
773 bitmap_unplug(conf->mddev->bitmap);
34db0cd6 774 wake_up(&conf->wait_barrier);
a35e63ef
N
775
776 while (bio) { /* submit pending writes */
777 struct bio *next = bio->bi_next;
5e2c7a36 778 struct md_rdev *rdev = (void*)bio->bi_bdev;
a35e63ef 779 bio->bi_next = NULL;
5e2c7a36
N
780 bio->bi_bdev = rdev->bdev;
781 if (test_bit(Faulty, &rdev->flags)) {
782 bio->bi_error = -EIO;
783 bio_endio(bio);
784 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
785 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
2ff8cc2c 786 /* Just ignore it */
4246a0b6 787 bio_endio(bio);
2ff8cc2c
SL
788 else
789 generic_make_request(bio);
a35e63ef
N
790 bio = next;
791 }
a35e63ef
N
792 } else
793 spin_unlock_irq(&conf->device_lock);
7eaceacc
JA
794}
795
17999be4
N
796/* Barriers....
797 * Sometimes we need to suspend IO while we do something else,
798 * either some resync/recovery, or reconfigure the array.
799 * To do this we raise a 'barrier'.
800 * The 'barrier' is a counter that can be raised multiple times
801 * to count how many activities are happening which preclude
802 * normal IO.
803 * We can only raise the barrier if there is no pending IO.
804 * i.e. if nr_pending == 0.
805 * We choose only to raise the barrier if no-one is waiting for the
806 * barrier to go down. This means that as soon as an IO request
807 * is ready, no other operations which require a barrier will start
808 * until the IO request has had a chance.
809 *
810 * So: regular IO calls 'wait_barrier'. When that returns there
811 * is no background IO happening. It must arrange to call
812 * allow_barrier when it has finished its IO.
813 * Background IO calls must call raise_barrier. Once that returns
814 * there is no normal IO happening. It must arrange to call
815 * lower_barrier when the particular background IO completes.
1da177e4 816 */
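/*
 * A sketch of the pairing described above.  The resync side
 * (raise_barrier() from sync_request()) is partly outside this excerpt;
 * lower_barrier() is reached via put_buf() earlier in this file.
 *
 *	regular I/O				resync/recovery
 *	-----------				---------------
 *	wait_barrier(conf, sector);		raise_barrier(conf, sector_nr);
 *	... submit bio(s) ...			... submit sync r1_bio ...
 *	allow_barrier(conf, sector);		lower_barrier(conf, sector_nr);
 */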
c2fd4c94 817static void raise_barrier(struct r1conf *conf, sector_t sector_nr)
1da177e4 818{
fd76863e 819 int idx = sector_to_idx(sector_nr);
820
1da177e4 821 spin_lock_irq(&conf->resync_lock);
17999be4
N
822
823 /* Wait until no block IO is waiting */
824e47da 824 wait_event_lock_irq(conf->wait_barrier,
825 !atomic_read(&conf->nr_waiting[idx]),
eed8c02e 826 conf->resync_lock);
17999be4
N
827
828 /* block any new IO from starting */
824e47da 829 atomic_inc(&conf->barrier[idx]);
830 /*
831 * In raise_barrier() we firstly increase conf->barrier[idx] then
832 * check conf->nr_pending[idx]. In _wait_barrier() we firstly
833 * increase conf->nr_pending[idx] then check conf->barrier[idx].
834 * A memory barrier here to make sure conf->nr_pending[idx] won't
835 * be fetched before conf->barrier[idx] is increased. Otherwise
836 * there will be a race between raise_barrier() and _wait_barrier().
837 */
838 smp_mb__after_atomic();
17999be4 839
79ef3a8a 840 /* For these conditions we must wait:
841 * A: while the array is in frozen state
fd76863e 842 * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
843 * exists in the corresponding I/O barrier bucket.
844 * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning it has reached the
845 * max resync count allowed on the current I/O barrier bucket.
79ef3a8a 846 */
17999be4 847 wait_event_lock_irq(conf->wait_barrier,
b364e3d0 848 !conf->array_frozen &&
824e47da 849 !atomic_read(&conf->nr_pending[idx]) &&
850 atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH,
eed8c02e 851 conf->resync_lock);
17999be4 852
824e47da 853 atomic_inc(&conf->nr_pending[idx]);
17999be4
N
854 spin_unlock_irq(&conf->resync_lock);
855}
856
fd76863e 857static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
17999be4 858{
fd76863e 859 int idx = sector_to_idx(sector_nr);
860
824e47da 861 BUG_ON(atomic_read(&conf->barrier[idx]) <= 0);
fd76863e 862
824e47da 863 atomic_dec(&conf->barrier[idx]);
864 atomic_dec(&conf->nr_pending[idx]);
17999be4
N
865 wake_up(&conf->wait_barrier);
866}
867
fd76863e 868static void _wait_barrier(struct r1conf *conf, int idx)
17999be4 869{
824e47da 870 /*
871 * We need to increase conf->nr_pending[idx] very early here,
872 * then raise_barrier() can be blocked when it waits for
873 * conf->nr_pending[idx] to be 0. Then we can avoid holding
874 * conf->resync_lock when there is no barrier raised in same
875 * barrier unit bucket. Also if the array is frozen, I/O
876 * should be blocked until array is unfrozen.
877 */
878 atomic_inc(&conf->nr_pending[idx]);
879 /*
880 * In _wait_barrier() we firstly increase conf->nr_pending[idx], then
881 * check conf->barrier[idx]. In raise_barrier() we firstly increase
882 * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
883 * barrier is necessary here to make sure conf->barrier[idx] won't be
884 * fetched before conf->nr_pending[idx] is increased. Otherwise there
885 * will be a race between _wait_barrier() and raise_barrier().
886 */
887 smp_mb__after_atomic();
79ef3a8a 888
824e47da 889 /*
890 * Don't worry about checking two atomic_t variables at the same time
891 * here. If, while we check conf->barrier[idx], the array is
892 * frozen (conf->array_frozen is 1) and conf->barrier[idx] is
893 * 0, it is safe to return and let the I/O continue. Because the
894 * array is frozen, all I/O returned here will eventually complete
895 * or be queued, so no race will happen. See the code comment in
896 * freeze_array().
897 */
898 if (!READ_ONCE(conf->array_frozen) &&
899 !atomic_read(&conf->barrier[idx]))
900 return;
79ef3a8a 901
824e47da 902 /*
903 * After holding conf->resync_lock, conf->nr_pending[idx]
904 * should be decreased before waiting for barrier to drop.
905 * Otherwise, we may encounter a race condition because
906 * raise_barrier() might be waiting for conf->nr_pending[idx]
907 * to be 0 at same time.
908 */
909 spin_lock_irq(&conf->resync_lock);
910 atomic_inc(&conf->nr_waiting[idx]);
911 atomic_dec(&conf->nr_pending[idx]);
912 /*
913 * In case freeze_array() is waiting for
914 * get_unqueued_pending() == extra
915 */
916 wake_up(&conf->wait_barrier);
917 /* Wait for the barrier in same barrier unit bucket to drop. */
918 wait_event_lock_irq(conf->wait_barrier,
919 !conf->array_frozen &&
920 !atomic_read(&conf->barrier[idx]),
921 conf->resync_lock);
922 atomic_inc(&conf->nr_pending[idx]);
923 atomic_dec(&conf->nr_waiting[idx]);
fd76863e 924 spin_unlock_irq(&conf->resync_lock);
79ef3a8a 925}
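/*
 * The paired smp_mb__after_atomic() calls above and in raise_barrier()
 * guard against the classic store-buffer reordering:
 *
 *	raise_barrier()				_wait_barrier()
 *	atomic_inc(&barrier[idx]);		atomic_inc(&nr_pending[idx]);
 *	smp_mb__after_atomic();			smp_mb__after_atomic();
 *	load nr_pending[idx]			load barrier[idx]
 *
 * With the barriers, at least one side must observe the other's
 * increment, so resync and regular I/O cannot both conclude that the
 * bucket is free; without them, both loads could return stale zeroes.
 */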
926
fd76863e 927static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
79ef3a8a 928{
fd76863e 929 int idx = sector_to_idx(sector_nr);
79ef3a8a 930
824e47da 931 /*
932 * Very similar to _wait_barrier(). The difference is, for read
933 * I/O we don't need to wait for sync I/O, but if the whole array
934 * is frozen, the read I/O still has to wait until the array is
935 * unfrozen. Since there is no ordering requirement with
936 * conf->barrier[idx] here, memory barrier is unnecessary as well.
937 */
938 atomic_inc(&conf->nr_pending[idx]);
79ef3a8a 939
824e47da 940 if (!READ_ONCE(conf->array_frozen))
941 return;
942
943 spin_lock_irq(&conf->resync_lock);
944 atomic_inc(&conf->nr_waiting[idx]);
945 atomic_dec(&conf->nr_pending[idx]);
946 /*
947 * In case freeze_array() is waiting for
948 * get_unqueued_pending() == extra
949 */
950 wake_up(&conf->wait_barrier);
951 /* Wait for array to be unfrozen */
952 wait_event_lock_irq(conf->wait_barrier,
953 !conf->array_frozen,
954 conf->resync_lock);
955 atomic_inc(&conf->nr_pending[idx]);
956 atomic_dec(&conf->nr_waiting[idx]);
1da177e4
LT
957 spin_unlock_irq(&conf->resync_lock);
958}
959
37011e3a
N
960static void inc_pending(struct r1conf *conf, sector_t bi_sector)
961{
962 /* The current request requires multiple r1_bio, so
963 * we need to increment the pending count, and the corresponding
964 * window count.
965 */
966 int idx = sector_to_idx(bi_sector);
967 atomic_inc(&conf->nr_pending[idx]);
968}
969
fd76863e 970static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
17999be4 971{
fd76863e 972 int idx = sector_to_idx(sector_nr);
79ef3a8a 973
fd76863e 974 _wait_barrier(conf, idx);
975}
976
977static void wait_all_barriers(struct r1conf *conf)
978{
979 int idx;
980
981 for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
982 _wait_barrier(conf, idx);
983}
984
985static void _allow_barrier(struct r1conf *conf, int idx)
17999be4 986{
824e47da 987 atomic_dec(&conf->nr_pending[idx]);
17999be4
N
988 wake_up(&conf->wait_barrier);
989}
990
fd76863e 991static void allow_barrier(struct r1conf *conf, sector_t sector_nr)
992{
993 int idx = sector_to_idx(sector_nr);
994
995 _allow_barrier(conf, idx);
996}
997
998static void allow_all_barriers(struct r1conf *conf)
999{
1000 int idx;
1001
1002 for (idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
1003 _allow_barrier(conf, idx);
1004}
1005
1006/* conf->resync_lock should be held */
1007static int get_unqueued_pending(struct r1conf *conf)
1008{
1009 int idx, ret;
1010
1011 for (ret = 0, idx = 0; idx < BARRIER_BUCKETS_NR; idx++)
824e47da 1012 ret += atomic_read(&conf->nr_pending[idx]) -
1013 atomic_read(&conf->nr_queued[idx]);
fd76863e 1014
1015 return ret;
1016}
1017
e2d59925 1018static void freeze_array(struct r1conf *conf, int extra)
ddaf22ab 1019{
fd76863e 1020 /* Stop sync I/O and normal I/O and wait for everything to
11353b9d 1021 * go quiet.
fd76863e 1022 * This is called in two situations:
1023 * 1) management command handlers (reshape, remove disk, quiesce).
1024 * 2) one normal I/O request failed.
1025 *
1026 * After array_frozen is set to 1, new sync IO will be blocked at
1027 * raise_barrier(), and new normal I/O will be blocked at _wait_barrier()
1028 * or wait_read_barrier(). The flying I/Os will either complete or be
1029 * queued. When everything goes quiet, there are only queued I/Os left.
1030 *
1031 * Every flying I/O contributes to a conf->nr_pending[idx], where idx is the
1032 * barrier bucket index which this I/O request hits. When all sync and
1033 * normal I/O are queued, the sum of all conf->nr_pending[] will match the
1034 * sum of all conf->nr_queued[]. But normal I/O failure is an exception:
1035 * in handle_read_error(), we may call freeze_array() before trying to
1036 * fix the read error. In this case, the failed read I/O is not queued,
1037 * so get_unqueued_pending() == 1.
1038 *
1039 * Therefore before this function returns, we need to wait until
1040 * get_unqueued_pending(conf) becomes equal to extra. For
1041 * normal I/O context, extra is 1; in all other situations, extra is 0.
ddaf22ab
N
1042 */
1043 spin_lock_irq(&conf->resync_lock);
b364e3d0 1044 conf->array_frozen = 1;
578b54ad 1045 raid1_log(conf->mddev, "wait freeze");
fd76863e 1046 wait_event_lock_irq_cmd(
1047 conf->wait_barrier,
1048 get_unqueued_pending(conf) == extra,
1049 conf->resync_lock,
1050 flush_pending_writes(conf));
ddaf22ab
N
1051 spin_unlock_irq(&conf->resync_lock);
1052}
e8096360 1053static void unfreeze_array(struct r1conf *conf)
ddaf22ab
N
1054{
1055 /* reverse the effect of the freeze */
1056 spin_lock_irq(&conf->resync_lock);
b364e3d0 1057 conf->array_frozen = 0;
ddaf22ab 1058 spin_unlock_irq(&conf->resync_lock);
824e47da 1059 wake_up(&conf->wait_barrier);
ddaf22ab
N
1060}
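/*
 * Typical usage of the pair above, following the comment inside
 * freeze_array() (the callers themselves are outside this excerpt):
 *
 *	freeze_array(conf, 0);	management paths (reshape, remove disk,
 *				quiesce): every in-flight request ends up
 *				queued, so extra == 0.
 *	freeze_array(conf, 1);	handle_read_error(): the failed request is
 *				never queued, so one pending request is
 *				tolerated (extra == 1).
 *	...
 *	unfreeze_array(conf);
 */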
1061
f72ffdd6 1062/* duplicate the data pages for behind I/O
4e78064f 1063 */
9f2c9d12 1064static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
4b6d287f
N
1065{
1066 int i;
1067 struct bio_vec *bvec;
2ca68f5e 1068 struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
4b6d287f 1069 GFP_NOIO);
2ca68f5e 1070 if (unlikely(!bvecs))
af6d7b76 1071 return;
4b6d287f 1072
cb34e057 1073 bio_for_each_segment_all(bvec, bio, i) {
2ca68f5e
N
1074 bvecs[i] = *bvec;
1075 bvecs[i].bv_page = alloc_page(GFP_NOIO);
1076 if (unlikely(!bvecs[i].bv_page))
4b6d287f 1077 goto do_sync_io;
2ca68f5e
N
1078 memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
1079 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
1080 kunmap(bvecs[i].bv_page);
4b6d287f
N
1081 kunmap(bvec->bv_page);
1082 }
2ca68f5e 1083 r1_bio->behind_bvecs = bvecs;
af6d7b76
N
1084 r1_bio->behind_page_count = bio->bi_vcnt;
1085 set_bit(R1BIO_BehindIO, &r1_bio->state);
1086 return;
4b6d287f
N
1087
1088do_sync_io:
af6d7b76 1089 for (i = 0; i < bio->bi_vcnt; i++)
2ca68f5e
N
1090 if (bvecs[i].bv_page)
1091 put_page(bvecs[i].bv_page);
1092 kfree(bvecs);
4f024f37
KO
1093 pr_debug("%dB behind alloc failed, doing sync I/O\n",
1094 bio->bi_iter.bi_size);
4b6d287f
N
1095}
1096
f54a9d0e
N
1097struct raid1_plug_cb {
1098 struct blk_plug_cb cb;
1099 struct bio_list pending;
1100 int pending_cnt;
1101};
1102
1103static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
1104{
1105 struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
1106 cb);
1107 struct mddev *mddev = plug->cb.data;
1108 struct r1conf *conf = mddev->private;
1109 struct bio *bio;
1110
874807a8 1111 if (from_schedule || current->bio_list) {
f54a9d0e
N
1112 spin_lock_irq(&conf->device_lock);
1113 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1114 conf->pending_count += plug->pending_cnt;
1115 spin_unlock_irq(&conf->device_lock);
ee0b0244 1116 wake_up(&conf->wait_barrier);
f54a9d0e
N
1117 md_wakeup_thread(mddev->thread);
1118 kfree(plug);
1119 return;
1120 }
1121
1122 /* we aren't scheduling, so we can do the write-out directly. */
1123 bio = bio_list_get(&plug->pending);
1124 bitmap_unplug(mddev->bitmap);
1125 wake_up(&conf->wait_barrier);
1126
1127 while (bio) { /* submit pending writes */
1128 struct bio *next = bio->bi_next;
5e2c7a36 1129 struct md_rdev *rdev = (void*)bio->bi_bdev;
f54a9d0e 1130 bio->bi_next = NULL;
5e2c7a36
N
1131 bio->bi_bdev = rdev->bdev;
1132 if (test_bit(Faulty, &rdev->flags)) {
1133 bio->bi_error = -EIO;
1134 bio_endio(bio);
1135 } else if (unlikely((bio_op(bio) == REQ_OP_DISCARD) &&
1136 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
32f9f570 1137 /* Just ignore it */
4246a0b6 1138 bio_endio(bio);
32f9f570
SL
1139 else
1140 generic_make_request(bio);
f54a9d0e
N
1141 bio = next;
1142 }
1143 kfree(plug);
1144}
1145
fd76863e 1146static inline struct r1bio *
1147alloc_r1bio(struct mddev *mddev, struct bio *bio, sector_t sectors_handled)
1148{
1149 struct r1conf *conf = mddev->private;
1150 struct r1bio *r1_bio;
1151
1152 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
1153
1154 r1_bio->master_bio = bio;
1155 r1_bio->sectors = bio_sectors(bio) - sectors_handled;
1156 r1_bio->state = 0;
1157 r1_bio->mddev = mddev;
1158 r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
1159
1160 return r1_bio;
1161}
1162
1163static void raid1_read_request(struct mddev *mddev, struct bio *bio)
1da177e4 1164{
e8096360 1165 struct r1conf *conf = mddev->private;
0eaf822c 1166 struct raid1_info *mirror;
fd76863e 1167 struct r1bio *r1_bio;
1da177e4 1168 struct bio *read_bio;
3b046a97
RL
1169 struct bitmap *bitmap = mddev->bitmap;
1170 const int op = bio_op(bio);
1171 const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
1172 int sectors_handled;
1173 int max_sectors;
1174 int rdisk;
1175
fd76863e 1176 /*
1177 * Still need barrier for READ in case that whole
1178 * array is frozen.
1179 */
1180 wait_read_barrier(conf, bio->bi_iter.bi_sector);
1181
1182 r1_bio = alloc_r1bio(mddev, bio, 0);
3b046a97 1183
fd76863e 1184 /*
1185 * make_request() can abort the operation when read-ahead is being
1186 * used and no empty request is available.
1187 */
3b046a97
RL
1188read_again:
1189 rdisk = read_balance(conf, r1_bio, &max_sectors);
1190
1191 if (rdisk < 0) {
1192 /* couldn't find anywhere to read from */
1193 raid_end_bio_io(r1_bio);
1194 return;
1195 }
1196 mirror = conf->mirrors + rdisk;
1197
1198 if (test_bit(WriteMostly, &mirror->rdev->flags) &&
1199 bitmap) {
1200 /*
1201 * Reading from a write-mostly device must take care not to
1202 * over-take any writes that are 'behind'
1203 */
1204 raid1_log(mddev, "wait behind writes");
1205 wait_event(bitmap->behind_wait,
1206 atomic_read(&bitmap->behind_writes) == 0);
1207 }
1208 r1_bio->read_disk = rdisk;
3b046a97 1209
d7a10308 1210 read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
3b046a97
RL
1211 bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
1212 max_sectors);
1213
1214 r1_bio->bios[rdisk] = read_bio;
1215
1216 read_bio->bi_iter.bi_sector = r1_bio->sector +
1217 mirror->rdev->data_offset;
1218 read_bio->bi_bdev = mirror->rdev->bdev;
1219 read_bio->bi_end_io = raid1_end_read_request;
1220 bio_set_op_attrs(read_bio, op, do_sync);
1221 if (test_bit(FailFast, &mirror->rdev->flags) &&
1222 test_bit(R1BIO_FailFast, &r1_bio->state))
1223 read_bio->bi_opf |= MD_FAILFAST;
1224 read_bio->bi_private = r1_bio;
1225
1226 if (mddev->gendisk)
1227 trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
1228 read_bio, disk_devt(mddev->gendisk),
1229 r1_bio->sector);
1230
1231 if (max_sectors < r1_bio->sectors) {
1232 /*
1233 * could not read all from this device, so we will need another
1234 * r1_bio.
1235 */
1236 sectors_handled = (r1_bio->sector + max_sectors
1237 - bio->bi_iter.bi_sector);
1238 r1_bio->sectors = max_sectors;
37011e3a 1239 bio_inc_remaining(bio);
3b046a97
RL
1240
1241 /*
1242 * Cannot call generic_make_request directly as that will be
1243 * queued in __make_request and subsequent mempool_alloc might
1244 * block waiting for it. So hand bio over to raid1d.
1245 */
1246 reschedule_retry(r1_bio);
1247
fd76863e 1248 r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
3b046a97
RL
1249 goto read_again;
1250 } else
1251 generic_make_request(read_bio);
1252}
1253
fd76863e 1254static void raid1_write_request(struct mddev *mddev, struct bio *bio)
3b046a97
RL
1255{
1256 struct r1conf *conf = mddev->private;
fd76863e 1257 struct r1bio *r1_bio;
1f68f0c4 1258 int i, disks;
3b046a97 1259 struct bitmap *bitmap = mddev->bitmap;
191ea9b2 1260 unsigned long flags;
3cb03002 1261 struct md_rdev *blocked_rdev;
f54a9d0e
N
1262 struct blk_plug_cb *cb;
1263 struct raid1_plug_cb *plug = NULL;
1f68f0c4
N
1264 int first_clone;
1265 int sectors_handled;
1266 int max_sectors;
191ea9b2 1267
1da177e4
LT
1268 /*
1269 * Register the new request and wait if the reconstruction
1270 * thread has put up a bar for new requests.
1271 * Continue immediately if no resync is active currently.
1272 */
62de608d 1273
3d310eb7
N
1274 md_write_start(mddev, bio); /* wait on superblock update early */
1275
3b046a97 1276 if ((bio_end_sector(bio) > mddev->suspend_lo &&
589a1c49
GR
1277 bio->bi_iter.bi_sector < mddev->suspend_hi) ||
1278 (mddev_is_clustered(mddev) &&
90382ed9 1279 md_cluster_ops->area_resyncing(mddev, WRITE,
3b046a97
RL
1280 bio->bi_iter.bi_sector, bio_end_sector(bio)))) {
1281
1282 /*
1283 * As the suspend_* range is controlled by userspace, we want
1284 * an interruptible wait.
6eef4b21
N
1285 */
1286 DEFINE_WAIT(w);
1287 for (;;) {
1288 flush_signals(current);
1289 prepare_to_wait(&conf->wait_barrier,
1290 &w, TASK_INTERRUPTIBLE);
f73a1c7d 1291 if (bio_end_sector(bio) <= mddev->suspend_lo ||
589a1c49
GR
1292 bio->bi_iter.bi_sector >= mddev->suspend_hi ||
1293 (mddev_is_clustered(mddev) &&
90382ed9 1294 !md_cluster_ops->area_resyncing(mddev, WRITE,
3b046a97
RL
1295 bio->bi_iter.bi_sector,
1296 bio_end_sector(bio))))
6eef4b21
N
1297 break;
1298 schedule();
1299 }
1300 finish_wait(&conf->wait_barrier, &w);
1301 }
fd76863e 1302 wait_barrier(conf, bio->bi_iter.bi_sector);
1303
1304 r1_bio = alloc_r1bio(mddev, bio, 0);
1305
34db0cd6
N
1306 if (conf->pending_count >= max_queued_requests) {
1307 md_wakeup_thread(mddev->thread);
578b54ad 1308 raid1_log(mddev, "wait queued");
34db0cd6
N
1309 wait_event(conf->wait_barrier,
1310 conf->pending_count < max_queued_requests);
1311 }
1f68f0c4 1312 /* first select target devices under rcu_lock and
1da177e4
LT
1313 * inc refcount on their rdev. Record them by setting
1314 * bios[x] to bio
1f68f0c4
N
1315 * If there are known/acknowledged bad blocks on any device on
1316 * which we have seen a write error, we want to avoid writing those
1317 * blocks.
1318 * This potentially requires several writes to write around
1319 * the bad blocks. Each set of writes gets it's own r1bio
1320 * with a set of bios attached.
1da177e4 1321 */
c3b328ac 1322
8f19ccb2 1323 disks = conf->raid_disks * 2;
6bfe0b49
DW
1324 retry_write:
1325 blocked_rdev = NULL;
1da177e4 1326 rcu_read_lock();
1f68f0c4 1327 max_sectors = r1_bio->sectors;
1da177e4 1328 for (i = 0; i < disks; i++) {
3cb03002 1329 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
6bfe0b49
DW
1330 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1331 atomic_inc(&rdev->nr_pending);
1332 blocked_rdev = rdev;
1333 break;
1334 }
1f68f0c4 1335 r1_bio->bios[i] = NULL;
8ae12666 1336 if (!rdev || test_bit(Faulty, &rdev->flags)) {
8f19ccb2
N
1337 if (i < conf->raid_disks)
1338 set_bit(R1BIO_Degraded, &r1_bio->state);
1f68f0c4
N
1339 continue;
1340 }
1341
1342 atomic_inc(&rdev->nr_pending);
1343 if (test_bit(WriteErrorSeen, &rdev->flags)) {
1344 sector_t first_bad;
1345 int bad_sectors;
1346 int is_bad;
1347
3b046a97 1348 is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
1f68f0c4
N
1349 &first_bad, &bad_sectors);
1350 if (is_bad < 0) {
1351 /* mustn't write here until the bad block is
1352 * acknowledged*/
1353 set_bit(BlockedBadBlocks, &rdev->flags);
1354 blocked_rdev = rdev;
1355 break;
1356 }
1357 if (is_bad && first_bad <= r1_bio->sector) {
1358 /* Cannot write here at all */
1359 bad_sectors -= (r1_bio->sector - first_bad);
1360 if (bad_sectors < max_sectors)
1361 /* mustn't write more than bad_sectors
1362 * to other devices yet
1363 */
1364 max_sectors = bad_sectors;
03c902e1 1365 rdev_dec_pending(rdev, mddev);
1f68f0c4
N
1366 /* We don't set R1BIO_Degraded as that
1367 * only applies if the disk is
1368 * missing, so it might be re-added,
1369 * and we want to know to recover this
1370 * chunk.
1371 * In this case the device is here,
1372 * and the fact that this chunk is not
1373 * in-sync is recorded in the bad
1374 * block log
1375 */
1376 continue;
964147d5 1377 }
1f68f0c4
N
1378 if (is_bad) {
1379 int good_sectors = first_bad - r1_bio->sector;
1380 if (good_sectors < max_sectors)
1381 max_sectors = good_sectors;
1382 }
1383 }
1384 r1_bio->bios[i] = bio;
1da177e4
LT
1385 }
1386 rcu_read_unlock();
1387
6bfe0b49
DW
1388 if (unlikely(blocked_rdev)) {
1389 /* Wait for this device to become unblocked */
1390 int j;
1391
1392 for (j = 0; j < i; j++)
1393 if (r1_bio->bios[j])
1394 rdev_dec_pending(conf->mirrors[j].rdev, mddev);
1f68f0c4 1395 r1_bio->state = 0;
fd76863e 1396 allow_barrier(conf, bio->bi_iter.bi_sector);
578b54ad 1397 raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
6bfe0b49 1398 md_wait_for_blocked_rdev(blocked_rdev, mddev);
fd76863e 1399 wait_barrier(conf, bio->bi_iter.bi_sector);
6bfe0b49
DW
1400 goto retry_write;
1401 }
1402
6b6c8110 1403 if (max_sectors < r1_bio->sectors)
1f68f0c4 1404 r1_bio->sectors = max_sectors;
6b6c8110 1405
4f024f37 1406 sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
4b6d287f 1407
4e78064f 1408 atomic_set(&r1_bio->remaining, 1);
4b6d287f 1409 atomic_set(&r1_bio->behind_remaining, 0);
06d91a5f 1410
1f68f0c4 1411 first_clone = 1;
1da177e4 1412 for (i = 0; i < disks; i++) {
8e58e327
ML
1413 struct bio *mbio = NULL;
1414 sector_t offset;
1da177e4
LT
1415 if (!r1_bio->bios[i])
1416 continue;
1417
8e58e327 1418 offset = r1_bio->sector - bio->bi_iter.bi_sector;
1f68f0c4
N
1419
1420 if (first_clone) {
1421 /* do behind I/O ?
1422 * Not if there are too many, or cannot
1423 * allocate memory, or a reader on WriteMostly
1424 * is waiting for behind writes to flush */
1425 if (bitmap &&
1426 (atomic_read(&bitmap->behind_writes)
1427 < mddev->bitmap_info.max_write_behind) &&
8e58e327
ML
1428 !waitqueue_active(&bitmap->behind_wait)) {
1429 mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
1430 mddev->bio_set,
1ec49223
SL
1431 offset << 9,
1432 max_sectors << 9);
1f68f0c4 1433 alloc_behind_pages(mbio, r1_bio);
8e58e327 1434 }
1f68f0c4
N
1435
1436 bitmap_startwrite(bitmap, r1_bio->sector,
1437 r1_bio->sectors,
1438 test_bit(R1BIO_BehindIO,
1439 &r1_bio->state));
1440 first_clone = 0;
1441 }
8e58e327
ML
1442
1443 if (!mbio) {
1ec49223
SL
1444 if (r1_bio->behind_bvecs)
1445 mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
1446 mddev->bio_set,
1447 offset << 9,
1448 max_sectors << 9);
1449 else {
1450 mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
1451 bio_trim(mbio, offset, max_sectors);
1452 }
8e58e327
ML
1453 }
1454
2ca68f5e 1455 if (r1_bio->behind_bvecs) {
4b6d287f
N
1456 struct bio_vec *bvec;
1457 int j;
1458
cb34e057
KO
1459 /*
1460 * We trimmed the bio, so _all is legit
4b6d287f 1461 */
d74c6d51 1462 bio_for_each_segment_all(bvec, mbio, j)
2ca68f5e 1463 bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
4b6d287f
N
1464 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
1465 atomic_inc(&r1_bio->behind_remaining);
1466 }
1467
1f68f0c4
N
1468 r1_bio->bios[i] = mbio;
1469
4f024f37 1470 mbio->bi_iter.bi_sector = (r1_bio->sector +
1f68f0c4 1471 conf->mirrors[i].rdev->data_offset);
109e3765 1472 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1f68f0c4 1473 mbio->bi_end_io = raid1_end_write_request;
a682e003 1474 mbio->bi_opf = bio_op(bio) | (bio->bi_opf & (REQ_SYNC | REQ_FUA));
212e7eb7
N
1475 if (test_bit(FailFast, &conf->mirrors[i].rdev->flags) &&
1476 !test_bit(WriteMostly, &conf->mirrors[i].rdev->flags) &&
1477 conf->raid_disks - mddev->degraded > 1)
1478 mbio->bi_opf |= MD_FAILFAST;
1f68f0c4
N
1479 mbio->bi_private = r1_bio;
1480
1da177e4 1481 atomic_inc(&r1_bio->remaining);
f54a9d0e 1482
109e3765
N
1483 if (mddev->gendisk)
1484 trace_block_bio_remap(bdev_get_queue(mbio->bi_bdev),
1485 mbio, disk_devt(mddev->gendisk),
1486 r1_bio->sector);
1487 /* flush_pending_writes() needs access to the rdev so...*/
1488 mbio->bi_bdev = (void*)conf->mirrors[i].rdev;
1489
f54a9d0e
N
1490 cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
1491 if (cb)
1492 plug = container_of(cb, struct raid1_plug_cb, cb);
1493 else
1494 plug = NULL;
4e78064f 1495 spin_lock_irqsave(&conf->device_lock, flags);
f54a9d0e
N
1496 if (plug) {
1497 bio_list_add(&plug->pending, mbio);
1498 plug->pending_cnt++;
1499 } else {
1500 bio_list_add(&conf->pending_bio_list, mbio);
1501 conf->pending_count++;
1502 }
4e78064f 1503 spin_unlock_irqrestore(&conf->device_lock, flags);
f54a9d0e 1504 if (!plug)
b357f04a 1505 md_wakeup_thread(mddev->thread);
1da177e4 1506 }
079fa166
N
1507 /* Mustn't call r1_bio_write_done before this next test,
1508 * as it could result in the bio being freed.
1509 */
aa8b57aa 1510 if (sectors_handled < bio_sectors(bio)) {
37011e3a
N
1511 /* We need another r1_bio, which must be counted */
1512 sector_t sect = bio->bi_iter.bi_sector + sectors_handled;
6b6c8110 1513
37011e3a
N
1514 inc_pending(conf, sect);
1515 bio_inc_remaining(bio);
6b6c8110 1516 r1_bio_write_done(r1_bio);
fd76863e 1517 r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
1f68f0c4
N
1518 goto retry_write;
1519 }
1520
079fa166
N
1521 r1_bio_write_done(r1_bio);
1522
1523 /* In case raid1d snuck in to freeze_array */
1524 wake_up(&conf->wait_barrier);
1da177e4
LT
1525}
1526
3b046a97
RL
1527static void raid1_make_request(struct mddev *mddev, struct bio *bio)
1528{
fd76863e 1529 struct bio *split;
1530 sector_t sectors;
3b046a97 1531
aff8da09
SL
1532 if (unlikely(bio->bi_opf & REQ_PREFLUSH)) {
1533 md_flush_request(mddev, bio);
1534 return;
1535 }
3b046a97 1536
fd76863e 1537 /* if bio exceeds barrier unit boundary, split it */
1538 do {
1539 sectors = align_to_barrier_unit_end(
1540 bio->bi_iter.bi_sector, bio_sectors(bio));
1541 if (sectors < bio_sectors(bio)) {
1542 split = bio_split(bio, sectors, GFP_NOIO, fs_bio_set);
1543 bio_chain(split, bio);
1544 } else {
1545 split = bio;
1546 }
3b046a97 1547
61eb2b43 1548 if (bio_data_dir(split) == READ) {
fd76863e 1549 raid1_read_request(mddev, split);
61eb2b43
SL
1550
1551 /*
1552 * If a bio is split, the first part of the bio will
1553 * pass the barrier but the bio is queued in
1554 * current->bio_list (see generic_make_request). If
1555 * there is a raise_barrier() called here, the second
1556 * part of the bio can't pass the barrier. But since the first
1557 * part of the bio isn't dispatched to the underlying disks yet,
1558 * the barrier is never released, hence raise_barrier
1559 * will always wait. We have a deadlock.
1560 * Note, this only happens in the read path. For the write
1561 * path, the first part of the bio is dispatched in a
1562 * schedule() call (because of blk plug) or offloaded
1563 * to raid1d.
1564 * Quitting from the function immediately can change
1565 * the bio order queued in bio_list and avoid the deadlock.
1566 */
1567 if (split != bio) {
1568 generic_make_request(bio);
1569 break;
1570 }
1571 } else
fd76863e 1572 raid1_write_request(mddev, split);
1573 } while (split != bio);
3b046a97
RL
1574}
1575
849674e4 1576static void raid1_status(struct seq_file *seq, struct mddev *mddev)
1da177e4 1577{
e8096360 1578 struct r1conf *conf = mddev->private;
1da177e4
LT
1579 int i;
1580
1581 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
11ce99e6 1582 conf->raid_disks - mddev->degraded);
ddac7c7e
N
1583 rcu_read_lock();
1584 for (i = 0; i < conf->raid_disks; i++) {
3cb03002 1585 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
1da177e4 1586 seq_printf(seq, "%s",
ddac7c7e
N
1587 rdev && test_bit(In_sync, &rdev->flags) ? "U" : "_");
1588 }
1589 rcu_read_unlock();
1da177e4
LT
1590 seq_printf(seq, "]");
1591}
1592
849674e4 1593static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
1da177e4
LT
1594{
1595 char b[BDEVNAME_SIZE];
e8096360 1596 struct r1conf *conf = mddev->private;
423f04d6 1597 unsigned long flags;
1da177e4
LT
1598
1599 /*
1600 * If it is not operational, then we have already marked it as dead
1601 * else if it is the last working disk, ignore the error, let the
1602 * next level up know.
1603 * else mark the drive as failed
1604 */
2e52d449 1605 spin_lock_irqsave(&conf->device_lock, flags);
b2d444d7 1606 if (test_bit(In_sync, &rdev->flags)
4044ba58 1607 && (conf->raid_disks - mddev->degraded) == 1) {
1da177e4
LT
1608 /*
1609 * Don't fail the drive, act as though we were just a
4044ba58
N
1610 * normal single drive.
1611 * However don't try a recovery from this drive as
1612 * it is very likely to fail.
1da177e4 1613 */
5389042f 1614 conf->recovery_disabled = mddev->recovery_disabled;
2e52d449 1615 spin_unlock_irqrestore(&conf->device_lock, flags);
1da177e4 1616 return;
4044ba58 1617 }
de393cde 1618 set_bit(Blocked, &rdev->flags);
c04be0aa 1619 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1da177e4 1620 mddev->degraded++;
dd00a99e 1621 set_bit(Faulty, &rdev->flags);
dd00a99e
N
1622 } else
1623 set_bit(Faulty, &rdev->flags);
423f04d6 1624 spin_unlock_irqrestore(&conf->device_lock, flags);
2446dba0
N
1625 /*
1626 * if recovery is running, make sure it aborts.
1627 */
1628 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2953079c
SL
1629 set_mask_bits(&mddev->sb_flags, 0,
1630 BIT(MD_SB_CHANGE_DEVS) | BIT(MD_SB_CHANGE_PENDING));
1d41c216
N
1631 pr_crit("md/raid1:%s: Disk failure on %s, disabling device.\n"
1632 "md/raid1:%s: Operation continuing on %d devices.\n",
1633 mdname(mddev), bdevname(rdev->bdev, b),
1634 mdname(mddev), conf->raid_disks - mddev->degraded);
1da177e4
LT
1635}
1636
e8096360 1637static void print_conf(struct r1conf *conf)
1da177e4
LT
1638{
1639 int i;
1da177e4 1640
1d41c216 1641 pr_debug("RAID1 conf printout:\n");
1da177e4 1642 if (!conf) {
1d41c216 1643 pr_debug("(!conf)\n");
1da177e4
LT
1644 return;
1645 }
1d41c216
N
1646 pr_debug(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
1647 conf->raid_disks);
1da177e4 1648
ddac7c7e 1649 rcu_read_lock();
1da177e4
LT
1650 for (i = 0; i < conf->raid_disks; i++) {
1651 char b[BDEVNAME_SIZE];
3cb03002 1652 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
ddac7c7e 1653 if (rdev)
1d41c216
N
1654 pr_debug(" disk %d, wo:%d, o:%d, dev:%s\n",
1655 i, !test_bit(In_sync, &rdev->flags),
1656 !test_bit(Faulty, &rdev->flags),
1657 bdevname(rdev->bdev,b));
1da177e4 1658 }
ddac7c7e 1659 rcu_read_unlock();
1da177e4
LT
1660}
1661
e8096360 1662static void close_sync(struct r1conf *conf)
1da177e4 1663{
fd76863e 1664 wait_all_barriers(conf);
1665 allow_all_barriers(conf);
1da177e4
LT
1666
1667 mempool_destroy(conf->r1buf_pool);
1668 conf->r1buf_pool = NULL;
1669}
1670
fd01b88c 1671static int raid1_spare_active(struct mddev *mddev)
1da177e4
LT
1672{
1673 int i;
e8096360 1674 struct r1conf *conf = mddev->private;
6b965620
N
1675 int count = 0;
1676 unsigned long flags;
1da177e4
LT
1677
1678 /*
f72ffdd6 1679 * Find all failed disks within the RAID1 configuration
ddac7c7e
N
1680 * and mark any that have recovered as In_sync.
1681 * Called under mddev lock, so rcu protection not needed.
423f04d6
N
1682 * device_lock used to avoid races with raid1_end_read_request
1683 * which expects 'In_sync' flags and ->degraded to be consistent.
1da177e4 1684 */
423f04d6 1685 spin_lock_irqsave(&conf->device_lock, flags);
1da177e4 1686 for (i = 0; i < conf->raid_disks; i++) {
3cb03002 1687 struct md_rdev *rdev = conf->mirrors[i].rdev;
8c7a2c2b
N
1688 struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev;
1689 if (repl
1aee41f6 1690 && !test_bit(Candidate, &repl->flags)
8c7a2c2b
N
1691 && repl->recovery_offset == MaxSector
1692 && !test_bit(Faulty, &repl->flags)
1693 && !test_and_set_bit(In_sync, &repl->flags)) {
1694 /* replacement has just become active */
1695 if (!rdev ||
1696 !test_and_clear_bit(In_sync, &rdev->flags))
1697 count++;
1698 if (rdev) {
1699 /* Replaced device not technically
1700 * faulty, but we need to be sure
1701 * it gets removed and never re-added
1702 */
1703 set_bit(Faulty, &rdev->flags);
1704 sysfs_notify_dirent_safe(
1705 rdev->sysfs_state);
1706 }
1707 }
ddac7c7e 1708 if (rdev
61e4947c 1709 && rdev->recovery_offset == MaxSector
ddac7c7e 1710 && !test_bit(Faulty, &rdev->flags)
c04be0aa 1711 && !test_and_set_bit(In_sync, &rdev->flags)) {
6b965620 1712 count++;
654e8b5a 1713 sysfs_notify_dirent_safe(rdev->sysfs_state);
1da177e4
LT
1714 }
1715 }
6b965620
N
1716 mddev->degraded -= count;
1717 spin_unlock_irqrestore(&conf->device_lock, flags);
1da177e4
LT
1718
1719 print_conf(conf);
6b965620 1720 return count;
1da177e4
LT
1721}
1722
fd01b88c 1723static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1da177e4 1724{
e8096360 1725 struct r1conf *conf = mddev->private;
199050ea 1726 int err = -EEXIST;
41158c7e 1727 int mirror = 0;
0eaf822c 1728 struct raid1_info *p;
6c2fce2e 1729 int first = 0;
30194636 1730 int last = conf->raid_disks - 1;
1da177e4 1731
5389042f
N
1732 if (mddev->recovery_disabled == conf->recovery_disabled)
1733 return -EBUSY;
1734
1501efad
DW
1735 if (md_integrity_add_rdev(rdev, mddev))
1736 return -ENXIO;
1737
6c2fce2e
NB
1738 if (rdev->raid_disk >= 0)
1739 first = last = rdev->raid_disk;
1740
70bcecdb
GR
1741 /*
1742 * find the disk ... but prefer rdev->saved_raid_disk
1743 * if possible.
1744 */
1745 if (rdev->saved_raid_disk >= 0 &&
1746 rdev->saved_raid_disk >= first &&
1747 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1748 first = last = rdev->saved_raid_disk;
1749
7ef449d1
N
1750 for (mirror = first; mirror <= last; mirror++) {
1751 p = conf->mirrors+mirror;
1752 if (!p->rdev) {
1da177e4 1753
9092c02d
JB
1754 if (mddev->gendisk)
1755 disk_stack_limits(mddev->gendisk, rdev->bdev,
1756 rdev->data_offset << 9);
1da177e4
LT
1757
1758 p->head_position = 0;
1759 rdev->raid_disk = mirror;
199050ea 1760 err = 0;
6aea114a
N
1761 /* As all devices are equivalent, we don't need a full recovery
1762 * if this device was recently a member of the array.
1763 */
1764 if (rdev->saved_raid_disk < 0)
41158c7e 1765 conf->fullsync = 1;
d6065f7b 1766 rcu_assign_pointer(p->rdev, rdev);
1da177e4
LT
1767 break;
1768 }
7ef449d1
N
1769 if (test_bit(WantReplacement, &p->rdev->flags) &&
1770 p[conf->raid_disks].rdev == NULL) {
1771 /* Add this device as a replacement */
1772 clear_bit(In_sync, &rdev->flags);
1773 set_bit(Replacement, &rdev->flags);
1774 rdev->raid_disk = mirror;
1775 err = 0;
1776 conf->fullsync = 1;
1777 rcu_assign_pointer(p[conf->raid_disks].rdev, rdev);
1778 break;
1779 }
1780 }
9092c02d 1781 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
2ff8cc2c 1782 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1da177e4 1783 print_conf(conf);
199050ea 1784 return err;
1da177e4
LT
1785}
1786
b8321b68 1787static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1da177e4 1788{
e8096360 1789 struct r1conf *conf = mddev->private;
1da177e4 1790 int err = 0;
b8321b68 1791 int number = rdev->raid_disk;
0eaf822c 1792 struct raid1_info *p = conf->mirrors + number;
1da177e4 1793
b014f14c
N
1794 if (rdev != p->rdev)
1795 p = conf->mirrors + conf->raid_disks + number;
1796
1da177e4 1797 print_conf(conf);
b8321b68 1798 if (rdev == p->rdev) {
b2d444d7 1799 if (test_bit(In_sync, &rdev->flags) ||
1da177e4
LT
1800 atomic_read(&rdev->nr_pending)) {
1801 err = -EBUSY;
1802 goto abort;
1803 }
046abeed 1804 /* Only remove non-faulty devices if recovery
dfc70645
N
1805 * is not possible.
1806 */
1807 if (!test_bit(Faulty, &rdev->flags) &&
5389042f 1808 mddev->recovery_disabled != conf->recovery_disabled &&
dfc70645
N
1809 mddev->degraded < conf->raid_disks) {
1810 err = -EBUSY;
1811 goto abort;
1812 }
1da177e4 1813 p->rdev = NULL;
d787be40
N
1814 if (!test_bit(RemoveSynchronized, &rdev->flags)) {
1815 synchronize_rcu();
1816 if (atomic_read(&rdev->nr_pending)) {
1817 /* lost the race, try later */
1818 err = -EBUSY;
1819 p->rdev = rdev;
1820 goto abort;
1821 }
1822 }
1823 if (conf->mirrors[conf->raid_disks + number].rdev) {
8c7a2c2b
N
1824 /* We just removed a device that is being replaced.
1825 * Move down the replacement. We drain all IO before
1826 * doing this to avoid confusion.
1827 */
1828 struct md_rdev *repl =
1829 conf->mirrors[conf->raid_disks + number].rdev;
e2d59925 1830 freeze_array(conf, 0);
8c7a2c2b
N
1831 clear_bit(Replacement, &repl->flags);
1832 p->rdev = repl;
1833 conf->mirrors[conf->raid_disks + number].rdev = NULL;
e2d59925 1834 unfreeze_array(conf);
8c7a2c2b
N
1835 clear_bit(WantReplacement, &rdev->flags);
1836 } else
b014f14c 1837 clear_bit(WantReplacement, &rdev->flags);
a91a2785 1838 err = md_integrity_register(mddev);
1da177e4
LT
1839 }
1840abort:
1841
1842 print_conf(conf);
1843 return err;
1844}
1845
4246a0b6 1846static void end_sync_read(struct bio *bio)
1da177e4 1847{
9f2c9d12 1848 struct r1bio *r1_bio = bio->bi_private;
1da177e4 1849
0fc280f6 1850 update_head_pos(r1_bio->read_disk, r1_bio);
ba3ae3be 1851
1da177e4
LT
1852 /*
1853 * we have read a block, now it needs to be re-written,
1854 * or re-read if the read failed.
1855 * We don't do much here, just schedule handling by raid1d
1856 */
4246a0b6 1857 if (!bio->bi_error)
1da177e4 1858 set_bit(R1BIO_Uptodate, &r1_bio->state);
d11c171e
N
1859
1860 if (atomic_dec_and_test(&r1_bio->remaining))
1861 reschedule_retry(r1_bio);
1da177e4
LT
1862}
1863
4246a0b6 1864static void end_sync_write(struct bio *bio)
1da177e4 1865{
4246a0b6 1866 int uptodate = !bio->bi_error;
9f2c9d12 1867 struct r1bio *r1_bio = bio->bi_private;
fd01b88c 1868 struct mddev *mddev = r1_bio->mddev;
e8096360 1869 struct r1conf *conf = mddev->private;
4367af55
N
1870 sector_t first_bad;
1871 int bad_sectors;
854abd75 1872 struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
ba3ae3be 1873
6b1117d5 1874 if (!uptodate) {
57dab0bd 1875 sector_t sync_blocks = 0;
6b1117d5
N
1876 sector_t s = r1_bio->sector;
1877 long sectors_to_go = r1_bio->sectors;
1878 /* make sure these bits don't get cleared. */
1879 do {
5e3db645 1880 bitmap_end_sync(mddev->bitmap, s,
6b1117d5
N
1881 &sync_blocks, 1);
1882 s += sync_blocks;
1883 sectors_to_go -= sync_blocks;
1884 } while (sectors_to_go > 0);
854abd75
N
1885 set_bit(WriteErrorSeen, &rdev->flags);
1886 if (!test_and_set_bit(WantReplacement, &rdev->flags))
19d67169
N
1887 set_bit(MD_RECOVERY_NEEDED, &
1888 mddev->recovery);
d8f05d29 1889 set_bit(R1BIO_WriteError, &r1_bio->state);
854abd75 1890 } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
3a9f28a5
N
1891 &first_bad, &bad_sectors) &&
1892 !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
1893 r1_bio->sector,
1894 r1_bio->sectors,
1895 &first_bad, &bad_sectors)
1896 )
4367af55 1897 set_bit(R1BIO_MadeGood, &r1_bio->state);
e3b9703e 1898
1da177e4 1899 if (atomic_dec_and_test(&r1_bio->remaining)) {
4367af55 1900 int s = r1_bio->sectors;
d8f05d29
N
1901 if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
1902 test_bit(R1BIO_WriteError, &r1_bio->state))
4367af55
N
1903 reschedule_retry(r1_bio);
1904 else {
1905 put_buf(r1_bio);
1906 md_done_sync(mddev, s, uptodate);
1907 }
1da177e4 1908 }
1da177e4
LT
1909}
1910
3cb03002 1911static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
d8f05d29
N
1912 int sectors, struct page *page, int rw)
1913{
796a5cf0 1914 if (sync_page_io(rdev, sector, sectors << 9, page, rw, 0, false))
d8f05d29
N
1915 /* success */
1916 return 1;
19d67169 1917 if (rw == WRITE) {
d8f05d29 1918 set_bit(WriteErrorSeen, &rdev->flags);
19d67169
N
1919 if (!test_and_set_bit(WantReplacement,
1920 &rdev->flags))
1921 set_bit(MD_RECOVERY_NEEDED, &
1922 rdev->mddev->recovery);
1923 }
d8f05d29
N
1924 /* need to record an error - either for the block or the device */
1925 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
1926 md_error(rdev->mddev, rdev);
1927 return 0;
1928}
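/*
 * Note: the callers below (fix_read_error(), fix_sync_read_error()) use
 * this helper in two passes over the mirrors - a WRITE pass that pushes
 * known-good data back to the suspect device, then a READ pass that
 * verifies it - so a failure in either direction ends up recorded as a
 * bad block or, failing that, as a failed device.
 */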
1929
9f2c9d12 1930static int fix_sync_read_error(struct r1bio *r1_bio)
1da177e4 1931{
a68e5870
N
1932 /* Try some synchronous reads of other devices to get
1933 * good data, much like with normal read errors. Only
1934 * read into the pages we already have so we don't
1935 * need to re-issue the read request.
1936 * We don't need to freeze the array, because being in an
1937 * active sync request, there is no normal IO, and
1938 * no overlapping syncs.
06f60385
N
1939 * We don't need to check is_badblock() again as we
1940 * made sure that anything with a bad block in range
1941 * will have bi_end_io clear.
a68e5870 1942 */
fd01b88c 1943 struct mddev *mddev = r1_bio->mddev;
e8096360 1944 struct r1conf *conf = mddev->private;
a68e5870
N
1945 struct bio *bio = r1_bio->bios[r1_bio->read_disk];
1946 sector_t sect = r1_bio->sector;
1947 int sectors = r1_bio->sectors;
1948 int idx = 0;
2e52d449
N
1949 struct md_rdev *rdev;
1950
1951 rdev = conf->mirrors[r1_bio->read_disk].rdev;
1952 if (test_bit(FailFast, &rdev->flags)) {
1953 /* Don't try recovering from here - just fail it
1954 * ... unless it is the last working device of course */
1955 md_error(mddev, rdev);
1956 if (test_bit(Faulty, &rdev->flags))
1957 /* Don't try to read from here, but make sure
1958 * put_buf does its thing
1959 */
1960 bio->bi_end_io = end_sync_write;
1961 }
a68e5870
N
1962
1963 while(sectors) {
1964 int s = sectors;
1965 int d = r1_bio->read_disk;
1966 int success = 0;
78d7f5f7 1967 int start;
a68e5870
N
1968
1969 if (s > (PAGE_SIZE>>9))
1970 s = PAGE_SIZE >> 9;
1971 do {
1972 if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
1973 /* No rcu protection needed here; devices
1974 * can only be removed when no resync is
1975 * active, and resync is currently active
1976 */
1977 rdev = conf->mirrors[d].rdev;
9d3d8011 1978 if (sync_page_io(rdev, sect, s<<9,
a68e5870 1979 bio->bi_io_vec[idx].bv_page,
796a5cf0 1980 REQ_OP_READ, 0, false)) {
a68e5870
N
1981 success = 1;
1982 break;
1983 }
1984 }
1985 d++;
8f19ccb2 1986 if (d == conf->raid_disks * 2)
a68e5870
N
1987 d = 0;
1988 } while (!success && d != r1_bio->read_disk);
1989
78d7f5f7 1990 if (!success) {
a68e5870 1991 char b[BDEVNAME_SIZE];
3a9f28a5
N
1992 int abort = 0;
1993 /* Cannot read from anywhere, this block is lost.
1994 * Record a bad block on each device. If that doesn't
1995 * work just disable and interrupt the recovery.
1996 * Don't fail devices as that won't really help.
1997 */
1d41c216
N
1998 pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
1999 mdname(mddev),
2000 bdevname(bio->bi_bdev, b),
2001 (unsigned long long)r1_bio->sector);
8f19ccb2 2002 for (d = 0; d < conf->raid_disks * 2; d++) {
3a9f28a5
N
2003 rdev = conf->mirrors[d].rdev;
2004 if (!rdev || test_bit(Faulty, &rdev->flags))
2005 continue;
2006 if (!rdev_set_badblocks(rdev, sect, s, 0))
2007 abort = 1;
2008 }
2009 if (abort) {
d890fa2b
N
2010 conf->recovery_disabled =
2011 mddev->recovery_disabled;
3a9f28a5
N
2012 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2013 md_done_sync(mddev, r1_bio->sectors, 0);
2014 put_buf(r1_bio);
2015 return 0;
2016 }
2017 /* Try next page */
2018 sectors -= s;
2019 sect += s;
2020 idx++;
2021 continue;
d11c171e 2022 }
78d7f5f7
N
2023
2024 start = d;
2025 /* write it back and re-read */
2026 while (d != r1_bio->read_disk) {
2027 if (d == 0)
8f19ccb2 2028 d = conf->raid_disks * 2;
78d7f5f7
N
2029 d--;
2030 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
2031 continue;
2032 rdev = conf->mirrors[d].rdev;
d8f05d29
N
2033 if (r1_sync_page_io(rdev, sect, s,
2034 bio->bi_io_vec[idx].bv_page,
2035 WRITE) == 0) {
78d7f5f7
N
2036 r1_bio->bios[d]->bi_end_io = NULL;
2037 rdev_dec_pending(rdev, mddev);
9d3d8011 2038 }
78d7f5f7
N
2039 }
2040 d = start;
2041 while (d != r1_bio->read_disk) {
2042 if (d == 0)
8f19ccb2 2043 d = conf->raid_disks * 2;
78d7f5f7
N
2044 d--;
2045 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
2046 continue;
2047 rdev = conf->mirrors[d].rdev;
d8f05d29
N
2048 if (r1_sync_page_io(rdev, sect, s,
2049 bio->bi_io_vec[idx].bv_page,
2050 READ) != 0)
9d3d8011 2051 atomic_add(s, &rdev->corrected_errors);
78d7f5f7 2052 }
a68e5870
N
2053 sectors -= s;
2054 sect += s;
2055 idx ++;
2056 }
78d7f5f7 2057 set_bit(R1BIO_Uptodate, &r1_bio->state);
4246a0b6 2058 bio->bi_error = 0;
a68e5870
N
2059 return 1;
2060}
2061
c95e6385 2062static void process_checks(struct r1bio *r1_bio)
a68e5870
N
2063{
2064 /* We have read all readable devices. If we haven't
2065 * got the block, then there is no hope left.
2066 * If we have, then we want to do a comparison
2067 * and skip the write if everything is the same.
2068 * If any blocks failed to read, then we need to
2069 * attempt an over-write
2070 */
fd01b88c 2071 struct mddev *mddev = r1_bio->mddev;
e8096360 2072 struct r1conf *conf = mddev->private;
a68e5870
N
2073 int primary;
2074 int i;
f4380a91 2075 int vcnt;
a68e5870 2076
30bc9b53
N
2077 /* Fix variable parts of all bios */
2078 vcnt = (r1_bio->sectors + PAGE_SIZE / 512 - 1) >> (PAGE_SHIFT - 9);
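/*
 * e.g. with 4K pages (PAGE_SHIFT - 9 == 3) a 128-sector (64K) resync
 * window gives vcnt = (128 + 7) >> 3 = 16 page-sized vecs.
 */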
2079 for (i = 0; i < conf->raid_disks * 2; i++) {
2080 int j;
2081 int size;
4246a0b6 2082 int error;
30bc9b53
N
2083 struct bio *b = r1_bio->bios[i];
2084 if (b->bi_end_io != end_sync_read)
2085 continue;
4246a0b6
CH
2086 /* fixup the bio for reuse, but preserve errno */
2087 error = b->bi_error;
30bc9b53 2088 bio_reset(b);
4246a0b6 2089 b->bi_error = error;
30bc9b53 2090 b->bi_vcnt = vcnt;
4f024f37
KO
2091 b->bi_iter.bi_size = r1_bio->sectors << 9;
2092 b->bi_iter.bi_sector = r1_bio->sector +
30bc9b53
N
2093 conf->mirrors[i].rdev->data_offset;
2094 b->bi_bdev = conf->mirrors[i].rdev->bdev;
2095 b->bi_end_io = end_sync_read;
2096 b->bi_private = r1_bio;
2097
4f024f37 2098 size = b->bi_iter.bi_size;
30bc9b53
N
2099 for (j = 0; j < vcnt ; j++) {
2100 struct bio_vec *bi;
2101 bi = &b->bi_io_vec[j];
2102 bi->bv_offset = 0;
2103 if (size > PAGE_SIZE)
2104 bi->bv_len = PAGE_SIZE;
2105 else
2106 bi->bv_len = size;
2107 size -= PAGE_SIZE;
2108 }
2109 }
8f19ccb2 2110 for (primary = 0; primary < conf->raid_disks * 2; primary++)
a68e5870 2111 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
4246a0b6 2112 !r1_bio->bios[primary]->bi_error) {
a68e5870
N
2113 r1_bio->bios[primary]->bi_end_io = NULL;
2114 rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
2115 break;
2116 }
2117 r1_bio->read_disk = primary;
8f19ccb2 2118 for (i = 0; i < conf->raid_disks * 2; i++) {
78d7f5f7 2119 int j;
78d7f5f7
N
2120 struct bio *pbio = r1_bio->bios[primary];
2121 struct bio *sbio = r1_bio->bios[i];
4246a0b6 2122 int error = sbio->bi_error;
a68e5870 2123
2aabaa65 2124 if (sbio->bi_end_io != end_sync_read)
78d7f5f7 2125 continue;
4246a0b6
CH
2126 /* Now we can 'fixup' the error value */
2127 sbio->bi_error = 0;
78d7f5f7 2128
4246a0b6 2129 if (!error) {
78d7f5f7
N
2130 for (j = vcnt; j-- ; ) {
2131 struct page *p, *s;
2132 p = pbio->bi_io_vec[j].bv_page;
2133 s = sbio->bi_io_vec[j].bv_page;
2134 if (memcmp(page_address(p),
2135 page_address(s),
5020ad7d 2136 sbio->bi_io_vec[j].bv_len))
78d7f5f7 2137 break;
69382e85 2138 }
78d7f5f7
N
2139 } else
2140 j = 0;
2141 if (j >= 0)
7f7583d4 2142 atomic64_add(r1_bio->sectors, &mddev->resync_mismatches);
78d7f5f7 2143 if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
4246a0b6 2144 && !error)) {
78d7f5f7
N
2145 /* No need to write to this device. */
2146 sbio->bi_end_io = NULL;
2147 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
2148 continue;
2149 }
d3b45c2a
KO
2150
2151 bio_copy_data(sbio, pbio);
78d7f5f7 2152 }
a68e5870
N
2153}
2154
9f2c9d12 2155static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
a68e5870 2156{
e8096360 2157 struct r1conf *conf = mddev->private;
a68e5870 2158 int i;
8f19ccb2 2159 int disks = conf->raid_disks * 2;
a68e5870
N
2160 struct bio *bio, *wbio;
2161
2162 bio = r1_bio->bios[r1_bio->read_disk];
2163
a68e5870
N
2164 if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
2165 /* ouch - failed to read all of that. */
2166 if (!fix_sync_read_error(r1_bio))
2167 return;
7ca78d57
N
2168
2169 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
c95e6385
N
2170 process_checks(r1_bio);
2171
d11c171e
N
2172 /*
2173 * schedule writes
2174 */
1da177e4
LT
2175 atomic_set(&r1_bio->remaining, 1);
2176 for (i = 0; i < disks ; i++) {
2177 wbio = r1_bio->bios[i];
3e198f78
N
2178 if (wbio->bi_end_io == NULL ||
2179 (wbio->bi_end_io == end_sync_read &&
2180 (i == r1_bio->read_disk ||
2181 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
1da177e4
LT
2182 continue;
2183
796a5cf0 2184 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
212e7eb7
N
2185 if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
2186 wbio->bi_opf |= MD_FAILFAST;
2187
3e198f78 2188 wbio->bi_end_io = end_sync_write;
1da177e4 2189 atomic_inc(&r1_bio->remaining);
aa8b57aa 2190 md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
191ea9b2 2191
1da177e4
LT
2192 generic_make_request(wbio);
2193 }
2194
2195 if (atomic_dec_and_test(&r1_bio->remaining)) {
191ea9b2 2196 /* if we're here, all write(s) have completed, so clean up */
58e94ae1
N
2197 int s = r1_bio->sectors;
2198 if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
2199 test_bit(R1BIO_WriteError, &r1_bio->state))
2200 reschedule_retry(r1_bio);
2201 else {
2202 put_buf(r1_bio);
2203 md_done_sync(mddev, s, 1);
2204 }
1da177e4
LT
2205 }
2206}
2207
2208/*
2209 * This is a kernel thread which:
2210 *
2211 * 1. Retries failed read operations on working mirrors.
2212 * 2. Updates the raid superblock when problems are encountered.
d2eb35ac 2213 * 3. Performs writes following reads for array synchronising.
1da177e4
LT
2214 */
2215
e8096360 2216static void fix_read_error(struct r1conf *conf, int read_disk,
867868fb
N
2217 sector_t sect, int sectors)
2218{
fd01b88c 2219 struct mddev *mddev = conf->mddev;
867868fb
N
2220 while(sectors) {
2221 int s = sectors;
2222 int d = read_disk;
2223 int success = 0;
2224 int start;
3cb03002 2225 struct md_rdev *rdev;
867868fb
N
2226
2227 if (s > (PAGE_SIZE>>9))
2228 s = PAGE_SIZE >> 9;
2229
2230 do {
d2eb35ac
N
2231 sector_t first_bad;
2232 int bad_sectors;
2233
707a6a42
N
2234 rcu_read_lock();
2235 rdev = rcu_dereference(conf->mirrors[d].rdev);
867868fb 2236 if (rdev &&
da8840a7 2237 (test_bit(In_sync, &rdev->flags) ||
2238 (!test_bit(Faulty, &rdev->flags) &&
2239 rdev->recovery_offset >= sect + s)) &&
d2eb35ac 2240 is_badblock(rdev, sect, s,
707a6a42
N
2241 &first_bad, &bad_sectors) == 0) {
2242 atomic_inc(&rdev->nr_pending);
2243 rcu_read_unlock();
2244 if (sync_page_io(rdev, sect, s<<9,
796a5cf0 2245 conf->tmppage, REQ_OP_READ, 0, false))
707a6a42
N
2246 success = 1;
2247 rdev_dec_pending(rdev, mddev);
2248 if (success)
2249 break;
2250 } else
2251 rcu_read_unlock();
2252 d++;
2253 if (d == conf->raid_disks * 2)
2254 d = 0;
867868fb
N
2255 } while (!success && d != read_disk);
2256
2257 if (!success) {
d8f05d29 2258 /* Cannot read from anywhere - mark it bad */
3cb03002 2259 struct md_rdev *rdev = conf->mirrors[read_disk].rdev;
d8f05d29
N
2260 if (!rdev_set_badblocks(rdev, sect, s, 0))
2261 md_error(mddev, rdev);
867868fb
N
2262 break;
2263 }
2264 /* write it back and re-read */
2265 start = d;
2266 while (d != read_disk) {
2267 if (d==0)
8f19ccb2 2268 d = conf->raid_disks * 2;
867868fb 2269 d--;
707a6a42
N
2270 rcu_read_lock();
2271 rdev = rcu_dereference(conf->mirrors[d].rdev);
867868fb 2272 if (rdev &&
707a6a42
N
2273 !test_bit(Faulty, &rdev->flags)) {
2274 atomic_inc(&rdev->nr_pending);
2275 rcu_read_unlock();
d8f05d29
N
2276 r1_sync_page_io(rdev, sect, s,
2277 conf->tmppage, WRITE);
707a6a42
N
2278 rdev_dec_pending(rdev, mddev);
2279 } else
2280 rcu_read_unlock();
867868fb
N
2281 }
2282 d = start;
2283 while (d != read_disk) {
2284 char b[BDEVNAME_SIZE];
2285 if (d==0)
8f19ccb2 2286 d = conf->raid_disks * 2;
867868fb 2287 d--;
707a6a42
N
2288 rcu_read_lock();
2289 rdev = rcu_dereference(conf->mirrors[d].rdev);
867868fb 2290 if (rdev &&
b8cb6b4c 2291 !test_bit(Faulty, &rdev->flags)) {
707a6a42
N
2292 atomic_inc(&rdev->nr_pending);
2293 rcu_read_unlock();
d8f05d29
N
2294 if (r1_sync_page_io(rdev, sect, s,
2295 conf->tmppage, READ)) {
867868fb 2296 atomic_add(s, &rdev->corrected_errors);
1d41c216
N
2297 pr_info("md/raid1:%s: read error corrected (%d sectors at %llu on %s)\n",
2298 mdname(mddev), s,
2299 (unsigned long long)(sect +
2300 rdev->data_offset),
2301 bdevname(rdev->bdev, b));
867868fb 2302 }
707a6a42
N
2303 rdev_dec_pending(rdev, mddev);
2304 } else
2305 rcu_read_unlock();
867868fb
N
2306 }
2307 sectors -= s;
2308 sect += s;
2309 }
2310}
2311
9f2c9d12 2312static int narrow_write_error(struct r1bio *r1_bio, int i)
cd5ff9a1 2313{
fd01b88c 2314 struct mddev *mddev = r1_bio->mddev;
e8096360 2315 struct r1conf *conf = mddev->private;
3cb03002 2316 struct md_rdev *rdev = conf->mirrors[i].rdev;
cd5ff9a1
N
2317
2318 /* bio has the data to be written to device 'i' where
2319 * we just recently had a write error.
2320 * We repeatedly clone the bio and trim down to one block,
2321 * then try the write. Where the write fails we record
2322 * a bad block.
2323 * It is conceivable that the bio doesn't exactly align with
2324 * blocks. We must handle this somehow.
2325 *
2326 * We currently own a reference on the rdev.
2327 */
2328
2329 int block_sectors;
2330 sector_t sector;
2331 int sectors;
2332 int sect_to_write = r1_bio->sectors;
2333 int ok = 1;
2334
2335 if (rdev->badblocks.shift < 0)
2336 return 0;
2337
ab713cdc
ND
2338 block_sectors = roundup(1 << rdev->badblocks.shift,
2339 bdev_logical_block_size(rdev->bdev) >> 9);
cd5ff9a1
N
2340 sector = r1_bio->sector;
2341 sectors = ((sector + block_sectors)
2342 & ~(sector_t)(block_sectors - 1))
2343 - sector;
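/*
 * e.g. with block_sectors == 8 and r1_bio->sector == 1003 this rounds
 * up to 1008, so the first pass writes 5 sectors (1003-1007) to reach
 * the next 8-sector boundary; each later pass writes a full
 * block_sectors-sized chunk.
 */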
2344
cd5ff9a1
N
2345 while (sect_to_write) {
2346 struct bio *wbio;
2347 if (sectors > sect_to_write)
2348 sectors = sect_to_write;
2349 /* Write at 'sector' for 'sectors'*/
2350
b783863f
KO
2351 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
2352 unsigned vcnt = r1_bio->behind_page_count;
2353 struct bio_vec *vec = r1_bio->behind_bvecs;
2354
2355 while (!vec->bv_page) {
2356 vec++;
2357 vcnt--;
2358 }
2359
2360 wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
2361 memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
2362
2363 wbio->bi_vcnt = vcnt;
2364 } else {
d7a10308
ML
2365 wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
2366 mddev->bio_set);
b783863f
KO
2367 }
2368
796a5cf0 2369 bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
4f024f37
KO
2370 wbio->bi_iter.bi_sector = r1_bio->sector;
2371 wbio->bi_iter.bi_size = r1_bio->sectors << 9;
cd5ff9a1 2372
6678d83f 2373 bio_trim(wbio, sector - r1_bio->sector, sectors);
4f024f37 2374 wbio->bi_iter.bi_sector += rdev->data_offset;
cd5ff9a1 2375 wbio->bi_bdev = rdev->bdev;
4e49ea4a
MC
2376
2377 if (submit_bio_wait(wbio) < 0)
cd5ff9a1
N
2378 /* failure! */
2379 ok = rdev_set_badblocks(rdev, sector,
2380 sectors, 0)
2381 && ok;
2382
2383 bio_put(wbio);
2384 sect_to_write -= sectors;
2385 sector += sectors;
2386 sectors = block_sectors;
2387 }
2388 return ok;
2389}
2390
e8096360 2391static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
62096bce
N
2392{
2393 int m;
2394 int s = r1_bio->sectors;
8f19ccb2 2395 for (m = 0; m < conf->raid_disks * 2 ; m++) {
3cb03002 2396 struct md_rdev *rdev = conf->mirrors[m].rdev;
62096bce
N
2397 struct bio *bio = r1_bio->bios[m];
2398 if (bio->bi_end_io == NULL)
2399 continue;
4246a0b6 2400 if (!bio->bi_error &&
62096bce 2401 test_bit(R1BIO_MadeGood, &r1_bio->state)) {
c6563a8c 2402 rdev_clear_badblocks(rdev, r1_bio->sector, s, 0);
62096bce 2403 }
4246a0b6 2404 if (bio->bi_error &&
62096bce
N
2405 test_bit(R1BIO_WriteError, &r1_bio->state)) {
2406 if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0))
2407 md_error(conf->mddev, rdev);
2408 }
2409 }
2410 put_buf(r1_bio);
2411 md_done_sync(conf->mddev, s, 1);
2412}
2413
e8096360 2414static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
62096bce 2415{
fd76863e 2416 int m, idx;
55ce74d4 2417 bool fail = false;
fd76863e 2418
8f19ccb2 2419 for (m = 0; m < conf->raid_disks * 2 ; m++)
62096bce 2420 if (r1_bio->bios[m] == IO_MADE_GOOD) {
3cb03002 2421 struct md_rdev *rdev = conf->mirrors[m].rdev;
62096bce
N
2422 rdev_clear_badblocks(rdev,
2423 r1_bio->sector,
c6563a8c 2424 r1_bio->sectors, 0);
62096bce
N
2425 rdev_dec_pending(rdev, conf->mddev);
2426 } else if (r1_bio->bios[m] != NULL) {
2427 /* This drive got a write error. We need to
2428 * narrow down and record precise write
2429 * errors.
2430 */
55ce74d4 2431 fail = true;
62096bce
N
2432 if (!narrow_write_error(r1_bio, m)) {
2433 md_error(conf->mddev,
2434 conf->mirrors[m].rdev);
2435 /* an I/O failed, we can't clear the bitmap */
2436 set_bit(R1BIO_Degraded, &r1_bio->state);
2437 }
2438 rdev_dec_pending(conf->mirrors[m].rdev,
2439 conf->mddev);
2440 }
55ce74d4
N
2441 if (fail) {
2442 spin_lock_irq(&conf->device_lock);
2443 list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
fd76863e 2444 idx = sector_to_idx(r1_bio->sector);
824e47da 2445 atomic_inc(&conf->nr_queued[idx]);
55ce74d4 2446 spin_unlock_irq(&conf->device_lock);
824e47da 2447 /*
2448 * In case freeze_array() is waiting for condition
2449 * get_unqueued_pending() == extra to be true.
2450 */
2451 wake_up(&conf->wait_barrier);
55ce74d4 2452 md_wakeup_thread(conf->mddev->thread);
bd8688a1
N
2453 } else {
2454 if (test_bit(R1BIO_WriteError, &r1_bio->state))
2455 close_write(r1_bio);
55ce74d4 2456 raid_end_bio_io(r1_bio);
bd8688a1 2457 }
62096bce
N
2458}
2459
e8096360 2460static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
62096bce
N
2461{
2462 int disk;
2463 int max_sectors;
fd01b88c 2464 struct mddev *mddev = conf->mddev;
62096bce
N
2465 struct bio *bio;
2466 char b[BDEVNAME_SIZE];
3cb03002 2467 struct md_rdev *rdev;
109e3765
N
2468 dev_t bio_dev;
2469 sector_t bio_sector;
62096bce
N
2470
2471 clear_bit(R1BIO_ReadError, &r1_bio->state);
2472 /* we got a read error. Maybe the drive is bad. Maybe just
2473 * the block and we can fix it.
2474 * We freeze all other IO, and try reading the block from
2475 * other devices. When we find one, we re-write
2476 * and check that this fixes the read error.
2477 * This is all done synchronously while the array is
2478 * frozen
2479 */
7449f699
TM
2480
2481 bio = r1_bio->bios[r1_bio->read_disk];
2482 bdevname(bio->bi_bdev, b);
109e3765
N
2483 bio_dev = bio->bi_bdev->bd_dev;
2484 bio_sector = conf->mirrors[r1_bio->read_disk].rdev->data_offset + r1_bio->sector;
7449f699
TM
2485 bio_put(bio);
2486 r1_bio->bios[r1_bio->read_disk] = NULL;
2487
2e52d449
N
2488 rdev = conf->mirrors[r1_bio->read_disk].rdev;
2489 if (mddev->ro == 0
2490 && !test_bit(FailFast, &rdev->flags)) {
e2d59925 2491 freeze_array(conf, 1);
62096bce
N
2492 fix_read_error(conf, r1_bio->read_disk,
2493 r1_bio->sector, r1_bio->sectors);
2494 unfreeze_array(conf);
7449f699
TM
2495 } else {
2496 r1_bio->bios[r1_bio->read_disk] = IO_BLOCKED;
2497 }
2498
2e52d449 2499 rdev_dec_pending(rdev, conf->mddev);
62096bce 2500
62096bce
N
2501read_more:
2502 disk = read_balance(conf, r1_bio, &max_sectors);
2503 if (disk == -1) {
1d41c216
N
2504 pr_crit_ratelimited("md/raid1:%s: %s: unrecoverable I/O read error for block %llu\n",
2505 mdname(mddev), b, (unsigned long long)r1_bio->sector);
62096bce
N
2506 raid_end_bio_io(r1_bio);
2507 } else {
2508 const unsigned long do_sync
1eff9d32 2509 = r1_bio->master_bio->bi_opf & REQ_SYNC;
62096bce 2510 r1_bio->read_disk = disk;
d7a10308
ML
2511 bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
2512 mddev->bio_set);
4f024f37
KO
2513 bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector,
2514 max_sectors);
62096bce
N
2515 r1_bio->bios[r1_bio->read_disk] = bio;
2516 rdev = conf->mirrors[disk].rdev;
1d41c216
N
2517 pr_info_ratelimited("md/raid1:%s: redirecting sector %llu to other mirror: %s\n",
2518 mdname(mddev),
2519 (unsigned long long)r1_bio->sector,
2520 bdevname(rdev->bdev, b));
4f024f37 2521 bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset;
62096bce
N
2522 bio->bi_bdev = rdev->bdev;
2523 bio->bi_end_io = raid1_end_read_request;
796a5cf0 2524 bio_set_op_attrs(bio, REQ_OP_READ, do_sync);
2e52d449
N
2525 if (test_bit(FailFast, &rdev->flags) &&
2526 test_bit(R1BIO_FailFast, &r1_bio->state))
2527 bio->bi_opf |= MD_FAILFAST;
62096bce
N
2528 bio->bi_private = r1_bio;
2529 if (max_sectors < r1_bio->sectors) {
2530 /* Drat - have to split this up more */
2531 struct bio *mbio = r1_bio->master_bio;
2532 int sectors_handled = (r1_bio->sector + max_sectors
4f024f37 2533 - mbio->bi_iter.bi_sector);
62096bce 2534 r1_bio->sectors = max_sectors;
37011e3a 2535 bio_inc_remaining(mbio);
109e3765
N
2536 trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
2537 bio, bio_dev, bio_sector);
62096bce
N
2538 generic_make_request(bio);
2539 bio = NULL;
2540
fd76863e 2541 r1_bio = alloc_r1bio(mddev, mbio, sectors_handled);
62096bce 2542 set_bit(R1BIO_ReadError, &r1_bio->state);
37011e3a 2543 inc_pending(conf, r1_bio->sector);
62096bce
N
2544
2545 goto read_more;
109e3765
N
2546 } else {
2547 trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
2548 bio, bio_dev, bio_sector);
62096bce 2549 generic_make_request(bio);
109e3765 2550 }
62096bce
N
2551 }
2552}
2553
4ed8731d 2554static void raid1d(struct md_thread *thread)
1da177e4 2555{
4ed8731d 2556 struct mddev *mddev = thread->mddev;
9f2c9d12 2557 struct r1bio *r1_bio;
1da177e4 2558 unsigned long flags;
e8096360 2559 struct r1conf *conf = mddev->private;
1da177e4 2560 struct list_head *head = &conf->retry_list;
e1dfa0a2 2561 struct blk_plug plug;
fd76863e 2562 int idx;
1da177e4
LT
2563
2564 md_check_recovery(mddev);
e1dfa0a2 2565
55ce74d4 2566 if (!list_empty_careful(&conf->bio_end_io_list) &&
2953079c 2567 !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)) {
55ce74d4
N
2568 LIST_HEAD(tmp);
2569 spin_lock_irqsave(&conf->device_lock, flags);
fd76863e 2570 if (!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
2571 list_splice_init(&conf->bio_end_io_list, &tmp);
55ce74d4
N
2572 spin_unlock_irqrestore(&conf->device_lock, flags);
2573 while (!list_empty(&tmp)) {
a452744b
MP
2574 r1_bio = list_first_entry(&tmp, struct r1bio,
2575 retry_list);
55ce74d4 2576 list_del(&r1_bio->retry_list);
fd76863e 2577 idx = sector_to_idx(r1_bio->sector);
824e47da 2578 atomic_dec(&conf->nr_queued[idx]);
bd8688a1
N
2579 if (mddev->degraded)
2580 set_bit(R1BIO_Degraded, &r1_bio->state);
2581 if (test_bit(R1BIO_WriteError, &r1_bio->state))
2582 close_write(r1_bio);
55ce74d4
N
2583 raid_end_bio_io(r1_bio);
2584 }
2585 }
2586
e1dfa0a2 2587 blk_start_plug(&plug);
1da177e4 2588 for (;;) {
191ea9b2 2589
0021b7bc 2590 flush_pending_writes(conf);
191ea9b2 2591
a35e63ef
N
2592 spin_lock_irqsave(&conf->device_lock, flags);
2593 if (list_empty(head)) {
2594 spin_unlock_irqrestore(&conf->device_lock, flags);
1da177e4 2595 break;
a35e63ef 2596 }
9f2c9d12 2597 r1_bio = list_entry(head->prev, struct r1bio, retry_list);
1da177e4 2598 list_del(head->prev);
fd76863e 2599 idx = sector_to_idx(r1_bio->sector);
824e47da 2600 atomic_dec(&conf->nr_queued[idx]);
1da177e4
LT
2601 spin_unlock_irqrestore(&conf->device_lock, flags);
2602
2603 mddev = r1_bio->mddev;
070ec55d 2604 conf = mddev->private;
4367af55 2605 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
d8f05d29 2606 if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
62096bce
N
2607 test_bit(R1BIO_WriteError, &r1_bio->state))
2608 handle_sync_write_finished(conf, r1_bio);
2609 else
4367af55 2610 sync_request_write(mddev, r1_bio);
cd5ff9a1 2611 } else if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
62096bce
N
2612 test_bit(R1BIO_WriteError, &r1_bio->state))
2613 handle_write_finished(conf, r1_bio);
2614 else if (test_bit(R1BIO_ReadError, &r1_bio->state))
2615 handle_read_error(conf, r1_bio);
2616 else
d2eb35ac
N
2617 /* just a partial read to be scheduled from separate
2618 * context
2619 */
2620 generic_make_request(r1_bio->bios[r1_bio->read_disk]);
62096bce 2621
1d9d5241 2622 cond_resched();
2953079c 2623 if (mddev->sb_flags & ~(1<<MD_SB_CHANGE_PENDING))
de393cde 2624 md_check_recovery(mddev);
1da177e4 2625 }
e1dfa0a2 2626 blk_finish_plug(&plug);
1da177e4
LT
2627}
2628
e8096360 2629static int init_resync(struct r1conf *conf)
1da177e4
LT
2630{
2631 int buffs;
2632
2633 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
9e77c485 2634 BUG_ON(conf->r1buf_pool);
1da177e4
LT
2635 conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
2636 conf->poolinfo);
2637 if (!conf->r1buf_pool)
2638 return -ENOMEM;
1da177e4
LT
2639 return 0;
2640}
2641
2642/*
2643 * perform a "sync" on one "block"
2644 *
2645 * We need to make sure that no normal I/O request - particularly write
2646 * requests - conflicts with active sync requests.
2647 *
2648 * This is achieved by tracking pending requests and a 'barrier' concept
2649 * that can be installed to exclude normal IO requests.
2650 */
2651
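/*
 * Illustration (not the driver's implementation): the idea behind
 * raise_barrier()/wait_barrier() is a counter-plus-barrier pattern -
 * normal I/O bumps a pending count, resync raises a barrier and waits
 * for pending I/O to drain, and new normal I/O waits while a barrier is
 * up.  The real code additionally hashes sectors into BARRIER_BUCKETS_NR
 * buckets (see the nr_pending/barrier arrays in setup_conf()) so a
 * barrier only blocks I/O in its own bucket.  A minimal userspace sketch
 * with hypothetical toy_* names:
 */
#include <pthread.h>

struct toy_barrier {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int barrier;	/* active sync requests */
	int pending;	/* in-flight normal requests */
};

static struct toy_barrier tb = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.cond = PTHREAD_COND_INITIALIZER,
};

static void toy_wait_barrier(struct toy_barrier *b)	/* normal I/O entry */
{
	pthread_mutex_lock(&b->lock);
	while (b->barrier)
		pthread_cond_wait(&b->cond, &b->lock);
	b->pending++;
	pthread_mutex_unlock(&b->lock);
}

static void toy_allow_barrier(struct toy_barrier *b)	/* normal I/O done */
{
	pthread_mutex_lock(&b->lock);
	if (--b->pending == 0)
		pthread_cond_broadcast(&b->cond);
	pthread_mutex_unlock(&b->lock);
}

static void toy_raise_barrier(struct toy_barrier *b)	/* resync entry */
{
	pthread_mutex_lock(&b->lock);
	b->barrier++;
	while (b->pending)
		pthread_cond_wait(&b->cond, &b->lock);
	pthread_mutex_unlock(&b->lock);
}

static void toy_lower_barrier(struct toy_barrier *b)	/* resync done */
{
	pthread_mutex_lock(&b->lock);
	b->barrier--;
	pthread_cond_broadcast(&b->cond);
	pthread_mutex_unlock(&b->lock);
}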
849674e4
SL
2652static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
2653 int *skipped)
1da177e4 2654{
e8096360 2655 struct r1conf *conf = mddev->private;
9f2c9d12 2656 struct r1bio *r1_bio;
1da177e4
LT
2657 struct bio *bio;
2658 sector_t max_sector, nr_sectors;
3e198f78 2659 int disk = -1;
1da177e4 2660 int i;
3e198f78
N
2661 int wonly = -1;
2662 int write_targets = 0, read_targets = 0;
57dab0bd 2663 sector_t sync_blocks;
e3b9703e 2664 int still_degraded = 0;
06f60385
N
2665 int good_sectors = RESYNC_SECTORS;
2666 int min_bad = 0; /* number of sectors that are bad in all devices */
fd76863e 2667 int idx = sector_to_idx(sector_nr);
1da177e4
LT
2668
2669 if (!conf->r1buf_pool)
2670 if (init_resync(conf))
57afd89f 2671 return 0;
1da177e4 2672
58c0fed4 2673 max_sector = mddev->dev_sectors;
1da177e4 2674 if (sector_nr >= max_sector) {
191ea9b2
N
2675 /* If we aborted, we need to abort the
2676 * sync on the 'current' bitmap chunk (there will
2677 * only be one in raid1 resync).
2678 * We can find the current address in mddev->curr_resync.
2679 */
6a806c51
N
2680 if (mddev->curr_resync < max_sector) /* aborted */
2681 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
191ea9b2 2682 &sync_blocks, 1);
6a806c51 2683 else /* completed sync */
191ea9b2 2684 conf->fullsync = 0;
6a806c51
N
2685
2686 bitmap_close_sync(mddev->bitmap);
1da177e4 2687 close_sync(conf);
c40f341f
GR
2688
2689 if (mddev_is_clustered(mddev)) {
2690 conf->cluster_sync_low = 0;
2691 conf->cluster_sync_high = 0;
c40f341f 2692 }
1da177e4
LT
2693 return 0;
2694 }
2695
07d84d10
N
2696 if (mddev->bitmap == NULL &&
2697 mddev->recovery_cp == MaxSector &&
6394cca5 2698 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
07d84d10
N
2699 conf->fullsync == 0) {
2700 *skipped = 1;
2701 return max_sector - sector_nr;
2702 }
6394cca5
N
2703 /* before building a request, check if we can skip these blocks.
2704 * This call to bitmap_start_sync doesn't actually record anything
2705 */
e3b9703e 2706 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
e5de485f 2707 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
191ea9b2
N
2708 /* We can skip this block, and probably several more */
2709 *skipped = 1;
2710 return sync_blocks;
2711 }
17999be4 2712
7ac50447
TM
2713 /*
2714 * If there is non-resync activity waiting for a turn, then let it
2715 * through before starting on this new sync request.
2716 */
824e47da 2717 if (atomic_read(&conf->nr_waiting[idx]))
7ac50447
TM
2718 schedule_timeout_uninterruptible(1);
2719
c40f341f
GR
2720 /* we are incrementing sector_nr below. To be safe, we check against
2721 * sector_nr + two times RESYNC_SECTORS
2722 */
2723
2724 bitmap_cond_end_sync(mddev->bitmap, sector_nr,
2725 mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high));
1c4588e9 2726 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
17999be4 2727
c2fd4c94 2728 raise_barrier(conf, sector_nr);
1da177e4 2729
3e198f78 2730 rcu_read_lock();
1da177e4 2731 /*
3e198f78
N
2732 * If we get a correctable read error during resync or recovery,
2733 * we might want to read from a different device. So we
2734 * flag all drives that could conceivably be read from for READ,
2735 * and any others (which will be non-In_sync devices) for WRITE.
2736 * If a read fails, we try reading from something else for which READ
2737 * is OK.
1da177e4 2738 */
1da177e4 2739
1da177e4
LT
2740 r1_bio->mddev = mddev;
2741 r1_bio->sector = sector_nr;
191ea9b2 2742 r1_bio->state = 0;
1da177e4 2743 set_bit(R1BIO_IsSync, &r1_bio->state);
fd76863e 2744 /* make sure good_sectors won't go across barrier unit boundary */
2745 good_sectors = align_to_barrier_unit_end(sector_nr, good_sectors);
1da177e4 2746
8f19ccb2 2747 for (i = 0; i < conf->raid_disks * 2; i++) {
3cb03002 2748 struct md_rdev *rdev;
1da177e4 2749 bio = r1_bio->bios[i];
2aabaa65 2750 bio_reset(bio);
1da177e4 2751
3e198f78
N
2752 rdev = rcu_dereference(conf->mirrors[i].rdev);
2753 if (rdev == NULL ||
06f60385 2754 test_bit(Faulty, &rdev->flags)) {
8f19ccb2
N
2755 if (i < conf->raid_disks)
2756 still_degraded = 1;
3e198f78 2757 } else if (!test_bit(In_sync, &rdev->flags)) {
796a5cf0 2758 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
1da177e4
LT
2759 bio->bi_end_io = end_sync_write;
2760 write_targets ++;
3e198f78
N
2761 } else {
2762 /* may need to read from here */
06f60385
N
2763 sector_t first_bad = MaxSector;
2764 int bad_sectors;
2765
2766 if (is_badblock(rdev, sector_nr, good_sectors,
2767 &first_bad, &bad_sectors)) {
2768 if (first_bad > sector_nr)
2769 good_sectors = first_bad - sector_nr;
2770 else {
2771 bad_sectors -= (sector_nr - first_bad);
2772 if (min_bad == 0 ||
2773 min_bad > bad_sectors)
2774 min_bad = bad_sectors;
2775 }
2776 }
2777 if (sector_nr < first_bad) {
2778 if (test_bit(WriteMostly, &rdev->flags)) {
2779 if (wonly < 0)
2780 wonly = i;
2781 } else {
2782 if (disk < 0)
2783 disk = i;
2784 }
796a5cf0 2785 bio_set_op_attrs(bio, REQ_OP_READ, 0);
06f60385
N
2786 bio->bi_end_io = end_sync_read;
2787 read_targets++;
d57368af
AL
2788 } else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
2789 test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
2790 !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
2791 /*
2792 * The device is suitable for reading (InSync),
2793 * but has bad block(s) here. Let's try to correct them,
2794 * if we are doing resync or repair. Otherwise, leave
2795 * this device alone for this sync request.
2796 */
796a5cf0 2797 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
d57368af
AL
2798 bio->bi_end_io = end_sync_write;
2799 write_targets++;
3e198f78 2800 }
3e198f78 2801 }
06f60385
N
2802 if (bio->bi_end_io) {
2803 atomic_inc(&rdev->nr_pending);
4f024f37 2804 bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
06f60385
N
2805 bio->bi_bdev = rdev->bdev;
2806 bio->bi_private = r1_bio;
2e52d449
N
2807 if (test_bit(FailFast, &rdev->flags))
2808 bio->bi_opf |= MD_FAILFAST;
06f60385 2809 }
1da177e4 2810 }
3e198f78
N
2811 rcu_read_unlock();
2812 if (disk < 0)
2813 disk = wonly;
2814 r1_bio->read_disk = disk;
191ea9b2 2815
06f60385
N
2816 if (read_targets == 0 && min_bad > 0) {
2817 /* These sectors are bad on all InSync devices, so we
2818 * need to mark them bad on all write targets
2819 */
2820 int ok = 1;
8f19ccb2 2821 for (i = 0 ; i < conf->raid_disks * 2 ; i++)
06f60385 2822 if (r1_bio->bios[i]->bi_end_io == end_sync_write) {
a42f9d83 2823 struct md_rdev *rdev = conf->mirrors[i].rdev;
06f60385
N
2824 ok = rdev_set_badblocks(rdev, sector_nr,
2825 min_bad, 0
2826 ) && ok;
2827 }
2953079c 2828 set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
06f60385
N
2829 *skipped = 1;
2830 put_buf(r1_bio);
2831
2832 if (!ok) {
2833 /* Cannot record the badblocks, so need to
2834 * abort the resync.
2835 * If there are multiple read targets, could just
2836 * fail the really bad ones ???
2837 */
2838 conf->recovery_disabled = mddev->recovery_disabled;
2839 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
2840 return 0;
2841 } else
2842 return min_bad;
2843
2844 }
2845 if (min_bad > 0 && min_bad < good_sectors) {
2846 /* only resync enough to reach the next bad->good
2847 * transition */
2848 good_sectors = min_bad;
2849 }
2850
3e198f78
N
2851 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
2852 /* extra read targets are also write targets */
2853 write_targets += read_targets-1;
2854
2855 if (write_targets == 0 || read_targets == 0) {
1da177e4
LT
2856 /* There is nowhere to write, so all non-sync
2857 * drives must be failed - so we are finished
2858 */
b7219ccb
N
2859 sector_t rv;
2860 if (min_bad > 0)
2861 max_sector = sector_nr + min_bad;
2862 rv = max_sector - sector_nr;
57afd89f 2863 *skipped = 1;
1da177e4 2864 put_buf(r1_bio);
1da177e4
LT
2865 return rv;
2866 }
2867
c6207277
N
2868 if (max_sector > mddev->resync_max)
2869 max_sector = mddev->resync_max; /* Don't do IO beyond here */
06f60385
N
2870 if (max_sector > sector_nr + good_sectors)
2871 max_sector = sector_nr + good_sectors;
1da177e4 2872 nr_sectors = 0;
289e99e8 2873 sync_blocks = 0;
1da177e4
LT
2874 do {
2875 struct page *page;
2876 int len = PAGE_SIZE;
2877 if (sector_nr + (len>>9) > max_sector)
2878 len = (max_sector - sector_nr) << 9;
2879 if (len == 0)
2880 break;
6a806c51
N
2881 if (sync_blocks == 0) {
2882 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
e5de485f
N
2883 &sync_blocks, still_degraded) &&
2884 !conf->fullsync &&
2885 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6a806c51 2886 break;
7571ae88 2887 if ((len >> 9) > sync_blocks)
6a806c51 2888 len = sync_blocks<<9;
ab7a30c7 2889 }
191ea9b2 2890
8f19ccb2 2891 for (i = 0 ; i < conf->raid_disks * 2; i++) {
1da177e4
LT
2892 bio = r1_bio->bios[i];
2893 if (bio->bi_end_io) {
d11c171e 2894 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
c85ba149
ML
2895
2896 /*
2897 * won't fail because the vec table is big
2898 * enough to hold all these pages
2899 */
2900 bio_add_page(bio, page, len, 0);
1da177e4
LT
2901 }
2902 }
2903 nr_sectors += len>>9;
2904 sector_nr += len>>9;
191ea9b2 2905 sync_blocks -= (len>>9);
1da177e4 2906 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
1da177e4
LT
2907 r1_bio->sectors = nr_sectors;
2908
c40f341f
GR
2909 if (mddev_is_clustered(mddev) &&
2910 conf->cluster_sync_high < sector_nr + nr_sectors) {
2911 conf->cluster_sync_low = mddev->curr_resync_completed;
2912 conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS;
2913 /* Send resync message */
2914 md_cluster_ops->resync_info_update(mddev,
2915 conf->cluster_sync_low,
2916 conf->cluster_sync_high);
2917 }
2918
d11c171e
N
2919 /* For a user-requested sync, we read all readable devices and do a
2920 * compare
2921 */
2922 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
2923 atomic_set(&r1_bio->remaining, read_targets);
2d4f4f33 2924 for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
d11c171e
N
2925 bio = r1_bio->bios[i];
2926 if (bio->bi_end_io == end_sync_read) {
2d4f4f33 2927 read_targets--;
ddac7c7e 2928 md_sync_acct(bio->bi_bdev, nr_sectors);
2e52d449
N
2929 if (read_targets == 1)
2930 bio->bi_opf &= ~MD_FAILFAST;
d11c171e
N
2931 generic_make_request(bio);
2932 }
2933 }
2934 } else {
2935 atomic_set(&r1_bio->remaining, 1);
2936 bio = r1_bio->bios[r1_bio->read_disk];
ddac7c7e 2937 md_sync_acct(bio->bi_bdev, nr_sectors);
2e52d449
N
2938 if (read_targets == 1)
2939 bio->bi_opf &= ~MD_FAILFAST;
d11c171e 2940 generic_make_request(bio);
1da177e4 2941
d11c171e 2942 }
1da177e4
LT
2943 return nr_sectors;
2944}
2945
fd01b88c 2946static sector_t raid1_size(struct mddev *mddev, sector_t sectors, int raid_disks)
80c3a6ce
DW
2947{
2948 if (sectors)
2949 return sectors;
2950
2951 return mddev->dev_sectors;
2952}
2953
e8096360 2954static struct r1conf *setup_conf(struct mddev *mddev)
1da177e4 2955{
e8096360 2956 struct r1conf *conf;
709ae487 2957 int i;
0eaf822c 2958 struct raid1_info *disk;
3cb03002 2959 struct md_rdev *rdev;
709ae487 2960 int err = -ENOMEM;
1da177e4 2961
e8096360 2962 conf = kzalloc(sizeof(struct r1conf), GFP_KERNEL);
1da177e4 2963 if (!conf)
709ae487 2964 goto abort;
1da177e4 2965
fd76863e 2966 conf->nr_pending = kcalloc(BARRIER_BUCKETS_NR,
824e47da 2967 sizeof(atomic_t), GFP_KERNEL);
fd76863e 2968 if (!conf->nr_pending)
2969 goto abort;
2970
2971 conf->nr_waiting = kcalloc(BARRIER_BUCKETS_NR,
824e47da 2972 sizeof(atomic_t), GFP_KERNEL);
fd76863e 2973 if (!conf->nr_waiting)
2974 goto abort;
2975
2976 conf->nr_queued = kcalloc(BARRIER_BUCKETS_NR,
824e47da 2977 sizeof(atomic_t), GFP_KERNEL);
fd76863e 2978 if (!conf->nr_queued)
2979 goto abort;
2980
2981 conf->barrier = kcalloc(BARRIER_BUCKETS_NR,
824e47da 2982 sizeof(atomic_t), GFP_KERNEL);
fd76863e 2983 if (!conf->barrier)
2984 goto abort;
2985
0eaf822c 2986 conf->mirrors = kzalloc(sizeof(struct raid1_info)
8f19ccb2 2987 * mddev->raid_disks * 2,
1da177e4
LT
2988 GFP_KERNEL);
2989 if (!conf->mirrors)
709ae487 2990 goto abort;
1da177e4 2991
ddaf22ab
N
2992 conf->tmppage = alloc_page(GFP_KERNEL);
2993 if (!conf->tmppage)
709ae487 2994 goto abort;
ddaf22ab 2995
709ae487 2996 conf->poolinfo = kzalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
1da177e4 2997 if (!conf->poolinfo)
709ae487 2998 goto abort;
8f19ccb2 2999 conf->poolinfo->raid_disks = mddev->raid_disks * 2;
1da177e4
LT
3000 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
3001 r1bio_pool_free,
3002 conf->poolinfo);
3003 if (!conf->r1bio_pool)
709ae487
N
3004 goto abort;
3005
ed9bfdf1 3006 conf->poolinfo->mddev = mddev;
1da177e4 3007
c19d5798 3008 err = -EINVAL;
e7e72bf6 3009 spin_lock_init(&conf->device_lock);
dafb20fa 3010 rdev_for_each(rdev, mddev) {
aba336bd 3011 struct request_queue *q;
709ae487 3012 int disk_idx = rdev->raid_disk;
1da177e4
LT
3013 if (disk_idx >= mddev->raid_disks
3014 || disk_idx < 0)
3015 continue;
c19d5798 3016 if (test_bit(Replacement, &rdev->flags))
02b898f2 3017 disk = conf->mirrors + mddev->raid_disks + disk_idx;
c19d5798
N
3018 else
3019 disk = conf->mirrors + disk_idx;
1da177e4 3020
c19d5798
N
3021 if (disk->rdev)
3022 goto abort;
1da177e4 3023 disk->rdev = rdev;
aba336bd 3024 q = bdev_get_queue(rdev->bdev);
1da177e4
LT
3025
3026 disk->head_position = 0;
12cee5a8 3027 disk->seq_start = MaxSector;
1da177e4
LT
3028 }
3029 conf->raid_disks = mddev->raid_disks;
3030 conf->mddev = mddev;
1da177e4 3031 INIT_LIST_HEAD(&conf->retry_list);
55ce74d4 3032 INIT_LIST_HEAD(&conf->bio_end_io_list);
1da177e4
LT
3033
3034 spin_lock_init(&conf->resync_lock);
17999be4 3035 init_waitqueue_head(&conf->wait_barrier);
1da177e4 3036
191ea9b2 3037 bio_list_init(&conf->pending_bio_list);
34db0cd6 3038 conf->pending_count = 0;
d890fa2b 3039 conf->recovery_disabled = mddev->recovery_disabled - 1;
191ea9b2 3040
c19d5798 3041 err = -EIO;
8f19ccb2 3042 for (i = 0; i < conf->raid_disks * 2; i++) {
1da177e4
LT
3043
3044 disk = conf->mirrors + i;
3045
c19d5798
N
3046 if (i < conf->raid_disks &&
3047 disk[conf->raid_disks].rdev) {
3048 /* This slot has a replacement. */
3049 if (!disk->rdev) {
3050 /* No original, just make the replacement
3051 * a recovering spare
3052 */
3053 disk->rdev =
3054 disk[conf->raid_disks].rdev;
3055 disk[conf->raid_disks].rdev = NULL;
3056 } else if (!test_bit(In_sync, &disk->rdev->flags))
3057 /* Original is not in_sync - bad */
3058 goto abort;
3059 }
3060
5fd6c1dc
N
3061 if (!disk->rdev ||
3062 !test_bit(In_sync, &disk->rdev->flags)) {
1da177e4 3063 disk->head_position = 0;
4f0a5e01
JB
3064 if (disk->rdev &&
3065 (disk->rdev->saved_raid_disk < 0))
918f0238 3066 conf->fullsync = 1;
be4d3280 3067 }
1da177e4 3068 }
709ae487 3069
709ae487 3070 err = -ENOMEM;
0232605d 3071 conf->thread = md_register_thread(raid1d, mddev, "raid1");
1d41c216 3072 if (!conf->thread)
709ae487 3073 goto abort;
1da177e4 3074
709ae487
N
3075 return conf;
3076
3077 abort:
3078 if (conf) {
644df1a8 3079 mempool_destroy(conf->r1bio_pool);
709ae487
N
3080 kfree(conf->mirrors);
3081 safe_put_page(conf->tmppage);
3082 kfree(conf->poolinfo);
fd76863e 3083 kfree(conf->nr_pending);
3084 kfree(conf->nr_waiting);
3085 kfree(conf->nr_queued);
3086 kfree(conf->barrier);
709ae487
N
3087 kfree(conf);
3088 }
3089 return ERR_PTR(err);
3090}
3091
afa0f557 3092static void raid1_free(struct mddev *mddev, void *priv);
849674e4 3093static int raid1_run(struct mddev *mddev)
709ae487 3094{
e8096360 3095 struct r1conf *conf;
709ae487 3096 int i;
3cb03002 3097 struct md_rdev *rdev;
5220ea1e 3098 int ret;
2ff8cc2c 3099 bool discard_supported = false;
709ae487
N
3100
3101 if (mddev->level != 1) {
1d41c216
N
3102 pr_warn("md/raid1:%s: raid level not set to mirroring (%d)\n",
3103 mdname(mddev), mddev->level);
709ae487
N
3104 return -EIO;
3105 }
3106 if (mddev->reshape_position != MaxSector) {
1d41c216
N
3107 pr_warn("md/raid1:%s: reshape_position set but not supported\n",
3108 mdname(mddev));
709ae487
N
3109 return -EIO;
3110 }
1da177e4 3111 /*
709ae487
N
3112 * copy the already verified devices into our private RAID1
3113 * bookkeeping area. [whatever we allocate in run(),
afa0f557 3114 * should be freed in raid1_free()]
1da177e4 3115 */
709ae487
N
3116 if (mddev->private == NULL)
3117 conf = setup_conf(mddev);
3118 else
3119 conf = mddev->private;
1da177e4 3120
709ae487
N
3121 if (IS_ERR(conf))
3122 return PTR_ERR(conf);
1da177e4 3123
c8dc9c65 3124 if (mddev->queue)
5026d7a9
PA
3125 blk_queue_max_write_same_sectors(mddev->queue, 0);
3126
dafb20fa 3127 rdev_for_each(rdev, mddev) {
1ed7242e
JB
3128 if (!mddev->gendisk)
3129 continue;
709ae487
N
3130 disk_stack_limits(mddev->gendisk, rdev->bdev,
3131 rdev->data_offset << 9);
2ff8cc2c
SL
3132 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3133 discard_supported = true;
1da177e4 3134 }
191ea9b2 3135
709ae487
N
3136 mddev->degraded = 0;
3137 for (i=0; i < conf->raid_disks; i++)
3138 if (conf->mirrors[i].rdev == NULL ||
3139 !test_bit(In_sync, &conf->mirrors[i].rdev->flags) ||
3140 test_bit(Faulty, &conf->mirrors[i].rdev->flags))
3141 mddev->degraded++;
3142
3143 if (conf->raid_disks - mddev->degraded == 1)
3144 mddev->recovery_cp = MaxSector;
3145
8c6ac868 3146 if (mddev->recovery_cp != MaxSector)
1d41c216
N
3147 pr_info("md/raid1:%s: not clean -- starting background reconstruction\n",
3148 mdname(mddev));
3149 pr_info("md/raid1:%s: active with %d out of %d mirrors\n",
f72ffdd6 3150 mdname(mddev), mddev->raid_disks - mddev->degraded,
1da177e4 3151 mddev->raid_disks);
709ae487 3152
1da177e4
LT
3153 /*
3154 * Ok, everything is just fine now
3155 */
709ae487
N
3156 mddev->thread = conf->thread;
3157 conf->thread = NULL;
3158 mddev->private = conf;
46533ff7 3159 set_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
709ae487 3160
1f403624 3161 md_set_array_sectors(mddev, raid1_size(mddev, 0, 0));
1da177e4 3162
1ed7242e 3163 if (mddev->queue) {
2ff8cc2c
SL
3164 if (discard_supported)
3165 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3166 mddev->queue);
3167 else
3168 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3169 mddev->queue);
1ed7242e 3170 }
5220ea1e 3171
3172 ret = md_integrity_register(mddev);
5aa61f42
N
3173 if (ret) {
3174 md_unregister_thread(&mddev->thread);
afa0f557 3175 raid1_free(mddev, conf);
5aa61f42 3176 }
5220ea1e 3177 return ret;
1da177e4
LT
3178}
3179
afa0f557 3180static void raid1_free(struct mddev *mddev, void *priv)
1da177e4 3181{
afa0f557 3182 struct r1conf *conf = priv;
409c57f3 3183
644df1a8 3184 mempool_destroy(conf->r1bio_pool);
990a8baf 3185 kfree(conf->mirrors);
0fea7ed8 3186 safe_put_page(conf->tmppage);
990a8baf 3187 kfree(conf->poolinfo);
fd76863e 3188 kfree(conf->nr_pending);
3189 kfree(conf->nr_waiting);
3190 kfree(conf->nr_queued);
3191 kfree(conf->barrier);
1da177e4 3192 kfree(conf);
1da177e4
LT
3193}
3194
fd01b88c 3195static int raid1_resize(struct mddev *mddev, sector_t sectors)
1da177e4
LT
3196{
3197 /* no resync is happening, and there is enough space
3198 * on all devices, so we can resize.
3199 * We need to make sure resync covers any new space.
3200 * If the array is shrinking we should possibly wait until
3201 * any io in the removed space completes, but it hardly seems
3202 * worth it.
3203 */
a4a6125a
N
3204 sector_t newsize = raid1_size(mddev, sectors, 0);
3205 if (mddev->external_size &&
3206 mddev->array_sectors > newsize)
b522adcd 3207 return -EINVAL;
a4a6125a
N
3208 if (mddev->bitmap) {
3209 int ret = bitmap_resize(mddev->bitmap, newsize, 0, 0);
3210 if (ret)
3211 return ret;
3212 }
3213 md_set_array_sectors(mddev, newsize);
b522adcd 3214 if (sectors > mddev->dev_sectors &&
b098636c 3215 mddev->recovery_cp > mddev->dev_sectors) {
58c0fed4 3216 mddev->recovery_cp = mddev->dev_sectors;
1da177e4
LT
3217 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3218 }
b522adcd 3219 mddev->dev_sectors = sectors;
4b5c7ae8 3220 mddev->resync_max_sectors = sectors;
1da177e4
LT
3221 return 0;
3222}
3223
fd01b88c 3224static int raid1_reshape(struct mddev *mddev)
1da177e4
LT
3225{
3226 /* We need to:
3227 * 1/ resize the r1bio_pool
3228 * 2/ resize conf->mirrors
3229 *
3230 * We allocate a new r1bio_pool if we can.
3231 * Then raise a device barrier and wait until all IO stops.
3232 * Then resize conf->mirrors and swap in the new r1bio pool.
6ea9c07c
N
3233 *
3234 * At the same time, we "pack" the devices so that all the missing
3235 * devices have the higher raid_disk numbers.
1da177e4
LT
3236 */
3237 mempool_t *newpool, *oldpool;
3238 struct pool_info *newpoolinfo;
0eaf822c 3239 struct raid1_info *newmirrors;
e8096360 3240 struct r1conf *conf = mddev->private;
63c70c4f 3241 int cnt, raid_disks;
c04be0aa 3242 unsigned long flags;
b5470dc5 3243 int d, d2, err;
1da177e4 3244
63c70c4f 3245 /* Cannot change chunk_size, layout, or level */
664e7c41 3246 if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
63c70c4f
N
3247 mddev->layout != mddev->new_layout ||
3248 mddev->level != mddev->new_level) {
664e7c41 3249 mddev->new_chunk_sectors = mddev->chunk_sectors;
63c70c4f
N
3250 mddev->new_layout = mddev->layout;
3251 mddev->new_level = mddev->level;
3252 return -EINVAL;
3253 }
3254
28c1b9fd
GR
3255 if (!mddev_is_clustered(mddev)) {
3256 err = md_allow_write(mddev);
3257 if (err)
3258 return err;
3259 }
2a2275d6 3260
63c70c4f
N
3261 raid_disks = mddev->raid_disks + mddev->delta_disks;
3262
6ea9c07c
N
3263 if (raid_disks < conf->raid_disks) {
3264 cnt=0;
3265 for (d= 0; d < conf->raid_disks; d++)
3266 if (conf->mirrors[d].rdev)
3267 cnt++;
3268 if (cnt > raid_disks)
1da177e4 3269 return -EBUSY;
6ea9c07c 3270 }
1da177e4
LT
3271
3272 newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
3273 if (!newpoolinfo)
3274 return -ENOMEM;
3275 newpoolinfo->mddev = mddev;
8f19ccb2 3276 newpoolinfo->raid_disks = raid_disks * 2;
1da177e4
LT
3277
3278 newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
3279 r1bio_pool_free, newpoolinfo);
3280 if (!newpool) {
3281 kfree(newpoolinfo);
3282 return -ENOMEM;
3283 }
0eaf822c 3284 newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
8f19ccb2 3285 GFP_KERNEL);
1da177e4
LT
3286 if (!newmirrors) {
3287 kfree(newpoolinfo);
3288 mempool_destroy(newpool);
3289 return -ENOMEM;
3290 }
1da177e4 3291
e2d59925 3292 freeze_array(conf, 0);
1da177e4
LT
3293
3294 /* ok, everything is stopped */
3295 oldpool = conf->r1bio_pool;
3296 conf->r1bio_pool = newpool;
6ea9c07c 3297
a88aa786 3298 for (d = d2 = 0; d < conf->raid_disks; d++) {
3cb03002 3299 struct md_rdev *rdev = conf->mirrors[d].rdev;
a88aa786 3300 if (rdev && rdev->raid_disk != d2) {
36fad858 3301 sysfs_unlink_rdev(mddev, rdev);
a88aa786 3302 rdev->raid_disk = d2;
36fad858
NK
3303 sysfs_unlink_rdev(mddev, rdev);
3304 if (sysfs_link_rdev(mddev, rdev))
1d41c216
N
3305 pr_warn("md/raid1:%s: cannot register rd%d\n",
3306 mdname(mddev), rdev->raid_disk);
6ea9c07c 3307 }
a88aa786
N
3308 if (rdev)
3309 newmirrors[d2++].rdev = rdev;
3310 }
1da177e4
LT
3311 kfree(conf->mirrors);
3312 conf->mirrors = newmirrors;
3313 kfree(conf->poolinfo);
3314 conf->poolinfo = newpoolinfo;
3315
c04be0aa 3316 spin_lock_irqsave(&conf->device_lock, flags);
1da177e4 3317 mddev->degraded += (raid_disks - conf->raid_disks);
c04be0aa 3318 spin_unlock_irqrestore(&conf->device_lock, flags);
1da177e4 3319 conf->raid_disks = mddev->raid_disks = raid_disks;
63c70c4f 3320 mddev->delta_disks = 0;
1da177e4 3321
e2d59925 3322 unfreeze_array(conf);
1da177e4 3323
985ca973 3324 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
1da177e4
LT
3325 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3326 md_wakeup_thread(mddev->thread);
3327
3328 mempool_destroy(oldpool);
3329 return 0;
3330}
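/*
 * Editor's sketch (illustrative, not part of raid1.c): the "packing" step
 * described in the comment at the top of raid1_reshape().  Present devices
 * are compacted into the low slots of the new mirrors array, so any missing
 * devices implicitly end up with the highest raid_disk numbers.  The helper
 * below works on bare rdev pointers to keep the idea visible; it is not the
 * form the driver uses.
 */
static int example_pack_rdevs(struct md_rdev **old, int old_disks,
			      struct md_rdev **packed)
{
	int d, d2 = 0;

	for (d = 0; d < old_disks; d++)
		if (old[d])
			packed[d2++] = old[d];	/* preserve order, drop gaps */
	return d2;	/* number of slots actually occupied */
}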
3331
fd01b88c 3332static void raid1_quiesce(struct mddev *mddev, int state)
36fa3063 3333{
e8096360 3334 struct r1conf *conf = mddev->private;
36fa3063
N
3335
3336 switch(state) {
6eef4b21
N
3337 case 2: /* wake for suspend */
3338 wake_up(&conf->wait_barrier);
3339 break;
9e6603da 3340 case 1:
07169fd4 3341 freeze_array(conf, 0);
36fa3063 3342 break;
9e6603da 3343 case 0:
07169fd4 3344 unfreeze_array(conf);
36fa3063
N
3345 break;
3346 }
36fa3063
N
3347}
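/*
 * Editor's sketch (not from md.c): the quiesce contract as a caller sees
 * it.  State 1 freezes the array (all normal I/O drained and blocked via
 * freeze_array()), state 0 unfreezes it, and state 2 merely wakes anyone
 * sleeping on the barrier ("wake for suspend" in the driver's words).  The
 * wrapper below is hypothetical; it only shows the usual 1/0 bracketing.
 */
static void example_run_quiesced(struct mddev *mddev,
				 void (*critical_section)(struct mddev *))
{
	mddev->pers->quiesce(mddev, 1);	/* no array I/O in flight after this */
	critical_section(mddev);
	mddev->pers->quiesce(mddev, 0);	/* let normal I/O resume */
}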
3348
fd01b88c 3349static void *raid1_takeover(struct mddev *mddev)
709ae487
N
3350{
3351 /* raid1 can take over:
3352 * raid5 with 2 devices, any layout or chunk size
3353 */
3354 if (mddev->level == 5 && mddev->raid_disks == 2) {
e8096360 3355 struct r1conf *conf;
709ae487
N
3356 mddev->new_level = 1;
3357 mddev->new_layout = 0;
3358 mddev->new_chunk_sectors = 0;
3359 conf = setup_conf(mddev);
6995f0b2 3360 if (!IS_ERR(conf)) {
07169fd4 3361 /* Array must appear to be quiesced */
3362 conf->array_frozen = 1;
394ed8e4
SL
3363 mddev_clear_unsupported_flags(mddev,
3364 UNSUPPORTED_MDDEV_FLAGS);
6995f0b2 3365 }
709ae487
N
3366 return conf;
3367 }
3368 return ERR_PTR(-EINVAL);
3369}
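/*
 * Editor's sketch (illustrative): the geometry reset performed by
 * raid1_takeover() above.  A 2-device RAID5 already stores each block's
 * mirror (the parity of a single chunk is the chunk itself), and RAID1 has
 * no chunking or layout, so the takeover only needs to zero those fields
 * before building a fresh conf.  The helper name is hypothetical; from user
 * space this path is typically reached by something like
 * "mdadm --grow /dev/mdX --level=1" on a 2-device RAID5.
 */
static void example_reset_geometry_for_raid1(struct mddev *mddev)
{
	mddev->new_level = 1;		/* target personality: raid1 */
	mddev->new_layout = 0;		/* raid1 has no layout */
	mddev->new_chunk_sectors = 0;	/* ...and no chunk size */
}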
1da177e4 3370
84fc4b56 3371static struct md_personality raid1_personality =
1da177e4
LT
3372{
3373 .name = "raid1",
2604b703 3374 .level = 1,
1da177e4 3375 .owner = THIS_MODULE,
849674e4
SL
3376 .make_request = raid1_make_request,
3377 .run = raid1_run,
afa0f557 3378 .free = raid1_free,
849674e4
SL
3379 .status = raid1_status,
3380 .error_handler = raid1_error,
1da177e4
LT
3381 .hot_add_disk = raid1_add_disk,
3382 .hot_remove_disk= raid1_remove_disk,
3383 .spare_active = raid1_spare_active,
849674e4 3384 .sync_request = raid1_sync_request,
1da177e4 3385 .resize = raid1_resize,
80c3a6ce 3386 .size = raid1_size,
63c70c4f 3387 .check_reshape = raid1_reshape,
36fa3063 3388 .quiesce = raid1_quiesce,
709ae487 3389 .takeover = raid1_takeover,
5c675f83 3390 .congested = raid1_congested,
1da177e4
LT
3391};
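/*
 * Editor's sketch (not from md.c): the ops table above is the entire
 * interface the md core uses to drive this personality.  Once an array is
 * bound to it, calls are dispatched through mddev->pers; the wrapper below
 * is an illustrative example of such a dispatch, not a real md helper.
 */
static int example_dispatch_resize(struct mddev *mddev, sector_t sectors)
{
	if (mddev->pers && mddev->pers->resize)
		return mddev->pers->resize(mddev, sectors); /* raid1_resize() here */
	return -EINVAL;
}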
3392
3393static int __init raid_init(void)
3394{
2604b703 3395 return register_md_personality(&raid1_personality);
1da177e4
LT
3396}
3397
3398static void raid_exit(void)
3399{
2604b703 3400 unregister_md_personality(&raid1_personality);
1da177e4
LT
3401}
3402
3403module_init(raid_init);
3404module_exit(raid_exit);
3405MODULE_LICENSE("GPL");
0efb9e61 3406MODULE_DESCRIPTION("RAID1 (mirroring) personality for MD");
1da177e4 3407MODULE_ALIAS("md-personality-3"); /* RAID1 */
d9d166c2 3408MODULE_ALIAS("md-raid1");
2604b703 3409MODULE_ALIAS("md-level-1");
34db0cd6
N
3410
3411module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);
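/*
 * Editor's sketch (illustrative, not from md.c; needs <linux/kmod.h>): why
 * the MODULE_ALIAS lines above matter.  When an array of level 1 is started
 * and no matching personality is registered yet, the md core requests the
 * module by alias, roughly as below, and "md-level-1" / "md-raid1" resolve
 * to this module.  Separately, module_param() above exposes the setting as
 * /sys/module/raid1/parameters/max_queued_requests, readable by all and
 * writable by root.
 */
static void example_autoload_personality(int level)	/* level == 1 for RAID1 */
{
	request_module("md-level-%d", level);	/* matched by MODULE_ALIAS("md-level-1") */
}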