#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/ratelimit.h>
+#include <linux/sched/signal.h>
+
#include <trace/events/block.h>
+
#include "md.h"
#include "raid1.h"
#include "bitmap.h"
#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
- (1L << MD_JOURNAL_CLEAN))
+ (1L << MD_JOURNAL_CLEAN) | \
+ (1L << MD_HAS_PPL))
/*
* Number of guaranteed r1bios in case of extreme VM load:
#define raid1_log(md, fmt, args...) \
do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
+/*
+ * 'strct resync_pages' stores actual pages used for doing the resync
+ * IO, and it is per-bio, so make .bi_private points to it.
+ */
+static inline struct resync_pages *get_resync_pages(struct bio *bio)
+{
+ return bio->bi_private;
+}
+
+/*
+ * for resync bio, r1bio pointer can be retrieved from the per-bio
+ * 'struct resync_pages'.
+ */
+static inline struct r1bio *get_resync_r1bio(struct bio *bio)
+{
+ return get_resync_pages(bio)->raid_bio;
+}
+
static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
{
struct pool_info *pi = data;
kfree(r1_bio);
}
-#define RESYNC_BLOCK_SIZE (64*1024)
#define RESYNC_DEPTH 32
#define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
-#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH)
#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9)
#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW)
struct r1bio *r1_bio;
struct bio *bio;
int need_pages;
- int i, j;
+ int j;
+ struct resync_pages *rps;
r1_bio = r1bio_pool_alloc(gfp_flags, pi);
if (!r1_bio)
return NULL;
+ rps = kmalloc(sizeof(struct resync_pages) * pi->raid_disks,
+ gfp_flags);
+ if (!rps)
+ goto out_free_r1bio;
+
/*
* Allocate bios : 1 for reading, n-1 for writing
*/
need_pages = pi->raid_disks;
else
need_pages = 1;
- for (j = 0; j < need_pages; j++) {
+ for (j = 0; j < pi->raid_disks; j++) {
+ struct resync_pages *rp = &rps[j];
+
bio = r1_bio->bios[j];
- bio->bi_vcnt = RESYNC_PAGES;
- if (bio_alloc_pages(bio, gfp_flags))
- goto out_free_pages;
- }
- /* If not user-requests, copy the page pointers to all bios */
- if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
- for (i=0; i<RESYNC_PAGES ; i++)
- for (j=1; j<pi->raid_disks; j++)
- r1_bio->bios[j]->bi_io_vec[i].bv_page =
- r1_bio->bios[0]->bi_io_vec[i].bv_page;
+ if (j < need_pages) {
+ if (resync_alloc_pages(rp, gfp_flags))
+ goto out_free_pages;
+ } else {
+ memcpy(rp, &rps[0], sizeof(*rp));
+ resync_get_all_pages(rp);
+ }
+
+ rp->idx = 0;
+ rp->raid_bio = r1_bio;
+ bio->bi_private = rp;
}
r1_bio->master_bio = NULL;
out_free_pages:
while (--j >= 0)
- bio_free_pages(r1_bio->bios[j]);
+ resync_free_pages(&rps[j]);
out_free_bio:
while (++j < pi->raid_disks)
bio_put(r1_bio->bios[j]);
+ kfree(rps);
+
+out_free_r1bio:
r1bio_pool_free(r1_bio, data);
return NULL;
}
static void r1buf_pool_free(void *__r1_bio, void *data)
{
struct pool_info *pi = data;
- int i,j;
+ int i;
struct r1bio *r1bio = __r1_bio;
+ struct resync_pages *rp = NULL;
- for (i = 0; i < RESYNC_PAGES; i++)
- for (j = pi->raid_disks; j-- ;) {
- if (j == 0 ||
- r1bio->bios[j]->bi_io_vec[i].bv_page !=
- r1bio->bios[0]->bi_io_vec[i].bv_page)
- safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
- }
- for (i=0 ; i < pi->raid_disks; i++)
+ for (i = pi->raid_disks; i--; ) {
+ rp = get_resync_pages(r1bio->bios[i]);
+ resync_free_pages(rp);
bio_put(r1bio->bios[i]);
+ }
+
+ /* resync pages array stored in the 1st bio's .bi_private */
+ kfree(rp);
r1bio_pool_free(r1bio, data);
}
static void call_bio_endio(struct r1bio *r1_bio)
{
struct bio *bio = r1_bio->master_bio;
- int done;
struct r1conf *conf = r1_bio->mddev->private;
- sector_t bi_sector = bio->bi_iter.bi_sector;
-
- if (bio->bi_phys_segments) {
- unsigned long flags;
- spin_lock_irqsave(&conf->device_lock, flags);
- bio->bi_phys_segments--;
- done = (bio->bi_phys_segments == 0);
- spin_unlock_irqrestore(&conf->device_lock, flags);
- /*
- * make_request() might be waiting for
- * bi_phys_segments to decrease
- */
- wake_up(&conf->wait_barrier);
- } else
- done = 1;
if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
bio->bi_error = -EIO;
- if (done) {
- bio_endio(bio);
- /*
- * Wake up any possible resync thread that waits for the device
- * to go idle.
- */
- allow_barrier(conf, bi_sector);
- }
+ bio_endio(bio);
+ /*
+ * Wake up any possible resync thread that waits for the device
+ * to go idle.
+ */
+ allow_barrier(conf, r1_bio->sector);
}
static void raid_end_bio_io(struct r1bio *r1_bio)
{
/* it really is the end of this request */
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
- /* free extra copy of the data pages */
- int i = r1_bio->behind_page_count;
- while (i--)
- safe_put_page(r1_bio->behind_bvecs[i].bv_page);
- kfree(r1_bio->behind_bvecs);
- r1_bio->behind_bvecs = NULL;
+ bio_free_pages(r1_bio->behind_master_bio);
+ bio_put(r1_bio->behind_master_bio);
+ r1_bio->behind_master_bio = NULL;
}
/* clear the bitmap if all writes complete successfully */
bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
}
if (behind) {
+ /* we release behind master bio when all write are done */
+ if (r1_bio->behind_master_bio == bio)
+ to_put = NULL;
+
if (test_bit(WriteMostly, &rdev->flags))
atomic_dec(&r1_bio->behind_remaining);
spin_unlock_irq(&conf->resync_lock);
}
+static void inc_pending(struct r1conf *conf, sector_t bi_sector)
+{
+ /* The current request requires multiple r1_bio, so
+ * we need to increment the pending count, and the corresponding
+ * window count.
+ */
+ int idx = sector_to_idx(bi_sector);
+ atomic_inc(&conf->nr_pending[idx]);
+}
+
static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
{
int idx = sector_to_idx(sector_nr);
static void freeze_array(struct r1conf *conf, int extra)
{
/* Stop sync I/O and normal I/O and wait for everything to
- * go quite.
+ * go quiet.
* This is called in two situations:
* 1) management command handlers (reshape, remove disk, quiesce).
* 2) one normal I/O request failed.
wake_up(&conf->wait_barrier);
}
-/* duplicate the data pages for behind I/O
- */
-static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
+static struct bio *alloc_behind_master_bio(struct r1bio *r1_bio,
+ struct bio *bio,
+ int offset, int size)
{
- int i;
- struct bio_vec *bvec;
- struct bio_vec *bvecs = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
- GFP_NOIO);
- if (unlikely(!bvecs))
- return;
+ unsigned vcnt = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ int i = 0;
+ struct bio *behind_bio = NULL;
+
+ behind_bio = bio_alloc_mddev(GFP_NOIO, vcnt, r1_bio->mddev);
+ if (!behind_bio)
+ goto fail;
+
+ /* discard op, we don't support writezero/writesame yet */
+ if (!bio_has_data(bio))
+ goto skip_copy;
+
+ while (i < vcnt && size) {
+ struct page *page;
+ int len = min_t(int, PAGE_SIZE, size);
- bio_for_each_segment_all(bvec, bio, i) {
- bvecs[i] = *bvec;
- bvecs[i].bv_page = alloc_page(GFP_NOIO);
- if (unlikely(!bvecs[i].bv_page))
- goto do_sync_io;
- memcpy(kmap(bvecs[i].bv_page) + bvec->bv_offset,
- kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
- kunmap(bvecs[i].bv_page);
- kunmap(bvec->bv_page);
- }
- r1_bio->behind_bvecs = bvecs;
- r1_bio->behind_page_count = bio->bi_vcnt;
+ page = alloc_page(GFP_NOIO);
+ if (unlikely(!page))
+ goto free_pages;
+
+ bio_add_page(behind_bio, page, len, 0);
+
+ size -= len;
+ i++;
+ }
+
+ bio_copy_data_partial(behind_bio, bio, offset,
+ behind_bio->bi_iter.bi_size);
+skip_copy:
+ r1_bio->behind_master_bio = behind_bio;;
set_bit(R1BIO_BehindIO, &r1_bio->state);
- return;
-do_sync_io:
- for (i = 0; i < bio->bi_vcnt; i++)
- if (bvecs[i].bv_page)
- put_page(bvecs[i].bv_page);
- kfree(bvecs);
+ return behind_bio;
+
+free_pages:
pr_debug("%dB behind alloc failed, doing sync I/O\n",
bio->bi_iter.bi_size);
+ bio_free_pages(behind_bio);
+fail:
+ return behind_bio;
}
struct raid1_plug_cb {
r1_bio = alloc_r1bio(mddev, bio, 0);
- /*
- * We might need to issue multiple reads to different
- * devices if there are bad blocks around, so we keep
- * track of the number of reads in bio->bi_phys_segments.
- * If this is 0, there is only one r1_bio and no locking
- * will be needed when requests complete. If it is
- * non-zero, then it is the number of not-completed requests.
- */
- bio->bi_phys_segments = 0;
- bio_clear_flag(bio, BIO_SEG_VALID);
-
/*
* make_request() can abort the operation when read-ahead is being
* used and no empty request is available.
sectors_handled = (r1_bio->sector + max_sectors
- bio->bi_iter.bi_sector);
r1_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (bio->bi_phys_segments == 0)
- bio->bi_phys_segments = 2;
- else
- bio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
+ bio_inc_remaining(bio);
/*
* Cannot call generic_make_request directly as that will be
int first_clone;
int sectors_handled;
int max_sectors;
+ sector_t offset;
/*
* Register the new request and wait if the reconstruction
r1_bio = alloc_r1bio(mddev, bio, 0);
- /* We might need to issue multiple writes to different
- * devices if there are bad blocks around, so we keep
- * track of the number of writes in bio->bi_phys_segments.
- * If this is 0, there is only one r1_bio and no locking
- * will be needed when requests complete. If it is
- * non-zero, then it is the number of not-completed requests.
- */
- bio->bi_phys_segments = 0;
- bio_clear_flag(bio, BIO_SEG_VALID);
-
if (conf->pending_count >= max_queued_requests) {
md_wakeup_thread(mddev->thread);
raid1_log(mddev, "wait queued");
goto retry_write;
}
- if (max_sectors < r1_bio->sectors) {
- /* We are splitting this write into multiple parts, so
- * we need to prepare for allocating another r1_bio.
- */
+ if (max_sectors < r1_bio->sectors)
r1_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (bio->bi_phys_segments == 0)
- bio->bi_phys_segments = 2;
- else
- bio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
- }
+
sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector;
atomic_set(&r1_bio->remaining, 1);
atomic_set(&r1_bio->behind_remaining, 0);
first_clone = 1;
+
+ offset = r1_bio->sector - bio->bi_iter.bi_sector;
for (i = 0; i < disks; i++) {
struct bio *mbio = NULL;
- sector_t offset;
if (!r1_bio->bios[i])
continue;
- offset = r1_bio->sector - bio->bi_iter.bi_sector;
if (first_clone) {
/* do behind I/O ?
(atomic_read(&bitmap->behind_writes)
< mddev->bitmap_info.max_write_behind) &&
!waitqueue_active(&bitmap->behind_wait)) {
- mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
- mddev->bio_set,
- offset << 9,
- max_sectors << 9);
- alloc_behind_pages(mbio, r1_bio);
+ mbio = alloc_behind_master_bio(r1_bio, bio,
+ offset << 9,
+ max_sectors << 9);
}
bitmap_startwrite(bitmap, r1_bio->sector,
}
if (!mbio) {
- if (r1_bio->behind_bvecs)
- mbio = bio_clone_bioset_partial(bio, GFP_NOIO,
- mddev->bio_set,
- offset << 9,
- max_sectors << 9);
+ if (r1_bio->behind_master_bio)
+ mbio = bio_clone_fast(r1_bio->behind_master_bio,
+ GFP_NOIO,
+ mddev->bio_set);
else {
mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set);
bio_trim(mbio, offset, max_sectors);
}
}
- if (r1_bio->behind_bvecs) {
- struct bio_vec *bvec;
- int j;
-
- /*
- * We trimmed the bio, so _all is legit
- */
- bio_for_each_segment_all(bvec, mbio, j)
- bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
+ if (r1_bio->behind_master_bio) {
if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
atomic_inc(&r1_bio->behind_remaining);
}
* as it could result in the bio being freed.
*/
if (sectors_handled < bio_sectors(bio)) {
+ /* We need another r1_bio, which must be counted */
+ sector_t sect = bio->bi_iter.bi_sector + sectors_handled;
+
+ inc_pending(conf, sect);
+ bio_inc_remaining(bio);
r1_bio_write_done(r1_bio);
- /* We need another r1_bio. It has already been counted
- * in bio->bi_phys_segments
- */
r1_bio = alloc_r1bio(mddev, bio, sectors_handled);
goto retry_write;
}
split = bio;
}
- if (bio_data_dir(split) == READ)
+ if (bio_data_dir(split) == READ) {
raid1_read_request(mddev, split);
- else
+
+ /*
+ * If a bio is splitted, the first part of bio will
+ * pass barrier but the bio is queued in
+ * current->bio_list (see generic_make_request). If
+ * there is a raise_barrier() called here, the second
+ * part of bio can't pass barrier. But since the first
+ * part bio isn't dispatched to underlaying disks yet,
+ * the barrier is never released, hence raise_barrier
+ * will alays wait. We have a deadlock.
+ * Note, this only happens in read path. For write
+ * path, the first part of bio is dispatched in a
+ * schedule() call (because of blk plug) or offloaded
+ * to raid10d.
+ * Quitting from the function immediately can change
+ * the bio order queued in bio_list and avoid the deadlock.
+ */
+ if (split != bio) {
+ generic_make_request(bio);
+ break;
+ }
+ } else
raid1_write_request(mddev, split);
} while (split != bio);
}
static void end_sync_read(struct bio *bio)
{
- struct r1bio *r1_bio = bio->bi_private;
+ struct r1bio *r1_bio = get_resync_r1bio(bio);
update_head_pos(r1_bio->read_disk, r1_bio);
static void end_sync_write(struct bio *bio)
{
int uptodate = !bio->bi_error;
- struct r1bio *r1_bio = bio->bi_private;
+ struct r1bio *r1_bio = get_resync_r1bio(bio);
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
sector_t first_bad;
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
struct bio *bio = r1_bio->bios[r1_bio->read_disk];
+ struct page **pages = get_resync_pages(bio)->pages;
sector_t sect = r1_bio->sector;
int sectors = r1_bio->sectors;
int idx = 0;
*/
rdev = conf->mirrors[d].rdev;
if (sync_page_io(rdev, sect, s<<9,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
REQ_OP_READ, 0, false)) {
success = 1;
break;
continue;
rdev = conf->mirrors[d].rdev;
if (r1_sync_page_io(rdev, sect, s,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
WRITE) == 0) {
r1_bio->bios[d]->bi_end_io = NULL;
rdev_dec_pending(rdev, mddev);
continue;
rdev = conf->mirrors[d].rdev;
if (r1_sync_page_io(rdev, sect, s,
- bio->bi_io_vec[idx].bv_page,
+ pages[idx],
READ) != 0)
atomic_add(s, &rdev->corrected_errors);
}
int j;
int size;
int error;
+ struct bio_vec *bi;
struct bio *b = r1_bio->bios[i];
+ struct resync_pages *rp = get_resync_pages(b);
if (b->bi_end_io != end_sync_read)
continue;
/* fixup the bio for reuse, but preserve errno */
conf->mirrors[i].rdev->data_offset;
b->bi_bdev = conf->mirrors[i].rdev->bdev;
b->bi_end_io = end_sync_read;
- b->bi_private = r1_bio;
+ rp->raid_bio = r1_bio;
+ b->bi_private = rp;
size = b->bi_iter.bi_size;
- for (j = 0; j < vcnt ; j++) {
- struct bio_vec *bi;
- bi = &b->bi_io_vec[j];
+ bio_for_each_segment_all(bi, b, j) {
bi->bv_offset = 0;
if (size > PAGE_SIZE)
bi->bv_len = PAGE_SIZE;
struct bio *pbio = r1_bio->bios[primary];
struct bio *sbio = r1_bio->bios[i];
int error = sbio->bi_error;
+ struct page **ppages = get_resync_pages(pbio)->pages;
+ struct page **spages = get_resync_pages(sbio)->pages;
+ struct bio_vec *bi;
+ int page_len[RESYNC_PAGES] = { 0 };
if (sbio->bi_end_io != end_sync_read)
continue;
/* Now we can 'fixup' the error value */
sbio->bi_error = 0;
+ bio_for_each_segment_all(bi, sbio, j)
+ page_len[j] = bi->bv_len;
+
if (!error) {
for (j = vcnt; j-- ; ) {
- struct page *p, *s;
- p = pbio->bi_io_vec[j].bv_page;
- s = sbio->bi_io_vec[j].bv_page;
- if (memcmp(page_address(p),
- page_address(s),
- sbio->bi_io_vec[j].bv_len))
+ if (memcmp(page_address(ppages[j]),
+ page_address(spages[j]),
+ page_len[j]))
break;
}
} else
/* Write at 'sector' for 'sectors'*/
if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
- unsigned vcnt = r1_bio->behind_page_count;
- struct bio_vec *vec = r1_bio->behind_bvecs;
-
- while (!vec->bv_page) {
- vec++;
- vcnt--;
- }
-
- wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
- memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
-
- wbio->bi_vcnt = vcnt;
+ wbio = bio_clone_fast(r1_bio->behind_master_bio,
+ GFP_NOIO,
+ mddev->bio_set);
+ /* We really need a _all clone */
+ wbio->bi_iter = (struct bvec_iter){ 0 };
} else {
wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO,
mddev->bio_set);
int sectors_handled = (r1_bio->sector + max_sectors
- mbio->bi_iter.bi_sector);
r1_bio->sectors = max_sectors;
- spin_lock_irq(&conf->device_lock);
- if (mbio->bi_phys_segments == 0)
- mbio->bi_phys_segments = 2;
- else
- mbio->bi_phys_segments++;
- spin_unlock_irq(&conf->device_lock);
+ bio_inc_remaining(mbio);
trace_block_bio_remap(bdev_get_queue(bio->bi_bdev),
bio, bio_dev, bio_sector);
generic_make_request(bio);
r1_bio = alloc_r1bio(mddev, mbio, sectors_handled);
set_bit(R1BIO_ReadError, &r1_bio->state);
+ inc_pending(conf, r1_bio->sector);
goto read_more;
} else {
for (i = 0; i < conf->raid_disks * 2; i++) {
struct md_rdev *rdev;
bio = r1_bio->bios[i];
- bio_reset(bio);
rdev = rcu_dereference(conf->mirrors[i].rdev);
if (rdev == NULL ||
atomic_inc(&rdev->nr_pending);
bio->bi_iter.bi_sector = sector_nr + rdev->data_offset;
bio->bi_bdev = rdev->bdev;
- bio->bi_private = r1_bio;
if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
}
}
for (i = 0 ; i < conf->raid_disks * 2; i++) {
+ struct resync_pages *rp;
+
bio = r1_bio->bios[i];
+ rp = get_resync_pages(bio);
if (bio->bi_end_io) {
- page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
- if (bio_add_page(bio, page, len, 0) == 0) {
- /* stop here */
- bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
- while (i > 0) {
- i--;
- bio = r1_bio->bios[i];
- if (bio->bi_end_io==NULL)
- continue;
- /* remove last page from this bio */
- bio->bi_vcnt--;
- bio->bi_iter.bi_size -= len;
- bio_clear_flag(bio, BIO_SEG_VALID);
- }
- goto bio_full;
- }
+ page = resync_fetch_page(rp, rp->idx++);
+
+ /*
+ * won't fail because the vec table is big
+ * enough to hold all these pages
+ */
+ bio_add_page(bio, page, len, 0);
}
}
nr_sectors += len>>9;
sector_nr += len>>9;
sync_blocks -= (len>>9);
- } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
- bio_full:
+ } while (get_resync_pages(r1_bio->bios[disk]->bi_private)->idx < RESYNC_PAGES);
+
r1_bio->sectors = nr_sectors;
if (mddev_is_clustered(mddev) &&
return ret;
}
md_set_array_sectors(mddev, newsize);
- set_capacity(mddev->gendisk, mddev->array_sectors);
- revalidate_disk(mddev->gendisk);
if (sectors > mddev->dev_sectors &&
mddev->recovery_cp > mddev->dev_sectors) {
mddev->recovery_cp = mddev->dev_sectors;