if (log->need_cache_flush)
md_wakeup_thread(log->rdev->mddev->thread);
- bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FLUSH);
+
+ if (io->has_null_flush) {
+ struct bio *bi;
+
+ WARN_ON(bio_list_empty(&io->flush_barriers));
+ while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) {
+ bio_endio(bi);
+ atomic_dec(&io->pending_stripe);
+ }
+ if (atomic_read(&io->pending_stripe) == 0)
+ __r5l_stripe_write_finished(io);
+ }
+ }
+ /*
+ * Mark @io as started and submit its bio(s).
+ *
+ * The io_unit state is switched to IO_UNIT_IO_START while holding
+ * log->io_list_lock. io->current_bio is always submitted; io->split_bio is
+ * submitted only when it is non-NULL. Before each submit_bio() call,
+ * REQ_PREFLUSH is OR'ed into the bio's bi_opf when io->has_flush is set and
+ * REQ_FUA when io->has_fua is set, so both bios carry the same flags.
+ */
+ static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io)
+ {
+ unsigned long flags;
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ __r5l_set_io_unit_state(io, IO_UNIT_IO_START);
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+
+ if (io->has_flush)
- bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FUA);
++ io->current_bio->bi_opf |= REQ_PREFLUSH;
+ if (io->has_fua)
- bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FLUSH);
++ io->current_bio->bi_opf |= REQ_FUA;
+ submit_bio(io->current_bio);
+
+ if (!io->split_bio)
+ return;
+
+ if (io->has_flush)
- bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FUA);
++ io->split_bio->bi_opf |= REQ_PREFLUSH;
+ if (io->has_fua)
++ io->split_bio->bi_opf |= REQ_FUA;
+ submit_bio(io->split_bio);
+ }
+
+ /*
+ * deferred io_unit will be dispatched here
+ *
+ * @work is expected to be the deferred_io_work member embedded in a
+ * struct r5l_log; the log is recovered from it with container_of().
+ *
+ * Only the first entry of log->running_ios is examined, under
+ * log->io_list_lock. If its io_deferred flag is set, the flag is cleared
+ * and, after the lock has been released, the io_unit is submitted through
+ * r5l_do_submit_io(). If the list is empty or the first entry is not
+ * deferred, nothing is submitted.
+ */
+ static void r5l_submit_io_async(struct work_struct *work)
+ {
+ struct r5l_log *log = container_of(work, struct r5l_log,
+ deferred_io_work);
+ struct r5l_io_unit *io = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&log->io_list_lock, flags);
+ if (!list_empty(&log->running_ios)) {
+ io = list_first_entry(&log->running_ios, struct r5l_io_unit,
+ log_sibling);
+ if (!io->io_deferred)
+ io = NULL;
+ else
+ io->io_deferred = 0;
+ }
+ spin_unlock_irqrestore(&log->io_list_lock, flags);
+ if (io)
+ r5l_do_submit_io(log, io);
}
static void r5l_submit_current_io(struct r5l_log *log)
return 0;
}
- static int r5l_recovery_flush_one_stripe(struct r5l_log *log,
- struct r5l_recovery_ctx *ctx,
- sector_t stripe_sect,
- int *offset, sector_t *log_offset)
+ static void
+ r5l_recovery_create_empty_meta_block(struct r5l_log *log,
+ struct page *page,
+ sector_t pos, u64 seq)
{
- struct r5conf *conf = log->rdev->mddev->private;
- struct stripe_head *sh;
- struct r5l_payload_data_parity *payload;
- int disk_index;
+ struct r5l_meta_block *mb;
- sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0);
- while (1) {
- payload = page_address(ctx->meta_page) + *offset;
+ mb = page_address(page);
+ clear_page(mb);
+ mb->magic = cpu_to_le32(R5LOG_MAGIC);
+ mb->version = R5LOG_VERSION;
+ mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
+ mb->seq = cpu_to_le64(seq);
+ mb->position = cpu_to_le64(pos);
+ }
- if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) {
- raid5_compute_sector(conf,
- le64_to_cpu(payload->location), 0,
- &disk_index, sh);
+ static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
+ u64 seq)
+ {
+ struct page *page;
+ struct r5l_meta_block *mb;
- sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
- sh->dev[disk_index].page, REQ_OP_READ, 0,
- false);
- sh->dev[disk_index].log_checksum =
- le32_to_cpu(payload->checksum[0]);
- set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
- ctx->meta_total_blocks += BLOCK_SECTORS;
- } else {
- disk_index = sh->pd_idx;
- sync_page_io(log->rdev, *log_offset, PAGE_SIZE,
- sh->dev[disk_index].page, REQ_OP_READ, 0,
- false);
- sh->dev[disk_index].log_checksum =
- le32_to_cpu(payload->checksum[0]);
- set_bit(R5_Wantwrite, &sh->dev[disk_index].flags);
-
- if (sh->qd_idx >= 0) {
- disk_index = sh->qd_idx;
- sync_page_io(log->rdev,
- r5l_ring_add(log, *log_offset, BLOCK_SECTORS),
- PAGE_SIZE, sh->dev[disk_index].page,
- REQ_OP_READ, 0, false);
- sh->dev[disk_index].log_checksum =
- le32_to_cpu(payload->checksum[1]);
- set_bit(R5_Wantwrite,
- &sh->dev[disk_index].flags);
- }
- ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
- }
+ page = alloc_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+ r5l_recovery_create_empty_meta_block(log, page, pos, seq);
+ mb = page_address(page);
+ mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
+ mb, PAGE_SIZE));
+ if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
- WRITE_FUA, false)) {
++ REQ_FUA, false)) {
+ __free_page(page);
+ return -EIO;
+ }
+ __free_page(page);
+ return 0;
+ }
- *log_offset = r5l_ring_add(log, *log_offset,
- le32_to_cpu(payload->size));
- *offset += sizeof(struct r5l_payload_data_parity) +
- sizeof(__le32) *
- (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9));
- if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY)
- break;
+ /*
+ * r5l_recovery_load_data and r5l_recovery_load_parity use flag R5_Wantwrite
+ * to mark valid (potentially not flushed) data in the journal.
+ *
+ * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb,
+ * so there should not be any mismatch here.
+ *
+ * r5l_recovery_load_data() reads one page from the log at @log_offset into
+ * the page of the device that payload->location maps to in @sh (the device
+ * index is obtained from raid5_compute_sector()). It stores
+ * payload->checksum[0] as that device's log_checksum, adds BLOCK_SECTORS to
+ * ctx->meta_total_blocks, and sets R5_Wantwrite on the device and
+ * STRIPE_R5C_CACHING on the stripe.
+ *
+ * NOTE(review): the return value of sync_page_io() is ignored here, so a
+ * failed read is not reported to the caller.
+ */
+ static void r5l_recovery_load_data(struct r5l_log *log,
+ struct stripe_head *sh,
+ struct r5l_recovery_ctx *ctx,
+ struct r5l_payload_data_parity *payload,
+ sector_t log_offset)
+ {
+ struct mddev *mddev = log->rdev->mddev;
+ struct r5conf *conf = mddev->private;
+ int dd_idx;
+
+ raid5_compute_sector(conf,
+ le64_to_cpu(payload->location), 0,
+ &dd_idx, sh);
+ sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+ sh->dev[dd_idx].page, REQ_OP_READ, 0, false);
+ sh->dev[dd_idx].log_checksum =
+ le32_to_cpu(payload->checksum[0]);
+ ctx->meta_total_blocks += BLOCK_SECTORS;
+
+ set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags);
+ set_bit(STRIPE_R5C_CACHING, &sh->state);
+ }
+ /*
+ * Read the parity page(s) described by @payload from the log into @sh.
+ *
+ * ctx->meta_total_blocks grows by BLOCK_SECTORS * conf->max_degraded.
+ * The page at sh->pd_idx is read from @log_offset and paired with
+ * payload->checksum[0]. When sh->qd_idx >= 0, the page at sh->qd_idx is
+ * read from the next block (log_offset advanced by BLOCK_SECTORS through
+ * r5l_ring_add()) and paired with payload->checksum[1]. Every page loaded
+ * is flagged R5_Wantwrite, and STRIPE_R5C_CACHING is cleared on the stripe.
+ *
+ * NOTE(review): the return value of sync_page_io() is ignored here, so a
+ * failed read is not reported to the caller.
+ */
+ static void r5l_recovery_load_parity(struct r5l_log *log,
+ struct stripe_head *sh,
+ struct r5l_recovery_ctx *ctx,
+ struct r5l_payload_data_parity *payload,
+ sector_t log_offset)
+ {
+ struct mddev *mddev = log->rdev->mddev;
+ struct r5conf *conf = mddev->private;
+
+ ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded;
+ sync_page_io(log->rdev, log_offset, PAGE_SIZE,
+ sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false);
+ sh->dev[sh->pd_idx].log_checksum =
+ le32_to_cpu(payload->checksum[0]);
+ set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags);
+
+ if (sh->qd_idx >= 0) {
+ sync_page_io(log->rdev,
+ r5l_ring_add(log, log_offset, BLOCK_SECTORS),
+ PAGE_SIZE, sh->dev[sh->qd_idx].page,
+ REQ_OP_READ, 0, false);
+ sh->dev[sh->qd_idx].log_checksum =
+ le32_to_cpu(payload->checksum[1]);
+ set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags);
}
+ clear_bit(STRIPE_R5C_CACHING, &sh->state);
+ }
- for (disk_index = 0; disk_index < sh->disks; disk_index++) {
- void *addr;
- u32 checksum;
+ static void r5l_recovery_reset_stripe(struct stripe_head *sh)
+ {
+ int i;
+ /*
+ * Reset @sh: zero sh->state, set sh->log_start to MaxSector, and zero
+ * the flags of every device, walking the indices from sh->disks - 1
+ * down to 0.
+ */
+ sh->state = 0;
+ sh->log_start = MaxSector;
+ for (i = sh->disks; i--; )
+ sh->dev[i].flags = 0;
+ }
+
+ static void
+ r5l_recovery_replay_one_stripe(struct r5conf *conf,
+ struct stripe_head *sh,
+ struct r5l_recovery_ctx *ctx)
+ {
+ struct md_rdev *rdev, *rrdev;
+ int disk_index;
+ int data_count = 0;
+ for (disk_index = 0; disk_index < sh->disks; disk_index++) {
if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
continue;
- addr = kmap_atomic(sh->dev[disk_index].page);
- checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE);
- kunmap_atomic(addr);
- if (checksum != sh->dev[disk_index].log_checksum)
- goto error;
+ if (disk_index == sh->qd_idx || disk_index == sh->pd_idx)
+ continue;
+ data_count++;
}
- for (disk_index = 0; disk_index < sh->disks; disk_index++) {
- struct md_rdev *rdev, *rrdev;
+ /*
+ * stripes that only have parity must have been flushed
+ * before the crash that we are now recovering from, so
+ * there is nothing more to recover.
+ */
+ if (data_count == 0)
+ goto out;
- if (!test_and_clear_bit(R5_Wantwrite,
- &sh->dev[disk_index].flags))
+ for (disk_index = 0; disk_index < sh->disks; disk_index++) {
+ if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags))
continue;
/* in case device is broken */
ctx->seq++;
ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks);
}
+
+ if (ret == -ENOMEM) {
+ r5c_recovery_drop_stripes(&ctx->cached_list, ctx);
+ return ret;
+ }
+
+ /* replay data-parity stripes */
+ r5c_recovery_replay_stripes(&ctx->cached_list, ctx);
+
+ /* load data-only stripes to stripe cache */
+ list_for_each_entry(sh, &ctx->cached_list, lru) {
+ WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
+ r5c_recovery_load_one_stripe(log, sh);
+ ctx->data_only_stripes++;
+ }
+
+ return 0;
}
- static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos,
- u64 seq)
+ /*
+ * We did a recovery. Now ctx.pos points to an invalid meta block. New
+ * log will start here, but we can't let superblock point to last valid
+ * meta block. The log might look like:
+ * | meta 1| meta 2| meta 3|
+ * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If
+ * superblock points to meta 1, we write a new valid meta 2n. If a crash
+ * happens again, new recovery will start from meta 1. Since meta 2n is
+ * valid now, recovery will think meta 3 is valid, which is wrong.
+ * The solution is we create a new meta in meta2 with its seq == meta
+ * 1's seq + 10000 and let superblock point to meta2. The same recovery
+ * will not think meta 3 is a valid meta, because its seq doesn't match
+ */
+
+ /*
+ * Before recovery, the log looks like the following
+ *
+ * ---------------------------------------------
+ * | valid log | invalid log |
+ * ---------------------------------------------
+ * ^
+ * |- log->last_checkpoint
+ * |- log->last_cp_seq
+ *
+ * Now we scan through the log until we see invalid entry
+ *
+ * ---------------------------------------------
+ * | valid log | invalid log |
+ * ---------------------------------------------
+ * ^ ^
+ * |- log->last_checkpoint |- ctx->pos
+ * |- log->last_cp_seq |- ctx->seq
+ *
+ * From this point, we need to increase seq number by 10000 to avoid
+ * confusing next recovery.
+ *
+ * ---------------------------------------------
+ * | valid log | invalid log |
+ * ---------------------------------------------
+ * ^ ^
+ * |- log->last_checkpoint |- ctx->pos+1
+ * |- log->last_cp_seq |- ctx->seq+10001
+ *
+ * However, it is not safe to start the state machine yet, because data only
+ * parities are not yet secured in RAID. To save these data only parities, we
+ * rewrite them from seq+10001.
+ *
+ * -----------------------------------------------------------------
+ * | valid log | data only stripes | invalid log |
+ * -----------------------------------------------------------------
+ * ^ ^
+ * |- log->last_checkpoint |- ctx->pos+n
+ * |- log->last_cp_seq |- ctx->seq+10000+n
+ *
+ * If failure happens again during this process, the recovery can safely
+ * start again from log->last_checkpoint.
+ *
+ * Once data only stripes are rewritten to journal, we move log_tail
+ *
+ * -----------------------------------------------------------------
+ * | old log | data only stripes | invalid log |
+ * -----------------------------------------------------------------
+ * ^ ^
+ * |- log->last_checkpoint |- ctx->pos+n
+ * |- log->last_cp_seq |- ctx->seq+10000+n
+ *
+ * Then we can safely start the state machine. If failure happens from this
+ * point on, the recovery will start from new log->last_checkpoint.
+ */
+ static int
+ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
+ struct r5l_recovery_ctx *ctx)
{
+ struct stripe_head *sh, *next;
+ struct mddev *mddev = log->rdev->mddev;
struct page *page;
- struct r5l_meta_block *mb;
- u32 crc;
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page)
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n",
+ mdname(mddev));
return -ENOMEM;
- mb = page_address(page);
- mb->magic = cpu_to_le32(R5LOG_MAGIC);
- mb->version = R5LOG_VERSION;
- mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block));
- mb->seq = cpu_to_le64(seq);
- mb->position = cpu_to_le64(pos);
- crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE);
- mb->checksum = cpu_to_le32(crc);
+ }
- if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE,
- REQ_FUA, false)) {
- __free_page(page);
- return -EIO;
+ list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
+ struct r5l_meta_block *mb;
+ int i;
+ int offset;
+ sector_t write_pos;
+
+ WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state));
+ r5l_recovery_create_empty_meta_block(log, page,
+ ctx->pos, ctx->seq);
+ mb = page_address(page);
+ offset = le32_to_cpu(mb->meta_size);
+ write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS);
+
+ for (i = sh->disks; i--; ) {
+ struct r5dev *dev = &sh->dev[i];
+ struct r5l_payload_data_parity *payload;
+ void *addr;
+
+ if (test_bit(R5_InJournal, &dev->flags)) {
+ payload = (void *)mb + offset;
+ payload->header.type = cpu_to_le16(
+ R5LOG_PAYLOAD_DATA);
+ payload->size = BLOCK_SECTORS;
+ payload->location = cpu_to_le64(
+ raid5_compute_blocknr(sh, i, 0));
+ addr = kmap_atomic(dev->page);
+ payload->checksum[0] = cpu_to_le32(
+ crc32c_le(log->uuid_checksum, addr,
+ PAGE_SIZE));
+ kunmap_atomic(addr);
+ sync_page_io(log->rdev, write_pos, PAGE_SIZE,
+ dev->page, REQ_OP_WRITE, 0, false);
+ write_pos = r5l_ring_add(log, write_pos,
+ BLOCK_SECTORS);
+ offset += sizeof(__le32) +
+ sizeof(struct r5l_payload_data_parity);
+
+ }
+ }
+ mb->meta_size = cpu_to_le32(offset);
+ mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum,
+ mb, PAGE_SIZE));
+ sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
- REQ_OP_WRITE, WRITE_FUA, false);
++ REQ_OP_WRITE, REQ_FUA, false);
+ sh->log_start = ctx->pos;
+ ctx->pos = write_pos;
+ ctx->seq += 1;
+
+ list_del_init(&sh->lru);
+ raid5_release_stripe(sh);
}
__free_page(page);
return 0;