From: Shaohua Li Date: Tue, 13 Dec 2016 20:40:15 +0000 (-0800) Subject: Merge branch 'md-next' into md-linus X-Git-Tag: Ubuntu-5.0.0-8.9~5891^2 X-Git-Url: https://git.proxmox.com/?a=commitdiff_plain;h=20737738d397dfadbca1ea50dcc00d7259f500cf;p=mirror_ubuntu-disco-kernel.git Merge branch 'md-next' into md-linus --- 20737738d397dfadbca1ea50dcc00d7259f500cf diff --cc drivers/md/md.c index f975cd08923d,c15e2344e7c8..82821ee0d57f --- a/drivers/md/md.c +++ b/drivers/md/md.c @@@ -743,7 -765,12 +765,12 @@@ void md_super_write(struct mddev *mddev bio_add_page(bio, page, size, 0); bio->bi_private = rdev; bio->bi_end_io = super_written; - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA; + + if (test_bit(MD_FAILFAST_SUPPORTED, &mddev->flags) && + test_bit(FailFast, &rdev->flags) && + !test_bit(LastDev, &rdev->flags)) + ff = MD_FAILFAST; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH_FUA | ff); ++ bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_FUA | ff; atomic_inc(&mddev->pending_writes); submit_bio(bio); diff --cc drivers/md/raid5-cache.c index 8491edcfb5a6,6d1a150eacd6..d7bfb6fc8aef --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@@ -231,6 -551,64 +551,64 @@@ static void r5l_log_endio(struct bio *b if (log->need_cache_flush) md_wakeup_thread(log->rdev->mddev->thread); + + if (io->has_null_flush) { + struct bio *bi; + + WARN_ON(bio_list_empty(&io->flush_barriers)); + while ((bi = bio_list_pop(&io->flush_barriers)) != NULL) { + bio_endio(bi); + atomic_dec(&io->pending_stripe); + } + if (atomic_read(&io->pending_stripe) == 0) + __r5l_stripe_write_finished(io); + } + } + + static void r5l_do_submit_io(struct r5l_log *log, struct r5l_io_unit *io) + { + unsigned long flags; + + spin_lock_irqsave(&log->io_list_lock, flags); + __r5l_set_io_unit_state(io, IO_UNIT_IO_START); + spin_unlock_irqrestore(&log->io_list_lock, flags); + + if (io->has_flush) - bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FLUSH); ++ io->current_bio->bi_opf |= REQ_PREFLUSH; + if 
(io->has_fua) - bio_set_op_attrs(io->current_bio, REQ_OP_WRITE, WRITE_FUA); ++ io->current_bio->bi_opf |= REQ_FUA; + submit_bio(io->current_bio); + + if (!io->split_bio) + return; + + if (io->has_flush) - bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FLUSH); ++ io->split_bio->bi_opf |= REQ_PREFLUSH; + if (io->has_fua) - bio_set_op_attrs(io->split_bio, REQ_OP_WRITE, WRITE_FUA); ++ io->split_bio->bi_opf |= REQ_FUA; + submit_bio(io->split_bio); + } + + /* deferred io_unit will be dispatched here */ + static void r5l_submit_io_async(struct work_struct *work) + { + struct r5l_log *log = container_of(work, struct r5l_log, + deferred_io_work); + struct r5l_io_unit *io = NULL; + unsigned long flags; + + spin_lock_irqsave(&log->io_list_lock, flags); + if (!list_empty(&log->running_ios)) { + io = list_first_entry(&log->running_ios, struct r5l_io_unit, + log_sibling); + if (!io->io_deferred) + io = NULL; + else + io->io_deferred = 0; + } + spin_unlock_irqrestore(&log->io_list_lock, flags); + if (io) + r5l_do_submit_io(log, io); } static void r5l_submit_current_io(struct r5l_log *log) @@@ -892,82 -1516,139 +1516,139 @@@ static int r5l_recovery_read_meta_block return 0; } - static int r5l_recovery_flush_one_stripe(struct r5l_log *log, - struct r5l_recovery_ctx *ctx, - sector_t stripe_sect, - int *offset, sector_t *log_offset) + static void + r5l_recovery_create_empty_meta_block(struct r5l_log *log, + struct page *page, + sector_t pos, u64 seq) { - struct r5conf *conf = log->rdev->mddev->private; - struct stripe_head *sh; - struct r5l_payload_data_parity *payload; - int disk_index; + struct r5l_meta_block *mb; - sh = raid5_get_active_stripe(conf, stripe_sect, 0, 0, 0); - while (1) { - payload = page_address(ctx->meta_page) + *offset; + mb = page_address(page); + clear_page(mb); + mb->magic = cpu_to_le32(R5LOG_MAGIC); + mb->version = R5LOG_VERSION; + mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); + mb->seq = cpu_to_le64(seq); + mb->position = cpu_to_le64(pos); 
+ } - if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_DATA) { - raid5_compute_sector(conf, - le64_to_cpu(payload->location), 0, - &disk_index, sh); + static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, + u64 seq) + { + struct page *page; + struct r5l_meta_block *mb; - sync_page_io(log->rdev, *log_offset, PAGE_SIZE, - sh->dev[disk_index].page, REQ_OP_READ, 0, - false); - sh->dev[disk_index].log_checksum = - le32_to_cpu(payload->checksum[0]); - set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); - ctx->meta_total_blocks += BLOCK_SECTORS; - } else { - disk_index = sh->pd_idx; - sync_page_io(log->rdev, *log_offset, PAGE_SIZE, - sh->dev[disk_index].page, REQ_OP_READ, 0, - false); - sh->dev[disk_index].log_checksum = - le32_to_cpu(payload->checksum[0]); - set_bit(R5_Wantwrite, &sh->dev[disk_index].flags); - - if (sh->qd_idx >= 0) { - disk_index = sh->qd_idx; - sync_page_io(log->rdev, - r5l_ring_add(log, *log_offset, BLOCK_SECTORS), - PAGE_SIZE, sh->dev[disk_index].page, - REQ_OP_READ, 0, false); - sh->dev[disk_index].log_checksum = - le32_to_cpu(payload->checksum[1]); - set_bit(R5_Wantwrite, - &sh->dev[disk_index].flags); - } - ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; - } + page = alloc_page(GFP_KERNEL); + if (!page) + return -ENOMEM; + r5l_recovery_create_empty_meta_block(log, page, pos, seq); + mb = page_address(page); + mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, + mb, PAGE_SIZE)); + if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, - WRITE_FUA, false)) { ++ REQ_FUA, false)) { + __free_page(page); + return -EIO; + } + __free_page(page); + return 0; + } - *log_offset = r5l_ring_add(log, *log_offset, - le32_to_cpu(payload->size)); - *offset += sizeof(struct r5l_payload_data_parity) + - sizeof(__le32) * - (le32_to_cpu(payload->size) >> (PAGE_SHIFT - 9)); - if (le16_to_cpu(payload->header.type) == R5LOG_PAYLOAD_PARITY) - break; + /* + * r5l_recovery_load_data and r5l_recovery_load_parity uses 
flag R5_Wantwrite + * to mark valid (potentially not flushed) data in the journal. + * + * We already verified checksum in r5l_recovery_verify_data_checksum_for_mb, + * so there should not be any mismatch here. + */ + static void r5l_recovery_load_data(struct r5l_log *log, + struct stripe_head *sh, + struct r5l_recovery_ctx *ctx, + struct r5l_payload_data_parity *payload, + sector_t log_offset) + { + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + int dd_idx; + + raid5_compute_sector(conf, + le64_to_cpu(payload->location), 0, + &dd_idx, sh); + sync_page_io(log->rdev, log_offset, PAGE_SIZE, + sh->dev[dd_idx].page, REQ_OP_READ, 0, false); + sh->dev[dd_idx].log_checksum = + le32_to_cpu(payload->checksum[0]); + ctx->meta_total_blocks += BLOCK_SECTORS; + + set_bit(R5_Wantwrite, &sh->dev[dd_idx].flags); + set_bit(STRIPE_R5C_CACHING, &sh->state); + } + + static void r5l_recovery_load_parity(struct r5l_log *log, + struct stripe_head *sh, + struct r5l_recovery_ctx *ctx, + struct r5l_payload_data_parity *payload, + sector_t log_offset) + { + struct mddev *mddev = log->rdev->mddev; + struct r5conf *conf = mddev->private; + + ctx->meta_total_blocks += BLOCK_SECTORS * conf->max_degraded; + sync_page_io(log->rdev, log_offset, PAGE_SIZE, + sh->dev[sh->pd_idx].page, REQ_OP_READ, 0, false); + sh->dev[sh->pd_idx].log_checksum = + le32_to_cpu(payload->checksum[0]); + set_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags); + + if (sh->qd_idx >= 0) { + sync_page_io(log->rdev, + r5l_ring_add(log, log_offset, BLOCK_SECTORS), + PAGE_SIZE, sh->dev[sh->qd_idx].page, + REQ_OP_READ, 0, false); + sh->dev[sh->qd_idx].log_checksum = + le32_to_cpu(payload->checksum[1]); + set_bit(R5_Wantwrite, &sh->dev[sh->qd_idx].flags); } + clear_bit(STRIPE_R5C_CACHING, &sh->state); + } - for (disk_index = 0; disk_index < sh->disks; disk_index++) { - void *addr; - u32 checksum; + static void r5l_recovery_reset_stripe(struct stripe_head *sh) + { + int i; + + sh->state = 0; + 
sh->log_start = MaxSector; + for (i = sh->disks; i--; ) + sh->dev[i].flags = 0; + } + + static void + r5l_recovery_replay_one_stripe(struct r5conf *conf, + struct stripe_head *sh, + struct r5l_recovery_ctx *ctx) + { + struct md_rdev *rdev, *rrdev; + int disk_index; + int data_count = 0; + for (disk_index = 0; disk_index < sh->disks; disk_index++) { if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) continue; - addr = kmap_atomic(sh->dev[disk_index].page); - checksum = crc32c_le(log->uuid_checksum, addr, PAGE_SIZE); - kunmap_atomic(addr); - if (checksum != sh->dev[disk_index].log_checksum) - goto error; + if (disk_index == sh->qd_idx || disk_index == sh->pd_idx) + continue; + data_count++; } - for (disk_index = 0; disk_index < sh->disks; disk_index++) { - struct md_rdev *rdev, *rrdev; + /* + * stripes that only have parity must have been flushed + * before the crash that we are now recovering from, so + * there is nothing more to recovery. + */ + if (data_count == 0) + goto out; - if (!test_and_clear_bit(R5_Wantwrite, - &sh->dev[disk_index].flags)) + for (disk_index = 0; disk_index < sh->disks; disk_index++) { + if (!test_bit(R5_Wantwrite, &sh->dev[disk_index].flags)) continue; /* in case device is broken */ @@@ -1031,31 -1975,159 +1975,159 @@@ static int r5c_recovery_flush_log(struc ctx->seq++; ctx->pos = r5l_ring_add(log, ctx->pos, ctx->meta_total_blocks); } + + if (ret == -ENOMEM) { + r5c_recovery_drop_stripes(&ctx->cached_list, ctx); + return ret; + } + + /* replay data-parity stripes */ + r5c_recovery_replay_stripes(&ctx->cached_list, ctx); + + /* load data-only stripes to stripe cache */ + list_for_each_entry(sh, &ctx->cached_list, lru) { + WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); + r5c_recovery_load_one_stripe(log, sh); + ctx->data_only_stripes++; + } + + return 0; } - static int r5l_log_write_empty_meta_block(struct r5l_log *log, sector_t pos, - u64 seq) + /* + * we did a recovery. Now ctx.pos points to an invalid meta block. 
New + * log will start here. But we can't let the superblock point to the last valid + * meta block. The log might look like: + * | meta 1| meta 2| meta 3| + * meta 1 is valid, meta 2 is invalid. meta 3 could be valid. If the + * superblock points to meta 1, we write a new valid meta 2n. If a crash + * happens again, the new recovery will start from meta 1. Since meta 2n is + * valid now, recovery will think meta 3 is valid, which is wrong. + * The solution is to create a new meta in meta2 with its seq == meta + * 1's seq + 10000 and let the superblock point to meta2. The same recovery + * will not think meta 3 is a valid meta, because its seq doesn't match. + */ + + /* + * Before recovery, the log looks like the following + * + * --------------------------------------------- + * | valid log | invalid log | + * --------------------------------------------- + * ^ + * |- log->last_checkpoint + * |- log->last_cp_seq + * + * Now we scan through the log until we see an invalid entry + * + * --------------------------------------------- + * | valid log | invalid log | + * --------------------------------------------- + * ^ ^ + * |- log->last_checkpoint |- ctx->pos + * |- log->last_cp_seq |- ctx->seq + * + * From this point, we need to increase the seq number by 10000 to avoid + * confusing the next recovery. + * + * --------------------------------------------- + * | valid log | invalid log | + * --------------------------------------------- + * ^ ^ + * |- log->last_checkpoint |- ctx->pos+1 + * |- log->last_cp_seq |- ctx->seq+10001 + * + * However, it is not safe to start the state machine yet, because data only + * parities are not yet secured in RAID. To save these data only parities, we + * rewrite them from seq+10001.
+ * + * ----------------------------------------------------------------- + * | valid log | data only stripes | invalid log | + * ----------------------------------------------------------------- + * ^ ^ + * |- log->last_checkpoint |- ctx->pos+n + * |- log->last_cp_seq |- ctx->seq+10000+n + * + * If failure happens again during this process, the recovery can safe start + * again from log->last_checkpoint. + * + * Once data only stripes are rewritten to journal, we move log_tail + * + * ----------------------------------------------------------------- + * | old log | data only stripes | invalid log | + * ----------------------------------------------------------------- + * ^ ^ + * |- log->last_checkpoint |- ctx->pos+n + * |- log->last_cp_seq |- ctx->seq+10000+n + * + * Then we can safely start the state machine. If failure happens from this + * point on, the recovery will start from new log->last_checkpoint. + */ + static int + r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log, + struct r5l_recovery_ctx *ctx) { + struct stripe_head *sh, *next; + struct mddev *mddev = log->rdev->mddev; struct page *page; - struct r5l_meta_block *mb; - u32 crc; - page = alloc_page(GFP_KERNEL | __GFP_ZERO); - if (!page) + page = alloc_page(GFP_KERNEL); + if (!page) { + pr_err("md/raid:%s: cannot allocate memory to rewrite data only stripes\n", + mdname(mddev)); return -ENOMEM; - mb = page_address(page); - mb->magic = cpu_to_le32(R5LOG_MAGIC); - mb->version = R5LOG_VERSION; - mb->meta_size = cpu_to_le32(sizeof(struct r5l_meta_block)); - mb->seq = cpu_to_le64(seq); - mb->position = cpu_to_le64(pos); - crc = crc32c_le(log->uuid_checksum, mb, PAGE_SIZE); - mb->checksum = cpu_to_le32(crc); + } - if (!sync_page_io(log->rdev, pos, PAGE_SIZE, page, REQ_OP_WRITE, - REQ_FUA, false)) { - __free_page(page); - return -EIO; + list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) { + struct r5l_meta_block *mb; + int i; + int offset; + sector_t write_pos; + + 
WARN_ON(!test_bit(STRIPE_R5C_CACHING, &sh->state)); + r5l_recovery_create_empty_meta_block(log, page, + ctx->pos, ctx->seq); + mb = page_address(page); + offset = le32_to_cpu(mb->meta_size); + write_pos = r5l_ring_add(log, ctx->pos, BLOCK_SECTORS); + + for (i = sh->disks; i--; ) { + struct r5dev *dev = &sh->dev[i]; + struct r5l_payload_data_parity *payload; + void *addr; + + if (test_bit(R5_InJournal, &dev->flags)) { + payload = (void *)mb + offset; + payload->header.type = cpu_to_le16( + R5LOG_PAYLOAD_DATA); + payload->size = BLOCK_SECTORS; + payload->location = cpu_to_le64( + raid5_compute_blocknr(sh, i, 0)); + addr = kmap_atomic(dev->page); + payload->checksum[0] = cpu_to_le32( + crc32c_le(log->uuid_checksum, addr, + PAGE_SIZE)); + kunmap_atomic(addr); + sync_page_io(log->rdev, write_pos, PAGE_SIZE, + dev->page, REQ_OP_WRITE, 0, false); + write_pos = r5l_ring_add(log, write_pos, + BLOCK_SECTORS); + offset += sizeof(__le32) + + sizeof(struct r5l_payload_data_parity); + + } + } + mb->meta_size = cpu_to_le32(offset); + mb->checksum = cpu_to_le32(crc32c_le(log->uuid_checksum, + mb, PAGE_SIZE)); + sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page, - REQ_OP_WRITE, WRITE_FUA, false); ++ REQ_OP_WRITE, REQ_FUA, false); + sh->log_start = ctx->pos; + ctx->pos = write_pos; + ctx->seq += 1; + + list_del_init(&sh->lru); + raid5_release_stripe(sh); } __free_page(page); return 0;