]> git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/commitdiff
Merge branch 'for-3.10/core' of git://git.kernel.dk/linux-block
authorLinus Torvalds <torvalds@linux-foundation.org>
Wed, 8 May 2013 17:13:35 +0000 (10:13 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 8 May 2013 17:13:35 +0000 (10:13 -0700)
Pull block core updates from Jens Axboe:

 - Major bit is Kents prep work for immutable bio vecs.

 - Stable candidate fix for a scheduling-while-atomic in the queue
   bypass operation.

 - Fix for the hang on exceeded rq->datalen 32-bit unsigned when merging
   discard bios.

 - Tejuns changes to convert the writeback thread pool to the generic
   workqueue mechanism.

 - Runtime PM framework, SCSI patches exists on top of these in James'
   tree.

 - A few random fixes.

* 'for-3.10/core' of git://git.kernel.dk/linux-block: (40 commits)
  relay: move remove_buf_file inside relay_close_buf
  partitions/efi.c: replace useless kzalloc's by kmalloc's
  fs/block_dev.c: fix iov_shorten() criteria in blkdev_aio_read()
  block: fix max discard sectors limit
  blkcg: fix "scheduling while atomic" in blk_queue_bypass_start
  Documentation: cfq-iosched: update documentation help for cfq tunables
  writeback: expose the bdi_wq workqueue
  writeback: replace custom worker pool implementation with unbound workqueue
  writeback: remove unused bdi_pending_list
  aoe: Fix unitialized var usage
  bio-integrity: Add explicit field for owner of bip_buf
  block: Add an explicit bio flag for bios that own their bvec
  block: Add bio_alloc_pages()
  block: Convert some code to bio_for_each_segment_all()
  block: Add bio_for_each_segment_all()
  bounce: Refactor __blk_queue_bounce to not use bi_io_vec
  raid1: use bio_copy_data()
  pktcdvd: Use bio_reset() in disabled code to kill bi_idx usage
  pktcdvd: use bio_copy_data()
  block: Add bio_copy_data()
  ...

50 files changed:
Documentation/block/cfq-iosched.txt
block/blk-cgroup.c
block/blk-core.c
block/cfq-iosched.c
block/deadline-iosched.c
block/elevator.c
block/partitions/efi.c
drivers/block/aoe/aoecmd.c
drivers/block/brd.c
drivers/block/floppy.c
drivers/block/pktcdvd.c
drivers/block/rbd.c
drivers/md/dm-crypt.c
drivers/md/dm-raid1.c
drivers/md/dm-stripe.c
drivers/md/dm-verity.c
drivers/md/faulty.c
drivers/md/linear.c
drivers/md/md.c
drivers/md/raid0.c
drivers/md/raid1.c
drivers/md/raid10.c
drivers/md/raid5.c
drivers/message/fusion/mptsas.c
drivers/s390/block/dcssblk.c
drivers/scsi/libsas/sas_expander.c
drivers/scsi/mpt2sas/mpt2sas_transport.c
fs/bio-integrity.c
fs/bio.c
fs/block_dev.c
fs/btrfs/extent_io.c
fs/btrfs/volumes.c
fs/buffer.c
fs/direct-io.c
fs/exofs/ore.c
fs/exofs/ore_raid.c
fs/fs-writeback.c
fs/gfs2/lops.c
fs/jfs/jfs_logmgr.c
fs/logfs/dev_bdev.c
include/linux/backing-dev.h
include/linux/bio.h
include/linux/blk_types.h
include/linux/blkdev.h
include/trace/events/block.h
include/trace/events/writeback.h
kernel/relay.c
mm/backing-dev.c
mm/bounce.c
mm/page_io.c

index a5eb7d19a65d241650e26a2b4e5a303437b46878..9887f0414c16642d204296d9c0b8abdc8096a5da 100644 (file)
@@ -5,7 +5,7 @@ The main aim of CFQ scheduler is to provide a fair allocation of the disk
 I/O bandwidth for all the processes which requests an I/O operation.
 
 CFQ maintains the per process queue for the processes which request I/O
-operation(syncronous requests). In case of asynchronous requests, all the
+operation(synchronous requests). In case of asynchronous requests, all the
 requests from all the processes are batched together according to their
 process's I/O priority.
 
@@ -66,6 +66,47 @@ This parameter is used to set the timeout of synchronous requests. Default
 value of this is 124ms. In case to favor synchronous requests over asynchronous
 one, this value should be decreased relative to fifo_expire_async.
 
+group_idle
+-----------
+This parameter forces idling at the CFQ group level instead of CFQ
+queue level. This was introduced after after a bottleneck was observed
+in higher end storage due to idle on sequential queue and allow dispatch
+from a single queue. The idea with this parameter is that it can be run with
+slice_idle=0 and group_idle=8, so that idling does not happen on individual
+queues in the group but happens overall on the group and thus still keeps the
+IO controller working.
+Not idling on individual queues in the group will dispatch requests from
+multiple queues in the group at the same time and achieve higher throughput
+on higher end storage.
+
+Default value for this parameter is 8ms.
+
+latency
+-------
+This parameter is used to enable/disable the latency mode of the CFQ
+scheduler. If latency mode (called low_latency) is enabled, CFQ tries
+to recompute the slice time for each process based on the target_latency set
+for the system. This favors fairness over throughput. Disabling low
+latency (setting it to 0) ignores target latency, allowing each process in the
+system to get a full time slice.
+
+By default low latency mode is enabled.
+
+target_latency
+--------------
+This parameter is used to calculate the time slice for a process if cfq's
+latency mode is enabled. It will ensure that sync requests have an estimated
+latency. But if sequential workload is higher(e.g. sequential read),
+then to meet the latency constraints, throughput may decrease because of less
+time for each process to issue I/O request before the cfq queue is switched.
+
+Though this can be overcome by disabling the latency_mode, it may increase
+the read latency for some applications. This parameter allows for changing
+target_latency through the sysfs interface which can provide the balanced
+throughput and read latency.
+
+Default value for target_latency is 300ms.
+
 slice_async
 -----------
 This parameter is same as of slice_sync but for asynchronous queue. The
@@ -98,8 +139,8 @@ in the device exceeds this parameter. This parameter is used for synchronous
 request.
 
 In case of storage with several disk, this setting can limit the parallel
-processing of request. Therefore, increasing the value can imporve the
-performace although this can cause the latency of some I/O to increase due
+processing of request. Therefore, increasing the value can improve the
+performance although this can cause the latency of some I/O to increase due
 to more number of requests.
 
 CFQ Group scheduling
index b2b9837f9dd3475be841d41ed52dd7715e936d4b..e8918ffaf96d4a0a2dacf75838b5d8a89e5e8ca3 100644 (file)
@@ -972,10 +972,10 @@ int blkcg_activate_policy(struct request_queue *q,
        if (!new_blkg)
                return -ENOMEM;
 
-       preloaded = !radix_tree_preload(GFP_KERNEL);
-
        blk_queue_bypass_start(q);
 
+       preloaded = !radix_tree_preload(GFP_KERNEL);
+
        /*
         * Make sure the root blkg exists and count the existing blkgs.  As
         * @q is bypassing at this point, blkg_lookup_create() can't be
index 7c288358a745ad2312e93080201e341cf2b69207..33c33bc99ddd5546e6ba30ce267cb436d0328c51 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/list_sort.h>
 #include <linux/delay.h>
 #include <linux/ratelimit.h>
+#include <linux/pm_runtime.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/block.h>
@@ -159,20 +160,10 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
        else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
                error = -EIO;
 
-       if (unlikely(nbytes > bio->bi_size)) {
-               printk(KERN_ERR "%s: want %u bytes done, %u left\n",
-                      __func__, nbytes, bio->bi_size);
-               nbytes = bio->bi_size;
-       }
-
        if (unlikely(rq->cmd_flags & REQ_QUIET))
                set_bit(BIO_QUIET, &bio->bi_flags);
 
-       bio->bi_size -= nbytes;
-       bio->bi_sector += (nbytes >> 9);
-
-       if (bio_integrity(bio))
-               bio_integrity_advance(bio, nbytes);
+       bio_advance(bio, nbytes);
 
        /* don't actually finish bio if it's part of flush sequence */
        if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
@@ -1264,6 +1255,16 @@ void part_round_stats(int cpu, struct hd_struct *part)
 }
 EXPORT_SYMBOL_GPL(part_round_stats);
 
+#ifdef CONFIG_PM_RUNTIME
+static void blk_pm_put_request(struct request *rq)
+{
+       if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending)
+               pm_runtime_mark_last_busy(rq->q->dev);
+}
+#else
+static inline void blk_pm_put_request(struct request *rq) {}
+#endif
+
 /*
  * queue lock must be held
  */
@@ -1274,6 +1275,8 @@ void __blk_put_request(struct request_queue *q, struct request *req)
        if (unlikely(--req->ref_count))
                return;
 
+       blk_pm_put_request(req);
+
        elv_completed_request(q, req);
 
        /* this is a bio leak */
@@ -1597,7 +1600,7 @@ static void handle_bad_sector(struct bio *bio)
        printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
                        bdevname(bio->bi_bdev, b),
                        bio->bi_rw,
-                       (unsigned long long)bio->bi_sector + bio_sectors(bio),
+                       (unsigned long long)bio_end_sector(bio),
                        (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
 
        set_bit(BIO_EOF, &bio->bi_flags);
@@ -2053,6 +2056,28 @@ static void blk_account_io_done(struct request *req)
        }
 }
 
+#ifdef CONFIG_PM_RUNTIME
+/*
+ * Don't process normal requests when queue is suspended
+ * or in the process of suspending/resuming
+ */
+static struct request *blk_pm_peek_request(struct request_queue *q,
+                                          struct request *rq)
+{
+       if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
+           (q->rpm_status != RPM_ACTIVE && !(rq->cmd_flags & REQ_PM))))
+               return NULL;
+       else
+               return rq;
+}
+#else
+static inline struct request *blk_pm_peek_request(struct request_queue *q,
+                                                 struct request *rq)
+{
+       return rq;
+}
+#endif
+
 /**
  * blk_peek_request - peek at the top of a request queue
  * @q: request queue to peek at
@@ -2075,6 +2100,11 @@ struct request *blk_peek_request(struct request_queue *q)
        int ret;
 
        while ((rq = __elv_next_request(q)) != NULL) {
+
+               rq = blk_pm_peek_request(q, rq);
+               if (!rq)
+                       break;
+
                if (!(rq->cmd_flags & REQ_STARTED)) {
                        /*
                         * This is the first time the device driver
@@ -2253,8 +2283,7 @@ EXPORT_SYMBOL(blk_fetch_request);
  **/
 bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 {
-       int total_bytes, bio_nbytes, next_idx = 0;
-       struct bio *bio;
+       int total_bytes;
 
        if (!req->bio)
                return false;
@@ -2300,56 +2329,21 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
 
        blk_account_io_completion(req, nr_bytes);
 
-       total_bytes = bio_nbytes = 0;
-       while ((bio = req->bio) != NULL) {
-               int nbytes;
+       total_bytes = 0;
+       while (req->bio) {
+               struct bio *bio = req->bio;
+               unsigned bio_bytes = min(bio->bi_size, nr_bytes);
 
-               if (nr_bytes >= bio->bi_size) {
+               if (bio_bytes == bio->bi_size)
                        req->bio = bio->bi_next;
-                       nbytes = bio->bi_size;
-                       req_bio_endio(req, bio, nbytes, error);
-                       next_idx = 0;
-                       bio_nbytes = 0;
-               } else {
-                       int idx = bio->bi_idx + next_idx;
 
-                       if (unlikely(idx >= bio->bi_vcnt)) {
-                               blk_dump_rq_flags(req, "__end_that");
-                               printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n",
-                                      __func__, idx, bio->bi_vcnt);
-                               break;
-                       }
+               req_bio_endio(req, bio, bio_bytes, error);
 
-                       nbytes = bio_iovec_idx(bio, idx)->bv_len;
-                       BIO_BUG_ON(nbytes > bio->bi_size);
+               total_bytes += bio_bytes;
+               nr_bytes -= bio_bytes;
 
-                       /*
-                        * not a complete bvec done
-                        */
-                       if (unlikely(nbytes > nr_bytes)) {
-                               bio_nbytes += nr_bytes;
-                               total_bytes += nr_bytes;
-                               break;
-                       }
-
-                       /*
-                        * advance to the next vector
-                        */
-                       next_idx++;
-                       bio_nbytes += nbytes;
-               }
-
-               total_bytes += nbytes;
-               nr_bytes -= nbytes;
-
-               bio = req->bio;
-               if (bio) {
-                       /*
-                        * end more in this run, or just return 'not-done'
-                        */
-                       if (unlikely(nr_bytes <= 0))
-                               break;
-               }
+               if (!nr_bytes)
+                       break;
        }
 
        /*
@@ -2365,16 +2359,6 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
                return false;
        }
 
-       /*
-        * if the request wasn't completed, update state
-        */
-       if (bio_nbytes) {
-               req_bio_endio(req, bio, bio_nbytes, error);
-               bio->bi_idx += next_idx;
-               bio_iovec(bio)->bv_offset += nr_bytes;
-               bio_iovec(bio)->bv_len -= nr_bytes;
-       }
-
        req->__data_len -= total_bytes;
        req->buffer = bio_data(req->bio);
 
@@ -3046,6 +3030,149 @@ void blk_finish_plug(struct blk_plug *plug)
 }
 EXPORT_SYMBOL(blk_finish_plug);
 
+#ifdef CONFIG_PM_RUNTIME
+/**
+ * blk_pm_runtime_init - Block layer runtime PM initialization routine
+ * @q: the queue of the device
+ * @dev: the device the queue belongs to
+ *
+ * Description:
+ *    Initialize runtime-PM-related fields for @q and start auto suspend for
+ *    @dev. Drivers that want to take advantage of request-based runtime PM
+ *    should call this function after @dev has been initialized, and its
+ *    request queue @q has been allocated, and runtime PM for it can not happen
+ *    yet(either due to disabled/forbidden or its usage_count > 0). In most
+ *    cases, driver should call this function before any I/O has taken place.
+ *
+ *    This function takes care of setting up using auto suspend for the device,
+ *    the autosuspend delay is set to -1 to make runtime suspend impossible
+ *    until an updated value is either set by user or by driver. Drivers do
+ *    not need to touch other autosuspend settings.
+ *
+ *    The block layer runtime PM is request based, so only works for drivers
+ *    that use request as their IO unit instead of those directly use bio's.
+ */
+void blk_pm_runtime_init(struct request_queue *q, struct device *dev)
+{
+       q->dev = dev;
+       q->rpm_status = RPM_ACTIVE;
+       pm_runtime_set_autosuspend_delay(q->dev, -1);
+       pm_runtime_use_autosuspend(q->dev);
+}
+EXPORT_SYMBOL(blk_pm_runtime_init);
+
+/**
+ * blk_pre_runtime_suspend - Pre runtime suspend check
+ * @q: the queue of the device
+ *
+ * Description:
+ *    This function will check if runtime suspend is allowed for the device
+ *    by examining if there are any requests pending in the queue. If there
+ *    are requests pending, the device can not be runtime suspended; otherwise,
+ *    the queue's status will be updated to SUSPENDING and the driver can
+ *    proceed to suspend the device.
+ *
+ *    For the not allowed case, we mark last busy for the device so that
+ *    runtime PM core will try to autosuspend it some time later.
+ *
+ *    This function should be called near the start of the device's
+ *    runtime_suspend callback.
+ *
+ * Return:
+ *    0                - OK to runtime suspend the device
+ *    -EBUSY   - Device should not be runtime suspended
+ */
+int blk_pre_runtime_suspend(struct request_queue *q)
+{
+       int ret = 0;
+
+       spin_lock_irq(q->queue_lock);
+       if (q->nr_pending) {
+               ret = -EBUSY;
+               pm_runtime_mark_last_busy(q->dev);
+       } else {
+               q->rpm_status = RPM_SUSPENDING;
+       }
+       spin_unlock_irq(q->queue_lock);
+       return ret;
+}
+EXPORT_SYMBOL(blk_pre_runtime_suspend);
+
+/**
+ * blk_post_runtime_suspend - Post runtime suspend processing
+ * @q: the queue of the device
+ * @err: return value of the device's runtime_suspend function
+ *
+ * Description:
+ *    Update the queue's runtime status according to the return value of the
+ *    device's runtime suspend function and mark last busy for the device so
+ *    that PM core will try to auto suspend the device at a later time.
+ *
+ *    This function should be called near the end of the device's
+ *    runtime_suspend callback.
+ */
+void blk_post_runtime_suspend(struct request_queue *q, int err)
+{
+       spin_lock_irq(q->queue_lock);
+       if (!err) {
+               q->rpm_status = RPM_SUSPENDED;
+       } else {
+               q->rpm_status = RPM_ACTIVE;
+               pm_runtime_mark_last_busy(q->dev);
+       }
+       spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(blk_post_runtime_suspend);
+
+/**
+ * blk_pre_runtime_resume - Pre runtime resume processing
+ * @q: the queue of the device
+ *
+ * Description:
+ *    Update the queue's runtime status to RESUMING in preparation for the
+ *    runtime resume of the device.
+ *
+ *    This function should be called near the start of the device's
+ *    runtime_resume callback.
+ */
+void blk_pre_runtime_resume(struct request_queue *q)
+{
+       spin_lock_irq(q->queue_lock);
+       q->rpm_status = RPM_RESUMING;
+       spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(blk_pre_runtime_resume);
+
+/**
+ * blk_post_runtime_resume - Post runtime resume processing
+ * @q: the queue of the device
+ * @err: return value of the device's runtime_resume function
+ *
+ * Description:
+ *    Update the queue's runtime status according to the return value of the
+ *    device's runtime_resume function. If it is successfully resumed, process
+ *    the requests that are queued into the device's queue when it is resuming
+ *    and then mark last busy and initiate autosuspend for it.
+ *
+ *    This function should be called near the end of the device's
+ *    runtime_resume callback.
+ */
+void blk_post_runtime_resume(struct request_queue *q, int err)
+{
+       spin_lock_irq(q->queue_lock);
+       if (!err) {
+               q->rpm_status = RPM_ACTIVE;
+               __blk_run_queue(q);
+               pm_runtime_mark_last_busy(q->dev);
+               pm_runtime_autosuspend(q->dev);
+       } else {
+               q->rpm_status = RPM_SUSPENDED;
+       }
+       spin_unlock_irq(q->queue_lock);
+}
+EXPORT_SYMBOL(blk_post_runtime_resume);
+#endif
+
 int __init blk_dev_init(void)
 {
        BUILD_BUG_ON(__REQ_NR_BITS > 8 *
index 4f0ade74cfd04a1c48f22218a6c4369517efa88b..d5cd3131c57a36645bcf049e28ab9a46ab5ea559 100644 (file)
@@ -2270,11 +2270,8 @@ cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
                return NULL;
 
        cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
-       if (cfqq) {
-               sector_t sector = bio->bi_sector + bio_sectors(bio);
-
-               return elv_rb_find(&cfqq->sort_list, sector);
-       }
+       if (cfqq)
+               return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio));
 
        return NULL;
 }
index 90037b5eb17fca86bc2212241a6d1916cf83960c..ba19a3afab7929cf3ff0857683ccaef40a007a77 100644 (file)
@@ -132,7 +132,7 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio)
         * check for front merge
         */
        if (dd->front_merges) {
-               sector_t sector = bio->bi_sector + bio_sectors(bio);
+               sector_t sector = bio_end_sector(bio);
 
                __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
                if (__rq) {
index a0ffdd943c98aa5e0f39f102b98f1bc4cf9777f5..eba5b04c29b135bdc6b84ac80690d380d59dd8a5 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/blktrace_api.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>
+#include <linux/pm_runtime.h>
 
 #include <trace/events/block.h>
 
@@ -536,6 +537,27 @@ void elv_bio_merged(struct request_queue *q, struct request *rq,
                e->type->ops.elevator_bio_merged_fn(q, rq, bio);
 }
 
+#ifdef CONFIG_PM_RUNTIME
+static void blk_pm_requeue_request(struct request *rq)
+{
+       if (rq->q->dev && !(rq->cmd_flags & REQ_PM))
+               rq->q->nr_pending--;
+}
+
+static void blk_pm_add_request(struct request_queue *q, struct request *rq)
+{
+       if (q->dev && !(rq->cmd_flags & REQ_PM) && q->nr_pending++ == 0 &&
+           (q->rpm_status == RPM_SUSPENDED || q->rpm_status == RPM_SUSPENDING))
+               pm_request_resume(q->dev);
+}
+#else
+static inline void blk_pm_requeue_request(struct request *rq) {}
+static inline void blk_pm_add_request(struct request_queue *q,
+                                     struct request *rq)
+{
+}
+#endif
+
 void elv_requeue_request(struct request_queue *q, struct request *rq)
 {
        /*
@@ -550,6 +572,8 @@ void elv_requeue_request(struct request_queue *q, struct request *rq)
 
        rq->cmd_flags &= ~REQ_STARTED;
 
+       blk_pm_requeue_request(rq);
+
        __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE);
 }
 
@@ -572,6 +596,8 @@ void __elv_add_request(struct request_queue *q, struct request *rq, int where)
 {
        trace_block_rq_insert(q, rq);
 
+       blk_pm_add_request(q, rq);
+
        rq->q = q;
 
        if (rq->cmd_flags & REQ_SOFTBARRIER) {
index ff5804e2f1d2ffeb40d7d7446af9867e0b48bd03..c85fc895ecdbbeba25b624e1af909bfa61f3acc2 100644 (file)
@@ -238,7 +238,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
                 le32_to_cpu(gpt->sizeof_partition_entry);
        if (!count)
                return NULL;
-       pte = kzalloc(count, GFP_KERNEL);
+       pte = kmalloc(count, GFP_KERNEL);
        if (!pte)
                return NULL;
 
@@ -267,7 +267,7 @@ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
        gpt_header *gpt;
        unsigned ssz = bdev_logical_block_size(state->bdev);
 
-       gpt = kzalloc(ssz, GFP_KERNEL);
+       gpt = kmalloc(ssz, GFP_KERNEL);
        if (!gpt)
                return NULL;
 
index 92b6d7c51e39590b3780f17c88737009b7becbbf..5efed089a702d501f3b0de2e85997362555d0a09 100644 (file)
@@ -928,7 +928,7 @@ bufinit(struct buf *buf, struct request *rq, struct bio *bio)
        buf->resid = bio->bi_size;
        buf->sector = bio->bi_sector;
        bio_pageinc(bio);
-       buf->bv = bv = &bio->bi_io_vec[bio->bi_idx];
+       buf->bv = bv = bio_iovec(bio);
        buf->bv_resid = bv->bv_len;
        WARN_ON(buf->bv_resid == 0);
 }
index 531ceb31d0ff86a4aec90301ea508254ea443457..f1a29f8e9d33dbe45474172a24f952f32ce4ba47 100644 (file)
@@ -334,8 +334,7 @@ static void brd_make_request(struct request_queue *q, struct bio *bio)
        int err = -EIO;
 
        sector = bio->bi_sector;
-       if (sector + (bio->bi_size >> SECTOR_SHIFT) >
-                                               get_capacity(bdev->bd_disk))
+       if (bio_end_sector(bio) > get_capacity(bdev->bd_disk))
                goto out;
 
        if (unlikely(bio->bi_rw & REQ_DISCARD)) {
index c49e85608101e6fb97b90b7a1b014fb0cbbd2ac4..04ceb7e2fadd6ca075d20ecd844c39bf1da07ff3 100644 (file)
@@ -3775,7 +3775,6 @@ static int __floppy_read_block_0(struct block_device *bdev)
        bio_vec.bv_len = size;
        bio_vec.bv_offset = 0;
        bio.bi_vcnt = 1;
-       bio.bi_idx = 0;
        bio.bi_size = size;
        bio.bi_bdev = bdev;
        bio.bi_sector = 0;
index 9f2d348f7115424e3bda72d1f8ce61418f4f03d1..3c08983e600a0a15e1380e9de1e6f50714fe3976 100644 (file)
@@ -901,7 +901,7 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
                        pd->iosched.successive_reads += bio->bi_size >> 10;
                else {
                        pd->iosched.successive_reads = 0;
-                       pd->iosched.last_write = bio->bi_sector + bio_sectors(bio);
+                       pd->iosched.last_write = bio_end_sector(bio);
                }
                if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) {
                        if (pd->read_speed == pd->write_speed) {
@@ -947,31 +947,6 @@ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_que
        }
 }
 
-/*
- * Copy CD_FRAMESIZE bytes from src_bio into a destination page
- */
-static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, struct page *dst_page, int dst_offs)
-{
-       unsigned int copy_size = CD_FRAMESIZE;
-
-       while (copy_size > 0) {
-               struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg);
-               void *vfrom = kmap_atomic(src_bvl->bv_page) +
-                       src_bvl->bv_offset + offs;
-               void *vto = page_address(dst_page) + dst_offs;
-               int len = min_t(int, copy_size, src_bvl->bv_len - offs);
-
-               BUG_ON(len < 0);
-               memcpy(vto, vfrom, len);
-               kunmap_atomic(vfrom);
-
-               seg++;
-               offs = 0;
-               dst_offs += len;
-               copy_size -= len;
-       }
-}
-
 /*
  * Copy all data for this packet to pkt->pages[], so that
  * a) The number of required segments for the write bio is minimized, which
@@ -1181,16 +1156,15 @@ static int pkt_start_recovery(struct packet_data *pkt)
        new_sector = new_block * (CD_FRAMESIZE >> 9);
        pkt->sector = new_sector;
 
+       bio_reset(pkt->bio);
+       pkt->bio->bi_bdev = pd->bdev;
+       pkt->bio->bi_rw = REQ_WRITE;
        pkt->bio->bi_sector = new_sector;
-       pkt->bio->bi_next = NULL;
-       pkt->bio->bi_flags = 1 << BIO_UPTODATE;
-       pkt->bio->bi_idx = 0;
+       pkt->bio->bi_size = pkt->frames * CD_FRAMESIZE;
+       pkt->bio->bi_vcnt = pkt->frames;
 
-       BUG_ON(pkt->bio->bi_rw != REQ_WRITE);
-       BUG_ON(pkt->bio->bi_vcnt != pkt->frames);
-       BUG_ON(pkt->bio->bi_size != pkt->frames * CD_FRAMESIZE);
-       BUG_ON(pkt->bio->bi_end_io != pkt_end_io_packet_write);
-       BUG_ON(pkt->bio->bi_private != pkt);
+       pkt->bio->bi_end_io = pkt_end_io_packet_write;
+       pkt->bio->bi_private = pkt;
 
        drop_super(sb);
        return 1;
@@ -1325,55 +1299,35 @@ try_next_bio:
  */
 static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
 {
-       struct bio *bio;
        int f;
-       int frames_write;
        struct bio_vec *bvec = pkt->w_bio->bi_io_vec;
 
+       bio_reset(pkt->w_bio);
+       pkt->w_bio->bi_sector = pkt->sector;
+       pkt->w_bio->bi_bdev = pd->bdev;
+       pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
+       pkt->w_bio->bi_private = pkt;
+
+       /* XXX: locking? */
        for (f = 0; f < pkt->frames; f++) {
                bvec[f].bv_page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE];
                bvec[f].bv_offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
+               if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
+                       BUG();
        }
+       VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt);
 
        /*
         * Fill-in bvec with data from orig_bios.
         */
-       frames_write = 0;
        spin_lock(&pkt->lock);
-       bio_list_for_each(bio, &pkt->orig_bios) {
-               int segment = bio->bi_idx;
-               int src_offs = 0;
-               int first_frame = (bio->bi_sector - pkt->sector) / (CD_FRAMESIZE >> 9);
-               int num_frames = bio->bi_size / CD_FRAMESIZE;
-               BUG_ON(first_frame < 0);
-               BUG_ON(first_frame + num_frames > pkt->frames);
-               for (f = first_frame; f < first_frame + num_frames; f++) {
-                       struct bio_vec *src_bvl = bio_iovec_idx(bio, segment);
-
-                       while (src_offs >= src_bvl->bv_len) {
-                               src_offs -= src_bvl->bv_len;
-                               segment++;
-                               BUG_ON(segment >= bio->bi_vcnt);
-                               src_bvl = bio_iovec_idx(bio, segment);
-                       }
+       bio_copy_data(pkt->w_bio, pkt->orig_bios.head);
 
-                       if (src_bvl->bv_len - src_offs >= CD_FRAMESIZE) {
-                               bvec[f].bv_page = src_bvl->bv_page;
-                               bvec[f].bv_offset = src_bvl->bv_offset + src_offs;
-                       } else {
-                               pkt_copy_bio_data(bio, segment, src_offs,
-                                                 bvec[f].bv_page, bvec[f].bv_offset);
-                       }
-                       src_offs += CD_FRAMESIZE;
-                       frames_write++;
-               }
-       }
        pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
        spin_unlock(&pkt->lock);
 
        VPRINTK("pkt_start_write: Writing %d frames for zone %llx\n",
-               frames_write, (unsigned long long)pkt->sector);
-       BUG_ON(frames_write != pkt->write_size);
+               pkt->write_size, (unsigned long long)pkt->sector);
 
        if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames)) {
                pkt_make_local_copy(pkt, bvec);
@@ -1383,16 +1337,6 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
        }
 
        /* Start the write request */
-       bio_reset(pkt->w_bio);
-       pkt->w_bio->bi_sector = pkt->sector;
-       pkt->w_bio->bi_bdev = pd->bdev;
-       pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
-       pkt->w_bio->bi_private = pkt;
-       for (f = 0; f < pkt->frames; f++)
-               if (!bio_add_page(pkt->w_bio, bvec[f].bv_page, CD_FRAMESIZE, bvec[f].bv_offset))
-                       BUG();
-       VPRINTK(DRIVER_NAME": vcnt=%d\n", pkt->w_bio->bi_vcnt);
-
        atomic_set(&pkt->io_wait, 1);
        pkt->w_bio->bi_rw = WRITE;
        pkt_queue_bio(pd, pkt->w_bio);
@@ -2431,7 +2375,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
                cloned_bio->bi_bdev = pd->bdev;
                cloned_bio->bi_private = psd;
                cloned_bio->bi_end_io = pkt_end_io_read_cloned;
-               pd->stats.secs_r += bio->bi_size >> 9;
+               pd->stats.secs_r += bio_sectors(bio);
                pkt_queue_bio(pd, cloned_bio);
                return;
        }
@@ -2452,7 +2396,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
        zone = ZONE(bio->bi_sector, pd);
        VPRINTK("pkt_make_request: start = %6llx stop = %6llx\n",
                (unsigned long long)bio->bi_sector,
-               (unsigned long long)(bio->bi_sector + bio_sectors(bio)));
+               (unsigned long long)bio_end_sector(bio));
 
        /* Check if we have to split the bio */
        {
@@ -2460,7 +2404,7 @@ static void pkt_make_request(struct request_queue *q, struct bio *bio)
                sector_t last_zone;
                int first_sectors;
 
-               last_zone = ZONE(bio->bi_sector + bio_sectors(bio) - 1, pd);
+               last_zone = ZONE(bio_end_sector(bio) - 1, pd);
                if (last_zone != zone) {
                        BUG_ON(last_zone != zone + pd->settings.size);
                        first_sectors = last_zone - bio->bi_sector;
index 22ffd5dcb1681da2b48b110e54e22f69457aa508..ca63104136e0db46d0248aa290c355483989ec2e 100644 (file)
@@ -1143,7 +1143,7 @@ static struct bio *bio_clone_range(struct bio *bio_src,
        /* Find first affected segment... */
 
        resid = offset;
-       __bio_for_each_segment(bv, bio_src, idx, 0) {
+       bio_for_each_segment(bv, bio_src, idx) {
                if (resid < bv->bv_len)
                        break;
                resid -= bv->bv_len;
index 13c15480d9401a79ea41fb44c603a007d97e2ea1..6d2d41ae9e322dbd53e787e5294f2d55551296eb 100644 (file)
@@ -858,8 +858,7 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
        unsigned int i;
        struct bio_vec *bv;
 
-       for (i = 0; i < clone->bi_vcnt; i++) {
-               bv = bio_iovec_idx(clone, i);
+       bio_for_each_segment_all(bv, clone, i) {
                BUG_ON(!bv->bv_page);
                mempool_free(bv->bv_page, cc->page_pool);
                bv->bv_page = NULL;
index d053098c6a917f1ac55fec457ab7c7f590ecb1eb..699b5be68d319263cce75e8d932deb0be25c7d00 100644 (file)
@@ -458,7 +458,7 @@ static void map_region(struct dm_io_region *io, struct mirror *m,
 {
        io->bdev = m->dev->bdev;
        io->sector = map_sector(m, bio);
-       io->count = bio->bi_size >> 9;
+       io->count = bio_sectors(bio);
 }
 
 static void hold_bio(struct mirror_set *ms, struct bio *bio)
index d8837d313f5434183439f057f463c99be100e0ea..ea5e878a30b93b1f974d449738fc46b55d5eaeba 100644 (file)
@@ -258,7 +258,7 @@ static int stripe_map_range(struct stripe_c *sc, struct bio *bio,
        sector_t begin, end;
 
        stripe_map_range_sector(sc, bio->bi_sector, target_stripe, &begin);
-       stripe_map_range_sector(sc, bio->bi_sector + bio_sectors(bio),
+       stripe_map_range_sector(sc, bio_end_sector(bio),
                                target_stripe, &end);
        if (begin < end) {
                bio->bi_bdev = sc->stripe[target_stripe].dev->bdev;
index a746f1d21c661bec7b809c9a688fddf44284581f..b948fd864d457e9ce857d1d04fd74bf8997e0aee 100644 (file)
@@ -501,7 +501,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
                return -EIO;
        }
 
-       if ((bio->bi_sector + bio_sectors(bio)) >>
+       if (bio_end_sector(bio) >>
            (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
                DMERR_LIMIT("io out of range");
                return -EIO;
@@ -519,7 +519,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
 
        bio->bi_end_io = verity_end_io;
        bio->bi_private = io;
-       io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
+       io->io_vec_size = bio_segments(bio);
        if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
                io->io_vec = io->io_vec_inline;
        else
index 5e7dc772f5deca223ea9142d073cc66785f54b27..3193aefe982b7b42badf4eba4adc36f89439d70c 100644 (file)
@@ -185,8 +185,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
                        return;
                }
 
-               if (check_sector(conf, bio->bi_sector, bio->bi_sector+(bio->bi_size>>9),
-                                WRITE))
+               if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), WRITE))
                        failit = 1;
                if (check_mode(conf, WritePersistent)) {
                        add_sector(conf, bio->bi_sector, WritePersistent);
@@ -196,8 +195,7 @@ static void make_request(struct mddev *mddev, struct bio *bio)
                        failit = 1;
        } else {
                /* read request */
-               if (check_sector(conf, bio->bi_sector, bio->bi_sector + (bio->bi_size>>9),
-                                READ))
+               if (check_sector(conf, bio->bi_sector, bio_end_sector(bio), READ))
                        failit = 1;
                if (check_mode(conf, ReadTransient))
                        failit = 1;
index 21014836bdbf2286eba4fd02effbc4efcd9db625..f03fabd2b37bacf34a231a0bb034a6d8f2826e68 100644 (file)
@@ -317,8 +317,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio)
                bio_io_error(bio);
                return;
        }
-       if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
-                    tmp_dev->end_sector)) {
+       if (unlikely(bio_end_sector(bio) > tmp_dev->end_sector)) {
                /* This bio crosses a device boundary, so we have to
                 * split it.
                 */
index 6330c727396cd6071f85e1a20bb4103d6aeeb322..681d1099a2d58936864b3b63610a31f38a908219 100644 (file)
@@ -197,21 +197,12 @@ void md_trim_bio(struct bio *bio, int offset, int size)
        if (offset == 0 && size == bio->bi_size)
                return;
 
-       bio->bi_sector += offset;
-       bio->bi_size = size;
-       offset <<= 9;
        clear_bit(BIO_SEG_VALID, &bio->bi_flags);
 
-       while (bio->bi_idx < bio->bi_vcnt &&
-              bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
-               /* remove this whole bio_vec */
-               offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
-               bio->bi_idx++;
-       }
-       if (bio->bi_idx < bio->bi_vcnt) {
-               bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
-               bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
-       }
+       bio_advance(bio, offset << 9);
+
+       bio->bi_size = size;
+
        /* avoid any complications with bi_idx being non-zero*/
        if (bio->bi_idx) {
                memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
index 0505452de8d6ee2b3533c7930d62df544636b0c3..fcf65e512cf51a02413ae4439e72ce7ab6675159 100644 (file)
@@ -502,11 +502,11 @@ static inline int is_io_in_chunk_boundary(struct mddev *mddev,
 {
        if (likely(is_power_of_2(chunk_sects))) {
                return chunk_sects >= ((bio->bi_sector & (chunk_sects-1))
-                                       + (bio->bi_size >> 9));
+                                       + bio_sectors(bio));
        } else{
                sector_t sector = bio->bi_sector;
                return chunk_sects >= (sector_div(sector, chunk_sects)
-                                               + (bio->bi_size >> 9));
+                                               + bio_sectors(bio));
        }
 }
 
@@ -527,8 +527,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
                sector_t sector = bio->bi_sector;
                struct bio_pair *bp;
                /* Sanity check -- queue functions should prevent this happening */
-               if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
-                   bio->bi_idx != 0)
+               if (bio_segments(bio) > 1)
                        goto bad_map;
                /* This is a one page bio that upper layers
                 * refuse to split for us, so we need to split it.
@@ -567,7 +566,7 @@ bad_map:
        printk("md/raid0:%s: make_request bug: can't convert block across chunks"
               " or bigger than %dk %llu %d\n",
               mdname(mddev), chunk_sects / 2,
-              (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
+              (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
 
        bio_io_error(bio);
        return;
index 851023e2ba5d5296824a46bdc12482056de648a1..55951182af73680d3b7f40d32cac1302062dbe74 100644 (file)
@@ -92,7 +92,6 @@ static void r1bio_pool_free(void *r1_bio, void *data)
 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
        struct pool_info *pi = data;
-       struct page *page;
        struct r1bio *r1_bio;
        struct bio *bio;
        int i, j;
@@ -122,14 +121,10 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
                j = 1;
        while(j--) {
                bio = r1_bio->bios[j];
-               for (i = 0; i < RESYNC_PAGES; i++) {
-                       page = alloc_page(gfp_flags);
-                       if (unlikely(!page))
-                               goto out_free_pages;
+               bio->bi_vcnt = RESYNC_PAGES;
 
-                       bio->bi_io_vec[i].bv_page = page;
-                       bio->bi_vcnt = i+1;
-               }
+               if (bio_alloc_pages(bio, gfp_flags))
+                       goto out_free_bio;
        }
        /* If not user-requests, copy the page pointers to all bios */
        if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
@@ -143,11 +138,6 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
 
        return r1_bio;
 
-out_free_pages:
-       for (j=0 ; j < pi->raid_disks; j++)
-               for (i=0; i < r1_bio->bios[j]->bi_vcnt ; i++)
-                       put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
-       j = -1;
 out_free_bio:
        while (++j < pi->raid_disks)
                bio_put(r1_bio->bios[j]);
@@ -267,7 +257,7 @@ static void raid_end_bio_io(struct r1bio *r1_bio)
                         (bio_data_dir(bio) == WRITE) ? "write" : "read",
                         (unsigned long long) bio->bi_sector,
                         (unsigned long long) bio->bi_sector +
-                        (bio->bi_size >> 9) - 1);
+                        bio_sectors(bio) - 1);
 
                call_bio_endio(r1_bio);
        }
@@ -458,7 +448,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
                                         " %llu-%llu\n",
                                         (unsigned long long) mbio->bi_sector,
                                         (unsigned long long) mbio->bi_sector +
-                                        (mbio->bi_size >> 9) - 1);
+                                        bio_sectors(mbio) - 1);
                                call_bio_endio(r1_bio);
                        }
                }
@@ -925,7 +915,7 @@ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio)
        if (unlikely(!bvecs))
                return;
 
-       bio_for_each_segment(bvec, bio, i) {
+       bio_for_each_segment_all(bvec, bio, i) {
                bvecs[i] = *bvec;
                bvecs[i].bv_page = alloc_page(GFP_NOIO);
                if (unlikely(!bvecs[i].bv_page))
@@ -1023,7 +1013,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
        md_write_start(mddev, bio); /* wait on superblock update early */
 
        if (bio_data_dir(bio) == WRITE &&
-           bio->bi_sector + bio->bi_size/512 > mddev->suspend_lo &&
+           bio_end_sector(bio) > mddev->suspend_lo &&
            bio->bi_sector < mddev->suspend_hi) {
                /* As the suspend_* range is controlled by
                 * userspace, we want an interruptible
@@ -1034,7 +1024,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
                        flush_signals(current);
                        prepare_to_wait(&conf->wait_barrier,
                                        &w, TASK_INTERRUPTIBLE);
-                       if (bio->bi_sector + bio->bi_size/512 <= mddev->suspend_lo ||
+                       if (bio_end_sector(bio) <= mddev->suspend_lo ||
                            bio->bi_sector >= mddev->suspend_hi)
                                break;
                        schedule();
@@ -1054,7 +1044,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 
        r1_bio->master_bio = bio;
-       r1_bio->sectors = bio->bi_size >> 9;
+       r1_bio->sectors = bio_sectors(bio);
        r1_bio->state = 0;
        r1_bio->mddev = mddev;
        r1_bio->sector = bio->bi_sector;
@@ -1132,7 +1122,7 @@ read_again:
                        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 
                        r1_bio->master_bio = bio;
-                       r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+                       r1_bio->sectors = bio_sectors(bio) - sectors_handled;
                        r1_bio->state = 0;
                        r1_bio->mddev = mddev;
                        r1_bio->sector = bio->bi_sector + sectors_handled;
@@ -1289,14 +1279,10 @@ read_again:
                        struct bio_vec *bvec;
                        int j;
 
-                       /* Yes, I really want the '__' version so that
-                        * we clear any unused pointer in the io_vec, rather
-                        * than leave them unchanged.  This is important
-                        * because when we come to free the pages, we won't
-                        * know the original bi_idx, so we just free
-                        * them all
+                       /*
+                        * We trimmed the bio, so _all is legit
                         */
-                       __bio_for_each_segment(bvec, mbio, j, 0)
+                       bio_for_each_segment_all(bvec, mbio, j)
                                bvec->bv_page = r1_bio->behind_bvecs[j].bv_page;
                        if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
                                atomic_inc(&r1_bio->behind_remaining);
@@ -1334,14 +1320,14 @@ read_again:
        /* Mustn't call r1_bio_write_done before this next test,
         * as it could result in the bio being freed.
         */
-       if (sectors_handled < (bio->bi_size >> 9)) {
+       if (sectors_handled < bio_sectors(bio)) {
                r1_bio_write_done(r1_bio);
                /* We need another r1_bio.  It has already been counted
                 * in bio->bi_phys_segments
                 */
                r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
                r1_bio->master_bio = bio;
-               r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+               r1_bio->sectors = bio_sectors(bio) - sectors_handled;
                r1_bio->state = 0;
                r1_bio->mddev = mddev;
                r1_bio->sector = bio->bi_sector + sectors_handled;
@@ -1867,7 +1853,7 @@ static int process_checks(struct r1bio *r1_bio)
                struct bio *sbio = r1_bio->bios[i];
                int size;
 
-               if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+               if (sbio->bi_end_io != end_sync_read)
                        continue;
 
                if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
@@ -1892,16 +1878,15 @@ static int process_checks(struct r1bio *r1_bio)
                        continue;
                }
                /* fixup the bio for reuse */
+               bio_reset(sbio);
                sbio->bi_vcnt = vcnt;
                sbio->bi_size = r1_bio->sectors << 9;
-               sbio->bi_idx = 0;
-               sbio->bi_phys_segments = 0;
-               sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               sbio->bi_flags |= 1 << BIO_UPTODATE;
-               sbio->bi_next = NULL;
                sbio->bi_sector = r1_bio->sector +
                        conf->mirrors[i].rdev->data_offset;
                sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+               sbio->bi_end_io = end_sync_read;
+               sbio->bi_private = r1_bio;
+
                size = sbio->bi_size;
                for (j = 0; j < vcnt ; j++) {
                        struct bio_vec *bi;
@@ -1912,10 +1897,9 @@ static int process_checks(struct r1bio *r1_bio)
                        else
                                bi->bv_len = size;
                        size -= PAGE_SIZE;
-                       memcpy(page_address(bi->bv_page),
-                              page_address(pbio->bi_io_vec[j].bv_page),
-                              PAGE_SIZE);
                }
+
+               bio_copy_data(sbio, pbio);
        }
        return 0;
 }
@@ -1952,7 +1936,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
                wbio->bi_rw = WRITE;
                wbio->bi_end_io = end_sync_write;
                atomic_inc(&r1_bio->remaining);
-               md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
+               md_sync_acct(conf->mirrors[i].rdev->bdev, bio_sectors(wbio));
 
                generic_make_request(wbio);
        }
@@ -2064,32 +2048,11 @@ static void fix_read_error(struct r1conf *conf, int read_disk,
        }
 }
 
-static void bi_complete(struct bio *bio, int error)
-{
-       complete((struct completion *)bio->bi_private);
-}
-
-static int submit_bio_wait(int rw, struct bio *bio)
-{
-       struct completion event;
-       rw |= REQ_SYNC;
-
-       init_completion(&event);
-       bio->bi_private = &event;
-       bio->bi_end_io = bi_complete;
-       submit_bio(rw, bio);
-       wait_for_completion(&event);
-
-       return test_bit(BIO_UPTODATE, &bio->bi_flags);
-}
-
 static int narrow_write_error(struct r1bio *r1_bio, int i)
 {
        struct mddev *mddev = r1_bio->mddev;
        struct r1conf *conf = mddev->private;
        struct md_rdev *rdev = conf->mirrors[i].rdev;
-       int vcnt, idx;
-       struct bio_vec *vec;
 
        /* bio has the data to be written to device 'i' where
         * we just recently had a write error.
@@ -2117,30 +2080,32 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
                   & ~(sector_t)(block_sectors - 1))
                - sector;
 
-       if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-               vcnt = r1_bio->behind_page_count;
-               vec = r1_bio->behind_bvecs;
-               idx = 0;
-               while (vec[idx].bv_page == NULL)
-                       idx++;
-       } else {
-               vcnt = r1_bio->master_bio->bi_vcnt;
-               vec = r1_bio->master_bio->bi_io_vec;
-               idx = r1_bio->master_bio->bi_idx;
-       }
        while (sect_to_write) {
                struct bio *wbio;
                if (sectors > sect_to_write)
                        sectors = sect_to_write;
                /* Write at 'sector' for 'sectors'*/
 
-               wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
-               memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
-               wbio->bi_sector = r1_bio->sector;
+               if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+                       unsigned vcnt = r1_bio->behind_page_count;
+                       struct bio_vec *vec = r1_bio->behind_bvecs;
+
+                       while (!vec->bv_page) {
+                               vec++;
+                               vcnt--;
+                       }
+
+                       wbio = bio_alloc_mddev(GFP_NOIO, vcnt, mddev);
+                       memcpy(wbio->bi_io_vec, vec, vcnt * sizeof(struct bio_vec));
+
+                       wbio->bi_vcnt = vcnt;
+               } else {
+                       wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev);
+               }
+
                wbio->bi_rw = WRITE;
-               wbio->bi_vcnt = vcnt;
+               wbio->bi_sector = r1_bio->sector;
                wbio->bi_size = r1_bio->sectors << 9;
-               wbio->bi_idx = idx;
 
                md_trim_bio(wbio, sector - r1_bio->sector, sectors);
                wbio->bi_sector += rdev->data_offset;
@@ -2289,8 +2254,7 @@ read_more:
                        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
 
                        r1_bio->master_bio = mbio;
-                       r1_bio->sectors = (mbio->bi_size >> 9)
-                                         - sectors_handled;
+                       r1_bio->sectors = bio_sectors(mbio) - sectors_handled;
                        r1_bio->state = 0;
                        set_bit(R1BIO_ReadError, &r1_bio->state);
                        r1_bio->mddev = mddev;
@@ -2464,18 +2428,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
        for (i = 0; i < conf->raid_disks * 2; i++) {
                struct md_rdev *rdev;
                bio = r1_bio->bios[i];
-
-               /* take from bio_init */
-               bio->bi_next = NULL;
-               bio->bi_flags &= ~(BIO_POOL_MASK-1);
-               bio->bi_flags |= 1 << BIO_UPTODATE;
-               bio->bi_rw = READ;
-               bio->bi_vcnt = 0;
-               bio->bi_idx = 0;
-               bio->bi_phys_segments = 0;
-               bio->bi_size = 0;
-               bio->bi_end_io = NULL;
-               bio->bi_private = NULL;
+               bio_reset(bio);
 
                rdev = rcu_dereference(conf->mirrors[i].rdev);
                if (rdev == NULL ||
index 018741ba93104d9ad432d7524a136cf7e69b2227..59d4daa5f4c7a32c245ef954f24650fe75084117 100644 (file)
@@ -1174,14 +1174,13 @@ static void make_request(struct mddev *mddev, struct bio * bio)
        /* If this request crosses a chunk boundary, we need to
         * split it.  This will only happen for 1 PAGE (or less) requests.
         */
-       if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
+       if (unlikely((bio->bi_sector & chunk_mask) + bio_sectors(bio)
                     > chunk_sects
                     && (conf->geo.near_copies < conf->geo.raid_disks
                         || conf->prev.near_copies < conf->prev.raid_disks))) {
                struct bio_pair *bp;
                /* Sanity check -- queue functions should prevent this happening */
-               if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
-                   bio->bi_idx != 0)
+               if (bio_segments(bio) > 1)
                        goto bad_map;
                /* This is a one page bio that upper layers
                 * refuse to split for us, so we need to split it.
@@ -1214,7 +1213,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
        bad_map:
                printk("md/raid10:%s: make_request bug: can't convert block across chunks"
                       " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
-                      (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
+                      (unsigned long long)bio->bi_sector, bio_sectors(bio) / 2);
 
                bio_io_error(bio);
                return;
@@ -1229,7 +1228,7 @@ static void make_request(struct mddev *mddev, struct bio * bio)
         */
        wait_barrier(conf);
 
-       sectors = bio->bi_size >> 9;
+       sectors = bio_sectors(bio);
        while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
            bio->bi_sector < conf->reshape_progress &&
            bio->bi_sector + sectors > conf->reshape_progress) {
@@ -1331,8 +1330,7 @@ read_again:
                        r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
                        r10_bio->master_bio = bio;
-                       r10_bio->sectors = ((bio->bi_size >> 9)
-                                           - sectors_handled);
+                       r10_bio->sectors = bio_sectors(bio) - sectors_handled;
                        r10_bio->state = 0;
                        r10_bio->mddev = mddev;
                        r10_bio->sector = bio->bi_sector + sectors_handled;
@@ -1574,7 +1572,7 @@ retry_write:
         * after checking if we need to go around again.
         */
 
-       if (sectors_handled < (bio->bi_size >> 9)) {
+       if (sectors_handled < bio_sectors(bio)) {
                one_write_done(r10_bio);
                /* We need another r10_bio.  It has already been counted
                 * in bio->bi_phys_segments.
@@ -1582,7 +1580,7 @@ retry_write:
                r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 
                r10_bio->master_bio = bio;
-               r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+               r10_bio->sectors = bio_sectors(bio) - sectors_handled;
 
                r10_bio->mddev = mddev;
                r10_bio->sector = bio->bi_sector + sectors_handled;
@@ -2084,13 +2082,10 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
                 * First we need to fixup bv_offset, bv_len and
                 * bi_vecs, as the read request might have corrupted these
                 */
+               bio_reset(tbio);
+
                tbio->bi_vcnt = vcnt;
                tbio->bi_size = r10_bio->sectors << 9;
-               tbio->bi_idx = 0;
-               tbio->bi_phys_segments = 0;
-               tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               tbio->bi_flags |= 1 << BIO_UPTODATE;
-               tbio->bi_next = NULL;
                tbio->bi_rw = WRITE;
                tbio->bi_private = r10_bio;
                tbio->bi_sector = r10_bio->devs[i].addr;
@@ -2108,7 +2103,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
                d = r10_bio->devs[i].devnum;
                atomic_inc(&conf->mirrors[d].rdev->nr_pending);
                atomic_inc(&r10_bio->remaining);
-               md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
+               md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(tbio));
 
                tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
                tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
@@ -2133,7 +2128,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
                d = r10_bio->devs[i].devnum;
                atomic_inc(&r10_bio->remaining);
                md_sync_acct(conf->mirrors[d].replacement->bdev,
-                            tbio->bi_size >> 9);
+                            bio_sectors(tbio));
                generic_make_request(tbio);
        }
 
@@ -2259,13 +2254,13 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
        wbio2 = r10_bio->devs[1].repl_bio;
        if (wbio->bi_end_io) {
                atomic_inc(&conf->mirrors[d].rdev->nr_pending);
-               md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
+               md_sync_acct(conf->mirrors[d].rdev->bdev, bio_sectors(wbio));
                generic_make_request(wbio);
        }
        if (wbio2 && wbio2->bi_end_io) {
                atomic_inc(&conf->mirrors[d].replacement->nr_pending);
                md_sync_acct(conf->mirrors[d].replacement->bdev,
-                            wbio2->bi_size >> 9);
+                            bio_sectors(wbio2));
                generic_make_request(wbio2);
        }
 }
@@ -2536,25 +2531,6 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
        }
 }
 
-static void bi_complete(struct bio *bio, int error)
-{
-       complete((struct completion *)bio->bi_private);
-}
-
-static int submit_bio_wait(int rw, struct bio *bio)
-{
-       struct completion event;
-       rw |= REQ_SYNC;
-
-       init_completion(&event);
-       bio->bi_private = &event;
-       bio->bi_end_io = bi_complete;
-       submit_bio(rw, bio);
-       wait_for_completion(&event);
-
-       return test_bit(BIO_UPTODATE, &bio->bi_flags);
-}
-
 static int narrow_write_error(struct r10bio *r10_bio, int i)
 {
        struct bio *bio = r10_bio->master_bio;
@@ -2695,8 +2671,7 @@ read_more:
                r10_bio = mempool_alloc(conf->r10bio_pool,
                                        GFP_NOIO);
                r10_bio->master_bio = mbio;
-               r10_bio->sectors = (mbio->bi_size >> 9)
-                       - sectors_handled;
+               r10_bio->sectors = bio_sectors(mbio) - sectors_handled;
                r10_bio->state = 0;
                set_bit(R10BIO_ReadError,
                        &r10_bio->state);
@@ -3133,6 +3108,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
                                        }
                                }
                                bio = r10_bio->devs[0].bio;
+                               bio_reset(bio);
                                bio->bi_next = biolist;
                                biolist = bio;
                                bio->bi_private = r10_bio;
@@ -3157,6 +3133,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
                                rdev = mirror->rdev;
                                if (!test_bit(In_sync, &rdev->flags)) {
                                        bio = r10_bio->devs[1].bio;
+                                       bio_reset(bio);
                                        bio->bi_next = biolist;
                                        biolist = bio;
                                        bio->bi_private = r10_bio;
@@ -3185,6 +3162,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
                                if (rdev == NULL || bio == NULL ||
                                    test_bit(Faulty, &rdev->flags))
                                        break;
+                               bio_reset(bio);
                                bio->bi_next = biolist;
                                biolist = bio;
                                bio->bi_private = r10_bio;
@@ -3283,7 +3261,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
                                r10_bio->devs[i].repl_bio->bi_end_io = NULL;
 
                        bio = r10_bio->devs[i].bio;
-                       bio->bi_end_io = NULL;
+                       bio_reset(bio);
                        clear_bit(BIO_UPTODATE, &bio->bi_flags);
                        if (conf->mirrors[d].rdev == NULL ||
                            test_bit(Faulty, &conf->mirrors[d].rdev->flags))
@@ -3320,6 +3298,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 
                        /* Need to set up for writing to the replacement */
                        bio = r10_bio->devs[i].repl_bio;
+                       bio_reset(bio);
                        clear_bit(BIO_UPTODATE, &bio->bi_flags);
 
                        sector = r10_bio->devs[i].addr;
@@ -3353,17 +3332,6 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
                }
        }
 
-       for (bio = biolist; bio ; bio=bio->bi_next) {
-
-               bio->bi_flags &= ~(BIO_POOL_MASK - 1);
-               if (bio->bi_end_io)
-                       bio->bi_flags |= 1 << BIO_UPTODATE;
-               bio->bi_vcnt = 0;
-               bio->bi_idx = 0;
-               bio->bi_phys_segments = 0;
-               bio->bi_size = 0;
-       }
-
        nr_sectors = 0;
        if (sector_nr + max_sync < max_sector)
                max_sector = sector_nr + max_sync;
@@ -4411,7 +4379,6 @@ read_more:
        read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
        read_bio->bi_flags |= 1 << BIO_UPTODATE;
        read_bio->bi_vcnt = 0;
-       read_bio->bi_idx = 0;
        read_bio->bi_size = 0;
        r10_bio->master_bio = read_bio;
        r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
@@ -4435,17 +4402,14 @@ read_more:
                }
                if (!rdev2 || test_bit(Faulty, &rdev2->flags))
                        continue;
+
+               bio_reset(b);
                b->bi_bdev = rdev2->bdev;
                b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
                b->bi_private = r10_bio;
                b->bi_end_io = end_reshape_write;
                b->bi_rw = WRITE;
-               b->bi_flags &= ~(BIO_POOL_MASK - 1);
-               b->bi_flags |= 1 << BIO_UPTODATE;
                b->bi_next = blist;
-               b->bi_vcnt = 0;
-               b->bi_idx = 0;
-               b->bi_size = 0;
                blist = b;
        }
 
index 4a7be455d6d86ceb6bda86a332b81d036db52dee..9359828ffe264d3313ee77de993ea4c5147f1205 100644 (file)
@@ -90,7 +90,7 @@ static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect)
  */
 static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector)
 {
-       int sectors = bio->bi_size >> 9;
+       int sectors = bio_sectors(bio);
        if (bio->bi_sector + sectors < sector + STRIPE_SECTORS)
                return bio->bi_next;
        else
@@ -569,14 +569,6 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                bi = &sh->dev[i].req;
                rbi = &sh->dev[i].rreq; /* For writing to replacement */
 
-               bi->bi_rw = rw;
-               rbi->bi_rw = rw;
-               if (rw & WRITE) {
-                       bi->bi_end_io = raid5_end_write_request;
-                       rbi->bi_end_io = raid5_end_write_request;
-               } else
-                       bi->bi_end_io = raid5_end_read_request;
-
                rcu_read_lock();
                rrdev = rcu_dereference(conf->disks[i].replacement);
                smp_mb(); /* Ensure that if rrdev is NULL, rdev won't be */
@@ -651,7 +643,14 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
                        set_bit(STRIPE_IO_STARTED, &sh->state);
 
+                       bio_reset(bi);
                        bi->bi_bdev = rdev->bdev;
+                       bi->bi_rw = rw;
+                       bi->bi_end_io = (rw & WRITE)
+                               ? raid5_end_write_request
+                               : raid5_end_read_request;
+                       bi->bi_private = sh;
+
                        pr_debug("%s: for %llu schedule op %ld on disc %d\n",
                                __func__, (unsigned long long)sh->sector,
                                bi->bi_rw, i);
@@ -665,12 +664,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                        if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags))
                                bi->bi_rw |= REQ_FLUSH;
 
-                       bi->bi_flags = 1 << BIO_UPTODATE;
-                       bi->bi_idx = 0;
                        bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        bi->bi_io_vec[0].bv_offset = 0;
                        bi->bi_size = STRIPE_SIZE;
-                       bi->bi_next = NULL;
                        if (rrdev)
                                set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
 
@@ -687,7 +683,13 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
 
                        set_bit(STRIPE_IO_STARTED, &sh->state);
 
+                       bio_reset(rbi);
                        rbi->bi_bdev = rrdev->bdev;
+                       rbi->bi_rw = rw;
+                       BUG_ON(!(rw & WRITE));
+                       rbi->bi_end_io = raid5_end_write_request;
+                       rbi->bi_private = sh;
+
                        pr_debug("%s: for %llu schedule op %ld on "
                                 "replacement disc %d\n",
                                __func__, (unsigned long long)sh->sector,
@@ -699,12 +701,9 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
                        else
                                rbi->bi_sector = (sh->sector
                                                  + rrdev->data_offset);
-                       rbi->bi_flags = 1 << BIO_UPTODATE;
-                       rbi->bi_idx = 0;
                        rbi->bi_io_vec[0].bv_len = STRIPE_SIZE;
                        rbi->bi_io_vec[0].bv_offset = 0;
                        rbi->bi_size = STRIPE_SIZE;
-                       rbi->bi_next = NULL;
                        if (conf->mddev->gendisk)
                                trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev),
                                                      rbi, disk_devt(conf->mddev->gendisk),
@@ -2402,11 +2401,11 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
        } else
                bip = &sh->dev[dd_idx].toread;
        while (*bip && (*bip)->bi_sector < bi->bi_sector) {
-               if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
+               if (bio_end_sector(*bip) > bi->bi_sector)
                        goto overlap;
                bip = & (*bip)->bi_next;
        }
-       if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
+       if (*bip && (*bip)->bi_sector < bio_end_sector(bi))
                goto overlap;
 
        BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
@@ -2422,8 +2421,8 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, in
                     sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
                             bi && bi->bi_sector <= sector;
                     bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
-                       if (bi->bi_sector + (bi->bi_size>>9) >= sector)
-                               sector = bi->bi_sector + (bi->bi_size>>9);
+                       if (bio_end_sector(bi) >= sector)
+                               sector = bio_end_sector(bi);
                }
                if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
                        set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
@@ -3849,7 +3848,7 @@ static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
 {
        sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
        unsigned int chunk_sectors = mddev->chunk_sectors;
-       unsigned int bio_sectors = bio->bi_size >> 9;
+       unsigned int bio_sectors = bio_sectors(bio);
 
        if (mddev->new_chunk_sectors < mddev->chunk_sectors)
                chunk_sectors = mddev->new_chunk_sectors;
@@ -3941,7 +3940,7 @@ static int bio_fits_rdev(struct bio *bi)
 {
        struct request_queue *q = bdev_get_queue(bi->bi_bdev);
 
-       if ((bi->bi_size>>9) > queue_max_sectors(q))
+       if (bio_sectors(bi) > queue_max_sectors(q))
                return 0;
        blk_recount_segments(q, bi);
        if (bi->bi_phys_segments > queue_max_segments(q))
@@ -3988,7 +3987,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
                                                    0,
                                                    &dd_idx, NULL);
 
-       end_sector = align_bi->bi_sector + (align_bi->bi_size >> 9);
+       end_sector = bio_end_sector(align_bi);
        rcu_read_lock();
        rdev = rcu_dereference(conf->disks[dd_idx].replacement);
        if (!rdev || test_bit(Faulty, &rdev->flags) ||
@@ -4011,7 +4010,7 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio)
                align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
 
                if (!bio_fits_rdev(align_bi) ||
-                   is_badblock(rdev, align_bi->bi_sector, align_bi->bi_size>>9,
+                   is_badblock(rdev, align_bi->bi_sector, bio_sectors(align_bi),
                                &first_bad, &bad_sectors)) {
                        /* too big in some way, or has a known bad block */
                        bio_put(align_bi);
@@ -4273,7 +4272,7 @@ static void make_request(struct mddev *mddev, struct bio * bi)
        }
 
        logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
-       last_sector = bi->bi_sector + (bi->bi_size>>9);
+       last_sector = bio_end_sector(bi);
        bi->bi_next = NULL;
        bi->bi_phys_segments = 1;       /* over-loaded to count active stripes */
 
@@ -4739,7 +4738,7 @@ static int  retry_aligned_read(struct r5conf *conf, struct bio *raid_bio)
        logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
        sector = raid5_compute_sector(conf, logical_sector,
                                      0, &dd_idx, NULL);
-       last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
+       last_sector = bio_end_sector(raid_bio);
 
        for (; logical_sector < last_sector;
             logical_sector += STRIPE_SECTORS,
index ffee6f781e30f6a25537bba1ed44f379aee137fd..dd239bdbfcb4a0877db2ab49aa3c27a81eec7dd1 100644 (file)
@@ -2235,10 +2235,10 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
        }
 
        /* do we need to support multiple segments? */
-       if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) {
+       if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) {
                printk(MYIOC_s_ERR_FMT "%s: multiple segments req %u %u, rsp %u %u\n",
-                   ioc->name, __func__, req->bio->bi_vcnt, blk_rq_bytes(req),
-                   rsp->bio->bi_vcnt, blk_rq_bytes(rsp));
+                   ioc->name, __func__, bio_segments(req->bio), blk_rq_bytes(req),
+                   bio_segments(rsp->bio), blk_rq_bytes(rsp));
                return -EINVAL;
        }
 
index 07ba32b07fb05d427cb18e62807ef40152577406..6eca019bcf30a50edfab1a80daf1b351d2320474 100644 (file)
@@ -822,8 +822,7 @@ dcssblk_make_request(struct request_queue *q, struct bio *bio)
        if ((bio->bi_sector & 7) != 0 || (bio->bi_size & 4095) != 0)
                /* Request is not page-aligned. */
                goto fail;
-       if (((bio->bi_size >> 9) + bio->bi_sector)
-                       > get_capacity(bio->bi_bdev->bd_disk)) {
+       if (bio_end_sector(bio) > get_capacity(bio->bi_bdev->bd_disk)) {
                /* Request beyond end of DCSS segment. */
                goto fail;
        }
index 55cbd018015997bba484bfe5ffdbacabcc7e43db..f42b0e15410f8a52e0a570664b04547d459e7aec 100644 (file)
@@ -2163,10 +2163,10 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
        }
 
        /* do we need to support multiple segments? */
-       if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) {
+       if (bio_segments(req->bio) > 1 || bio_segments(rsp->bio) > 1) {
                printk("%s: multiple segments req %u %u, rsp %u %u\n",
-                      __func__, req->bio->bi_vcnt, blk_rq_bytes(req),
-                      rsp->bio->bi_vcnt, blk_rq_bytes(rsp));
+                      __func__, bio_segments(req->bio), blk_rq_bytes(req),
+                      bio_segments(rsp->bio), blk_rq_bytes(rsp));
                return -EINVAL;
        }
 
index 8c2ffbe6af0f1dbd67e724507101eabe5e0ceca6..193e7ae90c3bd754b7a50a01db5fda73b2822502 100644 (file)
@@ -1939,7 +1939,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
        ioc->transport_cmds.status = MPT2_CMD_PENDING;
 
        /* Check if the request is split across multiple segments */
-       if (req->bio->bi_vcnt > 1) {
+       if (bio_segments(req->bio) > 1) {
                u32 offset = 0;
 
                /* Allocate memory and copy the request */
@@ -1971,7 +1971,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
 
        /* Check if the response needs to be populated across
         * multiple segments */
-       if (rsp->bio->bi_vcnt > 1) {
+       if (bio_segments(rsp->bio) > 1) {
                pci_addr_in = pci_alloc_consistent(ioc->pdev, blk_rq_bytes(rsp),
                    &pci_dma_in);
                if (!pci_addr_in) {
@@ -2038,7 +2038,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
        sgl_flags = (MPI2_SGE_FLAGS_SIMPLE_ELEMENT |
            MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_HOST_TO_IOC);
        sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT;
-       if (req->bio->bi_vcnt > 1) {
+       if (bio_segments(req->bio) > 1) {
                ioc->base_add_sg_single(psge, sgl_flags |
                    (blk_rq_bytes(req) - 4), pci_dma_out);
        } else {
@@ -2054,7 +2054,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
            MPI2_SGE_FLAGS_LAST_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER |
            MPI2_SGE_FLAGS_END_OF_LIST);
        sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT;
-       if (rsp->bio->bi_vcnt > 1) {
+       if (bio_segments(rsp->bio) > 1) {
                ioc->base_add_sg_single(psge, sgl_flags |
                    (blk_rq_bytes(rsp) + 4), pci_dma_in);
        } else {
@@ -2099,7 +2099,7 @@ _transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy,
                    le16_to_cpu(mpi_reply->ResponseDataLength);
                /* check if the resp needs to be copied from the allocated
                 * pci mem */
-               if (rsp->bio->bi_vcnt > 1) {
+               if (bio_segments(rsp->bio) > 1) {
                        u32 offset = 0;
                        u32 bytes_to_copy =
                            le16_to_cpu(mpi_reply->ResponseDataLength);
index a3f28f331b2bba7e6653da30dce92adba5d97140..8fb42916d8a29812e349fdbfa980ef1c34056ce4 100644 (file)
 #include <linux/workqueue.h>
 #include <linux/slab.h>
 
-struct integrity_slab {
-       struct kmem_cache *slab;
-       unsigned short nr_vecs;
-       char name[8];
-};
-
-#define IS(x) { .nr_vecs = x, .name = "bip-"__stringify(x) }
-struct integrity_slab bip_slab[BIOVEC_NR_POOLS] __read_mostly = {
-       IS(1), IS(4), IS(16), IS(64), IS(128), IS(BIO_MAX_PAGES),
-};
-#undef IS
+#define BIP_INLINE_VECS        4
 
+static struct kmem_cache *bip_slab;
 static struct workqueue_struct *kintegrityd_wq;
 
-static inline unsigned int vecs_to_idx(unsigned int nr)
-{
-       switch (nr) {
-       case 1:
-               return 0;
-       case 2 ... 4:
-               return 1;
-       case 5 ... 16:
-               return 2;
-       case 17 ... 64:
-               return 3;
-       case 65 ... 128:
-               return 4;
-       case 129 ... BIO_MAX_PAGES:
-               return 5;
-       default:
-               BUG();
-       }
-}
-
-static inline int use_bip_pool(unsigned int idx)
-{
-       if (idx == BIOVEC_MAX_IDX)
-               return 1;
-
-       return 0;
-}
-
 /**
  * bio_integrity_alloc - Allocate integrity payload and attach it to bio
  * @bio:       bio to attach integrity metadata to
@@ -84,37 +47,41 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
                                                  unsigned int nr_vecs)
 {
        struct bio_integrity_payload *bip;
-       unsigned int idx = vecs_to_idx(nr_vecs);
        struct bio_set *bs = bio->bi_pool;
-
-       if (!bs)
-               bs = fs_bio_set;
-
-       BUG_ON(bio == NULL);
-       bip = NULL;
-
-       /* Lower order allocations come straight from slab */
-       if (!use_bip_pool(idx))
-               bip = kmem_cache_alloc(bip_slab[idx].slab, gfp_mask);
-
-       /* Use mempool if lower order alloc failed or max vecs were requested */
-       if (bip == NULL) {
-               idx = BIOVEC_MAX_IDX;  /* so we free the payload properly later */
+       unsigned long idx = BIO_POOL_NONE;
+       unsigned inline_vecs;
+
+       if (!bs) {
+               bip = kmalloc(sizeof(struct bio_integrity_payload) +
+                             sizeof(struct bio_vec) * nr_vecs, gfp_mask);
+               inline_vecs = nr_vecs;
+       } else {
                bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
-
-               if (unlikely(bip == NULL)) {
-                       printk(KERN_ERR "%s: could not alloc bip\n", __func__);
-                       return NULL;
-               }
+               inline_vecs = BIP_INLINE_VECS;
        }
 
+       if (unlikely(!bip))
+               return NULL;
+
        memset(bip, 0, sizeof(*bip));
 
+       if (nr_vecs > inline_vecs) {
+               bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
+                                         bs->bvec_integrity_pool);
+               if (!bip->bip_vec)
+                       goto err;
+       } else {
+               bip->bip_vec = bip->bip_inline_vecs;
+       }
+
        bip->bip_slab = idx;
        bip->bip_bio = bio;
        bio->bi_integrity = bip;
 
        return bip;
+err:
+       mempool_free(bip, bs->bio_integrity_pool);
+       return NULL;
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
 
@@ -130,20 +97,18 @@ void bio_integrity_free(struct bio *bio)
        struct bio_integrity_payload *bip = bio->bi_integrity;
        struct bio_set *bs = bio->bi_pool;
 
-       if (!bs)
-               bs = fs_bio_set;
-
-       BUG_ON(bip == NULL);
-
-       /* A cloned bio doesn't own the integrity metadata */
-       if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY)
-           && bip->bip_buf != NULL)
+       if (bip->bip_owns_buf)
                kfree(bip->bip_buf);
 
-       if (use_bip_pool(bip->bip_slab))
+       if (bs) {
+               if (bip->bip_slab != BIO_POOL_NONE)
+                       bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
+                                 bip->bip_slab);
+
                mempool_free(bip, bs->bio_integrity_pool);
-       else
-               kmem_cache_free(bip_slab[bip->bip_slab].slab, bip);
+       } else {
+               kfree(bip);
+       }
 
        bio->bi_integrity = NULL;
 }
@@ -419,6 +384,7 @@ int bio_integrity_prep(struct bio *bio)
                return -EIO;
        }
 
+       bip->bip_owns_buf = 1;
        bip->bip_buf = buf;
        bip->bip_size = len;
        bip->bip_sector = bio->bi_sector;
@@ -694,11 +660,11 @@ void bio_integrity_split(struct bio *bio, struct bio_pair *bp, int sectors)
        bp->bio1.bi_integrity = &bp->bip1;
        bp->bio2.bi_integrity = &bp->bip2;
 
-       bp->iv1 = bip->bip_vec[0];
-       bp->iv2 = bip->bip_vec[0];
+       bp->iv1 = bip->bip_vec[bip->bip_idx];
+       bp->iv2 = bip->bip_vec[bip->bip_idx];
 
-       bp->bip1.bip_vec[0] = bp->iv1;
-       bp->bip2.bip_vec[0] = bp->iv2;
+       bp->bip1.bip_vec = &bp->iv1;
+       bp->bip2.bip_vec = &bp->iv2;
 
        bp->iv1.bv_len = sectors * bi->tuple_size;
        bp->iv2.bv_offset += sectors * bi->tuple_size;
@@ -746,13 +712,14 @@ EXPORT_SYMBOL(bio_integrity_clone);
 
 int bioset_integrity_create(struct bio_set *bs, int pool_size)
 {
-       unsigned int max_slab = vecs_to_idx(BIO_MAX_PAGES);
-
        if (bs->bio_integrity_pool)
                return 0;
 
-       bs->bio_integrity_pool =
-               mempool_create_slab_pool(pool_size, bip_slab[max_slab].slab);
+       bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
+
+       bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
+       if (!bs->bvec_integrity_pool)
+               return -1;
 
        if (!bs->bio_integrity_pool)
                return -1;
@@ -765,13 +732,14 @@ void bioset_integrity_free(struct bio_set *bs)
 {
        if (bs->bio_integrity_pool)
                mempool_destroy(bs->bio_integrity_pool);
+
+       if (bs->bvec_integrity_pool)
+               mempool_destroy(bs->bio_integrity_pool);
 }
 EXPORT_SYMBOL(bioset_integrity_free);
 
 void __init bio_integrity_init(void)
 {
-       unsigned int i;
-
        /*
         * kintegrityd won't block much but may burn a lot of CPU cycles.
         * Make it highpri CPU intensive wq with max concurrency of 1.
@@ -781,14 +749,10 @@ void __init bio_integrity_init(void)
        if (!kintegrityd_wq)
                panic("Failed to create kintegrityd\n");
 
-       for (i = 0 ; i < BIOVEC_NR_POOLS ; i++) {
-               unsigned int size;
-
-               size = sizeof(struct bio_integrity_payload)
-                       + bip_slab[i].nr_vecs * sizeof(struct bio_vec);
-
-               bip_slab[i].slab =
-                       kmem_cache_create(bip_slab[i].name, size, 0,
-                                         SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-       }
+       bip_slab = kmem_cache_create("bio_integrity_payload",
+                                    sizeof(struct bio_integrity_payload) +
+                                    sizeof(struct bio_vec) * BIP_INLINE_VECS,
+                                    0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+       if (!bip_slab)
+               panic("Failed to create slab\n");
 }
index 954d73124b411a733891f00394c44a1ffaefd0b1..94bbc04dba77053bb47d3d8b793a3a8218f0a0d2 100644 (file)
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -161,12 +161,12 @@ unsigned int bvec_nr_vecs(unsigned short idx)
        return bvec_slabs[idx].nr_vecs;
 }
 
-void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
+void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
 {
        BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
 
        if (idx == BIOVEC_MAX_IDX)
-               mempool_free(bv, bs->bvec_pool);
+               mempool_free(bv, pool);
        else {
                struct biovec_slab *bvs = bvec_slabs + idx;
 
@@ -174,8 +174,8 @@ void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
        }
 }
 
-struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
-                             struct bio_set *bs)
+struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
+                          mempool_t *pool)
 {
        struct bio_vec *bvl;
 
@@ -211,7 +211,7 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
         */
        if (*idx == BIOVEC_MAX_IDX) {
 fallback:
-               bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
+               bvl = mempool_alloc(pool, gfp_mask);
        } else {
                struct biovec_slab *bvs = bvec_slabs + *idx;
                gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
@@ -253,8 +253,8 @@ static void bio_free(struct bio *bio)
        __bio_free(bio);
 
        if (bs) {
-               if (bio_has_allocated_vec(bio))
-                       bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
+               if (bio_flagged(bio, BIO_OWNS_VEC))
+                       bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
 
                /*
                 * If we have front padding, adjust the bio pointer before freeing
@@ -298,6 +298,54 @@ void bio_reset(struct bio *bio)
 }
 EXPORT_SYMBOL(bio_reset);
 
+static void bio_alloc_rescue(struct work_struct *work)
+{
+       struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
+       struct bio *bio;
+
+       while (1) {
+               spin_lock(&bs->rescue_lock);
+               bio = bio_list_pop(&bs->rescue_list);
+               spin_unlock(&bs->rescue_lock);
+
+               if (!bio)
+                       break;
+
+               generic_make_request(bio);
+       }
+}
+
+static void punt_bios_to_rescuer(struct bio_set *bs)
+{
+       struct bio_list punt, nopunt;
+       struct bio *bio;
+
+       /*
+        * In order to guarantee forward progress we must punt only bios that
+        * were allocated from this bio_set; otherwise, if there was a bio on
+        * there for a stacking driver higher up in the stack, processing it
+        * could require allocating bios from this bio_set, and doing that from
+        * our own rescuer would be bad.
+        *
+        * Since bio lists are singly linked, pop them all instead of trying to
+        * remove from the middle of the list:
+        */
+
+       bio_list_init(&punt);
+       bio_list_init(&nopunt);
+
+       while ((bio = bio_list_pop(current->bio_list)))
+               bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
+
+       *current->bio_list = nopunt;
+
+       spin_lock(&bs->rescue_lock);
+       bio_list_merge(&bs->rescue_list, &punt);
+       spin_unlock(&bs->rescue_lock);
+
+       queue_work(bs->rescue_workqueue, &bs->rescue_work);
+}
+
 /**
  * bio_alloc_bioset - allocate a bio for I/O
  * @gfp_mask:   the GFP_ mask given to the slab allocator
@@ -315,11 +363,27 @@ EXPORT_SYMBOL(bio_reset);
  *   previously allocated bio for IO before attempting to allocate a new one.
  *   Failure to do so can cause deadlocks under memory pressure.
  *
+ *   Note that when running under generic_make_request() (i.e. any block
+ *   driver), bios are not submitted until after you return - see the code in
+ *   generic_make_request() that converts recursion into iteration, to prevent
+ *   stack overflows.
+ *
+ *   This would normally mean allocating multiple bios under
+ *   generic_make_request() would be susceptible to deadlocks, but we have
+ *   deadlock avoidance code that resubmits any blocked bios from a rescuer
+ *   thread.
+ *
+ *   However, we do not guarantee forward progress for allocations from other
+ *   mempools. Doing multiple allocations from the same mempool under
+ *   generic_make_request() should be avoided - instead, use bio_set's front_pad
+ *   for per bio allocations.
+ *
  *   RETURNS:
  *   Pointer to new bio on success, NULL on failure.
  */
 struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
 {
+       gfp_t saved_gfp = gfp_mask;
        unsigned front_pad;
        unsigned inline_vecs;
        unsigned long idx = BIO_POOL_NONE;
@@ -337,7 +401,37 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
                front_pad = 0;
                inline_vecs = nr_iovecs;
        } else {
+               /*
+                * generic_make_request() converts recursion to iteration; this
+                * means if we're running beneath it, any bios we allocate and
+                * submit will not be submitted (and thus freed) until after we
+                * return.
+                *
+                * This exposes us to a potential deadlock if we allocate
+                * multiple bios from the same bio_set() while running
+                * underneath generic_make_request(). If we were to allocate
+                * multiple bios (say a stacking block driver that was splitting
+                * bios), we would deadlock if we exhausted the mempool's
+                * reserve.
+                *
+                * We solve this, and guarantee forward progress, with a rescuer
+                * workqueue per bio_set. If we go to allocate and there are
+                * bios on current->bio_list, we first try the allocation
+                * without __GFP_WAIT; if that fails, we punt those bios we
+                * would be blocking to the rescuer workqueue before we retry
+                * with the original gfp_flags.
+                */
+
+               if (current->bio_list && !bio_list_empty(current->bio_list))
+                       gfp_mask &= ~__GFP_WAIT;
+
                p = mempool_alloc(bs->bio_pool, gfp_mask);
+               if (!p && gfp_mask != saved_gfp) {
+                       punt_bios_to_rescuer(bs);
+                       gfp_mask = saved_gfp;
+                       p = mempool_alloc(bs->bio_pool, gfp_mask);
+               }
+
                front_pad = bs->front_pad;
                inline_vecs = BIO_INLINE_VECS;
        }
@@ -349,9 +443,17 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
        bio_init(bio);
 
        if (nr_iovecs > inline_vecs) {
-               bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
+               bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+               if (!bvl && gfp_mask != saved_gfp) {
+                       punt_bios_to_rescuer(bs);
+                       gfp_mask = saved_gfp;
+                       bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
+               }
+
                if (unlikely(!bvl))
                        goto err_free;
+
+               bio->bi_flags |= 1 << BIO_OWNS_VEC;
        } else if (nr_iovecs) {
                bvl = bio->bi_inline_vecs;
        }
@@ -653,6 +755,181 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 }
 EXPORT_SYMBOL(bio_add_page);
 
+struct submit_bio_ret {
+       struct completion event;
+       int error;
+};
+
+static void submit_bio_wait_endio(struct bio *bio, int error)
+{
+       struct submit_bio_ret *ret = bio->bi_private;
+
+       ret->error = error;
+       complete(&ret->event);
+}
+
+/**
+ * submit_bio_wait - submit a bio, and wait until it completes
+ * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
+ * @bio: The &struct bio which describes the I/O
+ *
+ * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
+ * bio_endio() on failure.
+ */
+int submit_bio_wait(int rw, struct bio *bio)
+{
+       struct submit_bio_ret ret;
+
+       rw |= REQ_SYNC;
+       init_completion(&ret.event);
+       bio->bi_private = &ret;
+       bio->bi_end_io = submit_bio_wait_endio;
+       submit_bio(rw, bio);
+       wait_for_completion(&ret.event);
+
+       return ret.error;
+}
+EXPORT_SYMBOL(submit_bio_wait);
+
+/**
+ * bio_advance - increment/complete a bio by some number of bytes
+ * @bio:       bio to advance
+ * @bytes:     number of bytes to complete
+ *
+ * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
+ * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
+ * be updated on the last bvec as well.
+ *
+ * @bio will then represent the remaining, uncompleted portion of the io.
+ */
+void bio_advance(struct bio *bio, unsigned bytes)
+{
+       if (bio_integrity(bio))
+               bio_integrity_advance(bio, bytes);
+
+       bio->bi_sector += bytes >> 9;
+       bio->bi_size -= bytes;
+
+       if (bio->bi_rw & BIO_NO_ADVANCE_ITER_MASK)
+               return;
+
+       while (bytes) {
+               if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
+                       WARN_ONCE(1, "bio idx %d >= vcnt %d\n",
+                                 bio->bi_idx, bio->bi_vcnt);
+                       break;
+               }
+
+               if (bytes >= bio_iovec(bio)->bv_len) {
+                       bytes -= bio_iovec(bio)->bv_len;
+                       bio->bi_idx++;
+               } else {
+                       bio_iovec(bio)->bv_len -= bytes;
+                       bio_iovec(bio)->bv_offset += bytes;
+                       bytes = 0;
+               }
+       }
+}
+EXPORT_SYMBOL(bio_advance);
+
+/**
+ * bio_alloc_pages - allocates a single page for each bvec in a bio
+ * @bio: bio to allocate pages for
+ * @gfp_mask: flags for allocation
+ *
+ * Allocates pages up to @bio->bi_vcnt.
+ *
+ * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
+ * freed.
+ */
+int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+{
+       int i;
+       struct bio_vec *bv;
+
+       bio_for_each_segment_all(bv, bio, i) {
+               bv->bv_page = alloc_page(gfp_mask);
+               if (!bv->bv_page) {
+                       while (--bv >= bio->bi_io_vec)
+                               __free_page(bv->bv_page);
+                       return -ENOMEM;
+               }
+       }
+
+       return 0;
+}
+EXPORT_SYMBOL(bio_alloc_pages);
+
+/**
+ * bio_copy_data - copy contents of data buffers from one chain of bios to
+ * another
+ * @src: source bio list
+ * @dst: destination bio list
+ *
+ * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
+ * @src and @dst as linked lists of bios.
+ *
+ * Stops when it reaches the end of either @src or @dst - that is, copies
+ * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
+ */
+void bio_copy_data(struct bio *dst, struct bio *src)
+{
+       struct bio_vec *src_bv, *dst_bv;
+       unsigned src_offset, dst_offset, bytes;
+       void *src_p, *dst_p;
+
+       src_bv = bio_iovec(src);
+       dst_bv = bio_iovec(dst);
+
+       src_offset = src_bv->bv_offset;
+       dst_offset = dst_bv->bv_offset;
+
+       while (1) {
+               if (src_offset == src_bv->bv_offset + src_bv->bv_len) {
+                       src_bv++;
+                       if (src_bv == bio_iovec_idx(src, src->bi_vcnt)) {
+                               src = src->bi_next;
+                               if (!src)
+                                       break;
+
+                               src_bv = bio_iovec(src);
+                       }
+
+                       src_offset = src_bv->bv_offset;
+               }
+
+               if (dst_offset == dst_bv->bv_offset + dst_bv->bv_len) {
+                       dst_bv++;
+                       if (dst_bv == bio_iovec_idx(dst, dst->bi_vcnt)) {
+                               dst = dst->bi_next;
+                               if (!dst)
+                                       break;
+
+                               dst_bv = bio_iovec(dst);
+                       }
+
+                       dst_offset = dst_bv->bv_offset;
+               }
+
+               bytes = min(dst_bv->bv_offset + dst_bv->bv_len - dst_offset,
+                           src_bv->bv_offset + src_bv->bv_len - src_offset);
+
+               src_p = kmap_atomic(src_bv->bv_page);
+               dst_p = kmap_atomic(dst_bv->bv_page);
+
+               memcpy(dst_p + dst_bv->bv_offset,
+                      src_p + src_bv->bv_offset,
+                      bytes);
+
+               kunmap_atomic(dst_p);
+               kunmap_atomic(src_p);
+
+               src_offset += bytes;
+               dst_offset += bytes;
+       }
+}
+EXPORT_SYMBOL(bio_copy_data);
+
 struct bio_map_data {
        struct bio_vec *iovecs;
        struct sg_iovec *sgvecs;
@@ -715,7 +992,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs,
        int iov_idx = 0;
        unsigned int iov_off = 0;
 
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                char *bv_addr = page_address(bvec->bv_page);
                unsigned int bv_len = iovecs[i].bv_len;
 
@@ -897,7 +1174,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q,
        return bio;
 cleanup:
        if (!map_data)
-               bio_for_each_segment(bvec, bio, i)
+               bio_for_each_segment_all(bvec, bio, i)
                        __free_page(bvec->bv_page);
 
        bio_put(bio);
@@ -1111,7 +1388,7 @@ static void __bio_unmap_user(struct bio *bio)
        /*
         * make sure we dirty pages we wrote to
         */
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                if (bio_data_dir(bio) == READ)
                        set_page_dirty_lock(bvec->bv_page);
 
@@ -1217,7 +1494,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err)
        int i;
        char *p = bmd->sgvecs[0].iov_base;
 
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                char *addr = page_address(bvec->bv_page);
                int len = bmd->iovecs[i].bv_len;
 
@@ -1257,7 +1534,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
        if (!reading) {
                void *p = data;
 
-               bio_for_each_segment(bvec, bio, i) {
+               bio_for_each_segment_all(bvec, bio, i) {
                        char *addr = page_address(bvec->bv_page);
 
                        memcpy(addr, p, bvec->bv_len);
@@ -1302,11 +1579,11 @@ EXPORT_SYMBOL(bio_copy_kern);
  */
 void bio_set_pages_dirty(struct bio *bio)
 {
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec;
        int i;
 
-       for (i = 0; i < bio->bi_vcnt; i++) {
-               struct page *page = bvec[i].bv_page;
+       bio_for_each_segment_all(bvec, bio, i) {
+               struct page *page = bvec->bv_page;
 
                if (page && !PageCompound(page))
                        set_page_dirty_lock(page);
@@ -1315,11 +1592,11 @@ void bio_set_pages_dirty(struct bio *bio)
 
 static void bio_release_pages(struct bio *bio)
 {
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec;
        int i;
 
-       for (i = 0; i < bio->bi_vcnt; i++) {
-               struct page *page = bvec[i].bv_page;
+       bio_for_each_segment_all(bvec, bio, i) {
+               struct page *page = bvec->bv_page;
 
                if (page)
                        put_page(page);
@@ -1368,16 +1645,16 @@ static void bio_dirty_fn(struct work_struct *work)
 
 void bio_check_pages_dirty(struct bio *bio)
 {
-       struct bio_vec *bvec = bio->bi_io_vec;
+       struct bio_vec *bvec;
        int nr_clean_pages = 0;
        int i;
 
-       for (i = 0; i < bio->bi_vcnt; i++) {
-               struct page *page = bvec[i].bv_page;
+       bio_for_each_segment_all(bvec, bio, i) {
+               struct page *page = bvec->bv_page;
 
                if (PageDirty(page) || PageCompound(page)) {
                        page_cache_release(page);
-                       bvec[i].bv_page = NULL;
+                       bvec->bv_page = NULL;
                } else {
                        nr_clean_pages++;
                }
@@ -1478,8 +1755,7 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
        trace_block_split(bdev_get_queue(bi->bi_bdev), bi,
                                bi->bi_sector + first_sectors);
 
-       BUG_ON(bi->bi_vcnt != 1 && bi->bi_vcnt != 0);
-       BUG_ON(bi->bi_idx != 0);
+       BUG_ON(bio_segments(bi) > 1);
        atomic_set(&bp->cnt, 3);
        bp->error = 0;
        bp->bio1 = *bi;
@@ -1489,8 +1765,8 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
        bp->bio1.bi_size = first_sectors << 9;
 
        if (bi->bi_vcnt != 0) {
-               bp->bv1 = bi->bi_io_vec[0];
-               bp->bv2 = bi->bi_io_vec[0];
+               bp->bv1 = *bio_iovec(bi);
+               bp->bv2 = *bio_iovec(bi);
 
                if (bio_is_rw(bi)) {
                        bp->bv2.bv_offset += first_sectors << 9;
@@ -1542,7 +1818,7 @@ sector_t bio_sector_offset(struct bio *bio, unsigned short index,
        if (index >= bio->bi_idx)
                index = bio->bi_vcnt - 1;
 
-       __bio_for_each_segment(bv, bio, i, 0) {
+       bio_for_each_segment_all(bv, bio, i) {
                if (i == index) {
                        if (offset > bv->bv_offset)
                                sectors += (offset - bv->bv_offset) / sector_sz;
@@ -1560,29 +1836,25 @@ EXPORT_SYMBOL(bio_sector_offset);
  * create memory pools for biovec's in a bio_set.
  * use the global biovec slabs created for general use.
  */
-static int biovec_create_pools(struct bio_set *bs, int pool_entries)
+mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries)
 {
        struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
 
-       bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
-       if (!bs->bvec_pool)
-               return -ENOMEM;
-
-       return 0;
-}
-
-static void biovec_free_pools(struct bio_set *bs)
-{
-       mempool_destroy(bs->bvec_pool);
+       return mempool_create_slab_pool(pool_entries, bp->slab);
 }
 
 void bioset_free(struct bio_set *bs)
 {
+       if (bs->rescue_workqueue)
+               destroy_workqueue(bs->rescue_workqueue);
+
        if (bs->bio_pool)
                mempool_destroy(bs->bio_pool);
 
+       if (bs->bvec_pool)
+               mempool_destroy(bs->bvec_pool);
+
        bioset_integrity_free(bs);
-       biovec_free_pools(bs);
        bio_put_slab(bs);
 
        kfree(bs);
@@ -1613,6 +1885,10 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
 
        bs->front_pad = front_pad;
 
+       spin_lock_init(&bs->rescue_lock);
+       bio_list_init(&bs->rescue_list);
+       INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
+
        bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
        if (!bs->bio_slab) {
                kfree(bs);
@@ -1623,9 +1899,15 @@ struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
        if (!bs->bio_pool)
                goto bad;
 
-       if (!biovec_create_pools(bs, pool_size))
-               return bs;
+       bs->bvec_pool = biovec_create_pool(bs, pool_size);
+       if (!bs->bvec_pool)
+               goto bad;
+
+       bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
+       if (!bs->rescue_workqueue)
+               goto bad;
 
+       return bs;
 bad:
        bioset_free(bs);
        return NULL;
index d9871c1f08949082ab5175f09c50668f961b9512..2091db8cdd783a2287ce9165a7223cfe59cf5a2b 100644 (file)
@@ -1556,7 +1556,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
                return 0;
 
        size -= pos;
-       if (size < INT_MAX)
+       if (size < iocb->ki_left)
                nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
        return generic_file_aio_read(iocb, iov, nr_segs, pos);
 }
index cdee391fc7bfd57c596204a8474142e7afe60335..73f2bfe3ac9302091608beae85b4aecf28622240 100644 (file)
@@ -2560,8 +2560,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                if (old_compressed)
                        contig = bio->bi_sector == sector;
                else
-                       contig = bio->bi_sector + (bio->bi_size >> 9) ==
-                               sector;
+                       contig = bio_end_sector(bio) == sector;
 
                if (prev_bio_flags != bio_flags || !contig ||
                    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
index 2854c824ab6443d04eac19cf0b540f79c257cfa0..6789772265707bdd02d97f91dc92e8b34a2fd6bb 100644 (file)
@@ -5177,7 +5177,7 @@ static int bio_size_ok(struct block_device *bdev, struct bio *bio,
        }
 
        prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
-       if ((bio->bi_size >> 9) > max_sectors)
+       if (bio_sectors(bio) > max_sectors)
                return 0;
 
        if (!q->merge_bvec_fn)
index bc1fe14aaa3e4583aa351298fc9faa4f42d2d6a9..d2a4d1bb2d57aec3999e494d52c4f765a0ae48e8 100644 (file)
@@ -2977,7 +2977,6 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
        bio->bi_io_vec[0].bv_offset = bh_offset(bh);
 
        bio->bi_vcnt = 1;
-       bio->bi_idx = 0;
        bio->bi_size = bh->b_size;
 
        bio->bi_end_io = end_bio_bh_io_sync;
index 51d16e067d6815909907b6ccb33bba7d7ec72ab5..7ab90f5081eebc4ab8b0de88bef8d0b6310ed113 100644 (file)
@@ -442,8 +442,8 @@ static struct bio *dio_await_one(struct dio *dio)
 static int dio_bio_complete(struct dio *dio, struct bio *bio)
 {
        const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-       struct bio_vec *bvec = bio->bi_io_vec;
-       int page_no;
+       struct bio_vec *bvec;
+       unsigned i;
 
        if (!uptodate)
                dio->io_error = -EIO;
@@ -451,8 +451,8 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
        if (dio->is_async && dio->rw == READ) {
                bio_check_pages_dirty(bio);     /* transfers ownership */
        } else {
-               for (page_no = 0; page_no < bio->bi_vcnt; page_no++) {
-                       struct page *page = bvec[page_no].bv_page;
+               bio_for_each_segment_all(bvec, bio, i) {
+                       struct page *page = bvec->bv_page;
 
                        if (dio->rw == READ && !PageCompound(page))
                                set_page_dirty_lock(page);
index f936cb50dc0d524250dae6d6ac5382db5c1f25b7..b744228886043d522fa15a32ff98cc80d29db3c9 100644 (file)
@@ -401,7 +401,7 @@ static void _clear_bio(struct bio *bio)
        struct bio_vec *bv;
        unsigned i;
 
-       __bio_for_each_segment(bv, bio, i, 0) {
+       bio_for_each_segment_all(bv, bio, i) {
                unsigned this_count = bv->bv_len;
 
                if (likely(PAGE_SIZE == this_count))
index b963f38ac298e82404befa6265c171aca4584aba..7682b970d0f1352cd019c8076ea55cc470ecd523 100644 (file)
@@ -432,7 +432,7 @@ static void _mark_read4write_pages_uptodate(struct ore_io_state *ios, int ret)
                if (!bio)
                        continue;
 
-               __bio_for_each_segment(bv, bio, i, 0) {
+               bio_for_each_segment_all(bv, bio, i) {
                        struct page *page = bv->bv_page;
 
                        SetPageUptodate(page);
index 798d4458a4d3a5798a7b04858d82a5f031849bab..3be57189efd5b3a8005321f02e40971af9429cf6 100644 (file)
@@ -22,7 +22,6 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
-#include <linux/freezer.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
 #include <linux/backing-dev.h>
@@ -88,20 +87,6 @@ static inline struct inode *wb_inode(struct list_head *head)
 #define CREATE_TRACE_POINTS
 #include <trace/events/writeback.h>
 
-/* Wakeup flusher thread or forker thread to fork it. Requires bdi->wb_lock. */
-static void bdi_wakeup_flusher(struct backing_dev_info *bdi)
-{
-       if (bdi->wb.task) {
-               wake_up_process(bdi->wb.task);
-       } else {
-               /*
-                * The bdi thread isn't there, wake up the forker thread which
-                * will create and run it.
-                */
-               wake_up_process(default_backing_dev_info.wb.task);
-       }
-}
-
 static void bdi_queue_work(struct backing_dev_info *bdi,
                           struct wb_writeback_work *work)
 {
@@ -109,10 +94,9 @@ static void bdi_queue_work(struct backing_dev_info *bdi,
 
        spin_lock_bh(&bdi->wb_lock);
        list_add_tail(&work->list, &bdi->work_list);
-       if (!bdi->wb.task)
-               trace_writeback_nothread(bdi, work);
-       bdi_wakeup_flusher(bdi);
        spin_unlock_bh(&bdi->wb_lock);
+
+       mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 }
 
 static void
@@ -127,10 +111,8 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
         */
        work = kzalloc(sizeof(*work), GFP_ATOMIC);
        if (!work) {
-               if (bdi->wb.task) {
-                       trace_writeback_nowork(bdi);
-                       wake_up_process(bdi->wb.task);
-               }
+               trace_writeback_nowork(bdi);
+               mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
                return;
        }
 
@@ -177,9 +159,7 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
         * writeback as soon as there is no other work to do.
         */
        trace_writeback_wake_background(bdi);
-       spin_lock_bh(&bdi->wb_lock);
-       bdi_wakeup_flusher(bdi);
-       spin_unlock_bh(&bdi->wb_lock);
+       mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 }
 
 /*
@@ -1020,67 +1000,49 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
 
 /*
  * Handle writeback of dirty data for the device backed by this bdi. Also
- * wakes up periodically and does kupdated style flushing.
+ * reschedules periodically and does kupdated style flushing.
  */
-int bdi_writeback_thread(void *data)
+void bdi_writeback_workfn(struct work_struct *work)
 {
-       struct bdi_writeback *wb = data;
+       struct bdi_writeback *wb = container_of(to_delayed_work(work),
+                                               struct bdi_writeback, dwork);
        struct backing_dev_info *bdi = wb->bdi;
        long pages_written;
 
        set_worker_desc("flush-%s", dev_name(bdi->dev));
        current->flags |= PF_SWAPWRITE;
-       set_freezable();
-       wb->last_active = jiffies;
-
-       /*
-        * Our parent may run at a different priority, just set us to normal
-        */
-       set_user_nice(current, 0);
-
-       trace_writeback_thread_start(bdi);
 
-       while (!kthread_freezable_should_stop(NULL)) {
+       if (likely(!current_is_workqueue_rescuer() ||
+                  list_empty(&bdi->bdi_list))) {
                /*
-                * Remove own delayed wake-up timer, since we are already awake
-                * and we'll take care of the periodic write-back.
+                * The normal path.  Keep writing back @bdi until its
+                * work_list is empty.  Note that this path is also taken
+                * if @bdi is shutting down even when we're running off the
+                * rescuer as work_list needs to be drained.
                 */
-               del_timer(&wb->wakeup_timer);
-
-               pages_written = wb_do_writeback(wb, 0);
-
+               do {
+                       pages_written = wb_do_writeback(wb, 0);
+                       trace_writeback_pages_written(pages_written);
+               } while (!list_empty(&bdi->work_list));
+       } else {
+               /*
+                * bdi_wq can't get enough workers and we're running off
+                * the emergency worker.  Don't hog it.  Hopefully, 1024 is
+                * enough for efficient IO.
+                */
+               pages_written = writeback_inodes_wb(&bdi->wb, 1024,
+                                                   WB_REASON_FORKER_THREAD);
                trace_writeback_pages_written(pages_written);
-
-               if (pages_written)
-                       wb->last_active = jiffies;
-
-               set_current_state(TASK_INTERRUPTIBLE);
-               if (!list_empty(&bdi->work_list) || kthread_should_stop()) {
-                       __set_current_state(TASK_RUNNING);
-                       continue;
-               }
-
-               if (wb_has_dirty_io(wb) && dirty_writeback_interval)
-                       schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-               else {
-                       /*
-                        * We have nothing to do, so can go sleep without any
-                        * timeout and save power. When a work is queued or
-                        * something is made dirty - we will be woken up.
-                        */
-                       schedule();
-               }
        }
 
-       /* Flush any work that raced with us exiting */
-       if (!list_empty(&bdi->work_list))
-               wb_do_writeback(wb, 1);
+       if (!list_empty(&bdi->work_list) ||
+           (wb_has_dirty_io(wb) && dirty_writeback_interval))
+               queue_delayed_work(bdi_wq, &wb->dwork,
+                       msecs_to_jiffies(dirty_writeback_interval * 10));
 
-       trace_writeback_thread_stop(bdi);
-       return 0;
+       current->flags &= ~PF_SWAPWRITE;
 }
 
-
 /*
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.
index 7318abf9d0fb863165857fb8adc897ad0f5e7c9e..c5fa758fd8446e1938036be9cdedaf75e2bc552b 100644 (file)
@@ -300,7 +300,7 @@ static struct bio *gfs2_log_get_bio(struct gfs2_sbd *sdp, u64 blkno)
        u64 nblk;
 
        if (bio) {
-               nblk = bio->bi_sector + bio_sectors(bio);
+               nblk = bio_end_sector(bio);
                nblk >>= sdp->sd_fsb2bb_shift;
                if (blkno == nblk)
                        return bio;
index cbe48ea9318eea1ce44cae4bf3190911edf6c4c4..c57499dca89c5a3910bcefc5af951179aa693f24 100644 (file)
@@ -2005,7 +2005,6 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp)
        bio->bi_io_vec[0].bv_offset = bp->l_offset;
 
        bio->bi_vcnt = 1;
-       bio->bi_idx = 0;
        bio->bi_size = LOGPSIZE;
 
        bio->bi_end_io = lbmIODone;
@@ -2146,7 +2145,6 @@ static void lbmStartIO(struct lbuf * bp)
        bio->bi_io_vec[0].bv_offset = bp->l_offset;
 
        bio->bi_vcnt = 1;
-       bio->bi_idx = 0;
        bio->bi_size = LOGPSIZE;
 
        bio->bi_end_io = lbmIODone;
index e784a217b50067919ad3ebffe559b3552b58a9bc..550475ca6a0e0ec35c82d90b10372f2e4434fe90 100644 (file)
@@ -32,7 +32,6 @@ static int sync_request(struct page *page, struct block_device *bdev, int rw)
        bio_vec.bv_len = PAGE_SIZE;
        bio_vec.bv_offset = 0;
        bio.bi_vcnt = 1;
-       bio.bi_idx = 0;
        bio.bi_size = PAGE_SIZE;
        bio.bi_bdev = bdev;
        bio.bi_sector = page->index * (PAGE_SIZE >> 9);
@@ -108,7 +107,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
                if (i >= max_pages) {
                        /* Block layer cannot split bios :( */
                        bio->bi_vcnt = i;
-                       bio->bi_idx = 0;
                        bio->bi_size = i * PAGE_SIZE;
                        bio->bi_bdev = super->s_bdev;
                        bio->bi_sector = ofs >> 9;
@@ -136,7 +134,6 @@ static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
                unlock_page(page);
        }
        bio->bi_vcnt = nr_pages;
-       bio->bi_idx = 0;
        bio->bi_size = nr_pages * PAGE_SIZE;
        bio->bi_bdev = super->s_bdev;
        bio->bi_sector = ofs >> 9;
@@ -202,7 +199,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
                if (i >= max_pages) {
                        /* Block layer cannot split bios :( */
                        bio->bi_vcnt = i;
-                       bio->bi_idx = 0;
                        bio->bi_size = i * PAGE_SIZE;
                        bio->bi_bdev = super->s_bdev;
                        bio->bi_sector = ofs >> 9;
@@ -224,7 +220,6 @@ static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
                bio->bi_io_vec[i].bv_offset = 0;
        }
        bio->bi_vcnt = nr_pages;
-       bio->bi_idx = 0;
        bio->bi_size = nr_pages * PAGE_SIZE;
        bio->bi_bdev = super->s_bdev;
        bio->bi_sector = ofs >> 9;
index 350459910fe138a3d47573a07e8b125c62f36564..c3881553f7d15ef029323e49bcbc9e48365eb5db 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/writeback.h>
 #include <linux/atomic.h>
 #include <linux/sysctl.h>
+#include <linux/workqueue.h>
 
 struct page;
 struct device;
@@ -27,7 +28,6 @@ struct dentry;
  * Bits in backing_dev_info.state
  */
 enum bdi_state {
-       BDI_pending,            /* On its way to being activated */
        BDI_wb_alloc,           /* Default embedded wb allocated */
        BDI_async_congested,    /* The async (write) queue is getting full */
        BDI_sync_congested,     /* The sync queue is getting full */
@@ -53,10 +53,8 @@ struct bdi_writeback {
        unsigned int nr;
 
        unsigned long last_old_flush;   /* last old data flush */
-       unsigned long last_active;      /* last time bdi thread was active */
 
-       struct task_struct *task;       /* writeback thread */
-       struct timer_list wakeup_timer; /* used for delayed bdi thread wakeup */
+       struct delayed_work dwork;      /* work item used for writeback */
        struct list_head b_dirty;       /* dirty inodes */
        struct list_head b_io;          /* parked for writeback */
        struct list_head b_more_io;     /* parked for more writeback */
@@ -123,14 +121,15 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
                        enum wb_reason reason);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
-int bdi_writeback_thread(void *data);
+void bdi_writeback_workfn(struct work_struct *work);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);
 
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
-extern struct list_head bdi_pending_list;
+
+extern struct workqueue_struct *bdi_wq;
 
 static inline int wb_has_dirty_io(struct bdi_writeback *wb)
 {
@@ -336,11 +335,6 @@ static inline bool bdi_cap_swap_backed(struct backing_dev_info *bdi)
        return bdi->capabilities & BDI_CAP_SWAP_BACKED;
 }
 
-static inline bool bdi_cap_flush_forker(struct backing_dev_info *bdi)
-{
-       return bdi == &default_backing_dev_info;
-}
-
 static inline bool mapping_cap_writeback_dirty(struct address_space *mapping)
 {
        return bdi_cap_writeback_dirty(mapping->backing_dev_info);
index 820e7aaad4fdbbf432b188b083662b5015bd3905..ef24466d8f82516a76029577df9c8f1ec530cf20 100644 (file)
@@ -67,6 +67,7 @@
 #define bio_offset(bio)                bio_iovec((bio))->bv_offset
 #define bio_segments(bio)      ((bio)->bi_vcnt - (bio)->bi_idx)
 #define bio_sectors(bio)       ((bio)->bi_size >> 9)
+#define bio_end_sector(bio)    ((bio)->bi_sector + bio_sectors((bio)))
 
 static inline unsigned int bio_cur_bytes(struct bio *bio)
 {
@@ -84,11 +85,6 @@ static inline void *bio_data(struct bio *bio)
        return NULL;
 }
 
-static inline int bio_has_allocated_vec(struct bio *bio)
-{
-       return bio->bi_io_vec && bio->bi_io_vec != bio->bi_inline_vecs;
-}
-
 /*
  * will die
  */
@@ -136,16 +132,27 @@ static inline int bio_has_allocated_vec(struct bio *bio)
 #define bio_io_error(bio) bio_endio((bio), -EIO)
 
 /*
- * drivers should not use the __ version unless they _really_ want to
- * run through the entire bio and not just pending pieces
+ * drivers should not use the __ version unless they _really_ know what
+ * they're doing
  */
 #define __bio_for_each_segment(bvl, bio, i, start_idx)                 \
        for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx);  \
             i < (bio)->bi_vcnt;                                        \
             bvl++, i++)
 
+/*
+ * drivers should _never_ use the all version - the bio may have been split
+ * before it got to the driver and the driver won't own all of it
+ */
+#define bio_for_each_segment_all(bvl, bio, i)                          \
+       for (i = 0;                                                     \
+            bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt;       \
+            i++)
+
 #define bio_for_each_segment(bvl, bio, i)                              \
-       __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx)
+       for (i = (bio)->bi_idx;                                         \
+            bvl = bio_iovec_idx((bio), (i)), i < (bio)->bi_vcnt;       \
+            i++)
 
 /*
  * get a reference to a bio, so it won't disappear. the intended use is
@@ -180,9 +187,12 @@ struct bio_integrity_payload {
        unsigned short          bip_slab;       /* slab the bip came from */
        unsigned short          bip_vcnt;       /* # of integrity bio_vecs */
        unsigned short          bip_idx;        /* current bip_vec index */
+       unsigned                bip_owns_buf:1; /* should free bip_buf */
 
        struct work_struct      bip_work;       /* I/O completion */
-       struct bio_vec          bip_vec[0];     /* embedded bvec array */
+
+       struct bio_vec          *bip_vec;
+       struct bio_vec          bip_inline_vecs[0];/* embedded bvec array */
 };
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
@@ -211,6 +221,7 @@ extern void bio_pair_release(struct bio_pair *dbio);
 
 extern struct bio_set *bioset_create(unsigned int, unsigned int);
 extern void bioset_free(struct bio_set *);
+extern mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries);
 
 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
 extern void bio_put(struct bio *);
@@ -245,6 +256,9 @@ extern void bio_endio(struct bio *, int);
 struct request_queue;
 extern int bio_phys_segments(struct request_queue *, struct bio *);
 
+extern int submit_bio_wait(int rw, struct bio *bio);
+extern void bio_advance(struct bio *, unsigned);
+
 extern void bio_init(struct bio *);
 extern void bio_reset(struct bio *);
 
@@ -279,6 +293,9 @@ static inline void bio_flush_dcache_pages(struct bio *bi)
 }
 #endif
 
+extern void bio_copy_data(struct bio *dst, struct bio *src);
+extern int bio_alloc_pages(struct bio *bio, gfp_t gfp);
+
 extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *,
                                 unsigned long, unsigned int, int, gfp_t);
 extern struct bio *bio_copy_user_iov(struct request_queue *,
@@ -286,8 +303,8 @@ extern struct bio *bio_copy_user_iov(struct request_queue *,
                                     int, int, gfp_t);
 extern int bio_uncopy_user(struct bio *);
 void zero_fill_bio(struct bio *bio);
-extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *);
-extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int);
+extern struct bio_vec *bvec_alloc(gfp_t, int, unsigned long *, mempool_t *);
+extern void bvec_free(mempool_t *, struct bio_vec *, unsigned int);
 extern unsigned int bvec_nr_vecs(unsigned short idx);
 
 #ifdef CONFIG_BLK_CGROUP
@@ -298,39 +315,6 @@ static inline int bio_associate_current(struct bio *bio) { return -ENOENT; }
 static inline void bio_disassociate_task(struct bio *bio) { }
 #endif /* CONFIG_BLK_CGROUP */
 
-/*
- * bio_set is used to allow other portions of the IO system to
- * allocate their own private memory pools for bio and iovec structures.
- * These memory pools in turn all allocate from the bio_slab
- * and the bvec_slabs[].
- */
-#define BIO_POOL_SIZE 2
-#define BIOVEC_NR_POOLS 6
-#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1)
-
-struct bio_set {
-       struct kmem_cache *bio_slab;
-       unsigned int front_pad;
-
-       mempool_t *bio_pool;
-#if defined(CONFIG_BLK_DEV_INTEGRITY)
-       mempool_t *bio_integrity_pool;
-#endif
-       mempool_t *bvec_pool;
-};
-
-struct biovec_slab {
-       int nr_vecs;
-       char *name;
-       struct kmem_cache *slab;
-};
-
-/*
- * a small number of entries is fine, not going to be performance critical.
- * basically we just need to survive
- */
-#define BIO_SPLIT_ENTRIES 2
-
 #ifdef CONFIG_HIGHMEM
 /*
  * remember never ever reenable interrupts between a bvec_kmap_irq and
@@ -527,6 +511,49 @@ static inline struct bio *bio_list_get(struct bio_list *bl)
        return bio;
 }
 
+/*
+ * bio_set is used to allow other portions of the IO system to
+ * allocate their own private memory pools for bio and iovec structures.
+ * These memory pools in turn all allocate from the bio_slab
+ * and the bvec_slabs[].
+ */
+#define BIO_POOL_SIZE 2
+#define BIOVEC_NR_POOLS 6
+#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1)
+
+struct bio_set {
+       struct kmem_cache *bio_slab;
+       unsigned int front_pad;
+
+       mempool_t *bio_pool;
+       mempool_t *bvec_pool;
+#if defined(CONFIG_BLK_DEV_INTEGRITY)
+       mempool_t *bio_integrity_pool;
+       mempool_t *bvec_integrity_pool;
+#endif
+
+       /*
+        * Deadlock avoidance for stacking block drivers: see comments in
+        * bio_alloc_bioset() for details
+        */
+       spinlock_t              rescue_lock;
+       struct bio_list         rescue_list;
+       struct work_struct      rescue_work;
+       struct workqueue_struct *rescue_workqueue;
+};
+
+struct biovec_slab {
+       int nr_vecs;
+       char *name;
+       struct kmem_cache *slab;
+};
+
+/*
+ * a small number of entries is fine, not going to be performance critical.
+ * basically we just need to survive
+ */
+#define BIO_SPLIT_ENTRIES 2
+
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 
 #define bip_vec_idx(bip, idx)  (&(bip->bip_vec[(idx)]))
index 22990cf4439d2e8b91fedc0be9d254985f7ab64a..fa1abeb45b7602a4f0c1a4098f05f63d7a075281 100644 (file)
@@ -118,6 +118,7 @@ struct bio {
  * BIO_POOL_IDX()
  */
 #define BIO_RESET_BITS 13
+#define BIO_OWNS_VEC   13      /* bio_free() should free bvec */
 
 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
 
@@ -176,6 +177,7 @@ enum rq_flag_bits {
        __REQ_IO_STAT,          /* account I/O stat */
        __REQ_MIXED_MERGE,      /* merge of different types, fail separately */
        __REQ_KERNEL,           /* direct IO to kernel pages */
+       __REQ_PM,               /* runtime pm request */
        __REQ_NR_BITS,          /* stops here */
 };
 
@@ -198,6 +200,8 @@ enum rq_flag_bits {
         REQ_SECURE)
 #define REQ_CLONE_MASK         REQ_COMMON_MASK
 
+#define BIO_NO_ADVANCE_ITER_MASK       (REQ_DISCARD|REQ_WRITE_SAME)
+
 /* This mask is used for both bio and request merge checking */
 #define REQ_NOMERGE_FLAGS \
        (REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
@@ -224,5 +228,6 @@ enum rq_flag_bits {
 #define REQ_MIXED_MERGE                (1 << __REQ_MIXED_MERGE)
 #define REQ_SECURE             (1 << __REQ_SECURE)
 #define REQ_KERNEL             (1 << __REQ_KERNEL)
+#define REQ_PM                 (1 << __REQ_PM)
 
 #endif /* __LINUX_BLK_TYPES_H */
index e38cfe77f7f017ddc9347d66f224b986b86abf45..2fdb4a451b49bd626d9415b231c76b7ac927cf69 100644 (file)
@@ -361,6 +361,12 @@ struct request_queue {
         */
        struct kobject kobj;
 
+#ifdef CONFIG_PM_RUNTIME
+       struct device           *dev;
+       int                     rpm_status;
+       unsigned int            nr_pending;
+#endif
+
        /*
         * queue settings
         */
@@ -838,7 +844,7 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
                                                     unsigned int cmd_flags)
 {
        if (unlikely(cmd_flags & REQ_DISCARD))
-               return q->limits.max_discard_sectors;
+               return min(q->limits.max_discard_sectors, UINT_MAX >> 9);
 
        if (unlikely(cmd_flags & REQ_WRITE_SAME))
                return q->limits.max_write_same_sectors;
@@ -960,6 +966,27 @@ struct request_queue *blk_alloc_queue(gfp_t);
 struct request_queue *blk_alloc_queue_node(gfp_t, int);
 extern void blk_put_queue(struct request_queue *);
 
+/*
+ * block layer runtime pm functions
+ */
+#ifdef CONFIG_PM_RUNTIME
+extern void blk_pm_runtime_init(struct request_queue *q, struct device *dev);
+extern int blk_pre_runtime_suspend(struct request_queue *q);
+extern void blk_post_runtime_suspend(struct request_queue *q, int err);
+extern void blk_pre_runtime_resume(struct request_queue *q);
+extern void blk_post_runtime_resume(struct request_queue *q, int err);
+#else
+static inline void blk_pm_runtime_init(struct request_queue *q,
+       struct device *dev) {}
+static inline int blk_pre_runtime_suspend(struct request_queue *q)
+{
+       return -ENOSYS;
+}
+static inline void blk_post_runtime_suspend(struct request_queue *q, int err) {}
+static inline void blk_pre_runtime_resume(struct request_queue *q) {}
+static inline void blk_post_runtime_resume(struct request_queue *q, int err) {}
+#endif
+
 /*
  * blk_plug permits building a queue of related requests by holding the I/O
  * fragments for a short period. This allows merging of sequential requests
index 9c1467357b03c616967cd193efab6506e3e5adff..60ae7c3db912de7e068452de1a1c1978cad0a662 100644 (file)
@@ -244,7 +244,7 @@ TRACE_EVENT(block_bio_bounce,
                __entry->dev            = bio->bi_bdev ?
                                          bio->bi_bdev->bd_dev : 0;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),
@@ -281,7 +281,7 @@ TRACE_EVENT(block_bio_complete,
        TP_fast_assign(
                __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                __entry->error          = error;
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
        ),
@@ -309,7 +309,7 @@ DECLARE_EVENT_CLASS(block_bio_merge,
        TP_fast_assign(
                __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),
@@ -376,7 +376,7 @@ TRACE_EVENT(block_bio_queue,
        TP_fast_assign(
                __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
        ),
@@ -404,7 +404,7 @@ DECLARE_EVENT_CLASS(block_get_rq,
        TP_fast_assign(
                __entry->dev            = bio ? bio->bi_bdev->bd_dev : 0;
                __entry->sector         = bio ? bio->bi_sector : 0;
-               __entry->nr_sector      = bio ? bio->bi_size >> 9 : 0;
+               __entry->nr_sector      = bio ? bio_sectors(bio) : 0;
                blk_fill_rwbs(__entry->rwbs,
                              bio ? bio->bi_rw : 0, __entry->nr_sector);
                memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
@@ -580,7 +580,7 @@ TRACE_EVENT(block_bio_remap,
        TP_fast_assign(
                __entry->dev            = bio->bi_bdev->bd_dev;
                __entry->sector         = bio->bi_sector;
-               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->nr_sector      = bio_sectors(bio);
                __entry->old_dev        = dev;
                __entry->old_sector     = from;
                blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
index 6a16fd2e70ed27741ed13f80156d713d38127fb1..464ea82e10dbf1f1a519b75c52f9e129a5a715e0 100644 (file)
@@ -183,7 +183,6 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 DEFINE_EVENT(writeback_work_class, name, \
        TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
        TP_ARGS(bdi, work))
-DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
@@ -222,12 +221,8 @@ DEFINE_EVENT(writeback_class, name, \
 
 DEFINE_WRITEBACK_EVENT(writeback_nowork);
 DEFINE_WRITEBACK_EVENT(writeback_wake_background);
-DEFINE_WRITEBACK_EVENT(writeback_wake_thread);
-DEFINE_WRITEBACK_EVENT(writeback_wake_forker_thread);
 DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
 DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
-DEFINE_WRITEBACK_EVENT(writeback_thread_start);
-DEFINE_WRITEBACK_EVENT(writeback_thread_stop);
 
 DECLARE_EVENT_CLASS(wbc_class,
        TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
index eef0d113b79ed22734b980b076e05c3aee73a64c..b91488ba2e5a7edb6e3cbdc5876adaf9ae7f2791 100644 (file)
@@ -234,7 +234,6 @@ static void relay_destroy_buf(struct rchan_buf *buf)
 static void relay_remove_buf(struct kref *kref)
 {
        struct rchan_buf *buf = container_of(kref, struct rchan_buf, kref);
-       buf->chan->cb->remove_buf_file(buf->dentry);
        relay_destroy_buf(buf);
 }
 
@@ -484,6 +483,7 @@ static void relay_close_buf(struct rchan_buf *buf)
 {
        buf->finalized = 1;
        del_timer_sync(&buf->timer);
+       buf->chan->cb->remove_buf_file(buf->dentry);
        kref_put(&buf->kref, relay_remove_buf);
 }
 
index 41733c5dc820af44282d382ebcc3c938eca912ac..50251749225885d958a1b531519326bd177ecf2e 100644 (file)
@@ -31,13 +31,14 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info);
 static struct class *bdi_class;
 
 /*
- * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as
- * reader side protection for bdi_pending_list. bdi_list has RCU reader side
+ * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
  * locking.
  */
 DEFINE_SPINLOCK(bdi_lock);
 LIST_HEAD(bdi_list);
-LIST_HEAD(bdi_pending_list);
+
+/* bdi_wq serves all asynchronous writeback tasks */
+struct workqueue_struct *bdi_wq;
 
 void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
 {
@@ -257,6 +258,11 @@ static int __init default_bdi_init(void)
 {
        int err;
 
+       bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
+                                             WQ_UNBOUND | WQ_SYSFS, 0);
+       if (!bdi_wq)
+               return -ENOMEM;
+
        err = bdi_init(&default_backing_dev_info);
        if (!err)
                bdi_register(&default_backing_dev_info, NULL, "default");
@@ -271,26 +277,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
        return wb_has_dirty_io(&bdi->wb);
 }
 
-static void wakeup_timer_fn(unsigned long data)
-{
-       struct backing_dev_info *bdi = (struct backing_dev_info *)data;
-
-       spin_lock_bh(&bdi->wb_lock);
-       if (bdi->wb.task) {
-               trace_writeback_wake_thread(bdi);
-               wake_up_process(bdi->wb.task);
-       } else if (bdi->dev) {
-               /*
-                * When bdi tasks are inactive for long time, they are killed.
-                * In this case we have to wake-up the forker thread which
-                * should create and run the bdi thread.
-                */
-               trace_writeback_wake_forker_thread(bdi);
-               wake_up_process(default_backing_dev_info.wb.task);
-       }
-       spin_unlock_bh(&bdi->wb_lock);
-}
-
 /*
  * This function is used when the first inode for this bdi is marked dirty. It
  * wakes-up the corresponding bdi thread which should then take care of the
@@ -307,176 +293,7 @@ void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi)
        unsigned long timeout;
 
        timeout = msecs_to_jiffies(dirty_writeback_interval * 10);
-       mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout);
-}
-
-/*
- * Calculate the longest interval (jiffies) bdi threads are allowed to be
- * inactive.
- */
-static unsigned long bdi_longest_inactive(void)
-{
-       unsigned long interval;
-
-       interval = msecs_to_jiffies(dirty_writeback_interval * 10);
-       return max(5UL * 60 * HZ, interval);
-}
-
-/*
- * Clear pending bit and wakeup anybody waiting for flusher thread creation or
- * shutdown
- */
-static void bdi_clear_pending(struct backing_dev_info *bdi)
-{
-       clear_bit(BDI_pending, &bdi->state);
-       smp_mb__after_clear_bit();
-       wake_up_bit(&bdi->state, BDI_pending);
-}
-
-static int bdi_forker_thread(void *ptr)
-{
-       struct bdi_writeback *me = ptr;
-
-       current->flags |= PF_SWAPWRITE;
-       set_freezable();
-
-       /*
-        * Our parent may run at a different priority, just set us to normal
-        */
-       set_user_nice(current, 0);
-
-       for (;;) {
-               struct task_struct *task = NULL;
-               struct backing_dev_info *bdi;
-               enum {
-                       NO_ACTION,   /* Nothing to do */
-                       FORK_THREAD, /* Fork bdi thread */
-                       KILL_THREAD, /* Kill inactive bdi thread */
-               } action = NO_ACTION;
-
-               /*
-                * Temporary measure, we want to make sure we don't see
-                * dirty data on the default backing_dev_info
-                */
-               if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) {
-                       del_timer(&me->wakeup_timer);
-                       wb_do_writeback(me, 0);
-               }
-
-               spin_lock_bh(&bdi_lock);
-               /*
-                * In the following loop we are going to check whether we have
-                * some work to do without any synchronization with tasks
-                * waking us up to do work for them. Set the task state here
-                * so that we don't miss wakeups after verifying conditions.
-                */
-               set_current_state(TASK_INTERRUPTIBLE);
-
-               list_for_each_entry(bdi, &bdi_list, bdi_list) {
-                       bool have_dirty_io;
-
-                       if (!bdi_cap_writeback_dirty(bdi) ||
-                            bdi_cap_flush_forker(bdi))
-                               continue;
-
-                       WARN(!test_bit(BDI_registered, &bdi->state),
-                            "bdi %p/%s is not registered!\n", bdi, bdi->name);
-
-                       have_dirty_io = !list_empty(&bdi->work_list) ||
-                                       wb_has_dirty_io(&bdi->wb);
-
-                       /*
-                        * If the bdi has work to do, but the thread does not
-                        * exist - create it.
-                        */
-                       if (!bdi->wb.task && have_dirty_io) {
-                               /*
-                                * Set the pending bit - if someone will try to
-                                * unregister this bdi - it'll wait on this bit.
-                                */
-                               set_bit(BDI_pending, &bdi->state);
-                               action = FORK_THREAD;
-                               break;
-                       }
-
-                       spin_lock(&bdi->wb_lock);
-
-                       /*
-                        * If there is no work to do and the bdi thread was
-                        * inactive long enough - kill it. The wb_lock is taken
-                        * to make sure no-one adds more work to this bdi and
-                        * wakes the bdi thread up.
-                        */
-                       if (bdi->wb.task && !have_dirty_io &&
-                           time_after(jiffies, bdi->wb.last_active +
-                                               bdi_longest_inactive())) {
-                               task = bdi->wb.task;
-                               bdi->wb.task = NULL;
-                               spin_unlock(&bdi->wb_lock);
-                               set_bit(BDI_pending, &bdi->state);
-                               action = KILL_THREAD;
-                               break;
-                       }
-                       spin_unlock(&bdi->wb_lock);
-               }
-               spin_unlock_bh(&bdi_lock);
-
-               /* Keep working if default bdi still has things to do */
-               if (!list_empty(&me->bdi->work_list))
-                       __set_current_state(TASK_RUNNING);
-
-               switch (action) {
-               case FORK_THREAD:
-                       __set_current_state(TASK_RUNNING);
-                       task = kthread_create(bdi_writeback_thread, &bdi->wb,
-                                             "flush-%s", dev_name(bdi->dev));
-                       if (IS_ERR(task)) {
-                               /*
-                                * If thread creation fails, force writeout of
-                                * the bdi from the thread. Hopefully 1024 is
-                                * large enough for efficient IO.
-                                */
-                               writeback_inodes_wb(&bdi->wb, 1024,
-                                                   WB_REASON_FORKER_THREAD);
-                       } else {
-                               /*
-                                * The spinlock makes sure we do not lose
-                                * wake-ups when racing with 'bdi_queue_work()'.
-                                * And as soon as the bdi thread is visible, we
-                                * can start it.
-                                */
-                               spin_lock_bh(&bdi->wb_lock);
-                               bdi->wb.task = task;
-                               spin_unlock_bh(&bdi->wb_lock);
-                               wake_up_process(task);
-                       }
-                       bdi_clear_pending(bdi);
-                       break;
-
-               case KILL_THREAD:
-                       __set_current_state(TASK_RUNNING);
-                       kthread_stop(task);
-                       bdi_clear_pending(bdi);
-                       break;
-
-               case NO_ACTION:
-                       if (!wb_has_dirty_io(me) || !dirty_writeback_interval)
-                               /*
-                                * There are no dirty data. The only thing we
-                                * should now care about is checking for
-                                * inactive bdi threads and killing them. Thus,
-                                * let's sleep for longer time, save energy and
-                                * be friendly for battery-driven devices.
-                                */
-                               schedule_timeout(bdi_longest_inactive());
-                       else
-                               schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10));
-                       try_to_freeze();
-                       break;
-               }
-       }
-
-       return 0;
+       mod_delayed_work(bdi_wq, &bdi->wb.dwork, timeout);
 }
 
 /*
@@ -489,6 +306,9 @@ static void bdi_remove_from_list(struct backing_dev_info *bdi)
        spin_unlock_bh(&bdi_lock);
 
        synchronize_rcu_expedited();
+
+       /* bdi_list is now unused, clear it to mark @bdi dying */
+       INIT_LIST_HEAD(&bdi->bdi_list);
 }
 
 int bdi_register(struct backing_dev_info *bdi, struct device *parent,
@@ -508,20 +328,6 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent,
 
        bdi->dev = dev;
 
-       /*
-        * Just start the forker thread for our default backing_dev_info,
-        * and add other bdi's to the list. They will get a thread created
-        * on-demand when they need it.
-        */
-       if (bdi_cap_flush_forker(bdi)) {
-               struct bdi_writeback *wb = &bdi->wb;
-
-               wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s",
-                                               dev_name(dev));
-               if (IS_ERR(wb->task))
-                       return PTR_ERR(wb->task);
-       }
-
        bdi_debug_register(bdi, dev_name(dev));
        set_bit(BDI_registered, &bdi->state);
 
@@ -545,8 +351,6 @@ EXPORT_SYMBOL(bdi_register_dev);
  */
 static void bdi_wb_shutdown(struct backing_dev_info *bdi)
 {
-       struct task_struct *task;
-
        if (!bdi_cap_writeback_dirty(bdi))
                return;
 
@@ -556,22 +360,20 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
        bdi_remove_from_list(bdi);
 
        /*
-        * If setup is pending, wait for that to complete first
+        * Drain work list and shutdown the delayed_work.  At this point,
+        * @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi
+        * is dying and its work_list needs to be drained no matter what.
         */
-       wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait,
-                       TASK_UNINTERRUPTIBLE);
+       mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
+       flush_delayed_work(&bdi->wb.dwork);
+       WARN_ON(!list_empty(&bdi->work_list));
 
        /*
-        * Finally, kill the kernel thread. We don't need to be RCU
-        * safe anymore, since the bdi is gone from visibility.
+        * This shouldn't be necessary unless @bdi for some reason has
+        * unflushed dirty IO after work_list is drained.  Do it anyway
+        * just in case.
         */
-       spin_lock_bh(&bdi->wb_lock);
-       task = bdi->wb.task;
-       bdi->wb.task = NULL;
-       spin_unlock_bh(&bdi->wb_lock);
-
-       if (task)
-               kthread_stop(task);
+       cancel_delayed_work_sync(&bdi->wb.dwork);
 }
 
 /*
@@ -597,10 +399,8 @@ void bdi_unregister(struct backing_dev_info *bdi)
                bdi_set_min_ratio(bdi, 0);
                trace_writeback_bdi_unregister(bdi);
                bdi_prune_sb(bdi);
-               del_timer_sync(&bdi->wb.wakeup_timer);
 
-               if (!bdi_cap_flush_forker(bdi))
-                       bdi_wb_shutdown(bdi);
+               bdi_wb_shutdown(bdi);
                bdi_debug_unregister(bdi);
 
                spin_lock_bh(&bdi->wb_lock);
@@ -622,7 +422,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
        INIT_LIST_HEAD(&wb->b_io);
        INIT_LIST_HEAD(&wb->b_more_io);
        spin_lock_init(&wb->list_lock);
-       setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
+       INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
 }
 
 /*
@@ -695,12 +495,11 @@ void bdi_destroy(struct backing_dev_info *bdi)
        bdi_unregister(bdi);
 
        /*
-        * If bdi_unregister() had already been called earlier, the
-        * wakeup_timer could still be armed because bdi_prune_sb()
-        * can race with the bdi_wakeup_thread_delayed() calls from
-        * __mark_inode_dirty().
+        * If bdi_unregister() had already been called earlier, the dwork
+        * could still be pending because bdi_prune_sb() can race with the
+        * bdi_wakeup_thread_delayed() calls from __mark_inode_dirty().
         */
-       del_timer_sync(&bdi->wb.wakeup_timer);
+       cancel_delayed_work_sync(&bdi->wb.dwork);
 
        for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
                percpu_counter_destroy(&bdi->bdi_stat[i]);
index a5c2ec3589cb94934e8654821b0876d28e1f5510..c9f0a4339a7dafc2ba7295e49ad8fcdda8fa13de 100644 (file)
@@ -101,7 +101,7 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
        struct bio_vec *tovec, *fromvec;
        int i;
 
-       __bio_for_each_segment(tovec, to, i, 0) {
+       bio_for_each_segment(tovec, to, i) {
                fromvec = from->bi_io_vec + i;
 
                /*
@@ -134,7 +134,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
        /*
         * free up bounce indirect pages used
         */
-       __bio_for_each_segment(bvec, bio, i, 0) {
+       bio_for_each_segment_all(bvec, bio, i) {
                org_vec = bio_orig->bi_io_vec + i;
                if (bvec->bv_page == org_vec->bv_page)
                        continue;
@@ -199,78 +199,43 @@ static int must_snapshot_stable_pages(struct request_queue *q, struct bio *bio)
 static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
                               mempool_t *pool, int force)
 {
-       struct page *page;
-       struct bio *bio = NULL;
-       int i, rw = bio_data_dir(*bio_orig);
+       struct bio *bio;
+       int rw = bio_data_dir(*bio_orig);
        struct bio_vec *to, *from;
+       unsigned i;
 
-       bio_for_each_segment(from, *bio_orig, i) {
-               page = from->bv_page;
+       bio_for_each_segment(from, *bio_orig, i)
+               if (page_to_pfn(from->bv_page) > queue_bounce_pfn(q))
+                       goto bounce;
 
-               /*
-                * is destination page below bounce pfn?
-                */
-               if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
-                       continue;
-
-               /*
-                * irk, bounce it
-                */
-               if (!bio) {
-                       unsigned int cnt = (*bio_orig)->bi_vcnt;
+       return;
+bounce:
+       bio = bio_clone_bioset(*bio_orig, GFP_NOIO, fs_bio_set);
 
-                       bio = bio_alloc(GFP_NOIO, cnt);
-                       memset(bio->bi_io_vec, 0, cnt * sizeof(struct bio_vec));
-               }
-                       
+       bio_for_each_segment_all(to, bio, i) {
+               struct page *page = to->bv_page;
 
-               to = bio->bi_io_vec + i;
+               if (page_to_pfn(page) <= queue_bounce_pfn(q) && !force)
+                       continue;
 
-               to->bv_page = mempool_alloc(pool, q->bounce_gfp);
-               to->bv_len = from->bv_len;
-               to->bv_offset = from->bv_offset;
                inc_zone_page_state(to->bv_page, NR_BOUNCE);
+               to->bv_page = mempool_alloc(pool, q->bounce_gfp);
 
                if (rw == WRITE) {
                        char *vto, *vfrom;
 
-                       flush_dcache_page(from->bv_page);
+                       flush_dcache_page(page);
+
                        vto = page_address(to->bv_page) + to->bv_offset;
-                       vfrom = kmap(from->bv_page) + from->bv_offset;
+                       vfrom = kmap_atomic(page) + to->bv_offset;
                        memcpy(vto, vfrom, to->bv_len);
-                       kunmap(from->bv_page);
+                       kunmap_atomic(vfrom);
                }
        }
 
-       /*
-        * no pages bounced
-        */
-       if (!bio)
-               return;
-
        trace_block_bio_bounce(q, *bio_orig);
 
-       /*
-        * at least one page was bounced, fill in possible non-highmem
-        * pages
-        */
-       __bio_for_each_segment(from, *bio_orig, i, 0) {
-               to = bio_iovec_idx(bio, i);
-               if (!to->bv_page) {
-                       to->bv_page = from->bv_page;
-                       to->bv_len = from->bv_len;
-                       to->bv_offset = from->bv_offset;
-               }
-       }
-
-       bio->bi_bdev = (*bio_orig)->bi_bdev;
        bio->bi_flags |= (1 << BIO_BOUNCED);
-       bio->bi_sector = (*bio_orig)->bi_sector;
-       bio->bi_rw = (*bio_orig)->bi_rw;
-
-       bio->bi_vcnt = (*bio_orig)->bi_vcnt;
-       bio->bi_idx = (*bio_orig)->bi_idx;
-       bio->bi_size = (*bio_orig)->bi_size;
 
        if (pool == page_pool) {
                bio->bi_end_io = bounce_end_io_write;
index 06a8842a6ec612dbda8071f94d15423326a375f4..a8a3ef45fed753b68ac1cc4a94c9260979a37879 100644 (file)
@@ -36,7 +36,6 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
                bio->bi_io_vec[0].bv_len = PAGE_SIZE;
                bio->bi_io_vec[0].bv_offset = 0;
                bio->bi_vcnt = 1;
-               bio->bi_idx = 0;
                bio->bi_size = PAGE_SIZE;
                bio->bi_end_io = end_io;
        }