static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *,
struct cgroup *);
-static int blkiocg_can_attach_task(struct cgroup *, struct task_struct *);
-static void blkiocg_attach_task(struct cgroup *, struct task_struct *);
+static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *,
+ struct cgroup_taskset *);
+static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *,
+ struct cgroup_taskset *);
static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *);
static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *);
struct cgroup_subsys blkio_subsys = {
.name = "blkio",
.create = blkiocg_create,
- .can_attach_task = blkiocg_can_attach_task,
- .attach_task = blkiocg_attach_task,
+ .can_attach = blkiocg_can_attach,
+ .attach = blkiocg_attach,
.destroy = blkiocg_destroy,
.populate = blkiocg_populate,
#ifdef CONFIG_BLK_CGROUP
* of the main cic data structures. For now we allow a task to change
* its cgroup only if it's the only owner of its ioc.
*/
-static int blkiocg_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static int blkiocg_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
+ struct task_struct *task;
struct io_context *ioc;
int ret = 0;
/* task_lock() is needed to avoid races with exit_io_context() */
- task_lock(tsk);
- ioc = tsk->io_context;
- if (ioc && atomic_read(&ioc->nr_tasks) > 1)
- ret = -EINVAL;
- task_unlock(tsk);
-
+ cgroup_taskset_for_each(task, cgrp, tset) {
+ task_lock(task);
+ ioc = task->io_context;
+ if (ioc && atomic_read(&ioc->nr_tasks) > 1)
+ ret = -EINVAL;
+ task_unlock(task);
+ if (ret)
+ break;
+ }
return ret;
}
-static void blkiocg_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+static void blkiocg_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup_taskset *tset)
{
+ struct task_struct *task;
struct io_context *ioc;
- /* we don't lose anything even if ioc allocation fails */
- ioc = get_task_io_context(tsk, GFP_ATOMIC, NUMA_NO_NODE);
- if (ioc) {
- ioc_cgroup_changed(ioc);
- put_io_context(ioc, NULL);
+ cgroup_taskset_for_each(task, cgrp, tset) {
- task_lock(task);
- ioc = task->io_context;
- if (ioc)
- ioc->cgroup_changed = 1;
- task_unlock(task);
++ /* we don't lose anything even if ioc allocation fails */
++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
++ if (ioc) {
++ ioc_cgroup_changed(ioc);
++ put_io_context(ioc, NULL);
++ }
}
}
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
+ DEFINE_IDA(blk_queue_ida);
+
/*
* For the allocated request tables
*/
void blk_drain_queue(struct request_queue *q, bool drain_all)
{
while (true) {
- int nr_rqs;
+ bool drain = false;
+ int i;
spin_lock_irq(q->queue_lock);
if (drain_all)
blk_throtl_drain(q);
- __blk_run_queue(q);
+ /*
+ * This function might be called on a queue which failed
+ * driver init after queue creation. Some drivers
+ * (e.g. fd) get unhappy in such cases. Kick queue iff
+ * dispatch queue has something on it.
+ */
+ if (!list_empty(&q->queue_head))
+ __blk_run_queue(q);
- if (drain_all)
- nr_rqs = q->rq.count[0] + q->rq.count[1];
- else
- nr_rqs = q->rq.elvpriv;
+ drain |= q->rq.elvpriv;
+
+ /*
+ * Unfortunately, requests are queued at and tracked from
+ * multiple places and there's no single counter which can
+ * be drained. Check all the queues and counters.
+ */
+ if (drain_all) {
+ drain |= !list_empty(&q->queue_head);
+ for (i = 0; i < 2; i++) {
+ drain |= q->rq.count[i];
+ drain |= q->in_flight[i];
+ drain |= !list_empty(&q->flush_queue[i]);
+ }
+ }
spin_unlock_irq(q->queue_lock);
- if (!nr_rqs)
+ if (!drain)
break;
msleep(10);
}
if (!q)
return NULL;
+ q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
+ if (q->id < 0)
+ goto fail_q;
+
q->backing_dev_info.ra_pages =
(VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
q->backing_dev_info.state = 0;
q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
q->backing_dev_info.name = "block";
+ q->node = node_id;
err = bdi_init(&q->backing_dev_info);
- if (err) {
- kmem_cache_free(blk_requestq_cachep, q);
- return NULL;
- }
+ if (err)
+ goto fail_id;
- if (blk_throtl_init(q)) {
- kmem_cache_free(blk_requestq_cachep, q);
- return NULL;
- }
+ if (blk_throtl_init(q))
+ goto fail_id;
setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
laptop_mode_timer_fn, (unsigned long) q);
setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
INIT_LIST_HEAD(&q->timeout_list);
+ INIT_LIST_HEAD(&q->icq_list);
INIT_LIST_HEAD(&q->flush_queue[0]);
INIT_LIST_HEAD(&q->flush_queue[1]);
INIT_LIST_HEAD(&q->flush_data_in_flight);
q->queue_lock = &q->__queue_lock;
return q;
+
+ fail_id:
+ ida_simple_remove(&blk_queue_ida, q->id);
+ fail_q:
+ kmem_cache_free(blk_requestq_cachep, q);
+ return NULL;
}
EXPORT_SYMBOL(blk_alloc_queue_node);
if (!uninit_q)
return NULL;
- q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
+ q = blk_init_allocated_queue(uninit_q, rfn, lock);
if (!q)
blk_cleanup_queue(uninit_q);
struct request_queue *
blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
spinlock_t *lock)
-{
- return blk_init_allocated_queue_node(q, rfn, lock, -1);
-}
-EXPORT_SYMBOL(blk_init_allocated_queue);
-
-struct request_queue *
-blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
- spinlock_t *lock, int node_id)
{
if (!q)
return NULL;
- q->node = node_id;
if (blk_init_free_list(q))
return NULL;
return NULL;
}
-EXPORT_SYMBOL(blk_init_allocated_queue_node);
+EXPORT_SYMBOL(blk_init_allocated_queue);
- int blk_get_queue(struct request_queue *q)
+ bool blk_get_queue(struct request_queue *q)
{
- if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
- kobject_get(&q->kobj);
- return 0;
+ if (likely(!blk_queue_dead(q))) {
+ __blk_get_queue(q);
+ return true;
}
- return 1;
+ return false;
}
EXPORT_SYMBOL(blk_get_queue);
static inline void blk_free_request(struct request_queue *q, struct request *rq)
{
- if (rq->cmd_flags & REQ_ELVPRIV)
+ if (rq->cmd_flags & REQ_ELVPRIV) {
elv_put_request(q, rq);
+ if (rq->elv.icq)
+ put_io_context(rq->elv.icq->ioc, q);
+ }
+
mempool_free(rq, q->rq.rq_pool);
}
static struct request *
- blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
+ blk_alloc_request(struct request_queue *q, struct io_cq *icq,
+ unsigned int flags, gfp_t gfp_mask)
{
struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
rq->cmd_flags = flags | REQ_ALLOCED;
- if ((flags & REQ_ELVPRIV) &&
- unlikely(elv_set_request(q, rq, gfp_mask))) {
- mempool_free(rq, q->rq.rq_pool);
- return NULL;
+ if (flags & REQ_ELVPRIV) {
+ rq->elv.icq = icq;
+ if (unlikely(elv_set_request(q, rq, gfp_mask))) {
+ mempool_free(rq, q->rq.rq_pool);
+ return NULL;
+ }
+ /* @rq->elv.icq holds on to io_context until @rq is freed */
+ if (icq)
+ get_io_context(icq->ioc);
}
return rq;
{
struct request *rq = NULL;
struct request_list *rl = &q->rq;
- struct io_context *ioc = NULL;
+ struct elevator_type *et;
+ struct io_context *ioc;
+ struct io_cq *icq = NULL;
const bool is_sync = rw_is_sync(rw_flags) != 0;
+ bool retried = false;
int may_queue;
+ retry:
+ et = q->elevator->type;
+ ioc = current->io_context;
- if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+ if (unlikely(blk_queue_dead(q)))
return NULL;
may_queue = elv_may_queue(q, rw_flags);
if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
if (rl->count[is_sync]+1 >= q->nr_requests) {
- ioc = current_io_context(GFP_ATOMIC, q->node);
+ /*
+ * We want ioc to record batching state. If it's
+ * not already there, creating a new one requires
+ * dropping queue_lock, which in turn requires
+ * retesting conditions to avoid queue hang.
+ */
+ if (!ioc && !retried) {
+ spin_unlock_irq(q->queue_lock);
+ create_io_context(current, gfp_mask, q->node);
+ spin_lock_irq(q->queue_lock);
+ retried = true;
+ goto retry;
+ }
+
/*
* The queue will fill after this allocation, so set
* it as full, and mark this process as "batching".
rl->count[is_sync]++;
rl->starved[is_sync] = 0;
+ /*
+ * Decide whether the new request will be managed by elevator. If
+ * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will
+ * prevent the current elevator from being destroyed until the new
+ * request is freed. This guarantees icq's won't be destroyed and
+ * makes creating new ones safe.
+ *
+ * Also, lookup icq while holding queue_lock. If it doesn't exist,
+ * it will be created after releasing queue_lock.
+ */
if (blk_rq_should_init_elevator(bio) &&
!test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
rw_flags |= REQ_ELVPRIV;
rl->elvpriv++;
+ if (et->icq_cache && ioc)
+ icq = ioc_lookup_icq(ioc, q);
}
if (blk_queue_io_stat(q))
rw_flags |= REQ_IO_STAT;
spin_unlock_irq(q->queue_lock);
- rq = blk_alloc_request(q, rw_flags, gfp_mask);
+ /* create icq if missing */
+ if (unlikely(et->icq_cache && !icq))
+ icq = ioc_create_icq(q, gfp_mask);
+
+ /* rqs are guaranteed to have icq on elv_set_request() if requested */
+ if (likely(!et->icq_cache || icq))
+ rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
+
if (unlikely(!rq)) {
/*
* Allocation failed presumably due to memory. Undo anything
rq = get_request(q, rw_flags, bio, GFP_NOIO);
while (!rq) {
DEFINE_WAIT(wait);
- struct io_context *ioc;
struct request_list *rl = &q->rq;
- if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
+ if (unlikely(blk_queue_dead(q)))
return NULL;
prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
* up to a big batch of them for a small period time.
* See ioc_batching, ioc_set_batching
*/
- ioc = current_io_context(GFP_NOIO, q->node);
- ioc_set_batching(q, ioc);
+ create_io_context(current, GFP_NOIO, q->node);
+ ioc_set_batching(q, current->io_context);
spin_lock_irq(q->queue_lock);
finish_wait(&rl->wait[is_sync], &wait);
__elv_add_request(q, rq, where);
}
- /**
- * blk_insert_request - insert a special request into a request queue
- * @q: request queue where request should be inserted
- * @rq: request to be inserted
- * @at_head: insert request at head or tail of queue
- * @data: private data
- *
- * Description:
- * Many block devices need to execute commands asynchronously, so they don't
- * block the whole kernel from preemption during request execution. This is
- * accomplished normally by inserting aritficial requests tagged as
- * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them
- * be scheduled for actual execution by the request queue.
- *
- * We have the option of inserting the head or the tail of the queue.
- * Typically we use the tail for new ioctls and so forth. We use the head
- * of the queue for things like a QUEUE_FULL message from a device, or a
- * host that is unable to accept a particular command.
- */
- void blk_insert_request(struct request_queue *q, struct request *rq,
- int at_head, void *data)
- {
- int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
- unsigned long flags;
-
- /*
- * tell I/O scheduler that this isn't a regular read/write (ie it
- * must not attempt merges on this) and that it acts as a soft
- * barrier
- */
- rq->cmd_type = REQ_TYPE_SPECIAL;
-
- rq->special = data;
-
- spin_lock_irqsave(q->queue_lock, flags);
-
- /*
- * If command is tagged, release the tag
- */
- if (blk_rq_tagged(rq))
- blk_queue_end_tag(q, rq);
-
- add_acct_request(q, rq, where);
- __blk_run_queue(q);
- spin_unlock_irqrestore(q->queue_lock, flags);
- }
- EXPORT_SYMBOL(blk_insert_request);
-
static void part_round_stats_single(int cpu, struct hd_struct *part,
unsigned long now)
{
return -EIO;
spin_lock_irqsave(q->queue_lock, flags);
+ if (unlikely(blk_queue_dead(q))) {
+ spin_unlock_irqrestore(q->queue_lock, flags);
+ return -ENODEV;
+ }
/*
* Submitting request must be dequeued before calling this function
{
trace_block_unplug(q, depth, !from_schedule);
+ /*
+ * Don't mess with dead queue.
+ */
+ if (unlikely(blk_queue_dead(q))) {
+ spin_unlock(q->queue_lock);
+ return;
+ }
+
/*
* If we are punting this to kblockd, then we can safely drop
* the queue_lock before waking kblockd (which needs to take
depth = 0;
spin_lock(q->queue_lock);
}
+
+ /*
+ * Short-circuit if @q is dead
+ */
+ if (unlikely(blk_queue_dead(q))) {
+ __blk_end_request_all(rq, -ENODEV);
+ continue;
+ }
+
/*
* rq is already accounted, so use raw insert
*/
struct file *file)
{
struct bsg_device *bd;
- int ret;
#ifdef BSG_DEBUG
unsigned char buf[32];
#endif
- ret = blk_get_queue(rq);
- if (ret)
+ if (!blk_get_queue(rq))
return ERR_PTR(-ENXIO);
bd = bsg_alloc_device();
static struct cdev bsg_cdev;
-static char *bsg_devnode(struct device *dev, mode_t *mode)
+static char *bsg_devnode(struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev));
}
#include <linux/slab.h>
#include <linux/kmod.h>
#include <linux/kobj_map.h>
-#include <linux/buffer_head.h>
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/log2.h>
return 0;
}
-void register_disk(struct gendisk *disk)
+static void register_disk(struct gendisk *disk)
{
struct device *ddev = disk_to_dev(disk);
struct block_device *bdev;
* Take an extra ref on queue which will be put on disk_release()
* so that it sticks around as long as @disk is there.
*/
- WARN_ON_ONCE(blk_get_queue(disk->queue));
+ WARN_ON_ONCE(!blk_get_queue(disk->queue));
retval = sysfs_create_link(&disk_to_dev(disk)->kobj, &bdi->dev->kobj,
"bdi");
.name = "block",
};
-static char *block_devnode(struct device *dev, mode_t *mode)
+static char *block_devnode(struct device *dev, umode_t *mode)
{
struct gendisk *disk = dev_to_disk(dev);
#include <linux/blkpg.h>
#include <linux/hdreg.h>
#include <linux/backing-dev.h>
-#include <linux/buffer_head.h>
+#include <linux/fs.h>
#include <linux/blktrace_api.h>
#include <asm/uaccess.h>
*/
EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
+/*
+ * Is it an unrecognized ioctl? The correct returns are either
+ * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a
+ * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl
+ * code before returning.
+ *
+ * Confused drivers sometimes return EINVAL, which is wrong. It
+ * means "I understood the ioctl command, but the parameters to
+ * it were wrong".
+ *
+ * We should aim to just fix the broken drivers, the EINVAL case
+ * should go away.
+ */
+static inline int is_unrecognized_ioctl(int ret)
+{
+ return ret == -EINVAL ||
+ ret == -ENOTTY ||
+ ret == -ENOIOCTLCMD;
+}
+
/*
* always keep this in sync with compat_blkdev_ioctl()
*/
return -EACCES;
ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
- /* -EINVAL to handle old uncorrected drivers */
- if (ret != -EINVAL && ret != -ENOTTY)
+ if (!is_unrecognized_ioctl(ret))
return ret;
fsync_bdev(bdev);
case BLKROSET:
ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
- /* -EINVAL to handle old uncorrected drivers */
- if (ret != -EINVAL && ret != -ENOTTY)
+ if (!is_unrecognized_ioctl(ret))
return ret;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
return put_uint(arg, bdev_discard_zeroes_data(bdev));
case BLKSECTGET:
return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
+ case BLKROTATIONAL:
+ return put_ushort(arg, !blk_queue_nonrot(bdev_get_queue(bdev)));
case BLKRASET:
case BLKFRASET:
if(!capable(CAP_SYS_ADMIN))
#include <linux/blkdev.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
-#include <linux/mutex.h>
-#include <linux/buffer_head.h> /* for invalidate_bdev */
+#include <linux/fs.h>
#include <linux/poll.h>
#include <linux/ctype.h>
#include <linux/string.h>
}
if (sb->devflags & WriteMostly1)
set_bit(WriteMostly, &rdev->flags);
+ if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
+ set_bit(Replacement, &rdev->flags);
} else /* MULTIPATH are always insync */
set_bit(In_sync, &rdev->flags);
sb->recovery_offset =
cpu_to_le64(rdev->recovery_offset);
}
+ if (test_bit(Replacement, &rdev->flags))
+ sb->feature_map |=
+ cpu_to_le32(MD_FEATURE_REPLACEMENT);
if (mddev->reshape_position != MaxSector) {
sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
len += sprintf(page+len, "%swrite_error", sep);
sep = ",";
}
+ if (test_bit(WantReplacement, &rdev->flags)) {
+ len += sprintf(page+len, "%swant_replacement", sep);
+ sep = ",";
+ }
+ if (test_bit(Replacement, &rdev->flags)) {
+ len += sprintf(page+len, "%sreplacement", sep);
+ sep = ",";
+ }
+
return len+sprintf(page+len, "\n");
}
} else if (cmd_match(buf, "-write_error")) {
clear_bit(WriteErrorSeen, &rdev->flags);
err = 0;
+ } else if (cmd_match(buf, "want_replacement")) {
+ /* Any non-spare device that is not a replacement can
+ * become want_replacement at any time, but we then need to
+ * check if recovery is needed.
+ */
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(Replacement, &rdev->flags))
+ set_bit(WantReplacement, &rdev->flags);
+ set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
+ md_wakeup_thread(rdev->mddev->thread);
+ err = 0;
+ } else if (cmd_match(buf, "-want_replacement")) {
+ /* Clearing 'want_replacement' is always allowed.
+ * Once replacements starts it is too late though.
+ */
+ err = 0;
+ clear_bit(WantReplacement, &rdev->flags);
+ } else if (cmd_match(buf, "replacement")) {
+ /* Can only set a device as a replacement when array has not
+ * yet been started. Once running, replacement is automatic
+ * from spares, or by assigning 'slot'.
+ */
+ if (rdev->mddev->pers)
+ err = -EBUSY;
+ else {
+ set_bit(Replacement, &rdev->flags);
+ err = 0;
+ }
+ } else if (cmd_match(buf, "-replacement")) {
+ /* Similarly, can only clear Replacement before start */
+ if (rdev->mddev->pers)
+ err = -EBUSY;
+ else {
+ clear_bit(Replacement, &rdev->flags);
+ err = 0;
+ }
}
if (!err)
sysfs_notify_dirent_safe(rdev->sysfs_state);
if (rdev->mddev->pers->hot_remove_disk == NULL)
return -EINVAL;
err = rdev->mddev->pers->
- hot_remove_disk(rdev->mddev, rdev->raid_disk);
+ hot_remove_disk(rdev->mddev, rdev);
if (err)
return err;
sysfs_unlink_rdev(rdev->mddev, rdev);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
md_wakeup_thread(rdev->mddev->thread);
} else if (rdev->mddev->pers) {
- struct md_rdev *rdev2;
/* Activating a spare .. or possibly reactivating
* if we ever get bitmaps working here.
*/
if (rdev->mddev->pers->hot_add_disk == NULL)
return -EINVAL;
- list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
- if (rdev2->raid_disk == slot)
- return -EEXIST;
-
if (slot >= rdev->mddev->raid_disks &&
slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
return -ENOSPC;
mddev->queue->queuedata = mddev;
blk_queue_make_request(mddev->queue, md_make_request);
+ blk_set_stacking_limits(&mddev->queue->limits);
disk = alloc_disk(1 << shift);
if (!disk) {
struct mddev *mddev = NULL;
int ro;
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
+ switch (cmd) {
+ case RAID_VERSION:
+ case GET_ARRAY_INFO:
+ case GET_DISK_INFO:
+ break;
+ default:
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ }
/*
* Commands dealing with the RAID driver but not any
if (test_bit(Faulty, &rdev->flags)) {
seq_printf(seq, "(F)");
continue;
- } else if (rdev->raid_disk < 0)
+ }
+ if (rdev->raid_disk < 0)
seq_printf(seq, "(S)"); /* spare */
+ if (test_bit(Replacement, &rdev->flags))
+ seq_printf(seq, "(R)");
sectors += rdev->sectors;
}
{
struct md_rdev *rdev;
int spares = 0;
+ int removed = 0;
mddev->curr_resync_completed = 0;
! test_bit(In_sync, &rdev->flags)) &&
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
- mddev, rdev->raid_disk)==0) {
+ mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1;
+ removed++;
}
}
+ if (removed)
+ sysfs_notify(&mddev->kobj, NULL,
+ "degraded");
- if (mddev->degraded) {
- list_for_each_entry(rdev, &mddev->disks, same_set) {
- if (rdev->raid_disk >= 0 &&
- !test_bit(In_sync, &rdev->flags) &&
- !test_bit(Faulty, &rdev->flags))
+
+ list_for_each_entry(rdev, &mddev->disks, same_set) {
+ if (rdev->raid_disk >= 0 &&
+ !test_bit(In_sync, &rdev->flags) &&
+ !test_bit(Faulty, &rdev->flags))
+ spares++;
+ if (rdev->raid_disk < 0
+ && !test_bit(Faulty, &rdev->flags)) {
+ rdev->recovery_offset = 0;
+ if (mddev->pers->
+ hot_add_disk(mddev, rdev) == 0) {
+ if (sysfs_link_rdev(mddev, rdev))
+ /* failure here is OK */;
spares++;
- if (rdev->raid_disk < 0
- && !test_bit(Faulty, &rdev->flags)) {
- rdev->recovery_offset = 0;
- if (mddev->pers->
- hot_add_disk(mddev, rdev) == 0) {
- if (sysfs_link_rdev(mddev, rdev))
- /* failure here is OK */;
- spares++;
- md_new_event(mddev);
- set_bit(MD_CHANGE_DEVS, &mddev->flags);
- } else
- break;
+ md_new_event(mddev);
+ set_bit(MD_CHANGE_DEVS, &mddev->flags);
}
}
}
test_bit(Faulty, &rdev->flags) &&
atomic_read(&rdev->nr_pending)==0) {
if (mddev->pers->hot_remove_disk(
- mddev, rdev->raid_disk)==0) {
+ mddev, rdev) == 0) {
sysfs_unlink_rdev(mddev, rdev);
rdev->raid_disk = -1;
}
* Three pointers are available for the IO schedulers, if they need
* more they have to dynamically allocate it. Flush requests are
* never put on the IO scheduler. So let the flush fields share
- * space with the three elevator_private pointers.
+ * space with the elevator data.
*/
union {
- void *elevator_private[3];
+ struct {
+ struct io_cq *icq;
+ void *priv[2];
+ } elv;
+
struct {
unsigned int seq;
struct list_head list;
*/
unsigned long queue_flags;
+ /*
+ * ida allocated id for this queue. Used to index queues from
+ * ioctx.
+ */
+ int id;
+
/*
* queue needs bounce pages for pages above this limit
*/
struct timer_list timeout;
struct list_head timeout_list;
+ struct list_head icq_list;
+
struct queue_limits limits;
/*
/* Throttle data */
struct throtl_data *td;
#endif
+ #ifdef CONFIG_LOCKDEP
+ int ioc_release_depth;
+ #endif
};
#define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
#define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
+ #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
#define blk_queue_noxmerges(q) \
test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
extern struct request *blk_make_request(struct request_queue *, struct bio *,
gfp_t);
- extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
extern void blk_requeue_request(struct request_queue *, struct request *);
extern void blk_add_request_payload(struct request *rq, struct page *page,
unsigned int len);
struct request *rq);
extern void blk_delay_queue(struct request_queue *, unsigned long);
extern void blk_recount_segments(struct request_queue *, struct bio *);
+extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int);
+extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t,
+ unsigned int, void __user *);
extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
unsigned int, void __user *);
extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
*/
extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
spinlock_t *lock, int node_id);
-extern struct request_queue *blk_init_allocated_queue_node(struct request_queue *,
- request_fn_proc *,
- spinlock_t *, int node_id);
extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
extern struct request_queue *blk_init_allocated_queue(struct request_queue *,
request_fn_proc *, spinlock_t *);
extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
extern void blk_set_default_limits(struct queue_limits *lim);
+ extern void blk_set_stacking_limits(struct queue_limits *lim);
extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
sector_t offset);
extern int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev,
extern void blk_dump_rq_flags(struct request *, char *);
extern long nr_blockdev_pages(void);
- int blk_get_queue(struct request_queue *);
+ bool __must_check blk_get_queue(struct request_queue *);
struct request_queue *blk_alloc_queue(gfp_t);
struct request_queue *blk_alloc_queue_node(gfp_t, int);
extern void blk_put_queue(struct request_queue *);
#else /* CONFIG_BLK_DEV_INTEGRITY */
- #define blk_integrity_rq(rq) (0)
- #define blk_rq_count_integrity_sg(a, b) (0)
- #define blk_rq_map_integrity_sg(a, b, c) (0)
- #define bdev_get_integrity(a) (0)
- #define blk_get_integrity(a) (0)
- #define blk_integrity_compare(a, b) (0)
- #define blk_integrity_register(a, b) (0)
- #define blk_integrity_unregister(a) do { } while (0)
- #define blk_queue_max_integrity_segments(a, b) do { } while (0)
- #define queue_max_integrity_segments(a) (0)
- #define blk_integrity_merge_rq(a, b, c) (0)
- #define blk_integrity_merge_bio(a, b, c) (0)
- #define blk_integrity_is_initialized(a) (0)
+ struct bio;
+ struct block_device;
+ struct gendisk;
+ struct blk_integrity;
+
+ static inline int blk_integrity_rq(struct request *rq)
+ {
+ return 0;
+ }
+ static inline int blk_rq_count_integrity_sg(struct request_queue *q,
+ struct bio *b)
+ {
+ return 0;
+ }
+ static inline int blk_rq_map_integrity_sg(struct request_queue *q,
+ struct bio *b,
+ struct scatterlist *s)
+ {
+ return 0;
+ }
+ static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
+ {
+ return 0;
+ }
+ static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
+ {
+ return NULL;
+ }
+ static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b)
+ {
+ return 0;
+ }
+ static inline int blk_integrity_register(struct gendisk *d,
+ struct blk_integrity *b)
+ {
+ return 0;
+ }
+ static inline void blk_integrity_unregister(struct gendisk *d)
+ {
+ }
+ static inline void blk_queue_max_integrity_segments(struct request_queue *q,
+ unsigned int segs)
+ {
+ }
+ static inline unsigned short queue_max_integrity_segments(struct request_queue *q)
+ {
+ return 0;
+ }
+ static inline int blk_integrity_merge_rq(struct request_queue *rq,
+ struct request *r1,
+ struct request *r2)
+ {
+ return 0;
+ }
+ static inline int blk_integrity_merge_bio(struct request_queue *rq,
+ struct request *r,
+ struct bio *b)
+ {
+ return 0;
+ }
+ static inline bool blk_integrity_is_initialized(struct gendisk *g)
+ {
+ return 0;
+ }
#endif /* CONFIG_BLK_DEV_INTEGRITY */
#define BLKPBSZGET _IO(0x12,123)
#define BLKDISCARDZEROES _IO(0x12,124)
#define BLKSECDISCARD _IO(0x12,125)
+ #define BLKROTATIONAL _IO(0x12,126)
#define BMAP_IOCTL 1 /* obsolete - kept for compatibility */
#define FIBMAP _IO(0x00,1) /* bmap access */
struct page;
struct address_space;
struct writeback_control;
+enum migrate_mode;
struct iov_iter {
const struct iovec *iov;
loff_t offset, unsigned long nr_segs);
int (*get_xip_mem)(struct address_space *, pgoff_t, int,
void **, unsigned long *);
- /* migrate the contents of a page to the specified target */
+ /*
+ * migrate the contents of a page to the specified target. If sync
+ * is false, it must not block.
+ */
int (*migratepage) (struct address_space *,
- struct page *, struct page *);
+ struct page *, struct page *, enum migrate_mode);
int (*launder_page) (struct page *);
int (*is_partially_uptodate) (struct page *, read_descriptor_t *,
unsigned long);
* must be enforced here for CRIS, to let the least significant bit
* of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON.
*/
+struct request_queue;
struct block_device {
dev_t bd_dev; /* not a kdev_t - it's a search key */
unsigned bd_part_count;
int bd_invalidated;
struct gendisk * bd_disk;
+ struct request_queue * bd_queue;
struct list_head bd_list;
/*
* Private data. You must have bd_claim'ed the block_device
#ifdef CONFIG_EPOLL
/* Used by fs/eventpoll.c to link all the hooks to this file */
struct list_head f_ep_links;
+ struct list_head f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
struct address_space *f_mapping;
#ifdef CONFIG_DEBUG_WRITECOUNT
#else
struct list_head s_files;
#endif
+ struct list_head s_mounts; /* list of mounts; _not_ for fs use */
/* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */
struct list_head s_dentry_lru; /* unused dentry lru */
int s_nr_dentry_unused; /* # of dentry on lru */
struct block_device *s_bdev;
struct backing_dev_info *s_bdi;
struct mtd_info *s_mtd;
- struct list_head s_instances;
+ struct hlist_node s_instances;
struct quota_info s_dquot; /* Diskquota specific options */
int s_frozen;
int cleancache_poolid;
struct shrinker s_shrink; /* per-sb shrinker handle */
+
+ /* Number of inodes with nlink == 0 but still referenced */
+ atomic_long_t s_remove_count;
+
+ /* Being remounted read-only */
+ int s_readonly_remount;
};
/* superblock cache pruning functions */
/*
* VFS helper functions..
*/
-extern int vfs_create(struct inode *, struct dentry *, int, struct nameidata *);
-extern int vfs_mkdir(struct inode *, struct dentry *, int);
-extern int vfs_mknod(struct inode *, struct dentry *, int, dev_t);
+extern int vfs_create(struct inode *, struct dentry *, umode_t, struct nameidata *);
+extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
+extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
extern int vfs_symlink(struct inode *, struct dentry *, const char *);
extern int vfs_link(struct dentry *, struct inode *, struct dentry *);
extern int vfs_rmdir(struct inode *, struct dentry *);
* VFS file helper functions.
*/
extern void inode_init_owner(struct inode *inode, const struct inode *dir,
- mode_t mode);
+ umode_t mode);
/*
* VFS FS_IOC_FIEMAP helper definitions.
*/
int (*readlink) (struct dentry *, char __user *,int);
void (*put_link) (struct dentry *, struct nameidata *, void *);
- int (*create) (struct inode *,struct dentry *,int, struct nameidata *);
+ int (*create) (struct inode *,struct dentry *,umode_t,struct nameidata *);
int (*link) (struct dentry *,struct inode *,struct dentry *);
int (*unlink) (struct inode *,struct dentry *);
int (*symlink) (struct inode *,struct dentry *,const char *);
- int (*mkdir) (struct inode *,struct dentry *,int);
+ int (*mkdir) (struct inode *,struct dentry *,umode_t);
int (*rmdir) (struct inode *,struct dentry *);
- int (*mknod) (struct inode *,struct dentry *,int,dev_t);
+ int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
int (*rename) (struct inode *, struct dentry *,
struct inode *, struct dentry *);
void (*truncate) (struct inode *);
int (*remount_fs) (struct super_block *, int *, char *);
void (*umount_begin) (struct super_block *);
- int (*show_options)(struct seq_file *, struct vfsmount *);
- int (*show_devname)(struct seq_file *, struct vfsmount *);
- int (*show_path)(struct seq_file *, struct vfsmount *);
- int (*show_stats)(struct seq_file *, struct vfsmount *);
+ int (*show_options)(struct seq_file *, struct dentry *);
+ int (*show_devname)(struct seq_file *, struct dentry *);
+ int (*show_path)(struct seq_file *, struct dentry *);
+ int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
__mark_inode_dirty(inode, I_DIRTY_SYNC);
}
-/**
- * set_nlink - directly set an inode's link count
- * @inode: inode
- * @nlink: new nlink (should be non-zero)
- *
- * This is a low-level filesystem helper to replace any
- * direct filesystem manipulation of i_nlink.
- */
-static inline void set_nlink(struct inode *inode, unsigned int nlink)
-{
- inode->__i_nlink = nlink;
-}
-
-/**
- * inc_nlink - directly increment an inode's link count
- * @inode: inode
- *
- * This is a low-level filesystem helper to replace any
- * direct filesystem manipulation of i_nlink. Currently,
- * it is only here for parity with dec_nlink().
- */
-static inline void inc_nlink(struct inode *inode)
-{
- inode->__i_nlink++;
-}
+extern void inc_nlink(struct inode *inode);
+extern void drop_nlink(struct inode *inode);
+extern void clear_nlink(struct inode *inode);
+extern void set_nlink(struct inode *inode, unsigned int nlink);
static inline void inode_inc_link_count(struct inode *inode)
{
mark_inode_dirty(inode);
}
-/**
- * drop_nlink - directly drop an inode's link count
- * @inode: inode
- *
- * This is a low-level filesystem helper to replace any
- * direct filesystem manipulation of i_nlink. In cases
- * where we are attempting to track writes to the
- * filesystem, a decrement to zero means an imminent
- * write when the file is truncated and actually unlinked
- * on the filesystem.
- */
-static inline void drop_nlink(struct inode *inode)
-{
- inode->__i_nlink--;
-}
-
-/**
- * clear_nlink - directly zero an inode's link count
- * @inode: inode
- *
- * This is a low-level filesystem helper to replace any
- * direct filesystem manipulation of i_nlink. See
- * drop_nlink() for why we care about i_nlink hitting zero.
- */
-static inline void clear_nlink(struct inode *inode)
-{
- inode->__i_nlink = 0;
-}
-
static inline void inode_dec_link_count(struct inode *inode)
{
drop_nlink(inode);
void (*kill_sb) (struct super_block *);
struct module *owner;
struct file_system_type * next;
- struct list_head fs_supers;
+ struct hlist_head fs_supers;
struct lock_class_key s_lock_key;
struct lock_class_key s_umount_key;
extern int vfs_statfs(struct path *, struct kstatfs *);
extern int user_statfs(const char __user *, struct kstatfs *);
extern int fd_statfs(int, struct kstatfs *);
-extern int statfs_by_dentry(struct dentry *, struct kstatfs *);
+extern int vfs_ustat(dev_t, struct kstatfs *);
extern int freeze_super(struct super_block *super);
extern int thaw_super(struct super_block *super);
extern bool our_mnt(struct vfsmount *mnt);
extern int do_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern long do_sys_open(int dfd, const char __user *filename, int flags,
- int mode);
-extern struct file *filp_open(const char *, int, int);
+ umode_t mode);
+extern struct file *filp_open(const char *, int, umode_t);
extern struct file *file_open_root(struct dentry *, struct vfsmount *,
const char *, int);
extern struct file * dentry_open(struct dentry *, struct vfsmount *, int,
extern void bdput(struct block_device *);
extern void invalidate_bdev(struct block_device *);
extern int sync_blockdev(struct block_device *bdev);
+extern void kill_bdev(struct block_device *);
extern struct super_block *freeze_bdev(struct block_device *);
extern void emergency_thaw_all(void);
extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
#else
static inline void bd_forget(struct inode *inode) {}
static inline int sync_blockdev(struct block_device *bdev) { return 0; }
+static inline void kill_bdev(struct block_device *bdev) {}
static inline void invalidate_bdev(struct block_device *bdev) {}
static inline struct super_block *freeze_bdev(struct block_device *sb)
extern const struct file_operations write_pipefifo_fops;
extern const struct file_operations rdwr_pipefifo_fops;
-extern int fs_may_remount_ro(struct super_block *);
-
#ifdef CONFIG_BLOCK
/*
* return READ, READA, or WRITE
unsigned long nr_segs, loff_t pos);
extern int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
int datasync);
+extern void block_sync_page(struct page *page);
/* fs/splice.c */
extern ssize_t generic_file_splice_read(struct file *, loff_t *,
extern struct file_system_type *get_fs_type(const char *name);
extern struct super_block *get_super(struct block_device *);
extern struct super_block *get_active_super(struct block_device *bdev);
-extern struct super_block *user_get_super(dev_t);
extern void drop_super(struct super_block *sb);
extern void iterate_supers(void (*)(struct super_block *, void *), void *);
extern void iterate_supers_type(struct file_system_type *,
#ifdef CONFIG_MIGRATION
extern int buffer_migrate_page(struct address_space *,
- struct page *, struct page *);
+ struct page *, struct page *,
+ enum migrate_mode);
#else
#define buffer_migrate_page NULL
#endif
extern void file_update_time(struct file *file);
-extern int generic_show_options(struct seq_file *m, struct vfsmount *mnt);
+extern int generic_show_options(struct seq_file *m, struct dentry *root);
extern void save_mount_options(struct super_block *sb, char *options);
extern void replace_mount_options(struct super_block *sb, char *options);
#define OPEN_FMODE(flag) ((__force fmode_t)(((flag + 1) & O_ACCMODE) | \
(flag & __FMODE_NONOTIFY)))
-static inline int is_sxid(mode_t mode)
+static inline int is_sxid(umode_t mode)
{
return (mode & S_ISUID) || ((mode & S_ISGID) && (mode & S_IXGRP));
}
#include <trace/events/sched.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/task.h>
+
/*
* Protected counters by write_lock_irq(&tasklist_lock)
*/
{
#ifdef CONFIG_BLOCK
struct io_context *ioc = current->io_context;
+ struct io_context *new_ioc;
if (!ioc)
return 0;
if (unlikely(!tsk->io_context))
return -ENOMEM;
} else if (ioprio_valid(ioc->ioprio)) {
- tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
- if (unlikely(!tsk->io_context))
+ new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
+ if (unlikely(!new_ioc))
return -ENOMEM;
- tsk->io_context->ioprio = ioc->ioprio;
+ new_ioc->ioprio = ioc->ioprio;
+ put_io_context(new_ioc, NULL);
}
#endif
return 0;
sched_autogroup_fork(sig);
#ifdef CONFIG_CGROUPS
- init_rwsem(&sig->threadgroup_fork_lock);
+ init_rwsem(&sig->group_rwsem);
#endif
sig->oom_adj = current->signal->oom_adj;
new_flags |= PF_FORKNOEXEC;
new_flags |= PF_STARTING;
p->flags = new_flags;
- clear_freeze_flag(p);
}
SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
*/
static void posix_cpu_timers_init(struct task_struct *tsk)
{
- tsk->cputime_expires.prof_exp = cputime_zero;
- tsk->cputime_expires.virt_exp = cputime_zero;
+ tsk->cputime_expires.prof_exp = 0;
+ tsk->cputime_expires.virt_exp = 0;
tsk->cputime_expires.sched_exp = 0;
INIT_LIST_HEAD(&tsk->cpu_timers[0]);
INIT_LIST_HEAD(&tsk->cpu_timers[1]);
init_sigpending(&p->pending);
- p->utime = cputime_zero;
- p->stime = cputime_zero;
- p->gtime = cputime_zero;
- p->utimescaled = cputime_zero;
- p->stimescaled = cputime_zero;
+ p->utime = p->stime = p->gtime = 0;
+ p->utimescaled = p->stimescaled = 0;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
- p->prev_utime = cputime_zero;
- p->prev_stime = cputime_zero;
+ p->prev_utime = p->prev_stime = 0;
#endif
#if defined(SPLIT_RSS_COUNTING)
memset(&p->rss_stat, 0, sizeof(p->rss_stat));
p->io_context = NULL;
p->audit_context = NULL;
if (clone_flags & CLONE_THREAD)
- threadgroup_fork_read_lock(current);
+ threadgroup_change_begin(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
p->nr_dirtied = 0;
p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
+ p->dirty_paused_when = 0;
/*
* Ok, make it visible to the rest of the system.
proc_fork_connector(p);
cgroup_post_fork(p);
if (clone_flags & CLONE_THREAD)
- threadgroup_fork_read_unlock(current);
+ threadgroup_change_end(current);
perf_event_fork(p);
+
+ trace_task_newtask(p, clone_flags);
+
return p;
bad_fork_free_pid:
bad_fork_cleanup_cgroup:
#endif
if (clone_flags & CLONE_THREAD)
- threadgroup_fork_read_unlock(current);
+ threadgroup_change_end(current);
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);