]> git.proxmox.com Git - mirror_zfs.git/blobdiff - module/zfs/zio_inject.c
OpenZFS 6531 - Provide mechanism to artificially limit disk performance
[mirror_zfs.git] / module / zfs / zio_inject.c
index 1458be4772803f4c1b49fb7d1bab0e9ec8892569..61b7d25e6536c230868742e7724b5f9e5d096353 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  */
 
 /*
 
 uint32_t zio_injection_enabled = 0;
 
+/*
+ * Data describing each zinject handler registered on the system; it
+ * also contains the list node linking the handler into the global
+ * zinject handler list.
+ */
 typedef struct inject_handler {
        int                     zi_id;
        spa_t                   *zi_spa;
        zinject_record_t        zi_record;
+       uint64_t                *zi_lanes;
+       int                     zi_next_lane;
        list_node_t             zi_link;
 } inject_handler_t;
 
+/*
+ * List of all zinject handlers registered on the system, protected by
+ * the inject_lock defined below.
+ */
 static list_t inject_handlers;
+
+/*
+ * This protects insertion into, and traversal of, the inject handler
+ * list defined above; as well as the inject_delay_count. Any time a
+ * handler is inserted or removed from the list, this lock should be
+ * taken as a RW_WRITER; and any time traversal is done over the list
+ * (without modification to it) this lock should be taken as a RW_READER.
+ */
 static krwlock_t inject_lock;
+
+/*
+ * This holds the number of zinject delay handlers that have been
+ * registered on the system. It is protected by the inject_lock defined
+ * above. Thus modifications to this count must be a RW_WRITER of the
+ * inject_lock, and reads of this count must be (at least) a RW_READER
+ * of the lock.
+ */
+static int inject_delay_count = 0;
+
+/*
+ * This lock is used only in zio_handle_io_delay(), refer to the comment
+ * in that function for more details.
+ */
+static kmutex_t inject_delay_mtx;
+
+/*
+ * Used to assign unique identifying numbers to each new zinject handler.
+ */
 static int inject_next_id = 1;
 
 /*
@@ -361,21 +399,70 @@ spa_handle_ignored_writes(spa_t *spa)
        rw_exit(&inject_lock);
 }
 
-uint64_t
+hrtime_t
 zio_handle_io_delay(zio_t *zio)
 {
        vdev_t *vd = zio->io_vd;
+       inject_handler_t *min_handler = NULL;
+       hrtime_t min_target = 0;
        inject_handler_t *handler;
-       uint64_t seconds = 0;
-
-       if (zio_injection_enabled == 0)
-               return (0);
+       hrtime_t idle;
+       hrtime_t busy;
+       hrtime_t target;
 
        rw_enter(&inject_lock, RW_READER);
 
-       for (handler = list_head(&inject_handlers); handler != NULL;
-           handler = list_next(&inject_handlers, handler)) {
+       /*
+        * inject_delay_count is a subset of zio_injection_enabled that
+        * is only incremented for delay handlers. These checks are
+        * mainly added to remind the reader why we're not explicitly
+        * checking zio_injection_enabled like the other functions.
+        */
+       IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
+       IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
+
+       /*
+        * If there aren't any inject delay handlers registered, then we
+        * can short circuit and simply return 0 here. A value of zero
+        * informs zio_delay_interrupt() that this request should not be
+        * delayed. This short circuit keeps us from acquiring the
+        * inject_delay_mtx unnecessarily.
+        */
+       if (inject_delay_count == 0) {
+               rw_exit(&inject_lock);
+               return (0);
+       }
+
+       /*
+        * Each inject handler has a number of "lanes" associated with
+        * it. Each lane is able to handle requests independently of one
+        * another, and at a latency defined by the inject handler
+        * record's zi_timer field. Thus if a handler is configured with
+        * a single lane with a 10ms latency, it will delay requests
+        * such that only a single request is completed every 10ms. So,
+        * if more than one request is attempted per each 10ms interval,
+        * the average latency of the requests will be greater than
+        * 10ms; but if only a single request is submitted each 10ms
+        * interval the average latency will be 10ms.
+        *
+        * We need to acquire this mutex to prevent multiple concurrent
+        * threads being assigned to the same lane of a given inject
+        * handler. The mutex allows us to perform the following two
+        * operations atomically:
+        *
+        *      1. determine the minimum handler and minimum target
+        *         value of all the possible handlers
+        *      2. update that minimum handler's lane array
+        *
+        * Without atomicity, two (or more) threads could pick the same
+        * lane in step (1), and then conflict with each other in step
+        * (2). This could allow a single lane handler to process
+        * multiple requests simultaneously, which shouldn't be possible.
+        */
+       mutex_enter(&inject_delay_mtx);
 
+       for (handler = list_head(&inject_handlers);
+           handler != NULL; handler = list_next(&inject_handlers, handler)) {
                if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
                        continue;
 
@@ -384,14 +471,101 @@ zio_handle_io_delay(zio_t *zio)
                        continue;
                }
 
-               if (vd->vdev_guid == handler->zi_record.zi_guid) {
-                       seconds = handler->zi_record.zi_timer;
-                       break;
+               if (vd->vdev_guid != handler->zi_record.zi_guid)
+                       continue;
+
+               /*
+                * Defensive; should never happen as the array allocation
+                * occurs prior to inserting this handler on the list.
+                */
+               ASSERT3P(handler->zi_lanes, !=, NULL);
+
+               /*
+                * This should never happen; the zinject command should
+                * prevent a user from setting an IO delay with zero lanes.
+                */
+               ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
+
+               ASSERT3U(handler->zi_record.zi_nlanes, >,
+                   handler->zi_next_lane);
+
+               /*
+                * We want to issue this IO to the lane that will become
+                * idle the soonest, so we compare the soonest this
+                * specific handler can complete the IO with all other
+                * handlers, to find the lowest value of all possible
+                * lanes. We then use this lane to submit the request.
+                *
+                * Since each handler has a constant value for its
+                * delay, we can just use the "next" lane for that
+                * handler; as it will always be the lane with the
+                * lowest value for that particular handler (i.e. the
+                * lane that will become idle the soonest). This saves a
+                * scan of each handler's lanes array.
+                *
+                * There are two cases to consider when determining when
+                * this specific IO request should complete. If this
+                * lane is idle, we want to "submit" the request now so
+                * it will complete after zi_timer milliseconds. Thus,
+                * we set the target to now + zi_timer.
+                *
+                * If the lane is busy, we want this request to complete
+                * zi_timer milliseconds after the lane becomes idle.
+                * Since the 'zi_lanes' array holds the time at which
+                * each lane will become idle, we use that value to
+                * determine when this request should complete.
+                */
+               idle = handler->zi_record.zi_timer + gethrtime();
+               busy = handler->zi_record.zi_timer +
+                   handler->zi_lanes[handler->zi_next_lane];
+               target = MAX(idle, busy);
+
+               if (min_handler == NULL) {
+                       min_handler = handler;
+                       min_target = target;
+                       continue;
                }
 
+               ASSERT3P(min_handler, !=, NULL);
+               ASSERT3U(min_target, !=, 0);
+
+               /*
+                * We don't yet increment the "next lane" variable since
+                * we still might find a lower value lane in another
+                * handler during any remaining iterations. Once we're
+                * sure we've selected the absolute minimum, we'll claim
+                * the lane and increment the handler's "next lane"
+                * field below.
+                */
+
+               if (target < min_target) {
+                       min_handler = handler;
+                       min_target = target;
+               }
        }
+
+       /*
+        * 'min_handler' will be NULL if no IO delays are registered for
+        * this vdev, otherwise it will point to the handler containing
+        * the lane that will become idle the soonest.
+        */
+       if (min_handler != NULL) {
+               ASSERT3U(min_target, !=, 0);
+               min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;
+
+               /*
+                * If we've used all possible lanes for this handler,
+                * loop back and start using the first lane again;
+                * otherwise, just increment the lane index.
+                */
+               min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
+                   min_handler->zi_record.zi_nlanes;
+       }
+
+       mutex_exit(&inject_delay_mtx);
        rw_exit(&inject_lock);
-       return (seconds);
+
+       return (min_target);
 }
 
 /*
@@ -415,6 +589,24 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
                if ((error = spa_reset(name)) != 0)
                        return (error);
 
+       if (record->zi_cmd == ZINJECT_DELAY_IO) {
+               /*
+                * A value of zero for the number of lanes or for the
+                * delay time doesn't make sense.
+                */
+               if (record->zi_timer == 0 || record->zi_nlanes == 0)
+                       return (SET_ERROR(EINVAL));
+
+               /*
+                * The number of lanes is directly mapped to the size of
+                * an array used by the handler. Thus, to ensure the
+                * user doesn't trigger an allocation that's "too large"
+                * we cap the number of lanes here.
+                */
+               if (record->zi_nlanes >= UINT16_MAX)
+                       return (SET_ERROR(EINVAL));
+       }
+
        if (!(flags & ZINJECT_NULL)) {
                /*
                 * spa_inject_ref() will add an injection reference, which will
@@ -426,11 +618,34 @@ zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
 
                handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
 
+               handler->zi_spa = spa;
+               handler->zi_record = *record;
+
+               if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+                       handler->zi_lanes = kmem_zalloc(
+                           sizeof (*handler->zi_lanes) *
+                           handler->zi_record.zi_nlanes, KM_SLEEP);
+                       handler->zi_next_lane = 0;
+               } else {
+                       handler->zi_lanes = NULL;
+                       handler->zi_next_lane = 0;
+               }
+
                rw_enter(&inject_lock, RW_WRITER);
 
+               /*
+                * We can't move this increment into the conditional
+                * above because we need to hold the RW_WRITER lock of
+                * inject_lock, and we don't want to hold that while
+                * allocating the handler's zi_lanes array.
+                */
+               if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+                       ASSERT3S(inject_delay_count, >=, 0);
+                       inject_delay_count++;
+                       ASSERT3S(inject_delay_count, >, 0);
+               }
+
                *id = handler->zi_id = inject_next_id++;
-               handler->zi_spa = spa;
-               handler->zi_record = *record;
                list_insert_tail(&inject_handlers, handler);
                atomic_inc_32(&zio_injection_enabled);
 
@@ -508,9 +723,23 @@ zio_clear_fault(int id)
                return (SET_ERROR(ENOENT));
        }
 
+       if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+               ASSERT3S(inject_delay_count, >, 0);
+               inject_delay_count--;
+               ASSERT3S(inject_delay_count, >=, 0);
+       }
+
        list_remove(&inject_handlers, handler);
        rw_exit(&inject_lock);
 
+       if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+               ASSERT3P(handler->zi_lanes, !=, NULL);
+               kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
+                   handler->zi_record.zi_nlanes);
+       } else {
+               ASSERT3P(handler->zi_lanes, ==, NULL);
+       }
+
        spa_inject_delref(handler->zi_spa);
        kmem_free(handler, sizeof (inject_handler_t));
        atomic_dec_32(&zio_injection_enabled);
@@ -522,6 +751,7 @@ void
 zio_inject_init(void)
 {
        rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
+       mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
        list_create(&inject_handlers, sizeof (inject_handler_t),
            offsetof(inject_handler_t, zi_link));
 }
@@ -530,6 +760,7 @@ void
 zio_inject_fini(void)
 {
        list_destroy(&inject_handlers);
+       mutex_destroy(&inject_delay_mtx);
        rw_destroy(&inject_lock);
 }