git.proxmox.com Git - mirror_zfs.git/commitdiff
Illumos #734: Use taskq_dispatch_ent() interface
authorGarrett D'Amore <garrett@nexenta.com>
Tue, 8 Nov 2011 00:26:52 +0000 (16:26 -0800)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Wed, 14 Dec 2011 17:19:30 +0000 (09:19 -0800)
It has been observed that some of the hottest locks are those
of the zio taskqs.  Contention on these locks can limit the
rate at which zios are dispatched which limits performance.

This upstream change from Illumos uses a new interface to the
taskqs which allows them to utilize a prealloc'ed taskq_ent_t.
This removes the need to perform an allocation at dispatch
time while holding the contended lock.  This has the effect
of improving system performance.

Reviewed by: Albert Lee <trisk@nexenta.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Alexey Zaytsev <alexey.zaytsev@nexenta.com>
Reviewed by: Jason Brian King <jason.brian.king@gmail.com>
Reviewed by: George Wilson <gwilson@zfsmail.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Approved by: Gordon Ross <gwr@nexenta.com>

References to Illumos issue:
  https://www.illumos.org/issues/734

Ported-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #482

include/sys/zfs_context.h
include/sys/zio.h
lib/libzpool/taskq.c
module/zfs/spa.c
module/zfs/zio.c

index a32848941557b40dd0b1b46cdd6bf1ac6286bb57..4abafcc6f1b87e318f97006b26c9e75b2e036bec 100644 (file)
@@ -22,6 +22,9 @@
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
 
 #ifndef _SYS_ZFS_CONTEXT_H
 #define        _SYS_ZFS_CONTEXT_H
@@ -365,6 +368,16 @@ typedef struct taskq taskq_t;
 typedef uintptr_t taskqid_t;
 typedef void (task_func_t)(void *);
 
+typedef struct taskq_ent {
+       struct taskq_ent        *tqent_next;
+       struct taskq_ent        *tqent_prev;
+       task_func_t             *tqent_func;
+       void                    *tqent_arg;
+       uintptr_t               tqent_flags;
+} taskq_ent_t;
+
+#define        TQENT_FLAG_PREALLOC     0x1     /* taskq_dispatch_ent used */
+
 #define        TASKQ_PREPOPULATE       0x0001
 #define        TASKQ_CPR_SAFE          0x0002  /* Use CPR safe protocol */
 #define        TASKQ_DYNAMIC           0x0004  /* Use dynamic thread scheduling */
@@ -385,6 +398,10 @@ extern taskq_t     *taskq_create(const char *, int, pri_t, int, int, uint_t);
 #define        taskq_create_sysdc(a, b, d, e, p, dc, f) \
            (taskq_create(a, b, maxclsyspri, d, e, f))
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
+extern void    taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
+    taskq_ent_t *);
+extern int     taskq_empty_ent(taskq_ent_t *);
+extern void    taskq_init_ent(taskq_ent_t *);
 extern void    taskq_destroy(taskq_t *);
 extern void    taskq_wait(taskq_t *);
 extern int     taskq_member(taskq_t *, kthread_t *);
index a469181745b9dbd7cc21b9c36d710a7179778a6d..c0da4e2d78cdbe271728640191b0d42303fd462d 100644 (file)
@@ -22,6 +22,9 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
 
 #ifndef _ZIO_H
 #define        _ZIO_H
@@ -423,6 +426,9 @@ struct zio {
        /* FMA state */
        zio_cksum_report_t *io_cksum_report;
        uint64_t        io_ena;
+
+       /* Taskq dispatching state */
+       taskq_ent_t     io_tqent;
 };
 
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
index 36c0ec7dfc78e8401659a984bde5fb77e2dfe1b7..6143a9189951fe7a1e38aca5fac8b74fe96ca79c 100644 (file)
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
 
 #include <sys/zfs_context.h>
 
 int taskq_now;
 taskq_t *system_taskq;
 
-typedef struct task {
-       struct task     *task_next;
-       struct task     *task_prev;
-       task_func_t     *task_func;
-       void            *task_arg;
-} task_t;
-
 #define        TASKQ_ACTIVE    0x00010000
 
 struct taskq {
@@ -51,18 +47,19 @@ struct taskq {
        int             tq_maxalloc;
        kcondvar_t      tq_maxalloc_cv;
        int             tq_maxalloc_wait;
-       task_t          *tq_freelist;
-       task_t          tq_task;
+       taskq_ent_t     *tq_freelist;
+       taskq_ent_t     tq_task;
 };
 
-static task_t *
+static taskq_ent_t *
 task_alloc(taskq_t *tq, int tqflags)
 {
-       task_t *t;
+       taskq_ent_t *t;
        int rv;
 
 again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
-               tq->tq_freelist = t->task_next;
+               ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+               tq->tq_freelist = t->tqent_next;
        } else {
                if (tq->tq_nalloc >= tq->tq_maxalloc) {
                        if (!(tqflags & KM_SLEEP))
@@ -87,25 +84,28 @@ again:      if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
                }
                mutex_exit(&tq->tq_lock);
 
-               t = kmem_alloc(sizeof (task_t), tqflags);
+               t = kmem_alloc(sizeof (taskq_ent_t), tqflags);
 
                mutex_enter(&tq->tq_lock);
-               if (t != NULL)
+               if (t != NULL) {
+                       /* Make sure we start without any flags */
+                       t->tqent_flags = 0;
                        tq->tq_nalloc++;
+               }
        }
        return (t);
 }
 
 static void
-task_free(taskq_t *tq, task_t *t)
+task_free(taskq_t *tq, taskq_ent_t *t)
 {
        if (tq->tq_nalloc <= tq->tq_minalloc) {
-               t->task_next = tq->tq_freelist;
+               t->tqent_next = tq->tq_freelist;
                tq->tq_freelist = t;
        } else {
                tq->tq_nalloc--;
                mutex_exit(&tq->tq_lock);
-               kmem_free(t, sizeof (task_t));
+               kmem_free(t, sizeof (taskq_ent_t));
                mutex_enter(&tq->tq_lock);
        }
 
@@ -116,7 +116,7 @@ task_free(taskq_t *tq, task_t *t)
 taskqid_t
 taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
 {
-       task_t *t;
+       taskq_ent_t *t;
 
        if (taskq_now) {
                func(arg);
@@ -130,26 +130,77 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
                return (0);
        }
        if (tqflags & TQ_FRONT) {
-               t->task_next = tq->tq_task.task_next;
-               t->task_prev = &tq->tq_task;
+               t->tqent_next = tq->tq_task.tqent_next;
+               t->tqent_prev = &tq->tq_task;
        } else {
-               t->task_next = &tq->tq_task;
-               t->task_prev = tq->tq_task.task_prev;
+               t->tqent_next = &tq->tq_task;
+               t->tqent_prev = tq->tq_task.tqent_prev;
        }
-       t->task_next->task_prev = t;
-       t->task_prev->task_next = t;
-       t->task_func = func;
-       t->task_arg = arg;
+       t->tqent_next->tqent_prev = t;
+       t->tqent_prev->tqent_next = t;
+       t->tqent_func = func;
+       t->tqent_arg = arg;
+
+       ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
        cv_signal(&tq->tq_dispatch_cv);
        mutex_exit(&tq->tq_lock);
        return (1);
 }
 
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+       return t->tqent_next == NULL;
+}
+
+void
+taskq_init_ent(taskq_ent_t *t)
+{
+       t->tqent_next = NULL;
+       t->tqent_prev = NULL;
+       t->tqent_func = NULL;
+       t->tqent_arg = NULL;
+       t->tqent_flags = 0;
+}
+
+void
+taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+    taskq_ent_t *t)
+{
+       ASSERT(func != NULL);
+       ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC));
+
+       /*
+        * Mark it as a prealloc'd task.  This is important
+        * to ensure that we don't free it later.
+        */
+       t->tqent_flags |= TQENT_FLAG_PREALLOC;
+       /*
+        * Enqueue the task to the underlying queue.
+        */
+       mutex_enter(&tq->tq_lock);
+
+       if (flags & TQ_FRONT) {
+               t->tqent_next = tq->tq_task.tqent_next;
+               t->tqent_prev = &tq->tq_task;
+       } else {
+               t->tqent_next = &tq->tq_task;
+               t->tqent_prev = tq->tq_task.tqent_prev;
+       }
+       t->tqent_next->tqent_prev = t;
+       t->tqent_prev->tqent_next = t;
+       t->tqent_func = func;
+       t->tqent_arg = arg;
+       cv_signal(&tq->tq_dispatch_cv);
+       mutex_exit(&tq->tq_lock);
+}
+
 void
 taskq_wait(taskq_t *tq)
 {
        mutex_enter(&tq->tq_lock);
-       while (tq->tq_task.task_next != &tq->tq_task || tq->tq_active != 0)
+       while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0)
                cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
        mutex_exit(&tq->tq_lock);
 }
@@ -158,27 +209,32 @@ static void
 taskq_thread(void *arg)
 {
        taskq_t *tq = arg;
-       task_t *t;
+       taskq_ent_t *t;
+       boolean_t prealloc;
 
        mutex_enter(&tq->tq_lock);
        while (tq->tq_flags & TASKQ_ACTIVE) {
-               if ((t = tq->tq_task.task_next) == &tq->tq_task) {
+               if ((t = tq->tq_task.tqent_next) == &tq->tq_task) {
                        if (--tq->tq_active == 0)
                                cv_broadcast(&tq->tq_wait_cv);
                        cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
                        tq->tq_active++;
                        continue;
                }
-               t->task_prev->task_next = t->task_next;
-               t->task_next->task_prev = t->task_prev;
+               t->tqent_prev->tqent_next = t->tqent_next;
+               t->tqent_next->tqent_prev = t->tqent_prev;
+               t->tqent_next = NULL;
+               t->tqent_prev = NULL;
+               prealloc = t->tqent_flags & TQENT_FLAG_PREALLOC;
                mutex_exit(&tq->tq_lock);
 
                rw_enter(&tq->tq_threadlock, RW_READER);
-               t->task_func(t->task_arg);
+               t->tqent_func(t->tqent_arg);
                rw_exit(&tq->tq_threadlock);
 
                mutex_enter(&tq->tq_lock);
-               task_free(tq, t);
+               if (!prealloc)
+                       task_free(tq, t);
        }
        tq->tq_nthreads--;
        cv_broadcast(&tq->tq_wait_cv);
@@ -217,8 +273,8 @@ taskq_create(const char *name, int nthreads, pri_t pri,
        tq->tq_nthreads = nthreads;
        tq->tq_minalloc = minalloc;
        tq->tq_maxalloc = maxalloc;
-       tq->tq_task.task_next = &tq->tq_task;
-       tq->tq_task.task_prev = &tq->tq_task;
+       tq->tq_task.tqent_next = &tq->tq_task;
+       tq->tq_task.tqent_prev = &tq->tq_task;
        tq->tq_threadlist = kmem_alloc(nthreads*sizeof(kthread_t *), KM_SLEEP);
 
        if (flags & TASKQ_PREPOPULATE) {
index 868a0d9d270407f8091d736bdc6f7ee4abf57cb1..0b9649723c904cb3506cd0d9d901cf8adebe6feb 100644 (file)
@@ -22,6 +22,9 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  */
+/*
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ */
 
 /*
  * This file contains all the routines used when modifying on-disk SPA state.
@@ -665,7 +668,7 @@ spa_create_zio_taskqs(spa_t *spa)
                        const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
                        enum zti_modes mode = ztip->zti_mode;
                        uint_t value = ztip->zti_value;
-                       uint_t flags = TASKQ_PREPOPULATE;
+                       uint_t flags = 0;
                        char name[32];
 
                        if (t == ZIO_TYPE_WRITE)
index 6b03be6f3bbf62e8d7c442e145e8bd5fd8ca90cb..c96442d0bd715af7b781ce6ca70ebdf80753a013 100644 (file)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 by Delphix. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -570,6 +571,8 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
                zio_add_child(pio, zio);
        }
 
+       taskq_init_ent(&zio->io_tqent);
+
        return (zio);
 }
 
@@ -1073,7 +1076,7 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
 {
        spa_t *spa = zio->io_spa;
        zio_type_t t = zio->io_type;
-       int flags = TQ_NOSLEEP | (cutinline ? TQ_FRONT : 0);
+       int flags = (cutinline ? TQ_FRONT : 0);
 
        /*
         * If we're a config writer or a probe, the normal issue and
@@ -1098,8 +1101,14 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
 
        ASSERT3U(q, <, ZIO_TASKQ_TYPES);
 
-       while (taskq_dispatch(spa->spa_zio_taskq[t][q],
-           (task_func_t *)zio_execute, zio, flags) == 0); /* do nothing */
+       /*
+        * NB: We are assuming that the zio can only be dispatched
+        * to a single taskq at a time.  It would be a grievous error
+        * to dispatch the zio to another taskq at the same time.
+        */
+       ASSERT(taskq_empty_ent(&zio->io_tqent));
+       taskq_dispatch_ent(spa->spa_zio_taskq[t][q],
+           (task_func_t *)zio_execute, zio, flags, &zio->io_tqent);
 }
 
 static boolean_t
@@ -2947,9 +2956,11 @@ zio_done(zio_t *zio)
                         * Reexecution is potentially a huge amount of work.
                         * Hand it off to the otherwise-unused claim taskq.
                         */
-                       (void) taskq_dispatch(
+                       ASSERT(taskq_empty_ent(&zio->io_tqent));
+                       (void) taskq_dispatch_ent(
                            zio->io_spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
-                           (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
+                           (task_func_t *)zio_reexecute, zio, 0,
+                           &zio->io_tqent);
                }
                return (ZIO_PIPELINE_STOP);
        }