]> git.proxmox.com Git - mirror_spl.git/blobdiff - module/spl/spl-taskq.c
Remove TQ_SLEEP -> KM_SLEEP mapping
[mirror_spl.git] / module / spl / spl-taskq.c
index 201cb59490104ca3e4d73bf63b7a65a9ddde420e..7ea20461b1aae21e81bd67cc3905422d8a494aa5 100644 (file)
 taskq_t *system_taskq;
 EXPORT_SYMBOL(system_taskq);
 
-typedef struct spl_task {
-        spinlock_t              t_lock;
-        struct list_head        t_list;
-        taskqid_t               t_id;
-        task_func_t             *t_func;
-        void                    *t_arg;
-} spl_task_t;
+static int
+task_km_flags(uint_t flags)
+{
+       if (flags & TQ_NOSLEEP)
+               return KM_NOSLEEP;
+
+       if (flags & TQ_PUSHPAGE)
+               return KM_PUSHPAGE;
+
+       return KM_SLEEP;
+}
 
 /*
  * NOTE: Must be called with tq->tq_lock held, returns a list_t which
  * is not attached to the free, work, or pending taskq lists.
  */
-static spl_task_t *
+static taskq_ent_t *
 task_alloc(taskq_t *tq, uint_t flags)
 {
-        spl_task_t *t;
+        taskq_ent_t *t;
         int count = 0;
         SENTRY;
 
         ASSERT(tq);
-        ASSERT(flags & (TQ_SLEEP | TQ_NOSLEEP));               /* One set */
-        ASSERT(!((flags & TQ_SLEEP) && (flags & TQ_NOSLEEP))); /* Not both */
         ASSERT(spin_is_locked(&tq->tq_lock));
 retry:
-        /* Acquire spl_task_t's from free list if available */
+        /* Acquire taskq_ent_t's from free list if available */
         if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) {
-                t = list_entry(tq->tq_free_list.next, spl_task_t, t_list);
-                list_del_init(&t->t_list);
+                t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+                ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+                list_del_init(&t->tqent_list);
                 SRETURN(t);
         }
 
@@ -73,59 +78,56 @@ retry:
         if (flags & TQ_NOALLOC)
                 SRETURN(NULL);
 
-        /* Hit maximum spl_task_t pool size */
+        /* Hit maximum taskq_ent_t pool size */
         if (tq->tq_nalloc >= tq->tq_maxalloc) {
                 if (flags & TQ_NOSLEEP)
                         SRETURN(NULL);
 
-                /* Sleep periodically polling the free list for an available
-                 * spl_task_t.  If a full second passes and we have not found
-                 * one gives up and return a NULL to the caller. */
-                if (flags & TQ_SLEEP) {
-                        spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
-                        schedule_timeout(HZ / 100);
-                        spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
-                        if (count < 100)
-                                SGOTO(retry, count++);
-
-                        SRETURN(NULL);
-                }
-
-                /* Unreachable, Neither TQ_SLEEP or TQ_NOSLEEP set */
-                PANIC("Neither TQ_SLEEP or TQ_NOSLEEP set");
+                /*
+                 * Sleep periodically polling the free list for an available
+                 * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed
+                 * but we cannot block forever waiting for an taskq_entq_t to
+                 * show up in the free list, otherwise a deadlock can happen.
+                 *
+                 * Therefore, we need to allocate a new task even if the number
+                 * of allocated tasks is above tq->tq_maxalloc, but we still
+                 * end up delaying the task allocation by one second, thereby
+                 * throttling the task dispatch rate.
+                 */
+                 spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+                 schedule_timeout(HZ / 100);
+                 spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+                 if (count < 100)
+                        SGOTO(retry, count++);
         }
 
-       spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
-        t = kmem_alloc(sizeof(spl_task_t), flags & (TQ_SLEEP | TQ_NOSLEEP));
+        spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+        t = kmem_alloc(sizeof(taskq_ent_t), task_km_flags(flags));
         spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
 
-       if (t) {
-               spin_lock_init(&t->t_lock);
-                INIT_LIST_HEAD(&t->t_list);
-               t->t_id = 0;
-               t->t_func = NULL;
-               t->t_arg = NULL;
-               tq->tq_nalloc++;
-       }
+        if (t) {
+                taskq_init_ent(t);
+                tq->tq_nalloc++;
+        }
 
         SRETURN(t);
 }
 
 /*
- * NOTE: Must be called with tq->tq_lock held, expects the spl_task_t
+ * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t
  * to already be removed from the free, work, or pending taskq lists.
  */
 static void
-task_free(taskq_t *tq, spl_task_t *t)
+task_free(taskq_t *tq, taskq_ent_t *t)
 {
         SENTRY;
 
         ASSERT(tq);
         ASSERT(t);
        ASSERT(spin_is_locked(&tq->tq_lock));
-       ASSERT(list_empty(&t->t_list));
+       ASSERT(list_empty(&t->tqent_list));
 
-        kmem_free(t, sizeof(spl_task_t));
+        kmem_free(t, sizeof(taskq_ent_t));
         tq->tq_nalloc--;
 
        SEXIT;
@@ -133,23 +135,25 @@ task_free(taskq_t *tq, spl_task_t *t)
 
 /*
  * NOTE: Must be called with tq->tq_lock held, either destroys the
- * spl_task_t if too many exist or moves it to the free list for later use.
+ * taskq_ent_t if too many exist or moves it to the free list for later use.
  */
 static void
-task_done(taskq_t *tq, spl_task_t *t)
+task_done(taskq_t *tq, taskq_ent_t *t)
 {
        SENTRY;
        ASSERT(tq);
        ASSERT(t);
        ASSERT(spin_is_locked(&tq->tq_lock));
 
-       list_del_init(&t->t_list);
+       list_del_init(&t->tqent_list);
 
         if (tq->tq_nalloc <= tq->tq_minalloc) {
-               t->t_id = 0;
-               t->t_func = NULL;
-               t->t_arg = NULL;
-                list_add_tail(&t->t_list, &tq->tq_free_list);
+               t->tqent_id = 0;
+               t->tqent_func = NULL;
+               t->tqent_arg = NULL;
+               t->tqent_flags = 0;
+
+                list_add_tail(&t->tqent_list, &tq->tq_free_list);
        } else {
                task_free(tq, t);
        }
@@ -162,19 +166,22 @@ task_done(taskq_t *tq, spl_task_t *t)
  * monotonically increasing taskqid and added to the tail of the pending
  * list.  As worker threads become available the tasks are removed from
  * the head of the pending or priority list, giving preference to the
- * priority list.  The tasks are then added to the work list, preserving
- * the ordering by taskqid.  Finally, as tasks complete they are removed
- * from the work list.  This means that the pending and work lists are
- * always kept sorted by taskqid.  Thus the lowest outstanding
- * incomplete taskqid can be determined simply by checking the min
- * taskqid for each head item on the pending, priority, and work list.
- * This value is stored in tq->tq_lowest_id and only updated to the new
- * lowest id when the previous lowest id completes.  All taskqids lower
- * than tq->tq_lowest_id must have completed.  It is also possible
- * larger taskqid's have completed because they may be processed in
- * parallel by several worker threads.  However, this is not a problem
- * because the behavior of taskq_wait_id() is to block until all
- * previously submitted taskqid's have completed.
+ * priority list.  The tasks are then removed from their respective
+ * list, and the taskq_thread servicing the task is added to the active
+ * list, preserving the order using the serviced task's taskqid.
+ * Finally, as tasks complete the taskq_thread servicing the task is
+ * removed from the active list.  This means that the pending task and
+ * active taskq_thread lists are always kept sorted by taskqid. Thus the
+ * lowest outstanding incomplete taskqid can be determined simply by
+ * checking the min taskqid for each head item on the pending, priority,
+ * and active taskq_thread list. This value is stored in
+ * tq->tq_lowest_id and only updated to the new lowest id when the
+ * previous lowest id completes.  All taskqids lower than
+ * tq->tq_lowest_id must have completed.  It is also possible larger
+ * taskqid's have completed because they may be processed in parallel by
+ * several worker threads.  However, this is not a problem because the
+ * behavior of taskq_wait_id() is to block until all previously
+ * submitted taskqid's have completed.
  *
  * XXX: Taskqid_t wrapping is not handled.  However, taskqid_t's are
  * 64-bit values so even if a taskq is processing 2^24 (16,777,216)
@@ -227,15 +234,18 @@ EXPORT_SYMBOL(__taskq_wait);
 int
 __taskq_member(taskq_t *tq, void *t)
 {
-        int i;
+       struct list_head *l;
+       taskq_thread_t *tqt;
         SENTRY;
 
        ASSERT(tq);
         ASSERT(t);
 
-        for (i = 0; i < tq->tq_nthreads; i++)
-                if (tq->tq_threads[i] == (struct task_struct *)t)
-                        SRETURN(1);
+       list_for_each(l, &tq->tq_thread_list) {
+               tqt = list_entry(l, taskq_thread_t, tqt_thread_list);
+               if (tqt->tqt_thread == (struct task_struct *)t)
+                       SRETURN(1);
+       }
 
         SRETURN(0);
 }
@@ -244,21 +254,13 @@ EXPORT_SYMBOL(__taskq_member);
 taskqid_t
 __taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 {
-        spl_task_t *t;
+        taskq_ent_t *t;
        taskqid_t rc = 0;
         SENTRY;
 
         ASSERT(tq);
         ASSERT(func);
 
-       /* Solaris assumes TQ_SLEEP if not passed explicitly */
-       if (!(flags & (TQ_SLEEP | TQ_NOSLEEP)))
-               flags |= TQ_SLEEP;
-
-       if (unlikely(in_atomic() && (flags & TQ_SLEEP)))
-               PANIC("May schedule while atomic: %s/0x%08x/%d\n",
-                   current->comm, preempt_count(), current->pid);
-
         spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
 
        /* Taskq being destroyed and all tasks drained */
@@ -273,19 +275,22 @@ __taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
         if ((t = task_alloc(tq, flags)) == NULL)
                SGOTO(out, rc = 0);
 
-       spin_lock(&t->t_lock);
+       spin_lock(&t->tqent_lock);
 
        /* Queue to the priority list instead of the pending list */
        if (flags & TQ_FRONT)
-               list_add_tail(&t->t_list, &tq->tq_prio_list);
+               list_add_tail(&t->tqent_list, &tq->tq_prio_list);
        else
-               list_add_tail(&t->t_list, &tq->tq_pend_list);
+               list_add_tail(&t->tqent_list, &tq->tq_pend_list);
 
-       t->t_id = rc = tq->tq_next_id;
+       t->tqent_id = rc = tq->tq_next_id;
        tq->tq_next_id++;
-        t->t_func = func;
-        t->t_arg = arg;
-       spin_unlock(&t->t_lock);
+        t->tqent_func = func;
+        t->tqent_arg = arg;
+
+       ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+       spin_unlock(&t->tqent_lock);
 
        wake_up(&tq->tq_work_waitq);
 out:
@@ -294,6 +299,71 @@ out:
 }
 EXPORT_SYMBOL(__taskq_dispatch);
 
+void
+__taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
+   taskq_ent_t *t)
+{
+       SENTRY;
+
+       ASSERT(tq);
+       ASSERT(func);
+       ASSERT(!(tq->tq_flags & TASKQ_DYNAMIC));
+
+       spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+
+       /* Taskq being destroyed and all tasks drained */
+       if (!(tq->tq_flags & TQ_ACTIVE)) {
+               t->tqent_id = 0;
+               goto out;
+       }
+
+       spin_lock(&t->tqent_lock);
+
+       /*
+        * Mark it as a prealloc'd task.  This is important
+        * to ensure that we don't free it later.
+        */
+       t->tqent_flags |= TQENT_FLAG_PREALLOC;
+
+       /* Queue to the priority list instead of the pending list */
+       if (flags & TQ_FRONT)
+               list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+       else
+               list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+       t->tqent_id = tq->tq_next_id;
+       tq->tq_next_id++;
+       t->tqent_func = func;
+       t->tqent_arg = arg;
+
+       spin_unlock(&t->tqent_lock);
+
+       wake_up(&tq->tq_work_waitq);
+out:
+       spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+       SEXIT;
+}
+EXPORT_SYMBOL(__taskq_dispatch_ent);
+
+int
+__taskq_empty_ent(taskq_ent_t *t)
+{
+       return list_empty(&t->tqent_list);
+}
+EXPORT_SYMBOL(__taskq_empty_ent);
+
+void
+__taskq_init_ent(taskq_ent_t *t)
+{
+       spin_lock_init(&t->tqent_lock);
+       INIT_LIST_HEAD(&t->tqent_list);
+       t->tqent_id = 0;
+       t->tqent_func = NULL;
+       t->tqent_arg = NULL;
+       t->tqent_flags = 0;
+}
+EXPORT_SYMBOL(__taskq_init_ent);
+
 /*
  * Returns the lowest incomplete taskqid_t.  The taskqid_t may
  * be queued on the pending list, on the priority list,  or on
@@ -304,25 +374,28 @@ static taskqid_t
 taskq_lowest_id(taskq_t *tq)
 {
        taskqid_t lowest_id = tq->tq_next_id;
-        spl_task_t *t;
+        taskq_ent_t *t;
+       taskq_thread_t *tqt;
        SENTRY;
 
        ASSERT(tq);
        ASSERT(spin_is_locked(&tq->tq_lock));
 
        if (!list_empty(&tq->tq_pend_list)) {
-               t = list_entry(tq->tq_pend_list.next, spl_task_t, t_list);
-               lowest_id = MIN(lowest_id, t->t_id);
+               t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list);
+               lowest_id = MIN(lowest_id, t->tqent_id);
        }
 
        if (!list_empty(&tq->tq_prio_list)) {
-               t = list_entry(tq->tq_prio_list.next, spl_task_t, t_list);
-               lowest_id = MIN(lowest_id, t->t_id);
+               t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list);
+               lowest_id = MIN(lowest_id, t->tqent_id);
        }
 
-       if (!list_empty(&tq->tq_work_list)) {
-               t = list_entry(tq->tq_work_list.next, spl_task_t, t_list);
-               lowest_id = MIN(lowest_id, t->t_id);
+       if (!list_empty(&tq->tq_active_list)) {
+               tqt = list_entry(tq->tq_active_list.next, taskq_thread_t,
+                                tqt_active_list);
+               ASSERT(tqt->tqt_id != 0);
+               lowest_id = MIN(lowest_id, tqt->tqt_id);
        }
 
        SRETURN(lowest_id);
@@ -333,25 +406,25 @@ taskq_lowest_id(taskq_t *tq)
  * taskqid.
  */
 static void
-taskq_insert_in_order(taskq_t *tq, spl_task_t *t)
+taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt)
 {
-       spl_task_t *w;
+       taskq_thread_t *w;
        struct list_head *l;
 
        SENTRY;
        ASSERT(tq);
-       ASSERT(t);
+       ASSERT(tqt);
        ASSERT(spin_is_locked(&tq->tq_lock));
 
-       list_for_each_prev(l, &tq->tq_work_list) {
-               w = list_entry(l, spl_task_t, t_list);
-               if (w->t_id < t->t_id) {
-                       list_add(&t->t_list, l);
+       list_for_each_prev(l, &tq->tq_active_list) {
+               w = list_entry(l, taskq_thread_t, tqt_active_list);
+               if (w->tqt_id < tqt->tqt_id) {
+                       list_add(&tqt->tqt_active_list, l);
                        break;
                }
        }
-       if (l == &tq->tq_work_list)
-               list_add(&t->t_list, &tq->tq_work_list);
+       if (l == &tq->tq_active_list)
+               list_add(&tqt->tqt_active_list, &tq->tq_active_list);
 
        SEXIT;
 }
@@ -361,13 +434,14 @@ taskq_thread(void *args)
 {
         DECLARE_WAITQUEUE(wait, current);
         sigset_t blocked;
-       taskqid_t id;
-        taskq_t *tq = args;
-        spl_task_t *t;
+       taskq_thread_t *tqt = args;
+        taskq_t *tq;
+        taskq_ent_t *t;
        struct list_head *pend_list;
        SENTRY;
 
-        ASSERT(tq);
+        ASSERT(tqt);
+       tq = tqt->tqt_tq;
         current->flags |= PF_NOFREEZE;
 
         sigfillset(&blocked);
@@ -381,17 +455,17 @@ taskq_thread(void *args)
 
         while (!kthread_should_stop()) {
 
-               add_wait_queue(&tq->tq_work_waitq, &wait);
                if (list_empty(&tq->tq_pend_list) &&
                    list_empty(&tq->tq_prio_list)) {
+                       add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
                        spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
                        schedule();
                        spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+                       remove_wait_queue(&tq->tq_work_waitq, &wait);
                } else {
                        __set_current_state(TASK_RUNNING);
                }
 
-               remove_wait_queue(&tq->tq_work_waitq, &wait);
 
                if (!list_empty(&tq->tq_prio_list))
                        pend_list = &tq->tq_prio_list;
@@ -401,27 +475,46 @@ taskq_thread(void *args)
                        pend_list = NULL;
 
                if (pend_list) {
-                        t = list_entry(pend_list->next, spl_task_t, t_list);
-                        list_del_init(&t->t_list);
-                       taskq_insert_in_order(tq, t);
+                        t = list_entry(pend_list->next, taskq_ent_t, tqent_list);
+                        list_del_init(&t->tqent_list);
+
+                       /* In order to support recursively dispatching a
+                        * preallocated taskq_ent_t, tqent_id must be
+                        * stored prior to executing tqent_func. */
+                       tqt->tqt_id = t->tqent_id;
+
+                       /* We must store a copy of the flags prior to
+                        * servicing the task (servicing a prealloc'd task
+                        * returns the ownership of the tqent back to
+                        * the caller of taskq_dispatch). Thus,
+                        * tqent_flags _may_ change within the call. */
+                       tqt->tqt_flags = t->tqent_flags;
+
+                       taskq_insert_in_order(tq, tqt);
                         tq->tq_nactive++;
                        spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
 
                        /* Perform the requested task */
-                        t->t_func(t->t_arg);
+                        t->tqent_func(t->tqent_arg);
 
                        spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
                         tq->tq_nactive--;
-                       id = t->t_id;
-                        task_done(tq, t);
+                       list_del_init(&tqt->tqt_active_list);
+
+                       /* For prealloc'd tasks, we don't free anything. */
+                       if ((tq->tq_flags & TASKQ_DYNAMIC) ||
+                           !(tqt->tqt_flags & TQENT_FLAG_PREALLOC))
+                               task_done(tq, t);
 
                        /* When the current lowest outstanding taskqid is
                         * done calculate the new lowest outstanding id */
-                       if (tq->tq_lowest_id == id) {
+                       if (tq->tq_lowest_id == tqt->tqt_id) {
                                tq->tq_lowest_id = taskq_lowest_id(tq);
-                               ASSERT(tq->tq_lowest_id > id);
+                               ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id);
                        }
 
+                       tqt->tqt_id = 0;
+                       tqt->tqt_flags = 0;
                         wake_up_all(&tq->tq_wait_waitq);
                }
 
@@ -431,6 +524,9 @@ taskq_thread(void *args)
 
        __set_current_state(TASK_RUNNING);
         tq->tq_nthreads--;
+       list_del_init(&tqt->tqt_thread_list);
+       kmem_free(tqt, sizeof(taskq_thread_t));
+
         spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
 
        SRETURN(0);
@@ -441,7 +537,7 @@ __taskq_create(const char *name, int nthreads, pri_t pri,
                int minalloc, int maxalloc, uint_t flags)
 {
         taskq_t *tq;
-        struct task_struct *t;
+       taskq_thread_t *tqt;
         int rc = 0, i, j = 0;
         SENTRY;
 
@@ -460,18 +556,14 @@ __taskq_create(const char *name, int nthreads, pri_t pri,
                nthreads = MAX((num_online_cpus() * nthreads) / 100, 1);
        }
 
-        tq = kmem_alloc(sizeof(*tq), KM_SLEEP);
+        tq = kmem_alloc(sizeof(*tq), KM_PUSHPAGE);
         if (tq == NULL)
                 SRETURN(NULL);
 
-        tq->tq_threads = kmem_alloc(nthreads * sizeof(t), KM_SLEEP);
-        if (tq->tq_threads == NULL) {
-                kmem_free(tq, sizeof(*tq));
-                SRETURN(NULL);
-        }
-
         spin_lock_init(&tq->tq_lock);
         spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+        INIT_LIST_HEAD(&tq->tq_thread_list);
+        INIT_LIST_HEAD(&tq->tq_active_list);
         tq->tq_name      = name;
         tq->tq_nactive   = 0;
        tq->tq_nthreads  = 0;
@@ -483,7 +575,6 @@ __taskq_create(const char *name, int nthreads, pri_t pri,
        tq->tq_next_id   = 1;
        tq->tq_lowest_id = 1;
         INIT_LIST_HEAD(&tq->tq_free_list);
-        INIT_LIST_HEAD(&tq->tq_work_list);
         INIT_LIST_HEAD(&tq->tq_pend_list);
         INIT_LIST_HEAD(&tq->tq_prio_list);
         init_waitqueue_head(&tq->tq_work_waitq);
@@ -491,23 +582,30 @@ __taskq_create(const char *name, int nthreads, pri_t pri,
 
         if (flags & TASKQ_PREPOPULATE)
                 for (i = 0; i < minalloc; i++)
-                        task_done(tq, task_alloc(tq, TQ_SLEEP | TQ_NEW));
+                        task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW));
 
         spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
 
-        for (i = 0; i < nthreads; i++) {
-                t = kthread_create(taskq_thread, tq, "%s/%d", name, i);
-                if (t) {
-                        tq->tq_threads[i] = t;
-                        kthread_bind(t, i % num_online_cpus());
-                        set_user_nice(t, PRIO_TO_NICE(pri));
-                        wake_up_process(t);
+       for (i = 0; i < nthreads; i++) {
+               tqt = kmem_alloc(sizeof(*tqt), KM_PUSHPAGE);
+               INIT_LIST_HEAD(&tqt->tqt_thread_list);
+               INIT_LIST_HEAD(&tqt->tqt_active_list);
+               tqt->tqt_tq = tq;
+               tqt->tqt_id = 0;
+
+               tqt->tqt_thread = kthread_create(taskq_thread, tqt,
+                                                "%s/%d", name, i);
+               if (tqt->tqt_thread) {
+                       list_add(&tqt->tqt_thread_list, &tq->tq_thread_list);
+                       kthread_bind(tqt->tqt_thread, i % num_online_cpus());
+                       set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(pri));
+                       wake_up_process(tqt->tqt_thread);
                        j++;
-                } else {
-                        tq->tq_threads[i] = NULL;
-                        rc = 1;
-                }
-        }
+               } else {
+                       kmem_free(tqt, sizeof(taskq_thread_t));
+                       rc = 1;
+               }
+       }
 
         /* Wait for all threads to be started before potential destroy */
        wait_event(tq->tq_wait_waitq, tq->tq_nthreads == j);
@@ -524,8 +622,9 @@ EXPORT_SYMBOL(__taskq_create);
 void
 __taskq_destroy(taskq_t *tq)
 {
-       spl_task_t *t;
-       int i, nthreads;
+       struct task_struct *thread;
+       taskq_thread_t *tqt;
+       taskq_ent_t *t;
        SENTRY;
 
        ASSERT(tq);
@@ -536,28 +635,44 @@ __taskq_destroy(taskq_t *tq)
        /* TQ_ACTIVE cleared prevents new tasks being added to pending */
         __taskq_wait(tq);
 
-       nthreads = tq->tq_nthreads;
-       for (i = 0; i < nthreads; i++)
-               if (tq->tq_threads[i])
-                       kthread_stop(tq->tq_threads[i]);
-
         spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
 
+       /*
+        * Signal each thread to exit and block until it does.  Each thread
+        * is responsible for removing itself from the list and freeing its
+        * taskq_thread_t.  This allows for idle threads to opt to remove
+        * themselves from the taskq.  They can be recreated as needed.
+        */
+       while (!list_empty(&tq->tq_thread_list)) {
+               tqt = list_entry(tq->tq_thread_list.next,
+                                taskq_thread_t, tqt_thread_list);
+               thread = tqt->tqt_thread;
+               spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
+
+               kthread_stop(thread);
+
+               spin_lock_irqsave(&tq->tq_lock, tq->tq_lock_flags);
+       }
+
         while (!list_empty(&tq->tq_free_list)) {
-               t = list_entry(tq->tq_free_list.next, spl_task_t, t_list);
-               list_del_init(&t->t_list);
+               t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
+
+               ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+               list_del_init(&t->tqent_list);
                 task_free(tq, t);
         }
 
         ASSERT(tq->tq_nthreads == 0);
         ASSERT(tq->tq_nalloc == 0);
+        ASSERT(list_empty(&tq->tq_thread_list));
+        ASSERT(list_empty(&tq->tq_active_list));
         ASSERT(list_empty(&tq->tq_free_list));
-        ASSERT(list_empty(&tq->tq_work_list));
         ASSERT(list_empty(&tq->tq_pend_list));
         ASSERT(list_empty(&tq->tq_prio_list));
 
         spin_unlock_irqrestore(&tq->tq_lock, tq->tq_lock_flags);
-        kmem_free(tq->tq_threads, nthreads * sizeof(spl_task_t *));
+
         kmem_free(tq, sizeof(taskq_t));
 
        SEXIT;