git.proxmox.com Git - mirror_ubuntu-focal-kernel.git/commitdiff
drm/i915/gt: Close race between engine_park and intel_gt_retire_requests
author     Chris Wilson <chris@chris-wilson.co.uk>
           Wed, 20 Nov 2019 16:55:13 +0000 (16:55 +0000)
committer  Andrea Righi <andrea.righi@canonical.com>
           Mon, 24 Feb 2020 20:35:40 +0000 (21:35 +0100)
BugLink: https://bugs.launchpad.net/bugs/1853044
The general concept was that intel_timeline.active_count was locked by
the intel_timeline.mutex. The exception was for power management, where
the engine->kernel_context->timeline could be manipulated under the
global wakeref.mutex.

This was quite solid, as we always manipulated the timeline only while
we held an engine wakeref.

And then we started retiring requests outside of struct_mutex, only
using the timelines.active_list and the timeline->mutex. There we
started manipulating intel_timeline.active_count outside of an engine
wakeref, and so introduced a race between __engine_park() and
intel_gt_retire_requests(), a race that could result in the
engine->kernel_context not being added to the active timelines and so
losing requests, which caused us to keep the system permanently powered
up [and unloadable].
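
To make the interleaving concrete, here is a minimal sketch of the hazard;
the helper names are hypothetical and this is not the driver code. With a
plain counter updated under two different locks, the park path can observe a
transient pin taken by the retirement path and skip its list_add(), after
which the retirement path drops that pin and deletes the list element,
leaving the kernel_context timeline (and its final request) off the
active_list:

  /* Hypothetical sketch of the race; not the i915 code. */
  struct tl {
          unsigned int active_count;      /* plain int, no single owner lock */
          struct list_head link;
  };

  /* Park path: intel_timeline_enter(), under engine->wakeref.mutex only. */
  static void sketch_enter(struct tl *tl, struct list_head *active_list)
  {
          if (tl->active_count++)         /* sees a transient retirement pin */
                  return;                 /* ... and never does the list_add() */
          list_add(&tl->link, active_list);
  }

  /* Retirement path: under tl->mutex + timelines->lock, no engine wakeref. */
  static void sketch_unpin(struct tl *tl)
  {
          if (!--tl->active_count)        /* pin dropped, count hits zero */
                  list_del(&tl->link);    /* timeline vanishes from active_list */
  }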

The race would be easy to close if we could take the engine wakeref for
the timeline before we retire -- except timelines are not bound to any
engine and so we would need to keep all active engines awake. The
alternative is to guard intel_timeline_enter/intel_timeline_exit for use
outside of the timeline->mutex.
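
In outline, the fix below turns active_count into an atomic_t and splits each
transition into a lockless fast path and a spinlock-guarded slow path; this is
a simplified rendering of the diff that follows, with the lockdep and
GEM_BUG_ON annotations trimmed:

  void intel_timeline_enter(struct intel_timeline *tl)
  {
          struct intel_gt_timelines *timelines = &tl->gt->timelines;
          unsigned long flags;

          /* Fast path: already active, just take another reference. */
          if (atomic_add_unless(&tl->active_count, 1, 0))
                  return;

          /* Possible 0 -> 1 transition: decide list_add under the lock. */
          spin_lock_irqsave(&timelines->lock, flags);
          if (!atomic_fetch_inc(&tl->active_count))
                  list_add_tail(&tl->link, &timelines->active_list);
          spin_unlock_irqrestore(&timelines->lock, flags);
  }

  void intel_timeline_exit(struct intel_timeline *tl)
  {
          struct intel_gt_timelines *timelines = &tl->gt->timelines;
          unsigned long flags;

          /* Fast path: not the last active reference. */
          if (atomic_add_unless(&tl->active_count, -1, 1))
                  return;

          /* Possible 1 -> 0 transition: decide list_del under the lock. */
          spin_lock_irqsave(&timelines->lock, flags);
          if (atomic_dec_and_test(&tl->active_count))
                  list_del(&tl->link);
          spin_unlock_irqrestore(&timelines->lock, flags);
  }

Any context that misses the fast path serialises on timelines->lock, so only
the thread that actually performs the 0 <-> 1 transition adds or removes the
list element, and list membership is never decided outside the spinlock.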

Fixes: e5dadff4b093 ("drm/i915: Protect request retirement with timeline->mutex")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Matthew Auld <matthew.auld@intel.com>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20191120165514.3955081-1-chris@chris-wilson.co.uk
(cherry picked from commit a6edbca74b305adc165e67065d7ee766006e6a48)
Signed-off-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
(cherry picked from commit ca1711d1991f968d1e88725a2e607e57ecd5c5f1)
Signed-off-by: Andrea Righi <andrea.righi@canonical.com>
drivers/gpu/drm/i915/gt/intel_timeline.c
drivers/gpu/drm/i915/gt/intel_timeline_types.h
drivers/gpu/drm/i915/i915_request.c

diff --git a/drivers/gpu/drm/i915/gt/intel_timeline.c b/drivers/gpu/drm/i915/gt/intel_timeline.c
index 9cb01d9828f1dbc189793c297b9cf42d7088bf00..898c51b346fc4efdc887696639a976f94f394b69 100644
@@ -339,15 +339,33 @@ void intel_timeline_enter(struct intel_timeline *tl)
        struct intel_gt_timelines *timelines = &tl->gt->timelines;
        unsigned long flags;
 
+       /*
+        * Pretend we are serialised by the timeline->mutex.
+        *
+        * While generally true, there are a few exceptions to the rule
+        * for the engine->kernel_context being used to manage power
+        * transitions. As the engine_park may be called from under any
+        * timeline, it uses the power mutex as a global serialisation
+        * lock to prevent any other request entering its timeline.
+        *
+        * The rule is generally tl->mutex, otherwise engine->wakeref.mutex.
+        *
+        * However, intel_gt_retire_requests() does not know which engine
+        * it is retiring along and so cannot partake in the engine-pm
+        * barrier, and there we use the tl->active_count as a means to
+        * pin the timeline in the active_list while the locks are dropped.
+        * Ergo, as that is outside of the engine-pm barrier, we need to
+        * use atomic to manipulate tl->active_count.
+        */
        lockdep_assert_held(&tl->mutex);
-
        GEM_BUG_ON(!atomic_read(&tl->pin_count));
-       if (tl->active_count++)
+
+       if (atomic_add_unless(&tl->active_count, 1, 0))
                return;
-       GEM_BUG_ON(!tl->active_count); /* overflow? */
 
        spin_lock_irqsave(&timelines->lock, flags);
-       list_add(&tl->link, &timelines->active_list);
+       if (!atomic_fetch_inc(&tl->active_count))
+               list_add_tail(&tl->link, &timelines->active_list);
        spin_unlock_irqrestore(&timelines->lock, flags);
 }
 
@@ -356,14 +374,16 @@ void intel_timeline_exit(struct intel_timeline *tl)
        struct intel_gt_timelines *timelines = &tl->gt->timelines;
        unsigned long flags;
 
+       /* See intel_timeline_enter() */
        lockdep_assert_held(&tl->mutex);
 
-       GEM_BUG_ON(!tl->active_count);
-       if (--tl->active_count)
+       GEM_BUG_ON(!atomic_read(&tl->active_count));
+       if (atomic_add_unless(&tl->active_count, -1, 1))
                return;
 
        spin_lock_irqsave(&timelines->lock, flags);
-       list_del(&tl->link);
+       if (atomic_dec_and_test(&tl->active_count))
+               list_del(&tl->link);
        spin_unlock_irqrestore(&timelines->lock, flags);
 
        /*
diff --git a/drivers/gpu/drm/i915/gt/intel_timeline_types.h b/drivers/gpu/drm/i915/gt/intel_timeline_types.h
index 2b1baf2fcc8e438351e4027b1aa19ea5b31b0ad8..fc80c40f9bc3364e2c79a39ec5904f24c9ba27ed 100644
@@ -42,7 +42,7 @@ struct intel_timeline {
         * from the intel_context caller plus internal atomicity.
         */
        atomic_t pin_count;
-       unsigned int active_count;
+       atomic_t active_count;
 
        const u32 *hwsp_seqno;
        struct i915_vma *hwsp_ggtt;
diff --git a/drivers/gpu/drm/i915/i915_request.c b/drivers/gpu/drm/i915/i915_request.c
index 1c5506822dc7296a9d247de9cbf0330a55f55cb2..2f4ac438b96c7fd3bf74a0203394a6dddd215726 100644
@@ -1513,8 +1513,8 @@ bool i915_retire_requests(struct drm_i915_private *i915)
                        continue;
 
                intel_timeline_get(tl);
-               GEM_BUG_ON(!tl->active_count);
-               tl->active_count++; /* pin the list element */
+               GEM_BUG_ON(!atomic_read(&tl->active_count));
+               atomic_inc(&tl->active_count); /* pin the list element */
                spin_unlock_irqrestore(&timelines->lock, flags);
 
                retire_requests(tl);
@@ -1523,14 +1523,14 @@ bool i915_retire_requests(struct drm_i915_private *i915)
 
                /* Resume iteration after dropping lock */
                list_safe_reset_next(tl, tn, link);
-               if (!--tl->active_count)
+               if (atomic_dec_and_test(&tl->active_count))
                        list_del(&tl->link);
 
                mutex_unlock(&tl->mutex);
 
                /* Defer the final release to after the spinlock */
                if (refcount_dec_and_test(&tl->kref.refcount)) {
-                       GEM_BUG_ON(tl->active_count);
+                       GEM_BUG_ON(atomic_read(&tl->active_count));
                        list_add(&tl->link, &free);
                }
        }