tlb_invalidation.fence_tdr.work);
struct xe_gt_tlb_invalidation_fence *fence, *next;
- mutex_lock(&gt->uc.guc.ct.lock);
+ spin_lock_irq(&gt->tlb_invalidation.pending_lock);
list_for_each_entry_safe(fence, next,
&gt->tlb_invalidation.pending_fences, link) {
s64 since_inval_ms = ktime_ms_delta(ktime_get(),
queue_delayed_work(system_wq,
&gt->tlb_invalidation.fence_tdr,
TLB_TIMEOUT);
- mutex_unlock(&gt->uc.guc.ct.lock);
+ spin_unlock_irq(&gt->tlb_invalidation.pending_lock);
}
/**
{
gt->tlb_invalidation.seqno = 1;
INIT_LIST_HEAD(&gt->tlb_invalidation.pending_fences);
+ spin_lock_init(&gt->tlb_invalidation.pending_lock);
spin_lock_init(&gt->tlb_invalidation.lock);
gt->tlb_invalidation.fence_context = dma_fence_context_alloc(1);
INIT_DELAYED_WORK(&gt->tlb_invalidation.fence_tdr,
}
static void
-invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence)
+__invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence)
{
trace_xe_gt_tlb_invalidation_fence_signal(fence);
- list_del(&fence->link);
dma_fence_signal(&fence->base);
dma_fence_put(&fence->base);
}
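+/*
+ * Variant that also drops the fence from the pending_fences list before
+ * signalling; callers are expected to hold pending_lock. The bare
+ * __invalidation_fence_signal() above is used for fences that were never
+ * published on that list.
+ */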
+static void
+invalidation_fence_signal(struct xe_gt_tlb_invalidation_fence *fence)
+{
+ list_del(&fence->link);
+ __invalidation_fence_signal(fence);
+}
+
/**
* xe_gt_tlb_invalidation_reset - Initialize GT TLB invalidation reset
* @gt: graphics tile
*/
mutex_lock(&gt->uc.guc.ct.lock);
+ spin_lock_irq(&gt->tlb_invalidation.pending_lock);
cancel_delayed_work(&gt->tlb_invalidation.fence_tdr);
/*
* We might have various kworkers waiting for TLB flushes to complete
list_for_each_entry_safe(fence, next,
&gt->tlb_invalidation.pending_fences, link)
invalidation_fence_signal(fence);
+ spin_unlock_irq(&gt->tlb_invalidation.pending_lock);
mutex_unlock(&gt->uc.guc.ct.lock);
}
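+/*
+ * Check whether @seqno has already been reported back by the GuC, i.e.
+ * whether seqno_recv has caught up to it. Seqnos wrap at
+ * TLB_INVALIDATION_SEQNO_MAX, so deltas of more than half the seqno space
+ * are treated as wrap-around: a seqno far ahead of seqno_recv is an old one
+ * that completed before seqno_recv wrapped, while a seqno far behind it was
+ * issued after the wrap and is still outstanding.
+ */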
+static bool tlb_invalidation_seqno_past(struct xe_gt *gt, int seqno)
+{
+ int seqno_recv = READ_ONCE(gt->tlb_invalidation.seqno_recv);
+
+ if (seqno - seqno_recv < -(TLB_INVALIDATION_SEQNO_MAX / 2))
+ return false;
+
+ if (seqno - seqno_recv > (TLB_INVALIDATION_SEQNO_MAX / 2))
+ return true;
+
+ return seqno_recv >= seqno;
+}
+
static int send_tlb_invalidation(struct xe_guc *guc,
struct xe_gt_tlb_invalidation_fence *fence,
u32 *action, int len)
struct xe_gt *gt = guc_to_gt(guc);
int seqno;
int ret;
- bool queue_work;
/*
* XXX: The seqno algorithm relies on TLB invalidation being processed
mutex_lock(&guc->ct.lock);
seqno = gt->tlb_invalidation.seqno;
if (fence) {
- queue_work = list_empty(&gt->tlb_invalidation.pending_fences);
fence->seqno = seqno;
- list_add_tail(&fence->link,
- &gt->tlb_invalidation.pending_fences);
trace_xe_gt_tlb_invalidation_fence_send(fence);
}
action[1] = seqno;
ret = xe_guc_ct_send_locked(&guc->ct, action, len,
G2H_LEN_DW_TLB_INVALIDATE, 1);
if (!ret && fence) {
- fence->invalidation_time = ktime_get();
- if (queue_work)
- queue_delayed_work(system_wq,
- &gt->tlb_invalidation.fence_tdr,
- TLB_TIMEOUT);
+ spin_lock_irq(&gt->tlb_invalidation.pending_lock);
+ /*
+ * We haven't actually published the TLB fence as per
+ * pending_fences, but in theory our seqno could already have been
+ * written out by the time we acquired the pending_lock. In that
+ * case we can just go ahead and signal the fence here.
+ */
+ if (tlb_invalidation_seqno_past(gt, seqno)) {
+ __invalidation_fence_signal(fence);
+ } else {
+ fence->invalidation_time = ktime_get();
+ list_add_tail(&fence->link,
+ &gt->tlb_invalidation.pending_fences);
+
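+ /*
+ * Only arm the timeout worker when this is the first fence on the
+ * list; while fences remain pending it is re-armed from the done
+ * handler (or from the worker itself).
+ */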
+ if (list_is_singular(&gt->tlb_invalidation.pending_fences))
+ queue_delayed_work(system_wq,
+ &gt->tlb_invalidation.fence_tdr,
+ TLB_TIMEOUT);
+ }
+ spin_unlock_irq(&gt->tlb_invalidation.pending_lock);
+ } else if (ret < 0 && fence) {
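+ /*
+ * The CT send failed, so the fence was never published on
+ * pending_fences; signal it right away so waiters don't hang.
+ */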
+ __invalidation_fence_signal(fence);
}
if (!ret) {
gt->tlb_invalidation.seqno = (gt->tlb_invalidation.seqno + 1) %
TLB_INVALIDATION_SEQNO_MAX;
if (!gt->tlb_invalidation.seqno)
gt->tlb_invalidation.seqno = 1;
ret = seqno;
}
- if (ret < 0 && fence)
- invalidation_fence_signal(fence);
mutex_unlock(&guc->ct.lock);
return ret;
}
-static bool tlb_invalidation_seqno_past(struct xe_gt *gt, int seqno)
-{
- int seqno_recv = READ_ONCE(gt->tlb_invalidation.seqno_recv);
-
- if (seqno - seqno_recv < -(TLB_INVALIDATION_SEQNO_MAX / 2))
- return false;
-
- if (seqno - seqno_recv > (TLB_INVALIDATION_SEQNO_MAX / 2))
- return true;
-
- return seqno_recv >= seqno;
-}
-
/**
* xe_gt_tlb_invalidation_wait - Wait for TLB to complete
* @gt: graphics tile
int xe_guc_tlb_invalidation_done_handler(struct xe_guc *guc, u32 *msg, u32 len)
{
struct xe_gt *gt = guc_to_gt(guc);
- struct xe_gt_tlb_invalidation_fence *fence;
- int expected_seqno;
-
- lockdep_assert_held(&guc->ct.lock);
+ struct xe_gt_tlb_invalidation_fence *fence, *next;
+ unsigned long flags;
if (unlikely(len != 1))
return -EPROTO;
- /* Sanity check on seqno */
- expected_seqno = (gt->tlb_invalidation.seqno_recv + 1) %
- TLB_INVALIDATION_SEQNO_MAX;
- if (!expected_seqno)
- expected_seqno = 1;
- if (drm_WARN_ON(&gt_to_xe(gt)->drm, expected_seqno != msg[0])) {
- drm_err(&gt_to_xe(gt)->drm, "TLB expected_seqno(%d) != msg(%u)\n",
- expected_seqno, msg[0]);
+ /*
+ * This can be run both directly from the IRQ handler and from
+ * process_g2h_msg(). Only one of them may process any individual CT
+ * message, however the order in which they are processed here could
+ * result in skipping a seqno. To handle that we simply process all the
+ * seqnos from the last seqno_recv up to and including the one in
+ * msg[0]. The delta should be very small, so there shouldn't be many
+ * pending_fences to iterate over here.
+ *
+ * From the GuC POV we expect the seqnos to always appear in-order, so
+ * if we see something later in the timeline we can be sure that
+ * anything appearing earlier has already signalled; we just have yet
+ * to officially process the CT message, e.g. when racing against
+ * process_g2h_msg().
+ */
+ spin_lock_irqsave(&gt->tlb_invalidation.pending_lock, flags);
+ if (tlb_invalidation_seqno_past(gt, msg[0])) {
+ spin_unlock_irqrestore(&gt->tlb_invalidation.pending_lock, flags);
+ return 0;
}
/*
WRITE_ONCE(gt->tlb_invalidation.seqno_recv, msg[0]);
wake_up_all(&guc->ct.wq);
- fence = list_first_entry_or_null(&gt->tlb_invalidation.pending_fences,
- typeof(*fence), link);
- if (fence)
+ list_for_each_entry_safe(fence, next,
+ &gt->tlb_invalidation.pending_fences, link) {
trace_xe_gt_tlb_invalidation_fence_recv(fence);
- if (fence && tlb_invalidation_seqno_past(gt, fence->seqno)) {
+
+ if (!tlb_invalidation_seqno_past(gt, fence->seqno))
+ break;
+
invalidation_fence_signal(fence);
- if (!list_empty(&gt->tlb_invalidation.pending_fences))
- mod_delayed_work(system_wq,
- &gt->tlb_invalidation.fence_tdr,
- TLB_TIMEOUT);
- else
- cancel_delayed_work(&gt->tlb_invalidation.fence_tdr);
}
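+ /*
+ * Any fences still on the list are waiting on a later seqno: push the
+ * timeout worker out again. Otherwise nothing is pending and the
+ * worker can be cancelled.
+ */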
+ if (!list_empty(&gt->tlb_invalidation.pending_fences))
+ mod_delayed_work(system_wq,
+ &gt->tlb_invalidation.fence_tdr,
+ TLB_TIMEOUT);
+ else
+ cancel_delayed_work(&gt->tlb_invalidation.fence_tdr);
+
+ spin_unlock_irqrestore(&gt->tlb_invalidation.pending_lock, flags);
+
return 0;
}