drm/vc4: Expose performance counters to userspace

author Boris Brezillon <boris.brezillon@free-electrons.com>

Fri, 12 Jan 2018 09:09:26 +0000 (10:09 +0100)

committer Eric Anholt <eric@anholt.net>

Sat, 10 Feb 2018 22:23:26 +0000 (22:23 +0000)
author Boris Brezillon <boris.brezillon@free-electrons.com>
Fri, 12 Jan 2018 09:09:26 +0000 (10:09 +0100)
committer Eric Anholt <eric@anholt.net>
Sat, 10 Feb 2018 22:23:26 +0000 (22:23 +0000)
diff --git a/drivers/gpu/drm/vc4/Makefile b/drivers/gpu/drm/vc4/Makefile

index f5500df51686f0fed9e8f9f7ff5cfa888cfdad7c..4a3a868235f848698c3bcc062475a8d3e4518a69 100644 (file)
--- a/drivers/gpu/drm/vc4/Makefile
+++ b/drivers/gpu/drm/vc4/Makefile
@@ -15,6 +15,7 @@ vc4-y := \
         vc4_vec.o \
         vc4_hvs.o \
         vc4_irq.o \
+       vc4_perfmon.o \
         vc4_plane.o \
         vc4_render_cl.o \
         vc4_trace_points.o \
diff --git a/drivers/gpu/drm/vc4/vc4_drv.c b/drivers/gpu/drm/vc4/vc4_drv.c

index ceb385fd69c595ea5acfca2dcd702921226af174..94b99c90425a488cfaec255b1f2030f9b31242b3 100644 (file)
--- a/drivers/gpu/drm/vc4/vc4_drv.c
+++ b/drivers/gpu/drm/vc4/vc4_drv.c
@@ -101,6 +101,7 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
         case DRM_VC4_PARAM_SUPPORTS_THREADED_FS:
         case DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER:
         case DRM_VC4_PARAM_SUPPORTS_MADVISE:
+       case DRM_VC4_PARAM_SUPPORTS_PERFMON:
                 args->value = true;
                 break;
         default:
@@ -111,6 +112,26 @@ static int vc4_get_param_ioctl(struct drm_device *dev, void *data,
         return 0;
  }
  
+/* DRM .open hook: allocate the per-file private data, initialise its perfmon
+ * bookkeeping and publish it through file->driver_priv.
+ *
+ * Returns 0 on success or -ENOMEM if the allocation fails.
+ */
+static int vc4_open(struct drm_device *dev, struct drm_file *file)
+{
+       struct vc4_file *priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+
+       if (priv == NULL)
+               return -ENOMEM;
+
+       vc4_perfmon_open_file(priv);
+       file->driver_priv = priv;
+
+       return 0;
+}
+
+/* DRM .postclose hook: release everything vc4_open() set up for this file.
+ *
+ * The perfmons still registered in the file's IDR lose the file's reference,
+ * then the per-file structure itself is freed.
+ */
+static void vc4_close(struct drm_device *dev, struct drm_file *file)
+{
+       struct vc4_file *vc4file = file->driver_priv;
+
+       vc4_perfmon_close_file(vc4file);
+
+       /* vc4_open() kzalloc()'d this structure; release it here, otherwise
+        * it is leaked on every close of the DRM file.
+        */
+       kfree(vc4file);
+}
+
  static const struct vm_operations_struct vc4_vm_ops = {
         .fault = vc4_fault,
         .open = drm_gem_vm_open,
@@ -143,6 +164,9 @@ static const struct drm_ioctl_desc vc4_drm_ioctls[] = {
         DRM_IOCTL_DEF_DRV(VC4_GET_TILING, vc4_get_tiling_ioctl, DRM_RENDER_ALLOW),
         DRM_IOCTL_DEF_DRV(VC4_LABEL_BO, vc4_label_bo_ioctl, DRM_RENDER_ALLOW),
         DRM_IOCTL_DEF_DRV(VC4_GEM_MADVISE, vc4_gem_madvise_ioctl, DRM_RENDER_ALLOW),
+       DRM_IOCTL_DEF_DRV(VC4_PERFMON_CREATE, vc4_perfmon_create_ioctl, DRM_RENDER_ALLOW),
+       DRM_IOCTL_DEF_DRV(VC4_PERFMON_DESTROY, vc4_perfmon_destroy_ioctl, DRM_RENDER_ALLOW),
+       DRM_IOCTL_DEF_DRV(VC4_PERFMON_GET_VALUES, vc4_perfmon_get_values_ioctl, DRM_RENDER_ALLOW),
  };
  
  static struct drm_driver vc4_drm_driver = {
@@ -153,6 +177,8 @@ static struct drm_driver vc4_drm_driver = {
                             DRIVER_RENDER |
                             DRIVER_PRIME),
         .lastclose = drm_fb_helper_lastclose,
+       .open = vc4_open,
+       .postclose = vc4_close,
         .irq_handler = vc4_irq,
         .irq_preinstall = vc4_irq_preinstall,
         .irq_postinstall = vc4_irq_postinstall,
diff --git a/drivers/gpu/drm/vc4/vc4_drv.h b/drivers/gpu/drm/vc4/vc4_drv.h

index 3af22936d9b3b20dc34eaa7e6b47f18099bfdb51..fefa1664a9f5f6596616ebbedb80e5965b06388e 100644 (file)
--- a/drivers/gpu/drm/vc4/vc4_drv.h
+++ b/drivers/gpu/drm/vc4/vc4_drv.h
@@ -11,6 +11,8 @@
  #include <drm/drm_encoder.h>
  #include <drm/drm_gem_cma_helper.h>
  
+#include "uapi/drm/vc4_drm.h"
+
  /* Don't forget to update vc4_bo.c: bo_type_names[] when adding to
   * this.
   */
@@ -29,6 +31,36 @@ enum vc4_kernel_bo_type {
         VC4_BO_TYPE_COUNT
  };
  
+/* Performance monitor object. The perfmon lifetime is controlled by userspace
+ * using perfmon related ioctls. A perfmon can be attached to a submit_cl
+ * request, and when this is the case, HW perf counters will be activated just
+ * before the submit_cl is submitted to the GPU and disabled when the job is
+ * done. This way, only events related to a specific job will be counted.
+ */
+struct vc4_perfmon {
+       /* Tracks the number of users of the perfmon, when this counter reaches
+        * zero the perfmon is destroyed.
+        */
+       refcount_t refcnt;
+
+       /* Number of counters activated in this perfmon instance
+        * (at least 1 and at most DRM_VC4_MAX_PERF_COUNTERS).
+        */
+       u8 ncounters;
+
+       /* Events counted by the HW perf counters. */
+       u8 events[DRM_VC4_MAX_PERF_COUNTERS];
+
+       /* Storage for counter values. Counters are incremented by the HW
+        * perf counter values every time the perfmon is attached to a GPU job.
+        * This way, perfmon users don't have to retrieve the results after
+        * each job if they want to track events covering several submissions.
+        * Note that counter values can't be reset, but you can fake a reset by
+        * destroying the perfmon and creating a new one.
+        *
+        * Flexible array member: ncounters entries are allocated together
+        * with the structure.
+        */
+       u64 counters[];
+};
+
  struct vc4_dev {
         struct drm_device *dev;
  
@@ -121,6 +153,11 @@ struct vc4_dev {
         wait_queue_head_t job_wait_queue;
         struct work_struct job_done_work;
  
+       /* Used to track the active perfmon if any. Access to this field is
+        * protected by job_lock.
+        */
+       struct vc4_perfmon *active_perfmon;
+
         /* List of struct vc4_seqno_cb for callbacks to be made from a
          * workqueue when the given seqno is passed.
          */
@@ -406,6 +443,21 @@ struct vc4_exec_info {
         void *uniforms_v;
         uint32_t uniforms_p;
         uint32_t uniforms_size;
+
+       /* Pointer to a performance monitor object if the user requested it,
+        * NULL otherwise.
+        */
+       struct vc4_perfmon *perfmon;
+};
+
+/* Per-open file private data. Any driver-specific resource that has to be
+ * released when the DRM file is closed should be placed here.
+ */
+struct vc4_file {
+       struct {
+               /* Maps the perfmon IDs handed out to userspace to their
+                * struct vc4_perfmon.
+                */
+               struct idr idr;
+               /* Serialises lookups, insertions and removals on idr. */
+               struct mutex lock;
+       } perfmon;
  };
  
  static inline struct vc4_exec_info *
@@ -646,3 +698,19 @@ bool vc4_check_tex_size(struct vc4_exec_info *exec,
  /* vc4_validate_shader.c */
  struct vc4_validated_shader_info *
  vc4_validate_shader(struct drm_gem_cma_object *shader_obj);
+
+/* vc4_perfmon.c */
+void vc4_perfmon_get(struct vc4_perfmon *perfmon);
+void vc4_perfmon_put(struct vc4_perfmon *perfmon);
+void vc4_perfmon_start(struct vc4_dev *vc4, struct vc4_perfmon *perfmon);
+void vc4_perfmon_stop(struct vc4_dev *vc4, struct vc4_perfmon *perfmon,
+                     bool capture);
+struct vc4_perfmon *vc4_perfmon_find(struct vc4_file *vc4file, int id);
+void vc4_perfmon_open_file(struct vc4_file *vc4file);
+void vc4_perfmon_close_file(struct vc4_file *vc4file);
+int vc4_perfmon_create_ioctl(struct drm_device *dev, void *data,
+                            struct drm_file *file_priv);
+int vc4_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
+                             struct drm_file *file_priv);
+int vc4_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
+                                struct drm_file *file_priv);
diff --git a/drivers/gpu/drm/vc4/vc4_gem.c b/drivers/gpu/drm/vc4/vc4_gem.c

index 638540943c61a5e095c87be8d2b2bf543ea933b1..3ac801b14d4eae0161d14efb2ff3a6c18785cf57 100644 (file)
--- a/drivers/gpu/drm/vc4/vc4_gem.c
+++ b/drivers/gpu/drm/vc4/vc4_gem.c
@@ -454,14 +454,30 @@ again:
  
         vc4_flush_caches(dev);
  
+       /* Only start the perfmon if it was not already started by a previous
+        * job.
+        */
+       if (exec->perfmon && vc4->active_perfmon != exec->perfmon)
+               vc4_perfmon_start(vc4, exec->perfmon);
+
         /* Either put the job in the binner if it uses the binner, or
          * immediately move it to the to-be-rendered queue.
          */
         if (exec->ct0ca != exec->ct0ea) {
                 submit_cl(dev, 0, exec->ct0ca, exec->ct0ea);
         } else {
+               struct vc4_exec_info *next;
+
                 vc4_move_job_to_render(dev, exec);
-               goto again;
+               next = vc4_first_bin_job(vc4);
+
+               /* We can't start the next bin job if the previous job had a
+                * different perfmon instance attached to it. The same goes
+                * if one of them had a perfmon attached to it and the other
+                * one doesn't.
+                */
+               if (next && next->perfmon == exec->perfmon)
+                       goto again;
         }
  }
  
@@ -621,6 +637,7 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
                  struct ww_acquire_ctx *acquire_ctx)
  {
         struct vc4_dev *vc4 = to_vc4_dev(dev);
+       struct vc4_exec_info *renderjob;
         uint64_t seqno;
         unsigned long irqflags;
         struct vc4_fence *fence;
@@ -646,11 +663,14 @@ vc4_queue_submit(struct drm_device *dev, struct vc4_exec_info *exec,
  
         list_add_tail(&exec->head, &vc4->bin_job_list);
  
-       /* If no job was executing, kick ours off.  Otherwise, it'll
-        * get started when the previous job's flush done interrupt
-        * occurs.
+       /* If no bin job was executing and if the render job (if any) has the
+        * same perfmon as our job attached to it (or if both jobs don't have
+        * perfmon activated), then kick ours off.  Otherwise, it'll get
+        * started when the previous job's flush/render done interrupt occurs.
          */
-       if (vc4_first_bin_job(vc4) == exec) {
+       renderjob = vc4_first_render_job(vc4);
+       if (vc4_first_bin_job(vc4) == exec &&
+           (!renderjob || renderjob->perfmon == exec->perfmon)) {
                 vc4_submit_next_bin_job(dev);
                 vc4_queue_hangcheck(dev);
         }
@@ -915,6 +935,9 @@ vc4_complete_exec(struct drm_device *dev, struct vc4_exec_info *exec)
         vc4->bin_alloc_used &= ~exec->bin_slots;
         spin_unlock_irqrestore(&vc4->job_lock, irqflags);
  
+       /* Release the reference we had on the perf monitor. */
+       vc4_perfmon_put(exec->perfmon);
+
         mutex_lock(&vc4->power_lock);
         if (--vc4->power_refcount == 0) {
                 pm_runtime_mark_last_busy(&vc4->v3d->pdev->dev);
@@ -1067,6 +1090,7 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
                     struct drm_file *file_priv)
  {
         struct vc4_dev *vc4 = to_vc4_dev(dev);
+       struct vc4_file *vc4file = file_priv->driver_priv;
         struct drm_vc4_submit_cl *args = data;
         struct vc4_exec_info *exec;
         struct ww_acquire_ctx acquire_ctx;
@@ -1080,6 +1104,11 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
                 return -EINVAL;
         }
  
+       if (args->pad2 != 0) {
+               DRM_DEBUG("->pad2 must be set to zero\n");
+               return -EINVAL;
+       }
+
         exec = kcalloc(1, sizeof(*exec), GFP_KERNEL);
         if (!exec) {
                 DRM_ERROR("malloc failure on exec struct\n");
@@ -1105,6 +1134,15 @@ vc4_submit_cl_ioctl(struct drm_device *dev, void *data,
         if (ret)
                 goto fail;
  
+       if (args->perfmonid) {
+               exec->perfmon = vc4_perfmon_find(vc4file,
+                                                args->perfmonid);
+               if (!exec->perfmon) {
+                       ret = -ENOENT;
+                       goto fail;
+               }
+       }
+
         if (exec->args->bin_cl_size != 0) {
                 ret = vc4_get_bcl(dev, exec);
                 if (ret)
diff --git a/drivers/gpu/drm/vc4/vc4_irq.c b/drivers/gpu/drm/vc4/vc4_irq.c

index 3dd62d75f5319dbffcd9d27ea6f60927d193eb92..4cd2ccfe15f49b5d992117f172c7dc373fcd4bd2 100644 (file)
--- a/drivers/gpu/drm/vc4/vc4_irq.c
+++ b/drivers/gpu/drm/vc4/vc4_irq.c
@@ -104,13 +104,20 @@ static void
  vc4_irq_finish_bin_job(struct drm_device *dev)
  {
         struct vc4_dev *vc4 = to_vc4_dev(dev);
-       struct vc4_exec_info *exec = vc4_first_bin_job(vc4);
+       struct vc4_exec_info *next, *exec = vc4_first_bin_job(vc4);
  
         if (!exec)
                 return;
  
         vc4_move_job_to_render(dev, exec);
-       vc4_submit_next_bin_job(dev);
+       next = vc4_first_bin_job(vc4);
+
+       /* Only submit the next job in the bin list if it matches the perfmon
+        * attached to the one that just finished (or if both jobs don't have
+        * perfmon attached to them).
+        */
+       if (next && next->perfmon == exec->perfmon)
+               vc4_submit_next_bin_job(dev);
  }
  
  static void
@@ -122,6 +129,10 @@ vc4_cancel_bin_job(struct drm_device *dev)
         if (!exec)
                 return;
  
+       /* Stop the perfmon so that the next bin job can be started. */
+       if (exec->perfmon)
+               vc4_perfmon_stop(vc4, exec->perfmon, false);
+
         list_move_tail(&exec->head, &vc4->bin_job_list);
         vc4_submit_next_bin_job(dev);
  }
@@ -131,18 +142,41 @@ vc4_irq_finish_render_job(struct drm_device *dev)
  {
         struct vc4_dev *vc4 = to_vc4_dev(dev);
         struct vc4_exec_info *exec = vc4_first_render_job(vc4);
+       struct vc4_exec_info *nextbin, *nextrender;
  
         if (!exec)
                 return;
  
         vc4->finished_seqno++;
         list_move_tail(&exec->head, &vc4->job_done_list);
+
+       nextbin = vc4_first_bin_job(vc4);
+       nextrender = vc4_first_render_job(vc4);
+
+       /* Only stop the perfmon if following jobs in the queue don't expect it
+        * to be enabled.
+        */
+       if (exec->perfmon && !nextrender &&
+           (!nextbin || nextbin->perfmon != exec->perfmon))
+               vc4_perfmon_stop(vc4, exec->perfmon, true);
+
+       /* If there's a render job waiting, start it. If this is not the case
+        * we may have to unblock the binner if it's been stalled because of
+        * perfmon (this can be checked by comparing the perfmon attached to
+        * the finished renderjob to the one attached to the next bin job: if
+        * they don't match, this means the binner is stalled and should be
+        * restarted).
+        */
+       if (nextrender)
+               vc4_submit_next_render_job(dev);
+       else if (nextbin && nextbin->perfmon != exec->perfmon)
+               vc4_submit_next_bin_job(dev);
+
         if (exec->fence) {
                 dma_fence_signal_locked(exec->fence);
                 dma_fence_put(exec->fence);
                 exec->fence = NULL;
         }
-       vc4_submit_next_render_job(dev);
  
         wake_up_all(&vc4->job_wait_queue);
         schedule_work(&vc4->job_done_work);
diff --git a/drivers/gpu/drm/vc4/vc4_perfmon.c b/drivers/gpu/drm/vc4/vc4_perfmon.c

new file mode 100644 (file)

index 0000000..437e7a2
--- /dev/null
+++ b/drivers/gpu/drm/vc4/vc4_perfmon.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) 2018 Broadcom
+ */
+
+/**
+ * DOC: VC4 V3D performance monitor module
+ *
+ * The V3D block provides 16 hardware counters which can count various events.
+ */
+
+#include "vc4_drv.h"
+#include "vc4_regs.h"
+
+/* Lowest ID handed out. ID 0 is never allocated because the submit_cl ioctl
+ * treats perfmonid == 0 as "no perfmon attached".
+ */
+#define VC4_PERFMONID_MIN      1
+/* Upper bound passed to idr_alloc().
+ * NOTE(review): idr_alloc() appears to take an int bound and U32_MAX does not
+ * fit in an int; confirm the converted value yields the intended range.
+ */
+#define VC4_PERFMONID_MAX      U32_MAX
+
+/* Take an extra reference on @perfmon. A NULL @perfmon is ignored. */
+void vc4_perfmon_get(struct vc4_perfmon *perfmon)
+{
+       if (!perfmon)
+               return;
+
+       refcount_inc(&perfmon->refcnt);
+}
+
+/* Drop one reference on @perfmon and free it once the last reference is
+ * gone. A NULL @perfmon is ignored.
+ */
+void vc4_perfmon_put(struct vc4_perfmon *perfmon)
+{
+       if (!perfmon)
+               return;
+
+       if (!refcount_dec_and_test(&perfmon->refcnt))
+               return;
+
+       kfree(perfmon);
+}
+
+/* Program the HW counters for @perfmon, enable them and record @perfmon as
+ * the active one in @vc4.
+ *
+ * Warns (once) and does nothing if @perfmon is NULL or if another perfmon is
+ * already active. vc4_dev documents active_perfmon as protected by job_lock,
+ * so callers are presumably expected to hold it -- verify against callers.
+ */
+void vc4_perfmon_start(struct vc4_dev *vc4, struct vc4_perfmon *perfmon)
+{
+       unsigned int i;
+       u32 mask;
+
+       if (WARN_ON_ONCE(!perfmon || vc4->active_perfmon))
+               return;
+
+       /* Select which event each HW counter is going to count. */
+       for (i = 0; i < perfmon->ncounters; i++)
+               V3D_WRITE(V3D_PCTRS(i), perfmon->events[i]);
+
+       /* One bit per counter used by this perfmon (bits 0..ncounters-1). */
+       mask = GENMASK(perfmon->ncounters - 1, 0);
+       /* NOTE(review): presumably clears the selected counters before they
+        * are enabled -- confirm against the V3D register documentation.
+        */
+       V3D_WRITE(V3D_PCTRC, mask);
+       V3D_WRITE(V3D_PCTRE, V3D_PCTRE_EN | mask);
+       vc4->active_perfmon = perfmon;
+}
+
+/* Disable the HW counters and clear @vc4's active perfmon.
+ *
+ * When @capture is true, the current HW counter values are first added to
+ * @perfmon's accumulated counters; when false they are discarded.
+ *
+ * Warns (once) and does nothing if no perfmon is active or if @perfmon is
+ * not the active one.
+ */
+void vc4_perfmon_stop(struct vc4_dev *vc4, struct vc4_perfmon *perfmon,
+                     bool capture)
+{
+       unsigned int i;
+
+       if (WARN_ON_ONCE(!vc4->active_perfmon ||
+                        perfmon != vc4->active_perfmon))
+               return;
+
+       if (capture) {
+               /* Accumulate rather than overwrite, so that one perfmon can
+                * cover several job submissions.
+                */
+               for (i = 0; i < perfmon->ncounters; i++)
+                       perfmon->counters[i] += V3D_READ(V3D_PCTR(i));
+       }
+
+       /* Writing 0 drops the enable flag and every per-counter bit. */
+       V3D_WRITE(V3D_PCTRE, 0);
+       vc4->active_perfmon = NULL;
+}
+
+/* Look up perfmon @id in @vc4file's IDR.
+ *
+ * Returns the perfmon with an extra reference held, or NULL if nothing is
+ * registered under @id. The caller drops the reference with
+ * vc4_perfmon_put().
+ */
+struct vc4_perfmon *vc4_perfmon_find(struct vc4_file *vc4file, int id)
+{
+       struct vc4_perfmon *found;
+
+       mutex_lock(&vc4file->perfmon.lock);
+       found = idr_find(&vc4file->perfmon.idr, id);
+       /* Grab the reference while the lock still keeps the entry alive. */
+       if (found)
+               refcount_inc(&found->refcnt);
+       mutex_unlock(&vc4file->perfmon.lock);
+
+       return found;
+}
+
+/* Initialise the perfmon ID table and its lock for a freshly opened file. */
+void vc4_perfmon_open_file(struct vc4_file *vc4file)
+{
+       idr_init(&vc4file->perfmon.idr);
+       mutex_init(&vc4file->perfmon.lock);
+}
+
+/* idr_for_each() callback: drop the reference held on behalf of the IDR
+ * entry. Always returns 0 so that the iteration visits every entry.
+ */
+static int vc4_perfmon_idr_del(int id, void *elem, void *data)
+{
+       vc4_perfmon_put(elem);
+       return 0;
+}
+
+/* Tear down the perfmon state of a file that is being closed.
+ *
+ * Every perfmon still registered in the file's IDR loses the reference held
+ * on behalf of the IDR; a perfmon that is still referenced elsewhere (e.g. by
+ * a job that has not completed yet) is only freed by its final
+ * vc4_perfmon_put(). The IDR itself is destroyed afterwards.
+ */
+void vc4_perfmon_close_file(struct vc4_file *vc4file)
+{
+       mutex_lock(&vc4file->perfmon.lock);
+       /* Drop the references first: idr_destroy() does not touch the
+        * objects the IDR points to.
+        */
+       idr_for_each(&vc4file->perfmon.idr, vc4_perfmon_idr_del, NULL);
+       idr_destroy(&vc4file->perfmon.idr);
+       mutex_unlock(&vc4file->perfmon.lock);
+}
+
+/* DRM_IOCTL_VC4_PERFMON_CREATE handler.
+ *
+ * Validates the request, allocates a perfmon holding one reference (owned by
+ * the file's IDR) and returns its ID to userspace through the request's id
+ * field.
+ *
+ * Returns 0 on success, -EINVAL if ncounters is 0 or too large or if an event
+ * is unknown, -ENOMEM if the allocation fails, or the negative error reported
+ * by idr_alloc().
+ */
+int vc4_perfmon_create_ioctl(struct drm_device *dev, void *data,
+                            struct drm_file *file_priv)
+{
+       struct drm_vc4_perfmon_create *args = data;
+       struct vc4_file *vc4file = file_priv->driver_priv;
+       struct vc4_perfmon *perfmon;
+       unsigned int idx;
+       int id;
+
+       /* At least one counter, and no more than the HW provides. */
+       if (!args->ncounters ||
+           args->ncounters > DRM_VC4_MAX_PERF_COUNTERS)
+               return -EINVAL;
+
+       /* Reject the whole request if any event is out of range. */
+       for (idx = 0; idx < args->ncounters; idx++) {
+               if (args->events[idx] >= VC4_PERFCNT_NUM_EVENTS)
+                       return -EINVAL;
+       }
+
+       /* The counter storage is allocated right behind the structure. */
+       perfmon = kzalloc(sizeof(*perfmon) + (args->ncounters * sizeof(u64)),
+                         GFP_KERNEL);
+       if (!perfmon)
+               return -ENOMEM;
+
+       refcount_set(&perfmon->refcnt, 1);
+       perfmon->ncounters = args->ncounters;
+
+       for (idx = 0; idx < args->ncounters; idx++)
+               perfmon->events[idx] = args->events[idx];
+
+       mutex_lock(&vc4file->perfmon.lock);
+       id = idr_alloc(&vc4file->perfmon.idr, perfmon, VC4_PERFMONID_MIN,
+                      VC4_PERFMONID_MAX, GFP_KERNEL);
+       mutex_unlock(&vc4file->perfmon.lock);
+
+       if (id < 0) {
+               /* Never published, so a plain kfree() is enough. */
+               kfree(perfmon);
+               return id;
+       }
+
+       args->id = id;
+       return 0;
+}
+
+/* DRM_IOCTL_VC4_PERFMON_DESTROY handler.
+ *
+ * Unregisters the perfmon from the file's IDR and drops the reference that
+ * the IDR held. Returns 0 on success or -EINVAL if the ID is unknown.
+ */
+int vc4_perfmon_destroy_ioctl(struct drm_device *dev, void *data,
+                             struct drm_file *file_priv)
+{
+       struct drm_vc4_perfmon_destroy *args = data;
+       struct vc4_file *vc4file = file_priv->driver_priv;
+       struct vc4_perfmon *removed;
+
+       mutex_lock(&vc4file->perfmon.lock);
+       removed = idr_remove(&vc4file->perfmon.idr, args->id);
+       mutex_unlock(&vc4file->perfmon.lock);
+
+       if (removed == NULL)
+               return -EINVAL;
+
+       vc4_perfmon_put(removed);
+
+       return 0;
+}
+
+/* DRM_IOCTL_VC4_PERFMON_GET_VALUES handler.
+ *
+ * Copies the accumulated counter values of the perfmon (ncounters u64
+ * entries) to the user buffer designated by values_ptr. No synchronisation
+ * with in-flight jobs is performed here.
+ *
+ * Returns 0 on success, -EINVAL if the ID is unknown or -EFAULT if the user
+ * buffer cannot be written.
+ */
+int vc4_perfmon_get_values_ioctl(struct drm_device *dev, void *data,
+                                struct drm_file *file_priv)
+{
+       struct vc4_file *vc4file = file_priv->driver_priv;
+       struct drm_vc4_perfmon_get_values *req = data;
+       struct vc4_perfmon *perfmon;
+       int ret = 0;
+
+       /* Shared lookup helper: returns the perfmon with a reference held so
+        * that it cannot be freed while its counters are being copied.
+        */
+       perfmon = vc4_perfmon_find(vc4file, req->id);
+       if (!perfmon)
+               return -EINVAL;
+
+       if (copy_to_user(u64_to_user_ptr(req->values_ptr), perfmon->counters,
+                        perfmon->ncounters * sizeof(u64)))
+               ret = -EFAULT;
+
+       vc4_perfmon_put(perfmon);
+       return ret;
+}
diff --git a/drivers/gpu/drm/vc4/vc4_regs.h b/drivers/gpu/drm/vc4/vc4_regs.h

index 55677bd50f6610795e3d04bfdeaa245cf9aad0cb..b9749cb240638ddce98a656409c6b6ffb3e44582 100644 (file)
--- a/drivers/gpu/drm/vc4/vc4_regs.h
+++ b/drivers/gpu/drm/vc4/vc4_regs.h
@@ -122,38 +122,9 @@
  #define V3D_VPMBASE  0x00504
  #define V3D_PCTRC    0x00670
  #define V3D_PCTRE    0x00674
-#define V3D_PCTR0    0x00680
-#define V3D_PCTRS0   0x00684
-#define V3D_PCTR1    0x00688
-#define V3D_PCTRS1   0x0068c
-#define V3D_PCTR2    0x00690
-#define V3D_PCTRS2   0x00694
-#define V3D_PCTR3    0x00698
-#define V3D_PCTRS3   0x0069c
-#define V3D_PCTR4    0x006a0
-#define V3D_PCTRS4   0x006a4
-#define V3D_PCTR5    0x006a8
-#define V3D_PCTRS5   0x006ac
-#define V3D_PCTR6    0x006b0
-#define V3D_PCTRS6   0x006b4
-#define V3D_PCTR7    0x006b8
-#define V3D_PCTRS7   0x006bc
-#define V3D_PCTR8    0x006c0
-#define V3D_PCTRS8   0x006c4
-#define V3D_PCTR9    0x006c8
-#define V3D_PCTRS9   0x006cc
-#define V3D_PCTR10   0x006d0
-#define V3D_PCTRS10  0x006d4
-#define V3D_PCTR11   0x006d8
-#define V3D_PCTRS11  0x006dc
-#define V3D_PCTR12   0x006e0
-#define V3D_PCTRS12  0x006e4
-#define V3D_PCTR13   0x006e8
-#define V3D_PCTRS13  0x006ec
-#define V3D_PCTR14   0x006f0
-#define V3D_PCTRS14  0x006f4
-#define V3D_PCTR15   0x006f8
-#define V3D_PCTRS15  0x006fc
+/* Flag OR'ed with the per-counter bit mask in V3D_PCTRE when a perfmon is
+ * started.
+ */
+# define V3D_PCTRE_EN  BIT(31)
+/* Per-counter registers, one pair every 8 bytes: V3D_PCTR(x) is read to get
+ * the count of counter x, V3D_PCTRS(x) is written with the event it counts.
+ */
+#define V3D_PCTR(x)  (0x00680 + ((x) * 8))
+#define V3D_PCTRS(x) (0x00684 + ((x) * 8))
  #define V3D_DBGE     0x00f00
  #define V3D_FDBGO    0x00f04
  #define V3D_FDBGB    0x00f08
diff --git a/drivers/gpu/drm/vc4/vc4_v3d.c b/drivers/gpu/drm/vc4/vc4_v3d.c

index 493f392b3a0a90e70820d3582c0847ebe3cc0bd7..bfc2fa73d2ae949446e64e4c555248ada1ec6fde 100644 (file)
--- a/drivers/gpu/drm/vc4/vc4_v3d.c
+++ b/drivers/gpu/drm/vc4/vc4_v3d.c
@@ -68,38 +68,38 @@ static const struct {
         REGDEF(V3D_VPMBASE),
         REGDEF(V3D_PCTRC),
         REGDEF(V3D_PCTRE),
-       REGDEF(V3D_PCTR0),
-       REGDEF(V3D_PCTRS0),
-       REGDEF(V3D_PCTR1),
-       REGDEF(V3D_PCTRS1),
-       REGDEF(V3D_PCTR2),
-       REGDEF(V3D_PCTRS2),
-       REGDEF(V3D_PCTR3),
-       REGDEF(V3D_PCTRS3),
-       REGDEF(V3D_PCTR4),
-       REGDEF(V3D_PCTRS4),
-       REGDEF(V3D_PCTR5),
-       REGDEF(V3D_PCTRS5),
-       REGDEF(V3D_PCTR6),
-       REGDEF(V3D_PCTRS6),
-       REGDEF(V3D_PCTR7),
-       REGDEF(V3D_PCTRS7),
-       REGDEF(V3D_PCTR8),
-       REGDEF(V3D_PCTRS8),
-       REGDEF(V3D_PCTR9),
-       REGDEF(V3D_PCTRS9),
-       REGDEF(V3D_PCTR10),
-       REGDEF(V3D_PCTRS10),
-       REGDEF(V3D_PCTR11),
-       REGDEF(V3D_PCTRS11),
-       REGDEF(V3D_PCTR12),
-       REGDEF(V3D_PCTRS12),
-       REGDEF(V3D_PCTR13),
-       REGDEF(V3D_PCTRS13),
-       REGDEF(V3D_PCTR14),
-       REGDEF(V3D_PCTRS14),
-       REGDEF(V3D_PCTR15),
-       REGDEF(V3D_PCTRS15),
+       REGDEF(V3D_PCTR(0)),
+       REGDEF(V3D_PCTRS(0)),
+       REGDEF(V3D_PCTR(1)),
+       REGDEF(V3D_PCTRS(1)),
+       REGDEF(V3D_PCTR(2)),
+       REGDEF(V3D_PCTRS(2)),
+       REGDEF(V3D_PCTR(3)),
+       REGDEF(V3D_PCTRS(3)),
+       REGDEF(V3D_PCTR(4)),
+       REGDEF(V3D_PCTRS(4)),
+       REGDEF(V3D_PCTR(5)),
+       REGDEF(V3D_PCTRS(5)),
+       REGDEF(V3D_PCTR(6)),
+       REGDEF(V3D_PCTRS(6)),
+       REGDEF(V3D_PCTR(7)),
+       REGDEF(V3D_PCTRS(7)),
+       REGDEF(V3D_PCTR(8)),
+       REGDEF(V3D_PCTRS(8)),
+       REGDEF(V3D_PCTR(9)),
+       REGDEF(V3D_PCTRS(9)),
+       REGDEF(V3D_PCTR(10)),
+       REGDEF(V3D_PCTRS(10)),
+       REGDEF(V3D_PCTR(11)),
+       REGDEF(V3D_PCTRS(11)),
+       REGDEF(V3D_PCTR(12)),
+       REGDEF(V3D_PCTRS(12)),
+       REGDEF(V3D_PCTR(13)),
+       REGDEF(V3D_PCTRS(13)),
+       REGDEF(V3D_PCTR(14)),
+       REGDEF(V3D_PCTRS(14)),
+       REGDEF(V3D_PCTR(15)),
+       REGDEF(V3D_PCTRS(15)),
         REGDEF(V3D_DBGE),
         REGDEF(V3D_FDBGO),
         REGDEF(V3D_FDBGB),
diff --git a/include/uapi/drm/vc4_drm.h b/include/uapi/drm/vc4_drm.h

index 52263b575bdc4e5d9a6c6e628b12918eba32d918..b95a0e11cb070663d593df2e05fc9e76d3e15598 100644 (file)
--- a/include/uapi/drm/vc4_drm.h
+++ b/include/uapi/drm/vc4_drm.h
@@ -42,6 +42,9 @@ extern "C" {
  #define DRM_VC4_GET_TILING                        0x09
  #define DRM_VC4_LABEL_BO                          0x0a
  #define DRM_VC4_GEM_MADVISE                       0x0b
+#define DRM_VC4_PERFMON_CREATE                    0x0c
+#define DRM_VC4_PERFMON_DESTROY                   0x0d
+#define DRM_VC4_PERFMON_GET_VALUES                0x0e
  
  #define DRM_IOCTL_VC4_SUBMIT_CL           DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_SUBMIT_CL, struct drm_vc4_submit_cl)
  #define DRM_IOCTL_VC4_WAIT_SEQNO          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_WAIT_SEQNO, struct drm_vc4_wait_seqno)
@@ -55,6 +58,9 @@ extern "C" {
  #define DRM_IOCTL_VC4_GET_TILING          DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GET_TILING, struct drm_vc4_get_tiling)
  #define DRM_IOCTL_VC4_LABEL_BO            DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_LABEL_BO, struct drm_vc4_label_bo)
  #define DRM_IOCTL_VC4_GEM_MADVISE         DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_GEM_MADVISE, struct drm_vc4_gem_madvise)
+#define DRM_IOCTL_VC4_PERFMON_CREATE      DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_PERFMON_CREATE, struct drm_vc4_perfmon_create)
+#define DRM_IOCTL_VC4_PERFMON_DESTROY     DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_PERFMON_DESTROY, struct drm_vc4_perfmon_destroy)
+#define DRM_IOCTL_VC4_PERFMON_GET_VALUES  DRM_IOWR(DRM_COMMAND_BASE + DRM_VC4_PERFMON_GET_VALUES, struct drm_vc4_perfmon_get_values)
  
  struct drm_vc4_submit_rcl_surface {
         __u32 hindex; /* Handle index, or ~0 if not present. */
@@ -173,6 +179,15 @@ struct drm_vc4_submit_cl {
          * wait ioctl).
          */
         __u64 seqno;
+
+       /* ID of the perfmon to attach to this job. 0 means no perfmon. */
+       __u32 perfmonid;
+
+       /* Unused field to align this struct on 64 bits. Must be set to 0.
+        * If one ever needs to add an u32 field to this struct, this field
+        * can be used.
+        */
+       __u32 pad2;
  };
  
  /**
@@ -308,6 +323,7 @@ struct drm_vc4_get_hang_state {
  #define DRM_VC4_PARAM_SUPPORTS_THREADED_FS     5
  #define DRM_VC4_PARAM_SUPPORTS_FIXED_RCL_ORDER 6
  #define DRM_VC4_PARAM_SUPPORTS_MADVISE         7
+#define DRM_VC4_PARAM_SUPPORTS_PERFMON         8
  
  struct drm_vc4_get_param {
         __u32 param;
@@ -352,6 +368,66 @@ struct drm_vc4_gem_madvise {
         __u32 pad;
  };
  
+/* Events that can be assigned to a performance counter. These are the values
+ * accepted in drm_vc4_perfmon_create.events[]; VC4_PERFCNT_NUM_EVENTS is not
+ * an event but the exclusive upper bound used to validate them.
+ */
+enum {
+       VC4_PERFCNT_FEP_VALID_PRIMS_NO_RENDER,
+       VC4_PERFCNT_FEP_VALID_PRIMS_RENDER,
+       VC4_PERFCNT_FEP_CLIPPED_QUADS,
+       VC4_PERFCNT_FEP_VALID_QUADS,
+       VC4_PERFCNT_TLB_QUADS_NOT_PASSING_STENCIL,
+       VC4_PERFCNT_TLB_QUADS_NOT_PASSING_Z_AND_STENCIL,
+       VC4_PERFCNT_TLB_QUADS_PASSING_Z_AND_STENCIL,
+       VC4_PERFCNT_TLB_QUADS_ZERO_COVERAGE,
+       VC4_PERFCNT_TLB_QUADS_NON_ZERO_COVERAGE,
+       VC4_PERFCNT_TLB_QUADS_WRITTEN_TO_COLOR_BUF,
+       VC4_PERFCNT_PLB_PRIMS_OUTSIDE_VIEWPORT,
+       VC4_PERFCNT_PLB_PRIMS_NEED_CLIPPING,
+       VC4_PERFCNT_PSE_PRIMS_REVERSED,
+       VC4_PERFCNT_QPU_TOTAL_IDLE_CYCLES,
+       VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_VERTEX_COORD_SHADING,
+       VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_FRAGMENT_SHADING,
+       VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_EXEC_VALID_INST,
+       VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_TMUS,
+       VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_SCOREBOARD,
+       VC4_PERFCNT_QPU_TOTAL_CLK_CYCLES_WAITING_VARYINGS,
+       VC4_PERFCNT_QPU_TOTAL_INST_CACHE_HIT,
+       VC4_PERFCNT_QPU_TOTAL_INST_CACHE_MISS,
+       VC4_PERFCNT_QPU_TOTAL_UNIFORM_CACHE_HIT,
+       VC4_PERFCNT_QPU_TOTAL_UNIFORM_CACHE_MISS,
+       VC4_PERFCNT_TMU_TOTAL_TEXT_QUADS_PROCESSED,
+       VC4_PERFCNT_TMU_TOTAL_TEXT_CACHE_MISS,
+       VC4_PERFCNT_VPM_TOTAL_CLK_CYCLES_VDW_STALLED,
+       VC4_PERFCNT_VPM_TOTAL_CLK_CYCLES_VCD_STALLED,
+       VC4_PERFCNT_L2C_TOTAL_L2_CACHE_HIT,
+       VC4_PERFCNT_L2C_TOTAL_L2_CACHE_MISS,
+       VC4_PERFCNT_NUM_EVENTS,
+};
+
+/* Maximum number of counters a single perfmon can use. */
+#define DRM_VC4_MAX_PERF_COUNTERS      16
+
+/* Argument of DRM_IOCTL_VC4_PERFMON_CREATE. */
+struct drm_vc4_perfmon_create {
+       /* Out: ID of the newly created perfmon (never 0). */
+       __u32 id;
+       /* In: number of valid entries in events[], from 1 to
+        * DRM_VC4_MAX_PERF_COUNTERS.
+        */
+       __u32 ncounters;
+       /* In: VC4_PERFCNT_* event to count on each counter. */
+       __u8 events[DRM_VC4_MAX_PERF_COUNTERS];
+};
+
+/* Argument of DRM_IOCTL_VC4_PERFMON_DESTROY. */
+struct drm_vc4_perfmon_destroy {
+       /* In: ID returned by DRM_IOCTL_VC4_PERFMON_CREATE. */
+       __u32 id;
+};
+
+/*
+ * Returns the values of the performance counters tracked by this
+ * perfmon (as an array of ncounters u64 values).
+ *
+ * No implicit synchronization is performed, so the user has to
+ * guarantee that any jobs using this perfmon have already been
+ * completed (probably by blocking on the seqno returned by the
+ * last exec that used the perfmon).
+ */
+struct drm_vc4_perfmon_get_values {
+       /* In: ID returned by DRM_IOCTL_VC4_PERFMON_CREATE. */
+       __u32 id;
+       /* In: user pointer to a buffer of at least ncounters u64 entries.
+        * NOTE(review): there is no explicit padding between id and this
+        * field; confirm the layout is the same on every supported ABI.
+        */
+       __u64 values_ptr;
+};
+
  #if defined(__cplusplus)
  }
  #endif
author	Boris Brezillon <boris.brezillon@free-electrons.com>
	Fri, 12 Jan 2018 09:09:26 +0000 (10:09 +0100)
committer	Eric Anholt <eric@anholt.net>
	Sat, 10 Feb 2018 22:23:26 +0000 (22:23 +0000)
drivers/gpu/drm/vc4/Makefile		patch \| blob \| blame \| history
drivers/gpu/drm/vc4/vc4_drv.c		patch \| blob \| blame \| history
drivers/gpu/drm/vc4/vc4_drv.h		patch \| blob \| blame \| history
drivers/gpu/drm/vc4/vc4_gem.c		patch \| blob \| blame \| history
drivers/gpu/drm/vc4/vc4_irq.c		patch \| blob \| blame \| history
drivers/gpu/drm/vc4/vc4_perfmon.c	[new file with mode: 0644]	patch \| blob
drivers/gpu/drm/vc4/vc4_regs.h		patch \| blob \| blame \| history
drivers/gpu/drm/vc4/vc4_v3d.c		patch \| blob \| blame \| history
include/uapi/drm/vc4_drm.h		patch \| blob \| blame \| history