drm/amd/pm: avoid unintentional shutdown due to temperature momentary fluctuation

author Evan Quan <evan.quan@amd.com>

Fri, 11 Aug 2023 16:40:31 +0000 (11:40 -0500)

committer Stefan Bader <stefan.bader@canonical.com>

Mon, 30 Oct 2023 11:00:02 +0000 (12:00 +0100)
author Evan Quan <evan.quan@amd.com>
Fri, 11 Aug 2023 16:40:31 +0000 (11:40 -0500)
committer Stefan Bader <stefan.bader@canonical.com>
Mon, 30 Oct 2023 11:00:02 +0000 (12:00 +0100)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index a8fe506e43d05f404b2d8426800380859d559d62..d16864e2b935765229b471498bd049d3ddebd302 100644 (file)
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -285,6 +285,9 @@ extern int amdgpu_sg_display;
  #define AMDGPU_SMARTSHIFT_MAX_BIAS (100)
  #define AMDGPU_SMARTSHIFT_MIN_BIAS (-100)
  
+/* Extra delay (in ms) used to filter out momentary temperature fluctuations */
+#define AMDGPU_SWCTF_EXTRA_DELAY               50
+
  struct amdgpu_device;
  struct amdgpu_irq_src;
  struct amdgpu_fpriv;
diff --git a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c

index 8f2cc6310340e7b7d70d0e0f092447c34f899875..4779be55f4f9fd5b41ff72b5fdaa60df33f6a031 100644 (file)
--- a/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c
@@ -26,6 +26,7 @@
  #include <linux/gfp.h>
  #include <linux/slab.h>
  #include <linux/firmware.h>
+#include <linux/reboot.h>
  #include "amd_shared.h"
  #include "amd_powerplay.h"
  #include "power_state.h"
@@ -91,6 +92,45 @@ static int pp_early_init(void *handle)
         return 0;
  }
  
+static void pp_swctf_delayed_work_handler(struct work_struct *work)
+{
+       struct pp_hwmgr *hwmgr =
+               container_of(work, struct pp_hwmgr, swctf_delayed_work.work);
+       struct amdgpu_device *adev = hwmgr->adev;
+       struct amdgpu_dpm_thermal *range =
+                               &adev->pm.dpm.thermal;
+       uint32_t gpu_temperature, size = sizeof(gpu_temperature);
+       int ret;
+
+       /*
+        * Runs after the SW CTF delay. If the hotspot/edge temperature now reads
+        * below the SW CTF threshold, do nothing; otherwise shut down gracefully.
+        * "size" carries the buffer size in case a read_sensor callback checks it.
+        */
+       if (range->sw_ctf_threshold &&
+           hwmgr->hwmgr_func->read_sensor) {
+               ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
+                                                    AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
+                                                    &gpu_temperature,
+                                                    &size);
+               /*
+                * For some legacy ASICs, retrieving the hotspot temperature might
+                * not be supported. Check the edge temperature instead in that case.
+                */
+               if (ret == -EOPNOTSUPP)
+                       ret = hwmgr->hwmgr_func->read_sensor(hwmgr,
+                                                            AMDGPU_PP_SENSOR_EDGE_TEMP,
+                                                            &gpu_temperature,
+                                                            &size);
+               if (!ret && gpu_temperature / 1000 < range->sw_ctf_threshold)
+                       return;
+       }
+
+       dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
+       dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
+       orderly_poweroff(true);
+}
+
  static int pp_sw_init(void *handle)
  {
         struct amdgpu_device *adev = handle;
@@ -101,6 +141,10 @@ static int pp_sw_init(void *handle)
  
         pr_debug("powerplay sw init %s\n", ret ? "failed" : "successfully");
  
+       if (!ret)
+               INIT_DELAYED_WORK(&hwmgr->swctf_delayed_work,
+                                 pp_swctf_delayed_work_handler);
+
         return ret;
  }
  
@@ -136,6 +180,8 @@ static int pp_hw_fini(void *handle)
         struct amdgpu_device *adev = handle;
         struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
  
+       cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
+
         hwmgr_hw_fini(hwmgr);
  
         return 0;
@@ -222,6 +268,8 @@ static int pp_suspend(void *handle)
         struct amdgpu_device *adev = handle;
         struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
  
+       cancel_delayed_work_sync(&hwmgr->swctf_delayed_work);
+
         return hwmgr_suspend(hwmgr);
  }
  
diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c

index bfe80ac0ad8c8751dbad4c3cf7818c17030c43a2..d0b1ab6c452312720d1aa600e7d23e1ffa7175d3 100644 (file)
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c
@@ -603,21 +603,17 @@ int phm_irq_process(struct amdgpu_device *adev,
                            struct amdgpu_irq_src *source,
                            struct amdgpu_iv_entry *entry)
  {
+       struct pp_hwmgr *hwmgr = adev->powerplay.pp_handle;
         uint32_t client_id = entry->client_id;
         uint32_t src_id = entry->src_id;
  
         if (client_id == AMDGPU_IRQ_CLIENTID_LEGACY) {
                 if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_LOW_TO_HIGH) {
-                       dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
-                       /*
-                        * SW CTF just occurred.
-                        * Try to do a graceful shutdown to prevent further damage.
-                        */
-                       dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
-                       orderly_poweroff(true);
-               } else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW)
+                       schedule_delayed_work(&hwmgr->swctf_delayed_work,
+                                             msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
+               } else if (src_id == VISLANDS30_IV_SRCID_CG_TSS_THERMAL_HIGH_TO_LOW) {
                         dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
-               else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
+               } else if (src_id == VISLANDS30_IV_SRCID_GPIO_19) {
                         dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");
                         /*
                          * HW CTF just occurred. Shutdown to prevent further damage.
@@ -626,15 +622,10 @@ int phm_irq_process(struct amdgpu_device *adev,
                         orderly_poweroff(true);
                 }
         } else if (client_id == SOC15_IH_CLIENTID_THM) {
-               if (src_id == 0) {
-                       dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
-                       /*
-                        * SW CTF just occurred.
-                        * Try to do a graceful shutdown to prevent further damage.
-                        */
-                       dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
-                       orderly_poweroff(true);
-               } else
+               if (src_id == 0)
+                       schedule_delayed_work(&hwmgr->swctf_delayed_work,
+                                             msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
+               else
                         dev_emerg(adev->dev, "ERROR: GPU under temperature range detected!\n");
         } else if (client_id == SOC15_IH_CLIENTID_ROM_SMUIO) {
                 dev_emerg(adev->dev, "ERROR: GPU HW Critical Temperature Fault(aka CTF) detected!\n");
diff --git a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h

index 5ce433e2c16a569b89e7a3a2bb17c223d3254105..ec10643edea3e31e9be0f60801cc0523e9f34841 100644 (file)
--- a/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
+++ b/drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h
@@ -811,6 +811,8 @@ struct pp_hwmgr {
         bool gfxoff_state_changed_by_workload;
         uint32_t pstate_sclk_peak;
         uint32_t pstate_mclk_peak;
+
+       struct delayed_work swctf_delayed_work;
  };
  
  int hwmgr_early_init(struct pp_hwmgr *hwmgr);
diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c

index 33f1972d3ef7db779c5efd9f3ce73970b8900b28..e1f7f20414527b75a76255fe2ffce54e196908c7 100644 (file)
--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c
@@ -24,6 +24,7 @@
  
  #include <linux/firmware.h>
  #include <linux/pci.h>
+#include <linux/reboot.h>
  
  #include "amdgpu.h"
  #include "amdgpu_smu.h"
@@ -1061,6 +1062,34 @@ static void smu_interrupt_work_fn(struct work_struct *work)
                 smu->ppt_funcs->interrupt_work(smu);
  }
  
+static void smu_swctf_delayed_work_handler(struct work_struct *work)
+{
+       struct smu_context *smu =
+               container_of(work, struct smu_context, swctf_delayed_work.work);
+       struct smu_temperature_range *range =
+                               &smu->thermal_range;
+       struct amdgpu_device *adev = smu->adev;
+       uint32_t hotspot_tmp, size = sizeof(hotspot_tmp);
+
+       /*
+        * Runs after the SW CTF delay. If the hotspot temperature now reads below
+        * the software shutdown threshold, do nothing. In every other case (no
+        * threshold, no read_sensor, read failure, still hot) shut down gracefully.
+        */
+       if (range->software_shutdown_temp &&
+           smu->ppt_funcs->read_sensor &&
+           !smu->ppt_funcs->read_sensor(smu,
+                                        AMDGPU_PP_SENSOR_HOTSPOT_TEMP,
+                                        &hotspot_tmp,
+                                        &size) &&
+           hotspot_tmp / 1000 < range->software_shutdown_temp)
+               return;
+
+       dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
+       dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
+       orderly_poweroff(true);
+}
+
  static int smu_sw_init(void *handle)
  {
         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -1109,6 +1138,9 @@ static int smu_sw_init(void *handle)
                 return ret;
         }
  
+       INIT_DELAYED_WORK(&smu->swctf_delayed_work,
+                         smu_swctf_delayed_work_handler);
+
         ret = smu_smc_table_sw_init(smu);
         if (ret) {
                 dev_err(adev->dev, "Failed to sw init smc table!\n");
@@ -1582,6 +1614,8 @@ static int smu_smc_hw_cleanup(struct smu_context *smu)
                 return ret;
         }
  
+       cancel_delayed_work_sync(&smu->swctf_delayed_work);
+
         ret = smu_disable_dpms(smu);
         if (ret) {
                 dev_err(adev->dev, "Fail to disable dpm features!\n");
diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h

index 3bc4128a22ac2d6b4c411d1e6bdca12091309a83..1ab77a6cdb65365747ad7fc6dcde8f6c30df93a0 100644 (file)
--- a/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
+++ b/drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h
@@ -573,6 +573,8 @@ struct smu_context
         u32 debug_param_reg;
         u32 debug_msg_reg;
         u32 debug_resp_reg;
+
+       struct delayed_work             swctf_delayed_work;
  };
  
  struct i2c_adapter;
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c

index ad66d57aa102e523f002956dff5b9df4568381c7..a48a09e188bedc920a72d0558f5d83b202357437 100644 (file)
--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c
@@ -1449,13 +1449,8 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
         if (client_id == SOC15_IH_CLIENTID_THM) {
                 switch (src_id) {
                 case THM_11_0__SRCID__THM_DIG_THERM_L2H:
-                       dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
-                       /*
-                        * SW CTF just occurred.
-                        * Try to do a graceful shutdown to prevent further damage.
-                        */
-                       dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
-                       orderly_poweroff(true);
+                       schedule_delayed_work(&smu->swctf_delayed_work,
+                                             msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
                 break;
                 case THM_11_0__SRCID__THM_DIG_THERM_H2L:
                         dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");
diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c

index 47fafb1fa60886c712be879ed05b0f5209ebd1bd..3104d49379090e2aac9b466da94a2260cfadbaf3 100644 (file)
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c
@@ -1386,13 +1386,8 @@ static int smu_v13_0_irq_process(struct amdgpu_device *adev,
         if (client_id == SOC15_IH_CLIENTID_THM) {
                 switch (src_id) {
                 case THM_11_0__SRCID__THM_DIG_THERM_L2H:
-                       dev_emerg(adev->dev, "ERROR: GPU over temperature range(SW CTF) detected!\n");
-                       /*
-                        * SW CTF just occurred.
-                        * Try to do a graceful shutdown to prevent further damage.
-                        */
-                       dev_emerg(adev->dev, "ERROR: System is going to shutdown due to GPU SW CTF!\n");
-                       orderly_poweroff(true);
+                       schedule_delayed_work(&smu->swctf_delayed_work,
+                                             msecs_to_jiffies(AMDGPU_SWCTF_EXTRA_DELAY));
                         break;
                 case THM_11_0__SRCID__THM_DIG_THERM_H2L:
                         dev_emerg(adev->dev, "ERROR: GPU under temperature range detected\n");
author	Evan Quan <evan.quan@amd.com>
	Fri, 11 Aug 2023 16:40:31 +0000 (11:40 -0500)
committer	Stefan Bader <stefan.bader@canonical.com>
	Mon, 30 Oct 2023 11:00:02 +0000 (12:00 +0100)
drivers/gpu/drm/amd/amdgpu/amdgpu.h		patch \| blob \| blame \| history
drivers/gpu/drm/amd/pm/powerplay/amd_powerplay.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu_helper.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/pm/powerplay/inc/hwmgr.h		patch \| blob \| blame \| history
drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/pm/swsmu/inc/amdgpu_smu.h		patch \| blob \| blame \| history
drivers/gpu/drm/amd/pm/swsmu/smu11/smu_v11_0.c		patch \| blob \| blame \| history
drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c		patch \| blob \| blame \| history