]> git.proxmox.com Git - mirror_ubuntu-jammy-kernel.git/commitdiff
mlxsw: core: Add validation of transceiver temperature thresholds
authorVadim Pasternak <vadimp@nvidia.com>
Fri, 8 Jan 2021 14:52:09 +0000 (16:52 +0200)
committerJakub Kicinski <kuba@kernel.org>
Sun, 10 Jan 2021 00:25:10 +0000 (16:25 -0800)
Validate thresholds to avoid a single failure due to some transceiver
unreliability. Ignore the last readouts in case warning temperature is
above alarm temperature, since it can cause unexpected thermal
shutdown. Stay with the previous values and refresh threshold within
the next iteration.

This is a rare scenario, but it was observed at a customer site.

Fixes: 6a79507cfe94 ("mlxsw: core: Extend thermal module with per QSFP module thermal zones")
Signed-off-by: Vadim Pasternak <vadimp@nvidia.com>
Reviewed-by: Jiri Pirko <jiri@nvidia.com>
Signed-off-by: Ido Schimmel <idosch@nvidia.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
drivers/net/ethernet/mellanox/mlxsw/core_thermal.c

index 8fa286ccdd6bb281a834bc135593487878930403..250a850496977eb8e523e018d63f4a440167e2f5 100644 (file)
@@ -176,6 +176,12 @@ mlxsw_thermal_module_trips_update(struct device *dev, struct mlxsw_core *core,
        if (err)
                return err;
 
+       if (crit_temp > emerg_temp) {
+               dev_warn(dev, "%s : Critical threshold %d is above emergency threshold %d\n",
+                        tz->tzdev->type, crit_temp, emerg_temp);
+               return 0;
+       }
+
        /* According to the system thermal requirements, the thermal zones are
         * defined with four trip points. The critical and emergency
         * temperature thresholds, provided by QSFP module are set as "active"
@@ -190,11 +196,8 @@ mlxsw_thermal_module_trips_update(struct device *dev, struct mlxsw_core *core,
                tz->trips[MLXSW_THERMAL_TEMP_TRIP_NORM].temp = crit_temp;
        tz->trips[MLXSW_THERMAL_TEMP_TRIP_HIGH].temp = crit_temp;
        tz->trips[MLXSW_THERMAL_TEMP_TRIP_HOT].temp = emerg_temp;
-       if (emerg_temp > crit_temp)
-               tz->trips[MLXSW_THERMAL_TEMP_TRIP_CRIT].temp = emerg_temp +
+       tz->trips[MLXSW_THERMAL_TEMP_TRIP_CRIT].temp = emerg_temp +
                                        MLXSW_THERMAL_MODULE_TEMP_SHIFT;
-       else
-               tz->trips[MLXSW_THERMAL_TEMP_TRIP_CRIT].temp = emerg_temp;
 
        return 0;
 }