git.proxmox.com Git - mirror_ubuntu-kernels.git/commitdiff
bnxt_en: implement hw health reporter
author Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Sat, 5 Mar 2022 08:54:41 +0000 (03:54 -0500)
committer David S. Miller <davem@davemloft.net>
Sat, 5 Mar 2022 11:16:56 +0000 (11:16 +0000)
This reporter will report NVM errors which are non-fatal.
When we receive these NVM error events, we'll report them
through this new hw health reporter.

Reviewed-by: Edwin Peer <edwin.peer@broadcom.com>
Signed-off-by: Kalesh AP <kalesh-anakkur.purayil@broadcom.com>
Signed-off-by: Michael Chan <michael.chan@broadcom.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/broadcom/bnxt/bnxt.c
drivers/net/ethernet/broadcom/bnxt/bnxt.h
drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c
drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h

index 2de02950086fbf4bbcdb792cf6370f632c5859cc..63b8fc4f9d42748985563431f8fef47644018dc0 100644 (file)
@@ -2061,6 +2061,22 @@ static void bnxt_event_error_report(struct bnxt *bp, u32 data1, u32 data2)
        case ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_DOORBELL_DROP_THRESHOLD:
                netdev_warn(bp->dev, "One or more MMIO doorbells dropped by the device!\n");
                break;
+       case ASYNC_EVENT_CMPL_ERROR_REPORT_BASE_EVENT_DATA1_ERROR_TYPE_NVM: {
+               struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+               hw_health->nvm_err_address = EVENT_DATA2_NVM_ERR_ADDR(data2);
+               if (EVENT_DATA1_NVM_ERR_TYPE_WRITE(data1)) {
+                       hw_health->synd = BNXT_HW_STATUS_NVM_WRITE_ERR;
+                       hw_health->nvm_write_errors++;
+               } else if (EVENT_DATA1_NVM_ERR_TYPE_ERASE(data1)) {
+                       hw_health->synd = BNXT_HW_STATUS_NVM_ERASE_ERR;
+                       hw_health->nvm_erase_errors++;
+               } else {
+                       hw_health->synd = BNXT_HW_STATUS_NVM_UNKNOWN_ERR;
+               }
+               set_bit(BNXT_FW_NVM_ERR_SP_EVENT, &bp->sp_event);
+               break;
+       }
        default:
                netdev_err(bp->dev, "FW reported unknown error type %u\n",
                           err_type);
@@ -11887,6 +11903,9 @@ static void bnxt_sp_task(struct work_struct *work)
        if (test_and_clear_bit(BNXT_FW_ECHO_REQUEST_SP_EVENT, &bp->sp_event))
                bnxt_fw_echo_reply(bp);
 
+       if (test_and_clear_bit(BNXT_FW_NVM_ERR_SP_EVENT, &bp->sp_event))
+               bnxt_devlink_health_hw_report(bp);
+
        /* These functions below will clear BNXT_STATE_IN_SP_TASK.  They
         * must be the last functions to be called before exiting.
         */
index 447a9406b8a27137ef97214506fefa45abd3f54e..fa0df43ddc1a8fc50bb9911a3b22bf6f1cc04b30 100644 (file)
@@ -516,6 +516,21 @@ struct rx_tpa_end_cmp_ext {
          ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA2_PIN_ID_MASK) >>\
         ASYNC_EVENT_CMPL_ERROR_REPORT_INVALID_SIGNAL_EVENT_DATA2_PIN_ID_SFT)
 
+#define EVENT_DATA2_NVM_ERR_ADDR(data2)                                        \
+       (((data2) &                                                     \
+         ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA2_ERR_ADDR_MASK) >>\
+        ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA2_ERR_ADDR_SFT)
+
+#define EVENT_DATA1_NVM_ERR_TYPE_WRITE(data1)                          \
+       (((data1) &                                                     \
+         ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_MASK) ==\
+        ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_WRITE)
+
+#define EVENT_DATA1_NVM_ERR_TYPE_ERASE(data1)                          \
+       (((data1) &                                                     \
+         ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_MASK) ==\
+        ASYNC_EVENT_CMPL_ERROR_REPORT_NVM_EVENT_DATA1_NVM_ERR_TYPE_ERASE)
+
 struct nqe_cn {
        __le16  type;
        #define NQ_CN_TYPE_MASK           0x3fUL
@@ -1528,6 +1543,21 @@ struct bnxt_ctx_mem_info {
        struct bnxt_mem_init    mem_init[BNXT_CTX_MEM_INIT_MAX];
 };
 
+enum bnxt_hw_err {
+       BNXT_HW_STATUS_HEALTHY          = 0x0,
+       BNXT_HW_STATUS_NVM_WRITE_ERR    = 0x1,
+       BNXT_HW_STATUS_NVM_ERASE_ERR    = 0x2,
+       BNXT_HW_STATUS_NVM_UNKNOWN_ERR  = 0x3,
+};
+
+struct bnxt_hw_health {
+       u32 nvm_err_address;
+       u32 nvm_write_errors;
+       u32 nvm_erase_errors;
+       u8 synd;
+       struct devlink_health_reporter *hw_reporter;
+};
+
 enum bnxt_health_severity {
        SEVERITY_NORMAL = 0,
        SEVERITY_WARNING,
@@ -2045,6 +2075,7 @@ struct bnxt {
 #define BNXT_FW_EXCEPTION_SP_EVENT     19
 #define BNXT_LINK_CFG_CHANGE_SP_EVENT  21
 #define BNXT_FW_ECHO_REQUEST_SP_EVENT  23
+#define BNXT_FW_NVM_ERR_SP_EVENT       25
 
        struct delayed_work     fw_reset_task;
        int                     fw_reset_state;
@@ -2145,6 +2176,8 @@ struct bnxt {
        struct dentry           *debugfs_pdev;
        struct device           *hwmon_dev;
        enum board_idx          board_idx;
+
+       struct bnxt_hw_health   hw_health;
 };
 
 #define BNXT_NUM_RX_RING_STATS                 8
index 0c17f90d44a25e5ac4030d7f89fffa1c5669569e..a802bbda1c275d5c6c7552913f30960bebe760e0 100644 (file)
@@ -241,6 +241,69 @@ static const struct devlink_health_reporter_ops bnxt_dl_fw_reporter_ops = {
        .recover = bnxt_fw_recover,
 };
 
+static int bnxt_hw_recover(struct devlink_health_reporter *reporter,
+                          void *priv_ctx,
+                          struct netlink_ext_ack *extack)
+{
+       struct bnxt *bp = devlink_health_reporter_priv(reporter);
+       struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+       hw_health->synd = BNXT_HW_STATUS_HEALTHY;
+       return 0;
+}
+
+static const char *hw_err_str(u8 synd)
+{
+       switch (synd) {
+       case BNXT_HW_STATUS_HEALTHY:
+               return "healthy";
+       case BNXT_HW_STATUS_NVM_WRITE_ERR:
+               return "nvm write error";
+       case BNXT_HW_STATUS_NVM_ERASE_ERR:
+               return "nvm erase error";
+       case BNXT_HW_STATUS_NVM_UNKNOWN_ERR:
+               return "unrecognized nvm error";
+       default:
+               return "unknown hw error";
+       }
+}
+
+static int bnxt_hw_diagnose(struct devlink_health_reporter *reporter,
+                           struct devlink_fmsg *fmsg,
+                           struct netlink_ext_ack *extack)
+{
+       struct bnxt *bp = devlink_health_reporter_priv(reporter);
+       struct bnxt_hw_health *h = &bp->hw_health;
+       int rc;
+
+       rc = devlink_fmsg_string_pair_put(fmsg, "Status", hw_err_str(h->synd));
+       if (rc)
+               return rc;
+       rc = devlink_fmsg_u32_pair_put(fmsg, "nvm_write_errors", h->nvm_write_errors);
+       if (rc)
+               return rc;
+       rc = devlink_fmsg_u32_pair_put(fmsg, "nvm_erase_errors", h->nvm_erase_errors);
+       if (rc)
+               return rc;
+       return 0;
+}
+
+void bnxt_devlink_health_hw_report(struct bnxt *bp)
+{
+       struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+       netdev_warn(bp->dev, "%s reported at address 0x%x\n", hw_err_str(hw_health->synd),
+                   hw_health->nvm_err_address);
+
+       devlink_health_report(hw_health->hw_reporter, hw_err_str(hw_health->synd), NULL);
+}
+
+static const struct devlink_health_reporter_ops bnxt_dl_hw_reporter_ops = {
+       .name = "hw",
+       .diagnose = bnxt_hw_diagnose,
+       .recover = bnxt_hw_recover,
+};
+
 static struct devlink_health_reporter *
 __bnxt_dl_reporter_create(struct bnxt *bp,
                          const struct devlink_health_reporter_ops *ops)
@@ -260,6 +323,10 @@ __bnxt_dl_reporter_create(struct bnxt *bp,
 void bnxt_dl_fw_reporters_create(struct bnxt *bp)
 {
        struct bnxt_fw_health *fw_health = bp->fw_health;
+       struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+       if (!hw_health->hw_reporter)
+               hw_health->hw_reporter = __bnxt_dl_reporter_create(bp, &bnxt_dl_hw_reporter_ops);
 
        if (fw_health && !fw_health->fw_reporter)
                fw_health->fw_reporter = __bnxt_dl_reporter_create(bp, &bnxt_dl_fw_reporter_ops);
@@ -268,6 +335,12 @@ void bnxt_dl_fw_reporters_create(struct bnxt *bp)
 void bnxt_dl_fw_reporters_destroy(struct bnxt *bp)
 {
        struct bnxt_fw_health *fw_health = bp->fw_health;
+       struct bnxt_hw_health *hw_health = &bp->hw_health;
+
+       if (hw_health->hw_reporter) {
+               devlink_health_reporter_destroy(hw_health->hw_reporter);
+               hw_health->hw_reporter = NULL;
+       }
 
        if (fw_health && fw_health->fw_reporter) {
                devlink_health_reporter_destroy(fw_health->fw_reporter);
index b8105065367b6029d03863e3c3be83637906e2d4..056962e4b177701b2f6588f78408c73f1c4ff2ec 100644 (file)
@@ -74,6 +74,7 @@ enum bnxt_dl_version_type {
 void bnxt_devlink_health_fw_report(struct bnxt *bp);
 void bnxt_dl_health_fw_status_update(struct bnxt *bp, bool healthy);
 void bnxt_dl_health_fw_recovery_done(struct bnxt *bp);
+void bnxt_devlink_health_hw_report(struct bnxt *bp);
 void bnxt_dl_fw_reporters_create(struct bnxt *bp);
 void bnxt_dl_fw_reporters_destroy(struct bnxt *bp);
 int bnxt_dl_register(struct bnxt *bp);