git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/commitdiff
net: hns3: add handling of RDMA RAS errors
author: Shiju Jose <shiju.jose@huawei.com>
Fri, 7 Dec 2018 21:08:11 +0000 (21:08 +0000)
committer: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
Mon, 14 Jan 2019 09:28:55 +0000 (09:28 +0000)
BugLink: https://bugs.launchpad.net/bugs/1810457
This patch handles the RDMA RAS errors.
1. Enable RAS interrupt, print error detail info and clear error status.
2. Do a CORE reset to recover when these non-fatal errors happen.

Signed-off-by: Xiaofei Tan <tanxiaofei@huawei.com>
Signed-off-by: Shiju Jose <shiju.jose@huawei.com>
Signed-off-by: Salil Mehta <salil.mehta@huawei.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
(cherry picked from commit 630ba007f4750722bc56dfebfaf1c0316c2fcb69)
Signed-off-by: dann frazier <dann.frazier@canonical.com>
Acked-by: Stefan Bader <stefan.bader@canonical.com>
Acked-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
Signed-off-by: Kleber Sacilotto de Souza <kleber.souza@canonical.com>
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.h
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.c
drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_err.h

index fff1e7268e90932030a0a5e5c7dd5671244a5672..a61f0bdc0fbad7b94888b2ef9b1f078e50d79c8c 100644 (file)
@@ -227,6 +227,9 @@ enum hclge_opcode_type {
        HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513,
        HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT      = 0x1514,
        HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT       = 0x1515,
+       HCLGE_CONFIG_ROCEE_RAS_INT_EN   = 0x1580,
+       HCLGE_QUERY_CLEAR_ROCEE_RAS_INT = 0x1581,
+       HCLGE_ROCEE_PF_RAS_INT_CMD      = 0x1584,
        HCLGE_IGU_EGU_TNL_INT_EN        = 0x1803,
        HCLGE_IGU_COMMON_INT_EN         = 0x1806,
        HCLGE_TM_QCN_MEM_INT_CFG        = 0x1A14,
index 81ad9b415556f5e09002fd6c7dd773c2c542fb20..77deea0beeba9e64c0ba003873844d08357e1ed8 100644 (file)
@@ -337,6 +337,30 @@ static const struct hclge_hw_error hclge_ssu_port_based_pf_int[] = {
        { /* sentinel */ }
 };
 
+static const struct hclge_hw_error hclge_rocee_qmm_ovf_err_int[] = {
+       { .int_msk = 0, .msg = "rocee qmm ovf: sgid invalid err" },
+       { .int_msk = 0x4, .msg = "rocee qmm ovf: sgid ovf err" },
+       { .int_msk = 0x8, .msg = "rocee qmm ovf: smac invalid err" },
+       { .int_msk = 0xC, .msg = "rocee qmm ovf: smac ovf err" },
+       { .int_msk = 0x10, .msg = "rocee qmm ovf: cqc invalid err" },
+       { .int_msk = 0x11, .msg = "rocee qmm ovf: cqc ovf err" },
+       { .int_msk = 0x12, .msg = "rocee qmm ovf: cqc hopnum err" },
+       { .int_msk = 0x13, .msg = "rocee qmm ovf: cqc ba0 err" },
+       { .int_msk = 0x14, .msg = "rocee qmm ovf: srqc invalid err" },
+       { .int_msk = 0x15, .msg = "rocee qmm ovf: srqc ovf err" },
+       { .int_msk = 0x16, .msg = "rocee qmm ovf: srqc hopnum err" },
+       { .int_msk = 0x17, .msg = "rocee qmm ovf: srqc ba0 err" },
+       { .int_msk = 0x18, .msg = "rocee qmm ovf: mpt invalid err" },
+       { .int_msk = 0x19, .msg = "rocee qmm ovf: mpt ovf err" },
+       { .int_msk = 0x1A, .msg = "rocee qmm ovf: mpt hopnum err" },
+       { .int_msk = 0x1B, .msg = "rocee qmm ovf: mpt ba0 err" },
+       { .int_msk = 0x1C, .msg = "rocee qmm ovf: qpc invalid err" },
+       { .int_msk = 0x1D, .msg = "rocee qmm ovf: qpc ovf err" },
+       { .int_msk = 0x1E, .msg = "rocee qmm ovf: qpc hopnum err" },
+       { .int_msk = 0x1F, .msg = "rocee qmm ovf: qpc ba0 err" },
+       { /* sentinel: its NULL .msg ends a walk that loops while .msg is set */ }
+};
+
 static void hclge_log_error(struct device *dev, char *reg,
                            const struct hclge_hw_error *err,
                            u32 err_sts)
@@ -1023,6 +1047,148 @@ static int hclge_handle_all_ras_errors(struct hclge_dev *hdev)
        return ret;
 }
 
+static int hclge_log_rocee_ovf_error(struct hclge_dev *hdev)
+{      /* Query ROCEE overflow status and warn about each error found. */
+       struct device *dev = &hdev->pdev->dev;
+       struct hclge_desc desc[2]; /* only desc[0] is read in this function */
+       int ret;
+
+       /* read overflow error status; a failed query is logged and returned */
+       ret = hclge_cmd_query_error(hdev, &desc[0],
+                                   HCLGE_ROCEE_PF_RAS_INT_CMD,
+                                   0, 0, 0);
+       if (ret) {
+               dev_err(dev, "failed(%d) to query ROCEE OVF error sts\n", ret);
+               return ret;
+       }
+
+       /* data[0]: QMM overflow, named by exact match on the type field */
+       if (le32_to_cpu(desc[0].data[0]) & HCLGE_ROCEE_OVF_ERR_INT_MASK) {
+               const struct hclge_hw_error *err;
+               u32 err_sts;
+
+               err = &hclge_rocee_qmm_ovf_err_int[0];
+               err_sts = HCLGE_ROCEE_OVF_ERR_TYPE_MASK &
+                         le32_to_cpu(desc[0].data[0]);
+               while (err->msg) {
+                       if (err->int_msk == err_sts) {
+                               dev_warn(dev, "%s [error status=0x%x] found\n",
+                                        err->msg,
+                                        le32_to_cpu(desc[0].data[0]));
+                               break; /* first match wins */
+                       }
+                       err++;
+               } /* a type with no table entry is not logged */
+       }
+
+       if (le32_to_cpu(desc[0].data[1]) & HCLGE_ROCEE_OVF_ERR_INT_MASK) {
+               dev_warn(dev, "ROCEE TSP OVF [error status=0x%x] found\n",
+                        le32_to_cpu(desc[0].data[1]));
+       }
+
+       if (le32_to_cpu(desc[0].data[2]) & HCLGE_ROCEE_OVF_ERR_INT_MASK) {
+               dev_warn(dev, "ROCEE SCC OVF [error status=0x%x] found\n",
+                        le32_to_cpu(desc[0].data[2]));
+       }
+
+       return 0; /* reached whenever the query itself succeeded */
+}
+
+static int hclge_log_and_clear_rocee_ras_error(struct hclge_dev *hdev)
+{      /* Log and clear ROCEE RAS errors; sets the default reset request. */
+       enum hnae3_reset_type reset_type = HNAE3_FUNC_RESET;
+       struct hnae3_ae_dev *ae_dev = hdev->ae_dev;
+       struct device *dev = &hdev->pdev->dev;
+       struct hclge_desc desc[2]; /* only desc[0] is used in this function */
+       unsigned int status;
+       int ret;
+
+       /* read RAS error interrupt status; data[0] carries the error bits */
+       ret = hclge_cmd_query_error(hdev, &desc[0],
+                                   HCLGE_QUERY_CLEAR_ROCEE_RAS_INT,
+                                   0, 0, 0);
+       if (ret) {
+               dev_err(dev, "failed(%d) to query ROCEE RAS INT SRC\n", ret);
+               /* reset everything for now */
+               HCLGE_SET_DEFAULT_RESET_REQUEST(HNAE3_GLOBAL_RESET);
+               return ret; /* the clear step below is skipped */
+       }
+
+       status = le32_to_cpu(desc[0].data[0]);
+
+       if (status & HCLGE_ROCEE_RERR_INT_MASK)
+               dev_warn(dev, "ROCEE RAS AXI rresp error\n");
+
+       if (status & HCLGE_ROCEE_BERR_INT_MASK)
+               dev_warn(dev, "ROCEE RAS AXI bresp error\n");
+
+       if (status & HCLGE_ROCEE_ECC_INT_MASK) {
+               dev_warn(dev, "ROCEE RAS 2bit ECC error\n");
+               reset_type = HNAE3_GLOBAL_RESET; /* escalate reset */
+       }
+
+       if (status & HCLGE_ROCEE_OVF_INT_MASK) {
+               ret = hclge_log_rocee_ovf_error(hdev);
+               if (ret) {
+                       dev_err(dev, "failed(%d) to process ovf error\n", ret);
+                       /* reset everything for now */
+                       HCLGE_SET_DEFAULT_RESET_REQUEST(HNAE3_GLOBAL_RESET);
+                       return ret; /* the clear step below is skipped */
+               }
+       }
+
+       /* clear error status by reusing and resending the queried descriptor */
+       hclge_cmd_reuse_desc(&desc[0], false);
+       ret = hclge_cmd_send(&hdev->hw, &desc[0], 1);
+       if (ret) {
+               dev_err(dev, "failed(%d) to clear ROCEE RAS error\n", ret);
+               /* reset everything for now */
+               reset_type = HNAE3_GLOBAL_RESET;
+       }
+
+       HCLGE_SET_DEFAULT_RESET_REQUEST(reset_type);
+
+       return ret; /* status of the clear command */
+}
+
+static int hclge_config_rocee_ras_interrupt(struct hclge_dev *hdev, bool en)
+{      /* Enable (en) or disable the ROCEE RAS error interrupts. */
+       struct device *dev = &hdev->pdev->dev;
+       struct hclge_desc desc;
+       int ret;
+
+       if (hdev->pdev->revision < 0x21 || !hnae3_dev_roce_supported(hdev))
+               return 0; /* old revision or no RoCE: no-op */
+
+       hclge_cmd_setup_basic_desc(&desc, HCLGE_CONFIG_ROCEE_RAS_INT_EN, false);
+       if (en) { /* else data[0]/data[1] keep their setup values */
+               /* enable ROCEE hw error interrupts */
+               desc.data[0] = cpu_to_le32(HCLGE_ROCEE_RAS_NFE_INT_EN);
+               desc.data[1] = cpu_to_le32(HCLGE_ROCEE_RAS_CE_INT_EN);
+
+               hclge_log_and_clear_rocee_ras_error(hdev); /* result ignored */
+       }
+       desc.data[2] = cpu_to_le32(HCLGE_ROCEE_RAS_NFE_INT_EN_MASK);
+       desc.data[3] = cpu_to_le32(HCLGE_ROCEE_RAS_CE_INT_EN_MASK);
+
+       ret = hclge_cmd_send(&hdev->hw, &desc, 1);
+       if (ret)
+               dev_err(dev, "failed(%d) to config ROCEE RAS interrupt\n", ret);
+
+       return ret; /* status of the config command */
+}
+
+static int hclge_handle_rocee_ras_error(struct hnae3_ae_dev *ae_dev)
+{      /* Process ROCEE RAS errors unless a reset is already being handled. */
+       struct hclge_dev *hdev = ae_dev->priv;
+
+       if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) ||
+           hdev->pdev->revision < 0x21)
+               return HNAE3_NONE_RESET; /* NOTE(review): enum, not errno */
+
+       return hclge_log_and_clear_rocee_ras_error(hdev); /* command status */
+}
+
 static const struct hclge_hw_blk hw_blk[] = {
        {
          .msk = BIT(0), .name = "IGU_EGU",
@@ -1058,6 +1224,7 @@ static const struct hclge_hw_blk hw_blk[] = {
 int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state)
 {
        const struct hclge_hw_blk *module = hw_blk;
+       struct device *dev = &hdev->pdev->dev;
        int ret = 0;
 
        while (module->name) {
@@ -1069,6 +1236,10 @@ int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state)
                module++;
        }
 
+       ret = hclge_config_rocee_ras_interrupt(hdev, state);
+       if (ret)
+               dev_err(dev, "fail(%d) to configure ROCEE err int\n", ret);
+
        return ret;
 }
 
@@ -1086,9 +1257,21 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
                         "HNS Non-Fatal RAS error(status=0x%x) identified\n",
                         status);
                hclge_handle_all_ras_errors(hdev);
-               return PCI_ERS_RESULT_NEED_RESET;
+       } else {
+               if (test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state) ||
+                   hdev->pdev->revision < 0x21)
+                       return PCI_ERS_RESULT_RECOVERED;
+       }
+
+       if (status & HCLGE_RAS_REG_ROCEE_ERR_MASK) {
+               dev_warn(dev, "ROCEE uncorrected RAS error identified\n");
+               hclge_handle_rocee_ras_error(ae_dev);
        }
 
+       if (status & HCLGE_RAS_REG_NFE_MASK ||
+           status & HCLGE_RAS_REG_ROCEE_ERR_MASK)
+               return PCI_ERS_RESULT_NEED_RESET;
+
        return PCI_ERS_RESULT_RECOVERED;
 }
 
index d2077f2795fac57d088c7af01237c9aa2b64e3e4..51a7d4eb066a9ba12f501eb351fb2de5f66e10e6 100644 (file)
@@ -8,6 +8,7 @@
 
 #define HCLGE_RAS_PF_OTHER_INT_STS_REG   0x20B00
 #define HCLGE_RAS_REG_NFE_MASK   0xFF00
+#define HCLGE_RAS_REG_ROCEE_ERR_MASK   0x3000000
 
 #define HCLGE_VECTOR0_PF_OTHER_INT_STS_REG   0x20800
 #define HCLGE_VECTOR0_REG_MSIX_MASK   0x1FF00
 #define HCLGE_QCN_ECC_INT_MASK         GENMASK(21, 0)
 #define HCLGE_NCSI_ECC_INT_MASK                GENMASK(1, 0)
 
+#define HCLGE_ROCEE_RAS_NFE_INT_EN             0xF /* data[0] when enabling */
+#define HCLGE_ROCEE_RAS_CE_INT_EN              0x1 /* data[1] when enabling */
+#define HCLGE_ROCEE_RAS_NFE_INT_EN_MASK                0xF /* always data[2] */
+#define HCLGE_ROCEE_RAS_CE_INT_EN_MASK         0x1 /* always data[3] */
+#define HCLGE_ROCEE_RERR_INT_MASK              BIT(0) /* AXI rresp error */
+#define HCLGE_ROCEE_BERR_INT_MASK              BIT(1) /* AXI bresp error */
+#define HCLGE_ROCEE_ECC_INT_MASK               BIT(2) /* 2bit ECC error */
+#define HCLGE_ROCEE_OVF_INT_MASK               BIT(3) /* overflow error */
+#define HCLGE_ROCEE_OVF_ERR_INT_MASK           0x10000 /* overflow flagged */
+#define HCLGE_ROCEE_OVF_ERR_TYPE_MASK          0x3F /* overflow error type */
+
 enum hclge_err_int_type {
        HCLGE_ERR_INT_MSIX = 0,
        HCLGE_ERR_INT_RAS_CE = 1,