git.proxmox.com Git - mirror_ubuntu-bionic-kernel.git/blobdiff - arch/ppc64/kernel/eeh.c
[PATCH] ppc64: avoid PCI error reporting for empty slots
[mirror_ubuntu-bionic-kernel.git] / arch / ppc64 / kernel / eeh.c
index 99f11b66b5a00f48a7c4dfb2b5a0e3a7d62aa1ce..0060934dffd208649b850badd8943e292a3cc15a 100644 (file)
@@ -102,6 +102,10 @@ static DEFINE_SPINLOCK(slot_errbuf_lock);
 static int eeh_error_buf_size;
 
 /* System monitoring statistics */
+static DEFINE_PER_CPU(unsigned long, no_device);
+static DEFINE_PER_CPU(unsigned long, no_dn);
+static DEFINE_PER_CPU(unsigned long, no_cfg_addr);
+static DEFINE_PER_CPU(unsigned long, ignored_check);
 static DEFINE_PER_CPU(unsigned long, total_mmio_ffs);
 static DEFINE_PER_CPU(unsigned long, false_positives);
 static DEFINE_PER_CPU(unsigned long, ignored_failures);
@@ -393,6 +397,28 @@ void __init pci_addr_cache_build(void)
 /* --------------------------------------------------------------- */
 /* Above lies the PCI Address Cache. Below lies the EEH event infrastructure */
 
+void eeh_slot_error_detail (struct pci_dn *pdn, int severity)
+{
+       unsigned long flags;    /* IRQ state saved by spin_lock_irqsave() */
+       int rc;                 /* rtas_call() status; 0 triggers the log below */
+
+       /* Log the error with the rtas logger, under slot_errbuf_lock */
+       spin_lock_irqsave(&slot_errbuf_lock, flags);
+       memset(slot_errbuf, 0, eeh_error_buf_size);     /* clear stale contents */
+
+       rc = rtas_call(ibm_slot_error_detail,
+                      8, 1, NULL, pdn->eeh_config_addr,
+                      BUID_HI(pdn->phb->buid),
+                      BUID_LO(pdn->phb->buid), NULL, 0,
+                      virt_to_phys(slot_errbuf),       /* buffer passed by physical address */
+                      eeh_error_buf_size,
+                      severity);       /* visible callers pass 1 (Temporary) or 2 (Permanent) */
+
+       if (rc == 0)    /* log the buffer only when rtas_call() returned 0 */
+               log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
+       spin_unlock_irqrestore(&slot_errbuf_lock, flags);
+}
+
 /**
  * eeh_register_notifier - Register to find out about EEH events.
  * @nb: notifier block to callback on events
@@ -450,9 +476,12 @@ static void eeh_panic(struct pci_dev *dev, int reset_state)
         * Since the panic_on_oops sysctl is used to halt the system
         * in light of potential corruption, we can use it here.
         */
-       if (panic_on_oops)
+       if (panic_on_oops) {
+               struct device_node *dn = pci_device_to_OF_node(dev);
+               eeh_slot_error_detail (PCI_DN(dn), 2 /* Permanent Error */);
                panic("EEH: MMIO failure (%d) on device:%s\n", reset_state,
                      pci_name(dev));
+       }
        else {
                __get_cpu_var(ignored_failures)++;
                printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n",
@@ -493,8 +522,6 @@ static void eeh_event_handler(void *dummy)
                notifier_call_chain (&eeh_notifier_chain,
                                     EEH_NOTIFY_FREEZE, event);
 
-               __get_cpu_var(slot_resets)++;
-
                pci_dev_put(event->dev);
                kfree(event);
        }
@@ -537,7 +564,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
        int ret;
        int rets[3];
        unsigned long flags;
-       int rc, reset_state;
+       int reset_state;
        struct eeh_event  *event;
        struct pci_dn *pdn;
 
@@ -546,17 +573,24 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
        if (!eeh_subsystem_enabled)
                return 0;
 
-       if (!dn)
+       if (!dn) {
+               __get_cpu_var(no_dn)++;
                return 0;
+       }
        pdn = PCI_DN(dn);
 
        /* Access to IO BARs might get this far and still not want checking. */
        if (!pdn->eeh_capable || !(pdn->eeh_mode & EEH_MODE_SUPPORTED) ||
            pdn->eeh_mode & EEH_MODE_NOCHECK) {
+               __get_cpu_var(ignored_check)++;
+#ifdef DEBUG
+               printk ("EEH:ignored check for %s %s\n", pci_name (dev), dn->full_name);
+#endif
                return 0;
        }
 
        if (!pdn->eeh_config_addr) {
+               __get_cpu_var(no_cfg_addr)++;
                return 0;
        }
 
@@ -583,30 +617,43 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
         * In any case they must share a common PHB.
         */
        ret = read_slot_reset_state(pdn, rets);
-       if (!(ret == 0 && rets[1] == 1 && (rets[0] == 2 || rets[0] == 4))) {
+
+       /* If the call to firmware failed, punt */
+       if (ret != 0) {
+               printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",
+                      ret, dn->full_name);
+               __get_cpu_var(false_positives)++;
+               return 0;
+       }
+
+       /* If EEH is not supported on this device, punt. */
+       if (rets[1] != 1) {
+               printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",
+                      ret, dn->full_name);
+               __get_cpu_var(false_positives)++;
+               return 0;
+       }
+
+       /* If not the kind of error we know about, punt. */
+       if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {
+               __get_cpu_var(false_positives)++;
+               return 0;
+       }
+
+       /* Note that config-io to empty slots may fail;
+        * we recognize empty because they don't have children. */
+       if ((rets[0] == 5) && (dn->child == NULL)) {
                __get_cpu_var(false_positives)++;
                return 0;
        }
 
        /* prevent repeated reports of this failure */
        pdn->eeh_mode |= EEH_MODE_ISOLATED;
+        __get_cpu_var(slot_resets)++;
 
        reset_state = rets[0];
 
-       spin_lock_irqsave(&slot_errbuf_lock, flags);
-       memset(slot_errbuf, 0, eeh_error_buf_size);
-
-       rc = rtas_call(ibm_slot_error_detail,
-                      8, 1, NULL, pdn->eeh_config_addr,
-                      BUID_HI(pdn->phb->buid),
-                      BUID_LO(pdn->phb->buid), NULL, 0,
-                      virt_to_phys(slot_errbuf),
-                      eeh_error_buf_size,
-                      1 /* Temporary Error */);
-
-       if (rc == 0)
-               log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
-       spin_unlock_irqrestore(&slot_errbuf_lock, flags);
+       eeh_slot_error_detail (pdn, 1 /* Temporary Error */);
 
        printk(KERN_INFO "EEH: MMIO failure (%d) on device: %s %s\n",
               rets[0], dn->name, dn->full_name);
@@ -628,7 +675,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
        /* Most EEH events are due to device driver bugs.  Having
         * a stack trace will help the device-driver authors figure
         * out what happened.  So print that out. */
-       dump_stack();
+       if (rets[0] != 5) dump_stack();
        schedule_work(&eeh_event_wq);
 
        return 0;
@@ -657,8 +704,10 @@ unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned lon
        /* Finding the phys addr + pci device; this is pretty quick. */
        addr = eeh_token_to_phys((unsigned long __force) token);
        dev = pci_get_device_by_addr(addr);
-       if (!dev)
+       if (!dev) {
+               __get_cpu_var(no_device)++;
                return val;
+       }
 
        dn = pci_device_to_OF_node(dev);
        eeh_dn_check_failure (dn, dev);
@@ -771,6 +820,8 @@ void __init eeh_init(void)
        struct device_node *phb, *np;
        struct eeh_early_enable_info info;
 
+       spin_lock_init(&slot_errbuf_lock);
+
        np = of_find_node_by_path("/rtas");
        if (np == NULL)
                return;
@@ -903,12 +954,17 @@ static int proc_eeh_show(struct seq_file *m, void *v)
        unsigned int cpu;
        unsigned long ffs = 0, positives = 0, failures = 0;
        unsigned long resets = 0;
+       unsigned long no_dev = 0, no_dn = 0, no_cfg = 0, no_check = 0;
 
        for_each_cpu(cpu) {
                ffs += per_cpu(total_mmio_ffs, cpu);
                positives += per_cpu(false_positives, cpu);
                failures += per_cpu(ignored_failures, cpu);
                resets += per_cpu(slot_resets, cpu);
+               no_dev += per_cpu(no_device, cpu);
+               no_dn += per_cpu(no_dn, cpu);
+               no_cfg += per_cpu(no_cfg_addr, cpu);
+               no_check += per_cpu(ignored_check, cpu);
        }
 
        if (0 == eeh_subsystem_enabled) {
@@ -916,13 +972,17 @@ static int proc_eeh_show(struct seq_file *m, void *v)
                seq_printf(m, "eeh_total_mmio_ffs=%ld\n", ffs);
        } else {
                seq_printf(m, "EEH Subsystem is enabled\n");
-               seq_printf(m, "eeh_total_mmio_ffs=%ld\n"
-                          "eeh_false_positives=%ld\n"
-                          "eeh_ignored_failures=%ld\n"
-                          "eeh_slot_resets=%ld\n"
-                               "eeh_fail_count=%d\n",
-                          ffs, positives, failures, resets,
-                               eeh_fail_count.counter);
+               seq_printf(m,
+                               "no device=%ld\n"
+                               "no device node=%ld\n"
+                               "no config address=%ld\n"
+                               "check not wanted=%ld\n"
+                               "eeh_total_mmio_ffs=%ld\n"
+                               "eeh_false_positives=%ld\n"
+                               "eeh_ignored_failures=%ld\n"
+                               "eeh_slot_resets=%ld\n",
+                               no_dev, no_dn, no_cfg, no_check,
+                               ffs, positives, failures, resets);
        }
 
        return 0;