[PATCH] ppc64: avoid PCI error reporting for empty slots

[mirror_ubuntu-bionic-kernel.git] / arch / ppc64 / kernel / eeh.c
diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c

index 99f11b66b5a00f48a7c4dfb2b5a0e3a7d62aa1ce..0060934dffd208649b850badd8943e292a3cc15a 100644 (file)
--- a/arch/ppc64/kernel/eeh.c
+++ b/arch/ppc64/kernel/eeh.c
@@ -102,6 +102,10 @@ static DEFINE_SPINLOCK(slot_errbuf_lock);
  static int eeh_error_buf_size;
  
  /* System monitoring statistics */
+static DEFINE_PER_CPU(unsigned long, no_device);
+static DEFINE_PER_CPU(unsigned long, no_dn);
+static DEFINE_PER_CPU(unsigned long, no_cfg_addr);
+static DEFINE_PER_CPU(unsigned long, ignored_check);
  static DEFINE_PER_CPU(unsigned long, total_mmio_ffs);
  static DEFINE_PER_CPU(unsigned long, false_positives);
  static DEFINE_PER_CPU(unsigned long, ignored_failures);
@@ -393,6 +397,28 @@ void __init pci_addr_cache_build(void)
  /* --------------------------------------------------------------- */
  /* Above lies the PCI Address Cache. Below lies the EEH event infrastructure */
  
+void eeh_slot_error_detail (struct pci_dn *pdn, int severity)
+{
+       unsigned long flags;
+       int rc;
+
+       /* Under slot_errbuf_lock: zero slot_errbuf, have RTAS fill it with the slot error detail for pdn, and log it only if the call returns 0 */
+       spin_lock_irqsave(&slot_errbuf_lock, flags);
+       memset(slot_errbuf, 0, eeh_error_buf_size);
+
+       rc = rtas_call(ibm_slot_error_detail,
+                      8, 1, NULL, pdn->eeh_config_addr,
+                      BUID_HI(pdn->phb->buid),
+                      BUID_LO(pdn->phb->buid), NULL, 0,
+                      virt_to_phys(slot_errbuf),
+                      eeh_error_buf_size,
+                      severity);
+
+       if (rc == 0)
+               log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
+       spin_unlock_irqrestore(&slot_errbuf_lock, flags);
+}
+
  /**
   * eeh_register_notifier - Register to find out about EEH events.
   * @nb: notifier block to callback on events
@@ -450,9 +476,12 @@ static void eeh_panic(struct pci_dev *dev, int reset_state)
          * Since the panic_on_oops sysctl is used to halt the system
          * in light of potential corruption, we can use it here.
          */
-       if (panic_on_oops)
+       if (panic_on_oops) {
+               struct device_node *dn = pci_device_to_OF_node(dev);
+               eeh_slot_error_detail (PCI_DN(dn), 2 /* Permanent Error */);
                 panic("EEH: MMIO failure (%d) on device:%s\n", reset_state,
                       pci_name(dev));
+       }
         else {
                 __get_cpu_var(ignored_failures)++;
                 printk(KERN_INFO "EEH: Ignored MMIO failure (%d) on device:%s\n",
@@ -493,8 +522,6 @@ static void eeh_event_handler(void *dummy)
                 notifier_call_chain (&eeh_notifier_chain,
                                      EEH_NOTIFY_FREEZE, event);
  
-               __get_cpu_var(slot_resets)++;
-
                 pci_dev_put(event->dev);
                 kfree(event);
         }
@@ -537,7 +564,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
         int ret;
         int rets[3];
         unsigned long flags;
-       int rc, reset_state;
+       int reset_state;
         struct eeh_event  *event;
         struct pci_dn *pdn;
  
@@ -546,17 +573,24 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
         if (!eeh_subsystem_enabled)
                 return 0;
  
-       if (!dn)
+       if (!dn) {
+               __get_cpu_var(no_dn)++;
                 return 0;
+       }
         pdn = PCI_DN(dn);
  
         /* Access to IO BARs might get this far and still not want checking. */
         if (!pdn->eeh_capable || !(pdn->eeh_mode & EEH_MODE_SUPPORTED) ||
             pdn->eeh_mode & EEH_MODE_NOCHECK) {
+               __get_cpu_var(ignored_check)++;
+#ifdef DEBUG
+               printk ("EEH:ignored check for %s %s\n", pci_name (dev), dn->full_name);
+#endif
                 return 0;
         }
  
         if (!pdn->eeh_config_addr) {
+               __get_cpu_var(no_cfg_addr)++;
                 return 0;
         }
  
@@ -583,30 +617,43 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
          * In any case they must share a common PHB.
          */
         ret = read_slot_reset_state(pdn, rets);
-       if (!(ret == 0 && rets[1] == 1 && (rets[0] == 2 || rets[0] == 4))) {
+
+       /* If the call to firmware failed, punt */
+       if (ret != 0) {
+               printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",
+                      ret, dn->full_name);
+               __get_cpu_var(false_positives)++;
+               return 0;
+       }
+
+       /* If EEH is not supported on this device, punt. */
+       if (rets[1] != 1) {
+               printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",
+                      ret, dn->full_name);
+               __get_cpu_var(false_positives)++;
+               return 0;
+       }
+
+       /* If not the kind of error we know about, punt. */
+       if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {
+               __get_cpu_var(false_positives)++;
+               return 0;
+       }
+
+       /* Note that config-io to empty slots may fail;
+        * we recognize empty because they don't have children. */
+       if ((rets[0] == 5) && (dn->child == NULL)) {
                 __get_cpu_var(false_positives)++;
                 return 0;
         }
  
         /* prevent repeated reports of this failure */
         pdn->eeh_mode |= EEH_MODE_ISOLATED;
+        __get_cpu_var(slot_resets)++;
  
         reset_state = rets[0];
  
-       spin_lock_irqsave(&slot_errbuf_lock, flags);
-       memset(slot_errbuf, 0, eeh_error_buf_size);
-
-       rc = rtas_call(ibm_slot_error_detail,
-                      8, 1, NULL, pdn->eeh_config_addr,
-                      BUID_HI(pdn->phb->buid),
-                      BUID_LO(pdn->phb->buid), NULL, 0,
-                      virt_to_phys(slot_errbuf),
-                      eeh_error_buf_size,
-                      1 /* Temporary Error */);
-
-       if (rc == 0)
-               log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0);
-       spin_unlock_irqrestore(&slot_errbuf_lock, flags);
+       eeh_slot_error_detail (pdn, 1 /* Temporary Error */);
  
         printk(KERN_INFO "EEH: MMIO failure (%d) on device: %s %s\n",
                rets[0], dn->name, dn->full_name);
@@ -628,7 +675,7 @@ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
         /* Most EEH events are due to device driver bugs.  Having
          * a stack trace will help the device-driver authors figure
          * out what happened.  So print that out. */
-       dump_stack();
+       if (rets[0] != 5) dump_stack();
         schedule_work(&eeh_event_wq);
  
         return 0;
@@ -657,8 +704,10 @@ unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned lon
         /* Finding the phys addr + pci device; this is pretty quick. */
         addr = eeh_token_to_phys((unsigned long __force) token);
         dev = pci_get_device_by_addr(addr);
-       if (!dev)
+       if (!dev) {
+               __get_cpu_var(no_device)++;
                 return val;
+       }
  
         dn = pci_device_to_OF_node(dev);
         eeh_dn_check_failure (dn, dev);
@@ -771,6 +820,8 @@ void __init eeh_init(void)
         struct device_node *phb, *np;
         struct eeh_early_enable_info info;
  
+       spin_lock_init(&slot_errbuf_lock);
+
         np = of_find_node_by_path("/rtas");
         if (np == NULL)
                 return;
@@ -903,12 +954,17 @@ static int proc_eeh_show(struct seq_file *m, void *v)
         unsigned int cpu;
         unsigned long ffs = 0, positives = 0, failures = 0;
         unsigned long resets = 0;
+       unsigned long no_dev = 0, no_dn = 0, no_cfg = 0, no_check = 0;
  
         for_each_cpu(cpu) {
                 ffs += per_cpu(total_mmio_ffs, cpu);
                 positives += per_cpu(false_positives, cpu);
                 failures += per_cpu(ignored_failures, cpu);
                 resets += per_cpu(slot_resets, cpu);
+               no_dev += per_cpu(no_device, cpu);
+               no_dn += per_cpu(no_dn, cpu);
+               no_cfg += per_cpu(no_cfg_addr, cpu);
+               no_check += per_cpu(ignored_check, cpu);
         }
  
         if (0 == eeh_subsystem_enabled) {
@@ -916,13 +972,17 @@ static int proc_eeh_show(struct seq_file *m, void *v)
                 seq_printf(m, "eeh_total_mmio_ffs=%ld\n", ffs);
         } else {
                 seq_printf(m, "EEH Subsystem is enabled\n");
-               seq_printf(m, "eeh_total_mmio_ffs=%ld\n"
-                          "eeh_false_positives=%ld\n"
-                          "eeh_ignored_failures=%ld\n"
-                          "eeh_slot_resets=%ld\n"
-                               "eeh_fail_count=%d\n",
-                          ffs, positives, failures, resets,
-                               eeh_fail_count.counter);
+               seq_printf(m,
+                               "no device=%ld\n"
+                               "no device node=%ld\n"
+                               "no config address=%ld\n"
+                               "check not wanted=%ld\n"
+                               "eeh_total_mmio_ffs=%ld\n"
+                               "eeh_false_positives=%ld\n"
+                               "eeh_ignored_failures=%ld\n"
+                               "eeh_slot_resets=%ld\n",
+                               no_dev, no_dn, no_cfg, no_check,
+                               ffs, positives, failures, resets);
         }
  
         return 0;