]>
Commit | Line | Data |
---|---|---|
bbbd7f11 | 1 | // SPDX-License-Identifier: GPL-2.0-or-later |
77bd7415 LV |
2 | /* |
3 | * PCI Error Recovery Driver for RPA-compliant PPC64 platform. | |
3c8c90ab LV |
4 | * Copyright IBM Corp. 2004 2005 |
5 | * Copyright Linas Vepstas <linas@linas.org> 2004, 2005 | |
77bd7415 | 6 | * |
3c8c90ab | 7 | * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> |
77bd7415 LV |
8 | */ |
9 | #include <linux/delay.h> | |
77bd7415 | 10 | #include <linux/interrupt.h> |
ac325acd | 11 | #include <linux/irq.h> |
feadf7c0 | 12 | #include <linux/module.h> |
77bd7415 | 13 | #include <linux/pci.h> |
b104af5a | 14 | #include <linux/pci_hotplug.h> |
77bd7415 LV |
15 | #include <asm/eeh.h> |
16 | #include <asm/eeh_event.h> | |
17 | #include <asm/ppc-pci.h> | |
18 | #include <asm/pci-bridge.h> | |
19 | #include <asm/prom.h> | |
20 | #include <asm/rtas.h> | |
21 | ||
67086e32 | 22 | struct eeh_rmv_data { |
1c5c533b SB |
23 | struct list_head removed_vf_list; |
24 | int removed_dev_count; | |
67086e32 WY |
25 | }; |
26 | ||
30424e38 SB |
27 | static int eeh_result_priority(enum pci_ers_result result) |
28 | { | |
29 | switch (result) { | |
30 | case PCI_ERS_RESULT_NONE: | |
31 | return 1; | |
32 | case PCI_ERS_RESULT_NO_AER_DRIVER: | |
33 | return 2; | |
34 | case PCI_ERS_RESULT_RECOVERED: | |
35 | return 3; | |
36 | case PCI_ERS_RESULT_CAN_RECOVER: | |
37 | return 4; | |
38 | case PCI_ERS_RESULT_DISCONNECT: | |
39 | return 5; | |
40 | case PCI_ERS_RESULT_NEED_RESET: | |
41 | return 6; | |
42 | default: | |
43 | WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", (int)result); | |
44 | return 0; | |
45 | } | |
46 | }; | |
47 | ||
c36c5ffd | 48 | static const char *pci_ers_result_name(enum pci_ers_result result) |
20b34497 SB |
49 | { |
50 | switch (result) { | |
51 | case PCI_ERS_RESULT_NONE: | |
52 | return "none"; | |
53 | case PCI_ERS_RESULT_CAN_RECOVER: | |
54 | return "can recover"; | |
55 | case PCI_ERS_RESULT_NEED_RESET: | |
56 | return "need reset"; | |
57 | case PCI_ERS_RESULT_DISCONNECT: | |
58 | return "disconnect"; | |
59 | case PCI_ERS_RESULT_RECOVERED: | |
60 | return "recovered"; | |
61 | case PCI_ERS_RESULT_NO_AER_DRIVER: | |
62 | return "no AER driver"; | |
63 | default: | |
64 | WARN_ONCE(1, "Unknown result type: %d\n", (int)result); | |
65 | return "unknown"; | |
66 | } | |
67 | }; | |
68 | ||
30424e38 SB |
69 | static enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old, |
70 | enum pci_ers_result new) | |
71 | { | |
72 | if (eeh_result_priority(new) > eeh_result_priority(old)) | |
73 | return new; | |
74 | return old; | |
75 | } | |
76 | ||
e2b810d5 SB |
77 | static bool eeh_dev_removed(struct eeh_dev *edev) |
78 | { | |
79 | return !edev || (edev->mode & EEH_DEV_REMOVED); | |
80 | } | |
81 | ||
82 | static bool eeh_edev_actionable(struct eeh_dev *edev) | |
83 | { | |
38ddc011 OH |
84 | if (!edev->pdev) |
85 | return false; | |
86 | if (edev->pdev->error_state == pci_channel_io_perm_failure) | |
87 | return false; | |
88 | if (eeh_dev_removed(edev)) | |
89 | return false; | |
90 | if (eeh_pe_passed(edev->pe)) | |
91 | return false; | |
92 | ||
93 | return true; | |
e2b810d5 SB |
94 | } |
95 | ||
feadf7c0 GS |
96 | /** |
97 | * eeh_pcid_get - Get the PCI device driver | |
98 | * @pdev: PCI device | |
99 | * | |
100 | * The function is used to retrieve the PCI device driver for | |
101 | * the indicated PCI device. Besides, we will increase the reference | |
102 | * of the PCI device driver to prevent that being unloaded on | |
103 | * the fly. Otherwise, kernel crash would be seen. | |
104 | */ | |
105 | static inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) | |
106 | { | |
107 | if (!pdev || !pdev->driver) | |
108 | return NULL; | |
109 | ||
110 | if (!try_module_get(pdev->driver->driver.owner)) | |
111 | return NULL; | |
112 | ||
113 | return pdev->driver; | |
114 | } | |
115 | ||
116 | /** | |
117 | * eeh_pcid_put - Drop the reference on the PCI device driver | |
118 | * @pdev: PCI device | |
119 | * | |
120 | * The function is called to drop the module reference held on the | |
121 | * driver of the indicated PCI device. | |
122 | */ | |
123 | static inline void eeh_pcid_put(struct pci_dev *pdev) | |
124 | { | |
125 | if (!pdev || !pdev->driver) | |
126 | return; | |
127 | ||
128 | module_put(pdev->driver->driver.owner); | |
129 | } | |
130 | ||
8535ef05 | 131 | /** |
29f8bf1b GS |
132 | * eeh_disable_irq - Disable interrupt for the recovering device |
133 | * @edev: EEH device | |
134 | * | |
135 | * This routine must be called when reporting temporary or permanent | |
136 | * error to the particular PCI device to disable interrupt of that | |
137 | * device. If the device has enabled MSI or MSI-X interrupt, we needn't | |
138 | * do real work because EEH should freeze DMA transfers for those PCI | |
139 | * devices encountering EEH errors, which includes MSI or MSI-X. | |
8535ef05 | 140 | */ |
010acfa1 | 141 | static void eeh_disable_irq(struct eeh_dev *edev) |
8535ef05 | 142 | { |
8535ef05 MM |
143 | /* Don't disable MSI and MSI-X interrupts. They are |
144 | * effectively disabled by the DMA Stopped state | |
145 | * when an EEH error occurs. | |
29f8bf1b | 146 | */ |
010acfa1 | 147 | if (edev->pdev->msi_enabled || edev->pdev->msix_enabled) |
8535ef05 MM |
148 | return; |
149 | ||
010acfa1 | 150 | if (!irq_has_action(edev->pdev->irq)) |
8535ef05 MM |
151 | return; |
152 | ||
dbbceee1 | 153 | edev->mode |= EEH_DEV_IRQ_DISABLED; |
010acfa1 | 154 | disable_irq_nosync(edev->pdev->irq); |
8535ef05 MM |
155 | } |
156 | ||
157 | /** | |
29f8bf1b GS |
158 | * eeh_enable_irq - Enable interrupt for the recovering device |
159 | * @edev: EEH device | |
160 | * | |
161 | * This routine must be called to enable interrupt while failed | |
162 | * device could be resumed. | |
8535ef05 | 163 | */ |
010acfa1 | 164 | static void eeh_enable_irq(struct eeh_dev *edev) |
8535ef05 | 165 | { |
dbbceee1 GS |
166 | if ((edev->mode) & EEH_DEV_IRQ_DISABLED) { |
167 | edev->mode &= ~EEH_DEV_IRQ_DISABLED; | |
b8a9a11b TG |
168 | /* |
169 | * FIXME !!!!! | |
170 | * | |
171 | * This is just ass backwards. This maze has | |
172 | * unbalanced irq_enable/disable calls. So instead of | |
173 | * finding the root cause it works around the warning | |
174 | * in the irq_enable code by conditionally calling | |
175 | * into it. | |
176 | * | |
177 | * That's just wrong. The warning in the core code is | |
027dfac6 | 178 | * there to tell people to fix their asymmetries in |
b8a9a11b TG |
179 | * their own code, not by abusing the core information |
180 | * to avoid it. | |
181 | * | |
182 | * I so wish that the asymmetry would be the other way | |
183 | * round and a few more irq_disable calls render that | |
184 | * shit unusable forever. | |
185 | * | |
186 | * tglx | |
187 | */ | |
010acfa1 SB |
188 | if (irqd_irq_disabled(irq_get_irq_data(edev->pdev->irq))) |
189 | enable_irq(edev->pdev->irq); | |
57310c3c | 190 | } |
8535ef05 MM |
191 | } |
192 | ||
cef50c67 | 193 | static void eeh_dev_save_state(struct eeh_dev *edev, void *userdata) |
5cfb20b9 | 194 | { |
5cfb20b9 GS |
195 | struct pci_dev *pdev; |
196 | ||
197 | if (!edev) | |
cef50c67 | 198 | return; |
5cfb20b9 | 199 | |
5a0cdbfd GS |
200 | /* |
201 | * We cannot access the config space on some adapters. | |
202 | * Otherwise, it will cause fenced PHB. We don't save | |
203 | * the content in their config space and will restore | |
204 | * from the initial config space saved when the EEH | |
205 | * device is created. | |
206 | */ | |
207 | if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) | |
cef50c67 | 208 | return; |
5a0cdbfd | 209 | |
5cfb20b9 GS |
210 | pdev = eeh_dev_to_pci_dev(edev); |
211 | if (!pdev) | |
cef50c67 | 212 | return; |
5cfb20b9 GS |
213 | |
214 | pci_save_state(pdev); | |
5cfb20b9 GS |
215 | } |
216 | ||
47cc8c1c SB |
217 | static void eeh_set_channel_state(struct eeh_pe *root, enum pci_channel_state s) |
218 | { | |
219 | struct eeh_pe *pe; | |
220 | struct eeh_dev *edev, *tmp; | |
221 | ||
222 | eeh_for_each_pe(root, pe) | |
223 | eeh_pe_for_each_dev(pe, edev, tmp) | |
224 | if (eeh_edev_actionable(edev)) | |
225 | edev->pdev->error_state = s; | |
226 | } | |
227 | ||
010acfa1 SB |
228 | static void eeh_set_irq_state(struct eeh_pe *root, bool enable) |
229 | { | |
230 | struct eeh_pe *pe; | |
231 | struct eeh_dev *edev, *tmp; | |
232 | ||
233 | eeh_for_each_pe(root, pe) { | |
234 | eeh_pe_for_each_dev(pe, edev, tmp) { | |
235 | if (!eeh_edev_actionable(edev)) | |
236 | continue; | |
237 | ||
238 | if (!eeh_pcid_get(edev->pdev)) | |
239 | continue; | |
240 | ||
241 | if (enable) | |
242 | eeh_enable_irq(edev); | |
243 | else | |
244 | eeh_disable_irq(edev); | |
245 | ||
246 | eeh_pcid_put(edev->pdev); | |
247 | } | |
248 | } | |
249 | } | |
250 | ||
20b34497 | 251 | typedef enum pci_ers_result (*eeh_report_fn)(struct eeh_dev *, |
2e255051 | 252 | struct pci_dev *, |
20b34497 SB |
253 | struct pci_driver *); |
254 | static void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, | |
255 | enum pci_ers_result *result) | |
77bd7415 | 256 | { |
2e255051 | 257 | struct pci_dev *pdev; |
feadf7c0 | 258 | struct pci_driver *driver; |
20b34497 SB |
259 | enum pci_ers_result new_result; |
260 | ||
2e255051 SB |
261 | pci_lock_rescan_remove(); |
262 | pdev = edev->pdev; | |
263 | if (pdev) | |
264 | get_device(&pdev->dev); | |
265 | pci_unlock_rescan_remove(); | |
266 | if (!pdev) { | |
bcbe3730 SB |
267 | eeh_edev_info(edev, "no device"); |
268 | return; | |
269 | } | |
2e255051 | 270 | device_lock(&pdev->dev); |
20b34497 | 271 | if (eeh_edev_actionable(edev)) { |
2e255051 | 272 | driver = eeh_pcid_get(pdev); |
20b34497 SB |
273 | |
274 | if (!driver) | |
275 | eeh_edev_info(edev, "no driver"); | |
276 | else if (!driver->err_handler) | |
277 | eeh_edev_info(edev, "driver not EEH aware"); | |
278 | else if (edev->mode & EEH_DEV_NO_HANDLER) | |
279 | eeh_edev_info(edev, "driver bound too late"); | |
280 | else { | |
2e255051 | 281 | new_result = fn(edev, pdev, driver); |
20b34497 SB |
282 | eeh_edev_info(edev, "%s driver reports: '%s'", |
283 | driver->name, | |
284 | pci_ers_result_name(new_result)); | |
285 | if (result) | |
286 | *result = pci_ers_merge_result(*result, | |
287 | new_result); | |
288 | } | |
289 | if (driver) | |
2e255051 | 290 | eeh_pcid_put(pdev); |
20b34497 | 291 | } else { |
2e255051 | 292 | eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!pdev, |
20b34497 SB |
293 | !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe)); |
294 | } | |
2e255051 SB |
295 | device_unlock(&pdev->dev); |
296 | if (edev->pdev != pdev) | |
297 | eeh_edev_warn(edev, "Device changed during processing!\n"); | |
298 | put_device(&pdev->dev); | |
20b34497 | 299 | } |
77bd7415 | 300 | |
20b34497 SB |
301 | static void eeh_pe_report(const char *name, struct eeh_pe *root, |
302 | eeh_report_fn fn, enum pci_ers_result *result) | |
303 | { | |
304 | struct eeh_pe *pe; | |
305 | struct eeh_dev *edev, *tmp; | |
f0295e04 | 306 | |
20b34497 SB |
307 | pr_info("EEH: Beginning: '%s'\n", name); |
308 | eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp) | |
309 | eeh_pe_report_edev(edev, fn, result); | |
310 | if (result) | |
311 | pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n", | |
312 | name, pci_ers_result_name(*result)); | |
313 | else | |
314 | pr_info("EEH: Finished:'%s'", name); | |
315 | } | |
77bd7415 | 316 | |
20b34497 SB |
317 | /** |
318 | * eeh_report_error - Report pci error to each device driver | |
319 | * @edev: eeh device | |
320 | * @driver: device's PCI driver | |
321 | * | |
322 | * Report an EEH error to each device driver. | |
323 | */ | |
324 | static enum pci_ers_result eeh_report_error(struct eeh_dev *edev, | |
2e255051 | 325 | struct pci_dev *pdev, |
20b34497 SB |
326 | struct pci_driver *driver) |
327 | { | |
328 | enum pci_ers_result rc; | |
77bd7415 | 329 | |
20b34497 SB |
330 | if (!driver->err_handler->error_detected) |
331 | return PCI_ERS_RESULT_NONE; | |
77bd7415 | 332 | |
20b34497 SB |
333 | eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)", |
334 | driver->name); | |
2e255051 | 335 | rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen); |
2a50f144 | 336 | |
67086e32 | 337 | edev->in_error = true; |
2e255051 | 338 | pci_uevent_ers(pdev, PCI_ERS_RESULT_NONE); |
20b34497 | 339 | return rc; |
6a1ca373 LV |
340 | } |
341 | ||
342 | /** | |
29f8bf1b | 343 | * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled |
20b34497 SB |
344 | * @edev: eeh device |
345 | * @driver: device's PCI driver | |
6a1ca373 | 346 | * |
638799b3 | 347 | * Tells each device driver that IO ports, MMIO and config space I/O |
20b34497 | 348 | * are now enabled. |
6a1ca373 | 349 | */ |
20b34497 | 350 | static enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev, |
2e255051 | 351 | struct pci_dev *pdev, |
20b34497 | 352 | struct pci_driver *driver) |
6a1ca373 | 353 | { |
20b34497 SB |
354 | if (!driver->err_handler->mmio_enabled) |
355 | return PCI_ERS_RESULT_NONE; | |
356 | eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name); | |
2e255051 | 357 | return driver->err_handler->mmio_enabled(pdev); |
77bd7415 LV |
358 | } |
359 | ||
cb5b5624 | 360 | /** |
29f8bf1b | 361 | * eeh_report_reset - Tell device that slot has been reset |
20b34497 SB |
362 | * @edev: eeh device |
363 | * @driver: device's PCI driver | |
29f8bf1b GS |
364 | * |
365 | * This routine must be called while EEH tries to reset particular | |
366 | * PCI device so that the associated PCI device driver could take | |
367 | * some actions, usually to save data the driver needs so that the | |
368 | * driver can work again while the device is recovered. | |
77bd7415 | 369 | */ |
20b34497 | 370 | static enum pci_ers_result eeh_report_reset(struct eeh_dev *edev, |
2e255051 | 371 | struct pci_dev *pdev, |
20b34497 | 372 | struct pci_driver *driver) |
77bd7415 | 373 | { |
20b34497 SB |
374 | if (!driver->err_handler->slot_reset || !edev->in_error) |
375 | return PCI_ERS_RESULT_NONE; | |
376 | eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name); | |
2e255051 | 377 | return driver->err_handler->slot_reset(pdev); |
77bd7415 LV |
378 | } |
379 | ||
cef50c67 | 380 | static void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) |
5cfb20b9 | 381 | { |
5cfb20b9 GS |
382 | struct pci_dev *pdev; |
383 | ||
384 | if (!edev) | |
cef50c67 | 385 | return; |
5cfb20b9 | 386 | |
5a0cdbfd GS |
387 | /* |
388 | * The content in the config space isn't saved because | |
389 | * of the blocked config space on some adapters. We have | |
390 | * to restore the initial saved config space when the | |
391 | * EEH device is created. | |
392 | */ | |
393 | if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { | |
80e65b00 | 394 | if (list_is_last(&edev->entry, &edev->pe->edevs)) |
5a0cdbfd GS |
395 | eeh_pe_restore_bars(edev->pe); |
396 | ||
cef50c67 | 397 | return; |
5a0cdbfd GS |
398 | } |
399 | ||
5cfb20b9 GS |
400 | pdev = eeh_dev_to_pci_dev(edev); |
401 | if (!pdev) | |
cef50c67 | 402 | return; |
5cfb20b9 GS |
403 | |
404 | pci_restore_state(pdev); | |
5cfb20b9 GS |
405 | } |
406 | ||
cb5b5624 | 407 | /** |
29f8bf1b | 408 | * eeh_report_resume - Tell device to resume normal operations |
20b34497 SB |
409 | * @edev: eeh device |
410 | * @driver: device's PCI driver | |
29f8bf1b GS |
411 | * |
412 | * This routine must be called to notify the device driver that it | |
413 | * could resume so that the device driver can do some initialization | |
414 | * to make the recovered device work again. | |
cb5b5624 | 415 | */ |
20b34497 | 416 | static enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, |
2e255051 | 417 | struct pci_dev *pdev, |
20b34497 | 418 | struct pci_driver *driver) |
77bd7415 | 419 | { |
20b34497 SB |
420 | if (!driver->err_handler->resume || !edev->in_error) |
421 | return PCI_ERS_RESULT_NONE; | |
d0e70341 | 422 | |
20b34497 | 423 | eeh_edev_info(edev, "Invoking %s->resume()", driver->name); |
2e255051 | 424 | driver->err_handler->resume(pdev); |
8535ef05 | 425 | |
20b34497 | 426 | pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED); |
856e1eb9 | 427 | #ifdef CONFIG_PCI_IOV |
521ca5a9 JA |
428 | if (eeh_ops->notify_resume && eeh_dev_to_pdn(edev)) |
429 | eeh_ops->notify_resume(eeh_dev_to_pdn(edev)); | |
856e1eb9 | 430 | #endif |
20b34497 | 431 | return PCI_ERS_RESULT_NONE; |
77bd7415 LV |
432 | } |
433 | ||
cb5b5624 | 434 | /** |
29f8bf1b | 435 | * eeh_report_failure - Tell device driver that device is dead. |
20b34497 SB |
436 | * @edev: eeh device |
437 | * @driver: device's PCI driver | |
cb5b5624 LV |
438 | * |
439 | * This informs the device driver that the device is permanently | |
440 | * dead, and that no further recovery attempts will be made on it. | |
441 | */ | |
20b34497 | 442 | static enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, |
2e255051 | 443 | struct pci_dev *pdev, |
20b34497 | 444 | struct pci_driver *driver) |
77bd7415 | 445 | { |
20b34497 | 446 | enum pci_ers_result rc; |
77bd7415 | 447 | |
20b34497 SB |
448 | if (!driver->err_handler->error_detected) |
449 | return PCI_ERS_RESULT_NONE; | |
8535ef05 | 450 | |
20b34497 SB |
451 | eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)", |
452 | driver->name); | |
2e255051 | 453 | rc = driver->err_handler->error_detected(pdev, |
20b34497 | 454 | pci_channel_io_perm_failure); |
70298c6e | 455 | |
2e255051 | 456 | pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT); |
20b34497 | 457 | return rc; |
77bd7415 LV |
458 | } |
459 | ||
bf773df9 | 460 | static void *eeh_add_virt_device(struct eeh_dev *edev) |
67086e32 WY |
461 | { |
462 | struct pci_driver *driver; | |
67086e32 | 463 | struct pci_dev *dev = eeh_dev_to_pci_dev(edev); |
67086e32 WY |
464 | |
465 | if (!(edev->physfn)) { | |
1ff8f36f | 466 | eeh_edev_warn(edev, "Not for VF\n"); |
67086e32 WY |
467 | return NULL; |
468 | } | |
469 | ||
470 | driver = eeh_pcid_get(dev); | |
471 | if (driver) { | |
46d4be41 SB |
472 | if (driver->err_handler) { |
473 | eeh_pcid_put(dev); | |
67086e32 | 474 | return NULL; |
46d4be41 SB |
475 | } |
476 | eeh_pcid_put(dev); | |
67086e32 WY |
477 | } |
478 | ||
988fc3ba | 479 | #ifdef CONFIG_PCI_IOV |
1ff8f36f | 480 | pci_iov_add_virtfn(edev->physfn, eeh_dev_to_pdn(edev)->vf_index); |
67086e32 WY |
481 | #endif |
482 | return NULL; | |
483 | } | |
484 | ||
cef50c67 | 485 | static void eeh_rmv_device(struct eeh_dev *edev, void *userdata) |
f5c57710 GS |
486 | { |
487 | struct pci_driver *driver; | |
f5c57710 | 488 | struct pci_dev *dev = eeh_dev_to_pci_dev(edev); |
67086e32 | 489 | struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata; |
f5c57710 GS |
490 | |
491 | /* | |
492 | * Actually, we should remove the PCI bridges as well. | |
493 | * However, that's lots of complexity to do that, | |
494 | * particularly some of devices under the bridge might | |
495 | * support EEH. So we just care about PCI devices for | |
496 | * simplicity here. | |
497 | */ | |
1ef52073 SB |
498 | if (!eeh_edev_actionable(edev) || |
499 | (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) | |
cef50c67 | 500 | return; |
d2b0f6f7 | 501 | |
1c5c533b | 502 | if (rmv_data) { |
46d4be41 SB |
503 | driver = eeh_pcid_get(dev); |
504 | if (driver) { | |
505 | if (driver->err_handler && | |
506 | driver->err_handler->error_detected && | |
507 | driver->err_handler->slot_reset) { | |
508 | eeh_pcid_put(dev); | |
cef50c67 | 509 | return; |
46d4be41 SB |
510 | } |
511 | eeh_pcid_put(dev); | |
512 | } | |
8cc6b6cd | 513 | } |
f5c57710 GS |
514 | |
515 | /* Remove it from PCI subsystem */ | |
1ef52073 SB |
516 | pr_info("EEH: Removing %s without EEH sensitive driver\n", |
517 | pci_name(dev)); | |
f5c57710 | 518 | edev->mode |= EEH_DEV_DISCONNECTED; |
1c5c533b SB |
519 | if (rmv_data) |
520 | rmv_data->removed_dev_count++; | |
f5c57710 | 521 | |
67086e32 | 522 | if (edev->physfn) { |
988fc3ba | 523 | #ifdef CONFIG_PCI_IOV |
67086e32 WY |
524 | struct pci_dn *pdn = eeh_dev_to_pdn(edev); |
525 | ||
753f6124 | 526 | pci_iov_remove_virtfn(edev->physfn, pdn->vf_index); |
67086e32 | 527 | edev->pdev = NULL; |
67086e32 WY |
528 | #endif |
529 | if (rmv_data) | |
1c5c533b | 530 | list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); |
67086e32 WY |
531 | } else { |
532 | pci_lock_rescan_remove(); | |
533 | pci_stop_and_remove_bus_device(dev); | |
534 | pci_unlock_rescan_remove(); | |
535 | } | |
f5c57710 GS |
536 | } |
537 | ||
d6c4932f | 538 | static void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) |
f5c57710 | 539 | { |
f5c57710 GS |
540 | struct eeh_dev *edev, *tmp; |
541 | ||
542 | eeh_pe_for_each_dev(pe, edev, tmp) { | |
543 | if (!(edev->mode & EEH_DEV_DISCONNECTED)) | |
544 | continue; | |
545 | ||
546 | edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED); | |
547 | eeh_rmv_from_parent_pe(edev); | |
548 | } | |
549 | ||
550 | return NULL; | |
551 | } | |
552 | ||
78954700 GS |
553 | /* |
554 | * Explicitly clear PE's frozen state for PowerNV where | |
555 | * we have frozen PE until BAR restore is completed. It's | |
556 | * harmless to clear it for pSeries. To be consistent with | |
557 | * PE reset (for 3 times), we try to clear the frozen state | |
558 | * for 3 times as well. | |
559 | */ | |
4d8e325d | 560 | static int eeh_clear_pe_frozen_state(struct eeh_pe *root, bool include_passed) |
78954700 | 561 | { |
3376cb91 SB |
562 | struct eeh_pe *pe; |
563 | int i; | |
78954700 | 564 | |
3376cb91 | 565 | eeh_for_each_pe(root, pe) { |
4d8e325d SB |
566 | if (include_passed || !eeh_pe_passed(pe)) { |
567 | for (i = 0; i < 3; i++) | |
568 | if (!eeh_unfreeze_pe(pe)) | |
569 | break; | |
570 | if (i >= 3) | |
571 | return -EIO; | |
572 | } | |
2c665992 | 573 | } |
4d8e325d | 574 | eeh_pe_state_clear(root, EEH_PE_ISOLATED, include_passed); |
3376cb91 | 575 | return 0; |
78954700 GS |
576 | } |
577 | ||
5cfb20b9 GS |
578 | int eeh_pe_reset_and_recover(struct eeh_pe *pe) |
579 | { | |
2efc771f | 580 | int ret; |
5cfb20b9 GS |
581 | |
582 | /* Bail if the PE is being recovered */ | |
583 | if (pe->state & EEH_PE_RECOVERING) | |
584 | return 0; | |
585 | ||
586 | /* Put the PE into recovery mode */ | |
587 | eeh_pe_state_mark(pe, EEH_PE_RECOVERING); | |
588 | ||
589 | /* Save states */ | |
590 | eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); | |
591 | ||
5cfb20b9 | 592 | /* Issue reset */ |
1ef52073 | 593 | ret = eeh_pe_reset_full(pe, true); |
5cfb20b9 | 594 | if (ret) { |
9ed5ca66 | 595 | eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
5cfb20b9 GS |
596 | return ret; |
597 | } | |
5cfb20b9 GS |
598 | |
599 | /* Unfreeze the PE */ | |
4d8e325d | 600 | ret = eeh_clear_pe_frozen_state(pe, true); |
5cfb20b9 | 601 | if (ret) { |
9ed5ca66 | 602 | eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
5cfb20b9 GS |
603 | return ret; |
604 | } | |
605 | ||
5cfb20b9 GS |
606 | /* Restore device state */ |
607 | eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); | |
608 | ||
5cfb20b9 | 609 | /* Clear recovery mode */ |
9ed5ca66 | 610 | eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); |
5cfb20b9 GS |
611 | |
612 | return 0; | |
613 | } | |
614 | ||
77bd7415 | 615 | /** |
29f8bf1b | 616 | * eeh_reset_device - Perform actual reset of a pci slot |
5fd13460 | 617 | * @driver_eeh_aware: Does the device's driver provide EEH support? |
9b3c76f0 | 618 | * @pe: EEH PE |
29f8bf1b | 619 | * @bus: PCI bus corresponding to the isolated slot |
5fd13460 | 620 | * @rmv_data: Optional, list to record removed devices |
77bd7415 | 621 | * |
29f8bf1b GS |
622 | * This routine must be called to do reset on the indicated PE. |
623 | * During the reset, udev might be invoked because those affected | |
624 | * PCI devices will be removed and then added. | |
77bd7415 | 625 | */ |
67086e32 | 626 | static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, |
5fd13460 SB |
627 | struct eeh_rmv_data *rmv_data, |
628 | bool driver_eeh_aware) | |
77bd7415 | 629 | { |
edfd17ff | 630 | time64_t tstamp; |
67086e32 WY |
631 | int cnt, rc; |
632 | struct eeh_dev *edev; | |
1ef52073 SB |
633 | struct eeh_pe *tmp_pe; |
634 | bool any_passed = false; | |
635 | ||
636 | eeh_for_each_pe(pe, tmp_pe) | |
637 | any_passed |= eeh_pe_passed(tmp_pe); | |
42405456 LV |
638 | |
639 | /* pcibios will clear the counter; save the value */ | |
9b3c76f0 | 640 | cnt = pe->freeze_count; |
5a71978e | 641 | tstamp = pe->tstamp; |
42405456 | 642 | |
20ee6a97 GS |
643 | /* |
644 | * We don't remove the corresponding PE instances because | |
645 | * we need the information afterwards. The attached EEH | |
646 | * devices are expected to be attached soon when calling | |
bd251b89 | 647 | * into pci_hp_add_devices(). |
20ee6a97 | 648 | */ |
f5c57710 | 649 | eeh_pe_state_mark(pe, EEH_PE_KEEP); |
1ef52073 | 650 | if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) { |
cca0e542 | 651 | eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data); |
54048cf8 SB |
652 | } else { |
653 | pci_lock_rescan_remove(); | |
654 | pci_hp_remove_devices(bus); | |
655 | pci_unlock_rescan_remove(); | |
1c2042c8 | 656 | } |
77bd7415 | 657 | |
d0914f50 GS |
658 | /* |
659 | * Reset the pci controller. (Asserts RST#; resets config space). | |
b6495c0c | 660 | * Reconfigure bridges and devices. Don't try to bring the system |
29f8bf1b | 661 | * up if the reset failed for some reason. |
d0914f50 GS |
662 | * |
663 | * During the reset, it's very dangerous to have uncontrolled PCI | |
664 | * config accesses. So we prefer to block them. However, controlled | |
665 | * PCI config accesses initiated from EEH itself are allowed. | |
29f8bf1b | 666 | */ |
1ef52073 | 667 | rc = eeh_pe_reset_full(pe, false); |
28bf36f9 | 668 | if (rc) |
b6495c0c | 669 | return rc; |
77bd7415 | 670 | |
1c2042c8 RW |
671 | pci_lock_rescan_remove(); |
672 | ||
9b3c76f0 GS |
673 | /* Restore PE */ |
674 | eeh_ops->configure_bridge(pe); | |
675 | eeh_pe_restore_bars(pe); | |
77bd7415 | 676 | |
dc9c41bd | 677 | /* Clear frozen state */ |
1ef52073 | 678 | rc = eeh_clear_pe_frozen_state(pe, false); |
409bf7f8 AD |
679 | if (rc) { |
680 | pci_unlock_rescan_remove(); | |
dc9c41bd | 681 | return rc; |
409bf7f8 | 682 | } |
78954700 | 683 | |
77bd7415 | 684 | /* Give the system 5 seconds to finish running the user-space |
a84f273c GS |
685 | * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, |
686 | * this is a hack, but if we don't do this, and try to bring | |
687 | * the device up before the scripts have taken it down, | |
77bd7415 LV |
688 | * potentially weird things happen. |
689 | */ | |
1c5c533b | 690 | if (!driver_eeh_aware || rmv_data->removed_dev_count) { |
54048cf8 SB |
691 | pr_info("EEH: Sleep 5s ahead of %s hotplug\n", |
692 | (driver_eeh_aware ? "partial" : "complete")); | |
29f8bf1b | 693 | ssleep(5); |
f5c57710 GS |
694 | |
695 | /* | |
696 | * The EEH device is still connected with its parent | |
697 | * PE. We should disconnect it so the binding can be | |
698 | * rebuilt when adding PCI devices. | |
699 | */ | |
80e65b00 | 700 | edev = list_first_entry(&pe->edevs, struct eeh_dev, entry); |
f5c57710 | 701 | eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); |
a3aa256b | 702 | if (pe->type & EEH_PE_VF) { |
bf773df9 | 703 | eeh_add_virt_device(edev); |
a3aa256b | 704 | } else { |
54048cf8 | 705 | if (!driver_eeh_aware) |
9ed5ca66 | 706 | eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); |
bd251b89 | 707 | pci_hp_add_devices(bus); |
a3aa256b | 708 | } |
77bd7415 | 709 | } |
9ed5ca66 | 710 | eeh_pe_state_clear(pe, EEH_PE_KEEP, true); |
5a71978e GS |
711 | |
712 | pe->tstamp = tstamp; | |
9b3c76f0 | 713 | pe->freeze_count = cnt; |
b6495c0c | 714 | |
1c2042c8 | 715 | pci_unlock_rescan_remove(); |
b6495c0c | 716 | return 0; |
77bd7415 LV |
717 | } |
718 | ||
719 | /* The longest amount of time to wait for a pci device | |
720 | * to come back on line, in seconds. | |
721 | */ | |
fb48dc22 | 722 | #define MAX_WAIT_FOR_RECOVERY 300 |
77bd7415 | 723 | |
799abe28 OH |
724 | |
725 | /* Walks the PE tree after processing an event to remove any stale PEs. | |
726 | * | |
727 | * NB: This needs to be recursive to ensure the leaf PEs get removed | |
728 | * before their parents do. Although this is possible to do recursively | |
729 | * we don't since this is easier to read and we need to garantee | |
730 | * the leaf nodes will be handled first. | |
731 | */ | |
732 | static void eeh_pe_cleanup(struct eeh_pe *pe) | |
733 | { | |
734 | struct eeh_pe *child_pe, *tmp; | |
735 | ||
736 | list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child) | |
737 | eeh_pe_cleanup(child_pe); | |
738 | ||
739 | if (pe->state & EEH_PE_KEEP) | |
740 | return; | |
741 | ||
742 | if (!(pe->state & EEH_PE_INVALID)) | |
743 | return; | |
744 | ||
745 | if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) { | |
746 | list_del(&pe->child); | |
747 | kfree(pe); | |
748 | } | |
749 | } | |
750 | ||
b104af5a OH |
751 | /** |
752 | * eeh_slot_presence_check - Check if a device is still present in a slot | |
753 | * @pdev: pci_dev to check | |
754 | * | |
755 | * This function may return a false positive if we can't determine the slot's | |
756 | * presence state. This might happen for PCIe slots if the PE containing | |
757 | * the upstream bridge is also frozen, or the bridge is part of the same PE | |
758 | * as the device. | |
759 | * | |
760 | * This shouldn't happen often, but you might see it if you hotplug a PCIe | |
761 | * switch. | |
762 | */ | |
763 | static bool eeh_slot_presence_check(struct pci_dev *pdev) | |
764 | { | |
765 | const struct hotplug_slot_ops *ops; | |
766 | struct pci_slot *slot; | |
767 | u8 state; | |
768 | int rc; | |
769 | ||
770 | if (!pdev) | |
771 | return false; | |
772 | ||
773 | if (pdev->error_state == pci_channel_io_perm_failure) | |
774 | return false; | |
775 | ||
776 | slot = pdev->slot; | |
777 | if (!slot || !slot->hotplug) | |
778 | return true; | |
779 | ||
780 | ops = slot->hotplug->ops; | |
781 | if (!ops || !ops->get_adapter_status) | |
782 | return true; | |
783 | ||
aeff27c1 OH |
784 | /* set the attention indicator while we've got the slot ops */ |
785 | if (ops->set_attention_status) | |
786 | ops->set_attention_status(slot->hotplug, 1); | |
787 | ||
b104af5a OH |
788 | rc = ops->get_adapter_status(slot->hotplug, &state); |
789 | if (rc) | |
790 | return true; | |
791 | ||
792 | return !!state; | |
793 | } | |
794 | ||
aeff27c1 OH |
795 | static void eeh_clear_slot_attention(struct pci_dev *pdev) |
796 | { | |
797 | const struct hotplug_slot_ops *ops; | |
798 | struct pci_slot *slot; | |
799 | ||
800 | if (!pdev) | |
801 | return; | |
802 | ||
803 | if (pdev->error_state == pci_channel_io_perm_failure) | |
804 | return; | |
805 | ||
806 | slot = pdev->slot; | |
807 | if (!slot || !slot->hotplug) | |
808 | return; | |
809 | ||
810 | ops = slot->hotplug->ops; | |
811 | if (!ops || !ops->set_attention_status) | |
812 | return; | |
813 | ||
814 | ops->set_attention_status(slot->hotplug, 0); | |
815 | } | |
816 | ||
c0b64978 RC |
/**
 * eeh_handle_normal_event - Handle EEH events on a specific PE
 * @pe: EEH PE - which should not be used after we return, as it may
 * have been invalidated.
 *
 * Attempts to recover the given PE.  If recovery fails or the PE has failed
 * too many times, remove the PE.
 *
 * While PHB detects address or data parity errors on particular PCI
 * slot, the associated PE will be frozen. Besides, DMA's occurring
 * to wild addresses (which usually happen due to bugs in device
 * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
 * #PERR or other misc PCI-related errors also can trigger EEH errors.
 *
 * Recovery process consists of unplugging the device driver (which
 * generated hotplug events to userspace), then issuing a PCI #RST to
 * the device, then reconfiguring the PCI config space for all bridges
 * & devices under this slot, and then finally restarting the device
 * drivers (which cause a second set of hotplug events to go out to
 * userspace).
 *
 * The function is driven by a single &enum pci_ers_result value: each
 * stage below only runs when the result left by the previous stages
 * selects it, and any stage may downgrade the result to
 * PCI_ERS_RESULT_DISCONNECT, which routes to the permanent-failure path.
 */
void eeh_handle_normal_event(struct eeh_pe *pe)
{
	struct pci_bus *bus;
	struct eeh_dev *edev, *tmp;
	struct eeh_pe *tmp_pe;
	int rc = 0;
	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
	/* Collects the VFs removed by the reset so they can be re-added. */
	struct eeh_rmv_data rmv_data =
		{LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
	int devices = 0;

	bus = eeh_pe_bus_get(pe);
	if (!bus) {
		pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
			__func__, pe->phb->global_number, pe->addr);
		return;
	}

	/*
	 * When devices are hot-removed we might get an EEH due to
	 * a driver attempting to touch the MMIO space of a removed
	 * device. In this case we don't have a device to recover
	 * so suppress the event if we can't find any present devices.
	 *
	 * The hotplug driver should take care of tearing down the
	 * device itself.
	 */
	eeh_for_each_pe(pe, tmp_pe)
		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
			if (eeh_slot_presence_check(edev->pdev))
				devices++;

	if (!devices) {
		pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
			pe->phb->global_number, pe->addr);
		goto out; /* nothing to recover */
	}

	/* Log the event */
	if (pe->type & EEH_PE_PHB) {
		pr_err("EEH: Recovering PHB#%x, location: %s\n",
			pe->phb->global_number, eeh_pe_loc_get(pe));
	} else {
		struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb);

		pr_err("EEH: Recovering PHB#%x-PE#%x\n",
		       pe->phb->global_number, pe->addr);
		pr_err("EEH: PE location: %s, PHB location: %s\n",
		       eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
	}

#ifdef CONFIG_STACKTRACE
	/*
	 * Print the saved stack trace now that we've verified there's
	 * something to recover.
	 */
	if (pe->trace_entries) {
		void **ptrs = (void **) pe->stack_trace;
		int i;

		pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
		       pe->phb->global_number, pe->addr);

		/* FIXME: Use the same format as dump_stack() */
		pr_err("EEH: Call Trace:\n");
		for (i = 0; i < pe->trace_entries; i++)
			pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]);

		/* The trace has been printed; drop it so it is not logged again. */
		pe->trace_entries = 0;
	}
#endif /* CONFIG_STACKTRACE */

	/*
	 * Account for this freeze.  Once the PE has frozen more often than
	 * eeh_max_freezes allows, no recovery is attempted: the result is
	 * forced to DISCONNECT, which skips every stage below and lands in
	 * the permanent-failure path.
	 */
	eeh_pe_update_time_stamp(pe);
	pe->freeze_count++;
	if (pe->freeze_count > eeh_max_freezes) {
		pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n",
		       pe->phb->global_number, pe->addr,
		       pe->freeze_count);
		result = PCI_ERS_RESULT_DISCONNECT;
	}

	/* Clear EEH_DEV_NO_HANDLER on every device before reporting to drivers. */
	eeh_for_each_pe(pe, tmp_pe)
		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
			edev->mode &= ~EEH_DEV_NO_HANDLER;

	/* Walk the various device drivers attached to this slot through
	 * a reset sequence, giving each an opportunity to do what it needs
	 * to accomplish the reset.  Each child gets a report of the
	 * status ... if any child can't handle the reset, then the entire
	 * slot is dlpar removed and added.
	 *
	 * When the PHB is fenced, we have to issue a reset to recover from
	 * the error. Override the result if necessary to have partially
	 * hotplug for this case.
	 */
	if (result != PCI_ERS_RESULT_DISCONNECT) {
		pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
			pe->freeze_count, eeh_max_freezes);
		pr_info("EEH: Notify device drivers to shutdown\n");
		eeh_set_channel_state(pe, pci_channel_io_frozen);
		eeh_set_irq_state(pe, false);
		eeh_pe_report("error_detected(IO frozen)", pe,
			      eeh_report_error, &result);
		/*
		 * For a PHB PE, any driver answer other than NONE is
		 * turned into NEED_RESET.
		 */
		if ((pe->type & EEH_PE_PHB) &&
		    result != PCI_ERS_RESULT_NONE &&
		    result != PCI_ERS_RESULT_NEED_RESET)
			result = PCI_ERS_RESULT_NEED_RESET;
	}

	/* Get the current PCI slot state. This can take a long time,
	 * sometimes over 300 seconds for certain systems.
	 */
	if (result != PCI_ERS_RESULT_DISCONNECT) {
		rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
		if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
			pr_warn("EEH: Permanent failure\n");
			result = PCI_ERS_RESULT_DISCONNECT;
		}
	}

	/* Since rtas may enable MMIO when posting the error log,
	 * don't post the error log until after all dev drivers
	 * have been informed.
	 */
	if (result != PCI_ERS_RESULT_DISCONNECT) {
		pr_info("EEH: Collect temporary log\n");
		eeh_slot_error_detail(pe, EEH_LOG_TEMP);
	}

	/* If all device drivers were EEH-unaware, then shut
	 * down all of the device drivers, and hope they
	 * go down willingly, without panicking the system.
	 *
	 * On success the result stays NONE, which the final check below
	 * treats as a successful recovery.
	 */
	if (result == PCI_ERS_RESULT_NONE) {
		pr_info("EEH: Reset with hotplug activity\n");
		rc = eeh_reset_device(pe, bus, NULL, false);
		if (rc) {
			pr_warn("%s: Unable to reset, err=%d\n",
				__func__, rc);
			result = PCI_ERS_RESULT_DISCONNECT;
		}
	}

	/* If all devices reported they can proceed, then re-enable MMIO */
	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
		pr_info("EEH: Enable I/O for affected devices\n");
		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);

		/*
		 * A negative rc gives up on the PE, a positive rc falls back
		 * to a reset, and zero lets the drivers update the result.
		 */
		if (rc < 0) {
			result = PCI_ERS_RESULT_DISCONNECT;
		} else if (rc) {
			result = PCI_ERS_RESULT_NEED_RESET;
		} else {
			pr_info("EEH: Notify device drivers to resume I/O\n");
			eeh_pe_report("mmio_enabled", pe,
				      eeh_report_mmio_enabled, &result);
		}
	}

	/* If all devices reported they can proceed, then re-enable DMA */
	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
		pr_info("EEH: Enabled DMA for affected devices\n");
		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);

		if (rc < 0) {
			result = PCI_ERS_RESULT_DISCONNECT;
		} else if (rc) {
			result = PCI_ERS_RESULT_NEED_RESET;
		} else {
			/*
			 * We didn't do PE reset for the case. The PE
			 * is still in frozen state. Clear it before
			 * resuming the PE.
			 */
			eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true);
			result = PCI_ERS_RESULT_RECOVERED;
		}
	}

	/* If any device called out for a reset, then reset the slot */
	if (result == PCI_ERS_RESULT_NEED_RESET) {
		pr_info("EEH: Reset without hotplug activity\n");
		rc = eeh_reset_device(pe, bus, &rmv_data, true);
		if (rc) {
			pr_warn("%s: Cannot reset, err=%d\n",
				__func__, rc);
			result = PCI_ERS_RESULT_DISCONNECT;
		} else {
			/*
			 * Start again from NONE so the result reflects only
			 * what the drivers report for the slot_reset step.
			 */
			result = PCI_ERS_RESULT_NONE;
			eeh_set_channel_state(pe, pci_channel_io_normal);
			eeh_set_irq_state(pe, true);
			eeh_pe_report("slot_reset", pe, eeh_report_reset,
				      &result);
		}
	}

	/* RECOVERED and NONE both count as success; anything else is fatal. */
	if ((result == PCI_ERS_RESULT_RECOVERED) ||
	    (result == PCI_ERS_RESULT_NONE)) {
		/*
		 * For those hot removed VFs, we should add them back after
		 * the PF has been recovered properly.
		 */
		list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list,
					 rmv_entry) {
			eeh_add_virt_device(edev);
			list_del(&edev->rmv_entry);
		}

		/* Tell all device drivers that they can resume operations */
		pr_info("EEH: Notify device driver to resume\n");
		eeh_set_channel_state(pe, pci_channel_io_normal);
		eeh_set_irq_state(pe, true);
		eeh_pe_report("resume", pe, eeh_report_resume, NULL);
		eeh_for_each_pe(pe, tmp_pe) {
			eeh_pe_for_each_dev(tmp_pe, edev, tmp) {
				edev->mode &= ~EEH_DEV_NO_HANDLER;
				edev->in_error = false;
			}
		}

		pr_info("EEH: Recovery successful.\n");
	} else  {
		/*
		 * About 90% of all real-life EEH failures in the field
		 * are due to poorly seated PCI cards. Only 10% or so are
		 * due to actual, failed cards.
		 */
		pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
		       "Please try reseating or replacing it\n",
			pe->phb->global_number, pe->addr);

		eeh_slot_error_detail(pe, EEH_LOG_PERM);

		/* Notify all devices that they're about to go down. */
		eeh_set_channel_state(pe, pci_channel_io_perm_failure);
		eeh_set_irq_state(pe, false);
		eeh_pe_report("error_detected(permanent failure)", pe,
			      eeh_report_failure, NULL);

		/* Mark the PE to be removed permanently */
		eeh_pe_state_mark(pe, EEH_PE_REMOVED);

		/*
		 * Shut down the device drivers for good. We mark
		 * all removed devices correctly to avoid accessing
		 * their PCI config any more.
		 *
		 * A VF PE has its devices removed individually and then
		 * continues to the common cleanup at "out".  Any other PE
		 * has the whole bus removed and returns immediately,
		 * skipping that cleanup.
		 */
		if (pe->type & EEH_PE_VF) {
			eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
			eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
		} else {
			eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
			eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);

			pci_lock_rescan_remove();
			pci_hp_remove_devices(bus);
			pci_unlock_rescan_remove();
			/* The passed PE should no longer be used */
			return;
		}
	}

out:
	/*
	 * Clean up any PEs without devices. While marked as EEH_PE_RECOVERING
	 * we don't want to modify the PE tree structure so we do it here.
	 */
	eeh_pe_cleanup(pe);

	/* clear the slot attention LED for all recovered devices */
	eeh_for_each_pe(pe, tmp_pe)
		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
			eeh_clear_slot_attention(edev->pdev);

	eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
}
8a6b1bc7 | 1114 | |
c0b64978 RC |
1115 | /** |
1116 | * eeh_handle_special_event - Handle EEH events without a specific failing PE | |
1117 | * | |
1118 | * Called when an EEH event is detected but can't be narrowed down to a | |
1119 | * specific PE. Iterates through possible failures and handles them as | |
1120 | * necessary. | |
1121 | */ | |
68701780 | 1122 | void eeh_handle_special_event(void) |
8a6b1bc7 | 1123 | { |
aa06e3d6 SB |
1124 | struct eeh_pe *pe, *phb_pe, *tmp_pe; |
1125 | struct eeh_dev *edev, *tmp_edev; | |
8a6b1bc7 | 1126 | struct pci_bus *bus; |
7e4e7867 | 1127 | struct pci_controller *hose; |
8a6b1bc7 | 1128 | unsigned long flags; |
7e4e7867 | 1129 | int rc; |
8a6b1bc7 | 1130 | |
8a6b1bc7 | 1131 | |
7e4e7867 GS |
1132 | do { |
1133 | rc = eeh_ops->next_error(&pe); | |
1134 | ||
1135 | switch (rc) { | |
1136 | case EEH_NEXT_ERR_DEAD_IOC: | |
1137 | /* Mark all PHBs in dead state */ | |
1138 | eeh_serialize_lock(&flags); | |
1139 | ||
1140 | /* Purge all events */ | |
5c7a35e3 | 1141 | eeh_remove_event(NULL, true); |
7e4e7867 GS |
1142 | |
1143 | list_for_each_entry(hose, &hose_list, list_node) { | |
1144 | phb_pe = eeh_phb_pe_get(hose); | |
1145 | if (!phb_pe) continue; | |
1146 | ||
e762bb89 | 1147 | eeh_pe_mark_isolated(phb_pe); |
7e4e7867 GS |
1148 | } |
1149 | ||
1150 | eeh_serialize_unlock(flags); | |
1151 | ||
1152 | break; | |
1153 | case EEH_NEXT_ERR_FROZEN_PE: | |
1154 | case EEH_NEXT_ERR_FENCED_PHB: | |
1155 | case EEH_NEXT_ERR_DEAD_PHB: | |
1156 | /* Mark the PE in fenced state */ | |
1157 | eeh_serialize_lock(&flags); | |
1158 | ||
1159 | /* Purge all events of the PHB */ | |
5c7a35e3 | 1160 | eeh_remove_event(pe, true); |
7e4e7867 | 1161 | |
e762bb89 SB |
1162 | if (rc != EEH_NEXT_ERR_DEAD_PHB) |
1163 | eeh_pe_state_mark(pe, EEH_PE_RECOVERING); | |
1164 | eeh_pe_mark_isolated(pe); | |
7e4e7867 GS |
1165 | |
1166 | eeh_serialize_unlock(flags); | |
1167 | ||
1168 | break; | |
1169 | case EEH_NEXT_ERR_NONE: | |
1170 | return; | |
1171 | default: | |
1172 | pr_warn("%s: Invalid value %d from next_error()\n", | |
1173 | __func__, rc); | |
1174 | return; | |
8a6b1bc7 | 1175 | } |
8a6b1bc7 | 1176 | |
7e4e7867 GS |
1177 | /* |
1178 | * For fenced PHB and frozen PE, it's handled as normal | |
1179 | * event. We have to remove the affected PHBs for dead | |
1180 | * PHB and IOC | |
1181 | */ | |
1182 | if (rc == EEH_NEXT_ERR_FROZEN_PE || | |
1183 | rc == EEH_NEXT_ERR_FENCED_PHB) { | |
799abe28 | 1184 | eeh_pe_state_mark(pe, EEH_PE_RECOVERING); |
37fd8125 | 1185 | eeh_handle_normal_event(pe); |
7e4e7867 | 1186 | } else { |
d4f194ed SB |
1187 | eeh_for_each_pe(pe, tmp_pe) |
1188 | eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev) | |
1189 | edev->mode &= ~EEH_DEV_NO_HANDLER; | |
1190 | ||
1191 | /* Notify all devices to be down */ | |
1192 | eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true); | |
1193 | eeh_set_channel_state(pe, pci_channel_io_perm_failure); | |
1194 | eeh_pe_report( | |
1195 | "error_detected(permanent failure)", pe, | |
1196 | eeh_report_failure, NULL); | |
1197 | ||
1b17366d | 1198 | pci_lock_rescan_remove(); |
7e4e7867 GS |
1199 | list_for_each_entry(hose, &hose_list, list_node) { |
1200 | phb_pe = eeh_phb_pe_get(hose); | |
1201 | if (!phb_pe || | |
9e049375 GS |
1202 | !(phb_pe->state & EEH_PE_ISOLATED) || |
1203 | (phb_pe->state & EEH_PE_RECOVERING)) | |
7e4e7867 GS |
1204 | continue; |
1205 | ||
7e4e7867 | 1206 | bus = eeh_pe_bus_get(phb_pe); |
04fec21c RC |
1207 | if (!bus) { |
1208 | pr_err("%s: Cannot find PCI bus for " | |
1f52f176 | 1209 | "PHB#%x-PE#%x\n", |
04fec21c RC |
1210 | __func__, |
1211 | pe->phb->global_number, | |
1212 | pe->addr); | |
1213 | break; | |
1214 | } | |
bd251b89 | 1215 | pci_hp_remove_devices(bus); |
7e4e7867 | 1216 | } |
1b17366d | 1217 | pci_unlock_rescan_remove(); |
8a6b1bc7 | 1218 | } |
7e4e7867 GS |
1219 | |
1220 | /* | |
1221 | * If we have detected dead IOC, we needn't proceed | |
1222 | * any more since all PHBs would have been removed | |
1223 | */ | |
1224 | if (rc == EEH_NEXT_ERR_DEAD_IOC) | |
1225 | break; | |
1226 | } while (rc != EEH_NEXT_ERR_NONE); | |
8a6b1bc7 | 1227 | } |