2 * IBM Accelerator Family 'GenWQE'
4 * (C) Copyright IBM Corp. 2013
6 * Author: Frank Haverkamp <haver@linux.vnet.ibm.com>
7 * Author: Joerg-Stephan Vogt <jsvogt@de.ibm.com>
8 * Author: Michael Jung <mijung@de.ibm.com>
9 * Author: Michael Ruettger <michael@ibmra.de>
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License (version 2 only)
13 * as published by the Free Software Foundation.
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
22 * Module initialization and PCIe setup. Card health monitoring and
23 * recovery functionality. Character device creation and deletion are
24 * controlled from here.
27 #include <linux/module.h>
28 #include <linux/types.h>
29 #include <linux/pci.h>
30 #include <linux/err.h>
31 #include <linux/aer.h>
32 #include <linux/string.h>
33 #include <linux/sched.h>
34 #include <linux/wait.h>
35 #include <linux/delay.h>
36 #include <linux/dma-mapping.h>
37 #include <linux/module.h>
38 #include <linux/notifier.h>
39 #include <linux/device.h>
40 #include <linux/log2.h>
41 #include <linux/genwqe/genwqe_card.h>
43 #include "card_base.h"
44 #include "card_ddcb.h"
46 MODULE_AUTHOR("Frank Haverkamp <haver@linux.vnet.ibm.com>");
47 MODULE_AUTHOR("Michael Ruettger <michael@ibmra.de>");
48 MODULE_AUTHOR("Joerg-Stephan Vogt <jsvogt@de.ibm.com>");
49 MODULE_AUTHOR("Michal Jung <mijung@de.ibm.com>");
51 MODULE_DESCRIPTION("GenWQE Card");
52 MODULE_VERSION(DRV_VERS_STRING
);
53 MODULE_LICENSE("GPL");
55 static char genwqe_driver_name
[] = GENWQE_DEVNAME
;
56 static struct class *class_genwqe
;
57 static struct dentry
*debugfs_genwqe
;
58 static struct genwqe_dev
*genwqe_devices
[GENWQE_CARD_NO_MAX
];
60 /* PCI structure for identifying device by PCI vendor and device ID */
61 static DEFINE_PCI_DEVICE_TABLE(genwqe_device_table
) = {
62 { .vendor
= PCI_VENDOR_ID_IBM
,
63 .device
= PCI_DEVICE_GENWQE
,
64 .subvendor
= PCI_SUBVENDOR_ID_IBM
,
65 .subdevice
= PCI_SUBSYSTEM_ID_GENWQE5
,
66 .class = (PCI_CLASSCODE_GENWQE5
<< 8),
70 /* Initial SR-IOV bring-up image */
71 { .vendor
= PCI_VENDOR_ID_IBM
,
72 .device
= PCI_DEVICE_GENWQE
,
73 .subvendor
= PCI_SUBVENDOR_ID_IBM_SRIOV
,
74 .subdevice
= PCI_SUBSYSTEM_ID_GENWQE5_SRIOV
,
75 .class = (PCI_CLASSCODE_GENWQE5_SRIOV
<< 8),
79 { .vendor
= PCI_VENDOR_ID_IBM
, /* VF Vendor ID */
80 .device
= 0x0000, /* VF Device ID */
81 .subvendor
= PCI_SUBVENDOR_ID_IBM_SRIOV
,
82 .subdevice
= PCI_SUBSYSTEM_ID_GENWQE5_SRIOV
,
83 .class = (PCI_CLASSCODE_GENWQE5_SRIOV
<< 8),
88 { .vendor
= PCI_VENDOR_ID_IBM
,
89 .device
= PCI_DEVICE_GENWQE
,
90 .subvendor
= PCI_SUBVENDOR_ID_IBM_SRIOV
,
91 .subdevice
= PCI_SUBSYSTEM_ID_GENWQE5
,
92 .class = (PCI_CLASSCODE_GENWQE5_SRIOV
<< 8),
96 { .vendor
= PCI_VENDOR_ID_IBM
, /* VF Vendor ID */
97 .device
= 0x0000, /* VF Device ID */
98 .subvendor
= PCI_SUBVENDOR_ID_IBM_SRIOV
,
99 .subdevice
= PCI_SUBSYSTEM_ID_GENWQE5
,
100 .class = (PCI_CLASSCODE_GENWQE5_SRIOV
<< 8),
104 /* Even one more ... */
105 { .vendor
= PCI_VENDOR_ID_IBM
,
106 .device
= PCI_DEVICE_GENWQE
,
107 .subvendor
= PCI_SUBVENDOR_ID_IBM
,
108 .subdevice
= PCI_SUBSYSTEM_ID_GENWQE5_NEW
,
109 .class = (PCI_CLASSCODE_GENWQE5
<< 8),
113 { 0, } /* 0 terminated list. */
116 MODULE_DEVICE_TABLE(pci
, genwqe_device_table
);
119 * genwqe_dev_alloc() - Create and prepare a new card descriptor
121 * Return: Pointer to card descriptor, or ERR_PTR(err) on error
123 static struct genwqe_dev
*genwqe_dev_alloc(void)
125 unsigned int i
= 0, j
;
126 struct genwqe_dev
*cd
;
128 for (i
= 0; i
< GENWQE_CARD_NO_MAX
; i
++) {
129 if (genwqe_devices
[i
] == NULL
)
132 if (i
>= GENWQE_CARD_NO_MAX
)
133 return ERR_PTR(-ENODEV
);
135 cd
= kzalloc(sizeof(struct genwqe_dev
), GFP_KERNEL
);
137 return ERR_PTR(-ENOMEM
);
140 cd
->class_genwqe
= class_genwqe
;
141 cd
->debugfs_genwqe
= debugfs_genwqe
;
144 * This comes from kernel config option and can be overwritten via
147 cd
->use_platform_recovery
= CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY
;
149 init_waitqueue_head(&cd
->queue_waitq
);
151 spin_lock_init(&cd
->file_lock
);
152 INIT_LIST_HEAD(&cd
->file_list
);
154 cd
->card_state
= GENWQE_CARD_UNUSED
;
155 spin_lock_init(&cd
->print_lock
);
157 cd
->ddcb_software_timeout
= genwqe_ddcb_software_timeout
;
158 cd
->kill_timeout
= genwqe_kill_timeout
;
160 for (j
= 0; j
< GENWQE_MAX_VFS
; j
++)
161 cd
->vf_jobtimeout_msec
[j
] = genwqe_vf_jobtimeout_msec
;
163 genwqe_devices
[i
] = cd
;
167 static void genwqe_dev_free(struct genwqe_dev
*cd
)
172 genwqe_devices
[cd
->card_idx
] = NULL
;
177 * genwqe_bus_reset() - Card recovery
179 * pci_reset_function() will recover the device and ensure that the
180 * registers are accessible again when it completes with success. If
181 * not, the card will stay dead and registers will be inaccessible
184 static int genwqe_bus_reset(struct genwqe_dev
*cd
)
187 struct pci_dev
*pci_dev
= cd
->pci_dev
;
190 if (cd
->err_inject
& GENWQE_INJECT_BUS_RESET_FAILURE
)
195 pci_iounmap(pci_dev
, mmio
);
197 bars
= pci_select_bars(pci_dev
, IORESOURCE_MEM
);
198 pci_release_selected_regions(pci_dev
, bars
);
201 * Firmware/BIOS might change memory mapping during bus reset.
202 * Settings like enable bus-mastering, ... are backed up and
203 * restored by the pci_reset_function().
205 dev_dbg(&pci_dev
->dev
, "[%s] pci_reset function ...\n", __func__
);
206 rc
= pci_reset_function(pci_dev
);
208 dev_err(&pci_dev
->dev
,
209 "[%s] err: failed reset func (rc %d)\n", __func__
, rc
);
212 dev_dbg(&pci_dev
->dev
, "[%s] done with rc=%d\n", __func__
, rc
);
215 * Here is the right spot to clear the register read
216 * failure. pci_bus_reset() does this job in real systems.
218 cd
->err_inject
&= ~(GENWQE_INJECT_HARDWARE_FAILURE
|
219 GENWQE_INJECT_GFIR_FATAL
|
220 GENWQE_INJECT_GFIR_INFO
);
222 rc
= pci_request_selected_regions(pci_dev
, bars
, genwqe_driver_name
);
224 dev_err(&pci_dev
->dev
,
225 "[%s] err: request bars failed (%d)\n", __func__
, rc
);
229 cd
->mmio
= pci_iomap(pci_dev
, 0, 0);
230 if (cd
->mmio
== NULL
) {
231 dev_err(&pci_dev
->dev
,
232 "[%s] err: mapping BAR0 failed\n", __func__
);
239 * Hardware circumvention section. Certain bitstreams in our test-lab
240 * had different kinds of problems. Here is where we adjust those
241 * bitstreams to function well with this version of our device driver.
243 * These circumventions are applied to the physical function only.
244 * The magical numbers below are identifying development/manufacturing
245 * versions of the bitstream used on the card.
247 * Turn off error reporting for old/manufacturing images.
250 bool genwqe_need_err_masking(struct genwqe_dev
*cd
)
252 return (cd
->slu_unitcfg
& 0xFFFF0ull
) < 0x32170ull
;
255 static void genwqe_tweak_hardware(struct genwqe_dev
*cd
)
257 struct pci_dev
*pci_dev
= cd
->pci_dev
;
259 /* Mask FIRs for development images */
260 if (((cd
->slu_unitcfg
& 0xFFFF0ull
) >= 0x32000ull
) &&
261 ((cd
->slu_unitcfg
& 0xFFFF0ull
) <= 0x33250ull
)) {
262 dev_warn(&pci_dev
->dev
,
263 "FIRs masked due to bitstream %016llx.%016llx\n",
264 cd
->slu_unitcfg
, cd
->app_unitcfg
);
266 __genwqe_writeq(cd
, IO_APP_SEC_LEM_DEBUG_OVR
,
267 0xFFFFFFFFFFFFFFFFull
);
269 __genwqe_writeq(cd
, IO_APP_ERR_ACT_MASK
,
270 0x0000000000000000ull
);
275 * genwqe_recovery_on_fatal_gfir_required() - Version dependent actions
277 * Bitstreams older than 2013-02-17 have a bug where fatal GFIRs must
278 * be ignored. This is e.g. true for the bitstream we gave to the card
279 * manufacturer, but also for some old bitstreams we released to our
282 int genwqe_recovery_on_fatal_gfir_required(struct genwqe_dev
*cd
)
284 return (cd
->slu_unitcfg
& 0xFFFF0ull
) >= 0x32170ull
;
287 int genwqe_flash_readback_fails(struct genwqe_dev
*cd
)
289 return (cd
->slu_unitcfg
& 0xFFFF0ull
) < 0x32170ull
;
293 * genwqe_T_psec() - Calculate PF/VF timeout register content
295 * Note: From a design perspective it turned out to be a bad idea to
296 * use codes here to specify the frequency/speed values. An old
297 * driver cannot understand new codes and is therefore always a
298 * problem. Better is to measure out the value or put the
299 * speed/frequency directly into a register which is always a valid
300 * value for old as well as for new software.
303 static int genwqe_T_psec(struct genwqe_dev
*cd
)
305 u16 speed
; /* 1/f -> 250, 200, 166, 175 */
306 static const int T
[] = { 4000, 5000, 6000, 5714 };
308 speed
= (u16
)((cd
->slu_unitcfg
>> 28) & 0x0full
);
309 if (speed
>= ARRAY_SIZE(T
))
310 return -1; /* illegal value */
316 * genwqe_setup_pf_jtimer() - Setup PF hardware timeouts for DDCB execution
318 * Do this _after_ card_reset() is called. Otherwise the values will
319 * vanish. The settings need to be done when the queues are inactive.
321 * The max. timeout value is 2^(10+x) * T (6ns for 166MHz) * 15/16.
322 * The min. timeout value is 2^(10+x) * T (6ns for 166MHz) * 14/16.
324 static bool genwqe_setup_pf_jtimer(struct genwqe_dev
*cd
)
326 u32 T
= genwqe_T_psec(cd
);
329 if (genwqe_pf_jobtimeout_msec
== 0)
332 /* PF: large value needed, flash update 2sec per block */
333 x
= ilog2(genwqe_pf_jobtimeout_msec
*
334 16000000000uL/(T
* 15)) - 10;
336 genwqe_write_vreg(cd
, IO_SLC_VF_APPJOB_TIMEOUT
,
337 0xff00 | (x
& 0xff), 0);
342 * genwqe_setup_vf_jtimer() - Setup VF hardware timeouts for DDCB execution
344 static bool genwqe_setup_vf_jtimer(struct genwqe_dev
*cd
)
346 struct pci_dev
*pci_dev
= cd
->pci_dev
;
348 u32 T
= genwqe_T_psec(cd
);
351 for (vf
= 0; vf
< pci_sriov_get_totalvfs(pci_dev
); vf
++) {
353 if (cd
->vf_jobtimeout_msec
[vf
] == 0)
356 x
= ilog2(cd
->vf_jobtimeout_msec
[vf
] *
357 16000000000uL/(T
* 15)) - 10;
359 genwqe_write_vreg(cd
, IO_SLC_VF_APPJOB_TIMEOUT
,
360 0xff00 | (x
& 0xff), vf
+ 1);
365 static int genwqe_ffdc_buffs_alloc(struct genwqe_dev
*cd
)
367 unsigned int type
, e
= 0;
369 for (type
= 0; type
< GENWQE_DBG_UNITS
; type
++) {
371 case GENWQE_DBG_UNIT0
:
372 e
= genwqe_ffdc_buff_size(cd
, 0);
374 case GENWQE_DBG_UNIT1
:
375 e
= genwqe_ffdc_buff_size(cd
, 1);
377 case GENWQE_DBG_UNIT2
:
378 e
= genwqe_ffdc_buff_size(cd
, 2);
380 case GENWQE_DBG_REGS
:
381 e
= GENWQE_FFDC_REGS
;
385 /* currently support only the debug units mentioned here */
386 cd
->ffdc
[type
].entries
= e
;
387 cd
->ffdc
[type
].regs
= kmalloc(e
* sizeof(struct genwqe_reg
),
390 * regs == NULL is ok, the using code treats this as no regs,
391 * Printing warning is ok in this case.
397 static void genwqe_ffdc_buffs_free(struct genwqe_dev
*cd
)
401 for (type
= 0; type
< GENWQE_DBG_UNITS
; type
++) {
402 kfree(cd
->ffdc
[type
].regs
);
403 cd
->ffdc
[type
].regs
= NULL
;
407 static int genwqe_read_ids(struct genwqe_dev
*cd
)
411 struct pci_dev
*pci_dev
= cd
->pci_dev
;
413 cd
->slu_unitcfg
= __genwqe_readq(cd
, IO_SLU_UNITCFG
);
414 if (cd
->slu_unitcfg
== IO_ILLEGAL_VALUE
) {
415 dev_err(&pci_dev
->dev
,
416 "err: SLUID=%016llx\n", cd
->slu_unitcfg
);
421 slu_id
= genwqe_get_slu_id(cd
);
422 if (slu_id
< GENWQE_SLU_ARCH_REQ
|| slu_id
== 0xff) {
423 dev_err(&pci_dev
->dev
,
424 "err: incompatible SLU Architecture %u\n", slu_id
);
429 cd
->app_unitcfg
= __genwqe_readq(cd
, IO_APP_UNITCFG
);
430 if (cd
->app_unitcfg
== IO_ILLEGAL_VALUE
) {
431 dev_err(&pci_dev
->dev
,
432 "err: APPID=%016llx\n", cd
->app_unitcfg
);
436 genwqe_read_app_id(cd
, cd
->app_name
, sizeof(cd
->app_name
));
439 * Is access to all registers possible? If we are a VF the
440 * answer is obvious. If we run fully virtualized, we need to
441 * check if we can access all registers. If we do not have
442 * full access we will cause an UR and some informational FIRs
443 * in the PF, but that should not harm.
445 if (pci_dev
->is_virtfn
)
446 cd
->is_privileged
= 0;
448 cd
->is_privileged
= (__genwqe_readq(cd
, IO_SLU_BITSTREAM
)
449 != IO_ILLEGAL_VALUE
);
455 static int genwqe_start(struct genwqe_dev
*cd
)
458 struct pci_dev
*pci_dev
= cd
->pci_dev
;
460 err
= genwqe_read_ids(cd
);
464 if (genwqe_is_privileged(cd
)) {
465 /* do this after the tweaks. alloc fail is acceptable */
466 genwqe_ffdc_buffs_alloc(cd
);
467 genwqe_stop_traps(cd
);
469 /* Collect registers e.g. FIRs, UNITIDs, traces ... */
470 genwqe_read_ffdc_regs(cd
, cd
->ffdc
[GENWQE_DBG_REGS
].regs
,
471 cd
->ffdc
[GENWQE_DBG_REGS
].entries
, 0);
473 genwqe_ffdc_buff_read(cd
, GENWQE_DBG_UNIT0
,
474 cd
->ffdc
[GENWQE_DBG_UNIT0
].regs
,
475 cd
->ffdc
[GENWQE_DBG_UNIT0
].entries
);
477 genwqe_ffdc_buff_read(cd
, GENWQE_DBG_UNIT1
,
478 cd
->ffdc
[GENWQE_DBG_UNIT1
].regs
,
479 cd
->ffdc
[GENWQE_DBG_UNIT1
].entries
);
481 genwqe_ffdc_buff_read(cd
, GENWQE_DBG_UNIT2
,
482 cd
->ffdc
[GENWQE_DBG_UNIT2
].regs
,
483 cd
->ffdc
[GENWQE_DBG_UNIT2
].entries
);
485 genwqe_start_traps(cd
);
487 if (cd
->card_state
== GENWQE_CARD_FATAL_ERROR
) {
488 dev_warn(&pci_dev
->dev
,
489 "[%s] chip reload/recovery!\n", __func__
);
492 * Stealth Mode: Reload chip on either hot
495 cd
->softreset
= 0x7Cull
;
496 __genwqe_writeq(cd
, IO_SLC_CFGREG_SOFTRESET
,
499 err
= genwqe_bus_reset(cd
);
501 dev_err(&pci_dev
->dev
,
502 "[%s] err: bus reset failed!\n",
508 * Re-read the IDs because
509 * it could happen that the bitstream load
512 err
= genwqe_read_ids(cd
);
518 err
= genwqe_setup_service_layer(cd
); /* does a reset to the card */
520 dev_err(&pci_dev
->dev
,
521 "[%s] err: could not setup servicelayer!\n", __func__
);
526 if (genwqe_is_privileged(cd
)) { /* code is running _after_ reset */
527 genwqe_tweak_hardware(cd
);
529 genwqe_setup_pf_jtimer(cd
);
530 genwqe_setup_vf_jtimer(cd
);
533 err
= genwqe_device_create(cd
);
535 dev_err(&pci_dev
->dev
,
536 "err: chdev init failed! (err=%d)\n", err
);
537 goto out_release_service_layer
;
541 out_release_service_layer
:
542 genwqe_release_service_layer(cd
);
544 if (genwqe_is_privileged(cd
))
545 genwqe_ffdc_buffs_free(cd
);
550 * genwqe_stop() - Stop card operation
553 * As long as genwqe_thread runs we might access registers during
554 * error data capture. Same is with the genwqe_health_thread.
555 * When genwqe_bus_reset() fails this function might be called two times:
556 * first by the genwqe_health_thread() and later by genwqe_remove() to
557 * unbind the device. We must be able to survive that.
559 * This function must be robust enough to be called twice.
561 static int genwqe_stop(struct genwqe_dev
*cd
)
563 genwqe_finish_queue(cd
); /* no register access */
564 genwqe_device_remove(cd
); /* device removed, procs killed */
565 genwqe_release_service_layer(cd
); /* here genwqe_thread is stopped */
567 if (genwqe_is_privileged(cd
)) {
568 pci_disable_sriov(cd
->pci_dev
); /* access pci config space */
569 genwqe_ffdc_buffs_free(cd
);
576 * genwqe_recover_card() - Try to recover the card if it is possible
578 * If fatal_err is set no register access is possible anymore. It is
579 * likely that genwqe_start fails in that situation. Proper error
580 * handling is required in this case.
582 * genwqe_bus_reset() will cause the pci code to call genwqe_remove()
583 * and later genwqe_probe() for all virtual functions.
585 static int genwqe_recover_card(struct genwqe_dev
*cd
, int fatal_err
)
588 struct pci_dev
*pci_dev
= cd
->pci_dev
;
593 * Make sure chip is not reloaded to maintain FFDC. Write SLU
594 * Reset Register, CPLDReset field to 0.
597 cd
->softreset
= 0x70ull
;
598 __genwqe_writeq(cd
, IO_SLC_CFGREG_SOFTRESET
, cd
->softreset
);
601 rc
= genwqe_bus_reset(cd
);
603 dev_err(&pci_dev
->dev
,
604 "[%s] err: card recovery impossible!\n", __func__
);
608 rc
= genwqe_start(cd
);
610 dev_err(&pci_dev
->dev
,
611 "[%s] err: failed to launch device!\n", __func__
);
617 static int genwqe_health_check_cond(struct genwqe_dev
*cd
, u64
*gfir
)
619 *gfir
= __genwqe_readq(cd
, IO_SLC_CFGREG_GFIR
);
620 return (*gfir
& GFIR_ERR_TRIGGER
) &&
621 genwqe_recovery_on_fatal_gfir_required(cd
);
625 * genwqe_fir_checking() - Check the fault isolation registers of the card
627 * If this code works ok, can be tried out with help of the genwqe_poke tool:
628 * sudo ./tools/genwqe_poke 0x8 0xfefefefefef
630 * Now the relevant FIRs/sFIRs should be printed out and the driver should
631 * invoke recovery (devices are removed and readded).
633 static u64
genwqe_fir_checking(struct genwqe_dev
*cd
)
635 int j
, iterations
= 0;
636 u64 mask
, fir
, fec
, uid
, gfir
, gfir_masked
, sfir
, sfec
;
637 u32 fir_addr
, fir_clr_addr
, fec_addr
, sfir_addr
, sfec_addr
;
638 struct pci_dev
*pci_dev
= cd
->pci_dev
;
642 if (iterations
> 16) {
643 dev_err(&pci_dev
->dev
, "* exit looping after %d times\n",
648 gfir
= __genwqe_readq(cd
, IO_SLC_CFGREG_GFIR
);
650 dev_err(&pci_dev
->dev
, "* 0x%08x 0x%016llx\n",
651 IO_SLC_CFGREG_GFIR
, gfir
);
652 if (gfir
== IO_ILLEGAL_VALUE
)
656 * Avoid printing when no GFIR bit is on; this prevents continuous
657 * printout e.g. for the following bug:
658 * FIR set without a 2ndary FIR/FIR cannot be cleared
659 * Comment out the following if to get the prints:
664 gfir_masked
= gfir
& GFIR_ERR_TRIGGER
; /* fatal errors */
666 for (uid
= 0; uid
< GENWQE_MAX_UNITS
; uid
++) { /* 0..2 in zEDC */
668 /* read the primary FIR (pfir) */
669 fir_addr
= (uid
<< 24) + 0x08;
670 fir
= __genwqe_readq(cd
, fir_addr
);
672 continue; /* no error in this unit */
674 dev_err(&pci_dev
->dev
, "* 0x%08x 0x%016llx\n", fir_addr
, fir
);
675 if (fir
== IO_ILLEGAL_VALUE
)
678 /* read primary FEC */
679 fec_addr
= (uid
<< 24) + 0x18;
680 fec
= __genwqe_readq(cd
, fec_addr
);
682 dev_err(&pci_dev
->dev
, "* 0x%08x 0x%016llx\n", fec_addr
, fec
);
683 if (fec
== IO_ILLEGAL_VALUE
)
686 for (j
= 0, mask
= 1ULL; j
< 64; j
++, mask
<<= 1) {
688 /* secondary fir empty, skip it */
689 if ((fir
& mask
) == 0x0)
692 sfir_addr
= (uid
<< 24) + 0x100 + 0x08 * j
;
693 sfir
= __genwqe_readq(cd
, sfir_addr
);
695 if (sfir
== IO_ILLEGAL_VALUE
)
697 dev_err(&pci_dev
->dev
,
698 "* 0x%08x 0x%016llx\n", sfir_addr
, sfir
);
700 sfec_addr
= (uid
<< 24) + 0x300 + 0x08 * j
;
701 sfec
= __genwqe_readq(cd
, sfec_addr
);
703 if (sfec
== IO_ILLEGAL_VALUE
)
705 dev_err(&pci_dev
->dev
,
706 "* 0x%08x 0x%016llx\n", sfec_addr
, sfec
);
708 gfir
= __genwqe_readq(cd
, IO_SLC_CFGREG_GFIR
);
709 if (gfir
== IO_ILLEGAL_VALUE
)
712 /* gfir turned on during routine! get out and
714 if ((gfir_masked
== 0x0) &&
715 (gfir
& GFIR_ERR_TRIGGER
)) {
719 /* do not clear if we entered with a fatal gfir */
720 if (gfir_masked
== 0x0) {
722 /* NEW clear by mask the logged bits */
723 sfir_addr
= (uid
<< 24) + 0x100 + 0x08 * j
;
724 __genwqe_writeq(cd
, sfir_addr
, sfir
);
726 dev_dbg(&pci_dev
->dev
,
727 "[HM] Clearing 2ndary FIR 0x%08x "
728 "with 0x%016llx\n", sfir_addr
, sfir
);
731 * note, these cannot be error-Firs
732 * since gfir_masked is 0 after sfir
733 * was read. Also, it is safe to do
734 * this write if sfir=0. Still need to
735 * clear the primary. This just means
736 * there is no secondary FIR.
739 /* clear by mask the logged bit. */
740 fir_clr_addr
= (uid
<< 24) + 0x10;
741 __genwqe_writeq(cd
, fir_clr_addr
, mask
);
743 dev_dbg(&pci_dev
->dev
,
744 "[HM] Clearing primary FIR 0x%08x "
745 "with 0x%016llx\n", fir_clr_addr
,
750 gfir
= __genwqe_readq(cd
, IO_SLC_CFGREG_GFIR
);
751 if (gfir
== IO_ILLEGAL_VALUE
)
754 if ((gfir_masked
== 0x0) && (gfir
& GFIR_ERR_TRIGGER
)) {
756 * Check once more that it didn't go on after all the
759 dev_dbg(&pci_dev
->dev
, "ACK! Another FIR! Recursing %d!\n",
766 return IO_ILLEGAL_VALUE
;
770 * genwqe_pci_fundamental_reset() - trigger a PCIe fundamental reset on the slot
772 * Note: pci_set_pcie_reset_state() is not implemented on all archs, so this
773 * reset method will not work in all cases.
775 * Return: 0 on success or error code from pci_set_pcie_reset_state()
777 static int genwqe_pci_fundamental_reset(struct pci_dev
*pci_dev
)
782 * lock pci config space access from userspace,
783 * save state and issue PCIe fundamental reset
785 pci_cfg_access_lock(pci_dev
);
786 pci_save_state(pci_dev
);
787 rc
= pci_set_pcie_reset_state(pci_dev
, pcie_warm_reset
);
789 /* keep PCIe reset asserted for 250ms */
791 pci_set_pcie_reset_state(pci_dev
, pcie_deassert_reset
);
792 /* Wait for 2s to reload flash and train the link */
795 pci_restore_state(pci_dev
);
796 pci_cfg_access_unlock(pci_dev
);
801 static int genwqe_platform_recovery(struct genwqe_dev
*cd
)
803 struct pci_dev
*pci_dev
= cd
->pci_dev
;
806 dev_info(&pci_dev
->dev
,
807 "[%s] resetting card for error recovery\n", __func__
);
809 /* Clear out error injection flags */
810 cd
->err_inject
&= ~(GENWQE_INJECT_HARDWARE_FAILURE
|
811 GENWQE_INJECT_GFIR_FATAL
|
812 GENWQE_INJECT_GFIR_INFO
);
816 /* Try recovering the card with fundamental reset */
817 rc
= genwqe_pci_fundamental_reset(pci_dev
);
819 rc
= genwqe_start(cd
);
821 dev_info(&pci_dev
->dev
,
822 "[%s] card recovered\n", __func__
);
824 dev_err(&pci_dev
->dev
,
825 "[%s] err: cannot start card services! (err=%d)\n",
828 dev_err(&pci_dev
->dev
,
829 "[%s] card reset failed\n", __func__
);
836 * genwqe_reload_bistream() - reload card bitstream
838 * Set the appropriate register and call fundamental reset to reload the card
841 * Return: 0 on success, error code otherwise
843 static int genwqe_reload_bistream(struct genwqe_dev
*cd
)
845 struct pci_dev
*pci_dev
= cd
->pci_dev
;
848 dev_info(&pci_dev
->dev
,
849 "[%s] resetting card for bitstream reload\n",
855 * Cause a CPLD reprogram with the 'next_bitstream'
856 * partition on PCIe hot or fundamental reset
858 __genwqe_writeq(cd
, IO_SLC_CFGREG_SOFTRESET
,
859 (cd
->softreset
& 0xcull
) | 0x70ull
);
861 rc
= genwqe_pci_fundamental_reset(pci_dev
);
864 * A fundamental reset failure can be caused
865 * by lack of support on the arch, so we just
866 * log the error and try to start the card
869 dev_err(&pci_dev
->dev
,
870 "[%s] err: failed to reset card for bitstream reload\n",
874 rc
= genwqe_start(cd
);
876 dev_err(&pci_dev
->dev
,
877 "[%s] err: cannot start card services! (err=%d)\n",
881 dev_info(&pci_dev
->dev
,
882 "[%s] card reloaded\n", __func__
);
888 * genwqe_health_thread() - Health checking thread
890 * This thread is only started for the PF of the card.
892 * This thread monitors the health of the card. A critical situation
893 * is when we read registers which contain -1 (IO_ILLEGAL_VALUE). In
894 * this case we need to be recovered from outside. Writing to
895 * registers will very likely not work either.
897 * This thread must only exit if kthread_should_stop() becomes true.
899 * Condition for the health-thread to trigger:
900 * a) when a kthread_stop() request comes in or
901 * b) a critical GFIR occurred
903 * Informational GFIRs are checked and potentially printed in
904 * health_check_interval seconds.
906 static int genwqe_health_thread(void *data
)
908 int rc
, should_stop
= 0;
909 struct genwqe_dev
*cd
= data
;
910 struct pci_dev
*pci_dev
= cd
->pci_dev
;
911 u64 gfir
, gfir_masked
, slu_unitcfg
, app_unitcfg
;
914 while (!kthread_should_stop()) {
915 rc
= wait_event_interruptible_timeout(cd
->health_waitq
,
916 (genwqe_health_check_cond(cd
, &gfir
) ||
917 (should_stop
= kthread_should_stop())),
918 genwqe_health_check_interval
* HZ
);
923 if (gfir
== IO_ILLEGAL_VALUE
) {
924 dev_err(&pci_dev
->dev
,
925 "[%s] GFIR=%016llx\n", __func__
, gfir
);
929 slu_unitcfg
= __genwqe_readq(cd
, IO_SLU_UNITCFG
);
930 if (slu_unitcfg
== IO_ILLEGAL_VALUE
) {
931 dev_err(&pci_dev
->dev
,
932 "[%s] SLU_UNITCFG=%016llx\n",
933 __func__
, slu_unitcfg
);
937 app_unitcfg
= __genwqe_readq(cd
, IO_APP_UNITCFG
);
938 if (app_unitcfg
== IO_ILLEGAL_VALUE
) {
939 dev_err(&pci_dev
->dev
,
940 "[%s] APP_UNITCFG=%016llx\n",
941 __func__
, app_unitcfg
);
945 gfir
= __genwqe_readq(cd
, IO_SLC_CFGREG_GFIR
);
946 if (gfir
== IO_ILLEGAL_VALUE
) {
947 dev_err(&pci_dev
->dev
,
948 "[%s] %s: GFIR=%016llx\n", __func__
,
949 (gfir
& GFIR_ERR_TRIGGER
) ? "err" : "info",
954 gfir_masked
= genwqe_fir_checking(cd
);
955 if (gfir_masked
== IO_ILLEGAL_VALUE
)
959 * GFIR ErrorTrigger bits set => reset the card!
960 * Never do this for old/manufacturing images!
962 if ((gfir_masked
) && !cd
->skip_recovery
&&
963 genwqe_recovery_on_fatal_gfir_required(cd
)) {
965 cd
->card_state
= GENWQE_CARD_FATAL_ERROR
;
967 rc
= genwqe_recover_card(cd
, 0);
969 /* FIXME Card is unusable and needs unbind! */
974 if (cd
->card_state
== GENWQE_CARD_RELOAD_BITSTREAM
) {
975 /* Userspace requested card bitstream reload */
976 rc
= genwqe_reload_bistream(cd
);
981 cd
->last_gfir
= gfir
;
988 if (cd
->use_platform_recovery
) {
990 * Since we use raw accessors, EEH errors won't be detected
991 * by the platform until we do a non-raw MMIO or config space
994 readq(cd
->mmio
+ IO_SLC_CFGREG_GFIR
);
996 /* We do nothing if the card is going over PCI recovery */
997 if (pci_channel_offline(pci_dev
))
1001 * If it's supported by the platform, we try a fundamental reset
1002 * to recover from a fatal error. Otherwise, we continue to wait
1003 * for an external recovery procedure to take care of it.
1005 rc
= genwqe_platform_recovery(cd
);
1007 goto health_thread_begin
;
1010 dev_err(&pci_dev
->dev
,
1011 "[%s] card unusable. Please trigger unbind!\n", __func__
);
1013 /* Bring down logical devices to inform user space via udev remove. */
1014 cd
->card_state
= GENWQE_CARD_FATAL_ERROR
;
1017 /* genwqe_bus_reset() failed. Now wait for genwqe_remove(). */
1018 while (!kthread_should_stop())
1024 static int genwqe_health_check_start(struct genwqe_dev
*cd
)
1028 if (genwqe_health_check_interval
<= 0)
1029 return 0; /* valid for disabling the service */
1031 /* moved before request_irq() */
1032 /* init_waitqueue_head(&cd->health_waitq); */
1034 cd
->health_thread
= kthread_run(genwqe_health_thread
, cd
,
1035 GENWQE_DEVNAME
"%d_health",
1037 if (IS_ERR(cd
->health_thread
)) {
1038 rc
= PTR_ERR(cd
->health_thread
);
1039 cd
->health_thread
= NULL
;
1045 static int genwqe_health_thread_running(struct genwqe_dev
*cd
)
1047 return cd
->health_thread
!= NULL
;
1050 static int genwqe_health_check_stop(struct genwqe_dev
*cd
)
1054 if (!genwqe_health_thread_running(cd
))
1057 rc
= kthread_stop(cd
->health_thread
);
1058 cd
->health_thread
= NULL
;
1063 * genwqe_pci_setup() - Allocate PCIe related resources for our card
1065 static int genwqe_pci_setup(struct genwqe_dev
*cd
)
1068 struct pci_dev
*pci_dev
= cd
->pci_dev
;
1070 bars
= pci_select_bars(pci_dev
, IORESOURCE_MEM
);
1071 err
= pci_enable_device_mem(pci_dev
);
1073 dev_err(&pci_dev
->dev
,
1074 "err: failed to enable pci memory (err=%d)\n", err
);
1078 /* Reserve PCI I/O and memory resources */
1079 err
= pci_request_selected_regions(pci_dev
, bars
, genwqe_driver_name
);
1081 dev_err(&pci_dev
->dev
,
1082 "[%s] err: request bars failed (%d)\n", __func__
, err
);
1084 goto err_disable_device
;
1087 /* check for 64-bit DMA address supported (DAC) */
1088 if (!pci_set_dma_mask(pci_dev
, DMA_BIT_MASK(64))) {
1089 err
= pci_set_consistent_dma_mask(pci_dev
, DMA_BIT_MASK(64));
1091 dev_err(&pci_dev
->dev
,
1092 "err: DMA64 consistent mask error\n");
1094 goto out_release_resources
;
1096 /* check for 32-bit DMA address supported (SAC) */
1097 } else if (!pci_set_dma_mask(pci_dev
, DMA_BIT_MASK(32))) {
1098 err
= pci_set_consistent_dma_mask(pci_dev
, DMA_BIT_MASK(32));
1100 dev_err(&pci_dev
->dev
,
1101 "err: DMA32 consistent mask error\n");
1103 goto out_release_resources
;
1106 dev_err(&pci_dev
->dev
,
1107 "err: neither DMA32 nor DMA64 supported\n");
1109 goto out_release_resources
;
1112 pci_set_master(pci_dev
);
1113 pci_enable_pcie_error_reporting(pci_dev
);
1115 /* EEH recovery requires PCIe fundamental reset */
1116 pci_dev
->needs_freset
= 1;
1118 /* request complete BAR-0 space (length = 0) */
1119 cd
->mmio_len
= pci_resource_len(pci_dev
, 0);
1120 cd
->mmio
= pci_iomap(pci_dev
, 0, 0);
1121 if (cd
->mmio
== NULL
) {
1122 dev_err(&pci_dev
->dev
,
1123 "[%s] err: mapping BAR0 failed\n", __func__
);
1125 goto out_release_resources
;
1128 cd
->num_vfs
= pci_sriov_get_totalvfs(pci_dev
);
1130 err
= genwqe_read_ids(cd
);
1137 pci_iounmap(pci_dev
, cd
->mmio
);
1138 out_release_resources
:
1139 pci_release_selected_regions(pci_dev
, bars
);
1141 pci_disable_device(pci_dev
);
1147 * genwqe_pci_remove() - Free PCIe related resources for our card
1149 static void genwqe_pci_remove(struct genwqe_dev
*cd
)
1152 struct pci_dev
*pci_dev
= cd
->pci_dev
;
1155 pci_iounmap(pci_dev
, cd
->mmio
);
1157 bars
= pci_select_bars(pci_dev
, IORESOURCE_MEM
);
1158 pci_release_selected_regions(pci_dev
, bars
);
1159 pci_disable_device(pci_dev
);
1163 * genwqe_probe() - Device initialization
1164 * @pdev: PCI device information struct
1166 * Callable for multiple cards. This function is called on bind.
1168 * Return: 0 if succeeded, < 0 when failed
1170 static int genwqe_probe(struct pci_dev
*pci_dev
,
1171 const struct pci_device_id
*id
)
1174 struct genwqe_dev
*cd
;
1176 genwqe_init_crc32();
1178 cd
= genwqe_dev_alloc();
1180 dev_err(&pci_dev
->dev
, "err: could not alloc mem (err=%d)!\n",
1185 dev_set_drvdata(&pci_dev
->dev
, cd
);
1186 cd
->pci_dev
= pci_dev
;
1188 err
= genwqe_pci_setup(cd
);
1190 dev_err(&pci_dev
->dev
,
1191 "err: problems with PCI setup (err=%d)\n", err
);
1195 err
= genwqe_start(cd
);
1197 dev_err(&pci_dev
->dev
,
1198 "err: cannot start card services! (err=%d)\n", err
);
1199 goto out_pci_remove
;
1202 if (genwqe_is_privileged(cd
)) {
1203 err
= genwqe_health_check_start(cd
);
1205 dev_err(&pci_dev
->dev
,
1206 "err: cannot start health checking! "
1208 goto out_stop_services
;
1216 genwqe_pci_remove(cd
);
1218 genwqe_dev_free(cd
);
1223 * genwqe_remove() - Called when device is removed (hot-pluggable)
1225 * Or when driver is unloaded respectively when unbind is done.
1227 static void genwqe_remove(struct pci_dev
*pci_dev
)
1229 struct genwqe_dev
*cd
= dev_get_drvdata(&pci_dev
->dev
);
1231 genwqe_health_check_stop(cd
);
1234 * genwqe_stop() must survive if it is called twice
1235 * sequentially. This happens when the health thread calls it
1236 * and fails on genwqe_bus_reset().
1239 genwqe_pci_remove(cd
);
1240 genwqe_dev_free(cd
);
1244 * genwqe_err_error_detected() - Error detection callback
1246 * This callback is called by the PCI subsystem whenever a PCI bus
1247 * error is detected.
1249 static pci_ers_result_t
genwqe_err_error_detected(struct pci_dev
*pci_dev
,
1250 enum pci_channel_state state
)
1252 struct genwqe_dev
*cd
;
1254 dev_err(&pci_dev
->dev
, "[%s] state=%d\n", __func__
, state
);
1256 cd
= dev_get_drvdata(&pci_dev
->dev
);
1258 return PCI_ERS_RESULT_DISCONNECT
;
1261 genwqe_health_check_stop(cd
);
1265 * On permanent failure, the PCI code will call device remove
1266 * after the return of this function.
1267 * genwqe_stop() can be called twice.
1269 if (state
== pci_channel_io_perm_failure
) {
1270 return PCI_ERS_RESULT_DISCONNECT
;
1272 genwqe_pci_remove(cd
);
1273 return PCI_ERS_RESULT_NEED_RESET
;
1277 static pci_ers_result_t
genwqe_err_slot_reset(struct pci_dev
*pci_dev
)
1280 struct genwqe_dev
*cd
= dev_get_drvdata(&pci_dev
->dev
);
1282 rc
= genwqe_pci_setup(cd
);
1284 return PCI_ERS_RESULT_RECOVERED
;
1286 dev_err(&pci_dev
->dev
,
1287 "err: problems with PCI setup (err=%d)\n", rc
);
1288 return PCI_ERS_RESULT_DISCONNECT
;
1292 static pci_ers_result_t
genwqe_err_result_none(struct pci_dev
*dev
)
1294 return PCI_ERS_RESULT_NONE
;
1297 static void genwqe_err_resume(struct pci_dev
*pci_dev
)
1300 struct genwqe_dev
*cd
= dev_get_drvdata(&pci_dev
->dev
);
1302 rc
= genwqe_start(cd
);
1304 rc
= genwqe_health_check_start(cd
);
1306 dev_err(&pci_dev
->dev
,
1307 "err: cannot start health checking! (err=%d)\n",
1310 dev_err(&pci_dev
->dev
,
1311 "err: cannot start card services! (err=%d)\n", rc
);
1315 static int genwqe_sriov_configure(struct pci_dev
*dev
, int numvfs
)
1317 struct genwqe_dev
*cd
= dev_get_drvdata(&dev
->dev
);
1320 genwqe_setup_vf_jtimer(cd
);
1321 pci_enable_sriov(dev
, numvfs
);
1325 pci_disable_sriov(dev
);
1331 static struct pci_error_handlers genwqe_err_handler
= {
1332 .error_detected
= genwqe_err_error_detected
,
1333 .mmio_enabled
= genwqe_err_result_none
,
1334 .link_reset
= genwqe_err_result_none
,
1335 .slot_reset
= genwqe_err_slot_reset
,
1336 .resume
= genwqe_err_resume
,
1339 static struct pci_driver genwqe_driver
= {
1340 .name
= genwqe_driver_name
,
1341 .id_table
= genwqe_device_table
,
1342 .probe
= genwqe_probe
,
1343 .remove
= genwqe_remove
,
1344 .sriov_configure
= genwqe_sriov_configure
,
1345 .err_handler
= &genwqe_err_handler
,
1349 * genwqe_init_module() - Driver registration and initialization
1351 static int __init
genwqe_init_module(void)
1355 class_genwqe
= class_create(THIS_MODULE
, GENWQE_DEVNAME
);
1356 if (IS_ERR(class_genwqe
)) {
1357 pr_err("[%s] create class failed\n", __func__
);
1361 debugfs_genwqe
= debugfs_create_dir(GENWQE_DEVNAME
, NULL
);
1362 if (!debugfs_genwqe
) {
1367 rc
= pci_register_driver(&genwqe_driver
);
1369 pr_err("[%s] pci_reg_driver (rc=%d)\n", __func__
, rc
);
1376 debugfs_remove(debugfs_genwqe
);
1378 class_destroy(class_genwqe
);
1383 * genwqe_exit_module() - Driver exit
1385 static void __exit
genwqe_exit_module(void)
1387 pci_unregister_driver(&genwqe_driver
);
1388 debugfs_remove(debugfs_genwqe
);
1389 class_destroy(class_genwqe
);
/* Entry and exit points run on module load and unload. */
module_init(genwqe_init_module);
module_exit(genwqe_exit_module);