summary |
shortlog |
log |
commit | commitdiff |
tree
raw |
patch |
inline | side by side (from parent 1: bdcff1c)
BugLink: http://bugs.launchpad.net/bugs/1702521
An EEH during probe can lead to a crash as the recovery thread races with the
probe thread. To avoid this issue, introduce new states to fence out EEH
recovery until probe has completed. Also ensure the reset wait queue is
flushed during device removal to avoid orphaned threads.
Signed-off-by: Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
Signed-off-by: Uma Krishnan <ukrishn@linux.vnet.ibm.com>
Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
(cherry picked from commit 323e33428ea23bfb1ae5010b18b4540048b2ad51)
Signed-off-by: Victor Aoqui <victora@linux.vnet.ibm.com>
Acked-by: Stefan Bader <stefan.bader@canonical.com>
Acked-by: Seth Forshee <seth.forshee@canonical.com>
Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
+ STATE_PROBING, /* Initial state during probe */
+ STATE_PROBED, /* Temporary state, probe completed but EEH occurred */
STATE_NORMAL, /* Normal running state, everything good */
STATE_RESET, /* Reset state, trying to reset/recover */
STATE_FAILTERM /* Failed/terminating state, error out users/threads */
STATE_NORMAL, /* Normal running state, everything good */
STATE_RESET, /* Reset state, trying to reset/recover */
STATE_FAILTERM /* Failed/terminating state, error out users/threads */
spin_unlock_irqrestore(&cfg->tmf_slock, lock_flags);
switch (cfg->state) {
spin_unlock_irqrestore(&cfg->tmf_slock, lock_flags);
switch (cfg->state) {
+ case STATE_PROBING:
+ case STATE_PROBED:
case STATE_RESET:
dev_dbg_ratelimited(dev, "%s: device is in reset\n", __func__);
rc = SCSI_MLQUEUE_HOST_BUSY;
case STATE_RESET:
dev_dbg_ratelimited(dev, "%s: device is in reset\n", __func__);
rc = SCSI_MLQUEUE_HOST_BUSY;
* cxlflash_remove() - PCI entry point to tear down host
* @pdev: PCI device associated with the host.
*
* cxlflash_remove() - PCI entry point to tear down host
* @pdev: PCI device associated with the host.
*
- * Safe to use as a cleanup in partially allocated/initialized state.
+ * Safe to use as a cleanup in partially allocated/initialized state. Note that
+ * the reset_waitq is flushed as part of the stop/termination of user contexts.
*/
static void cxlflash_remove(struct pci_dev *pdev)
{
*/
static void cxlflash_remove(struct pci_dev *pdev)
{
case INIT_STATE_SCSI:
cxlflash_term_local_luns(cfg);
scsi_remove_host(cfg->host);
case INIT_STATE_SCSI:
cxlflash_term_local_luns(cfg);
scsi_remove_host(cfg->host);
case INIT_STATE_AFU:
term_afu(cfg);
case INIT_STATE_PCI:
case INIT_STATE_AFU:
term_afu(cfg);
case INIT_STATE_PCI:
* @pdev: PCI device associated with the host.
* @dev_id: PCI device id associated with device.
*
* @pdev: PCI device associated with the host.
* @dev_id: PCI device id associated with device.
*
+ * The device will initially start out in a 'probing' state and
+ * transition to the 'normal' state at the end of a successful
+ * probe. Should an EEH event occur during probe, the notification
+ * thread (error_detected()) will wait until the probe handler
+ * is nearly complete. At that time, the device will be moved to
+ * a 'probed' state and the EEH thread woken up to drive the slot
+ * reset and recovery (device moves to 'normal' state). Meanwhile,
+ * the probe will be allowed to exit successfully.
+ *
* Return: 0 on success, -errno on failure
*/
static int cxlflash_probe(struct pci_dev *pdev,
* Return: 0 on success, -errno on failure
*/
static int cxlflash_probe(struct pci_dev *pdev,
cfg->init_state = INIT_STATE_PCI;
rc = init_afu(cfg);
cfg->init_state = INIT_STATE_PCI;
rc = init_afu(cfg);
+ if (rc && !wq_has_sleeper(&cfg->reset_waitq)) {
dev_err(dev, "%s: init_afu failed rc=%d\n", __func__, rc);
goto out_remove;
}
dev_err(dev, "%s: init_afu failed rc=%d\n", __func__, rc);
goto out_remove;
}
}
cfg->init_state = INIT_STATE_SCSI;
}
cfg->init_state = INIT_STATE_SCSI;
+ if (wq_has_sleeper(&cfg->reset_waitq)) {
+ cfg->state = STATE_PROBED;
+ wake_up_all(&cfg->reset_waitq);
+ } else
+ cfg->state = STATE_NORMAL;
out:
dev_dbg(dev, "%s: returning rc=%d\n", __func__, rc);
return rc;
out:
dev_dbg(dev, "%s: returning rc=%d\n", __func__, rc);
return rc;
switch (state) {
case pci_channel_io_frozen:
switch (state) {
case pci_channel_io_frozen:
- wait_event(cfg->reset_waitq, cfg->state != STATE_RESET);
+ wait_event(cfg->reset_waitq, cfg->state != STATE_RESET &&
+ cfg->state != STATE_PROBING);
if (cfg->state == STATE_FAILTERM)
return PCI_ERS_RESULT_DISCONNECT;
if (cfg->state == STATE_FAILTERM)
return PCI_ERS_RESULT_DISCONNECT;
* memory freed. This is accomplished by putting the contexts in error
* state which will notify the user and let them 'drive' the tear down.
* Meanwhile, this routine camps until all user contexts have been removed.
* memory freed. This is accomplished by putting the contexts in error
* state which will notify the user and let them 'drive' the tear down.
* Meanwhile, this routine camps until all user contexts have been removed.
+ *
+ * Note that the main loop in this routine will always execute at least once
+ * to flush the reset_waitq.
*/
void cxlflash_stop_term_user_contexts(struct cxlflash_cfg *cfg)
{
struct device *dev = &cfg->dev->dev;
*/
void cxlflash_stop_term_user_contexts(struct cxlflash_cfg *cfg)
{
struct device *dev = &cfg->dev->dev;
cxlflash_mark_contexts_error(cfg);
while (true) {
cxlflash_mark_contexts_error(cfg);
while (true) {
for (i = 0; i < MAX_CONTEXT; i++)
if (cfg->ctx_tbl[i]) {
found = true;
for (i = 0; i < MAX_CONTEXT; i++)
if (cfg->ctx_tbl[i]) {
found = true;
__func__);
wake_up_all(&cfg->reset_waitq);
ssleep(1);
__func__);
wake_up_all(&cfg->reset_waitq);
ssleep(1);