scsi: cxlflash: Fence EEH during probe

author Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
Fri, 7 Jul 2017 16:05:29 +0000 (13:05 -0300)
committer Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
Fri, 14 Jul 2017 14:33:03 +0000 (11:33 -0300)
diff --git a/drivers/scsi/cxlflash/common.h b/drivers/scsi/cxlflash/common.h

index 28bb716e78fefc7351674c2fbc3cdba5b7873e9d..17aa74a83d39b1a7163432ee184ad38ee68a2061 100644 (file)
--- a/drivers/scsi/cxlflash/common.h
+++ b/drivers/scsi/cxlflash/common.h
@@ -90,6 +90,8 @@ enum cxlflash_init_state {
  };
  
  enum cxlflash_state {
+       STATE_PROBING,  /* Initial state during probe */
+       STATE_PROBED,   /* Temporary state, probe completed but EEH occurred */
         STATE_NORMAL,   /* Normal running state, everything good */
         STATE_RESET,    /* Reset state, trying to reset/recover */
         STATE_FAILTERM  /* Failed/terminating state, error out users/threads */
diff --git a/drivers/scsi/cxlflash/main.c b/drivers/scsi/cxlflash/main.c

index 568cd636607a3371f4701c1c9228a264eef94741..ebba3c90a2428353290b0c53797d142d9a8443ef 100644 (file)
--- a/drivers/scsi/cxlflash/main.c
+++ b/drivers/scsi/cxlflash/main.c
@@ -470,6 +470,8 @@ static int cxlflash_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scp)
         spin_unlock_irqrestore(&cfg->tmf_slock, lock_flags);
  
         switch (cfg->state) {
+       case STATE_PROBING:
+       case STATE_PROBED:
         case STATE_RESET:
                 dev_dbg_ratelimited(dev, "%s: device is in reset\n", __func__);
                 rc = SCSI_MLQUEUE_HOST_BUSY;
@@ -719,7 +721,8 @@ static void notify_shutdown(struct cxlflash_cfg *cfg, bool wait)
   * cxlflash_remove() - PCI entry point to tear down host
   * @pdev:      PCI device associated with the host.
   *
- * Safe to use as a cleanup in partially allocated/initialized state.
+ * Safe to use as a cleanup in partially allocated/initialized state. Note that
+ * the reset_waitq is flushed as part of the stop/termination of user contexts.
   */
  static void cxlflash_remove(struct pci_dev *pdev)
  {
@@ -752,7 +755,6 @@ static void cxlflash_remove(struct pci_dev *pdev)
         case INIT_STATE_SCSI:
                 cxlflash_term_local_luns(cfg);
                 scsi_remove_host(cfg->host);
-               /* fall through */
         case INIT_STATE_AFU:
                 term_afu(cfg);
         case INIT_STATE_PCI:
@@ -2624,6 +2626,15 @@ static void cxlflash_worker_thread(struct work_struct *work)
   * @pdev:      PCI device associated with the host.
   * @dev_id:    PCI device id associated with device.
   *
+ * The device will initially start out in a 'probing' state and
+ * transition to the 'normal' state at the end of a successful
+ * probe. Should an EEH event occur during probe, the notification
+ * thread (error_detected()) will wait until the probe handler
+ * is nearly complete. At that time, the device will be moved to
+ * a 'probed' state and the EEH thread woken up to drive the slot
+ * reset and recovery (device moves to 'normal' state). Meanwhile,
+ * the probe will be allowed to exit successfully.
+ *
   * Return: 0 on success, -errno on failure
   */
  static int cxlflash_probe(struct pci_dev *pdev,
@@ -2707,7 +2718,7 @@ static int cxlflash_probe(struct pci_dev *pdev,
         cfg->init_state = INIT_STATE_PCI;
  
         rc = init_afu(cfg);
-       if (rc) {
+       if (rc && !wq_has_sleeper(&cfg->reset_waitq)) {
                 dev_err(dev, "%s: init_afu failed rc=%d\n", __func__, rc);
                 goto out_remove;
         }
@@ -2720,6 +2731,11 @@ static int cxlflash_probe(struct pci_dev *pdev,
         }
         cfg->init_state = INIT_STATE_SCSI;
  
+       if (wq_has_sleeper(&cfg->reset_waitq)) {
+               cfg->state = STATE_PROBED;
+               wake_up_all(&cfg->reset_waitq);
+       } else
+               cfg->state = STATE_NORMAL;
  out:
         dev_dbg(dev, "%s: returning rc=%d\n", __func__, rc);
         return rc;
@@ -2750,7 +2766,8 @@ static pci_ers_result_t cxlflash_pci_error_detected(struct pci_dev *pdev,
  
         switch (state) {
         case pci_channel_io_frozen:
-               wait_event(cfg->reset_waitq, cfg->state != STATE_RESET);
+               wait_event(cfg->reset_waitq, cfg->state != STATE_RESET &&
+                                            cfg->state != STATE_PROBING);
                 if (cfg->state == STATE_FAILTERM)
                         return PCI_ERS_RESULT_DISCONNECT;
  
diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c

index 488330ffdf0260d3876cf7b4878866c384e24fb2..158fa0099c37578edc268fd3134f537fb49b974c 100644 (file)
--- a/drivers/scsi/cxlflash/superpipe.c
+++ b/drivers/scsi/cxlflash/superpipe.c
@@ -78,17 +78,18 @@ void cxlflash_free_errpage(void)
   * memory freed. This is accomplished by putting the contexts in error
   * state which will notify the user and let them 'drive' the tear down.
   * Meanwhile, this routine camps until all user contexts have been removed.
+ *
+ * Note that the main loop in this routine will always execute at least once
+ * to flush the reset_waitq.
   */
  void cxlflash_stop_term_user_contexts(struct cxlflash_cfg *cfg)
  {
         struct device *dev = &cfg->dev->dev;
-       int i, found;
+       int i, found = true;
  
         cxlflash_mark_contexts_error(cfg);
  
         while (true) {
-               found = false;
-
                 for (i = 0; i < MAX_CONTEXT; i++)
                         if (cfg->ctx_tbl[i]) {
                                 found = true;
@@ -102,6 +103,7 @@ void cxlflash_stop_term_user_contexts(struct cxlflash_cfg *cfg)
                         __func__);
                 wake_up_all(&cfg->reset_waitq);
                 ssleep(1);
+               found = false;
         }
  }
  
author	Matthew R. Ochs <mrochs@linux.vnet.ibm.com>
	Fri, 7 Jul 2017 16:05:29 +0000 (13:05 -0300)
committer	Thadeu Lima de Souza Cascardo <cascardo@canonical.com>
	Fri, 14 Jul 2017 14:33:03 +0000 (11:33 -0300)
drivers/scsi/cxlflash/common.h		patch | blob | blame | history
drivers/scsi/cxlflash/main.c		patch | blob | blame | history
drivers/scsi/cxlflash/superpipe.c		patch | blob | blame | history