nvme-pci: Use host managed power state for suspend
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4276ebfff22ba00fd90e8c241cc7edd56deca353..427239f3ad3640880b03cba814e51dfeb1b210ca 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -25,6 +25,7 @@
 #include <linux/mutex.h>
 #include <linux/once.h>
 #include <linux/pci.h>
+#include <linux/suspend.h>
 #include <linux/t10-pi.h>
 #include <linux/types.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
@@ -98,6 +99,7 @@ struct nvme_dev {
        u32 cmbloc;
        struct nvme_ctrl ctrl;
        struct completion ioq_wait;
+       u32 last_ps;
 
        /* shadow doorbell buffer support: */
        u32 *dbbuf_dbs;
@@ -305,6 +307,14 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
                old_value = *dbbuf_db;
                *dbbuf_db = value;
 
+               /*
+                * Ensure that the doorbell is updated before reading the event
+                * index from memory.  The controller needs to provide similar
+                * ordering to ensure the event index is updated before reading
+                * the doorbell.
+                */
+               mb();
+
                if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
                        return false;
        }
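
A minimal user-space sketch of the ordering the new comment requires. The names db_shadow and event_idx are hypothetical stand-ins for one shadow-doorbell/event-index pair, and __sync_synchronize() stands in for mb():

    #include <stdbool.h>
    #include <stdint.h>

    /* Hypothetical stand-ins for one shadow doorbell / event index pair. */
    static volatile uint16_t db_shadow;   /* host writes, controller reads */
    static volatile uint16_t event_idx;   /* controller writes, host reads */

    /* Returns true if the host must still ring the real MMIO doorbell. */
    static bool db_update_needs_event(uint16_t new_val, uint16_t old_val)
    {
            uint16_t ei;

            db_shadow = new_val;      /* 1. publish the new doorbell value */
            __sync_synchronize();     /* 2. full barrier, like mb()        */
            ei = event_idx;           /* 3. only then read the event index */

            /* Same test as nvme_dbbuf_need_event(): did new_val pass ei? */
            return (uint16_t)(new_val - ei - 1) < (uint16_t)(new_val - old_val);
    }

    int main(void) { return db_update_needs_event(1, 0) ? 0 : 1; }

If the load at step 3 were reordered before the store at step 1, host and controller could each conclude the other had already been notified, both would skip the doorbell, and the queue would stall.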
@@ -965,9 +975,11 @@ static inline bool nvme_read_cqe(struct nvme_queue *nvmeq,
        if (nvme_cqe_valid(nvmeq, nvmeq->cq_head, nvmeq->cq_phase)) {
                *cqe = nvmeq->cqes[nvmeq->cq_head];
 
-               if (++nvmeq->cq_head == nvmeq->q_depth) {
+               if (nvmeq->cq_head == nvmeq->q_depth - 1) {
                        nvmeq->cq_head = 0;
                        nvmeq->cq_phase = !nvmeq->cq_phase;
+               } else {
+                       nvmeq->cq_head++;
                }
                return true;
        }
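
The rewritten wrap logic never stores q_depth into cq_head, not even transiently, so anything indexing cqes[cq_head] cannot observe an out-of-range value. A standalone sketch of the compare-then-increment form:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define Q_DEPTH 4

    int main(void)
    {
            uint16_t head = 0;
            int phase = 1;

            for (int i = 0; i < 10; i++) {
                    /* a consumer would read cqes[head] here */
                    if (head == Q_DEPTH - 1) {
                            head = 0;
                            phase = !phase;  /* phase flips on each wrap */
                    } else {
                            head++;
                    }
                    assert(head < Q_DEPTH);  /* never transiently Q_DEPTH */
            }
            printf("final head=%u phase=%d\n", head, phase);
            return 0;
    }

With the old ++head form, head briefly held Q_DEPTH before being reset to 0, a transient out-of-range value that no reader of cq_head should ever see.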
@@ -1070,9 +1082,18 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id)
 static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid,
                                                struct nvme_queue *nvmeq)
 {
+       struct nvme_ctrl *ctrl = &dev->ctrl;
        struct nvme_command c;
        int flags = NVME_QUEUE_PHYS_CONTIG;
 
+       /*
+        * Some drives have a bug that auto-enables WRRU if MEDIUM isn't
+        * set. Since URGENT priority is zeroes, it makes all queues
+        * URGENT.
+        */
+       if (ctrl->quirks & NVME_QUIRK_MEDIUM_PRIO_SQ)
+               flags |= NVME_SQ_PRIO_MEDIUM;
+
        /*
         * Note: we (ab)use the fact that the prp fields survive if no data
         * is attached to the request.
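
For reference: in the Create I/O SQ command, the queue priority (QPRIO) occupies bits 02:01 of the flags word, so an all-zero priority field silently encodes URGENT on parts that wrongly auto-enable weighted round robin with urgent (WRRU). A small sketch using the flag values as defined in include/linux/nvme.h:

    #include <stdio.h>

    /* Create I/O SQ flag bits, values as in include/linux/nvme.h. */
    #define NVME_QUEUE_PHYS_CONTIG  (1 << 0)
    #define NVME_SQ_PRIO_URGENT     (0 << 1)   /* QPRIO = 00b, all zeroes */
    #define NVME_SQ_PRIO_HIGH       (1 << 1)   /* QPRIO = 01b */
    #define NVME_SQ_PRIO_MEDIUM     (2 << 1)   /* QPRIO = 10b */
    #define NVME_SQ_PRIO_LOW        (3 << 1)   /* QPRIO = 11b */

    int main(void)
    {
            unsigned int flags = NVME_QUEUE_PHYS_CONTIG;

            /* Default flags leave QPRIO zero, i.e. URGENT by accident. */
            printf("default: 0x%02x QPRIO=%u\n", flags, (flags >> 1) & 3);

            /* The quirk ORs in MEDIUM so buggy firmware sees 10b. */
            flags |= NVME_SQ_PRIO_MEDIUM;
            printf("quirked: 0x%02x QPRIO=%u\n", flags, (flags >> 1) & 3);
            return 0;
    }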
@@ -1148,12 +1169,6 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
        if (!(csts & NVME_CSTS_CFS) && !nssro)
                return false;
 
-       /* If PCI error recovery process is happening, we cannot reset or
-        * the recovery mechanism will surely fail.
-        */
-       if (pci_channel_offline(to_pci_dev(dev->dev)))
-               return false;
-
        return true;
 }
 
@@ -1184,6 +1199,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
        struct nvme_command cmd;
        u32 csts = readl(dev->bar + NVME_REG_CSTS);
 
+       /* If PCI error recovery process is happening, we cannot reset or
+        * the recovery mechanism will surely fail.
+        */
+       mb();
+       if (pci_channel_offline(to_pci_dev(dev->dev)))
+               return BLK_EH_RESET_TIMER;
+
        /*
         * Reset immediately if the controller is failed
         */
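
Context for the move: once a PCI channel is offline, MMIO reads generally return all ones, so the CSTS read above yields a value with the fatal-status bit set, and nvme_should_reset() would have kicked off a reset in the middle of EEH recovery. A schematic sketch of the idea (the real handler also honors NSSRO and defers by returning BLK_EH_RESET_TIMER rather than just declining):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define NVME_CSTS_CFS (1 << 1)   /* Controller Fatal Status */

    /* Decide whether a timed-out command may trigger a reset. An
     * offline channel reads MMIO as all ones, so CSTS would look
     * fatal; check the channel state before trusting the register. */
    static bool may_reset(uint32_t csts, bool channel_offline)
    {
            if (channel_offline)
                    return false;    /* error recovery owns the device */
            return csts & NVME_CSTS_CFS;
    }

    int main(void)
    {
            printf("offline read: %d\n", may_reset(0xffffffff, true));
            printf("real fault:   %d\n", may_reset(NVME_CSTS_CFS, false));
            return 0;
    }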
@@ -1367,17 +1389,15 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
 static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
                                int qid, int depth)
 {
-       if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
-               unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
-                                                     dev->ctrl.page_size);
-               nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
-               nvmeq->sq_cmds_io = dev->cmb + offset;
-       } else {
-               nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
-                                       &nvmeq->sq_dma_addr, GFP_KERNEL);
-               if (!nvmeq->sq_cmds)
-                       return -ENOMEM;
-       }
+
+       /* CMB SQEs will be mapped before creation */
+       if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz))
+               return 0;
+
+       nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
+                                           &nvmeq->sq_dma_addr, GFP_KERNEL);
+       if (!nvmeq->sq_cmds)
+               return -ENOMEM;
 
        return 0;
 }
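
nvme_create_queue() (next hunk) now performs the CMB mapping at queue-creation time, computing the same per-queue offset the deleted branch used: qid - 1 slots of SQ_SIZE(depth), rounded up to the controller page size. A standalone sketch of that arithmetic, assuming a 64-byte struct nvme_command and a 4 KiB controller page:

    #include <stdio.h>

    /* roundup() as used here: align is a power of two. */
    static unsigned long roundup2(unsigned long x, unsigned long align)
    {
            return (x + align - 1) & ~(align - 1);
    }

    int main(void)
    {
            const unsigned long depth = 1024;         /* queue depth    */
            const unsigned long sq_size = depth * 64; /* SQ_SIZE(depth) */
            const unsigned long page = 4096;          /* ctrl.page_size */

            for (unsigned int qid = 1; qid <= 3; qid++)
                    printf("qid %u -> CMB offset %lu\n", qid,
                           (qid - 1) * roundup2(sq_size, page));
            return 0;
    }

With depth 1024 each SQ occupies exactly 16 pages (65536 bytes), so successive queues land at 0, 65536, 131072, and so on inside the CMB.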
@@ -1454,10 +1474,17 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
        struct nvme_dev *dev = nvmeq->dev;
        int result;
 
+       if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
+               unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth),
+                                                     dev->ctrl.page_size);
+               nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
+               nvmeq->sq_cmds_io = dev->cmb + offset;
+       }
+
        nvmeq->cq_vector = qid - 1;
        result = adapter_alloc_cq(dev, qid, nvmeq);
        if (result < 0)
-               return result;
+               goto release_vector;
 
        result = adapter_alloc_sq(dev, qid, nvmeq);
        if (result < 0)
@@ -1471,9 +1498,12 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid)
        return result;
 
  release_sq:
+       dev->online_queues--;
        adapter_delete_sq(dev, qid);
  release_cq:
        adapter_delete_cq(dev, qid);
+ release_vector:
+       nvmeq->cq_vector = -1;
        return result;
 }
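
Schematically, the unwind order this hunk establishes, with hypothetical stubs standing in for the admin commands (in the real code the step after adapter_alloc_sq that can fail is queue_request_irq(), not an init_queue() helper):

    #include <stdio.h>

    static int online_queues, cq_vector = -1;

    /* Hypothetical stubs; alloc_sq fails for qid 3, init for qid 4. */
    static int  alloc_cq(int qid)   { (void)qid; return 0; }
    static int  alloc_sq(int qid)   { return qid == 3 ? -5 : 0; }
    static int  init_queue(int qid) { online_queues++; return qid == 4 ? -5 : 0; }
    static void delete_sq(int qid)  { printf("undo sq %d\n", qid); }
    static void delete_cq(int qid)  { printf("undo cq %d\n", qid); }

    static int create_queue(int qid)
    {
            int result;

            cq_vector = qid - 1;            /* claimed before alloc_cq */

            result = alloc_cq(qid);
            if (result < 0)
                    goto release_vector;

            result = alloc_sq(qid);
            if (result < 0)
                    goto release_cq;

            result = init_queue(qid);
            if (result < 0)
                    goto release_sq;
            return 0;

     release_sq:                            /* undo in reverse order */
            online_queues--;
            delete_sq(qid);
     release_cq:
            delete_cq(qid);
     release_vector:
            cq_vector = -1;                 /* queue holds no IRQ vector */
            return result;
    }

    int main(void)
    {
            printf("qid 3 -> %d\n", create_queue(3));
            printf("qid 4 -> %d\n", create_queue(4));
            return 0;
    }

Resetting cq_vector to -1 on every failure path matters because later teardown uses that sentinel to decide whether an IRQ vector must be released.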
 
@@ -1897,7 +1927,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
        int result, nr_io_queues;
        unsigned long size;
 
-       nr_io_queues = num_present_cpus();
+       nr_io_queues = num_possible_cpus();
        result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues);
        if (result < 0)
                return result;
@@ -2299,6 +2329,7 @@ static void nvme_reset_work(struct work_struct *work)
         */
        if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
                nvme_dev_disable(dev, false);
+       nvme_sync_queues(&dev->ctrl);
 
        result = nvme_pci_enable(dev);
        if (result)
@@ -2445,10 +2476,13 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
        } else if (pdev->vendor == 0x144d && pdev->device == 0xa804) {
                /*
                 * Samsung SSD 960 EVO drops off the PCIe bus after system
-                * suspend on a Ryzen board, ASUS PRIME B350M-A.
+                * suspend on a Ryzen board, ASUS PRIME B350M-A, as well as
+                * within a few minutes after bootup on a Coffee Lake board -
+                * ASUS PRIME Z370-A
                 */
                if (dmi_match(DMI_BOARD_VENDOR, "ASUSTeK COMPUTER INC.") &&
-                   dmi_match(DMI_BOARD_NAME, "PRIME B350M-A"))
+                   (dmi_match(DMI_BOARD_NAME, "PRIME B350M-A") ||
+                    dmi_match(DMI_BOARD_NAME, "PRIME Z370-A")))
                        return NVME_QUIRK_NO_APST;
        }
 
@@ -2583,16 +2617,93 @@ static int nvme_pci_sriov_configure(struct pci_dev *pdev, int numvfs)
 }
 
 #ifdef CONFIG_PM_SLEEP
+static int nvme_get_power_state(struct nvme_ctrl *ctrl, u32 *ps)
+{
+       return nvme_get_features(ctrl, NVME_FEAT_POWER_MGMT, 0, NULL, 0, ps);
+}
+
+static int nvme_set_power_state(struct nvme_ctrl *ctrl, u32 ps)
+{
+       return nvme_set_features(ctrl, NVME_FEAT_POWER_MGMT, ps, NULL, 0, NULL);
+}
+
+static int nvme_resume(struct device *dev)
+{
+       struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
+       struct nvme_ctrl *ctrl = &ndev->ctrl;
+
+       if (pm_resume_via_firmware() || !ctrl->npss ||
+           nvme_set_power_state(ctrl, ndev->last_ps) != 0)
+               nvme_reset_ctrl(ctrl);
+       return 0;
+}
+
 static int nvme_suspend(struct device *dev)
 {
        struct pci_dev *pdev = to_pci_dev(dev);
        struct nvme_dev *ndev = pci_get_drvdata(pdev);
+       struct nvme_ctrl *ctrl = &ndev->ctrl;
+       int ret = -EBUSY;
+
+       /*
+        * The platform does not remove power for a kernel managed suspend so
+        * use host managed nvme power settings for lowest idle power if
+        * possible. This should have quicker resume latency than a full device
+        * shutdown.  But if the firmware is involved after the suspend or the
+        * device does not support any non-default power states, shut down the
+        * device fully.
+        */
+       if (pm_suspend_via_firmware() || !ctrl->npss) {
+               nvme_dev_disable(ndev, true);
+               return 0;
+       }
+
+       nvme_start_freeze(ctrl);
+       nvme_wait_freeze(ctrl);
+       nvme_sync_queues(ctrl);
+
+       if (ctrl->state != NVME_CTRL_LIVE)
+               goto unfreeze;
+
+       ndev->last_ps = 0;
+       ret = nvme_get_power_state(ctrl, &ndev->last_ps);
+       if (ret < 0)
+               goto unfreeze;
+
+       ret = nvme_set_power_state(ctrl, ctrl->npss);
+       if (ret < 0)
+               goto unfreeze;
+
+       if (ret) {
+               /*
+                * Clearing npss forces a controller reset on resume. The
+                * correct value will be re-discovered then.
+                */
+               nvme_dev_disable(ndev, true);
+               ctrl->npss = 0;
+               ret = 0;
+               goto unfreeze;
+       }
+       /*
+        * A saved state prevents pci pm from generically controlling the
+        * device's power. If we're using protocol specific settings, we don't
+        * want pci interfering.
+        */
+       pci_save_state(pdev);
+unfreeze:
+       nvme_unfreeze(ctrl);
+       return ret;
+}
+
+static int nvme_simple_suspend(struct device *dev)
+{
+       struct nvme_dev *ndev = pci_get_drvdata(to_pci_dev(dev));
 
        nvme_dev_disable(ndev, true);
        return 0;
 }
 
-static int nvme_resume(struct device *dev)
+static int nvme_simple_resume(struct device *dev)
 {
        struct pci_dev *pdev = to_pci_dev(dev);
        struct nvme_dev *ndev = pci_get_drvdata(pdev);
@@ -2600,9 +2711,19 @@ static int nvme_resume(struct device *dev)
        nvme_reset_ctrl(&ndev->ctrl);
        return 0;
 }
-#endif
 
-static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume);
+static const struct dev_pm_ops nvme_dev_pm_ops = {
+       .suspend        = nvme_suspend,
+       .resume         = nvme_resume,
+       .freeze         = nvme_simple_suspend,
+       .thaw           = nvme_simple_resume,
+       .poweroff       = nvme_simple_suspend,
+       .restore        = nvme_simple_resume,
+};
+
+#else
+#define nvme_dev_pm_ops                NULL
+#endif
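
To make the get/set wrappers above concrete: NVME_FEAT_POWER_MGMT is feature 02h, and per the NVMe spec's Power Management feature layout the power state travels in bits 04:00 of (c)dword 11, so last_ps round-trips as a small integer between nvme_get_power_state() and nvme_set_power_state(). A hedged sketch of that encoding:

    #include <stdint.h>
    #include <stdio.h>

    #define NVME_FEAT_POWER_MGMT 0x02   /* Get/Set Features FID       */
    #define NVME_PS_MASK         0x1f   /* PS: bits 04:00 of dword 11 */

    int main(void)
    {
            uint32_t npss = 4;       /* deepest state, from Identify      */
            uint32_t last_ps = 0;    /* would come from Get Features      */

            /* Suspend: save the active state, then request the deepest. */
            printf("suspend: Set Features FID 0x%02x, dword11 = 0x%02x\n",
                   NVME_FEAT_POWER_MGMT, npss & NVME_PS_MASK);

            /* Resume: restore what was saved, else fall back to reset.  */
            printf("resume:  Set Features FID 0x%02x, dword11 = 0x%02x\n",
                   NVME_FEAT_POWER_MGMT, last_ps & NVME_PS_MASK);
            return 0;
    }

The fallback logic mirrors the hunk above: if firmware is involved in the transition, if the controller reports npss == 0, or if restoring the state fails, the driver gives up on host-managed power states and does a full disable/reset cycle instead.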
 
 static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev,
                                                pci_channel_state_t state)
@@ -2642,6 +2763,9 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev)
 
 static void nvme_error_resume(struct pci_dev *pdev)
 {
+       struct nvme_dev *dev = pci_get_drvdata(pdev);
+
+       flush_work(&dev->ctrl.reset_work);
        pci_cleanup_aer_uncorrect_error_status(pdev);
 }
 
@@ -2667,7 +2791,10 @@ static const struct pci_device_id nvme_id_table[] = {
                .driver_data = NVME_QUIRK_STRIPE_SIZE |
                                NVME_QUIRK_DEALLOCATE_ZEROES, },
        { PCI_VDEVICE(INTEL, 0xf1a5),   /* Intel 600P/P3100 */
-               .driver_data = NVME_QUIRK_NO_DEEPEST_PS },
+               .driver_data = NVME_QUIRK_NO_DEEPEST_PS |
+                               NVME_QUIRK_MEDIUM_PRIO_SQ },
+       { PCI_VDEVICE(INTEL, 0xf1a6),   /* Intel 760p/Pro 7600p */
+               .driver_data = NVME_QUIRK_IGNORE_DEV_SUBNQN, },
        { PCI_VDEVICE(INTEL, 0x5845),   /* Qemu emulated controller */
                .driver_data = NVME_QUIRK_IDENTIFY_CNS, },
        { PCI_DEVICE(0x1c58, 0x0003),   /* HGST adapter */
@@ -2684,6 +2811,8 @@ static const struct pci_device_id nvme_id_table[] = {
                .driver_data = NVME_QUIRK_LIGHTNVM, },
        { PCI_DEVICE(0x1d1d, 0x2807),   /* CNEX WL */
                .driver_data = NVME_QUIRK_LIGHTNVM, },
+       { PCI_DEVICE(0x1d1d, 0x2601),   /* CNEX Granby */
+               .driver_data = NVME_QUIRK_LIGHTNVM, },
        { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
        { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
        { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },