ibmvnic: Allow queueing resets during probe
author     Sukadev Bhattiprolu <sukadev@linux.ibm.com>
           Fri, 25 Feb 2022 06:23:58 +0000 (22:23 -0800)
committer  David S. Miller <davem@davemloft.net>
           Fri, 25 Feb 2022 10:57:47 +0000 (10:57 +0000)
We currently don't allow queueing resets while the adapter is in the
VNIC_PROBING state - instead we throw away the reset and return EBUSY.
The reasoning is probably that during ibmvnic_probe() the ibmvnic_adapter
itself is being initialized, so performing a reset during this time could
access fields in the ibmvnic_adapter that are not fully initialized.
A review of the code shows that all the adapter state needed to process
a reset is initialized before registering the CRQ, so that should no
longer be a concern.

Further, the expectation is that if we do get a reset (transport event)
during probe, the do..while() loop in ibmvnic_probe() will handle it
by reinitializing the CRQ.

While that is true to some extent, it is possible that the reset occurs
_after_ the CRQ is registered and the CRQ_INIT message has been exchanged,
but _before_ the adapter state is set to VNIC_PROBED. As mentioned above,
such a reset will be thrown away. The client then assumes that the adapter
is functional, while the vnic server waits for the client to reinitialize
the adapter. This disconnect between the two leaves the adapter down,
needing manual intervention.

Because ibmvnic_probe() has other work to do after initializing the CRQ
(such as registering the netdev, at a minimum) and because the reset event
can occur at any instant after the CRQ is initialized, there will always
be a window between initializing the CRQ and considering the adapter
ready for resets (i.e. state == VNIC_PROBED).

So rather than discarding resets during this window, allow queueing them
- but only process them after the adapter is fully initialized.

To do this, introduce a new completion, ->probe_done, and have the reset
worker thread wait on it before processing resets.
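
In outline, the start of the worker thread then looks like this (a
condensed sketch of the __ibmvnic_reset() hunk below, with the rest of
the reset loop omitted):

        adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset);
        dev = &adapter->vdev->dev;

        /* Don't touch the adapter until ibmvnic_probe() is done with it */
        if (adapter->state == VNIC_PROBING &&
            !wait_for_completion_timeout(&adapter->probe_done, timeout)) {
                dev_err(dev, "Reset thread timed out on probe");
                queue_delayed_work(system_long_wq,
                                   &adapter->ibmvnic_delayed_reset,
                                   IBMVNIC_RESET_DELAY);
                return;
        }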

This change brings up two new situations in, or just after, ibmvnic_probe().

First, after one or more resets have been queued, we encounter an error and
decide to retry the initialization. At that point the queued resets are no
longer relevant, since we could be talking to a new vnic server. So we must
purge/flush the queued resets before restarting the initialization. As a
side note, since we are still in the probing stage and have not registered
the netdev, it will not be a CHANGE_PARAM reset.
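
Condensed from the ibmvnic_probe() retry path in the diff below, the
flush looks roughly like:

        /* Drop the old CRQ so no new transport events can queue resets,
         * then purge whatever is already on the queue before retrying.
         */
        release_crq_queue(adapter);

        spin_lock_irqsave(&adapter->rwi_lock, flags);
        flush_reset_queue(adapter);
        spin_unlock_irqrestore(&adapter->rwi_lock, flags);

        rc = init_crq_queue(adapter);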

Second, this change opens up a potential race between the worker thread
in __ibmvnic_reset(), the tasklet, and ibmvnic_open(), due to the
following sequence of events:

1. Register CRQ
2. Get transport event before CRQ_INIT completes.
3. Tasklet schedules reset:
a) add rwi to list
b) schedule_work() to start worker thread which runs
   and waits for ->probe_done.
4. ibmvnic_probe() decides to retry, purges rwi_list
5. Re-register crq and this time rest of probe succeeds - register
   netdev and complete(->probe_done).
6. Worker thread resumes in __ibmvnic_reset() from 3b.
7. Worker thread sets ->resetting bit
8. ibmvnic_open() comes in, notices ->resetting bit, sets state
   to IBMVNIC_OPEN and returns early expecting worker thread to
   finish the open.
9. Worker thread finds rwi_list empty and returns without
   opening the interface.

If this happens, the ->ndo_open() call is effectively lost and the
interface remains down. To address this, ensure that ->rwi_list is
not empty before setting the ->resetting bit. See also comments in
__ibmvnic_reset().
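
Condensed from the __ibmvnic_reset() hunk below, the worker now only
claims the ->resetting bit when there is actually work queued:

        need_reset = false;
        spin_lock(&adapter->rwi_lock);
        if (!list_empty(&adapter->rwi_list)) {
                if (test_and_set_bit_lock(0, &adapter->resetting)) {
                        /* another reset is in progress - try again later */
                        queue_delayed_work(system_long_wq,
                                           &adapter->ibmvnic_delayed_reset,
                                           IBMVNIC_RESET_DELAY);
                } else {
                        need_reset = true;
                }
        }
        spin_unlock(&adapter->rwi_lock);

        if (!need_reset)
                return;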

Fixes: 6a2fb0e99f9c ("ibmvnic: driver initialization for kdump/kexec")
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
drivers/net/ethernet/ibm/ibmvnic.c
drivers/net/ethernet/ibm/ibmvnic.h

index 93580c68600a872a1fd4c3b9f8d9401d9bfb520c..b423e94956f1088a1b27afa963597e1873f8c4fc 100644 (file)
@@ -2618,23 +2618,82 @@ out:
 static void __ibmvnic_reset(struct work_struct *work)
 {
        struct ibmvnic_adapter *adapter;
-       bool saved_state = false;
+       unsigned int timeout = 5000;
        struct ibmvnic_rwi *tmprwi;
+       bool saved_state = false;
        struct ibmvnic_rwi *rwi;
        unsigned long flags;
-       u32 reset_state;
+       struct device *dev;
+       bool need_reset;
        int num_fails = 0;
+       u32 reset_state;
        int rc = 0;
 
        adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset);
+       dev = &adapter->vdev->dev;
 
-       if (test_and_set_bit_lock(0, &adapter->resetting)) {
+       /* Wait for ibmvnic_probe() to complete. If probe is taking too long
+        * or if another reset is in progress, defer work for now. If probe
+        * eventually fails it will flush and terminate our work.
+        *
+        * Three possibilities here:
+        * 1. Adapter being removed - just return
+        * 2. Timed out on probe or another reset in progress - delay the work
+        * 3. Completed probe - perform any resets in queue
+        */
+       if (adapter->state == VNIC_PROBING &&
+           !wait_for_completion_timeout(&adapter->probe_done, timeout)) {
+               dev_err(dev, "Reset thread timed out on probe");
                queue_delayed_work(system_long_wq,
                                   &adapter->ibmvnic_delayed_reset,
                                   IBMVNIC_RESET_DELAY);
                return;
        }
 
+       /* adapter is done with probe (i.e state is never VNIC_PROBING now) */
+       if (adapter->state == VNIC_REMOVING)
+               return;
+
+       /* ->rwi_list is stable now (no one else is removing entries) */
+
+       /* ibmvnic_probe() may have purged the reset queue after we were
+        * scheduled to process a reset so there may be no resets to process.
+        * Before setting the ->resetting bit though, we have to make sure
+        * that there is in fact a reset to process. Otherwise we may race
+        * with ibmvnic_open() and end up leaving the vnic down:
+        *
+        *      __ibmvnic_reset()           ibmvnic_open()
+        *      -----------------           --------------
+        *
+        *  set ->resetting bit
+        *                              find ->resetting bit is set
+        *                              set ->state to IBMVNIC_OPEN (i.e
+        *                              assume reset will open device)
+        *                              return
+        *  find reset queue empty
+        *  return
+        *
+        *      Neither performed vnic login/open and vnic stays down
+        *
+        * If we hold the lock and conditionally set the bit, either we
+        * or ibmvnic_open() will complete the open.
+        */
+       need_reset = false;
+       spin_lock(&adapter->rwi_lock);
+       if (!list_empty(&adapter->rwi_list)) {
+               if (test_and_set_bit_lock(0, &adapter->resetting)) {
+                       queue_delayed_work(system_long_wq,
+                                          &adapter->ibmvnic_delayed_reset,
+                                          IBMVNIC_RESET_DELAY);
+               } else {
+                       need_reset = true;
+               }
+       }
+       spin_unlock(&adapter->rwi_lock);
+
+       if (!need_reset)
+               return;
+
        rwi = get_next_rwi(adapter);
        while (rwi) {
                spin_lock_irqsave(&adapter->state_lock, flags);
@@ -2786,13 +2845,6 @@ static int ibmvnic_reset(struct ibmvnic_adapter *adapter,
                goto err;
        }
 
-       if (adapter->state == VNIC_PROBING) {
-               netdev_warn(netdev, "Adapter reset during probe\n");
-               adapter->init_done_rc = -EAGAIN;
-               ret = EAGAIN;
-               goto err;
-       }
-
        list_for_each_entry(tmp, &adapter->rwi_list, list) {
                if (tmp->reset_reason == reason) {
                        netdev_dbg(netdev, "Skipping matching reset, reason=%s\n",
@@ -5755,6 +5807,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
        struct ibmvnic_adapter *adapter;
        struct net_device *netdev;
        unsigned char *mac_addr_p;
+       unsigned long flags;
        bool init_success;
        int rc;
 
@@ -5799,6 +5852,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
        spin_lock_init(&adapter->rwi_lock);
        spin_lock_init(&adapter->state_lock);
        mutex_init(&adapter->fw_lock);
+       init_completion(&adapter->probe_done);
        init_completion(&adapter->init_done);
        init_completion(&adapter->fw_done);
        init_completion(&adapter->reset_done);
@@ -5816,6 +5870,26 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
                 */
                adapter->failover_pending = false;
 
+               /* If we had already initialized CRQ, we may have one or
+                * more resets queued already. Discard those and release
+                * the CRQ before initializing the CRQ again.
+                */
+               release_crq_queue(adapter);
+
+               /* Since we are still in PROBING state, __ibmvnic_reset()
+                * will not access the ->rwi_list and since we released CRQ,
+                * we won't get _new_ transport events. But there may be an
+                * ongoing ibmvnic_reset() call. So serialize access to
+                * rwi_list. If we win the race, ibmvnic_reset() could add
+                * a reset after we purged but that's ok - we just may end
+                * up with an extra reset (i.e. similar to having two or more
+                * resets in the queue at once).
+                */
+               spin_lock_irqsave(&adapter->rwi_lock, flags);
+               flush_reset_queue(adapter);
+               spin_unlock_irqrestore(&adapter->rwi_lock, flags);
+
                rc = init_crq_queue(adapter);
                if (rc) {
                        dev_err(&dev->dev, "Couldn't initialize crq. rc=%d\n",
@@ -5867,6 +5941,8 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id)
        }
        dev_info(&dev->dev, "ibmvnic registered\n");
 
+       complete(&adapter->probe_done);
+
        return 0;
 
 ibmvnic_register_fail:
@@ -5881,6 +5957,17 @@ ibmvnic_stats_fail:
 ibmvnic_init_fail:
        release_sub_crqs(adapter, 1);
        release_crq_queue(adapter);
+
+       /* cleanup worker thread after releasing CRQ so we don't get
+        * transport events (i.e new work items for the worker thread).
+        */
+       adapter->state = VNIC_REMOVING;
+       complete(&adapter->probe_done);
+       flush_work(&adapter->ibmvnic_reset);
+       flush_delayed_work(&adapter->ibmvnic_delayed_reset);
+
+       flush_reset_queue(adapter);
+
        mutex_destroy(&adapter->fw_lock);
        free_netdev(netdev);
 
index 4a7a56ff74ce16af8682808b9ae41c68e90a39b6..fa2d607a7b1b9712930267d96f1b644c1a1ddc98 100644 (file)
@@ -930,6 +930,7 @@ struct ibmvnic_adapter {
 
        struct ibmvnic_tx_pool *tx_pool;
        struct ibmvnic_tx_pool *tso_pool;
+       struct completion probe_done;
        struct completion init_done;
        int init_done_rc;