2 files changed, 111 insertions(+)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
-index 8deb4cd7b133..291885ea26dd 100644
+index 8df03f305028..da782f53ce4b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3808,6 +3808,15 @@
Safety option to keep boot IRQs enabled. This
should never be necessary.
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
-index d44037a6e109..1f576ba148dd 100644
+index 444ba11f3e79..5843ba2cad8e 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -193,6 +193,106 @@ static int __init pci_apply_final_quirks(void)
/*
* Decoding should be disabled for a PCI device during BAR sizing to avoid
* conflict. But doing so may cause problems on host bridge and perhaps other
-@@ -4875,6 +4975,8 @@ static const struct pci_dev_acs_enabled {
+@@ -4876,6 +4976,8 @@ static const struct pci_dev_acs_enabled {
{ PCI_VENDOR_ID_CAVIUM, 0xA060, pci_quirk_mf_endpoint_acs },
/* APM X-Gene */
{ PCI_VENDOR_ID_AMCC, 0xE004, pci_quirk_xgene_acs },
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/core/dev.c b/net/core/dev.c
-index 156b44f35102..66014d76a61b 100644
+index 7f5c73b6166f..cd2f2f5b00a5 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
-@@ -10521,7 +10521,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
+@@ -10530,7 +10530,7 @@ static void netdev_wait_allrefs(struct net_device *dev)
if (refcnt != 1 &&
time_after(jiffies, warning_time +
netdev_unregister_timeout_secs * HZ)) {
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
---
- drivers/pci/probe.c | 50 ++++-----------------------------------------
- 1 file changed, 4 insertions(+), 46 deletions(-)
+ drivers/pci/probe.c | 52 +++++----------------------------------------
+ 1 file changed, 5 insertions(+), 47 deletions(-)
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
-index f6046a16dac1..275204646c68 100644
+index 59ef93f1d953..b38a345c5df6 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -19,7 +19,6 @@
-
/* Add initial resources to the bus */
resource_list_for_each_entry_safe(window, n, &resources) {
-+ list_move_tail(&window->node, &bridge->windows);
- offset = window->offset;
- res = window->res;
+- offset = window->offset;
+- res = window->res;
- if (!res->end)
- continue;
-
-- list_move_tail(&window->node, &bridge->windows);
+ list_move_tail(&window->node, &bridge->windows);
++ offset = window->offset;
++ res = window->res;
if (res->flags & IORESOURCE_BUS)
pci_bus_insert_busn_res(bus, bus->number, res->end);
Suggested-by: Bjorn Helgaas <bhelgaas@google.com>
Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
---
- drivers/pci/probe.c | 31 +++++++++++++++++++++++++++----
- 1 file changed, 27 insertions(+), 4 deletions(-)
+ drivers/pci/probe.c | 33 ++++++++++++++++++++++++++++-----
+ 1 file changed, 28 insertions(+), 5 deletions(-)
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
-index 275204646c68..944c35d87258 100644
+index b38a345c5df6..485167ca62a1 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -877,11 +877,11 @@ static void pci_set_bus_msi_domain(struct pci_bus *bus)
+
/* Add initial resources to the bus */
resource_list_for_each_entry_safe(window, n, &resources) {
-- list_move_tail(&window->node, &bridge->windows);
- offset = window->offset;
- res = window->res;
++ offset = window->offset;
++ res = window->res;
+ if (!res->end)
+ continue;
+
-+ list_move_tail(&window->node, &bridge->windows);
+ list_move_tail(&window->node, &bridge->windows);
+- offset = window->offset;
+- res = window->res;
if (res->flags & IORESOURCE_BUS)
pci_bus_insert_busn_res(bus, bus->number, res->end);
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Jens Axboe <axboe@kernel.dk>
-Date: Tue, 31 Aug 2021 13:53:00 -0600
-Subject: [PATCH] io-wq: fix queue stalling race
-
-We need to set the stalled bit early, before we drop the lock for adding
-us to the stall hash queue. If not, then we can race with new work being
-queued between adding us to the stall hash and io_worker_handle_work()
-marking us stalled.
-
-Signed-off-by: Jens Axboe <axboe@kernel.dk>
-[backport]
-Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
----
- fs/io-wq.c | 15 +++++++--------
- 1 file changed, 7 insertions(+), 8 deletions(-)
-
-diff --git a/fs/io-wq.c b/fs/io-wq.c
-index 6612d0aa497e..33678185f3bc 100644
---- a/fs/io-wq.c
-+++ b/fs/io-wq.c
-@@ -437,8 +437,7 @@ static bool io_worker_can_run_work(struct io_worker *worker,
- }
-
- static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
-- struct io_worker *worker,
-- bool *stalled)
-+ struct io_worker *worker)
- __must_hold(wqe->lock)
- {
- struct io_wq_work_node *node, *prev;
-@@ -476,10 +475,14 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
- }
-
- if (stall_hash != -1U) {
-+ /*
-+ * Set this before dropping the lock to avoid racing with new
-+ * work being added and clearing the stalled bit.
-+ */
-+ wqe->flags |= IO_WQE_FLAG_STALLED;
- raw_spin_unlock(&wqe->lock);
- io_wait_on_hash(wqe, stall_hash);
- raw_spin_lock(&wqe->lock);
-- *stalled = true;
- }
-
- return NULL;
-@@ -519,7 +522,6 @@ static void io_worker_handle_work(struct io_worker *worker)
-
- do {
- struct io_wq_work *work;
-- bool stalled;
- get_next:
- /*
- * If we got some work, mark us as busy. If we didn't, but
-@@ -528,12 +530,9 @@ static void io_worker_handle_work(struct io_worker *worker)
- * can't make progress, any work completion or insertion will
- * clear the stalled flag.
- */
-- stalled = false;
-- work = io_get_next_work(wqe, worker, &stalled);
-+ work = io_get_next_work(wqe, worker);
- if (work)
- __io_worker_busy(wqe, worker, work);
-- else if (stalled)
-- wqe->flags |= IO_WQE_FLAG_STALLED;
-
- raw_spin_unlock_irq(&wqe->lock);
- if (!work)
--- /dev/null
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Tue, 31 Aug 2021 13:57:32 -0600
+Subject: [PATCH] io-wq: split bounded and unbounded work into separate lists
+
+We've got a few issues that all boil down to the fact that we have one
+list of pending work items, yet two different types of workers to
+serve them. This causes some oddities around workers switching type and
+even hashed work vs regular work on the same bounded list.
+
+Just separate them out cleanly, similarly to how we already do
+accounting of what is running. That provides a clean separation and
+removes some corner cases that can cause stalls when handling IO
+that is punted to io-wq.
+
+Fixes: ecc53c48c13d ("io-wq: check max_worker limits if a worker transitions bound state")
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+[backport]
+Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ fs/io-wq.c | 160 +++++++++++++++++++++++------------------------------
+ 1 file changed, 70 insertions(+), 90 deletions(-)
+
+diff --git a/fs/io-wq.c b/fs/io-wq.c
+index ba7aaf2b95d0..6710da3d4445 100644
+--- a/fs/io-wq.c
++++ b/fs/io-wq.c
+@@ -34,7 +34,7 @@ enum {
+ };
+
+ enum {
+- IO_WQE_FLAG_STALLED = 1, /* stalled on hash */
++ IO_ACCT_STALLED_BIT = 0, /* stalled on hash */
+ };
+
+ /*
+@@ -73,26 +73,25 @@ struct io_wqe_acct {
+ unsigned max_workers;
+ int index;
+ atomic_t nr_running;
++ struct io_wq_work_list work_list;
++ unsigned long flags;
+ };
+
+ enum {
+ IO_WQ_ACCT_BOUND,
+ IO_WQ_ACCT_UNBOUND,
++ IO_WQ_ACCT_NR,
+ };
+
+ /*
+ * Per-node worker thread pool
+ */
+ struct io_wqe {
+- struct {
+- raw_spinlock_t lock;
+- struct io_wq_work_list work_list;
+- unsigned flags;
+- } ____cacheline_aligned_in_smp;
+-
+- int node;
++ raw_spinlock_t lock;
+ struct io_wqe_acct acct[2];
+
++ int node;
++
+ struct hlist_nulls_head free_list;
+ struct list_head all_list;
+
+@@ -196,11 +195,10 @@ static void io_worker_exit(struct io_worker *worker)
+ do_exit(0);
+ }
+
+-static inline bool io_wqe_run_queue(struct io_wqe *wqe)
+- __must_hold(wqe->lock)
++static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
+ {
+- if (!wq_list_empty(&wqe->work_list) &&
+- !(wqe->flags & IO_WQE_FLAG_STALLED))
++ if (!wq_list_empty(&acct->work_list) &&
++ !test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
+ return true;
+ return false;
+ }
+@@ -209,7 +207,8 @@ static inline bool io_wqe_run_queue(struct io_wqe *wqe)
+ * Check head of free list for an available worker. If one isn't available,
+ * caller must create one.
+ */
+-static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
++static bool io_wqe_activate_free_worker(struct io_wqe *wqe,
++ struct io_wqe_acct *acct)
+ __must_hold(RCU)
+ {
+ struct hlist_nulls_node *n;
+@@ -223,6 +222,10 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
+ hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
+ if (!io_worker_get(worker))
+ continue;
++ if (io_wqe_get_acct(worker) != acct) {
++ io_worker_release(worker);
++ continue;
++ }
+ if (wake_up_process(worker->task)) {
+ io_worker_release(worker);
+ return true;
+@@ -341,7 +344,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
+ if (!(worker->flags & IO_WORKER_F_UP))
+ return;
+
+- if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
++ if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) {
+ atomic_inc(&acct->nr_running);
+ atomic_inc(&wqe->wq->worker_refs);
+ io_queue_worker_create(wqe, worker, acct);
+@@ -356,29 +359,10 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
+ struct io_wq_work *work)
+ __must_hold(wqe->lock)
+ {
+- bool worker_bound, work_bound;
+-
+- BUILD_BUG_ON((IO_WQ_ACCT_UNBOUND ^ IO_WQ_ACCT_BOUND) != 1);
+-
+ if (worker->flags & IO_WORKER_F_FREE) {
+ worker->flags &= ~IO_WORKER_F_FREE;
+ hlist_nulls_del_init_rcu(&worker->nulls_node);
+ }
+-
+- /*
+- * If worker is moving from bound to unbound (or vice versa), then
+- * ensure we update the running accounting.
+- */
+- worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
+- work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
+- if (worker_bound != work_bound) {
+- int index = work_bound ? IO_WQ_ACCT_UNBOUND : IO_WQ_ACCT_BOUND;
+- io_wqe_dec_running(worker);
+- worker->flags ^= IO_WORKER_F_BOUND;
+- wqe->acct[index].nr_workers--;
+- wqe->acct[index ^ 1].nr_workers++;
+- io_wqe_inc_running(worker);
+- }
+ }
+
+ /*
+@@ -420,44 +404,23 @@ static bool io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
+ return ret;
+ }
+
+-/*
+- * We can always run the work if the worker is currently the same type as
+- * the work (eg both are bound, or both are unbound). If they are not the
+- * same, only allow it if incrementing the worker count would be allowed.
+- */
+-static bool io_worker_can_run_work(struct io_worker *worker,
+- struct io_wq_work *work)
+-{
+- struct io_wqe_acct *acct;
+-
+- if (!(worker->flags & IO_WORKER_F_BOUND) !=
+- !(work->flags & IO_WQ_WORK_UNBOUND))
+- return true;
+-
+- /* not the same type, check if we'd go over the limit */
+- acct = io_work_get_acct(worker->wqe, work);
+- return acct->nr_workers < acct->max_workers;
+-}
+-
+-static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
++static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
+ struct io_worker *worker)
+ __must_hold(wqe->lock)
+ {
+ struct io_wq_work_node *node, *prev;
+ struct io_wq_work *work, *tail;
+ unsigned int stall_hash = -1U;
++ struct io_wqe *wqe = worker->wqe;
+
+- wq_list_for_each(node, prev, &wqe->work_list) {
++ wq_list_for_each(node, prev, &acct->work_list) {
+ unsigned int hash;
+
+ work = container_of(node, struct io_wq_work, list);
+
+- if (!io_worker_can_run_work(worker, work))
+- break;
+-
+ /* not hashed, can run anytime */
+ if (!io_wq_is_hashed(work)) {
+- wq_list_del(&wqe->work_list, node, prev);
++ wq_list_del(&acct->work_list, node, prev);
+ return work;
+ }
+
+@@ -468,7 +431,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
+ /* hashed, can run if not already running */
+ if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
+ wqe->hash_tail[hash] = NULL;
+- wq_list_cut(&wqe->work_list, &tail->list, prev);
++ wq_list_cut(&acct->work_list, &tail->list, prev);
+ return work;
+ }
+ if (stall_hash == -1U)
+@@ -484,12 +447,12 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
+ * Set this before dropping the lock to avoid racing with new
+ * work being added and clearing the stalled bit.
+ */
+- wqe->flags |= IO_WQE_FLAG_STALLED;
++ set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
+ raw_spin_unlock(&wqe->lock);
+ unstalled = io_wait_on_hash(wqe, stall_hash);
+ raw_spin_lock(&wqe->lock);
+ if (unstalled) {
+- wqe->flags &= ~IO_WQE_FLAG_STALLED;
++ clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
+ if (wq_has_sleeper(&wqe->wq->hash->wait))
+ wake_up(&wqe->wq->hash->wait);
+ }
+@@ -526,6 +489,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
+ static void io_worker_handle_work(struct io_worker *worker)
+ __releases(wqe->lock)
+ {
++ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+ struct io_wqe *wqe = worker->wqe;
+ struct io_wq *wq = wqe->wq;
+ bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
+@@ -540,7 +504,7 @@ static void io_worker_handle_work(struct io_worker *worker)
+ * can't make progress, any work completion or insertion will
+ * clear the stalled flag.
+ */
+- work = io_get_next_work(wqe, worker);
++ work = io_get_next_work(acct, worker);
+ if (work)
+ __io_worker_busy(wqe, worker, work);
+
+@@ -576,7 +540,7 @@ static void io_worker_handle_work(struct io_worker *worker)
+ /* serialize hash clear with wake_up() */
+ spin_lock_irq(&wq->hash->wait.lock);
+ clear_bit(hash, &wq->hash->map);
+- wqe->flags &= ~IO_WQE_FLAG_STALLED;
++ clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
+ spin_unlock_irq(&wq->hash->wait.lock);
+ if (wq_has_sleeper(&wq->hash->wait))
+ wake_up(&wq->hash->wait);
+@@ -595,6 +559,7 @@ static void io_worker_handle_work(struct io_worker *worker)
+ static int io_wqe_worker(void *data)
+ {
+ struct io_worker *worker = data;
++ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
+ struct io_wqe *wqe = worker->wqe;
+ struct io_wq *wq = wqe->wq;
+ char buf[TASK_COMM_LEN];
+@@ -610,7 +575,7 @@ static int io_wqe_worker(void *data)
+ set_current_state(TASK_INTERRUPTIBLE);
+ loop:
+ raw_spin_lock_irq(&wqe->lock);
+- if (io_wqe_run_queue(wqe)) {
++ if (io_acct_run_queue(acct)) {
+ io_worker_handle_work(worker);
+ goto loop;
+ }
+@@ -636,7 +601,7 @@ static int io_wqe_worker(void *data)
+
+ if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
+ raw_spin_lock_irq(&wqe->lock);
+- if (!wq_list_empty(&wqe->work_list))
++ if (!wq_list_empty(&acct->work_list))
+ io_worker_handle_work(worker);
+ else
+ raw_spin_unlock_irq(&wqe->lock);
+@@ -782,12 +747,13 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
+
+ static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
+ {
++ struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
+ unsigned int hash;
+ struct io_wq_work *tail;
+
+ if (!io_wq_is_hashed(work)) {
+ append:
+- wq_list_add_tail(&work->list, &wqe->work_list);
++ wq_list_add_tail(&work->list, &acct->work_list);
+ return;
+ }
+
+@@ -797,7 +763,7 @@ static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
+ if (!tail)
+ goto append;
+
+- wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
++ wq_list_add_after(&work->list, &tail->list, &acct->work_list);
+ }
+
+ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
+@@ -819,10 +785,10 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
+
+ raw_spin_lock_irqsave(&wqe->lock, flags);
+ io_wqe_insert_work(wqe, work);
+- wqe->flags &= ~IO_WQE_FLAG_STALLED;
++ clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
+
+ rcu_read_lock();
+- do_create = !io_wqe_activate_free_worker(wqe);
++ do_create = !io_wqe_activate_free_worker(wqe, acct);
+ rcu_read_unlock();
+
+ raw_spin_unlock_irqrestore(&wqe->lock, flags);
+@@ -875,6 +841,7 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
+ struct io_wq_work *work,
+ struct io_wq_work_node *prev)
+ {
++ struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
+ unsigned int hash = io_get_work_hash(work);
+ struct io_wq_work *prev_work = NULL;
+
+@@ -886,7 +853,7 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
+ else
+ wqe->hash_tail[hash] = NULL;
+ }
+- wq_list_del(&wqe->work_list, &work->list, prev);
++ wq_list_del(&acct->work_list, &work->list, prev);
+ }
+
+ static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
+@@ -895,22 +862,27 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
+ struct io_wq_work_node *node, *prev;
+ struct io_wq_work *work;
+ unsigned long flags;
++ int i;
+
+ retry:
+ raw_spin_lock_irqsave(&wqe->lock, flags);
+- wq_list_for_each(node, prev, &wqe->work_list) {
+- work = container_of(node, struct io_wq_work, list);
+- if (!match->fn(work, match->data))
+- continue;
+- io_wqe_remove_pending(wqe, work, prev);
+- raw_spin_unlock_irqrestore(&wqe->lock, flags);
+- io_run_cancel(work, wqe);
+- match->nr_pending++;
+- if (!match->cancel_all)
+- return;
++ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
++ struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
+
+- /* not safe to continue after unlock */
+- goto retry;
++ wq_list_for_each(node, prev, &acct->work_list) {
++ work = container_of(node, struct io_wq_work, list);
++ if (!match->fn(work, match->data))
++ continue;
++ io_wqe_remove_pending(wqe, work, prev);
++ raw_spin_unlock_irqrestore(&wqe->lock, flags);
++ io_run_cancel(work, wqe);
++ match->nr_pending++;
++ if (!match->cancel_all)
++ return;
++
++ /* not safe to continue after unlock */
++ goto retry;
++ }
+ }
+ raw_spin_unlock_irqrestore(&wqe->lock, flags);
+ }
+@@ -971,18 +943,24 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
+ int sync, void *key)
+ {
+ struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
++ int i;
+
+ list_del_init(&wait->entry);
+
+ rcu_read_lock();
+- io_wqe_activate_free_worker(wqe);
++ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
++ struct io_wqe_acct *acct = &wqe->acct[i];
++
++ if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags))
++ io_wqe_activate_free_worker(wqe, acct);
++ }
+ rcu_read_unlock();
+ return 1;
+ }
+
+ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
+ {
+- int ret = -ENOMEM, node;
++ int ret, node, i;
+ struct io_wq *wq;
+
+ if (WARN_ON_ONCE(!data->free_work || !data->do_work))
+@@ -1019,18 +997,20 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
+ goto err;
+ wq->wqes[node] = wqe;
+ wqe->node = alloc_node;
+- wqe->acct[IO_WQ_ACCT_BOUND].index = IO_WQ_ACCT_BOUND;
+- wqe->acct[IO_WQ_ACCT_UNBOUND].index = IO_WQ_ACCT_UNBOUND;
+ wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
+- atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
+ wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
+ task_rlimit(current, RLIMIT_NPROC);
+- atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
+- wqe->wait.func = io_wqe_hash_wake;
+ INIT_LIST_HEAD(&wqe->wait.entry);
++ wqe->wait.func = io_wqe_hash_wake;
++ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
++ struct io_wqe_acct *acct = &wqe->acct[i];
++
++ acct->index = i;
++ atomic_set(&acct->nr_running, 0);
++ INIT_WQ_LIST(&acct->work_list);
++ }
+ wqe->wq = wq;
+ raw_spin_lock_init(&wqe->lock);
+- INIT_WQ_LIST(&wqe->work_list);
+ INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
+ INIT_LIST_HEAD(&wqe->all_list);
+ }
+++ /dev/null
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Jens Axboe <axboe@kernel.dk>
-Date: Tue, 31 Aug 2021 13:57:32 -0600
-Subject: [PATCH] io-wq: split bounded and unbounded work into separate lists
-
-We've got a few issues that all boil down to the fact that we have one
-list of pending work items, yet two different types of workers to
-serve them. This causes some oddities around workers switching type and
-even hashed work vs regular work on the same bounded list.
-
-Just separate them out cleanly, similarly to how we already do
-accounting of what is running. That provides a clean separation and
-removes some corner cases that can cause stalls when handling IO
-that is punted to io-wq.
-
-Fixes: ecc53c48c13d ("io-wq: check max_worker limits if a worker transitions bound state")
-Signed-off-by: Jens Axboe <axboe@kernel.dk>
-[backport]
-Signed-off-by: Fabian Ebner <f.ebner@proxmox.com>
----
- fs/io-wq.c | 156 +++++++++++++++++++++++------------------------------
- 1 file changed, 68 insertions(+), 88 deletions(-)
-
-diff --git a/fs/io-wq.c b/fs/io-wq.c
-index 33678185f3bc..2496d8781ea1 100644
---- a/fs/io-wq.c
-+++ b/fs/io-wq.c
-@@ -34,7 +34,7 @@ enum {
- };
-
- enum {
-- IO_WQE_FLAG_STALLED = 1, /* stalled on hash */
-+ IO_ACCT_STALLED_BIT = 0, /* stalled on hash */
- };
-
- /*
-@@ -73,25 +73,24 @@ struct io_wqe_acct {
- unsigned max_workers;
- int index;
- atomic_t nr_running;
-+ struct io_wq_work_list work_list;
-+ unsigned long flags;
- };
-
- enum {
- IO_WQ_ACCT_BOUND,
- IO_WQ_ACCT_UNBOUND,
-+ IO_WQ_ACCT_NR,
- };
-
- /*
- * Per-node worker thread pool
- */
- struct io_wqe {
-- struct {
-- raw_spinlock_t lock;
-- struct io_wq_work_list work_list;
-- unsigned flags;
-- } ____cacheline_aligned_in_smp;
-+ raw_spinlock_t lock;
-+ struct io_wqe_acct acct[2];
-
- int node;
-- struct io_wqe_acct acct[2];
-
- struct hlist_nulls_head free_list;
- struct list_head all_list;
-@@ -196,11 +195,10 @@ static void io_worker_exit(struct io_worker *worker)
- do_exit(0);
- }
-
--static inline bool io_wqe_run_queue(struct io_wqe *wqe)
-- __must_hold(wqe->lock)
-+static inline bool io_acct_run_queue(struct io_wqe_acct *acct)
- {
-- if (!wq_list_empty(&wqe->work_list) &&
-- !(wqe->flags & IO_WQE_FLAG_STALLED))
-+ if (!wq_list_empty(&acct->work_list) &&
-+ !test_bit(IO_ACCT_STALLED_BIT, &acct->flags))
- return true;
- return false;
- }
-@@ -209,7 +207,8 @@ static inline bool io_wqe_run_queue(struct io_wqe *wqe)
- * Check head of free list for an available worker. If one isn't available,
- * caller must create one.
- */
--static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
-+static bool io_wqe_activate_free_worker(struct io_wqe *wqe,
-+ struct io_wqe_acct *acct)
- __must_hold(RCU)
- {
- struct hlist_nulls_node *n;
-@@ -223,6 +222,10 @@ static bool io_wqe_activate_free_worker(struct io_wqe *wqe)
- hlist_nulls_for_each_entry_rcu(worker, n, &wqe->free_list, nulls_node) {
- if (!io_worker_get(worker))
- continue;
-+ if (io_wqe_get_acct(worker) != acct) {
-+ io_worker_release(worker);
-+ continue;
-+ }
- if (wake_up_process(worker->task)) {
- io_worker_release(worker);
- return true;
-@@ -341,7 +344,7 @@ static void io_wqe_dec_running(struct io_worker *worker)
- if (!(worker->flags & IO_WORKER_F_UP))
- return;
-
-- if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) {
-+ if (atomic_dec_and_test(&acct->nr_running) && io_acct_run_queue(acct)) {
- atomic_inc(&acct->nr_running);
- atomic_inc(&wqe->wq->worker_refs);
- io_queue_worker_create(wqe, worker, acct);
-@@ -356,29 +359,10 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker,
- struct io_wq_work *work)
- __must_hold(wqe->lock)
- {
-- bool worker_bound, work_bound;
--
-- BUILD_BUG_ON((IO_WQ_ACCT_UNBOUND ^ IO_WQ_ACCT_BOUND) != 1);
--
- if (worker->flags & IO_WORKER_F_FREE) {
- worker->flags &= ~IO_WORKER_F_FREE;
- hlist_nulls_del_init_rcu(&worker->nulls_node);
- }
--
-- /*
-- * If worker is moving from bound to unbound (or vice versa), then
-- * ensure we update the running accounting.
-- */
-- worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0;
-- work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0;
-- if (worker_bound != work_bound) {
-- int index = work_bound ? IO_WQ_ACCT_UNBOUND : IO_WQ_ACCT_BOUND;
-- io_wqe_dec_running(worker);
-- worker->flags ^= IO_WORKER_F_BOUND;
-- wqe->acct[index].nr_workers--;
-- wqe->acct[index ^ 1].nr_workers++;
-- io_wqe_inc_running(worker);
-- }
- }
-
- /*
-@@ -417,44 +401,23 @@ static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash)
- spin_unlock(&wq->hash->wait.lock);
- }
-
--/*
-- * We can always run the work if the worker is currently the same type as
-- * the work (eg both are bound, or both are unbound). If they are not the
-- * same, only allow it if incrementing the worker count would be allowed.
-- */
--static bool io_worker_can_run_work(struct io_worker *worker,
-- struct io_wq_work *work)
--{
-- struct io_wqe_acct *acct;
--
-- if (!(worker->flags & IO_WORKER_F_BOUND) !=
-- !(work->flags & IO_WQ_WORK_UNBOUND))
-- return true;
--
-- /* not the same type, check if we'd go over the limit */
-- acct = io_work_get_acct(worker->wqe, work);
-- return acct->nr_workers < acct->max_workers;
--}
--
--static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
-+static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct,
- struct io_worker *worker)
- __must_hold(wqe->lock)
- {
- struct io_wq_work_node *node, *prev;
- struct io_wq_work *work, *tail;
- unsigned int stall_hash = -1U;
-+ struct io_wqe *wqe = worker->wqe;
-
-- wq_list_for_each(node, prev, &wqe->work_list) {
-+ wq_list_for_each(node, prev, &acct->work_list) {
- unsigned int hash;
-
- work = container_of(node, struct io_wq_work, list);
-
-- if (!io_worker_can_run_work(worker, work))
-- break;
--
- /* not hashed, can run anytime */
- if (!io_wq_is_hashed(work)) {
-- wq_list_del(&wqe->work_list, node, prev);
-+ wq_list_del(&acct->work_list, node, prev);
- return work;
- }
-
-@@ -465,7 +428,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
- /* hashed, can run if not already running */
- if (!test_and_set_bit(hash, &wqe->wq->hash->map)) {
- wqe->hash_tail[hash] = NULL;
-- wq_list_cut(&wqe->work_list, &tail->list, prev);
-+ wq_list_cut(&acct->work_list, &tail->list, prev);
- return work;
- }
- if (stall_hash == -1U)
-@@ -479,7 +442,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe,
- * Set this before dropping the lock to avoid racing with new
- * work being added and clearing the stalled bit.
- */
-- wqe->flags |= IO_WQE_FLAG_STALLED;
-+ set_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- raw_spin_unlock(&wqe->lock);
- io_wait_on_hash(wqe, stall_hash);
- raw_spin_lock(&wqe->lock);
-@@ -516,6 +479,7 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work);
- static void io_worker_handle_work(struct io_worker *worker)
- __releases(wqe->lock)
- {
-+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- struct io_wqe *wqe = worker->wqe;
- struct io_wq *wq = wqe->wq;
- bool do_kill = test_bit(IO_WQ_BIT_EXIT, &wq->state);
-@@ -530,7 +494,7 @@ static void io_worker_handle_work(struct io_worker *worker)
- * can't make progress, any work completion or insertion will
- * clear the stalled flag.
- */
-- work = io_get_next_work(wqe, worker);
-+ work = io_get_next_work(acct, worker);
- if (work)
- __io_worker_busy(wqe, worker, work);
-
-@@ -564,10 +528,10 @@ static void io_worker_handle_work(struct io_worker *worker)
-
- if (hash != -1U && !next_hashed) {
- clear_bit(hash, &wq->hash->map);
-+ clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
- if (wq_has_sleeper(&wq->hash->wait))
- wake_up(&wq->hash->wait);
- raw_spin_lock_irq(&wqe->lock);
-- wqe->flags &= ~IO_WQE_FLAG_STALLED;
- /* skip unnecessary unlock-lock wqe->lock */
- if (!work)
- goto get_next;
-@@ -582,6 +546,7 @@ static void io_worker_handle_work(struct io_worker *worker)
- static int io_wqe_worker(void *data)
- {
- struct io_worker *worker = data;
-+ struct io_wqe_acct *acct = io_wqe_get_acct(worker);
- struct io_wqe *wqe = worker->wqe;
- struct io_wq *wq = wqe->wq;
- char buf[TASK_COMM_LEN];
-@@ -597,7 +562,7 @@ static int io_wqe_worker(void *data)
- set_current_state(TASK_INTERRUPTIBLE);
- loop:
- raw_spin_lock_irq(&wqe->lock);
-- if (io_wqe_run_queue(wqe)) {
-+ if (io_acct_run_queue(acct)) {
- io_worker_handle_work(worker);
- goto loop;
- }
-@@ -623,7 +588,7 @@ static int io_wqe_worker(void *data)
-
- if (test_bit(IO_WQ_BIT_EXIT, &wq->state)) {
- raw_spin_lock_irq(&wqe->lock);
-- if (!wq_list_empty(&wqe->work_list))
-+ if (!wq_list_empty(&acct->work_list))
- io_worker_handle_work(worker);
- else
- raw_spin_unlock_irq(&wqe->lock);
-@@ -769,12 +734,13 @@ static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe)
-
- static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
- {
-+ struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- unsigned int hash;
- struct io_wq_work *tail;
-
- if (!io_wq_is_hashed(work)) {
- append:
-- wq_list_add_tail(&work->list, &wqe->work_list);
-+ wq_list_add_tail(&work->list, &acct->work_list);
- return;
- }
-
-@@ -784,7 +750,7 @@ static void io_wqe_insert_work(struct io_wqe *wqe, struct io_wq_work *work)
- if (!tail)
- goto append;
-
-- wq_list_add_after(&work->list, &tail->list, &wqe->work_list);
-+ wq_list_add_after(&work->list, &tail->list, &acct->work_list);
- }
-
- static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
-@@ -806,10 +772,10 @@ static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work)
-
- raw_spin_lock_irqsave(&wqe->lock, flags);
- io_wqe_insert_work(wqe, work);
-- wqe->flags &= ~IO_WQE_FLAG_STALLED;
-+ clear_bit(IO_ACCT_STALLED_BIT, &acct->flags);
-
- rcu_read_lock();
-- do_create = !io_wqe_activate_free_worker(wqe);
-+ do_create = !io_wqe_activate_free_worker(wqe, acct);
- rcu_read_unlock();
-
- raw_spin_unlock_irqrestore(&wqe->lock, flags);
-@@ -862,6 +828,7 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
- struct io_wq_work *work,
- struct io_wq_work_node *prev)
- {
-+ struct io_wqe_acct *acct = io_work_get_acct(wqe, work);
- unsigned int hash = io_get_work_hash(work);
- struct io_wq_work *prev_work = NULL;
-
-@@ -873,7 +840,7 @@ static inline void io_wqe_remove_pending(struct io_wqe *wqe,
- else
- wqe->hash_tail[hash] = NULL;
- }
-- wq_list_del(&wqe->work_list, &work->list, prev);
-+ wq_list_del(&acct->work_list, &work->list, prev);
- }
-
- static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
-@@ -882,22 +849,27 @@ static void io_wqe_cancel_pending_work(struct io_wqe *wqe,
- struct io_wq_work_node *node, *prev;
- struct io_wq_work *work;
- unsigned long flags;
-+ int i;
-
- retry:
- raw_spin_lock_irqsave(&wqe->lock, flags);
-- wq_list_for_each(node, prev, &wqe->work_list) {
-- work = container_of(node, struct io_wq_work, list);
-- if (!match->fn(work, match->data))
-- continue;
-- io_wqe_remove_pending(wqe, work, prev);
-- raw_spin_unlock_irqrestore(&wqe->lock, flags);
-- io_run_cancel(work, wqe);
-- match->nr_pending++;
-- if (!match->cancel_all)
-- return;
-+ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
-+ struct io_wqe_acct *acct = io_get_acct(wqe, i == 0);
-
-- /* not safe to continue after unlock */
-- goto retry;
-+ wq_list_for_each(node, prev, &acct->work_list) {
-+ work = container_of(node, struct io_wq_work, list);
-+ if (!match->fn(work, match->data))
-+ continue;
-+ io_wqe_remove_pending(wqe, work, prev);
-+ raw_spin_unlock_irqrestore(&wqe->lock, flags);
-+ io_run_cancel(work, wqe);
-+ match->nr_pending++;
-+ if (!match->cancel_all)
-+ return;
-+
-+ /* not safe to continue after unlock */
-+ goto retry;
-+ }
- }
- raw_spin_unlock_irqrestore(&wqe->lock, flags);
- }
-@@ -958,18 +930,24 @@ static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode,
- int sync, void *key)
- {
- struct io_wqe *wqe = container_of(wait, struct io_wqe, wait);
-+ int i;
-
- list_del_init(&wait->entry);
-
- rcu_read_lock();
-- io_wqe_activate_free_worker(wqe);
-+ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
-+ struct io_wqe_acct *acct = &wqe->acct[i];
-+
-+ if (test_and_clear_bit(IO_ACCT_STALLED_BIT, &acct->flags))
-+ io_wqe_activate_free_worker(wqe, acct);
-+ }
- rcu_read_unlock();
- return 1;
- }
-
- struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
- {
-- int ret = -ENOMEM, node;
-+ int ret, node, i;
- struct io_wq *wq;
-
- if (WARN_ON_ONCE(!data->free_work || !data->do_work))
-@@ -1006,18 +984,20 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data)
- goto err;
- wq->wqes[node] = wqe;
- wqe->node = alloc_node;
-- wqe->acct[IO_WQ_ACCT_BOUND].index = IO_WQ_ACCT_BOUND;
-- wqe->acct[IO_WQ_ACCT_UNBOUND].index = IO_WQ_ACCT_UNBOUND;
- wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded;
-- atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0);
- wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers =
- task_rlimit(current, RLIMIT_NPROC);
-- atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0);
-- wqe->wait.func = io_wqe_hash_wake;
- INIT_LIST_HEAD(&wqe->wait.entry);
-+ wqe->wait.func = io_wqe_hash_wake;
-+ for (i = 0; i < IO_WQ_ACCT_NR; i++) {
-+ struct io_wqe_acct *acct = &wqe->acct[i];
-+
-+ acct->index = i;
-+ atomic_set(&acct->nr_running, 0);
-+ INIT_WQ_LIST(&acct->work_list);
-+ }
- wqe->wq = wq;
- raw_spin_lock_init(&wqe->lock);
-- INIT_WQ_LIST(&wqe->work_list);
- INIT_HLIST_NULLS_HEAD(&wqe->free_list, 0);
- INIT_LIST_HEAD(&wqe->all_list);
- }
-Subproject commit b188ba567fc92eebd36f679de7ba1b8c2354839d
+Subproject commit f71c3af206e5a967bf6058be2b4e6cc8df42bc29