The btrfs zstd workspace manager uses a background timer to reclaim workspaces that have not been used recently. I used spin_lock() from this timer (softirq) context; the inconsistent lock usage should have been caught by lockdep during development, but was not. The resulting deadlock was reported in bugzilla. The fix is to switch the zstd wsm lock to spin_lock_bh() in all of its users, so softirqs are disabled while the lock is held and the reclaim timer cannot deadlock against them.

The deadlock reproduced quite reliably on ppc64, unlike on other architectures.
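To make the pattern concrete, here is a minimal, hypothetical sketch (demo_lock, demo_lru, demo_timer and the demo_* functions are made-up names, not the btrfs code): a lock taken by a timer callback is taken in softirq context, so every process-context user of that lock has to use spin_lock_bh(); otherwise the timer softirq can fire on the same CPU while the lock is held and spin on it forever, which is what lockdep reports below.

#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/list.h>
#include <linux/jiffies.h>

static DEFINE_SPINLOCK(demo_lock);
static LIST_HEAD(demo_lru);
static struct timer_list demo_timer;

/* Timer callback: runs in softirq context and takes demo_lock. */
static void demo_reclaim_timer_fn(struct timer_list *t)
{
	spin_lock_bh(&demo_lock);
	/* ... walk demo_lru and reclaim entries older than a threshold ... */
	if (!list_empty(&demo_lru))
		mod_timer(&demo_timer, jiffies + HZ);
	spin_unlock_bh(&demo_lock);
}

/*
 * Process-context user (e.g. called from a workqueue); entry is assumed
 * to already be on demo_lru.  Plain spin_lock() here is the bug: the
 * timer softirq can interrupt this CPU while demo_lock is held and
 * deadlock trying to take it again.  spin_lock_bh() keeps softirqs
 * disabled for the critical section.
 */
static void demo_touch_entry(struct list_head *entry)
{
	spin_lock_bh(&demo_lock);
	list_move(entry, &demo_lru);
	mod_timer(&demo_timer, jiffies + HZ);
	spin_unlock_bh(&demo_lock);
}

/* During init: timer_setup(&demo_timer, demo_reclaim_timer_fn, 0); */

In the btrfs case the process-context user is the compression workspace get/put path running from a workqueue (see end_compressed_bio_read in the trace below), and the timer is wsm.timer.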
[ 313.402874] ================================
[ 313.402875] WARNING: inconsistent lock state
[ 313.402879] 5.1.0-rc7 #1 Not tainted
[ 313.402880] --------------------------------
[ 313.402882] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
[ 313.402885] swapper/5/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
[ 313.402888]  0000000080d1120c (&(&wsm.lock)->rlock){+.?.}, at: .zstd_reclaim_timer_fn+0x40/0x230
[ 313.402895] {SOFTIRQ-ON-W} state was registered at:
[ 313.402899] .lock_acquire+0xd0/0x240
[ 313.402903] ._raw_spin_lock+0x34/0x60
[ 313.402906] .zstd_get_workspace+0xd0/0x360
[ 313.402908] .end_compressed_bio_read+0x3b8/0x540
[ 313.402911] .bio_endio+0x174/0x2c0
[ 313.402914] .end_workqueue_fn+0x4c/0x70
[ 313.402917] .normal_work_helper+0x138/0x7e0
[ 313.402920] .process_one_work+0x324/0x790
[ 313.402922] .worker_thread+0x68/0x570
[ 313.402925] .kthread+0x19c/0x1b0
[ 313.402928] .ret_from_kernel_thread+0x58/0x78
[ 313.402930] irq event stamp: 2629216
[ 313.402933] hardirqs last enabled at (2629216): [<c0000000009da738>] ._raw_spin_unlock_irq+0x38/0x60
[ 313.402936] hardirqs last disabled at (2629215): [<c0000000009da4c4>] ._raw_spin_lock_irq+0x24/0x70
[ 313.402939] softirqs last enabled at (2629212): [<c0000000000af9fc>] .irq_enter+0x8c/0xd0
[ 313.402942] softirqs last disabled at (2629213): [<c0000000000afb58>] .irq_exit+0x118/0x170
[ 313.402944] other info that might help us debug this:
[ 313.402945] Possible unsafe locking scenario:
[ 313.402947]        CPU0
[ 313.402948]        ----
[ 313.402949]   lock(&(&wsm.lock)->rlock);
[ 313.402951]   <Interrupt>
[ 313.402952]     lock(&(&wsm.lock)->rlock);
[ 313.402954]  *** DEADLOCK ***
[ 313.402957] 1 lock held by swapper/5/0:
[ 313.402958]  #0: 000000004b612042 ((&wsm.timer)){+.-.}, at: .call_timer_fn+0x0/0x3c0
[ 313.402963] stack backtrace:
[ 313.402967] CPU: 5 PID: 0 Comm: swapper/5 Not tainted 5.1.0-rc7 #1
[ 313.402968] Call Trace:
[ 313.402972] [c0000007fa262e70] [c0000000009b3294] .dump_stack+0xe0/0x15c (unreliable)
[ 313.402975] [c0000007fa262f10] [c000000000125548] .print_usage_bug+0x348/0x390
[ 313.402978] [c0000007fa262fd0] [c000000000125cb4] .mark_lock+0x724/0x930
[ 313.402981] [c0000007fa263080] [c000000000126c20] .__lock_acquire+0xc90/0x16a0
[ 313.402984] [c0000007fa2631b0] [c000000000128040] .lock_acquire+0xd0/0x240
[ 313.402987] [c0000007fa263280] [c0000000009da2b4] ._raw_spin_lock+0x34/0x60
[ 313.402990] [c0000007fa263300] [c00000000054b0b0] .zstd_reclaim_timer_fn+0x40/0x230
[ 313.402993] [c0000007fa2633d0] [c000000000158b38] .call_timer_fn+0xc8/0x3c0
[ 313.402996] [c0000007fa2634a0] [c000000000158f74] .expire_timers+0x144/0x260
[ 313.402999] [c0000007fa263550] [c000000000159178] .run_timer_softirq+0xe8/0x230
[ 313.403002] [c0000007fa263680] [c0000000009db288] .__do_softirq+0x188/0x5d4
[ 313.403004] [c0000007fa263790] [c0000000000afb58] .irq_exit+0x118/0x170
[ 313.403008] [c0000007fa263800] [c000000000028d88] .timer_interrupt+0x158/0x430
[ 313.403012] [c0000007fa2638b0] [c0000000000091d4] decrementer_common+0x134/0x140
[ 313.403017] --- interrupt: 901 at replay_interrupt_return+0x0/0x4
    LR = .arch_local_irq_restore.part.0+0x68/0x80
[ 313.403020] [c0000007fa263bb0] [c00000000001a3ac] .arch_local_irq_restore.part.0+0x2c/0x80 (unreliable)
[ 313.403024] [c0000007fa263c30] [c0000000007bbbcc] .cpuidle_enter_state+0xec/0x670
[ 313.403027] [c0000007fa263d00] [c0000000000f5130] .call_cpuidle+0x40/0x90
[ 313.403031] [c0000007fa263d70] [c0000000000f554c] .do_idle+0x2dc/0x3a0
[ 313.403034] [c0000007fa263e30] [c0000000000f59ac] .cpu_startup_entry+0x2c/0x30
[ 313.403037] [c0000007fa263ea0] [c000000000045674] .start_secondary+0x644/0x650
[ 313.403041] [c0000007fa263f90] [c00000000000ad5c] start_secondary_prolog+0x10/0x14
Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=203517
Fixes: 3f93aef535c8 ("btrfs: add zstd compression level support")
CC: stable@vger.kernel.org # 5.1+
Signed-off-by: Dennis Zhou <dennis@kernel.org>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
 	unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
 	struct list_head *pos, *next;
-	spin_lock(&wsm.lock);
+	spin_lock_bh(&wsm.lock);
 	if (list_empty(&wsm.lru_list)) {
-		spin_unlock(&wsm.lock);
+		spin_unlock_bh(&wsm.lock);
 		return;
 	}
 	if (!list_empty(&wsm.lru_list))
 		mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
-	spin_unlock(&wsm.lock);
+	spin_unlock_bh(&wsm.lock);
 }
/*
 	struct workspace *workspace;
 	int i;
-	spin_lock(&wsm.lock);
+	spin_lock_bh(&wsm.lock);
 	for (i = 0; i < ZSTD_BTRFS_MAX_LEVEL; i++) {
 		while (!list_empty(&wsm.idle_ws[i])) {
 			workspace = container_of(wsm.idle_ws[i].next,
 			zstd_free_workspace(&workspace->list);
 		}
 	}
-	spin_unlock(&wsm.lock);
+	spin_unlock_bh(&wsm.lock);
 	del_timer_sync(&wsm.timer);
 }
 	struct workspace *workspace;
 	int i = level - 1;
-	spin_lock(&wsm.lock);
+	spin_lock_bh(&wsm.lock);
 	for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) {
 		if (!list_empty(&wsm.idle_ws[i])) {
 			ws = wsm.idle_ws[i].next;
 			list_del(&workspace->lru_list);
 			if (list_empty(&wsm.idle_ws[i]))
 				clear_bit(i, &wsm.active_map);
-			spin_unlock(&wsm.lock);
+			spin_unlock_bh(&wsm.lock);
 			return ws;
 		}
 	}
-	spin_unlock(&wsm.lock);
+	spin_unlock_bh(&wsm.lock);
 	return NULL;
 }
 {
 	struct workspace *workspace = list_to_workspace(ws);
-	spin_lock(&wsm.lock);
+	spin_lock_bh(&wsm.lock);
 	/* A node is only taken off the lru if we are the corresponding level */
 	if (workspace->req_level == workspace->level) {
 	list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]);
 	workspace->req_level = 0;
-	spin_unlock(&wsm.lock);
+	spin_unlock_bh(&wsm.lock);
 	if (workspace->level == ZSTD_BTRFS_MAX_LEVEL)
 		cond_wake_up(&wsm.wait);