From f58040c0fc8bc6490fcc75db7fc3e709dfc3c656 Mon Sep 17 00:00:00 2001 From: Chunwei Chen Date: Wed, 25 May 2016 16:35:42 -0700 Subject: [PATCH] Implement a proper rw_tryupgrade Current rw_tryupgrade does rw_exit and then rw_tryenter(RW_RWITER), and then does rw_enter(RW_READER) if it fails. This violate the assumption that rw_tryupgrade should be atomic and could cause extra contention or even lock inversion. This patch we implement a proper rw_tryupgrade. For rwsem-spinlock, we take the spinlock to check rwsem->count and rwsem->wait_list. For normal rwsem, we use cmpxchg on rwsem->count to change the value from single reader to single writer. Signed-off-by: Chunwei Chen Signed-off-by: Brian Behlendorf Signed-off-by: Tim Chase Closes zfsonlinux/zfs#4692 Closes #554 --- config/spl-build.m4 | 25 ++++++++++++++ include/linux/rwsem_compat.h | 17 ++++++++++ include/sys/rwlock.h | 11 +++--- module/spl/spl-rwlock.c | 41 ++++++++++++++++++++++ module/splat/splat-rwlock.c | 66 ++++++++++++++++++++++++++++++++---- 5 files changed, 147 insertions(+), 13 deletions(-) diff --git a/config/spl-build.m4 b/config/spl-build.m4 index 7205069..d705c65 100644 --- a/config/spl-build.m4 +++ b/config/spl-build.m4 @@ -39,6 +39,7 @@ AC_DEFUN([SPL_AC_CONFIG_KERNEL], [ SPL_AC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE SPL_AC_SHRINK_CONTROL_STRUCT SPL_AC_RWSEM_SPINLOCK_IS_RAW + SPL_AC_RWSEM_ACTIVITY SPL_AC_SCHED_RT_HEADER SPL_AC_2ARGS_VFS_GETATTR SPL_AC_USLEEP_RANGE @@ -1316,6 +1317,30 @@ AC_DEFUN([SPL_AC_RWSEM_SPINLOCK_IS_RAW], [ EXTRA_KCFLAGS="$tmp_flags" ]) +dnl # +dnl # 3.16 API Change +dnl # +dnl # rwsem-spinlock "->activity" changed to "->count" +dnl # +AC_DEFUN([SPL_AC_RWSEM_ACTIVITY], [ + AC_MSG_CHECKING([whether struct rw_semaphore has member activity]) + tmp_flags="$EXTRA_KCFLAGS" + EXTRA_KCFLAGS="-Werror" + SPL_LINUX_TRY_COMPILE([ + #include + ],[ + struct rw_semaphore dummy_semaphore __attribute__ ((unused)); + dummy_semaphore.activity = 0; + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_RWSEM_ACTIVITY, 1, + [struct rw_semaphore has member activity]) + ],[ + AC_MSG_RESULT(no) + ]) + EXTRA_KCFLAGS="$tmp_flags" +]) + dnl # dnl # 3.9 API change, dnl # Moved things from linux/sched.h to linux/sched/rt.h diff --git a/include/linux/rwsem_compat.h b/include/linux/rwsem_compat.h index 5841d7c..9a4df26 100644 --- a/include/linux/rwsem_compat.h +++ b/include/linux/rwsem_compat.h @@ -27,6 +27,23 @@ #include +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK +#define SPL_RWSEM_SINGLE_READER_VALUE (1) +#define SPL_RWSEM_SINGLE_WRITER_VALUE (-1) +#else +#define SPL_RWSEM_SINGLE_READER_VALUE (RWSEM_ACTIVE_READ_BIAS) +#define SPL_RWSEM_SINGLE_WRITER_VALUE (RWSEM_ACTIVE_WRITE_BIAS) +#endif + +/* Linux 3.16 change activity to count for rwsem-spinlock */ +#ifdef HAVE_RWSEM_ACTIVITY +#define RWSEM_COUNT(sem) sem->activity +#else +#define RWSEM_COUNT(sem) sem->count +#endif + +int rwsem_tryupgrade(struct rw_semaphore *rwsem); + #if defined(RWSEM_SPINLOCK_IS_RAW) #define spl_rwsem_lock_irqsave(lk, fl) raw_spin_lock_irqsave(lk, fl) #define spl_rwsem_unlock_irqrestore(lk, fl) raw_spin_unlock_irqrestore(lk, fl) diff --git a/include/sys/rwlock.h b/include/sys/rwlock.h index 14d097b..facebe3 100644 --- a/include/sys/rwlock.h +++ b/include/sys/rwlock.h @@ -223,13 +223,10 @@ RW_LOCK_HELD(krwlock_t *rwp) if (RW_WRITE_HELD(rwp)) { \ _rc_ = 1; \ } else { \ - rw_exit(rwp); \ - if (rw_tryenter(rwp, RW_WRITER)) { \ - _rc_ = 1; \ - } else { \ - rw_enter(rwp, RW_READER); \ - _rc_ = 0; \ - } \ + spl_rw_lockdep_off_maybe(rwp); \ + if ((_rc_ = rwsem_tryupgrade(SEM(rwp)))) \ + spl_rw_set_owner(rwp); \ + spl_rw_lockdep_on_maybe(rwp); \ } \ _rc_; \ }) diff --git a/module/spl/spl-rwlock.c b/module/spl/spl-rwlock.c index 98251c0..9b356a8 100644 --- a/module/spl/spl-rwlock.c +++ b/module/spl/spl-rwlock.c @@ -32,5 +32,46 @@ #define DEBUG_SUBSYSTEM S_RWLOCK +#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK +static int +__rwsem_tryupgrade(struct rw_semaphore *rwsem) +{ + int ret = 0; + unsigned long flags; + spl_rwsem_lock_irqsave(&rwsem->wait_lock, flags); + if (RWSEM_COUNT(rwsem) == SPL_RWSEM_SINGLE_READER_VALUE && + list_empty(&rwsem->wait_list)) { + ret = 1; + RWSEM_COUNT(rwsem) = SPL_RWSEM_SINGLE_WRITER_VALUE; + } + spl_rwsem_unlock_irqrestore(&rwsem->wait_lock, flags); + return (ret); +} +#else +static int +__rwsem_tryupgrade(struct rw_semaphore *rwsem) +{ + typeof (rwsem->count) val; + val = cmpxchg(&rwsem->count, SPL_RWSEM_SINGLE_READER_VALUE, + SPL_RWSEM_SINGLE_WRITER_VALUE); + return (val == SPL_RWSEM_SINGLE_READER_VALUE); +} +#endif + +int +rwsem_tryupgrade(struct rw_semaphore *rwsem) +{ + if (__rwsem_tryupgrade(rwsem)) { + rwsem_release(&rwsem->dep_map, 1, _RET_IP_); + rwsem_acquire(&rwsem->dep_map, 0, 1, _RET_IP_); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + rwsem->owner = current; +#endif + return (1); + } + return (0); +} +EXPORT_SYMBOL(rwsem_tryupgrade); + int spl_rw_init(void) { return 0; } void spl_rw_fini(void) { } diff --git a/module/splat/splat-rwlock.c b/module/splat/splat-rwlock.c index abd6a0e..c17eb07 100644 --- a/module/splat/splat-rwlock.c +++ b/module/splat/splat-rwlock.c @@ -55,8 +55,12 @@ #define SPLAT_RWLOCK_TEST5_DESC "Write downgrade" #define SPLAT_RWLOCK_TEST6_ID 0x0706 -#define SPLAT_RWLOCK_TEST6_NAME "rw_tryupgrade" -#define SPLAT_RWLOCK_TEST6_DESC "Read upgrade" +#define SPLAT_RWLOCK_TEST6_NAME "rw_tryupgrade-1" +#define SPLAT_RWLOCK_TEST6_DESC "rwsem->count value" + +#define SPLAT_RWLOCK_TEST7_ID 0x0707 +#define SPLAT_RWLOCK_TEST7_NAME "rw_tryupgrade-2" +#define SPLAT_RWLOCK_TEST7_DESC "Read upgrade" #define SPLAT_RWLOCK_TEST_MAGIC 0x115599DDUL #define SPLAT_RWLOCK_TEST_NAME "rwlock_test" @@ -580,8 +584,55 @@ splat_rwlock_test6(struct file *file, void *arg) splat_init_rw_priv(rwp, file); rw_enter(&rwp->rw_rwlock, RW_READER); - if (!RW_READ_HELD(&rwp->rw_rwlock)) { + if (RWSEM_COUNT(SEM(&rwp->rw_rwlock)) != + SPL_RWSEM_SINGLE_READER_VALUE) { + splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, + "We assumed single reader rwsem->count " + "should be %ld, but is %ld\n", + SPL_RWSEM_SINGLE_READER_VALUE, + RWSEM_COUNT(SEM(&rwp->rw_rwlock))); + rc = -ENOLCK; + goto out; + } + rw_exit(&rwp->rw_rwlock); + + rw_enter(&rwp->rw_rwlock, RW_WRITER); + if (RWSEM_COUNT(SEM(&rwp->rw_rwlock)) != + SPL_RWSEM_SINGLE_WRITER_VALUE) { splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, + "We assumed single writer rwsem->count " + "should be %ld, but is %ld\n", + SPL_RWSEM_SINGLE_WRITER_VALUE, + RWSEM_COUNT(SEM(&rwp->rw_rwlock))); + rc = -ENOLCK; + goto out; + } + rc = 0; + splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "%s", + "rwsem->count same as we assumed\n"); +out: + rw_exit(&rwp->rw_rwlock); + rw_destroy(&rwp->rw_rwlock); + kfree(rwp); + + return rc; +} + +static int +splat_rwlock_test7(struct file *file, void *arg) +{ + rw_priv_t *rwp; + int rc; + + rwp = (rw_priv_t *)kmalloc(sizeof(*rwp), GFP_KERNEL); + if (rwp == NULL) + return -ENOMEM; + + splat_init_rw_priv(rwp, file); + + rw_enter(&rwp->rw_rwlock, RW_READER); + if (!RW_READ_HELD(&rwp->rw_rwlock)) { + splat_vprint(file, SPLAT_RWLOCK_TEST7_NAME, "rwlock should be read lock: %d\n", RW_READ_HELD(&rwp->rw_rwlock)); rc = -ENOLCK; @@ -591,7 +642,7 @@ splat_rwlock_test6(struct file *file, void *arg) /* With one reader upgrade should never fail. */ rc = rw_tryupgrade(&rwp->rw_rwlock); if (!rc) { - splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, + splat_vprint(file, SPLAT_RWLOCK_TEST7_NAME, "rwlock failed upgrade from reader: %d\n", RW_READ_HELD(&rwp->rw_rwlock)); rc = -ENOLCK; @@ -599,7 +650,7 @@ splat_rwlock_test6(struct file *file, void *arg) } if (RW_READ_HELD(&rwp->rw_rwlock) || !RW_WRITE_HELD(&rwp->rw_rwlock)) { - splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "rwlock should " + splat_vprint(file, SPLAT_RWLOCK_TEST7_NAME, "rwlock should " "have 0 (not %d) reader and 1 (not %d) writer\n", RW_READ_HELD(&rwp->rw_rwlock), RW_WRITE_HELD(&rwp->rw_rwlock)); @@ -607,7 +658,7 @@ splat_rwlock_test6(struct file *file, void *arg) } rc = 0; - splat_vprint(file, SPLAT_RWLOCK_TEST6_NAME, "%s", + splat_vprint(file, SPLAT_RWLOCK_TEST7_NAME, "%s", "rwlock properly upgraded\n"); out: rw_exit(&rwp->rw_rwlock); @@ -646,6 +697,8 @@ splat_rwlock_init(void) SPLAT_RWLOCK_TEST5_ID, splat_rwlock_test5); SPLAT_TEST_INIT(sub, SPLAT_RWLOCK_TEST6_NAME, SPLAT_RWLOCK_TEST6_DESC, SPLAT_RWLOCK_TEST6_ID, splat_rwlock_test6); + SPLAT_TEST_INIT(sub, SPLAT_RWLOCK_TEST7_NAME, SPLAT_RWLOCK_TEST7_DESC, + SPLAT_RWLOCK_TEST7_ID, splat_rwlock_test7); return sub; } @@ -654,6 +707,7 @@ void splat_rwlock_fini(splat_subsystem_t *sub) { ASSERT(sub); + SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST7_ID); SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST6_ID); SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST5_ID); SPLAT_TEST_FINI(sub, SPLAT_RWLOCK_TEST4_ID); -- 2.39.2