git.proxmox.com Git - mirror_zfs.git/commitdiff
More aggsum optimizations
author Alexander Motin <mav@FreeBSD.org>
Mon, 7 Jun 2021 16:02:47 +0000 (12:02 -0400)
committer GitHub <noreply@github.com>
Mon, 7 Jun 2021 16:02:47 +0000 (09:02 -0700)
- Avoid atomic_add() when updating as_lower_bound/as_upper_bound.
The previous code was excessively strong on 64-bit systems while not
strong enough on 32-bit ones.  Instead, introduce and use real
atomic_load() and atomic_store() operations: plain assignments on
64-bit machines, but proper atomics on 32-bit ones to avoid torn
reads/writes (see the sketch after this list).

- Reduce the number of buckets on large systems.  Extra buckets do
not improve add speed as much as they hurt reads.  Unlike wmsum,
aggsum still cares about read performance (a worked example of the
new bucket sizing also follows the list).
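
A minimal, self-contained sketch of the load/store split described above (hypothetical demo_* names, GCC/Clang __atomic builtins; not the actual ZFS sources): on 64-bit builds a plain volatile access is already single-copy atomic, so a bare assignment suffices, while 32-bit builds route through __atomic_load_n()/__atomic_store_n() so a 64-bit bound is never read or written as two separate 32-bit halves.

/*
 * demo_load_64()/demo_store_64() mirror the libspl additions in the
 * diff below: plain volatile accesses under _LP64, relaxed __atomic
 * builtins otherwise to avoid torn 64-bit reads/writes on 32-bit CPUs.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#ifdef _LP64
#define	demo_load_64(p)		(*(volatile uint64_t *)(p))
#define	demo_store_64(p, v)	(*(volatile uint64_t *)(p) = (uint64_t)(v))
#else
static uint64_t
demo_load_64(volatile uint64_t *p)
{
	return (__atomic_load_n(p, __ATOMIC_RELAXED));
}

static void
demo_store_64(volatile uint64_t *p, uint64_t v)
{
	__atomic_store_n(p, v, __ATOMIC_RELAXED);
}
#endif

int
main(void)
{
	volatile uint64_t bound = 0;

	demo_store_64(&bound, UINT64_C(0x123456789abcdef0));
	printf("0x%" PRIx64 "\n", demo_load_64(&bound));
	return (0);
}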
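
And a worked example of the new bucket sizing, reproducing the formula aggsum_init() now uses (highbit64() here is a stand-in for the ZFS helper, which returns the 1-based index of the highest set bit and 0 for 0): 12 CPUs get one bucket per 2 CPUs, 48 CPUs one per 4, and so on.

/*
 * Reproduces as_bucketshift/as_numbuckets from aggsum_init() in the
 * diff below for a few CPU counts.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static unsigned int
highbit64(uint64_t v)
{
	/* 1-based index of the highest set bit; 0 when v == 0. */
	return (v == 0 ? 0 : 64 - (unsigned int)__builtin_clzll(v));
}

int
main(void)
{
	const unsigned int ncpus[] = { 4, 12, 24, 48, 96, 192 };

	for (size_t i = 0; i < sizeof (ncpus) / sizeof (ncpus[0]); i++) {
		unsigned int n = ncpus[i];
		unsigned int shift = highbit64(n / 6) / 2;
		unsigned int buckets = ((n - 1) >> shift) + 1;

		/* e.g. 12 CPUs -> shift 1 (bucket per 2 CPUs), 48 -> shift 2. */
		printf("%3u CPUs: bucketshift %u, %2u buckets\n",
		    n, shift, buckets);
	}
	return (0);
}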

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12145

include/os/linux/spl/sys/atomic.h
include/sys/aggsum.h
lib/libspl/atomic.c
lib/libspl/include/atomic.h
module/zfs/aggsum.c

index 2d21cbb3e140fa71c6638c6acdade6ec17999721..8f7fa5aeda114c5112665b4b8cc00a6f2997ff23 100644 (file)
@@ -48,6 +48,8 @@
 #define        atomic_sub_32_nv(v, i)  atomic_sub_return((i), (atomic_t *)(v))
 #define        atomic_cas_32(v, x, y)  atomic_cmpxchg((atomic_t *)(v), x, y)
 #define        atomic_swap_32(v, x)    atomic_xchg((atomic_t *)(v), x)
+#define        atomic_load_32(v)       atomic_read((atomic_t *)(v))
+#define        atomic_store_32(v, x)   atomic_set((atomic_t *)(v), x)
 #define        atomic_inc_64(v)        atomic64_inc((atomic64_t *)(v))
 #define        atomic_dec_64(v)        atomic64_dec((atomic64_t *)(v))
 #define        atomic_add_64(v, i)     atomic64_add((i), (atomic64_t *)(v))
@@ -58,6 +60,8 @@
 #define        atomic_sub_64_nv(v, i)  atomic64_sub_return((i), (atomic64_t *)(v))
 #define        atomic_cas_64(v, x, y)  atomic64_cmpxchg((atomic64_t *)(v), x, y)
 #define        atomic_swap_64(v, x)    atomic64_xchg((atomic64_t *)(v), x)
+#define        atomic_load_64(v)       atomic64_read((atomic64_t *)(v))
+#define        atomic_store_64(v, x)   atomic64_set((atomic64_t *)(v), x)
 
 #ifdef _LP64
 static __inline__ void *
index cb43727f1df426673b6333cb592ddbb9da564a1d..65800058cbf6f3906e381e5d24497bf7a60150e7 100644 (file)
@@ -39,15 +39,16 @@ struct aggsum_bucket {
 typedef struct aggsum {
        kmutex_t as_lock;
        int64_t as_lower_bound;
-       int64_t as_upper_bound;
+       uint64_t as_upper_bound;
+       aggsum_bucket_t *as_buckets ____cacheline_aligned;
        uint_t as_numbuckets;
-       aggsum_bucket_t *as_buckets;
+       uint_t as_bucketshift;
 } aggsum_t;
 
 void aggsum_init(aggsum_t *, uint64_t);
 void aggsum_fini(aggsum_t *);
 int64_t aggsum_lower_bound(aggsum_t *);
-int64_t aggsum_upper_bound(aggsum_t *);
+uint64_t aggsum_upper_bound(aggsum_t *);
 int aggsum_compare(aggsum_t *, uint64_t);
 uint64_t aggsum_value(aggsum_t *);
 void aggsum_add(aggsum_t *, int64_t);
index d1a71b77710f263d24d5583d0ca970207a6b7b8c..4717d818ce5c0ee8bcc91db5e894f10ff7b1a998 100644 (file)
@@ -313,6 +313,19 @@ atomic_swap_ptr(volatile void *target, void *bits)
        return (__atomic_exchange_n((void **)target, bits, __ATOMIC_SEQ_CST));
 }
 
+#ifndef _LP64
+uint64_t
+atomic_load_64(volatile uint64_t *target)
+{
+       return (__atomic_load_n(target, __ATOMIC_RELAXED));
+}
+
+void
+atomic_store_64(volatile uint64_t *target, uint64_t bits)
+{
+       return (__atomic_store_n(target, bits, __ATOMIC_RELAXED));
+}
+#endif
 
 int
 atomic_set_long_excl(volatile ulong_t *target, uint_t value)
index f8c257f9696ba859c1cbbb33c5b5f9f84572c4b5..8dd1d654a486baf4be55f09d89f7a5c6457bab84 100644 (file)
@@ -245,6 +245,49 @@ extern ulong_t atomic_swap_ulong(volatile ulong_t *, ulong_t);
 extern uint64_t atomic_swap_64(volatile uint64_t *, uint64_t);
 #endif
 
+/*
+ * Atomically read variable.
+ */
+#define        atomic_load_char(p)     (*(volatile uchar_t *)(p))
+#define        atomic_load_short(p)    (*(volatile ushort_t *)(p))
+#define        atomic_load_int(p)      (*(volatile uint_t *)(p))
+#define        atomic_load_long(p)     (*(volatile ulong_t *)(p))
+#define        atomic_load_ptr(p)      (*(volatile __typeof(*p) *)(p))
+#define        atomic_load_8(p)        (*(volatile uint8_t *)(p))
+#define        atomic_load_16(p)       (*(volatile uint16_t *)(p))
+#define        atomic_load_32(p)       (*(volatile uint32_t *)(p))
+#ifdef _LP64
+#define        atomic_load_64(p)       (*(volatile uint64_t *)(p))
+#elif defined(_INT64_TYPE)
+extern uint64_t atomic_load_64(volatile uint64_t *);
+#endif
+
+/*
+ * Atomically write variable.
+ */
+#define        atomic_store_char(p, v)         \
+       (*(volatile uchar_t *)(p) = (uchar_t)(v))
+#define        atomic_store_short(p, v)        \
+       (*(volatile ushort_t *)(p) = (ushort_t)(v))
+#define        atomic_store_int(p, v)          \
+       (*(volatile uint_t *)(p) = (uint_t)(v))
+#define        atomic_store_long(p, v)         \
+       (*(volatile ulong_t *)(p) = (ulong_t)(v))
+#define        atomic_store_ptr(p, v)          \
+       (*(volatile __typeof(*p) *)(p) = (v))
+#define        atomic_store_8(p, v)            \
+       (*(volatile uint8_t *)(p) = (uint8_t)(v))
+#define        atomic_store_16(p, v)           \
+       (*(volatile uint16_t *)(p) = (uint16_t)(v))
+#define        atomic_store_32(p, v)           \
+       (*(volatile uint32_t *)(p) = (uint32_t)(v))
+#ifdef _LP64
+#define        atomic_store_64(p, v)           \
+       (*(volatile uint64_t *)(p) = (uint64_t)(v))
+#elif defined(_INT64_TYPE)
+extern void atomic_store_64(volatile uint64_t *, uint64_t);
+#endif
+
 /*
  * Perform an exclusive atomic bit set/clear on a target.
  * Returns 0 if bit was successfully set/cleared, or -1
index e46da95f676cd64b14bca22dff2eb18c2a1e9256..c4ea4f86fc5f89b88fc2ef29d7640fbd3189f2fe 100644 (file)
  */
 
 /*
- * We will borrow aggsum_borrow_multiplier times the current request, so we will
- * have to get the as_lock approximately every aggsum_borrow_multiplier calls to
- * aggsum_delta().
+ * We will borrow 2^aggsum_borrow_shift times the current request, so we will
+ * have to get the as_lock approximately every 2^aggsum_borrow_shift calls to
+ * aggsum_add().
  */
-static uint_t aggsum_borrow_multiplier = 10;
+static uint_t aggsum_borrow_shift = 4;
 
 void
 aggsum_init(aggsum_t *as, uint64_t value)
@@ -90,9 +90,14 @@ aggsum_init(aggsum_t *as, uint64_t value)
        bzero(as, sizeof (*as));
        as->as_lower_bound = as->as_upper_bound = value;
        mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
-       as->as_numbuckets = boot_ncpus;
-       as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
-           KM_SLEEP);
+       /*
+        * Too many buckets may hurt read performance without improving
+        * write.  From 12 CPUs use bucket per 2 CPUs, from 48 per 4, etc.
+        */
+       as->as_bucketshift = highbit64(boot_ncpus / 6) / 2;
+       as->as_numbuckets = ((boot_ncpus - 1) >> as->as_bucketshift) + 1;
+       as->as_buckets = kmem_zalloc(as->as_numbuckets *
+           sizeof (aggsum_bucket_t), KM_SLEEP);
        for (int i = 0; i < as->as_numbuckets; i++) {
                mutex_init(&as->as_buckets[i].asc_lock,
                    NULL, MUTEX_DEFAULT, NULL);
@@ -111,59 +116,49 @@ aggsum_fini(aggsum_t *as)
 int64_t
 aggsum_lower_bound(aggsum_t *as)
 {
-       return (as->as_lower_bound);
+       return (atomic_load_64((volatile uint64_t *)&as->as_lower_bound));
 }
 
-int64_t
+uint64_t
 aggsum_upper_bound(aggsum_t *as)
 {
-       return (as->as_upper_bound);
-}
-
-static void
-aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
-{
-       ASSERT(MUTEX_HELD(&as->as_lock));
-       ASSERT(MUTEX_HELD(&asb->asc_lock));
-
-       /*
-        * We use atomic instructions for this because we read the upper and
-        * lower bounds without the lock, so we need stores to be atomic.
-        */
-       atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
-           asb->asc_delta + asb->asc_borrowed);
-       atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
-           asb->asc_delta - asb->asc_borrowed);
-       asb->asc_delta = 0;
-       asb->asc_borrowed = 0;
+       return (atomic_load_64(&as->as_upper_bound));
 }
 
 uint64_t
 aggsum_value(aggsum_t *as)
 {
-       int64_t rv;
+       int64_t lb;
+       uint64_t ub;
 
        mutex_enter(&as->as_lock);
-       if (as->as_lower_bound == as->as_upper_bound) {
-               rv = as->as_lower_bound;
+       lb = as->as_lower_bound;
+       ub = as->as_upper_bound;
+       if (lb == ub) {
                for (int i = 0; i < as->as_numbuckets; i++) {
                        ASSERT0(as->as_buckets[i].asc_delta);
                        ASSERT0(as->as_buckets[i].asc_borrowed);
                }
                mutex_exit(&as->as_lock);
-               return (rv);
+               return (lb);
        }
        for (int i = 0; i < as->as_numbuckets; i++) {
                struct aggsum_bucket *asb = &as->as_buckets[i];
+               if (asb->asc_borrowed == 0)
+                       continue;
                mutex_enter(&asb->asc_lock);
-               aggsum_flush_bucket(as, asb);
+               lb += asb->asc_delta + asb->asc_borrowed;
+               ub += asb->asc_delta - asb->asc_borrowed;
+               asb->asc_delta = 0;
+               asb->asc_borrowed = 0;
                mutex_exit(&asb->asc_lock);
        }
-       VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
-       rv = as->as_lower_bound;
+       ASSERT3U(lb, ==, ub);
+       atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb);
+       atomic_store_64(&as->as_upper_bound, lb);
        mutex_exit(&as->as_lock);
 
-       return (rv);
+       return (lb);
 }
 
 void
@@ -172,7 +167,8 @@ aggsum_add(aggsum_t *as, int64_t delta)
        struct aggsum_bucket *asb;
        int64_t borrow;
 
-       asb = &as->as_buckets[CPU_SEQID_UNSTABLE % as->as_numbuckets];
+       asb = &as->as_buckets[(CPU_SEQID_UNSTABLE >> as->as_bucketshift) %
+           as->as_numbuckets];
 
        /* Try fast path if we already borrowed enough before. */
        mutex_enter(&asb->asc_lock);
@@ -188,21 +184,22 @@ aggsum_add(aggsum_t *as, int64_t delta)
         * We haven't borrowed enough.  Take the global lock and borrow
         * considering what is requested now and what we borrowed before.
         */
-       borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier;
+       borrow = (delta < 0 ? -delta : delta);
+       borrow <<= aggsum_borrow_shift + as->as_bucketshift;
        mutex_enter(&as->as_lock);
-       mutex_enter(&asb->asc_lock);
-       delta += asb->asc_delta;
-       asb->asc_delta = 0;
        if (borrow >= asb->asc_borrowed)
                borrow -= asb->asc_borrowed;
        else
                borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
+       mutex_enter(&asb->asc_lock);
+       delta += asb->asc_delta;
+       asb->asc_delta = 0;
        asb->asc_borrowed += borrow;
-       atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
-           delta - borrow);
-       atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
-           delta + borrow);
        mutex_exit(&asb->asc_lock);
+       atomic_store_64((volatile uint64_t *)&as->as_lower_bound,
+           as->as_lower_bound + delta - borrow);
+       atomic_store_64(&as->as_upper_bound,
+           as->as_upper_bound + delta + borrow);
        mutex_exit(&as->as_lock);
 }
 
@@ -214,27 +211,35 @@ aggsum_add(aggsum_t *as, int64_t delta)
 int
 aggsum_compare(aggsum_t *as, uint64_t target)
 {
-       if (as->as_upper_bound < target)
+       int64_t lb;
+       uint64_t ub;
+       int i;
+
+       if (atomic_load_64(&as->as_upper_bound) < target)
                return (-1);
-       if (as->as_lower_bound > target)
+       lb = atomic_load_64((volatile uint64_t *)&as->as_lower_bound);
+       if (lb > 0 && (uint64_t)lb > target)
                return (1);
        mutex_enter(&as->as_lock);
-       for (int i = 0; i < as->as_numbuckets; i++) {
+       lb = as->as_lower_bound;
+       ub = as->as_upper_bound;
+       for (i = 0; i < as->as_numbuckets; i++) {
                struct aggsum_bucket *asb = &as->as_buckets[i];
+               if (asb->asc_borrowed == 0)
+                       continue;
                mutex_enter(&asb->asc_lock);
-               aggsum_flush_bucket(as, asb);
+               lb += asb->asc_delta + asb->asc_borrowed;
+               ub += asb->asc_delta - asb->asc_borrowed;
+               asb->asc_delta = 0;
+               asb->asc_borrowed = 0;
                mutex_exit(&asb->asc_lock);
-               if (as->as_upper_bound < target) {
-                       mutex_exit(&as->as_lock);
-                       return (-1);
-               }
-               if (as->as_lower_bound > target) {
-                       mutex_exit(&as->as_lock);
-                       return (1);
-               }
+               if (ub < target || (lb > 0 && (uint64_t)lb > target))
+                       break;
        }
-       VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
-       ASSERT3U(as->as_lower_bound, ==, target);
+       if (i >= as->as_numbuckets)
+               ASSERT3U(lb, ==, ub);
+       atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb);
+       atomic_store_64(&as->as_upper_bound, ub);
        mutex_exit(&as->as_lock);
-       return (0);
+       return (ub < target ? -1 : (uint64_t)lb > target ? 1 : 0);
 }