git.proxmox.com Git - mirror_zfs.git/commitdiff
More aggsum optimizations
author Alexander Motin <mav@FreeBSD.org>
Mon, 7 Jun 2021 16:02:47 +0000 (12:02 -0400)
committer GitHub <noreply@github.com>
Mon, 7 Jun 2021 16:02:47 +0000 (09:02 -0700)
- Avoid atomic_add() when updating as_lower_bound/as_upper_bound.
The previous code was excessively strong on 64-bit systems while not
strong enough on 32-bit ones.  Instead, introduce and use real
atomic_load() and atomic_store() operations: plain assignments on
64-bit machines, but proper atomics on 32-bit ones to avoid torn
reads/writes (see the sketch after this list).

- Reduce the number of buckets on large systems.  Extra buckets do
not improve add speed as much as they hurt reads.  Unlike wmsum,
aggsum still cares about read performance (a worked example of the
new bucket sizing also follows the list).
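
A minimal, self-contained sketch of the load/store split described above (hypothetical demo_* names, GCC/Clang __atomic builtins; not the actual ZFS sources): on 64-bit builds a plain volatile access is already single-copy atomic, so a bare assignment suffices, while 32-bit builds route through __atomic_load_n()/__atomic_store_n() so a 64-bit bound is never read or written as two separate 32-bit halves.

/*
 * demo_load_64()/demo_store_64() mirror the libspl additions in the
 * diff below: plain volatile accesses under _LP64, relaxed __atomic
 * builtins otherwise to avoid torn 64-bit reads/writes on 32-bit CPUs.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#ifdef _LP64
#define	demo_load_64(p)		(*(volatile uint64_t *)(p))
#define	demo_store_64(p, v)	(*(volatile uint64_t *)(p) = (uint64_t)(v))
#else
static uint64_t
demo_load_64(volatile uint64_t *p)
{
	return (__atomic_load_n(p, __ATOMIC_RELAXED));
}

static void
demo_store_64(volatile uint64_t *p, uint64_t v)
{
	__atomic_store_n(p, v, __ATOMIC_RELAXED);
}
#endif

int
main(void)
{
	volatile uint64_t bound = 0;

	demo_store_64(&bound, UINT64_C(0x123456789abcdef0));
	printf("0x%" PRIx64 "\n", demo_load_64(&bound));
	return (0);
}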
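
And a worked example of the new bucket sizing, reproducing the formula aggsum_init() now uses (highbit64() here is a stand-in for the ZFS helper, which returns the 1-based index of the highest set bit and 0 for 0): 12 CPUs get one bucket per 2 CPUs, 48 CPUs one per 4, and so on.

/*
 * Reproduces as_bucketshift/as_numbuckets from aggsum_init() in the
 * diff below for a few CPU counts.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static unsigned int
highbit64(uint64_t v)
{
	/* 1-based index of the highest set bit; 0 when v == 0. */
	return (v == 0 ? 0 : 64 - (unsigned int)__builtin_clzll(v));
}

int
main(void)
{
	const unsigned int ncpus[] = { 4, 12, 24, 48, 96, 192 };

	for (size_t i = 0; i < sizeof (ncpus) / sizeof (ncpus[0]); i++) {
		unsigned int n = ncpus[i];
		unsigned int shift = highbit64(n / 6) / 2;
		unsigned int buckets = ((n - 1) >> shift) + 1;

		/* e.g. 12 CPUs -> shift 1 (bucket per 2 CPUs), 48 -> shift 2. */
		printf("%3u CPUs: bucketshift %u, %2u buckets\n",
		    n, shift, buckets);
	}
	return (0);
}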

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Ryan Moeller <ryan@iXsystems.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored-By: iXsystems, Inc.
Closes #12145

include/os/linux/spl/sys/atomic.h
include/sys/aggsum.h
lib/libspl/atomic.c
lib/libspl/include/atomic.h
module/zfs/aggsum.c

index 2d21cbb3e140fa71c6638c6acdade6ec17999721..8f7fa5aeda114c5112665b4b8cc00a6f2997ff23 100644 (file)
@@ -48,6 +48,8 @@
 #define        atomic_sub_32_nv(v, i)  atomic_sub_return((i), (atomic_t *)(v))
 #define        atomic_cas_32(v, x, y)  atomic_cmpxchg((atomic_t *)(v), x, y)
 #define        atomic_swap_32(v, x)    atomic_xchg((atomic_t *)(v), x)
+#define        atomic_load_32(v)       atomic_read((atomic_t *)(v))
+#define        atomic_store_32(v, x)   atomic_set((atomic_t *)(v), x)
 #define        atomic_inc_64(v)        atomic64_inc((atomic64_t *)(v))
 #define        atomic_dec_64(v)        atomic64_dec((atomic64_t *)(v))
 #define        atomic_add_64(v, i)     atomic64_add((i), (atomic64_t *)(v))
@@ -58,6 +60,8 @@
 #define        atomic_sub_64_nv(v, i)  atomic64_sub_return((i), (atomic64_t *)(v))
 #define        atomic_cas_64(v, x, y)  atomic64_cmpxchg((atomic64_t *)(v), x, y)
 #define        atomic_swap_64(v, x)    atomic64_xchg((atomic64_t *)(v), x)
+#define        atomic_load_64(v)       atomic64_read((atomic64_t *)(v))
+#define        atomic_store_64(v, x)   atomic64_set((atomic64_t *)(v), x)
 
 #ifdef _LP64
 static __inline__ void *
index cb43727f1df426673b6333cb592ddbb9da564a1d..65800058cbf6f3906e381e5d24497bf7a60150e7 100644 (file)
@@ -39,15 +39,16 @@ struct aggsum_bucket {
 typedef struct aggsum {
        kmutex_t as_lock;
        int64_t as_lower_bound;
-       int64_t as_upper_bound;
+       uint64_t as_upper_bound;
+       aggsum_bucket_t *as_buckets ____cacheline_aligned;
        uint_t as_numbuckets;
-       aggsum_bucket_t *as_buckets;
+       uint_t as_bucketshift;
 } aggsum_t;
 
 void aggsum_init(aggsum_t *, uint64_t);
 void aggsum_fini(aggsum_t *);
 int64_t aggsum_lower_bound(aggsum_t *);
-int64_t aggsum_upper_bound(aggsum_t *);
+uint64_t aggsum_upper_bound(aggsum_t *);
 int aggsum_compare(aggsum_t *, uint64_t);
 uint64_t aggsum_value(aggsum_t *);
 void aggsum_add(aggsum_t *, int64_t);
index d1a71b77710f263d24d5583d0ca970207a6b7b8c..4717d818ce5c0ee8bcc91db5e894f10ff7b1a998 100644 (file)
@@ -313,6 +313,19 @@ atomic_swap_ptr(volatile void *target, void *bits)
        return (__atomic_exchange_n((void **)target, bits, __ATOMIC_SEQ_CST));
 }
 
+#ifndef _LP64
+uint64_t
+atomic_load_64(volatile uint64_t *target)
+{
+       return (__atomic_load_n(target, __ATOMIC_RELAXED));
+}
+
+void
+atomic_store_64(volatile uint64_t *target, uint64_t bits)
+{
+       return (__atomic_store_n(target, bits, __ATOMIC_RELAXED));
+}
+#endif
 
 int
 atomic_set_long_excl(volatile ulong_t *target, uint_t value)
index f8c257f9696ba859c1cbbb33c5b5f9f84572c4b5..8dd1d654a486baf4be55f09d89f7a5c6457bab84 100644 (file)
@@ -245,6 +245,49 @@ extern ulong_t atomic_swap_ulong(volatile ulong_t *, ulong_t);
 extern uint64_t atomic_swap_64(volatile uint64_t *, uint64_t);
 #endif
 
+/*
+ * Atomically read variable.
+ */
+#define        atomic_load_char(p)     (*(volatile uchar_t *)(p))
+#define        atomic_load_short(p)    (*(volatile ushort_t *)(p))
+#define        atomic_load_int(p)      (*(volatile uint_t *)(p))
+#define        atomic_load_long(p)     (*(volatile ulong_t *)(p))
+#define        atomic_load_ptr(p)      (*(volatile __typeof(*p) *)(p))
+#define        atomic_load_8(p)        (*(volatile uint8_t *)(p))
+#define        atomic_load_16(p)       (*(volatile uint16_t *)(p))
+#define        atomic_load_32(p)       (*(volatile uint32_t *)(p))
+#ifdef _LP64
+#define        atomic_load_64(p)       (*(volatile uint64_t *)(p))
+#elif defined(_INT64_TYPE)
+extern uint64_t atomic_load_64(volatile uint64_t *);
+#endif
+
+/*
+ * Atomically write variable.
+ */
+#define        atomic_store_char(p, v)         \
+       (*(volatile uchar_t *)(p) = (uchar_t)(v))
+#define        atomic_store_short(p, v)        \
+       (*(volatile ushort_t *)(p) = (ushort_t)(v))
+#define        atomic_store_int(p, v)          \
+       (*(volatile uint_t *)(p) = (uint_t)(v))
+#define        atomic_store_long(p, v)         \
+       (*(volatile ulong_t *)(p) = (ulong_t)(v))
+#define        atomic_store_ptr(p, v)          \
+       (*(volatile __typeof(*p) *)(p) = (v))
+#define        atomic_store_8(p, v)            \
+       (*(volatile uint8_t *)(p) = (uint8_t)(v))
+#define        atomic_store_16(p, v)           \
+       (*(volatile uint16_t *)(p) = (uint16_t)(v))
+#define        atomic_store_32(p, v)           \
+       (*(volatile uint32_t *)(p) = (uint32_t)(v))
+#ifdef _LP64
+#define        atomic_store_64(p, v)           \
+       (*(volatile uint64_t *)(p) = (uint64_t)(v))
+#elif defined(_INT64_TYPE)
+extern void atomic_store_64(volatile uint64_t *, uint64_t);
+#endif
+
 /*
  * Perform an exclusive atomic bit set/clear on a target.
  * Returns 0 if bit was successfully set/cleared, or -1
index e46da95f676cd64b14bca22dff2eb18c2a1e9256..c4ea4f86fc5f89b88fc2ef29d7640fbd3189f2fe 100644 (file)
  */
 
 /*
- * We will borrow aggsum_borrow_multiplier times the current request, so we will
- * have to get the as_lock approximately every aggsum_borrow_multiplier calls to
- * aggsum_delta().
+ * We will borrow 2^aggsum_borrow_shift times the current request, so we will
+ * have to get the as_lock approximately every 2^aggsum_borrow_shift calls to
+ * aggsum_add().
  */
-static uint_t aggsum_borrow_multiplier = 10;
+static uint_t aggsum_borrow_shift = 4;
 
 void
 aggsum_init(aggsum_t *as, uint64_t value)
@@ -90,9 +90,14 @@ aggsum_init(aggsum_t *as, uint64_t value)
        bzero(as, sizeof (*as));
        as->as_lower_bound = as->as_upper_bound = value;
        mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL);
-       as->as_numbuckets = boot_ncpus;
-       as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t),
-           KM_SLEEP);
+       /*
+        * Too many buckets may hurt read performance without improving
+        * write.  From 12 CPUs use bucket per 2 CPUs, from 48 per 4, etc.
+        */
+       as->as_bucketshift = highbit64(boot_ncpus / 6) / 2;
+       as->as_numbuckets = ((boot_ncpus - 1) >> as->as_bucketshift) + 1;
+       as->as_buckets = kmem_zalloc(as->as_numbuckets *
+           sizeof (aggsum_bucket_t), KM_SLEEP);
        for (int i = 0; i < as->as_numbuckets; i++) {
                mutex_init(&as->as_buckets[i].asc_lock,
                    NULL, MUTEX_DEFAULT, NULL);
@@ -111,59 +116,49 @@ aggsum_fini(aggsum_t *as)
 int64_t
 aggsum_lower_bound(aggsum_t *as)
 {
-       return (as->as_lower_bound);
+       return (atomic_load_64((volatile uint64_t *)&as->as_lower_bound));
 }
 
-int64_t
+uint64_t
 aggsum_upper_bound(aggsum_t *as)
 {
-       return (as->as_upper_bound);
-}
-
-static void
-aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb)
-{
-       ASSERT(MUTEX_HELD(&as->as_lock));
-       ASSERT(MUTEX_HELD(&asb->asc_lock));
-
-       /*
-        * We use atomic instructions for this because we read the upper and
-        * lower bounds without the lock, so we need stores to be atomic.
-        */
-       atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
-           asb->asc_delta + asb->asc_borrowed);
-       atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
-           asb->asc_delta - asb->asc_borrowed);
-       asb->asc_delta = 0;
-       asb->asc_borrowed = 0;
+       return (atomic_load_64(&as->as_upper_bound));
 }
 
 uint64_t
 aggsum_value(aggsum_t *as)
 {
-       int64_t rv;
+       int64_t lb;
+       uint64_t ub;
 
        mutex_enter(&as->as_lock);
-       if (as->as_lower_bound == as->as_upper_bound) {
-               rv = as->as_lower_bound;
+       lb = as->as_lower_bound;
+       ub = as->as_upper_bound;
+       if (lb == ub) {
                for (int i = 0; i < as->as_numbuckets; i++) {
                        ASSERT0(as->as_buckets[i].asc_delta);
                        ASSERT0(as->as_buckets[i].asc_borrowed);
                }
                mutex_exit(&as->as_lock);
-               return (rv);
+               return (lb);
        }
        for (int i = 0; i < as->as_numbuckets; i++) {
                struct aggsum_bucket *asb = &as->as_buckets[i];
+               if (asb->asc_borrowed == 0)
+                       continue;
                mutex_enter(&asb->asc_lock);
-               aggsum_flush_bucket(as, asb);
+               lb += asb->asc_delta + asb->asc_borrowed;
+               ub += asb->asc_delta - asb->asc_borrowed;
+               asb->asc_delta = 0;
+               asb->asc_borrowed = 0;
                mutex_exit(&asb->asc_lock);
        }
-       VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
-       rv = as->as_lower_bound;
+       ASSERT3U(lb, ==, ub);
+       atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb);
+       atomic_store_64(&as->as_upper_bound, lb);
        mutex_exit(&as->as_lock);
 
-       return (rv);
+       return (lb);
 }
 
 void
@@ -172,7 +167,8 @@ aggsum_add(aggsum_t *as, int64_t delta)
        struct aggsum_bucket *asb;
        int64_t borrow;
 
-       asb = &as->as_buckets[CPU_SEQID_UNSTABLE % as->as_numbuckets];
+       asb = &as->as_buckets[(CPU_SEQID_UNSTABLE >> as->as_bucketshift) %
+           as->as_numbuckets];
 
        /* Try fast path if we already borrowed enough before. */
        mutex_enter(&asb->asc_lock);
@@ -188,21 +184,22 @@ aggsum_add(aggsum_t *as, int64_t delta)
         * We haven't borrowed enough.  Take the global lock and borrow
         * considering what is requested now and what we borrowed before.
         */
-       borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier;
+       borrow = (delta < 0 ? -delta : delta);
+       borrow <<= aggsum_borrow_shift + as->as_bucketshift;
        mutex_enter(&as->as_lock);
-       mutex_enter(&asb->asc_lock);
-       delta += asb->asc_delta;
-       asb->asc_delta = 0;
        if (borrow >= asb->asc_borrowed)
                borrow -= asb->asc_borrowed;
        else
                borrow = (borrow - (int64_t)asb->asc_borrowed) / 4;
+       mutex_enter(&asb->asc_lock);
+       delta += asb->asc_delta;
+       asb->asc_delta = 0;
        asb->asc_borrowed += borrow;
-       atomic_add_64((volatile uint64_t *)&as->as_lower_bound,
-           delta - borrow);
-       atomic_add_64((volatile uint64_t *)&as->as_upper_bound,
-           delta + borrow);
        mutex_exit(&asb->asc_lock);
+       atomic_store_64((volatile uint64_t *)&as->as_lower_bound,
+           as->as_lower_bound + delta - borrow);
+       atomic_store_64(&as->as_upper_bound,
+           as->as_upper_bound + delta + borrow);
        mutex_exit(&as->as_lock);
 }
 
@@ -214,27 +211,35 @@ aggsum_add(aggsum_t *as, int64_t delta)
 int
 aggsum_compare(aggsum_t *as, uint64_t target)
 {
-       if (as->as_upper_bound < target)
+       int64_t lb;
+       uint64_t ub;
+       int i;
+
+       if (atomic_load_64(&as->as_upper_bound) < target)
                return (-1);
-       if (as->as_lower_bound > target)
+       lb = atomic_load_64((volatile uint64_t *)&as->as_lower_bound);
+       if (lb > 0 && (uint64_t)lb > target)
                return (1);
        mutex_enter(&as->as_lock);
-       for (int i = 0; i < as->as_numbuckets; i++) {
+       lb = as->as_lower_bound;
+       ub = as->as_upper_bound;
+       for (i = 0; i < as->as_numbuckets; i++) {
                struct aggsum_bucket *asb = &as->as_buckets[i];
+               if (asb->asc_borrowed == 0)
+                       continue;
                mutex_enter(&asb->asc_lock);
-               aggsum_flush_bucket(as, asb);
+               lb += asb->asc_delta + asb->asc_borrowed;
+               ub += asb->asc_delta - asb->asc_borrowed;
+               asb->asc_delta = 0;
+               asb->asc_borrowed = 0;
                mutex_exit(&asb->asc_lock);
-               if (as->as_upper_bound < target) {
-                       mutex_exit(&as->as_lock);
-                       return (-1);
-               }
-               if (as->as_lower_bound > target) {
-                       mutex_exit(&as->as_lock);
-                       return (1);
-               }
+               if (ub < target || (lb > 0 && (uint64_t)lb > target))
+                       break;
        }
-       VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound);
-       ASSERT3U(as->as_lower_bound, ==, target);
+       if (i >= as->as_numbuckets)
+               ASSERT3U(lb, ==, ub);
+       atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb);
+       atomic_store_64(&as->as_upper_bound, ub);
        mutex_exit(&as->as_lock);
-       return (0);
+       return (ub < target ? -1 : (uint64_t)lb > target ? 1 : 0);
 }