]> git.proxmox.com Git - mirror_zfs.git/commitdiff
Tune zio buffer caches and their alignments
author Alexander Motin <mav@FreeBSD.org>
Mon, 30 Oct 2023 21:55:32 +0000 (17:55 -0400)
committer GitHub <noreply@github.com>
Mon, 30 Oct 2023 21:55:32 +0000 (14:55 -0700)
We should not always use PAGESIZE alignment for caches bigger than
it and SPA_MINBLOCKSIZE otherwise.  Doing so rounds the caches for
5, 6, 7, 10 and 14KB up to 8, 12 and 16KB respectively, which makes
no sense.  Instead, specify as alignment the biggest power-of-2
divisor of the size.  This way 2KB and 6KB caches are both aligned
to 2KB, while 4KB and 8KB are aligned to 4KB.

Reduce the number of caches to half-power of 2 instead of
quarter-power of 2.  This removes caches that are difficult for
underlying allocators to fit into page-granular slabs, such as:
2.5, 3.5, 5, 7, 10KB, etc.  Since these caches are mostly used for
transient allocations like ZIOs and small DBUF cache, it is not
worth being too aggressive.  Due to the above alignment issue some
of those caches were not working properly anyway.  The 6KB cache
now finally has a chance to work right, placing 2 buffers into
3 pages, which makes sense.

Remove explicit alignment in Linux user-space case.  I don't think
it should be needed any more with the above fixes.

As a result, on FreeBSD, instead of the following numbers of pages per slab:

vm.uma.zio_buf_comb_16384.keg.ppera: 4
vm.uma.zio_buf_comb_14336.keg.ppera: 4
vm.uma.zio_buf_comb_12288.keg.ppera: 3
vm.uma.zio_buf_comb_10240.keg.ppera: 3
vm.uma.zio_buf_comb_8192.keg.ppera: 2
vm.uma.zio_buf_comb_7168.keg.ppera: 2
vm.uma.zio_buf_comb_6144.keg.ppera: 2   <= Broken
vm.uma.zio_buf_comb_5120.keg.ppera: 2
vm.uma.zio_buf_comb_4096.keg.ppera: 1
vm.uma.zio_buf_comb_3584.keg.ppera: 7   <= Hard to free
vm.uma.zio_buf_comb_3072.keg.ppera: 3
vm.uma.zio_buf_comb_2560.keg.ppera: 2
vm.uma.zio_buf_comb_2048.keg.ppera: 1
vm.uma.zio_buf_comb_1536.keg.ppera: 2
vm.uma.zio_buf_comb_1024.keg.ppera: 1
vm.uma.zio_buf_comb_512.keg.ppera: 1

I am now getting the following:

vm.uma.zio_buf_comb_16384.keg.ppera: 4
vm.uma.zio_buf_comb_12288.keg.ppera: 3
vm.uma.zio_buf_comb_8192.keg.ppera: 2
vm.uma.zio_buf_comb_6144.keg.ppera: 3   <= Fixed, 2 in 3 pages
vm.uma.zio_buf_comb_4096.keg.ppera: 1
vm.uma.zio_buf_comb_3072.keg.ppera: 3
vm.uma.zio_buf_comb_2048.keg.ppera: 1
vm.uma.zio_buf_comb_1536.keg.ppera: 2
vm.uma.zio_buf_comb_1024.keg.ppera: 1
vm.uma.zio_buf_comb_512.keg.ppera: 1

Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #15452

module/zfs/zio.c

index 3eb472a9fd2ab0103230f049721fc3ad003f6239..4eb276352a23984b6377db83f7e0ae573b39381e 100644 (file)
@@ -158,23 +158,22 @@ zio_init(void)
        zio_link_cache = kmem_cache_create("zio_link_cache",
            sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 
-       /*
-        * For small buffers, we want a cache for each multiple of
-        * SPA_MINBLOCKSIZE.  For larger buffers, we want a cache
-        * for each quarter-power of 2.
-        */
        for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
                size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
-               size_t p2 = size;
-               size_t align = 0;
-               size_t data_cflags, cflags;
-
-               data_cflags = KMC_NODEBUG;
-               cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
-                   KMC_NODEBUG : 0;
+               size_t align, cflags, data_cflags;
+               char name[32];
 
+               /*
+                * Create cache for each half-power of 2 size, starting from
+                * SPA_MINBLOCKSIZE.  It should give us memory space efficiency
+                * of ~7/8, sufficient for transient allocations mostly using
+                * these caches.
+                */
+               size_t p2 = size;
                while (!ISP2(p2))
                        p2 &= p2 - 1;
+               if (!IS_P2ALIGNED(size, p2 / 2))
+                       continue;
 
 #ifndef _KERNEL
                /*
@@ -185,47 +184,37 @@ zio_init(void)
                 */
                if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
                        continue;
-               /*
-                * Here's the problem - on 4K native devices in userland on
-                * Linux using O_DIRECT, buffers must be 4K aligned or I/O
-                * will fail with EINVAL, causing zdb (and others) to coredump.
-                * Since userland probably doesn't need optimized buffer caches,
-                * we just force 4K alignment on everything.
-                */
-               align = 8 * SPA_MINBLOCKSIZE;
-#else
-               if (size < PAGESIZE) {
-                       align = SPA_MINBLOCKSIZE;
-               } else if (IS_P2ALIGNED(size, p2 >> 2)) {
-                       align = PAGESIZE;
-               }
 #endif
 
-               if (align != 0) {
-                       char name[36];
-                       if (cflags == data_cflags) {
-                               /*
-                                * Resulting kmem caches would be identical.
-                                * Save memory by creating only one.
-                                */
-                               (void) snprintf(name, sizeof (name),
-                                   "zio_buf_comb_%lu", (ulong_t)size);
-                               zio_buf_cache[c] = kmem_cache_create(name,
-                                   size, align, NULL, NULL, NULL, NULL, NULL,
-                                   cflags);
-                               zio_data_buf_cache[c] = zio_buf_cache[c];
-                               continue;
-                       }
-                       (void) snprintf(name, sizeof (name), "zio_buf_%lu",
-                           (ulong_t)size);
-                       zio_buf_cache[c] = kmem_cache_create(name, size,
-                           align, NULL, NULL, NULL, NULL, NULL, cflags);
-
-                       (void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
-                           (ulong_t)size);
-                       zio_data_buf_cache[c] = kmem_cache_create(name, size,
-                           align, NULL, NULL, NULL, NULL, NULL, data_cflags);
+               if (IS_P2ALIGNED(size, PAGESIZE))
+                       align = PAGESIZE;
+               else
+                       align = 1 << (highbit64(size ^ (size - 1)) - 1);
+
+               cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ?
+                   KMC_NODEBUG : 0;
+               data_cflags = KMC_NODEBUG;
+               if (cflags == data_cflags) {
+                       /*
+                        * Resulting kmem caches would be identical.
+                        * Save memory by creating only one.
+                        */
+                       (void) snprintf(name, sizeof (name),
+                           "zio_buf_comb_%lu", (ulong_t)size);
+                       zio_buf_cache[c] = kmem_cache_create(name, size, align,
+                           NULL, NULL, NULL, NULL, NULL, cflags);
+                       zio_data_buf_cache[c] = zio_buf_cache[c];
+                       continue;
                }
+               (void) snprintf(name, sizeof (name), "zio_buf_%lu",
+                   (ulong_t)size);
+               zio_buf_cache[c] = kmem_cache_create(name, size, align,
+                   NULL, NULL, NULL, NULL, NULL, cflags);
+
+               (void) snprintf(name, sizeof (name), "zio_data_buf_%lu",
+                   (ulong_t)size);
+               zio_data_buf_cache[c] = kmem_cache_create(name, size, align,
+                   NULL, NULL, NULL, NULL, NULL, data_cflags);
        }
 
        while (--c != 0) {