git.proxmox.com Git - mirror_zfs.git/commitdiff
Add explicit prefetches to bpobj_iterate().
author: Alexander Motin <mav@FreeBSD.org>
Fri, 21 Jul 2023 18:50:48 +0000 (14:50 -0400)
committer: GitHub <noreply@github.com>
Fri, 21 Jul 2023 18:50:48 +0000 (11:50 -0700)
To simplify error handling, bpobj_iterate_blkptrs() iterates through
the list of block pointers backwards.  Unfortunately, the speculative
prefetcher is currently unable to detect such patterns, which makes
each block read there synchronous and very slow on HDD pools.

According to my tests, the added explicit prefetch reduces the time
needed to asynchronously delete 8 snapshots of 4 million blocks each
from 20 seconds to less than one, which should free the sync thread
for other useful work, such as async writes, scrub, etc.

While there, plug a memory leak in the bpobj_open() error path and
harmonize some variable names.

Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #15071

include/sys/bpobj.h
module/zfs/bpobj.c

index f3384f5264545d021983e0e3604dd233f80e7109..81bc0fe21086c7fa9fcf2c681bf84862b84a461a 100644 (file)
@@ -60,7 +60,7 @@ typedef struct bpobj {
        kmutex_t        bpo_lock;
        objset_t        *bpo_os;
        uint64_t        bpo_object;
-       int             bpo_epb;
+       uint32_t        bpo_epb;
        uint8_t         bpo_havecomp;
        uint8_t         bpo_havesubobj;
        uint8_t         bpo_havefreed;
index 211bab56519cbdbba9e0bffd5ca976159e59aeb4..e772caead29bd84fa06e852306981cfb4ae85ce6 100644 (file)
@@ -284,7 +284,17 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
        dmu_buf_t *dbuf = NULL;
        bpobj_t *bpo = bpi->bpi_bpo;
 
-       for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) {
+       int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1;
+       uint64_t pe = P2ALIGN_TYPED(i, bpo->bpo_epb, uint64_t) *
+           sizeof (blkptr_t);
+       uint64_t ps = start * sizeof (blkptr_t);
+       uint64_t pb = MAX((pe > dmu_prefetch_max) ? pe - dmu_prefetch_max : 0,
+           ps);
+       if (pe > pb) {
+               dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0, pb, pe - pb,
+                   ZIO_PRIORITY_ASYNC_READ);
+       }
+       for (; i >= start; i--) {
                uint64_t offset = i * sizeof (blkptr_t);
                uint64_t blkoff = P2PHASE(i, bpo->bpo_epb);
 
@@ -292,9 +302,16 @@ bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg,
                        if (dbuf)
                                dmu_buf_rele(dbuf, FTAG);
                        err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
-                           offset, FTAG, &dbuf, 0);
+                           offset, FTAG, &dbuf, DMU_READ_NO_PREFETCH);
                        if (err)
                                break;
+                       pe = pb;
+                       pb = MAX((dbuf->db_offset > dmu_prefetch_max) ?
+                           dbuf->db_offset - dmu_prefetch_max : 0, ps);
+                       if (pe > pb) {
+                               dmu_prefetch(bpo->bpo_os, bpo->bpo_object, 0,
+                                   pb, pe - pb, ZIO_PRIORITY_ASYNC_READ);
+                       }
                }
 
                ASSERT3U(offset, >=, dbuf->db_offset);
@@ -466,22 +483,30 @@ bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg,
                        int64_t i = bpi->bpi_unprocessed_subobjs - 1;
                        uint64_t offset = i * sizeof (uint64_t);
 
-                       uint64_t obj_from_sublist;
+                       uint64_t subobj;
                        err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
-                           offset, sizeof (uint64_t), &obj_from_sublist,
-                           DMU_READ_PREFETCH);
+                           offset, sizeof (uint64_t), &subobj,
+                           DMU_READ_NO_PREFETCH);
                        if (err)
                                break;
-                       bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t),
-                           KM_SLEEP);
 
-                       err = bpobj_open(sublist, bpo->bpo_os,
-                           obj_from_sublist);
-                       if (err)
+                       bpobj_t *subbpo = kmem_alloc(sizeof (bpobj_t),
+                           KM_SLEEP);
+                       err = bpobj_open(subbpo, bpo->bpo_os, subobj);
+                       if (err) {
+                               kmem_free(subbpo, sizeof (bpobj_t));
                                break;
+                       }
+
+                       if (subbpo->bpo_havesubobj &&
+                           subbpo->bpo_phys->bpo_subobjs != 0) {
+                               dmu_prefetch(subbpo->bpo_os,
+                                   subbpo->bpo_phys->bpo_subobjs, 0, 0, 0,
+                                   ZIO_PRIORITY_ASYNC_READ);
+                       }
 
-                       list_insert_head(&stack, bpi_alloc(sublist, bpi, i));
-                       mutex_enter(&sublist->bpo_lock);
+                       list_insert_head(&stack, bpi_alloc(subbpo, bpi, i));
+                       mutex_enter(&subbpo->bpo_lock);
                        bpi->bpi_unprocessed_subobjs--;
                }
        }