L2ARC: Relax locking during write
author     Alexander Motin <mav@FreeBSD.org>
           Tue, 9 Apr 2024 23:23:19 +0000 (19:23 -0400)
committer  GitHub <noreply@github.com>
           Tue, 9 Apr 2024 23:23:19 +0000 (16:23 -0700)
Previous code held the ARC state sublist lock throughout the whole
L2ARC write process, which included a number of allocations and
even ZIO issues.  Blocking in any of those places could also block
ARC eviction, which could cause OOM activation or even deadlock if
the system is low on memory or memory is too fragmented.

Fix it by dropping the lock as soon as we see a block eligible for
L2ARC writing and picking it up later using a marker inserted
earlier.  While there, also reduce the scope of the hash lock,
moving ZIO allocation and other operations that do not require
header access out of it.  All operations requiring header access
are moved under the hash lock, since the L2_WRITING flag does not
prevent header eviction, only the transition to the arc_l2c_only
state of a header with an L1 header.
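
As a rough sketch of the new per-buffer loop (not the literal code;
see the arc.c hunk below), assuming hypothetical eligible() and
issue_l2arc_write() helpers standing in for the real eligibility
checks and the now-unlocked allocation/ZIO path:

        while (hdr != NULL) {
                if (!eligible(hdr)) {   /* hypothetical check */
                        hdr = from_head ?
                            multilist_sublist_next(mls, hdr) :
                            multilist_sublist_prev(mls, hdr);
                        continue;
                }
                /* Remember the position, then drop the sublist lock. */
                if (from_head)
                        multilist_sublist_insert_after(mls, hdr, marker);
                else
                        multilist_sublist_insert_before(mls, hdr, marker);
                multilist_sublist_unlock(mls);

                /* Allocations and ZIO issue happen unlocked here. */
                issue_l2arc_write(hdr); /* hypothetical */

                /* Re-take the lock and resume from the marker. */
                multilist_sublist_lock(mls);
                hdr = from_head ?
                    multilist_sublist_next(mls, marker) :
                    multilist_sublist_prev(mls, marker);
                multilist_sublist_remove(mls, marker);
        }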

To be able to manipulate the sublist lock and the marker as needed,
add a few more multilist functions and modify one.
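
For reference, the multilist API additions as declared in the
include/sys/multilist.h hunk below:

        /* re-acquire a sublist lock that was previously dropped */
        void multilist_sublist_lock(multilist_sublist_t *);

        /* the old index-based lock, renamed */
        multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int);

        /* positional inserts used to place the marker next to a header */
        void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *);
        void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *);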

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #16040

include/sys/multilist.h
module/zfs/arc.c
module/zfs/dbuf.c
module/zfs/dmu_objset.c
module/zfs/metaslab.c
module/zfs/multilist.c

index 26f37c37ab38b177967cbf4ed0361363189cca87..e7de86f2379b2d3870c5909f7a0cabb8fd986a63 100644 (file)
@@ -82,12 +82,15 @@ int  multilist_is_empty(multilist_t *);
 unsigned int multilist_get_num_sublists(multilist_t *);
 unsigned int multilist_get_random_index(multilist_t *);
 
-multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int);
+void multilist_sublist_lock(multilist_sublist_t *);
+multilist_sublist_t *multilist_sublist_lock_idx(multilist_t *, unsigned int);
 multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *);
 void multilist_sublist_unlock(multilist_sublist_t *);
 
 void multilist_sublist_insert_head(multilist_sublist_t *, void *);
 void multilist_sublist_insert_tail(multilist_sublist_t *, void *);
+void multilist_sublist_insert_after(multilist_sublist_t *, void *, void *);
+void multilist_sublist_insert_before(multilist_sublist_t *, void *, void *);
 void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj);
 void multilist_sublist_remove(multilist_sublist_t *, void *);
 int  multilist_sublist_is_empty(multilist_sublist_t *);
index b1bcac6c44bc9f9e82870c7e96ed17270dc1d157..16c95db10f47fa4760138c19ef1ce7b610018b3a 100644 (file)
@@ -3872,7 +3872,7 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
 
        ASSERT3P(marker, !=, NULL);
 
-       mls = multilist_sublist_lock(ml, idx);
+       mls = multilist_sublist_lock_idx(ml, idx);
 
        for (hdr = multilist_sublist_prev(mls, marker); likely(hdr != NULL);
            hdr = multilist_sublist_prev(mls, marker)) {
@@ -3984,6 +3984,26 @@ arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
        return (bytes_evicted);
 }
 
+static arc_buf_hdr_t *
+arc_state_alloc_marker(void)
+{
+       arc_buf_hdr_t *marker = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
+
+       /*
+        * A b_spa of 0 is used to indicate that this header is
+        * a marker. This fact is used in arc_evict_state_impl().
+        */
+       marker->b_spa = 0;
+
+       return (marker);
+}
+
+static void
+arc_state_free_marker(arc_buf_hdr_t *marker)
+{
+       kmem_cache_free(hdr_full_cache, marker);
+}
+
 /*
  * Allocate an array of buffer headers used as placeholders during arc state
  * eviction.
@@ -3994,16 +4014,8 @@ arc_state_alloc_markers(int count)
        arc_buf_hdr_t **markers;
 
        markers = kmem_zalloc(sizeof (*markers) * count, KM_SLEEP);
-       for (int i = 0; i < count; i++) {
-               markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
-
-               /*
-                * A b_spa of 0 is used to indicate that this header is
-                * a marker. This fact is used in arc_evict_state_impl().
-                */
-               markers[i]->b_spa = 0;
-
-       }
+       for (int i = 0; i < count; i++)
+               markers[i] = arc_state_alloc_marker();
        return (markers);
 }
 
@@ -4011,7 +4023,7 @@ static void
 arc_state_free_markers(arc_buf_hdr_t **markers, int count)
 {
        for (int i = 0; i < count; i++)
-               kmem_cache_free(hdr_full_cache, markers[i]);
+               arc_state_free_marker(markers[i]);
        kmem_free(markers, sizeof (*markers) * count);
 }
 
@@ -4055,7 +4067,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
        for (int i = 0; i < num_sublists; i++) {
                multilist_sublist_t *mls;
 
-               mls = multilist_sublist_lock(ml, i);
+               mls = multilist_sublist_lock_idx(ml, i);
                multilist_sublist_insert_tail(mls, markers[i]);
                multilist_sublist_unlock(mls);
        }
@@ -4120,7 +4132,7 @@ arc_evict_state(arc_state_t *state, arc_buf_contents_t type, uint64_t spa,
        }
 
        for (int i = 0; i < num_sublists; i++) {
-               multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+               multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
                multilist_sublist_remove(mls, markers[i]);
                multilist_sublist_unlock(mls);
        }
@@ -8633,7 +8645,7 @@ l2arc_sublist_lock(int list_num)
         * sublists being selected.
         */
        idx = multilist_get_random_index(ml);
-       return (multilist_sublist_lock(ml, idx));
+       return (multilist_sublist_lock_idx(ml, idx));
 }
 
 /*
@@ -9046,9 +9058,9 @@ l2arc_blk_fetch_done(zio_t *zio)
 static uint64_t
 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 {
-       arc_buf_hdr_t           *hdr, *hdr_prev, *head;
-       uint64_t                write_asize, write_psize, write_lsize, headroom;
-       boolean_t               full;
+       arc_buf_hdr_t           *hdr, *head, *marker;
+       uint64_t                write_asize, write_psize, headroom;
+       boolean_t               full, from_head = !arc_warm;
        l2arc_write_callback_t  *cb = NULL;
        zio_t                   *pio, *wzio;
        uint64_t                guid = spa_load_guid(spa);
@@ -9057,10 +9069,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
        ASSERT3P(dev->l2ad_vdev, !=, NULL);
 
        pio = NULL;
-       write_lsize = write_asize = write_psize = 0;
+       write_asize = write_psize = 0;
        full = B_FALSE;
        head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
        arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
+       marker = arc_state_alloc_marker();
 
        /*
         * Copy buffers for L2ARC writing.
@@ -9075,40 +9088,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
                                continue;
                }
 
-               multilist_sublist_t *mls = l2arc_sublist_lock(pass);
                uint64_t passed_sz = 0;
-
-               VERIFY3P(mls, !=, NULL);
+               headroom = target_sz * l2arc_headroom;
+               if (zfs_compressed_arc_enabled)
+                       headroom = (headroom * l2arc_headroom_boost) / 100;
 
                /*
-                * L2ARC fast warmup.
-                *
                 * Until the ARC is warm and starts to evict, read from the
                 * head of the ARC lists rather than the tail.
                 */
-               if (arc_warm == B_FALSE)
+               multilist_sublist_t *mls = l2arc_sublist_lock(pass);
+               ASSERT3P(mls, !=, NULL);
+               if (from_head)
                        hdr = multilist_sublist_head(mls);
                else
                        hdr = multilist_sublist_tail(mls);
 
-               headroom = target_sz * l2arc_headroom;
-               if (zfs_compressed_arc_enabled)
-                       headroom = (headroom * l2arc_headroom_boost) / 100;
-
-               for (; hdr; hdr = hdr_prev) {
+               while (hdr != NULL) {
                        kmutex_t *hash_lock;
                        abd_t *to_write = NULL;
 
-                       if (arc_warm == B_FALSE)
-                               hdr_prev = multilist_sublist_next(mls, hdr);
-                       else
-                               hdr_prev = multilist_sublist_prev(mls, hdr);
-
                        hash_lock = HDR_LOCK(hdr);
                        if (!mutex_tryenter(hash_lock)) {
-                               /*
-                                * Skip this buffer rather than waiting.
-                                */
+skip:
+                               /* Skip this buffer rather than waiting. */
+                               if (from_head)
+                                       hdr = multilist_sublist_next(mls, hdr);
+                               else
+                                       hdr = multilist_sublist_prev(mls, hdr);
                                continue;
                        }
 
@@ -9123,11 +9130,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
 
                        if (!l2arc_write_eligible(guid, hdr)) {
                                mutex_exit(hash_lock);
-                               continue;
+                               goto skip;
                        }
 
                        ASSERT(HDR_HAS_L1HDR(hdr));
-
                        ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
                        ASSERT3U(arc_hdr_size(hdr), >, 0);
                        ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
@@ -9149,12 +9155,18 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
                        }
 
                        /*
-                        * We rely on the L1 portion of the header below, so
-                        * it's invalid for this header to have been evicted out
-                        * of the ghost cache, prior to being written out. The
-                        * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+                        * We should not sleep with sublist lock held or it
+                        * may block ARC eviction.  Insert a marker to save
+                        * the position and drop the lock.
                         */
-                       arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
+                       if (from_head) {
+                               multilist_sublist_insert_after(mls, hdr,
+                                   marker);
+                       } else {
+                               multilist_sublist_insert_before(mls, hdr,
+                                   marker);
+                       }
+                       multilist_sublist_unlock(mls);
 
                        /*
                         * If this header has b_rabd, we can use this since it
@@ -9185,32 +9197,45 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
                                    &to_write);
                                if (ret != 0) {
                                        arc_hdr_clear_flags(hdr,
-                                           ARC_FLAG_L2_WRITING);
+                                           ARC_FLAG_L2CACHE);
                                        mutex_exit(hash_lock);
-                                       continue;
+                                       goto next;
                                }
 
                                l2arc_free_abd_on_write(to_write, asize, type);
                        }
 
+                       hdr->b_l2hdr.b_dev = dev;
+                       hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
+                       hdr->b_l2hdr.b_hits = 0;
+                       hdr->b_l2hdr.b_arcs_state =
+                           hdr->b_l1hdr.b_state->arcs_state;
+                       mutex_enter(&dev->l2ad_mtx);
                        if (pio == NULL) {
                                /*
                                 * Insert a dummy header on the buflist so
                                 * l2arc_write_done() can find where the
                                 * write buffers begin without searching.
                                 */
-                               mutex_enter(&dev->l2ad_mtx);
                                list_insert_head(&dev->l2ad_buflist, head);
-                               mutex_exit(&dev->l2ad_mtx);
+                       }
+                       list_insert_head(&dev->l2ad_buflist, hdr);
+                       mutex_exit(&dev->l2ad_mtx);
+                       arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR |
+                           ARC_FLAG_L2_WRITING);
+
+                       (void) zfs_refcount_add_many(&dev->l2ad_alloc,
+                           arc_hdr_size(hdr), hdr);
+                       l2arc_hdr_arcstats_increment(hdr);
 
+                       boolean_t commit = l2arc_log_blk_insert(dev, hdr);
+                       mutex_exit(hash_lock);
+
+                       if (pio == NULL) {
                                cb = kmem_alloc(
                                    sizeof (l2arc_write_callback_t), KM_SLEEP);
                                cb->l2wcb_dev = dev;
                                cb->l2wcb_head = head;
-                               /*
-                                * Create a list to save allocated abd buffers
-                                * for l2arc_log_blk_commit().
-                                */
                                list_create(&cb->l2wcb_abd_list,
                                    sizeof (l2arc_lb_abd_buf_t),
                                    offsetof(l2arc_lb_abd_buf_t, node));
@@ -9218,54 +9243,34 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
                                    ZIO_FLAG_CANFAIL);
                        }
 
-                       hdr->b_l2hdr.b_dev = dev;
-                       hdr->b_l2hdr.b_hits = 0;
-
-                       hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
-                       hdr->b_l2hdr.b_arcs_state =
-                           hdr->b_l1hdr.b_state->arcs_state;
-                       arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR);
-
-                       mutex_enter(&dev->l2ad_mtx);
-                       list_insert_head(&dev->l2ad_buflist, hdr);
-                       mutex_exit(&dev->l2ad_mtx);
-
-                       (void) zfs_refcount_add_many(&dev->l2ad_alloc,
-                           arc_hdr_size(hdr), hdr);
-
                        wzio = zio_write_phys(pio, dev->l2ad_vdev,
-                           hdr->b_l2hdr.b_daddr, asize, to_write,
+                           dev->l2ad_hand, asize, to_write,
                            ZIO_CHECKSUM_OFF, NULL, hdr,
                            ZIO_PRIORITY_ASYNC_WRITE,
                            ZIO_FLAG_CANFAIL, B_FALSE);
 
-                       write_lsize += HDR_GET_LSIZE(hdr);
                        DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
                            zio_t *, wzio);
+                       zio_nowait(wzio);
 
                        write_psize += psize;
                        write_asize += asize;
                        dev->l2ad_hand += asize;
-                       l2arc_hdr_arcstats_increment(hdr);
                        vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
 
-                       mutex_exit(hash_lock);
-
-                       /*
-                        * Append buf info to current log and commit if full.
-                        * arcstat_l2_{size,asize} kstats are updated
-                        * internally.
-                        */
-                       if (l2arc_log_blk_insert(dev, hdr)) {
-                               /*
-                                * l2ad_hand will be adjusted in
-                                * l2arc_log_blk_commit().
-                                */
+                       if (commit) {
+                               /* l2ad_hand will be adjusted inside. */
                                write_asize +=
                                    l2arc_log_blk_commit(dev, pio, cb);
                        }
 
-                       zio_nowait(wzio);
+next:
+                       multilist_sublist_lock(mls);
+                       if (from_head)
+                               hdr = multilist_sublist_next(mls, marker);
+                       else
+                               hdr = multilist_sublist_prev(mls, marker);
+                       multilist_sublist_remove(mls, marker);
                }
 
                multilist_sublist_unlock(mls);
@@ -9274,9 +9279,11 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
                        break;
        }
 
+       arc_state_free_marker(marker);
+
        /* No buffers selected for writing? */
        if (pio == NULL) {
-               ASSERT0(write_lsize);
+               ASSERT0(write_psize);
                ASSERT(!HDR_HAS_L1HDR(head));
                kmem_cache_free(hdr_l2only_cache, head);
 
@@ -10604,7 +10611,7 @@ l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr)
        L2BLK_SET_TYPE((le)->le_prop, hdr->b_type);
        L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr)));
        L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr)));
-       L2BLK_SET_STATE((le)->le_prop, hdr->b_l1hdr.b_state->arcs_state);
+       L2BLK_SET_STATE((le)->le_prop, hdr->b_l2hdr.b_arcs_state);
 
        dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev,
            HDR_GET_PSIZE(hdr));
index d9fc6cf6af34b0598eb2f464999b1496213892fd..5f3643f573f7c36454bb32cca1d7291a5481c047 100644 (file)
@@ -769,7 +769,7 @@ static void
 dbuf_evict_one(void)
 {
        int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
-       multilist_sublist_t *mls = multilist_sublist_lock(
+       multilist_sublist_t *mls = multilist_sublist_lock_idx(
            &dbuf_caches[DB_DBUF_CACHE].cache, idx);
 
        ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
index f098e1daa44bda5cc9e56b85854cbf784047a2e5..2ba26f68e39811a6cfecd1bcfec6ecbae98e076b 100644 (file)
@@ -1665,7 +1665,7 @@ sync_dnodes_task(void *arg)
        objset_t *os = soa->soa_os;
 
        multilist_sublist_t *ms =
-           multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
+           multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);
 
        dmu_objset_sync_dnodes(ms, soa->soa_tx);
 
@@ -2076,8 +2076,8 @@ userquota_updates_task(void *arg)
        dnode_t *dn;
        userquota_cache_t cache = { { 0 } };
 
-       multilist_sublist_t *list =
-           multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
+       multilist_sublist_t *list = multilist_sublist_lock_idx(
+           &os->os_synced_dnodes, uua->uua_sublist_idx);
 
        ASSERT(multilist_sublist_head(list) == NULL ||
            dmu_objset_userused_enabled(os));
@@ -2159,8 +2159,8 @@ dnode_rele_task(void *arg)
        userquota_updates_arg_t *uua = arg;
        objset_t *os = uua->uua_os;
 
-       multilist_sublist_t *list =
-           multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);
+       multilist_sublist_t *list = multilist_sublist_lock_idx(
+           &os->os_synced_dnodes, uua->uua_sublist_idx);
 
        dnode_t *dn;
        while ((dn = multilist_sublist_head(list)) != NULL) {
index c4aa98ced433b16f5ce66e142f5466942a2e14ff..9e762357b7270775564762ea117e75791cd1f986 100644 (file)
@@ -639,7 +639,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
 {
        multilist_t *ml = &mc->mc_metaslab_txg_list;
        for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
-               multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+               multilist_sublist_t *mls = multilist_sublist_lock_idx(ml, i);
                metaslab_t *msp = multilist_sublist_head(mls);
                multilist_sublist_unlock(mls);
                while (msp != NULL) {
@@ -656,7 +656,7 @@ metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
                                i--;
                                break;
                        }
-                       mls = multilist_sublist_lock(ml, i);
+                       mls = multilist_sublist_lock_idx(ml, i);
                        metaslab_t *next_msp = multilist_sublist_next(mls, msp);
                        multilist_sublist_unlock(mls);
                        if (txg >
@@ -2232,12 +2232,12 @@ metaslab_potentially_evict(metaslab_class_t *mc)
                unsigned int idx = multilist_get_random_index(
                    &mc->mc_metaslab_txg_list);
                multilist_sublist_t *mls =
-                   multilist_sublist_lock(&mc->mc_metaslab_txg_list, idx);
+                   multilist_sublist_lock_idx(&mc->mc_metaslab_txg_list, idx);
                metaslab_t *msp = multilist_sublist_head(mls);
                multilist_sublist_unlock(mls);
                while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 <
                    inuse * size) {
-                       VERIFY3P(mls, ==, multilist_sublist_lock(
+                       VERIFY3P(mls, ==, multilist_sublist_lock_idx(
                            &mc->mc_metaslab_txg_list, idx));
                        ASSERT3U(idx, ==,
                            metaslab_idx_func(&mc->mc_metaslab_txg_list, msp));
index b1cdf1c5c5f4ea7e30093811321a46b4edfc2b2f..3d3ef86e68391f941725e0e17b82b72a2f3d014a 100644 (file)
@@ -277,9 +277,15 @@ multilist_get_random_index(multilist_t *ml)
        return (random_in_range(ml->ml_num_sublists));
 }
 
+void
+multilist_sublist_lock(multilist_sublist_t *mls)
+{
+       mutex_enter(&mls->mls_lock);
+}
+
 /* Lock and return the sublist specified at the given index */
 multilist_sublist_t *
-multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
+multilist_sublist_lock_idx(multilist_t *ml, unsigned int sublist_idx)
 {
        multilist_sublist_t *mls;
 
@@ -294,7 +300,7 @@ multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx)
 multilist_sublist_t *
 multilist_sublist_lock_obj(multilist_t *ml, void *obj)
 {
-       return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj)));
+       return (multilist_sublist_lock_idx(ml, ml->ml_index_func(ml, obj)));
 }
 
 void
@@ -327,6 +333,22 @@ multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj)
        list_insert_tail(&mls->mls_list, obj);
 }
 
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_after(multilist_sublist_t *mls, void *prev, void *obj)
+{
+       ASSERT(MUTEX_HELD(&mls->mls_lock));
+       list_insert_after(&mls->mls_list, prev, obj);
+}
+
+/* please see comment above multilist_sublist_insert_head */
+void
+multilist_sublist_insert_before(multilist_sublist_t *mls, void *next, void *obj)
+{
+       ASSERT(MUTEX_HELD(&mls->mls_lock));
+       list_insert_before(&mls->mls_list, next, obj);
+}
+
 /*
  * Move the object one element forward in the list.
  *