]> git.proxmox.com Git - mirror_ubuntu-kernels.git/commitdiff
bcachefs: Be more careful about JOURNAL_RES_GET_RESERVED
author Kent Overstreet <kent.overstreet@gmail.com>
Sat, 3 Apr 2021 20:24:13 +0000 (16:24 -0400)
committer Kent Overstreet <kent.overstreet@linux.dev>
Sun, 22 Oct 2023 21:08:59 +0000 (17:08 -0400)
JOURNAL_RES_GET_RESERVED should only be used for updates that need to be
done to free up space in the journal. In particular, when we're flushing
keys from the key cache, if we're flushing them out of order we
shouldn't be using it, since we're using up our remaining space in the
journal without dropping a pin that will let us make forward progress.

With this patch, BTREE_INSERT_JOURNAL_RECLAIM without
BTREE_INSERT_JOURNAL_RESERVED may return -EAGAIN - we can't wait on
journal reclaim if we're already in journal reclaim.

This means we need to propagate these errors up to journal reclaim,
indicating that flushing a journal pin should be retried in the future.

This is prep work for a patch to change the way journal reclaim works,
separating two cases: flushing key cache keys because the btree key cache
is too dirty, and running journal reclaim because we need space in the
journal.

Signed-off-by: Kent Overstreet <kent.overstreet@gmail.com>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
fs/bcachefs/btree_key_cache.c
fs/bcachefs/btree_update_interior.c
fs/bcachefs/btree_update_leaf.c
fs/bcachefs/journal.c
fs/bcachefs/journal.h
fs/bcachefs/journal_reclaim.c
fs/bcachefs/journal_types.h

index 0858f469f7c28ad32661330642aa76ef1a321e96..74d982c3402a093fa4c162879de7959b33abc525 100644 (file)
@@ -353,6 +353,7 @@ err:
 static int btree_key_cache_flush_pos(struct btree_trans *trans,
                                     struct bkey_cached_key key,
                                     u64 journal_seq,
+                                    unsigned commit_flags,
                                     bool evict)
 {
        struct bch_fs *c = trans->c;
@@ -391,12 +392,17 @@ retry:
                                  BTREE_INSERT_NOUNLOCK|
                                  BTREE_INSERT_NOCHECK_RW|
                                  BTREE_INSERT_NOFAIL|
-                                 BTREE_INSERT_JOURNAL_RESERVED|
-                                 BTREE_INSERT_JOURNAL_RECLAIM);
+                                 (ck->journal.seq == journal_last_seq(j)
+                                  ? BTREE_INSERT_JOURNAL_RESERVED
+                                  : 0)|
+                                 commit_flags);
 err:
        if (ret == -EINTR)
                goto retry;
 
+       if (ret == -EAGAIN)
+               goto out;
+
        if (ret) {
                bch2_fs_fatal_err_on(!bch2_journal_error(j), c,
                        "error flushing key cache: %i", ret);
@@ -439,15 +445,16 @@ out:
        return ret;
 }
 
-static void btree_key_cache_journal_flush(struct journal *j,
-                                         struct journal_entry_pin *pin,
-                                         u64 seq)
+static int btree_key_cache_journal_flush(struct journal *j,
+                                        struct journal_entry_pin *pin,
+                                        u64 seq)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        struct bkey_cached *ck =
                container_of(pin, struct bkey_cached, journal);
        struct bkey_cached_key key;
        struct btree_trans trans;
+       int ret = 0;
 
        int srcu_idx = srcu_read_lock(&c->btree_trans_barrier);
 
@@ -462,10 +469,13 @@ static void btree_key_cache_journal_flush(struct journal *j,
        six_unlock_read(&ck->c.lock);
 
        bch2_trans_init(&trans, c, 0, 0);
-       btree_key_cache_flush_pos(&trans, key, seq, false);
+       ret = btree_key_cache_flush_pos(&trans, key, seq,
+                                 BTREE_INSERT_JOURNAL_RECLAIM, false);
        bch2_trans_exit(&trans);
 unlock:
        srcu_read_unlock(&c->btree_trans_barrier, srcu_idx);
+
+       return ret;
 }
 
 /*
@@ -481,7 +491,7 @@ int bch2_btree_key_cache_flush(struct btree_trans *trans,
        if (!bch2_btree_key_cache_find(c, id, pos))
                return 0;
 
-       return btree_key_cache_flush_pos(trans, key, 0, true);
+       return btree_key_cache_flush_pos(trans, key, 0, 0, true);
 }
 
 bool bch2_btree_insert_key_cached(struct btree_trans *trans,
index 988922699e8bb808720c82e5704d336be417c190..7aba0e9d99c1a3dabd6adaa76a3677dbb48b249a 100644 (file)
@@ -916,10 +916,12 @@ bch2_btree_update_start(struct btree_iter *iter, unsigned level,
        struct closure cl;
        int disk_res_flags = (flags & BTREE_INSERT_NOFAIL)
                ? BCH_DISK_RESERVATION_NOFAIL : 0;
-       int journal_flags = (flags & BTREE_INSERT_JOURNAL_RESERVED)
-               ? JOURNAL_RES_GET_RECLAIM : 0;
+       int journal_flags = 0;
        int ret = 0;
 
+       if (flags & BTREE_INSERT_JOURNAL_RESERVED)
+               journal_flags |= JOURNAL_RES_GET_RESERVED;
+
        closure_init_stack(&cl);
 retry:
        /*
@@ -982,6 +984,9 @@ retry:
 
                bch2_trans_unlock(trans);
 
+               if (flags & BTREE_INSERT_JOURNAL_RECLAIM)
+                       goto err;
+
                ret = bch2_journal_preres_get(&c->journal, &as->journal_preres,
                                BTREE_UPDATE_JOURNAL_RES,
                                journal_flags);
index d3d86aa0ee95ff5a1de5bab7aeb23320d7154d57..ee1c26f2901f675cdef3cb26337be874ab90b5c3 100644 (file)
@@ -134,7 +134,7 @@ fix_iter:
        return true;
 }
 
-static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
+static int __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
                               unsigned i, u64 seq)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
@@ -145,14 +145,15 @@ static void __btree_node_flush(struct journal *j, struct journal_entry_pin *pin,
        bch2_btree_node_write_cond(c, b,
                (btree_current_write(b) == w && w->journal.seq == seq));
        six_unlock_read(&b->c.lock);
+       return 0;
 }
 
-static void btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+static int btree_node_flush0(struct journal *j, struct journal_entry_pin *pin, u64 seq)
 {
        return __btree_node_flush(j, pin, 0, seq);
 }
 
-static void btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
+static int btree_node_flush1(struct journal *j, struct journal_entry_pin *pin, u64 seq)
 {
        return __btree_node_flush(j, pin, 1, seq);
 }
@@ -563,8 +564,8 @@ static inline int do_bch2_trans_commit(struct btree_trans *trans,
        ret = bch2_journal_preres_get(&c->journal,
                        &trans->journal_preres, trans->journal_preres_u64s,
                        JOURNAL_RES_GET_NONBLOCK|
-                       ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM)
-                        ? JOURNAL_RES_GET_RECLAIM : 0));
+                       ((trans->flags & BTREE_INSERT_JOURNAL_RESERVED)
+                        ? JOURNAL_RES_GET_RESERVED : 0));
        if (unlikely(ret == -EAGAIN))
                ret = bch2_trans_journal_preres_get_cold(trans,
                                                trans->journal_preres_u64s);
@@ -721,6 +722,10 @@ int bch2_trans_commit_error(struct btree_trans *trans,
        case BTREE_INSERT_NEED_JOURNAL_RES:
                bch2_trans_unlock(trans);
 
+               if ((trans->flags & BTREE_INSERT_JOURNAL_RECLAIM) &&
+                   !(trans->flags & BTREE_INSERT_JOURNAL_RESERVED))
+                       return -EAGAIN;
+
                ret = bch2_trans_journal_res_get(trans, JOURNAL_RES_GET_CHECK);
                if (ret)
                        return ret;
index edbcbe7fb31f59aeb200fa3b7e86477dbad44cdb..bce056cb6841a4d1fb58dae66703edda41fcc38f 100644 (file)
@@ -11,6 +11,7 @@
 #include "btree_gc.h"
 #include "btree_update.h"
 #include "buckets.h"
+#include "error.h"
 #include "journal.h"
 #include "journal_io.h"
 #include "journal_reclaim.h"
@@ -449,6 +450,27 @@ unlock:
        if (!ret)
                goto retry;
 
+       if ((ret == cur_entry_journal_full ||
+            ret == cur_entry_journal_pin_full) &&
+           !can_discard &&
+           j->reservations.idx == j->reservations.unwritten_idx &&
+           (flags & JOURNAL_RES_GET_RESERVED)) {
+               char *journal_debug_buf = kmalloc(4096, GFP_ATOMIC);
+
+               bch_err(c, "Journal stuck!");
+               if (journal_debug_buf) {
+                       bch2_journal_debug_to_text(&_PBUF(journal_debug_buf, 4096), j);
+                       bch_err(c, "%s", journal_debug_buf);
+
+                       bch2_journal_pins_to_text(&_PBUF(journal_debug_buf, 4096), j);
+                       bch_err(c, "Journal pins:\n%s", journal_debug_buf);
+                       kfree(journal_debug_buf);
+               }
+
+               bch2_fatal_error(c);
+               dump_stack();
+       }
+
        /*
         * Journal is full - can't rely on reclaim from work item due to
         * freezing:
@@ -1169,6 +1191,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
               "last_seq_ondisk:\t%llu\n"
               "flushed_seq_ondisk:\t%llu\n"
               "prereserved:\t\t%u/%u\n"
+              "each entry reserved:\t%u\n"
               "nr flush writes:\t%llu\n"
               "nr noflush writes:\t%llu\n"
               "nr direct reclaim:\t%llu\n"
@@ -1183,6 +1206,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j)
               j->flushed_seq_ondisk,
               j->prereserved.reserved,
               j->prereserved.remaining,
+              j->entry_u64s_reserved,
               j->nr_flush_writes,
               j->nr_noflush_writes,
               j->nr_direct_reclaim,
index 547c735ce3cb635b065d94bfda6500d7423b5599..a0d19fad3bdd29f6ce8a0fef6919a365494eb909 100644 (file)
@@ -308,7 +308,6 @@ int bch2_journal_res_get_slowpath(struct journal *, struct journal_res *,
 #define JOURNAL_RES_GET_NONBLOCK       (1 << 0)
 #define JOURNAL_RES_GET_CHECK          (1 << 1)
 #define JOURNAL_RES_GET_RESERVED       (1 << 2)
-#define JOURNAL_RES_GET_RECLAIM                (1 << 3)
 
 static inline int journal_res_get_fast(struct journal *j,
                                       struct journal_res *res,
@@ -446,7 +445,7 @@ static inline int bch2_journal_preres_get_fast(struct journal *j,
                 * into the reclaim path and deadlock:
                 */
 
-               if (!(flags & JOURNAL_RES_GET_RECLAIM) &&
+               if (!(flags & JOURNAL_RES_GET_RESERVED) &&
                    new.reserved > new.remaining)
                        return 0;
        } while ((v = atomic64_cmpxchg(&j->prereserved.counter,
index 3ef42a47f60dc26b1bab5b7fa86350612883d137..42ed7a3525b117eefc916fb1046a41cb02cd643a 100644 (file)
@@ -239,7 +239,7 @@ void bch2_journal_space_available(struct journal *j)
        u64s_remaining  = (u64) clean << 6;
        u64s_remaining -= (u64) total << 3;
        u64s_remaining = max(0LL, u64s_remaining);
-       u64s_remaining /= 2;
+       u64s_remaining /= 4;
        u64s_remaining = min_t(u64, u64s_remaining, U32_MAX);
 out:
        j->cur_entry_sectors    = !ret ? j->space[journal_space_discarded].next_entry : 0;
@@ -353,6 +353,9 @@ static inline void __journal_pin_drop(struct journal *j,
        if (!journal_pin_active(pin))
                return;
 
+       if (j->flush_in_progress == pin)
+               j->flush_in_progress_dropped = true;
+
        pin_list = journal_seq_pin(j, pin->seq);
        pin->seq = 0;
        list_del_init(&pin->list);
@@ -439,34 +442,27 @@ journal_get_next_pin(struct journal *j, u64 max_seq, u64 *seq)
        struct journal_entry_pin_list *pin_list;
        struct journal_entry_pin *ret = NULL;
 
-       if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
-               return NULL;
-
-       spin_lock(&j->lock);
-
        fifo_for_each_entry_ptr(pin_list, &j->pin, *seq)
                if (*seq > max_seq ||
                    (ret = list_first_entry_or_null(&pin_list->list,
                                struct journal_entry_pin, list)))
                        break;
 
-       if (ret) {
-               list_move(&ret->list, &pin_list->flushed);
-               BUG_ON(j->flush_in_progress);
-               j->flush_in_progress = ret;
-       }
-
-       spin_unlock(&j->lock);
-
        return ret;
 }
 
 /* returns true if we did work */
-static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush,
-                             unsigned min_nr)
+static size_t journal_flush_pins(struct journal *j, u64 seq_to_flush,
+                                unsigned min_nr)
 {
        struct journal_entry_pin *pin;
-       u64 seq, ret = 0;
+       size_t nr_flushed = 0;
+       journal_pin_flush_fn flush_fn;
+       u64 seq;
+       int err;
+
+       if (!test_bit(JOURNAL_RECLAIM_STARTED, &j->flags))
+               return 0;
 
        lockdep_assert_held(&j->reclaim_lock);
 
@@ -475,23 +471,42 @@ static u64 journal_flush_pins(struct journal *j, u64 seq_to_flush,
 
                j->last_flushed = jiffies;
 
+               spin_lock(&j->lock);
                pin = journal_get_next_pin(j, min_nr
                                ? U64_MAX : seq_to_flush, &seq);
+               if (pin) {
+                       BUG_ON(j->flush_in_progress);
+                       j->flush_in_progress = pin;
+                       j->flush_in_progress_dropped = false;
+                       flush_fn = pin->flush;
+               }
+               spin_unlock(&j->lock);
+
                if (!pin)
                        break;
 
                if (min_nr)
                        min_nr--;
 
-               pin->flush(j, pin, seq);
+               err = flush_fn(j, pin, seq);
 
-               BUG_ON(j->flush_in_progress != pin);
+               spin_lock(&j->lock);
+               /* Pin might have been dropped or rearmed: */
+               if (likely(!err && !j->flush_in_progress_dropped))
+                       list_move(&pin->list, &journal_seq_pin(j, seq)->flushed);
                j->flush_in_progress = NULL;
+               j->flush_in_progress_dropped = false;
+               spin_unlock(&j->lock);
+
                wake_up(&j->pin_flush_wait);
-               ret++;
+
+               if (err)
+                       break;
+
+               nr_flushed++;
        }
 
-       return ret;
+       return nr_flushed;
 }
 
 static u64 journal_seq_to_flush(struct journal *j)
@@ -556,8 +571,8 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct)
 {
        struct bch_fs *c = container_of(j, struct bch_fs, journal);
        bool kthread = (current->flags & PF_KTHREAD) != 0;
-       u64 seq_to_flush, nr_flushed = 0;
-       size_t min_nr;
+       u64 seq_to_flush;
+       size_t min_nr, nr_flushed;
        unsigned flags;
        int ret = 0;
 
index 3db8c3760ccaafc5db5a54be9047443919ca6888..ec3c604cdf224c2d8a1327bdbb2c79dff5ebd882 100644 (file)
@@ -50,7 +50,7 @@ struct journal_entry_pin_list {
 
 struct journal;
 struct journal_entry_pin;
-typedef void (*journal_pin_flush_fn)(struct journal *j,
+typedef int (*journal_pin_flush_fn)(struct journal *j,
                                struct journal_entry_pin *, u64);
 
 struct journal_entry_pin {
@@ -251,6 +251,7 @@ struct journal {
 
        unsigned long           last_flushed;
        struct journal_entry_pin *flush_in_progress;
+       bool                    flush_in_progress_dropped;
        wait_queue_head_t       pin_flush_wait;
 
        /* protects advancing ja->discard_idx: */