erofs: introduce multipage per-CPU buffers
author Gao Xiang <hsiangkao@redhat.com>
Fri, 9 Apr 2021 19:06:30 +0000 (03:06 +0800)
committer Gao Xiang <hsiangkao@redhat.com>
Fri, 9 Apr 2021 19:19:59 +0000 (03:19 +0800)
To deal with cases in which inplace decompression is infeasible for some
inplace I/O, per-CPU buffers were introduced to get rid of page allocation
latency and thrashing for low-latency decompression algorithms such as lz4.

For the big pcluster feature, introduce multipage per-CPU buffers to keep
such inplace I/O pclusters temporarily as well, but note that these per-CPU
pages are only virtually consecutive.

When a new big pcluster filesystem is mounted, its maximum pclustersize is
read and the per-CPU buffers are grown if needed. Shrinking adjustable
per-CPU buffers is more complex (since we don't know whether such a size is
still in use), so currently they are just released in full when the module
is unloaded.

Link: https://lore.kernel.org/r/20210409190630.19569-1-xiang@kernel.org
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Gao Xiang <hsiangkao@redhat.com>
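
For context, users of this API follow a get -> use -> put pattern: taking the
buffer now acquires the per-CPU pcb->lock (previously it only disabled
preemption), so the section in between must stay atomic. A minimal sketch of
the pattern, where everything apart from the erofs_get_pcpubuf() /
erofs_put_pcpubuf() calls is illustrative rather than part of this patch:

    /* illustrative caller: stage compressed pages through the pcpubuf */
    static int copy_through_pcpubuf(struct page **in, unsigned int nrpages)
    {
            u8 *dst = erofs_get_pcpubuf(nrpages);   /* takes pcb->lock */
            unsigned int i;

            if (!dst)               /* buffer has fewer than nrpages pages */
                    return -ENOSPC; /* illustrative error code */

            for (i = 0; i < nrpages; ++i) {
                    void *src = kmap_atomic(in[i]);

                    memcpy(dst + i * PAGE_SIZE, src, PAGE_SIZE);
                    kunmap_atomic(src);
            }
            /* ... decompress from dst while the buffer is still held ... */
            erofs_put_pcpubuf(dst); /* releases pcb->lock */
            return 0;
    }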
fs/erofs/Makefile
fs/erofs/decompressor.c
fs/erofs/internal.h
fs/erofs/pcpubuf.c [new file with mode: 0644]
fs/erofs/super.c
fs/erofs/utils.c

diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index af159539fc1b221b15d93987aef45c593b80791b..1f9aced49070ac77c2a1907adc5c51bea898c6ee 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0-only
 
 obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o
 erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
 erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 27aa6a99b3716dfc8a6cac49b63adb414d4b8f87..fb4838c0f0df9586b4bb4443fdbca90bda811a45 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -47,7 +47,9 @@ int z_erofs_load_lz4_config(struct super_block *sb,
        EROFS_SB(sb)->lz4.max_distance_pages = distance ?
                                        DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
                                        LZ4_MAX_DISTANCE_PAGES;
-       return 0;
+
+       /* TODO: use max pclusterblks after bigpcluster is enabled */
+       return erofs_pcpubuf_growsize(1);
 }
 
 static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
@@ -114,7 +116,7 @@ static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
         * pages should be copied in order to avoid being overlapped.
         */
        struct page **in = rq->in;
-       u8 *const tmp = erofs_get_pcpubuf(0);
+       u8 *const tmp = erofs_get_pcpubuf(1);
        u8 *tmpp = tmp;
        unsigned int inlen = rq->inputsize - pageofs_in;
        unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in);
@@ -271,7 +273,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
         * compressed data is preferred.
         */
        if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
-               dst = erofs_get_pcpubuf(0);
+               dst = erofs_get_pcpubuf(1);
                if (IS_ERR(dst))
                        return PTR_ERR(dst);
 
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index 05b02f99324c1a5f1531a334b31bb71036fabac3..4db08541330447388b48a5ecb8c7589c08be619b 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -197,9 +197,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
 
 /* hard limit of pages per compressed cluster */
 #define Z_EROFS_CLUSTER_MAX_PAGES       (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
-#define EROFS_PCPUBUF_NR_PAGES          Z_EROFS_CLUSTER_MAX_PAGES
-#else
-#define EROFS_PCPUBUF_NR_PAGES          0
 #endif /* !CONFIG_EROFS_FS_ZIP */
 
 /* we strictly follow PAGE_SIZE and no buffer head yet */
@@ -405,24 +402,16 @@ int erofs_namei(struct inode *dir, struct qstr *name,
 /* dir.c */
 extern const struct file_operations erofs_dir_fops;
 
+/* pcpubuf.c */
+void *erofs_get_pcpubuf(unsigned int requiredpages);
+void erofs_put_pcpubuf(void *ptr);
+int erofs_pcpubuf_growsize(unsigned int nrpages);
+void erofs_pcpubuf_init(void);
+void erofs_pcpubuf_exit(void);
+
 /* utils.c / zdata.c */
 struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
 
-#if (EROFS_PCPUBUF_NR_PAGES > 0)
-void *erofs_get_pcpubuf(unsigned int pagenr);
-#define erofs_put_pcpubuf(buf) do { \
-       (void)&(buf);   \
-       preempt_enable();       \
-} while (0)
-#else
-static inline void *erofs_get_pcpubuf(unsigned int pagenr)
-{
-       return ERR_PTR(-EOPNOTSUPP);
-}
-
-#define erofs_put_pcpubuf(buf) do {} while (0)
-#endif
-
 #ifdef CONFIG_EROFS_FS_ZIP
 int erofs_workgroup_put(struct erofs_workgroup *grp);
 struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
new file mode 100644
index 0000000..6c88557
--- /dev/null
+++ b/fs/erofs/pcpubuf.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) Gao Xiang <xiang@kernel.org>
+ *
+ * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
+ * per-CPU virtual memory (in pages) in advance to store such inplace I/O
+ * data if inplace decompression is failed (due to unmet inplace margin for
+ * example).
+ */
+#include "internal.h"
+
+struct erofs_pcpubuf {
+       raw_spinlock_t lock;
+       void *ptr;
+       struct page **pages;
+       unsigned int nrpages;
+};
+
+static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);
+
+void *erofs_get_pcpubuf(unsigned int requiredpages)
+       __acquires(pcb->lock)
+{
+       struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);
+
+       raw_spin_lock(&pcb->lock);
+       /* check if the per-CPU buffer is too small */
+       if (requiredpages > pcb->nrpages) {
+               raw_spin_unlock(&pcb->lock);
+               put_cpu_var(erofs_pcb);
+               /* (for sparse checker) pretend pcb->lock is still taken */
+               __acquire(pcb->lock);
+               return NULL;
+       }
+       return pcb->ptr;
+}
+
+void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
+{
+       struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());
+
+       DBG_BUGON(pcb->ptr != ptr);
+       raw_spin_unlock(&pcb->lock);
+       put_cpu_var(erofs_pcb);
+}
+
+/* the next step: support per-CPU page buffers hotplug */
+int erofs_pcpubuf_growsize(unsigned int nrpages)
+{
+       static DEFINE_MUTEX(pcb_resize_mutex);
+       static unsigned int pcb_nrpages;
+       LIST_HEAD(pagepool);
+       int delta, cpu, ret, i;
+
+       mutex_lock(&pcb_resize_mutex);
+       delta = nrpages - pcb_nrpages;
+       ret = 0;
+       /* avoid shrinking pcpubuf, since no idea how many fses rely on */
+       if (delta <= 0)
+               goto out;
+
+       for_each_possible_cpu(cpu) {
+               struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+               struct page **pages, **oldpages;
+               void *ptr, *old_ptr;
+
+               pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
+               if (!pages) {
+                       ret = -ENOMEM;
+                       break;
+               }
+
+               for (i = 0; i < nrpages; ++i) {
+                       pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
+                       if (!pages[i]) {
+                               ret = -ENOMEM;
+                               oldpages = pages;
+                               goto free_pagearray;
+                       }
+               }
+               ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
+               if (!ptr) {
+                       ret = -ENOMEM;
+                       oldpages = pages;
+                       goto free_pagearray;
+               }
+               raw_spin_lock(&pcb->lock);
+               old_ptr = pcb->ptr;
+               pcb->ptr = ptr;
+               oldpages = pcb->pages;
+               pcb->pages = pages;
+               i = pcb->nrpages;
+               pcb->nrpages = nrpages;
+               raw_spin_unlock(&pcb->lock);
+
+               if (!oldpages) {
+                       DBG_BUGON(old_ptr);
+                       continue;
+               }
+
+               if (old_ptr)
+                       vunmap(old_ptr);
+free_pagearray:
+               while (i)
+                       list_add(&oldpages[--i]->lru, &pagepool);
+               kfree(oldpages);
+               if (ret)
+                       break;
+       }
+       pcb_nrpages = nrpages;
+       put_pages_list(&pagepool);
+out:
+       mutex_unlock(&pcb_resize_mutex);
+       return ret;
+}
+
+void erofs_pcpubuf_init(void)
+{
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+
+               raw_spin_lock_init(&pcb->lock);
+       }
+}
+
+void erofs_pcpubuf_exit(void)
+{
+       int cpu, i;
+
+       for_each_possible_cpu(cpu) {
+               struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+
+               if (pcb->ptr) {
+                       vunmap(pcb->ptr);
+                       pcb->ptr = NULL;
+               }
+               if (!pcb->pages)
+                       continue;
+
+               for (i = 0; i < pcb->nrpages; ++i)
+                       if (pcb->pages[i])
+                               put_page(pcb->pages[i]);
+               kfree(pcb->pages);
+               pcb->pages = NULL;
+       }
+}
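
As the TODO in decompressor.c above notes, this patch still grows the buffers
to a fixed single page; once big pcluster support lands, the size would be
derived from the mounted filesystem's maximum pclustersize instead. A hedged
sketch of what such a caller could look like (the max_pclusterblks parameter
is illustrative, not something this patch adds):

    /* hypothetical follow-up: size the buffers from the mounted fs */
    static int example_grow_for_bigpcluster(unsigned int max_pclusterblks)
    {
            /* pages needed to hold one maximal pcluster */
            unsigned int nrpages = DIV_ROUND_UP(max_pclusterblks * EROFS_BLKSIZ,
                                                PAGE_SIZE);

            return erofs_pcpubuf_growsize(nrpages);
    }

Since erofs_pcpubuf_growsize() never shrinks, repeated mounts simply leave
every CPU's buffer at the largest size requested so far.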
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index b641658e772ff249c1cad786ce6c06f1793f7d80..bbf3bbd908e0816e4d4d894a69431c58f0c95b3f 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -655,6 +655,7 @@ static int __init erofs_module_init(void)
        if (err)
                goto shrinker_err;
 
+       erofs_pcpubuf_init();
        err = z_erofs_init_zip_subsystem();
        if (err)
                goto zip_err;
@@ -684,6 +685,7 @@ static void __exit erofs_module_exit(void)
        /* Ensure all RCU free inodes are safe before cache is destroyed. */
        rcu_barrier();
        kmem_cache_destroy(erofs_inode_cachep);
+       erofs_pcpubuf_exit();
 }
 
 /* get filesystem statistics */
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index de9986d2f82fd5f46b16c90e14fbb4024569e9fa..6758c5b19f7cf8f05dec69b9e2e05edd453df535 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -21,18 +21,6 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
        return page;
 }
 
-#if (EROFS_PCPUBUF_NR_PAGES > 0)
-static struct {
-       u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES];
-} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS];
-
-void *erofs_get_pcpubuf(unsigned int pagenr)
-{
-       preempt_disable();
-       return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE];
-}
-#endif
-
 #ifdef CONFIG_EROFS_FS_ZIP
 /* global shrink count (for all mounted EROFS instances) */
 static atomic_long_t erofs_global_shrink_cnt;