// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2018 HUAWEI, Inc.
 *             https://www.huawei.com/
 * Created by Gao Xiang <gaoxiang25@huawei.com>
 */
#include <linux/prefetch.h>

#include <trace/events/erofs.h>
/*
 * Since pclustersize is variable for the big pcluster feature, introduce
 * slab pools for the different pcluster sizes.
 */
struct z_erofs_pcluster_slab {
	struct kmem_cache *slab;
	unsigned int maxpages;
	char name[48];
};
#define _PCLP(n) { .maxpages = n }

static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
	_PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
	_PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
};
static void z_erofs_destroy_pcluster_pool(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		if (!pcluster_pool[i].slab)
			continue;
		kmem_cache_destroy(pcluster_pool[i].slab);
		pcluster_pool[i].slab = NULL;
	}
}
static int z_erofs_create_pcluster_pool(void)
{
	struct z_erofs_pcluster_slab *pcs;
	struct z_erofs_pcluster *a;
	unsigned int size;

	for (pcs = pcluster_pool;
	     pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
		size = struct_size(a, compressed_pages, pcs->maxpages);

		sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
		pcs->slab = kmem_cache_create(pcs->name, size, 0,
					      SLAB_RECLAIM_ACCOUNT, NULL);
		if (pcs->slab)
			continue;

		z_erofs_destroy_pcluster_pool();
		return -ENOMEM;
	}
	return 0;
}
static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
		struct z_erofs_pcluster *pcl;

		if (nrpages > pcs->maxpages)
			continue;

		pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
		if (!pcl)
			return ERR_PTR(-ENOMEM);
		pcl->pclusterpages = nrpages;
		return pcl;
	}
	return ERR_PTR(-EINVAL);
}
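
/*
 * Illustrative usage sketch (not part of this file): a 3-page compressed
 * extent is served by the first pool entry with maxpages >= 3, i.e. the
 * "erofs_pcluster-4" slab, trading at most one unused compressed_pages[]
 * slot for a small, fixed set of caches:
 *
 *	struct z_erofs_pcluster *pcl = z_erofs_alloc_pcluster(3);
 *
 *	// on success: pcl->pclusterpages == 3, object backed by _PCLP(4)
 *	z_erofs_free_pcluster(pcl);	// returns it to the same slab
 */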
static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
		struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;

		if (pcl->pclusterpages > pcs->maxpages)
			continue;

		kmem_cache_free(pcs->slab, pcl);
		return;
	}
	DBG_BUGON(1);
}
/*
 * a compressed_pages[] placeholder in order to avoid
 * being filled with file pages for in-place decompression.
 */
#define PAGE_UNALLOCATED	((void *)0x5F0E4B1D)
/* how to allocate cached pages for a pcluster */
enum z_erofs_cache_alloctype {
	DONTALLOC,	/* don't allocate any cached pages */
	DELAYEDALLOC,	/* delayed allocation (at the time of submitting io) */
	/*
	 * try to use cached I/O if page allocation succeeds or fall back
	 * to in-place I/O instead to avoid any direct reclaim.
	 */
	TRYALLOC,
};
/*
 * tagged pointer with 1-bit tag for all compressed pages
 * tag 1 - the page is just found with an extra page reference
 */
typedef tagptr1_t compressed_page_t;

#define tag_compressed_page_justfound(page) \
	tagptr_fold(compressed_page_t, page, 1)
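
/*
 * Illustrative sketch: tagptr1_t reuses the lowest pointer bit as the tag,
 * which is safe here since struct page pointers are at least 4-byte
 * aligned, so folding and unfolding round-trip losslessly:
 *
 *	compressed_page_t t = tag_compressed_page_justfound(page);
 *
 *	tagptr_unfold_ptr(t);	// == page
 *	tagptr_unfold_tags(t);	// == 1 (justfound)
 */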
static struct workqueue_struct *z_erofs_workqueue __read_mostly;

void z_erofs_exit_zip_subsystem(void)
{
	destroy_workqueue(z_erofs_workqueue);
	z_erofs_destroy_pcluster_pool();
}
static inline int z_erofs_init_workqueue(void)
{
	const unsigned int onlinecpus = num_possible_cpus();

	/*
	 * no need to spawn too many threads; limiting the thread count
	 * minimizes scheduling overhead. Perhaps per-CPU threads would
	 * be better?
	 */
	z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
					    WQ_UNBOUND | WQ_HIGHPRI,
					    onlinecpus + onlinecpus / 4);
	return z_erofs_workqueue ? 0 : -ENOMEM;
}
int __init z_erofs_init_zip_subsystem(void)
{
	int err = z_erofs_create_pcluster_pool();

	if (err)
		return err;
	err = z_erofs_init_workqueue();
	if (err)
		z_erofs_destroy_pcluster_pool();
	return err;
}
enum z_erofs_collectmode {
	COLLECT_SECONDARY,
	COLLECT_PRIMARY,
	/*
	 * The current collection was the tail of an existing chain, and the
	 * previously processed chained collections have all been decided to
	 * be hooked up to it.
	 * A new chain will be created for the remaining collections which are
	 * not processed yet; therefore, different from COLLECT_PRIMARY_FOLLOWED,
	 * the next collection cannot reuse the whole page safely in
	 * the following scenario:
	 *  ________________________________________________________________
	 * |      tail (partial) page     |       head (partial) page       |
	 * |   (belongs to the next cl)   |   (belongs to the current cl)   |
	 * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
	 */
	COLLECT_PRIMARY_HOOKED,
	/*
	 * a weak form of COLLECT_PRIMARY_FOLLOWED; the difference is that it
	 * could be dispatched into the bypass queue later due to up-to-date
	 * managed pages. All related online pages cannot be reused for
	 * in-place I/O (or pagevec) since the pcluster can be decoded directly
	 * without I/O submission.
	 */
	COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
	/*
	 * The current collection has been linked with the owned chain, and
	 * could also be linked with the remaining collections, which means
	 * that if the processing page is the tail page of a collection, the
	 * current collection can safely use the whole page (since the
	 * previous collection is under control) for in-place I/O, as
	 * illustrated below:
	 *  ________________________________________________________________
	 * |  tail (partial) page |          head (partial) page           |
	 * |  (of the current cl) |      (of the previous collection)      |
	 * |  PRIMARY_FOLLOWED or |                                        |
	 * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________|
	 *
	 * [  (*) the above page can be used as inplace I/O.               ]
	 */
	COLLECT_PRIMARY_FOLLOWED,
};
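
/*
 * Note that the declaration order above matters: later code relies on
 * ordered comparisons such as "clt->mode >= COLLECT_PRIMARY" (in-place
 * I/O is permitted) and "clt->mode < COLLECT_PRIMARY_FOLLOWED" (the
 * collector doesn't own the pcluster, so don't preload cached pages).
 */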
struct z_erofs_collector {
	struct z_erofs_pagevec_ctor vector;

	struct z_erofs_pcluster *pcl, *tailpcl;
	struct z_erofs_collection *cl;
	/* a pointer used to pick up inplace I/O pages */
	struct page **icpage_ptr;
	z_erofs_next_pcluster_t owned_head;

	enum z_erofs_collectmode mode;
};
struct z_erofs_decompress_frontend {
	struct inode *const inode;

	struct z_erofs_collector clt;
	struct erofs_map_blocks map;

	bool readahead;
	/* used for applying cache strategy on the fly */
	bool backmost;
	erofs_off_t headoffset;
};

#define COLLECTOR_INIT() { \
	.owned_head = Z_EROFS_PCLUSTER_TAIL, \
	.mode = COLLECT_PRIMARY_FOLLOWED }

#define DECOMPRESS_FRONTEND_INIT(__i) { \
	.inode = __i, .clt = COLLECTOR_INIT(), \
	.backmost = true, }
static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
static DEFINE_MUTEX(z_pagemap_global_lock);
static void preload_compressed_pages(struct z_erofs_collector *clt,
				     struct address_space *mc,
				     enum z_erofs_cache_alloctype type,
				     struct list_head *pagepool)
{
	struct z_erofs_pcluster *pcl = clt->pcl;
	bool standalone = true;
	gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
			__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
	struct page **pages;
	pgoff_t index;

	if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
		return;

	pages = pcl->compressed_pages;
	index = pcl->obj.index;
	for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) {
		struct page *page;
		compressed_page_t t;
		struct page *newpage = NULL;

		/* the compressed page was loaded before */
		if (READ_ONCE(*pages))
			continue;

		page = find_get_page(mc, index);

		if (page) {
			t = tag_compressed_page_justfound(page);
		} else {
			/* I/O is needed, it's not possible to decompress directly */
			standalone = false;
			switch (type) {
			case DELAYEDALLOC:
				t = tagptr_init(compressed_page_t,
						PAGE_UNALLOCATED);
				break;
			case TRYALLOC:
				newpage = erofs_allocpage(pagepool, gfp);
				if (!newpage)
					continue;
				set_page_private(newpage,
						 Z_EROFS_PREALLOCATED_PAGE);
				t = tag_compressed_page_justfound(newpage);
				break;
			default:	/* DONTALLOC */
				continue;
			}
		}

		if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
			continue;

		if (page) {
			put_page(page);
		} else if (newpage) {
			set_page_private(newpage, 0);
			list_add(&newpage->lru, pagepool);
		}
	}

	/*
	 * don't do in-place I/O if all compressed pages are available in
	 * the managed cache since everything can be moved to the bypass
	 * queue instead.
	 */
	if (standalone)
		clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
}
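
/*
 * Note on the gfp mask above: cached compressed pages are purely an
 * optimization, so they are allocated without direct reclaim, retries
 * or warnings; if allocation fails, the slot is simply skipped and the
 * pcluster can still fall back to in-place I/O.
 */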
/* called by erofs_shrinker to get rid of all compressed_pages */
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
				       struct erofs_workgroup *grp)
{
	struct z_erofs_pcluster *const pcl =
		container_of(grp, struct z_erofs_pcluster, obj);
	struct address_space *const mapping = MNGD_MAPPING(sbi);
	int i;

	/*
	 * refcount of the workgroup is now frozen as 1,
	 * therefore no need to worry about available decompression users.
	 */
	for (i = 0; i < pcl->pclusterpages; ++i) {
		struct page *page = pcl->compressed_pages[i];

		if (!page)
			continue;

		/* block other users from reclaiming or migrating the page */
		if (!trylock_page(page))
			continue;

		if (page->mapping != mapping)
			continue;

		/* barrier is implied in the following 'unlock_page' */
		WRITE_ONCE(pcl->compressed_pages[i], NULL);
		detach_page_private(page);
		unlock_page(page);
	}
	return 0;
}
int erofs_try_to_free_cached_page(struct address_space *mapping,
				  struct page *page)
{
	struct z_erofs_pcluster *const pcl = (void *)page_private(page);
	int ret = 0;	/* 0 - busy */

	if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
		unsigned int i;

		for (i = 0; i < pcl->pclusterpages; ++i) {
			if (pcl->compressed_pages[i] == page) {
				WRITE_ONCE(pcl->compressed_pages[i], NULL);
				ret = 1;
				break;
			}
		}
		erofs_workgroup_unfreeze(&pcl->obj, 1);

		if (ret)
			detach_page_private(page);
	}
	return ret;
}
/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
				   struct page *page)
{
	struct z_erofs_pcluster *const pcl = clt->pcl;

	while (clt->icpage_ptr > pcl->compressed_pages)
		if (!cmpxchg(--clt->icpage_ptr, NULL, page))
			return true;
	return false;
}
/* callers must hold the collection lock */
static int z_erofs_attach_page(struct z_erofs_collector *clt,
			       struct page *page,
			       enum z_erofs_page_type type)
{
	int ret;
	bool occupied;

	/* give priority to in-place I/O */
	if (clt->mode >= COLLECT_PRIMARY &&
	    type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
	    z_erofs_try_inplace_io(clt, page))
		return 0;

	ret = z_erofs_pagevec_enqueue(&clt->vector,
				      page, type, &occupied);
	clt->cl->vcnt += (unsigned int)ret;

	return ret ? 0 : -EAGAIN;
}
static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt)
{
	struct z_erofs_pcluster *pcl = clt->pcl;
	z_erofs_next_pcluster_t *owned_head = &clt->owned_head;

	/* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
	if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
		    *owned_head) == Z_EROFS_PCLUSTER_NIL) {
		*owned_head = &pcl->next;
		/* so we can attach this pcluster to our submission chain. */
		clt->mode = COLLECT_PRIMARY_FOLLOWED;
		return;
	}

	/*
	 * type 2, link to the end of an existing open chain, be careful
	 * that its submission is controlled by the original attached chain.
	 */
	if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
		    *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
		*owned_head = Z_EROFS_PCLUSTER_TAIL;
		clt->mode = COLLECT_PRIMARY_HOOKED;
		clt->tailpcl = NULL;
		return;
	}
	/* type 3, it belongs to a chain, but it isn't the end of the chain */
	clt->mode = COLLECT_PRIMARY;
}
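
/*
 * Illustrative walkthrough (assuming the three cases above): when the
 * collector meets pcluster B while owning a chain ending at TAIL:
 *
 *	B->next == NIL:   B joins the front of our chain and stays fully
 *			  under our submission control (type 1);
 *	B->next == TAIL:  B was the open end of another chain, so our
 *			  chain is hooked behind it and submission is
 *			  driven by that chain's owner (type 2);
 *	otherwise:        B sits in the middle of someone else's chain
 *			  and only COLLECT_PRIMARY access is possible
 *			  (type 3).
 */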
static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
				     struct inode *inode,
				     struct erofs_map_blocks *map)
{
	struct z_erofs_pcluster *pcl = clt->pcl;
	struct z_erofs_collection *cl;
	unsigned int length;

	/* to avoid unexpected loop formed by corrupted images */
	if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	cl = z_erofs_primarycollection(pcl);
	if (cl->pageofs != (map->m_la & ~PAGE_MASK)) {
		DBG_BUGON(1);
		return -EFSCORRUPTED;
	}

	length = READ_ONCE(pcl->length);
	if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
		if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
			DBG_BUGON(1);
			return -EFSCORRUPTED;
		}
	} else {
		unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;

		if (map->m_flags & EROFS_MAP_FULL_MAPPED)
			llen |= Z_EROFS_PCLUSTER_FULL_LENGTH;

		while (llen > length &&
		       length != cmpxchg_relaxed(&pcl->length, length, llen)) {
			cpu_relax();
			length = READ_ONCE(pcl->length);
		}
	}
	mutex_lock(&cl->lock);
	/* used to check tail merging loop due to corrupted images */
	if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
		clt->tailpcl = pcl;

	z_erofs_try_to_claim_pcluster(clt);
	clt->cl = cl;
	return 0;
}
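
/*
 * Illustrative sketch of the pcl->length encoding assumed above: the
 * decompressed byte count lives in the high bits and the low bit(s)
 * carry the "full length known" flag:
 *
 *	llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
 *	full = pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH;
 *
 * The lock-free cmpxchg loop only ever grows the stored value, so
 * concurrent lookups converge on the largest mapped logical length.
 */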
static int z_erofs_register_collection(struct z_erofs_collector *clt,
				       struct inode *inode,
				       struct erofs_map_blocks *map)
{
	struct z_erofs_pcluster *pcl;
	struct z_erofs_collection *cl;
	struct erofs_workgroup *grp;
	int err;

	/* no available pcluster, let's allocate one */
	pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT);
	if (IS_ERR(pcl))
		return PTR_ERR(pcl);

	atomic_set(&pcl->obj.refcount, 1);
	pcl->obj.index = map->m_pa >> PAGE_SHIFT;

	pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
		(map->m_flags & EROFS_MAP_FULL_MAPPED ?
			Z_EROFS_PCLUSTER_FULL_LENGTH : 0);

	if (map->m_flags & EROFS_MAP_ZIPPED)
		pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
	else
		pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;

	/* new pclusters should be claimed as type 1, primary and followed */
	pcl->next = clt->owned_head;
	clt->mode = COLLECT_PRIMARY_FOLLOWED;

	cl = z_erofs_primarycollection(pcl);
	cl->pageofs = map->m_la & ~PAGE_MASK;

	/*
	 * lock all primary followed works before visible to others
	 * and mutex_trylock *never* fails for a new pcluster.
	 */
	mutex_init(&cl->lock);
	DBG_BUGON(!mutex_trylock(&cl->lock));

	grp = erofs_insert_workgroup(inode->i_sb, &pcl->obj);
	if (IS_ERR(grp)) {
		err = PTR_ERR(grp);
		goto err_out;
	}

	if (grp != &pcl->obj) {
		clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
		err = -EEXIST;
		goto err_out;
	}
	/* used to check tail merging loop due to corrupted images */
	if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
		clt->tailpcl = pcl;
	clt->owned_head = &pcl->next;
	clt->pcl = pcl;
	clt->cl = cl;
	return 0;

err_out:
	mutex_unlock(&cl->lock);
	z_erofs_free_pcluster(pcl);
	return err;
}
static int z_erofs_collector_begin(struct z_erofs_collector *clt,
				   struct inode *inode,
				   struct erofs_map_blocks *map)
{
	struct erofs_workgroup *grp;
	int ret;

	DBG_BUGON(clt->cl);

	/* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */
	DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL);
	DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);

	if (!PAGE_ALIGNED(map->m_pa)) {
		DBG_BUGON(1);
		return -EINVAL;
	}

	grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
	if (grp) {
		clt->pcl = container_of(grp, struct z_erofs_pcluster, obj);
	} else {
		ret = z_erofs_register_collection(clt, inode, map);

		if (!ret)
			goto out;
		if (ret != -EEXIST)
			return ret;
	}

	ret = z_erofs_lookup_collection(clt, inode, map);
	if (ret) {
		erofs_workgroup_put(&clt->pcl->obj);
		return ret;
	}

out:
	z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
				  clt->cl->pagevec, clt->cl->vcnt);

	/* since file-backed online pages are traversed in reverse order */
	clt->icpage_ptr = clt->pcl->compressed_pages + clt->pcl->pclusterpages;
	return 0;
}
/*
 * keep in mind that no referenced pclusters will be freed;
 * they can only be released after an RCU grace period.
 */
static void z_erofs_rcu_callback(struct rcu_head *head)
{
	struct z_erofs_collection *const cl =
		container_of(head, struct z_erofs_collection, rcu);

	z_erofs_free_pcluster(container_of(cl, struct z_erofs_pcluster,
					   primary_collection));
}
void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
{
	struct z_erofs_pcluster *const pcl =
		container_of(grp, struct z_erofs_pcluster, obj);
	struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl);

	call_rcu(&cl->rcu, z_erofs_rcu_callback);
}
static void z_erofs_collection_put(struct z_erofs_collection *cl)
{
	struct z_erofs_pcluster *const pcl =
		container_of(cl, struct z_erofs_pcluster, primary_collection);

	erofs_workgroup_put(&pcl->obj);
}
static bool z_erofs_collector_end(struct z_erofs_collector *clt)
{
	struct z_erofs_collection *cl = clt->cl;

	if (!cl)
		return false;

	z_erofs_pagevec_ctor_exit(&clt->vector, false);
	mutex_unlock(&cl->lock);

	/*
	 * if all pending pages are added, don't hold its reference
	 * any longer if the pcluster isn't hosted by ourselves.
	 */
	if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
		z_erofs_collection_put(cl);

	clt->cl = NULL;
	return true;
}
static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
				       unsigned int cachestrategy,
				       erofs_off_t la)
{
	if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
		return false;

	if (fe->backmost)
		return true;

	return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
		la < fe->headoffset;
}
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
				struct page *page, struct list_head *pagepool)
{
	struct inode *const inode = fe->inode;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
	struct erofs_map_blocks *const map = &fe->map;
	struct z_erofs_collector *const clt = &fe->clt;
	const loff_t offset = page_offset(page);
	bool tight = true;

	enum z_erofs_cache_alloctype cache_strategy;
	enum z_erofs_page_type page_type;
	unsigned int cur, end, spiltted, index;
	int err = 0;

	/* register locked file pages as online pages in pack */
	z_erofs_onlinepage_init(page);

	spiltted = 0;
	end = PAGE_SIZE;
repeat:
	cur = end - 1;

	/* lucky, within the range of the current map_blocks */
	if (offset + cur >= map->m_la &&
	    offset + cur < map->m_la + map->m_llen) {
		/* didn't get a valid collection previously (very rare) */
		if (!clt->cl)
			goto restart_now;
		goto hitted;
	}

	/* go ahead the next map_blocks */
	erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur);

	if (z_erofs_collector_end(clt))
		fe->backmost = false;

	map->m_la = offset + cur;
	map->m_llen = 0;
	err = z_erofs_map_blocks_iter(inode, map, 0);
	if (err)
		goto err_out;

restart_now:
	if (!(map->m_flags & EROFS_MAP_MAPPED))
		goto hitted;

	err = z_erofs_collector_begin(clt, inode, map);
	if (err)
		goto err_out;

	/* preload all compressed pages (maybe downgrade role if necessary) */
	if (should_alloc_managed_pages(fe, sbi->ctx.cache_strategy, map->m_la))
		cache_strategy = TRYALLOC;
	else
		cache_strategy = DONTALLOC;

	preload_compressed_pages(clt, MNGD_MAPPING(sbi),
				 cache_strategy, pagepool);

hitted:
	/*
	 * Ensure the current partial page belongs to this submit chain rather
	 * than other concurrent submit chains or the noio(bypass) chain since
	 * those chains are handled asynchronously, thus the page cannot be
	 * used for inplace I/O or pagevec (should be processed in strict
	 * order.)
	 */
	tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED &&
		  clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);

	cur = end - min_t(unsigned int, offset + end - map->m_la, end);
	if (!(map->m_flags & EROFS_MAP_MAPPED)) {
		zero_user_segment(page, cur, end);
		goto next_part;
	}

	/* let's derive page type */
	page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
		(!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
			(tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
				Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));

	if (cur)
		tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);

retry:
	err = z_erofs_attach_page(clt, page, page_type);
	/* should allocate an additional short-lived page for pagevec */
	if (err == -EAGAIN) {
		struct page *const newpage =
				alloc_page(GFP_NOFS | __GFP_NOFAIL);

		set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE);
		err = z_erofs_attach_page(clt, newpage,
					  Z_EROFS_PAGE_TYPE_EXCLUSIVE);
		if (!err)
			goto retry;
	}

	if (err)
		goto err_out;

	index = page->index - (map->m_la >> PAGE_SHIFT);

	z_erofs_onlinepage_fixup(page, index, true);

	/* bump up the number of spiltted parts of a page */
	++spiltted;
	/* also update nr_pages */
	clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1);
next_part:
	/* can be used for verification */
	map->m_llen = offset + cur - map->m_la;

	end = cur;
	if (end > 0)
		goto repeat;

out:
	z_erofs_onlinepage_endio(page);

	erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu",
		  __func__, page, spiltted, map->m_llen);
	return err;

	/* if some error occurred while processing this page */
err_out:
	SetPageError(page);
	goto out;
}
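
/*
 * Note: a file page may span several pclusters, so the loop above walks
 * it back to front, splitting it into "spiltted" parts; the EXCLUSIVE
 * page type is only derived for a part that owns the whole page (cur is
 * 0 and it's either the first split or the chain is tight), which is
 * exactly what makes in-place I/O in z_erofs_attach_page() legal.
 */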
static void z_erofs_decompressqueue_work(struct work_struct *work);
static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
				       bool sync, int bios)
{
	struct erofs_sb_info *const sbi = EROFS_SB(io->sb);

	/* wake up the caller thread for sync decompression */
	if (sync) {
		unsigned long flags;

		spin_lock_irqsave(&io->u.wait.lock, flags);
		if (!atomic_add_return(bios, &io->pending_bios))
			wake_up_locked(&io->u.wait);
		spin_unlock_irqrestore(&io->u.wait.lock, flags);
		return;
	}

	if (atomic_add_return(bios, &io->pending_bios))
		return;
	/* Use workqueue and sync decompression for atomic contexts only */
	if (in_atomic() || irqs_disabled()) {
		queue_work(z_erofs_workqueue, &io->u.work);
		sbi->ctx.readahead_sync_decompress = true;
		return;
	}
	z_erofs_decompressqueue_work(&io->u.work);
}
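
/*
 * Illustrative sketch of the pending_bios protocol: the submitter calls
 * z_erofs_decompress_kickoff(q, fg, nr_bios) once after queuing all
 * bios, and each bio completion calls it with bios == -1, so the
 * counter hits zero exactly when both the submission accounting and all
 * completions have arrived, in any order; that last caller wakes the
 * synchronous waiter or kicks off asynchronous decompression.
 */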
static bool z_erofs_page_is_invalidated(struct page *page)
{
	return !page->mapping && !z_erofs_is_shortlived_page(page);
}
static void z_erofs_decompressqueue_endio(struct bio *bio)
{
	tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
	struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
	blk_status_t err = bio->bi_status;
	struct bio_vec *bvec;
	struct bvec_iter_all iter_all;

	bio_for_each_segment_all(bvec, bio, iter_all) {
		struct page *page = bvec->bv_page;

		DBG_BUGON(PageUptodate(page));
		DBG_BUGON(z_erofs_page_is_invalidated(page));

		if (err)
			SetPageError(page);

		if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
			if (!err)
				SetPageUptodate(page);
			unlock_page(page);
		}
	}
	z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
	bio_put(bio);
}
static int z_erofs_decompress_pcluster(struct super_block *sb,
				       struct z_erofs_pcluster *pcl,
				       struct list_head *pagepool)
{
	struct erofs_sb_info *const sbi = EROFS_SB(sb);
	struct z_erofs_pagevec_ctor ctor;
	unsigned int i, inputsize, outputsize, llen, nr_pages;
	struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
	struct page **pages, **compressed_pages, *page;

	enum z_erofs_page_type page_type;
	bool overlapped, partial;
	struct z_erofs_collection *cl;
	int err;

	might_sleep();
	cl = z_erofs_primarycollection(pcl);
	DBG_BUGON(!READ_ONCE(cl->nr_pages));

	mutex_lock(&cl->lock);
	nr_pages = cl->nr_pages;

	if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) {
		pages = pages_onstack;
	} else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES &&
		   mutex_trylock(&z_pagemap_global_lock)) {
		pages = z_pagemap_global;
	} else {
		gfp_t gfp_flags = GFP_KERNEL;

		if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES)
			gfp_flags |= __GFP_NOFAIL;

		pages = kvmalloc_array(nr_pages, sizeof(struct page *),
				       gfp_flags);

		/* fallback to global pagemap for the lowmem scenario */
		if (!pages) {
			mutex_lock(&z_pagemap_global_lock);
			pages = z_pagemap_global;
		}
	}

	for (i = 0; i < nr_pages; ++i)
		pages[i] = NULL;

	err = 0;
	z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS,
				  cl->pagevec, 0);

	for (i = 0; i < cl->vcnt; ++i) {
		unsigned int pagenr;

		page = z_erofs_pagevec_dequeue(&ctor, &page_type);

		/* all pages in pagevec ought to be valid */
		DBG_BUGON(!page);
		DBG_BUGON(z_erofs_page_is_invalidated(page));

		if (z_erofs_put_shortlivedpage(pagepool, page))
			continue;

		if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
			pagenr = 0;
		else
			pagenr = z_erofs_onlinepage_index(page);

		DBG_BUGON(pagenr >= nr_pages);

		/*
		 * currently EROFS doesn't support multiref(dedup),
		 * so we error out if a page is mapped more than once.
		 */
		if (pages[pagenr]) {
			DBG_BUGON(1);
			SetPageError(pages[pagenr]);
			z_erofs_onlinepage_endio(pages[pagenr]);
			err = -EFSCORRUPTED;
		}
		pages[pagenr] = page;
	}
	z_erofs_pagevec_ctor_exit(&ctor, true);

	overlapped = false;
	compressed_pages = pcl->compressed_pages;

	for (i = 0; i < pcl->pclusterpages; ++i) {
		unsigned int pagenr;

		page = compressed_pages[i];

		/* all compressed pages ought to be valid */
		DBG_BUGON(!page);
		DBG_BUGON(z_erofs_page_is_invalidated(page));

		if (!z_erofs_is_shortlived_page(page)) {
			if (erofs_page_is_managed(sbi, page)) {
				if (!PageUptodate(page))
					err = -EIO;
				continue;
			}

			/*
			 * only non-head pages can be selected
			 * for inplace decompression
			 */
			pagenr = z_erofs_onlinepage_index(page);

			DBG_BUGON(pagenr >= nr_pages);
			if (pages[pagenr]) {
				DBG_BUGON(1);
				SetPageError(pages[pagenr]);
				z_erofs_onlinepage_endio(pages[pagenr]);
				err = -EFSCORRUPTED;
			}
			pages[pagenr] = page;

			overlapped = true;
		}

		/* PG_error needs checking for all non-managed pages */
		if (PageError(page)) {
			DBG_BUGON(PageUptodate(page));
			err = -EIO;
		}
	}

	if (err)
		goto out;

	llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
	if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) {
		outputsize = llen;
		partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH);
	} else {
		outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs;
		partial = true;
	}

	inputsize = pcl->pclusterpages * PAGE_SIZE;
	err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
					.sb = sb,
					.in = compressed_pages,
					.out = pages,
					.pageofs_out = cl->pageofs,
					.inputsize = inputsize,
					.outputsize = outputsize,
					.alg = pcl->algorithmformat,
					.inplace_io = overlapped,
					.partial_decoding = partial
				 }, pagepool);

out:
	/* must handle all compressed pages before ending pages */
	for (i = 0; i < pcl->pclusterpages; ++i) {
		page = compressed_pages[i];

		if (erofs_page_is_managed(sbi, page))
			continue;

		/* recycle all individual short-lived pages */
		(void)z_erofs_put_shortlivedpage(pagepool, page);

		WRITE_ONCE(compressed_pages[i], NULL);
	}

	for (i = 0; i < nr_pages; ++i) {
		page = pages[i];
		if (!page)
			continue;

		DBG_BUGON(z_erofs_page_is_invalidated(page));

		/* recycle all individual short-lived pages */
		if (z_erofs_put_shortlivedpage(pagepool, page))
			continue;

		if (err < 0)
			SetPageError(page);

		z_erofs_onlinepage_endio(page);
	}

	if (pages == z_pagemap_global)
		mutex_unlock(&z_pagemap_global_lock);
	else if (pages != pages_onstack)
		kvfree(pages);

	cl->nr_pages = 0;
	cl->vcnt = 0;

	/* all cl locks MUST be taken before the following line */
	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);

	/* all cl locks SHOULD be released right now */
	mutex_unlock(&cl->lock);

	z_erofs_collection_put(cl);
	return err;
}
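
/*
 * Worked example for the outputsize/partial computation above: with
 * cl->pageofs == 512, llen == 8192 and nr_pages == 2, the output window
 * (2 pages == 8192 bytes) cannot hold pageofs + llen == 8704 bytes, so
 * outputsize is clamped to 8192 - 512 == 7680 and partial decoding is
 * requested.
 */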
static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
				     struct list_head *pagepool)
{
	z_erofs_next_pcluster_t owned = io->head;

	while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
		struct z_erofs_pcluster *pcl;

		/* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
		DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);

		/* impossible that 'owned' equals NULL */
		DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);

		pcl = container_of(owned, struct z_erofs_pcluster, next);
		owned = READ_ONCE(pcl->next);

		z_erofs_decompress_pcluster(io->sb, pcl, pagepool);
	}
}
static void z_erofs_decompressqueue_work(struct work_struct *work)
{
	struct z_erofs_decompressqueue *bgq =
		container_of(work, struct z_erofs_decompressqueue, u.work);
	LIST_HEAD(pagepool);

	DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
	z_erofs_decompress_queue(bgq, &pagepool);

	put_pages_list(&pagepool);
	kvfree(bgq);
}
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
					       unsigned int nr,
					       struct list_head *pagepool,
					       struct address_space *mc,
					       gfp_t gfp)
{
	const pgoff_t index = pcl->obj.index;
	bool tocache = false;

	struct address_space *mapping;
	struct page *oldpage, *page;

	compressed_page_t t;
	int justfound;

repeat:
	page = READ_ONCE(pcl->compressed_pages[nr]);
	oldpage = page;

	if (!page)
		goto out_allocpage;

	/*
	 * the cached page has not been allocated and
	 * a placeholder is out there, prepare it now.
	 */
	if (page == PAGE_UNALLOCATED) {
		tocache = true;
		goto out_allocpage;
	}

	/* process the target tagged pointer */
	t = tagptr_init(compressed_page_t, page);
	justfound = tagptr_unfold_tags(t);
	page = tagptr_unfold_ptr(t);

	/*
	 * a preallocated cached page, which is used to avoid direct reclaim;
	 * otherwise, it will take the in-place I/O path instead.
	 */
	if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
		WRITE_ONCE(pcl->compressed_pages[nr], page);
		set_page_private(page, 0);
		tocache = true;
		goto out_tocache;
	}
	mapping = READ_ONCE(page->mapping);

	/*
	 * file-backed online pages in the pcluster are all locked steady,
	 * therefore it is impossible for `mapping' to be NULL.
	 */
	if (mapping && mapping != mc)
		/* ought to be unmanaged pages */
		goto out;

	/* directly return for shortlived page as well */
	if (z_erofs_is_shortlived_page(page))
		goto out;

	lock_page(page);

	/* only true if page reclaim goes wrong, should never happen */
	DBG_BUGON(justfound && PagePrivate(page));

	/* the page is still in managed cache */
	if (page->mapping == mc) {
		WRITE_ONCE(pcl->compressed_pages[nr], page);

		ClearPageError(page);
		if (!PagePrivate(page)) {
			/*
			 * impossible to be !PagePrivate(page) for
			 * the current restriction as well if
			 * the page is already in compressed_pages[].
			 */
			DBG_BUGON(!justfound);

			justfound = 0;
			set_page_private(page, (unsigned long)pcl);
			SetPagePrivate(page);
		}

		/* no need to submit io if it is already up-to-date */
		if (PageUptodate(page)) {
			unlock_page(page);
			page = NULL;
		}
		goto out;
	}

	/*
	 * the managed page has been truncated, it's unsafe to
	 * reuse this one, let's allocate a new cache-managed page.
	 */
	DBG_BUGON(page->mapping);
	DBG_BUGON(!justfound);

	tocache = true;
	unlock_page(page);
	put_page(page);
out_allocpage:
	page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
	if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
		list_add(&page->lru, pagepool);
		cond_resched();
		goto repeat;
	}
out_tocache:
	if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
		/* turn into a temporary shortlived page if it fails (1 ref) */
		set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
		goto out;
	}
	attach_page_private(page, pcl);
	/* drop a refcount added by allocpage (then we have 2 refs here) */
	put_page(page);

out:	/* the only exit (for tracing and debugging) */
	return page;
}
static struct z_erofs_decompressqueue *
jobqueue_init(struct super_block *sb,
	      struct z_erofs_decompressqueue *fgq, bool *fg)
{
	struct z_erofs_decompressqueue *q;

	if (fg && !*fg) {
		q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN);
		if (!q) {
			*fg = true;
			goto fg_out;
		}
		INIT_WORK(&q->u.work, z_erofs_decompressqueue_work);
	} else {
fg_out:
		q = fgq;
		init_waitqueue_head(&fgq->u.wait);
		atomic_set(&fgq->pending_bios, 0);
	}
	q->sb = sb;
	q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
	return q;
}
/* define decompression jobqueue types */
enum {
	JQ_BYPASS,
	JQ_SUBMIT,
	NR_JOBQUEUES,
};
static void *jobqueueset_init(struct super_block *sb,
			      struct z_erofs_decompressqueue *q[],
			      struct z_erofs_decompressqueue *fgq, bool *fg)
{
	/*
	 * if managed cache is enabled, a bypass jobqueue is needed;
	 * no need to read from the device for pclusters in this queue.
	 */
	q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL);
	q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg);

	return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg));
}
static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
				    z_erofs_next_pcluster_t qtail[],
				    z_erofs_next_pcluster_t owned_head)
{
	z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
	z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];

	DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
	if (owned_head == Z_EROFS_PCLUSTER_TAIL)
		owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;

	WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);

	WRITE_ONCE(*submit_qtail, owned_head);
	WRITE_ONCE(*bypass_qtail, &pcl->next);

	qtail[JQ_BYPASS] = &pcl->next;
}
static void z_erofs_submit_queue(struct super_block *sb,
				 struct z_erofs_decompress_frontend *f,
				 struct list_head *pagepool,
				 struct z_erofs_decompressqueue *fgq,
				 bool *force_fg)
{
	struct erofs_sb_info *const sbi = EROFS_SB(sb);
	z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
	struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
	void *bi_private;
	z_erofs_next_pcluster_t owned_head = f->clt.owned_head;
	/* since bio will be NULL, no need to initialize last_index */
	pgoff_t last_index;
	unsigned int nr_bios = 0;
	struct bio *bio = NULL;

	bi_private = jobqueueset_init(sb, q, fgq, force_fg);
	qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
	qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;

	/* by default, all need io submission */
	q[JQ_SUBMIT]->head = owned_head;

	do {
		struct z_erofs_pcluster *pcl;
		pgoff_t cur, end;
		unsigned int i = 0;
		bool bypass = true;

		/* no possible 'owned_head' equals the following */
		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
		DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);

		pcl = container_of(owned_head, struct z_erofs_pcluster, next);

		cur = pcl->obj.index;
		end = cur + pcl->pclusterpages;

		/* close the main owned chain at first */
		owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
				     Z_EROFS_PCLUSTER_TAIL_CLOSED);

		do {
			struct page *page;

			page = pickup_page_for_submission(pcl, i++, pagepool,
							  MNGD_MAPPING(sbi),
							  GFP_NOFS);
			if (!page)
				continue;

			if (bio && cur != last_index + 1) {
submit_bio_retry:
				submit_bio(bio);
				bio = NULL;
			}

			if (!bio) {
				bio = bio_alloc(GFP_NOIO, BIO_MAX_VECS);

				bio->bi_end_io = z_erofs_decompressqueue_endio;
				bio_set_dev(bio, sb->s_bdev);
				bio->bi_iter.bi_sector = (sector_t)cur <<
					LOG_SECTORS_PER_BLOCK;
				bio->bi_private = bi_private;
				bio->bi_opf = REQ_OP_READ;
				if (f->readahead)
					bio->bi_opf |= REQ_RAHEAD;
				++nr_bios;
			}

			if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
				goto submit_bio_retry;

			last_index = cur;
			bypass = false;
		} while (++cur < end);

		if (!bypass)
			qtail[JQ_SUBMIT] = &pcl->next;
		else
			move_to_bypass_jobqueue(pcl, qtail, owned_head);
	} while (owned_head != Z_EROFS_PCLUSTER_TAIL);

	if (bio)
		submit_bio(bio);

	/*
	 * although background is preferred, no one is pending for submission.
	 * don't issue workqueue for decompression but drop it directly instead.
	 */
	if (!*force_fg && !nr_bios) {
		kvfree(q[JQ_SUBMIT]);
		return;
	}
	z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios);
}
static void z_erofs_runqueue(struct super_block *sb,
			     struct z_erofs_decompress_frontend *f,
			     struct list_head *pagepool, bool force_fg)
{
	struct z_erofs_decompressqueue io[NR_JOBQUEUES];

	if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL)
		return;
	z_erofs_submit_queue(sb, f, pagepool, io, &force_fg);

	/* handle bypass queue (no i/o pclusters) immediately */
	z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);

	if (!force_fg)
		return;

	/* wait until all bios are completed */
	io_wait_event(io[JQ_SUBMIT].u.wait,
		      !atomic_read(&io[JQ_SUBMIT].pending_bios));

	/* handle synchronous decompress queue in the caller context */
	z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
}
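
/*
 * Note: pclusters whose compressed data was already up-to-date in the
 * managed cache were moved to io[JQ_BYPASS] and are decompressed above
 * without any I/O; io[JQ_SUBMIT] is only drained in the caller context
 * for synchronous (force_fg) reads, otherwise completion is driven by
 * z_erofs_decompressqueue_endio().
 */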
static int z_erofs_readpage(struct file *file, struct page *page)
{
	struct inode *const inode = page->mapping->host;
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	int err;
	LIST_HEAD(pagepool);

	trace_erofs_readpage(page, false);

	f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;

	err = z_erofs_do_read_page(&f, page, &pagepool);
	(void)z_erofs_collector_end(&f.clt);

	/* if some compressed clusters are ready, submit them anyway */
	z_erofs_runqueue(inode->i_sb, &f, &pagepool, true);

	if (err)
		erofs_err(inode->i_sb, "failed to read, err [%d]", err);

	if (f.map.mpage)
		put_page(f.map.mpage);

	/* clean up the remaining free pages */
	put_pages_list(&pagepool);
	return err;
}
static void z_erofs_readahead(struct readahead_control *rac)
{
	struct inode *const inode = rac->mapping->host;
	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);

	unsigned int nr_pages = readahead_count(rac);
	bool sync = (sbi->ctx.readahead_sync_decompress &&
			nr_pages <= sbi->ctx.max_sync_decompress_pages);
	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
	struct page *page, *head = NULL;
	LIST_HEAD(pagepool);

	trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);

	f.readahead = true;
	f.headoffset = readahead_pos(rac);

	while ((page = readahead_page(rac))) {
		prefetchw(&page->flags);

		/*
		 * A pure asynchronous readahead is indicated if
		 * a PG_readahead marked page is hit first.
		 * Let's also do asynchronous decompression for this case.
		 */
		sync &= !(PageReadahead(page) && !head);

		set_page_private(page, (unsigned long)head);
		head = page;
	}

	while (head) {
		struct page *page = head;
		int err;

		/* traversal in reverse order */
		head = (void *)page_private(page);

		err = z_erofs_do_read_page(&f, page, &pagepool);
		if (err)
			erofs_err(inode->i_sb,
				  "readahead error at page %lu @ nid %llu",
				  page->index, EROFS_I(inode)->nid);
		put_page(page);
	}

	(void)z_erofs_collector_end(&f.clt);

	z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync);

	if (f.map.mpage)
		put_page(f.map.mpage);

	/* clean up the remaining free pages */
	put_pages_list(&pagepool);
}
const struct address_space_operations z_erofs_aops = {
	.readpage = z_erofs_readpage,
	.readahead = z_erofs_readahead,
};