// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sched/mm.h>
#include <linux/log2.h>
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "ordered-data.h"
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
#include "zoned.h"
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };

const char* btrfs_compress_type2str(enum btrfs_compression_type type)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB:
	case BTRFS_COMPRESS_LZO:
	case BTRFS_COMPRESS_ZSTD:
	case BTRFS_COMPRESS_NONE:
		return btrfs_compress_types[type];
	default:
		break;
	}

	return NULL;
}

bool btrfs_compress_is_valid_type(const char *str, size_t len)
{
	int i;

	for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
		size_t comp_len = strlen(btrfs_compress_types[i]);

		if (len < comp_len)
			continue;

		if (!strncmp(btrfs_compress_types[i], str, comp_len))
			return true;
	}
	return false;
}
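/*
 * Note that btrfs_compress_is_valid_type() only compares the algorithm name
 * prefix (comp_len bytes), so a value that carries a level suffix such as
 * "zstd:3" (see btrfs_compress_str2level() below) matches here as well.
 */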
static int compression_compress_pages(int type, struct list_head *ws,
		struct address_space *mapping, u64 start, struct page **pages,
		unsigned long *out_pages, unsigned long *total_in,
		unsigned long *total_out)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB:
		return zlib_compress_pages(ws, mapping, start, pages,
				out_pages, total_in, total_out);
	case BTRFS_COMPRESS_LZO:
		return lzo_compress_pages(ws, mapping, start, pages,
				out_pages, total_in, total_out);
	case BTRFS_COMPRESS_ZSTD:
		return zstd_compress_pages(ws, mapping, start, pages,
				out_pages, total_in, total_out);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can happen when compression races with remount setting
		 * it to 'no compress', while the caller doesn't call
		 * inode_need_compress() to check if we really need to
		 * compress.
		 *
		 * Not a big deal, just need to inform the caller that we
		 * haven't allocated any pages yet.
		 */
		*out_pages = 0;
		return -E2BIG;
	}
}

static int compression_decompress_bio(int type, struct list_head *ws,
				      struct compressed_bio *cb)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_LZO:  return lzo_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static int compression_decompress(int type, struct list_head *ws,
		unsigned char *data_in, struct page *dest_page,
		unsigned long start_byte, size_t srclen, size_t destlen)
{
	switch (type) {
	case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
						start_byte, srclen, destlen);
	case BTRFS_COMPRESS_LZO:  return lzo_decompress(ws, data_in, dest_page,
						start_byte, srclen, destlen);
	case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
						start_byte, srclen, destlen);
	case BTRFS_COMPRESS_NONE:
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}
static int btrfs_decompress_bio(struct compressed_bio *cb);

static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
				      unsigned long disk_size)
{
	return sizeof(struct compressed_bio) +
		(DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
}
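/*
 * The struct and its checksum area are allocated together: one csum_size
 * entry is reserved for every sectorsize block of the compressed data, which
 * is what the DIV_ROUND_UP() above accounts for.
 */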
static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
				 u64 disk_start)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
	const u32 csum_size = fs_info->csum_size;
	const u32 sectorsize = fs_info->sectorsize;
	struct page *page;
	unsigned int i;
	char *kaddr;
	u8 csum[BTRFS_CSUM_SIZE];
	struct compressed_bio *cb = bio->bi_private;
	u8 *cb_sum = cb->sums;

	if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM))
		return 0;

	shash->tfm = fs_info->csum_shash;

	for (i = 0; i < cb->nr_pages; i++) {
		u32 pg_offset;
		u32 bytes_left = PAGE_SIZE;

		page = cb->compressed_pages[i];

		/* Determine the remaining bytes inside the page first */
		if (i == cb->nr_pages - 1)
			bytes_left = cb->compressed_len - i * PAGE_SIZE;

		/* Hash through the page sector by sector */
		for (pg_offset = 0; pg_offset < bytes_left;
		     pg_offset += sectorsize) {
			kaddr = kmap_atomic(page);
			crypto_shash_digest(shash, kaddr + pg_offset,
					    sectorsize, csum);
			kunmap_atomic(kaddr);

			if (memcmp(&csum, cb_sum, csum_size) != 0) {
				btrfs_print_data_csum_error(inode, disk_start,
						csum, cb_sum, cb->mirror_num);
				if (btrfs_io_bio(bio)->device)
					btrfs_dev_stat_inc_and_print(
						btrfs_io_bio(bio)->device,
						BTRFS_DEV_STAT_CORRUPTION_ERRS);
				return -EIO;
			}
			cb_sum += csum_size;
			disk_start += sectorsize;
		}
	}
	return 0;
}
/* when we finish reading compressed pages from the disk, we
 * decompress them and then run the bio end_io routines on the
 * decompressed pages (in the inode address space).
 *
 * This allows the checksumming and other IO error handling routines
 * to work normally
 *
 * The compressed pages are freed here, and it must be run
 * in process context
 */
static void end_compressed_bio_read(struct bio *bio)
{
	struct compressed_bio *cb = bio->bi_private;
	struct inode *inode;
	struct page *page;
	unsigned int index;
	unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
	int ret = 0;

	if (bio->bi_status)
		cb->errors = 1;

	/* if there are more bios still pending for this compressed
	 * extent, just exit
	 */
	if (!refcount_dec_and_test(&cb->pending_bios))
		goto out;

	/*
	 * Record the correct mirror_num in cb->orig_bio so that
	 * read-repair can work properly.
	 */
	btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
	cb->mirror_num = mirror;

	/*
	 * Some IO in this cb have failed, just skip checksum as there
	 * is no way it could be correct.
	 */
	if (cb->errors == 1)
		goto csum_failed;

	inode = cb->inode;
	ret = check_compressed_csum(BTRFS_I(inode), bio,
				    bio->bi_iter.bi_sector << 9);
	if (ret)
		goto csum_failed;

	/* ok, we're the last bio for this extent, lets start
	 * the decompression.
	 */
	ret = btrfs_decompress_bio(cb);

csum_failed:
	if (ret)
		cb->errors = 1;

	/* release the compressed pages */
	for (index = 0; index < cb->nr_pages; index++) {
		page = cb->compressed_pages[index];
		page->mapping = NULL;
		put_page(page);
	}

	/* do io completion on the original bio */
	if (cb->errors) {
		bio_io_error(cb->orig_bio);
	} else {
		struct bio_vec *bvec;
		struct bvec_iter_all iter_all;

		/*
		 * we have verified the checksum already, set page
		 * checked so the end_io handlers know about it
		 */
		ASSERT(!bio_flagged(bio, BIO_CLONED));
		bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
			SetPageChecked(bvec->bv_page);

		bio_endio(cb->orig_bio);
	}

	/* finally free the cb struct */
	kfree(cb->compressed_pages);
	kfree(cb);
out:
	bio_put(bio);
}
/*
 * Clear the writeback bits on all of the file
 * pages for a compressed write
 */
static noinline void end_compressed_writeback(struct inode *inode,
					      const struct compressed_bio *cb)
{
	unsigned long index = cb->start >> PAGE_SHIFT;
	unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
	struct page *pages[16];
	unsigned long nr_pages = end_index - index + 1;
	int i;
	int ret;

	if (cb->errors)
		mapping_set_error(inode->i_mapping, -EIO);

	while (nr_pages > 0) {
		ret = find_get_pages_contig(inode->i_mapping, index,
				     min_t(unsigned long,
				     nr_pages, ARRAY_SIZE(pages)), pages);
		if (ret == 0) {
			nr_pages -= 1;
			index += 1;
			continue;
		}
		for (i = 0; i < ret; i++) {
			if (cb->errors)
				SetPageError(pages[i]);
			end_page_writeback(pages[i]);
			put_page(pages[i]);
		}
		nr_pages -= ret;
		index += ret;
	}
	/* the inode may be gone now */
}
/*
 * do the cleanup once all the compressed pages hit the disk.
 * This will clear writeback on the file pages and free the compressed
 * pages.
 *
 * This also calls the writeback end hooks for the file pages so that
 * metadata and checksums can be updated in the file.
 */
static void end_compressed_bio_write(struct bio *bio)
{
	struct compressed_bio *cb = bio->bi_private;
	struct inode *inode;
	struct page *page;
	unsigned int index;

	if (bio->bi_status)
		cb->errors = 1;

	/* if there are more bios still pending for this compressed
	 * extent, just exit
	 */
	if (!refcount_dec_and_test(&cb->pending_bios))
		goto out;

	/* ok, we're the last bio for this extent, step one is to
	 * call back into the FS and do all the end_io operations
	 */
	inode = cb->inode;
	btrfs_record_physical_zoned(inode, cb->start, bio);
	btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
			cb->start, cb->start + cb->len - 1,
			!cb->errors);

	end_compressed_writeback(inode, cb);
	/* note, our inode could be gone now */

	/*
	 * release the compressed pages, these came from alloc_page and
	 * are not attached to the inode at all
	 */
	for (index = 0; index < cb->nr_pages; index++) {
		page = cb->compressed_pages[index];
		page->mapping = NULL;
		put_page(page);
	}

	/* finally free the cb struct */
	kfree(cb->compressed_pages);
	kfree(cb);
out:
	bio_put(bio);
}
/*
 * worker function to build and submit bios for previously compressed pages.
 * The corresponding pages in the inode should be marked for writeback
 * and the compressed pages should have a reference on them for dropping
 * when the IO is complete.
 *
 * This also checksums the file bytes and gets things ready for
 * the end io hooks.
 */
blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
				 unsigned int len, u64 disk_start,
				 unsigned int compressed_len,
				 struct page **compressed_pages,
				 unsigned int nr_pages,
				 unsigned int write_flags,
				 struct cgroup_subsys_state *blkcg_css)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct bio *bio = NULL;
	struct compressed_bio *cb;
	unsigned long bytes_left;
	int pg_index = 0;
	struct page *page;
	u64 first_byte = disk_start;
	blk_status_t ret;
	int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
	const bool use_append = btrfs_use_zone_append(inode, disk_start);
	const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;

	WARN_ON(!PAGE_ALIGNED(start));
	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
	if (!cb)
		return BLK_STS_RESOURCE;
	refcount_set(&cb->pending_bios, 0);
	cb->errors = 0;
	cb->inode = &inode->vfs_inode;
	cb->start = start;
	cb->len = len;
	cb->mirror_num = 0;
	cb->compressed_pages = compressed_pages;
	cb->compressed_len = compressed_len;
	cb->orig_bio = NULL;
	cb->nr_pages = nr_pages;

	bio = btrfs_bio_alloc(first_byte);
	bio->bi_opf = bio_op | write_flags;
	bio->bi_private = cb;
	bio->bi_end_io = end_compressed_bio_write;

	if (use_append) {
		struct btrfs_device *device;

		device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
		if (IS_ERR(device)) {
			kfree(cb);
			bio_put(bio);
			return BLK_STS_NOTSUPP;
		}

		bio_set_dev(bio, device->bdev);
	}

	if (blkcg_css) {
		bio->bi_opf |= REQ_CGROUP_PUNT;
		kthread_associate_blkcg(blkcg_css);
	}
	refcount_set(&cb->pending_bios, 1);

	/* create and submit bios for the compressed pages */
	bytes_left = compressed_len;
	for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
		int submit = 0;
		int len = 0;

		page = compressed_pages[pg_index];
		page->mapping = inode->vfs_inode.i_mapping;
		if (bio->bi_iter.bi_size)
			submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
							  0);

		/*
		 * Page can only be added to bio if the current bio fits in
		 * stripe.
		 */
		if (!submit) {
			if (pg_index == 0 && use_append)
				len = bio_add_zone_append_page(bio, page,
							       PAGE_SIZE, 0);
			else
				len = bio_add_page(bio, page, PAGE_SIZE, 0);
		}

		page->mapping = NULL;
		if (submit || len < PAGE_SIZE) {
			/*
			 * inc the count before we submit the bio so
			 * we know the end IO handler won't happen before
			 * we inc the count.  Otherwise, the cb might get
			 * freed before we're done setting it up
			 */
			refcount_inc(&cb->pending_bios);
			ret = btrfs_bio_wq_end_io(fs_info, bio,
						  BTRFS_WQ_ENDIO_DATA);
			BUG_ON(ret); /* -ENOMEM */

			if (!skip_sum) {
				ret = btrfs_csum_one_bio(inode, bio, start, 1);
				BUG_ON(ret); /* -ENOMEM */
			}

			ret = btrfs_map_bio(fs_info, bio, 0);
			if (ret) {
				bio->bi_status = ret;
				bio_endio(bio);
			}

			bio = btrfs_bio_alloc(first_byte);
			bio->bi_opf = bio_op | write_flags;
			bio->bi_private = cb;
			bio->bi_end_io = end_compressed_bio_write;
			if (blkcg_css)
				bio->bi_opf |= REQ_CGROUP_PUNT;
			/*
			 * Use bio_add_page() to ensure the bio has at least one
			 * page.
			 */
			bio_add_page(bio, page, PAGE_SIZE, 0);
		}
		if (bytes_left < PAGE_SIZE) {
			btrfs_info(fs_info,
					"bytes left %lu compress len %u nr %u",
			       bytes_left, cb->compressed_len, cb->nr_pages);
		}
		bytes_left -= PAGE_SIZE;
		first_byte += PAGE_SIZE;
		cond_resched();
	}

	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
	BUG_ON(ret); /* -ENOMEM */

	if (!skip_sum) {
		ret = btrfs_csum_one_bio(inode, bio, start, 1);
		BUG_ON(ret); /* -ENOMEM */
	}

	ret = btrfs_map_bio(fs_info, bio, 0);
	if (ret) {
		bio->bi_status = ret;
		bio_endio(bio);
	}

	if (blkcg_css)
		kthread_associate_blkcg(NULL);

	return 0;
}
static u64 bio_end_offset(struct bio *bio)
{
	struct bio_vec *last = bio_last_bvec_all(bio);

	return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
}
static noinline int add_ra_bio_pages(struct inode *inode,
				     u64 compressed_end,
				     struct compressed_bio *cb)
{
	unsigned long end_index;
	unsigned long pg_index;
	u64 last_offset;
	u64 isize = i_size_read(inode);
	int ret;
	struct page *page;
	unsigned long nr_pages = 0;
	struct extent_map *em;
	struct address_space *mapping = inode->i_mapping;
	struct extent_map_tree *em_tree;
	struct extent_io_tree *tree;
	u64 end;
	int misses = 0;

	last_offset = bio_end_offset(cb->orig_bio);
	em_tree = &BTRFS_I(inode)->extent_tree;
	tree = &BTRFS_I(inode)->io_tree;

	if (isize == 0)
		return 0;

	/*
	 * For current subpage support, we only support 64K page size,
	 * which means maximum compressed extent size (128K) is just 2x page
	 * size.
	 * This makes readahead less effective, so here disable readahead for
	 * subpage for now, until full compressed write is supported.
	 */
	if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
		return 0;

	end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;

	while (last_offset < compressed_end) {
		pg_index = last_offset >> PAGE_SHIFT;

		if (pg_index > end_index)
			break;

		page = xa_load(&mapping->i_pages, pg_index);
		if (page && !xa_is_value(page)) {
			misses++;
			if (misses > 4)
				break;
			goto next;
		}

		page = __page_cache_alloc(mapping_gfp_constraint(mapping,
								 ~__GFP_FS));
		if (!page)
			break;

		if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
			put_page(page);
			goto next;
		}

		/*
		 * at this point, we have a locked page in the page cache
		 * for these bytes in the file.  But, we have to make
		 * sure they map to this compressed extent on disk.
		 */
		ret = set_page_extent_mapped(page);
		if (ret < 0) {
			unlock_page(page);
			put_page(page);
			break;
		}

		end = last_offset + PAGE_SIZE - 1;
		lock_extent(tree, last_offset, end);
		read_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, last_offset,
					   PAGE_SIZE);
		read_unlock(&em_tree->lock);

		if (!em || last_offset < em->start ||
		    (last_offset + PAGE_SIZE > extent_map_end(em)) ||
		    (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
			free_extent_map(em);
			unlock_extent(tree, last_offset, end);
			unlock_page(page);
			put_page(page);
			break;
		}
		free_extent_map(em);

		if (page->index == end_index) {
			size_t zero_offset = offset_in_page(isize);

			if (zero_offset) {
				int zeros;

				zeros = PAGE_SIZE - zero_offset;
				memzero_page(page, zero_offset, zeros);
				flush_dcache_page(page);
			}
		}

		ret = bio_add_page(cb->orig_bio, page,
				   PAGE_SIZE, 0);

		if (ret == PAGE_SIZE) {
			nr_pages++;
			put_page(page);
		} else {
			unlock_extent(tree, last_offset, end);
			unlock_page(page);
			put_page(page);
			break;
		}
next:
		last_offset += PAGE_SIZE;
	}
	return 0;
}
/*
 * for a compressed read, the bio we get passed has all the inode pages
 * in it.  We don't actually do IO on those pages but allocate new ones
 * to hold the compressed pages on disk.
 *
 * bio->bi_iter.bi_sector points to the compressed extent on disk
 * bio->bi_io_vec points to all of the inode pages
 *
 * After the compressed pages are read, we copy the bytes into the
 * bio we were passed and then call the bio end_io calls
 */
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
				 int mirror_num, unsigned long bio_flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_map_tree *em_tree;
	struct compressed_bio *cb;
	unsigned int compressed_len;
	unsigned int nr_pages;
	unsigned int pg_index;
	struct page *page;
	struct bio *comp_bio;
	u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
	u64 file_offset;
	u64 em_len;
	u64 em_start;
	struct extent_map *em;
	blk_status_t ret = BLK_STS_RESOURCE;
	int faili = 0;
	u8 *sums;

	em_tree = &BTRFS_I(inode)->extent_tree;

	file_offset = bio_first_bvec_all(bio)->bv_offset +
		      page_offset(bio_first_page_all(bio));

	/* we need the actual starting offset of this extent in the file */
	read_lock(&em_tree->lock);
	em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
	read_unlock(&em_tree->lock);
	if (!em)
		return BLK_STS_IOERR;

	ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
	compressed_len = em->block_len;
	cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
	if (!cb)
		goto out;

	refcount_set(&cb->pending_bios, 0);
	cb->errors = 0;
	cb->inode = inode;
	cb->mirror_num = mirror_num;
	sums = cb->sums;

	cb->start = em->orig_start;
	em_len = em->len;
	em_start = em->start;

	free_extent_map(em);
	em = NULL;

	cb->len = bio->bi_iter.bi_size;
	cb->compressed_len = compressed_len;
	cb->compress_type = extent_compress_type(bio_flags);
	cb->orig_bio = bio;

	nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
	cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
				       GFP_NOFS);
	if (!cb->compressed_pages)
		goto fail1;

	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
		cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
		if (!cb->compressed_pages[pg_index]) {
			faili = pg_index - 1;
			ret = BLK_STS_RESOURCE;
			goto fail2;
		}
	}
	faili = nr_pages - 1;
	cb->nr_pages = nr_pages;

	add_ra_bio_pages(inode, em_start + em_len, cb);

	/* include any pages we added in add_ra-bio_pages */
	cb->len = bio->bi_iter.bi_size;

	comp_bio = btrfs_bio_alloc(cur_disk_byte);
	comp_bio->bi_opf = REQ_OP_READ;
	comp_bio->bi_private = cb;
	comp_bio->bi_end_io = end_compressed_bio_read;
	refcount_set(&cb->pending_bios, 1);

	for (pg_index = 0; pg_index < nr_pages; pg_index++) {
		u32 pg_len = PAGE_SIZE;
		int submit = 0;

		/*
		 * To handle subpage case, we need to make sure the bio only
		 * covers the range we need.
		 *
		 * If we're at the last page, truncate the length to only cover
		 * the remaining part.
		 */
		if (pg_index == nr_pages - 1)
			pg_len = min_t(u32, PAGE_SIZE,
					compressed_len - pg_index * PAGE_SIZE);

		page = cb->compressed_pages[pg_index];
		page->mapping = inode->i_mapping;
		page->index = em_start >> PAGE_SHIFT;

		if (comp_bio->bi_iter.bi_size)
			submit = btrfs_bio_fits_in_stripe(page, pg_len,
							  comp_bio, 0);

		page->mapping = NULL;
		if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) {
			unsigned int nr_sectors;

			ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
						  BTRFS_WQ_ENDIO_DATA);
			BUG_ON(ret); /* -ENOMEM */

			/*
			 * inc the count before we submit the bio so
			 * we know the end IO handler won't happen before
			 * we inc the count.  Otherwise, the cb might get
			 * freed before we're done setting it up
			 */
			refcount_inc(&cb->pending_bios);

			ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
			BUG_ON(ret); /* -ENOMEM */

			nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
						  fs_info->sectorsize);
			sums += fs_info->csum_size * nr_sectors;

			ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
			if (ret) {
				comp_bio->bi_status = ret;
				bio_endio(comp_bio);
			}

			comp_bio = btrfs_bio_alloc(cur_disk_byte);
			comp_bio->bi_opf = REQ_OP_READ;
			comp_bio->bi_private = cb;
			comp_bio->bi_end_io = end_compressed_bio_read;

			bio_add_page(comp_bio, page, pg_len, 0);
		}
		cur_disk_byte += pg_len;
	}

	ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
	BUG_ON(ret); /* -ENOMEM */

	ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
	BUG_ON(ret); /* -ENOMEM */

	ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
	if (ret) {
		comp_bio->bi_status = ret;
		bio_endio(comp_bio);
	}

	return 0;

fail2:
	while (faili >= 0) {
		__free_page(cb->compressed_pages[faili]);
		faili--;
	}

	kfree(cb->compressed_pages);
fail1:
	kfree(cb);
out:
	free_extent_map(em);
	return ret;
}
/*
 * Heuristic uses systematic sampling to collect data from the input data
 * range, the logic can be tuned by the following constants:
 *
 * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample
 * @SAMPLING_INTERVAL  - range from which the sampled data can be collected
 */
#define SAMPLING_READ_SIZE	(16)
#define SAMPLING_INTERVAL	(256)

/*
 * For statistical analysis of the input data we consider bytes that form a
 * Galois Field of 256 objects. Each object has an attribute count, ie. how
 * many times the object appeared in the sample.
 */
#define BUCKET_SIZE		(256)

/*
 * The size of the sample is based on a statistical sampling rule of thumb.
 * The common way is to perform sampling tests as long as the number of
 * elements in each cell is at least 5.
 *
 * Instead of 5, we choose 32 to obtain more accurate results.
 * If the data contain the maximum number of symbols, which is 256, we obtain a
 * sample size bound by 8192.
 *
 * For a sample of at most 8KB of data per data range: 16 consecutive bytes
 * from up to 512 locations.
 */
#define MAX_SAMPLE_SIZE		(BTRFS_MAX_UNCOMPRESSED * \
				 SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
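/*
 * Worked out: with BTRFS_MAX_UNCOMPRESSED = 128KiB (see the comment in
 * heuristic_collect_sample() below), this evaluates to
 * 131072 * 16 / 256 = 8192 bytes, i.e. the 8KiB bound mentioned above
 * (512 sampling locations of 16 bytes each).
 */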
struct bucket_item {
	u32 count;
};

struct heuristic_ws {
	/* Partial copy of input data */
	u8 *sample;
	u32 sample_size;
	/* Buckets store counters for each byte value */
	struct bucket_item *bucket;
	/* Sorting buffer */
	struct bucket_item *bucket_b;
	struct list_head list;
};

static struct workspace_manager heuristic_wsm;
static void free_heuristic_ws(struct list_head *ws)
{
	struct heuristic_ws *workspace;

	workspace = list_entry(ws, struct heuristic_ws, list);

	kvfree(workspace->sample);
	kfree(workspace->bucket);
	kfree(workspace->bucket_b);
	kfree(workspace);
}

static struct list_head *alloc_heuristic_ws(unsigned int level)
{
	struct heuristic_ws *ws;

	ws = kzalloc(sizeof(*ws), GFP_KERNEL);
	if (!ws)
		return ERR_PTR(-ENOMEM);

	ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
	if (!ws->sample)
		goto fail;

	ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
	if (!ws->bucket)
		goto fail;

	ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
	if (!ws->bucket_b)
		goto fail;

	INIT_LIST_HEAD(&ws->list);
	return &ws->list;
fail:
	free_heuristic_ws(&ws->list);
	return ERR_PTR(-ENOMEM);
}
const struct btrfs_compress_op btrfs_heuristic_compress = {
	.workspace_manager = &heuristic_wsm,
};

static const struct btrfs_compress_op * const btrfs_compress_op[] = {
	/* The heuristic is represented as compression type 0 */
	&btrfs_heuristic_compress,
	&btrfs_zlib_compress,
	&btrfs_lzo_compress,
	&btrfs_zstd_compress,
};
static struct list_head *alloc_workspace(int type, unsigned int level)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
	case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
	case BTRFS_COMPRESS_LZO:  return lzo_alloc_workspace(level);
	case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}

static void free_workspace(int type, struct list_head *ws)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
	case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
	case BTRFS_COMPRESS_LZO:  return lzo_free_workspace(ws);
	case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}
static void btrfs_init_workspace_manager(int type)
{
	struct workspace_manager *wsm;
	struct list_head *workspace;

	wsm = btrfs_compress_op[type]->workspace_manager;
	INIT_LIST_HEAD(&wsm->idle_ws);
	spin_lock_init(&wsm->ws_lock);
	atomic_set(&wsm->total_ws, 0);
	init_waitqueue_head(&wsm->ws_wait);

	/*
	 * Preallocate one workspace for each compression type so we can
	 * guarantee forward progress in the worst case
	 */
	workspace = alloc_workspace(type, 0);
	if (IS_ERR(workspace)) {
		pr_warn(
	"BTRFS: cannot preallocate compression workspace, will try later\n");
	} else {
		atomic_set(&wsm->total_ws, 1);
		wsm->free_ws = 1;
		list_add(workspace, &wsm->idle_ws);
	}
}

static void btrfs_cleanup_workspace_manager(int type)
{
	struct workspace_manager *wsman;
	struct list_head *ws;

	wsman = btrfs_compress_op[type]->workspace_manager;
	while (!list_empty(&wsman->idle_ws)) {
		ws = wsman->idle_ws.next;
		list_del(ws);
		free_workspace(type, ws);
		atomic_dec(&wsman->total_ws);
	}
}
/*
 * This finds an available workspace or allocates a new one.
 * If it's not possible to allocate a new one, waits until there's one.
 * Preallocation makes a forward progress guarantee and we do not return
 * errors.
 */
struct list_head *btrfs_get_workspace(int type, unsigned int level)
{
	struct workspace_manager *wsm;
	struct list_head *workspace;
	int cpus = num_online_cpus();
	unsigned nofs_flag;
	struct list_head *idle_ws;
	spinlock_t *ws_lock;
	atomic_t *total_ws;
	wait_queue_head_t *ws_wait;
	int *free_ws;

	wsm = btrfs_compress_op[type]->workspace_manager;
	idle_ws	 = &wsm->idle_ws;
	ws_lock	 = &wsm->ws_lock;
	total_ws = &wsm->total_ws;
	ws_wait	 = &wsm->ws_wait;
	free_ws	 = &wsm->free_ws;

again:
	spin_lock(ws_lock);
	if (!list_empty(idle_ws)) {
		workspace = idle_ws->next;
		list_del(workspace);
		(*free_ws)--;
		spin_unlock(ws_lock);
		return workspace;
	}
	if (atomic_read(total_ws) > cpus) {
		DEFINE_WAIT(wait);

		spin_unlock(ws_lock);
		prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
		if (atomic_read(total_ws) > cpus && !*free_ws)
			schedule();
		finish_wait(ws_wait, &wait);
		goto again;
	}
	atomic_inc(total_ws);
	spin_unlock(ws_lock);

	/*
	 * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have
	 * to turn it off here because we might get called from the restricted
	 * context of btrfs_compress_bio/btrfs_compress_pages
	 */
	nofs_flag = memalloc_nofs_save();
	workspace = alloc_workspace(type, level);
	memalloc_nofs_restore(nofs_flag);

	if (IS_ERR(workspace)) {
		atomic_dec(total_ws);
		wake_up(ws_wait);

		/*
		 * Do not return the error but go back to waiting. There's a
		 * workspace preallocated for each type and the compression
		 * time is bounded so we get to a workspace eventually. This
		 * makes our caller's life easier.
		 *
		 * To prevent silent and low-probability deadlocks (when the
		 * initial preallocation fails), check if there are any
		 * workspaces at all.
		 */
		if (atomic_read(total_ws) == 0) {
			static DEFINE_RATELIMIT_STATE(_rs,
					/* once per minute */ 60 * HZ,
					/* no burst */ 1);

			if (__ratelimit(&_rs)) {
				pr_warn("BTRFS: no compression workspaces, low memory, retrying\n");
			}
		}
		goto again;
	}
	return workspace;
}
static struct list_head *get_workspace(int type, int level)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
	case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
	case BTRFS_COMPRESS_LZO:  return btrfs_get_workspace(type, level);
	case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}
/*
 * put a workspace struct back on the list or free it if we have enough
 * idle ones sitting around
 */
void btrfs_put_workspace(int type, struct list_head *ws)
{
	struct workspace_manager *wsm;
	struct list_head *idle_ws;
	spinlock_t *ws_lock;
	atomic_t *total_ws;
	wait_queue_head_t *ws_wait;
	int *free_ws;

	wsm = btrfs_compress_op[type]->workspace_manager;
	idle_ws	 = &wsm->idle_ws;
	ws_lock	 = &wsm->ws_lock;
	total_ws = &wsm->total_ws;
	ws_wait	 = &wsm->ws_wait;
	free_ws	 = &wsm->free_ws;

	spin_lock(ws_lock);
	if (*free_ws <= num_online_cpus()) {
		list_add(ws, idle_ws);
		(*free_ws)++;
		spin_unlock(ws_lock);
		goto wake;
	}
	spin_unlock(ws_lock);

	free_workspace(type, ws);
	atomic_dec(total_ws);
wake:
	cond_wake_up(ws_wait);
}
static void put_workspace(int type, struct list_head *ws)
{
	switch (type) {
	case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_LZO:  return btrfs_put_workspace(type, ws);
	case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
	default:
		/*
		 * This can't happen, the type is validated several times
		 * before we get here.
		 */
		BUG();
	}
}
/*
 * Adjust @level according to the limits of the compression algorithm or
 * fallback to default
 */
static unsigned int btrfs_compress_set_level(int type, unsigned level)
{
	const struct btrfs_compress_op *ops = btrfs_compress_op[type];

	if (level == 0)
		level = ops->default_level;
	else
		level = min(level, ops->max_level);

	return level;
}
/*
 * Given an address space and start and length, compress the bytes into @pages
 * that are allocated on demand.
 *
 * @type_level is encoded algorithm and level, where level 0 means whatever
 * default the algorithm chooses and is opaque here;
 * - compression algo are 0-3
 * - the level are bits 4-7
 *
 * @out_pages is an in/out parameter, holds maximum number of pages to allocate
 * and returns number of actually allocated pages
 *
 * @total_in is used to return the number of bytes actually read.  It
 * may be smaller than the input length if we had to exit early because we
 * ran out of room in the pages array or because we cross the
 * max_out threshold.
 *
 * @total_out is an in/out parameter, must be set to the input length and will
 * be also used to return the total number of compressed bytes
 */
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
			 u64 start, struct page **pages,
			 unsigned long *out_pages,
			 unsigned long *total_in,
			 unsigned long *total_out)
{
	int type = btrfs_compress_type(type_level);
	int level = btrfs_compress_level(type_level);
	struct list_head *workspace;
	int ret;

	level = btrfs_compress_set_level(type, level);
	workspace = get_workspace(type, level);
	ret = compression_compress_pages(type, workspace, mapping, start, pages,
					 out_pages, total_in, total_out);
	put_workspace(type, workspace);
	return ret;
}
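/*
 * Example of the type_level packing described above (an assumption based on
 * that comment, not a definition from this file): the algorithm id occupies
 * the low 4 bits and the level bits 4-7, so a value of 0x31 would be unpacked
 * by btrfs_compress_type()/btrfs_compress_level() as type 1 (zlib) at level 3.
 */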
static int btrfs_decompress_bio(struct compressed_bio *cb)
{
	struct list_head *workspace;
	int ret;
	int type = cb->compress_type;

	workspace = get_workspace(type, 0);
	ret = compression_decompress_bio(type, workspace, cb);
	put_workspace(type, workspace);

	return ret;
}

/*
 * a less complex decompression routine.  Our compressed data fits in a
 * single page, and we want to read a single page out of it.
 * start_byte tells us the offset into the compressed data we're interested in
 */
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
		     unsigned long start_byte, size_t srclen, size_t destlen)
{
	struct list_head *workspace;
	int ret;

	workspace = get_workspace(type, 0);
	ret = compression_decompress(type, workspace, data_in, dest_page,
				     start_byte, srclen, destlen);
	put_workspace(type, workspace);

	return ret;
}
void __init btrfs_init_compress(void)
{
	btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
	btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
	btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
	zstd_init_workspace_manager();
}

void __cold btrfs_exit_compress(void)
{
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
	zstd_cleanup_workspace_manager();
}
/*
 * Copy decompressed data from working buffer to pages.
 *
 * @buf:		The decompressed data buffer
 * @buf_len:		The decompressed data length
 * @decompressed:	Number of bytes that are already decompressed inside the
 *			compressed extent
 * @cb:			The compressed extent descriptor
 * @orig_bio:		The original bio that the caller wants to read for
 *
 * An easier to understand graph is like below:
 *
 *		|<- orig_bio ->|     |<- orig_bio->|
 *	|<------- full decompressed extent ----->|
 *	|<----------- @cb range ---->|
 *	|			|<-- @buf_len -->|
 *	|<--- @decompressed --->|
 *
 * Note that, @cb can be a subpage of the full decompressed extent, but
 * @cb->start always has the same value as the orig_file_offset of the full
 * decompressed extent.
 *
 * When reading a compressed extent, we have to read the full compressed
 * extent, while @orig_bio may only want part of the range.
 * Thus this function will ensure only data covered by @orig_bio will be copied
 * to.
 *
 * Return 0 if we have copied all needed contents for @orig_bio.
 * Return >0 if we need to continue decompressing.
 */
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
			      struct compressed_bio *cb, u32 decompressed)
{
	struct bio *orig_bio = cb->orig_bio;
	/* Offset inside the full decompressed extent */
	u32 cur_offset;

	cur_offset = decompressed;
	/* The main loop to do the copy */
	while (cur_offset < decompressed + buf_len) {
		struct bio_vec bvec;
		size_t copy_len;
		u32 copy_start;
		/* Offset inside the full decompressed extent */
		u32 bvec_offset;

		bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
		/*
		 * cb->start may underflow, but subtracting that value can still
		 * give us correct offset inside the full decompressed extent.
		 */
		bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;

		/* Haven't reached the bvec range, exit */
		if (decompressed + buf_len <= bvec_offset)
			return 1;

		copy_start = max(cur_offset, bvec_offset);
		copy_len = min(bvec_offset + bvec.bv_len,
			       decompressed + buf_len) - copy_start;
		ASSERT(copy_len);

		/*
		 * Extra range check to ensure we didn't go beyond
		 * the 'buf' range.
		 */
		ASSERT(copy_start - decompressed < buf_len);
		memcpy_to_page(bvec.bv_page, bvec.bv_offset,
			       buf + copy_start - decompressed, copy_len);
		flush_dcache_page(bvec.bv_page);
		cur_offset += copy_len;

		bio_advance(orig_bio, copy_len);
		/* Finished the bio */
		if (!orig_bio->bi_iter.bi_size)
			return 0;
	}
	return 1;
}
/*
 * Shannon Entropy calculation
 *
 * Pure byte distribution analysis fails to determine compressibility of data.
 * Try calculating entropy to estimate the average minimum number of bits
 * needed to encode the sampled data.
 *
 * For convenience, return the percentage of needed bits, instead of amount of
 * bits directly.
 *
 * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy
 *			    and can be compressible with high probability
 *
 * @ENTROPY_LVL_HIGH - data are not compressible with high probability
 *
 * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate.
 */
#define ENTROPY_LVL_ACEPTABLE		(65)
#define ENTROPY_LVL_HIGH		(80)

/*
 * For increased precision in shannon_entropy calculation,
 * let's do pow(n, M) to save more digits after comma:
 *
 * - maximum int bit length is 64
 * - ilog2(MAX_SAMPLE_SIZE)	-> 13
 * - 13 * 4 = 52 < 64		-> M = 4
 *
 * So use pow(n, 4).
 */
static inline u32 ilog2_w(u64 n)
{
	return ilog2(n * n * n * n);
}

static u32 shannon_entropy(struct heuristic_ws *ws)
{
	const u32 entropy_max = 8 * ilog2_w(2);
	u32 entropy_sum = 0;
	u32 p, p_base, sz_base;
	u32 i;

	sz_base = ilog2_w(ws->sample_size);
	for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
		p = ws->bucket[i].count;
		p_base = ilog2_w(p);
		entropy_sum += p * (sz_base - p_base);
	}

	entropy_sum /= ws->sample_size;
	return entropy_sum * 100 / entropy_max;
}
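/*
 * Sanity check of the math above: ilog2_w(n) approximates 4 * log2(n), so the
 * loop accumulates roughly 4 * sum(count[i] * log2(sample_size / count[i])).
 * Dividing by sample_size gives 4 * H, where H is the Shannon entropy in bits
 * per byte, and entropy_max = 8 * ilog2_w(2) = 32 corresponds to the maximum
 * of 8 bits per byte, so the returned value is the percentage of those 8 bits
 * that are actually needed.
 */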
#define RADIX_BASE		4U
#define COUNTERS_SIZE		(1U << RADIX_BASE)

static u8 get4bits(u64 num, int shift) {
	u8 low4bits;

	num >>= shift;
	/* Reverse order */
	low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
	return low4bits;
}

/*
 * Use 4 bits as radix base
 * Use 16 u32 counters for calculating new position in buf array
 *
 * @array     - array that will be sorted
 * @array_buf - buffer array to store sorting results
 *              must be equal in size to @array
 * @num       - array size
 */
static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
		       int num)
{
	u64 max_num;
	u64 buf_num;
	u32 counters[COUNTERS_SIZE];
	u32 new_addr;
	u32 addr;
	int bitlen;
	int shift;
	int i;

	/*
	 * Try avoid useless loop iterations for small numbers stored in big
	 * counters.  Example: 48 33 4 ... in 64bit array
	 */
	max_num = array[0].count;
	for (i = 1; i < num; i++) {
		buf_num = array[i].count;
		if (buf_num > max_num)
			max_num = buf_num;
	}

	buf_num = ilog2(max_num);
	bitlen = ALIGN(buf_num, RADIX_BASE * 2);

	shift = 0;
	while (shift < bitlen) {
		memset(counters, 0, sizeof(counters));

		for (i = 0; i < num; i++) {
			buf_num = array[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]++;
		}

		for (i = 1; i < COUNTERS_SIZE; i++)
			counters[i] += counters[i - 1];

		for (i = num - 1; i >= 0; i--) {
			buf_num = array[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]--;
			new_addr = counters[addr];
			array_buf[new_addr] = array[i];
		}

		shift += RADIX_BASE;

		/*
		 * Normal radix expects to move data from a temporary array, to
		 * the main one.  But that requires some CPU time. Avoid that
		 * by doing another sort iteration to the original array
		 * instead of memcpy()
		 */
		memset(counters, 0, sizeof(counters));

		for (i = 0; i < num; i++) {
			buf_num = array_buf[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]++;
		}

		for (i = 1; i < COUNTERS_SIZE; i++)
			counters[i] += counters[i - 1];

		for (i = num - 1; i >= 0; i--) {
			buf_num = array_buf[i].count;
			addr = get4bits(buf_num, shift);
			counters[addr]--;
			new_addr = counters[addr];
			array[new_addr] = array_buf[i];
		}

		shift += RADIX_BASE;
	}
}
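/*
 * Note on ordering: get4bits() inverts each 4-bit digit
 * ((COUNTERS_SIZE - 1) - digit), so the counting-sort passes above produce a
 * descending order. byte_core_set_size() below relies on this "sort in
 * reverse order" behaviour to sum the most frequent buckets first.
 */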
/*
 * Size of the core byte set - how many bytes cover 90% of the sample
 *
 * There are several types of structured binary data that use nearly all byte
 * values. The distribution can be uniform and counts in all buckets will be
 * nearly the same (eg. encrypted data). Unlikely to be compressible.
 *
 * Other possibility is normal (Gaussian) distribution, where the data could
 * be potentially compressible, but we have to take a few more steps to decide
 * how much.
 *
 * @BYTE_CORE_SET_LOW  - main part of byte values repeated frequently,
 *                       compression algo can easy fix that
 * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high
 *                       probability is not compressible
 */
#define BYTE_CORE_SET_LOW		(64)
#define BYTE_CORE_SET_HIGH		(200)

static int byte_core_set_size(struct heuristic_ws *ws)
{
	u32 i;
	u32 coreset_sum = 0;
	const u32 core_set_threshold = ws->sample_size * 90 / 100;
	struct bucket_item *bucket = ws->bucket;

	/* Sort in reverse order */
	radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);

	for (i = 0; i < BYTE_CORE_SET_LOW; i++)
		coreset_sum += bucket[i].count;

	if (coreset_sum > core_set_threshold)
		return i;

	for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) {
		coreset_sum += bucket[i].count;
		if (coreset_sum > core_set_threshold)
			break;
	}

	return i;
}
/*
 * Count byte values in buckets.
 * This heuristic can detect textual data (configs, xml, json, html, etc).
 * Because in most text-like data byte set is restricted to limited number of
 * possible characters, and that restriction in most cases makes data easy to
 * compress.
 *
 * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
 *	less - compressible
 *	more - need additional analysis
 */
#define BYTE_SET_THRESHOLD		(64)

static u32 byte_set_size(const struct heuristic_ws *ws)
{
	u32 i;
	u32 byte_set_size = 0;

	for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
		if (ws->bucket[i].count > 0)
			byte_set_size++;
	}

	/*
	 * Continue collecting count of byte values in buckets.  If the byte
	 * set size is bigger than the threshold, it's pointless to continue,
	 * the detection technique would fail for this type of data.
	 */
	for (; i < BUCKET_SIZE; i++) {
		if (ws->bucket[i].count > 0) {
			byte_set_size++;
			if (byte_set_size > BYTE_SET_THRESHOLD)
				return byte_set_size;
		}
	}

	return byte_set_size;
}
static bool sample_repeated_patterns(struct heuristic_ws *ws)
{
	const u32 half_of_sample = ws->sample_size / 2;
	const u8 *data = ws->sample;

	return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
}
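/*
 * Comparing the first and second half of the sample is a cheap way to catch
 * highly repetitive input (e.g. the "mostly zero data" case mentioned in the
 * heuristic description below): such data makes both halves identical and is
 * reported as compressible without any further analysis.
 */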
static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
				     struct heuristic_ws *ws)
{
	struct page *page;
	u64 index, index_end;
	u32 i, curr_sample_pos;
	u8 *in_data;

	/*
	 * Compression handles the input data by chunks of 128KiB
	 * (defined by BTRFS_MAX_UNCOMPRESSED)
	 *
	 * We do the same for the heuristic and loop over the whole range.
	 *
	 * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will
	 * process no more than BTRFS_MAX_UNCOMPRESSED at a time.
	 */
	if (end - start > BTRFS_MAX_UNCOMPRESSED)
		end = start + BTRFS_MAX_UNCOMPRESSED;

	index = start >> PAGE_SHIFT;
	index_end = end >> PAGE_SHIFT;

	/* Don't miss unaligned end */
	if (!IS_ALIGNED(end, PAGE_SIZE))
		index_end++;

	curr_sample_pos = 0;
	while (index < index_end) {
		page = find_get_page(inode->i_mapping, index);
		in_data = kmap_local_page(page);
		/* Handle case where the start is not aligned to PAGE_SIZE */
		i = start % PAGE_SIZE;
		while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
			/* Don't sample any garbage from the last page */
			if (start > end - SAMPLING_READ_SIZE)
				break;
			memcpy(&ws->sample[curr_sample_pos], &in_data[i],
					SAMPLING_READ_SIZE);
			i += SAMPLING_INTERVAL;
			start += SAMPLING_INTERVAL;
			curr_sample_pos += SAMPLING_READ_SIZE;
		}
		kunmap_local(in_data);
		put_page(page);

		index++;
	}

	ws->sample_size = curr_sample_pos;
}
/*
 * Compression heuristic.
 *
 * For now it's a naive and optimistic 'return true', we'll extend the logic to
 * quickly (compared to direct compression) detect data characteristics
 * (compressible/uncompressible) to avoid wasting CPU time on uncompressible
 * data.
 *
 * The following types of analysis can be performed:
 * - detect mostly zero data
 * - detect data with low "byte set" size (text, etc)
 * - detect data with low/high "core byte" set
 *
 * Return non-zero if the compression should be done, 0 otherwise.
 */
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
{
	struct list_head *ws_list = get_workspace(0, 0);
	struct heuristic_ws *ws;
	u32 i;
	u8 byte;
	int ret = 0;

	ws = list_entry(ws_list, struct heuristic_ws, list);

	heuristic_collect_sample(inode, start, end, ws);

	if (sample_repeated_patterns(ws)) {
		ret = 1;
		goto out;
	}

	memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE);

	for (i = 0; i < ws->sample_size; i++) {
		byte = ws->sample[i];
		ws->bucket[byte].count++;
	}

	i = byte_set_size(ws);
	if (i < BYTE_SET_THRESHOLD) {
		ret = 2;
		goto out;
	}

	i = byte_core_set_size(ws);
	if (i <= BYTE_CORE_SET_LOW) {
		ret = 3;
		goto out;
	}

	if (i >= BYTE_CORE_SET_HIGH) {
		ret = 0;
		goto out;
	}

	i = shannon_entropy(ws);
	if (i <= ENTROPY_LVL_ACEPTABLE) {
		ret = 4;
		goto out;
	}

	/*
	 * For the levels below ENTROPY_LVL_HIGH, additional analysis would be
	 * needed to give green light to compression.
	 *
	 * For now just assume that compression at that level is not worth the
	 * resources because:
	 *
	 * 1. it is possible to defrag the data later
	 *
	 * 2. the data would turn out to be hardly compressible, eg. 150 byte
	 *    values, every bucket has counter at level ~54. The heuristic would
	 *    be confused. This can happen when data have some internal repeated
	 *    patterns like "abbacbbc...". This can be detected by analyzing
	 *    pairs of bytes, which is too costly.
	 */
	if (i < ENTROPY_LVL_HIGH) {
		ret = 5;
		goto out;
	} else {
		ret = 0;
		goto out;
	}

out:
	put_workspace(0, ws_list);
	return ret;
}
/*
 * Convert the compression suffix (eg. after "zlib" starting with ":") to
 * level, an unrecognized string will set the default level
 */
unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
{
	unsigned int level = 0;
	int ret;

	if (!type)
		return 0;

	if (str[0] == ':') {
		ret = kstrtouint(str + 1, 10, &level);
		if (ret)
			level = 0;
	}

	level = btrfs_compress_set_level(type, level);

	return level;
}