fs/btrfs/compression.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2008 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/kernel.h>
   7 #include <linux/bio.h>
   8 #include <linux/file.h>
   9 #include <linux/fs.h>
  10 #include <linux/pagemap.h>
  11 #include <linux/highmem.h>
  12 #include <linux/kthread.h>
  13 #include <linux/time.h>
  14 #include <linux/init.h>
  15 #include <linux/string.h>
  16 #include <linux/backing-dev.h>
  17 #include <linux/writeback.h>
  18 #include <linux/slab.h>
  19 #include <linux/sched/mm.h>
  20 #include <linux/log2.h>
  21 #include <crypto/hash.h>
  22 #include "misc.h"
  23 #include "ctree.h"
  24 #include "disk-io.h"
  25 #include "transaction.h"
  26 #include "btrfs_inode.h"
  27 #include "volumes.h"
  28 #include "ordered-data.h"
  29 #include "compression.h"
  30 #include "extent_io.h"
  31 #include "extent_map.h"
  32 #include "zoned.h"
  33
  34 static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
  35
  36 const char* btrfs_compress_type2str(enum btrfs_compression_type type)
  37 {
  38         switch (type) {
  39         case BTRFS_COMPRESS_ZLIB:
  40         case BTRFS_COMPRESS_LZO:
  41         case BTRFS_COMPRESS_ZSTD:
  42         case BTRFS_COMPRESS_NONE:
  43                 return btrfs_compress_types[type];
  44         default:
  45                 break;
  46         }
  47
  48         return NULL;
  49 }
  50
  51 bool btrfs_compress_is_valid_type(const char *str, size_t len)
  52 {
  53         int i;
  54
  55         for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
  56                 size_t comp_len = strlen(btrfs_compress_types[i]);
  57
  58                 if (len < comp_len)
  59                         continue;
  60
  61                 if (!strncmp(btrfs_compress_types[i], str, comp_len))
  62                         return true;
  63         }
  64         return false;
  65 }
  66
  67 static int compression_compress_pages(int type, struct list_head *ws,
  68                struct address_space *mapping, u64 start, struct page **pages,
  69                unsigned long *out_pages, unsigned long *total_in,
  70                unsigned long *total_out)
  71 {
  72         switch (type) {
  73         case BTRFS_COMPRESS_ZLIB:
  74                 return zlib_compress_pages(ws, mapping, start, pages,
  75                                 out_pages, total_in, total_out);
  76         case BTRFS_COMPRESS_LZO:
  77                 return lzo_compress_pages(ws, mapping, start, pages,
  78                                 out_pages, total_in, total_out);
  79         case BTRFS_COMPRESS_ZSTD:
  80                 return zstd_compress_pages(ws, mapping, start, pages,
  81                                 out_pages, total_in, total_out);
  82         case BTRFS_COMPRESS_NONE:
  83         default:
  84                 /*
  85                  * This can happen when compression races with remount setting
  86                  * it to 'no compress', while caller doesn't call
  87                  * inode_need_compress() to check if we really need to
  88                  * compress.
  89                  *
  90                  * Not a big deal, just need to inform caller that we
  91                  * haven't allocated any pages yet.
  92                  */
  93                 *out_pages = 0;
  94                 return -E2BIG;
  95         }
  96 }
  97
  98 static int compression_decompress_bio(int type, struct list_head *ws,
  99                 struct compressed_bio *cb)
 100 {
 101         switch (type) {
 102         case BTRFS_COMPRESS_ZLIB: return zlib_decompress_bio(ws, cb);
 103         case BTRFS_COMPRESS_LZO:  return lzo_decompress_bio(ws, cb);
 104         case BTRFS_COMPRESS_ZSTD: return zstd_decompress_bio(ws, cb);
 105         case BTRFS_COMPRESS_NONE:
 106         default:
 107                 /*
 108                  * This can't happen, the type is validated several times
 109                  * before we get here.
 110                  */
 111                 BUG();
 112         }
 113 }
 114
 115 static int compression_decompress(int type, struct list_head *ws,
 116                unsigned char *data_in, struct page *dest_page,
 117                unsigned long start_byte, size_t srclen, size_t destlen)
 118 {
 119         switch (type) {
 120         case BTRFS_COMPRESS_ZLIB: return zlib_decompress(ws, data_in, dest_page,
 121                                                 start_byte, srclen, destlen);
 122         case BTRFS_COMPRESS_LZO:  return lzo_decompress(ws, data_in, dest_page,
 123                                                 start_byte, srclen, destlen);
 124         case BTRFS_COMPRESS_ZSTD: return zstd_decompress(ws, data_in, dest_page,
 125                                                 start_byte, srclen, destlen);
 126         case BTRFS_COMPRESS_NONE:
 127         default:
 128                 /*
 129                  * This can't happen, the type is validated several times
 130                  * before we get here.
 131                  */
 132                 BUG();
 133         }
 134 }
 135
 136 static int btrfs_decompress_bio(struct compressed_bio *cb);
 137
 138 static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
 139                                       unsigned long disk_size)
 140 {
 141         return sizeof(struct compressed_bio) +
 142                 (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
 143 }
 144
 145 static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
 146                                  u64 disk_start)
 147 {
 148         struct btrfs_fs_info *fs_info = inode->root->fs_info;
 149         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 150         const u32 csum_size = fs_info->csum_size;
 151         const u32 sectorsize = fs_info->sectorsize;
 152         struct page *page;
 153         unsigned int i;
 154         char *kaddr;
 155         u8 csum[BTRFS_CSUM_SIZE];
 156         struct compressed_bio *cb = bio->bi_private;
 157         u8 *cb_sum = cb->sums;
 158
 159         if (!fs_info->csum_root || (inode->flags & BTRFS_INODE_NODATASUM))
 160                 return 0;
 161
 162         shash->tfm = fs_info->csum_shash;
 163
 164         for (i = 0; i < cb->nr_pages; i++) {
 165                 u32 pg_offset;
 166                 u32 bytes_left = PAGE_SIZE;
 167                 page = cb->compressed_pages[i];
 168
 169                 /* Determine the remaining bytes inside the page first */
 170                 if (i == cb->nr_pages - 1)
 171                         bytes_left = cb->compressed_len - i * PAGE_SIZE;
 172
 173                 /* Hash through the page sector by sector */
 174                 for (pg_offset = 0; pg_offset < bytes_left;
 175                      pg_offset += sectorsize) {
 176                         kaddr = kmap_atomic(page);
 177                         crypto_shash_digest(shash, kaddr + pg_offset,
 178                                             sectorsize, csum);
 179                         kunmap_atomic(kaddr);
 180
 181                         if (memcmp(&csum, cb_sum, csum_size) != 0) {
 182                                 btrfs_print_data_csum_error(inode, disk_start,
 183                                                 csum, cb_sum, cb->mirror_num);
 184                                 if (btrfs_io_bio(bio)->device)
 185                                         btrfs_dev_stat_inc_and_print(
 186                                                 btrfs_io_bio(bio)->device,
 187                                                 BTRFS_DEV_STAT_CORRUPTION_ERRS);
 188                                 return -EIO;
 189                         }
 190                         cb_sum += csum_size;
 191                         disk_start += sectorsize;
 192                 }
 193         }
 194         return 0;
 195 }
 196
 197 /* when we finish reading compressed pages from the disk, we
 198  * decompress them and then run the bio end_io routines on the
 199  * decompressed pages (in the inode address space).
 200  *
 201  * This allows the checksumming and other IO error handling routines
 202  * to work normally
 203  *
 204  * The compressed pages are freed here, and it must be run
 205  * in process context
 206  */
 207 static void end_compressed_bio_read(struct bio *bio)
 208 {
 209         struct compressed_bio *cb = bio->bi_private;
 210         struct inode *inode;
 211         struct page *page;
 212         unsigned int index;
 213         unsigned int mirror = btrfs_io_bio(bio)->mirror_num;
 214         int ret = 0;
 215
 216         if (bio->bi_status)
 217                 cb->errors = 1;
 218
 219         /* if there are more bios still pending for this compressed
 220          * extent, just exit
 221          */
 222         if (!refcount_dec_and_test(&cb->pending_bios))
 223                 goto out;
 224
 225         /*
 226          * Record the correct mirror_num in cb->orig_bio so that
 227          * read-repair can work properly.
 228          */
 229         btrfs_io_bio(cb->orig_bio)->mirror_num = mirror;
 230         cb->mirror_num = mirror;
 231
 232         /*
 233          * Some IO in this cb have failed, just skip checksum as there
 234          * is no way it could be correct.
 235          */
 236         if (cb->errors == 1)
 237                 goto csum_failed;
 238
 239         inode = cb->inode;
 240         ret = check_compressed_csum(BTRFS_I(inode), bio,
 241                                     bio->bi_iter.bi_sector << 9);
 242         if (ret)
 243                 goto csum_failed;
 244
 245         /* ok, we're the last bio for this extent, lets start
 246          * the decompression.
 247          */
 248         ret = btrfs_decompress_bio(cb);
 249
 250 csum_failed:
 251         if (ret)
 252                 cb->errors = 1;
 253
 254         /* release the compressed pages */
 255         index = 0;
 256         for (index = 0; index < cb->nr_pages; index++) {
 257                 page = cb->compressed_pages[index];
 258                 page->mapping = NULL;
 259                 put_page(page);
 260         }
 261
 262         /* do io completion on the original bio */
 263         if (cb->errors) {
 264                 bio_io_error(cb->orig_bio);
 265         } else {
 266                 struct bio_vec *bvec;
 267                 struct bvec_iter_all iter_all;
 268
 269                 /*
 270                  * we have verified the checksum already, set page
 271                  * checked so the end_io handlers know about it
 272                  */
 273                 ASSERT(!bio_flagged(bio, BIO_CLONED));
 274                 bio_for_each_segment_all(bvec, cb->orig_bio, iter_all)
 275                         SetPageChecked(bvec->bv_page);
 276
 277                 bio_endio(cb->orig_bio);
 278         }
 279
 280         /* finally free the cb struct */
 281         kfree(cb->compressed_pages);
 282         kfree(cb);
 283 out:
 284         bio_put(bio);
 285 }
 286
 287 /*
 288  * Clear the writeback bits on all of the file
 289  * pages for a compressed write
 290  */
 291 static noinline void end_compressed_writeback(struct inode *inode,
 292                                               const struct compressed_bio *cb)
 293 {
 294         unsigned long index = cb->start >> PAGE_SHIFT;
 295         unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT;
 296         struct page *pages[16];
 297         unsigned long nr_pages = end_index - index + 1;
 298         int i;
 299         int ret;
 300
 301         if (cb->errors)
 302                 mapping_set_error(inode->i_mapping, -EIO);
 303
 304         while (nr_pages > 0) {
 305                 ret = find_get_pages_contig(inode->i_mapping, index,
 306                                      min_t(unsigned long,
 307                                      nr_pages, ARRAY_SIZE(pages)), pages);
 308                 if (ret == 0) {
 309                         nr_pages -= 1;
 310                         index += 1;
 311                         continue;
 312                 }
 313                 for (i = 0; i < ret; i++) {
 314                         if (cb->errors)
 315                                 SetPageError(pages[i]);
 316                         end_page_writeback(pages[i]);
 317                         put_page(pages[i]);
 318                 }
 319                 nr_pages -= ret;
 320                 index += ret;
 321         }
 322         /* the inode may be gone now */
 323 }
 324
 325 /*
 326  * do the cleanup once all the compressed pages hit the disk.
 327  * This will clear writeback on the file pages and free the compressed
 328  * pages.
 329  *
 330  * This also calls the writeback end hooks for the file pages so that
 331  * metadata and checksums can be updated in the file.
 332  */
 333 static void end_compressed_bio_write(struct bio *bio)
 334 {
 335         struct compressed_bio *cb = bio->bi_private;
 336         struct inode *inode;
 337         struct page *page;
 338         unsigned int index;
 339
 340         if (bio->bi_status)
 341                 cb->errors = 1;
 342
 343         /* if there are more bios still pending for this compressed
 344          * extent, just exit
 345          */
 346         if (!refcount_dec_and_test(&cb->pending_bios))
 347                 goto out;
 348
 349         /* ok, we're the last bio for this extent, step one is to
 350          * call back into the FS and do all the end_io operations
 351          */
 352         inode = cb->inode;
 353         btrfs_record_physical_zoned(inode, cb->start, bio);
 354         btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL,
 355                         cb->start, cb->start + cb->len - 1,
 356                         !cb->errors);
 357
 358         end_compressed_writeback(inode, cb);
 359         /* note, our inode could be gone now */
 360
 361         /*
 362          * release the compressed pages, these came from alloc_page and
 363          * are not attached to the inode at all
 364          */
 365         index = 0;
 366         for (index = 0; index < cb->nr_pages; index++) {
 367                 page = cb->compressed_pages[index];
 368                 page->mapping = NULL;
 369                 put_page(page);
 370         }
 371
 372         /* finally free the cb struct */
 373         kfree(cb->compressed_pages);
 374         kfree(cb);
 375 out:
 376         bio_put(bio);
 377 }
 378
 379 /*
 380  * worker function to build and submit bios for previously compressed pages.
 381  * The corresponding pages in the inode should be marked for writeback
 382  * and the compressed pages should have a reference on them for dropping
 383  * when the IO is complete.
 384  *
 385  * This also checksums the file bytes and gets things ready for
 386  * the end io hooks.
 387  */
 388 blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 389                                  unsigned int len, u64 disk_start,
 390                                  unsigned int compressed_len,
 391                                  struct page **compressed_pages,
 392                                  unsigned int nr_pages,
 393                                  unsigned int write_flags,
 394                                  struct cgroup_subsys_state *blkcg_css)
 395 {
 396         struct btrfs_fs_info *fs_info = inode->root->fs_info;
 397         struct bio *bio = NULL;
 398         struct compressed_bio *cb;
 399         unsigned long bytes_left;
 400         int pg_index = 0;
 401         struct page *page;
 402         u64 first_byte = disk_start;
 403         blk_status_t ret;
 404         int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
 405         const bool use_append = btrfs_use_zone_append(inode, disk_start);
 406         const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
 407
 408         WARN_ON(!PAGE_ALIGNED(start));
 409         cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
 410         if (!cb)
 411                 return BLK_STS_RESOURCE;
 412         refcount_set(&cb->pending_bios, 0);
 413         cb->errors = 0;
 414         cb->inode = &inode->vfs_inode;
 415         cb->start = start;
 416         cb->len = len;
 417         cb->mirror_num = 0;
 418         cb->compressed_pages = compressed_pages;
 419         cb->compressed_len = compressed_len;
 420         cb->orig_bio = NULL;
 421         cb->nr_pages = nr_pages;
 422
 423         bio = btrfs_bio_alloc(first_byte);
 424         bio->bi_opf = bio_op | write_flags;
 425         bio->bi_private = cb;
 426         bio->bi_end_io = end_compressed_bio_write;
 427
 428         if (use_append) {
 429                 struct btrfs_device *device;
 430
 431                 device = btrfs_zoned_get_device(fs_info, disk_start, PAGE_SIZE);
 432                 if (IS_ERR(device)) {
 433                         kfree(cb);
 434                         bio_put(bio);
 435                         return BLK_STS_NOTSUPP;
 436                 }
 437
 438                 bio_set_dev(bio, device->bdev);
 439         }
 440
 441         if (blkcg_css) {
 442                 bio->bi_opf |= REQ_CGROUP_PUNT;
 443                 kthread_associate_blkcg(blkcg_css);
 444         }
 445         refcount_set(&cb->pending_bios, 1);
 446
 447         /* create and submit bios for the compressed pages */
 448         bytes_left = compressed_len;
 449         for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
 450                 int submit = 0;
 451                 int len = 0;
 452
 453                 page = compressed_pages[pg_index];
 454                 page->mapping = inode->vfs_inode.i_mapping;
 455                 if (bio->bi_iter.bi_size)
 456                         submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
 457                                                           0);
 458
 459                 /*
 460                  * Page can only be added to bio if the current bio fits in
 461                  * stripe.
 462                  */
 463                 if (!submit) {
 464                         if (pg_index == 0 && use_append)
 465                                 len = bio_add_zone_append_page(bio, page,
 466                                                                PAGE_SIZE, 0);
 467                         else
 468                                 len = bio_add_page(bio, page, PAGE_SIZE, 0);
 469                 }
 470
 471                 page->mapping = NULL;
 472                 if (submit || len < PAGE_SIZE) {
 473                         /*
 474                          * inc the count before we submit the bio so
 475                          * we know the end IO handler won't happen before
 476                          * we inc the count.  Otherwise, the cb might get
 477                          * freed before we're done setting it up
 478                          */
 479                         refcount_inc(&cb->pending_bios);
 480                         ret = btrfs_bio_wq_end_io(fs_info, bio,
 481                                                   BTRFS_WQ_ENDIO_DATA);
 482                         BUG_ON(ret); /* -ENOMEM */
 483
 484                         if (!skip_sum) {
 485                                 ret = btrfs_csum_one_bio(inode, bio, start, 1);
 486                                 BUG_ON(ret); /* -ENOMEM */
 487                         }
 488
 489                         ret = btrfs_map_bio(fs_info, bio, 0);
 490                         if (ret) {
 491                                 bio->bi_status = ret;
 492                                 bio_endio(bio);
 493                         }
 494
 495                         bio = btrfs_bio_alloc(first_byte);
 496                         bio->bi_opf = bio_op | write_flags;
 497                         bio->bi_private = cb;
 498                         bio->bi_end_io = end_compressed_bio_write;
 499                         if (blkcg_css)
 500                                 bio->bi_opf |= REQ_CGROUP_PUNT;
 501                         /*
 502                          * Use bio_add_page() to ensure the bio has at least one
 503                          * page.
 504                          */
 505                         bio_add_page(bio, page, PAGE_SIZE, 0);
 506                 }
 507                 if (bytes_left < PAGE_SIZE) {
 508                         btrfs_info(fs_info,
 509                                         "bytes left %lu compress len %u nr %u",
 510                                bytes_left, cb->compressed_len, cb->nr_pages);
 511                 }
 512                 bytes_left -= PAGE_SIZE;
 513                 first_byte += PAGE_SIZE;
 514                 cond_resched();
 515         }
 516
 517         ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
 518         BUG_ON(ret); /* -ENOMEM */
 519
 520         if (!skip_sum) {
 521                 ret = btrfs_csum_one_bio(inode, bio, start, 1);
 522                 BUG_ON(ret); /* -ENOMEM */
 523         }
 524
 525         ret = btrfs_map_bio(fs_info, bio, 0);
 526         if (ret) {
 527                 bio->bi_status = ret;
 528                 bio_endio(bio);
 529         }
 530
 531         if (blkcg_css)
 532                 kthread_associate_blkcg(NULL);
 533
 534         return 0;
 535 }
 536
 537 static u64 bio_end_offset(struct bio *bio)
 538 {
 539         struct bio_vec *last = bio_last_bvec_all(bio);
 540
 541         return page_offset(last->bv_page) + last->bv_len + last->bv_offset;
 542 }
 543
 544 static noinline int add_ra_bio_pages(struct inode *inode,
 545                                      u64 compressed_end,
 546                                      struct compressed_bio *cb)
 547 {
 548         unsigned long end_index;
 549         unsigned long pg_index;
 550         u64 last_offset;
 551         u64 isize = i_size_read(inode);
 552         int ret;
 553         struct page *page;
 554         unsigned long nr_pages = 0;
 555         struct extent_map *em;
 556         struct address_space *mapping = inode->i_mapping;
 557         struct extent_map_tree *em_tree;
 558         struct extent_io_tree *tree;
 559         u64 end;
 560         int misses = 0;
 561
 562         last_offset = bio_end_offset(cb->orig_bio);
 563         em_tree = &BTRFS_I(inode)->extent_tree;
 564         tree = &BTRFS_I(inode)->io_tree;
 565
 566         if (isize == 0)
 567                 return 0;
 568
 569         /*
 570          * For current subpage support, we only support 64K page size,
 571          * which means maximum compressed extent size (128K) is just 2x page
 572          * size.
 573          * This makes readahead less effective, so here disable readahead for
 574          * subpage for now, until full compressed write is supported.
 575          */
 576         if (btrfs_sb(inode->i_sb)->sectorsize < PAGE_SIZE)
 577                 return 0;
 578
 579         end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT;
 580
 581         while (last_offset < compressed_end) {
 582                 pg_index = last_offset >> PAGE_SHIFT;
 583
 584                 if (pg_index > end_index)
 585                         break;
 586
 587                 page = xa_load(&mapping->i_pages, pg_index);
 588                 if (page && !xa_is_value(page)) {
 589                         misses++;
 590                         if (misses > 4)
 591                                 break;
 592                         goto next;
 593                 }
 594
 595                 page = __page_cache_alloc(mapping_gfp_constraint(mapping,
 596                                                                  ~__GFP_FS));
 597                 if (!page)
 598                         break;
 599
 600                 if (add_to_page_cache_lru(page, mapping, pg_index, GFP_NOFS)) {
 601                         put_page(page);
 602                         goto next;
 603                 }
 604
 605                 /*
 606                  * at this point, we have a locked page in the page cache
 607                  * for these bytes in the file.  But, we have to make
 608                  * sure they map to this compressed extent on disk.
 609                  */
 610                 ret = set_page_extent_mapped(page);
 611                 if (ret < 0) {
 612                         unlock_page(page);
 613                         put_page(page);
 614                         break;
 615                 }
 616
 617                 end = last_offset + PAGE_SIZE - 1;
 618                 lock_extent(tree, last_offset, end);
 619                 read_lock(&em_tree->lock);
 620                 em = lookup_extent_mapping(em_tree, last_offset,
 621                                            PAGE_SIZE);
 622                 read_unlock(&em_tree->lock);
 623
 624                 if (!em || last_offset < em->start ||
 625                     (last_offset + PAGE_SIZE > extent_map_end(em)) ||
 626                     (em->block_start >> 9) != cb->orig_bio->bi_iter.bi_sector) {
 627                         free_extent_map(em);
 628                         unlock_extent(tree, last_offset, end);
 629                         unlock_page(page);
 630                         put_page(page);
 631                         break;
 632                 }
 633                 free_extent_map(em);
 634
 635                 if (page->index == end_index) {
 636                         size_t zero_offset = offset_in_page(isize);
 637
 638                         if (zero_offset) {
 639                                 int zeros;
 640                                 zeros = PAGE_SIZE - zero_offset;
 641                                 memzero_page(page, zero_offset, zeros);
 642                                 flush_dcache_page(page);
 643                         }
 644                 }
 645
 646                 ret = bio_add_page(cb->orig_bio, page,
 647                                    PAGE_SIZE, 0);
 648
 649                 if (ret == PAGE_SIZE) {
 650                         nr_pages++;
 651                         put_page(page);
 652                 } else {
 653                         unlock_extent(tree, last_offset, end);
 654                         unlock_page(page);
 655                         put_page(page);
 656                         break;
 657                 }
 658 next:
 659                 last_offset += PAGE_SIZE;
 660         }
 661         return 0;
 662 }
 663
 664 /*
 665  * for a compressed read, the bio we get passed has all the inode pages
 666  * in it.  We don't actually do IO on those pages but allocate new ones
 667  * to hold the compressed pages on disk.
 668  *
 669  * bio->bi_iter.bi_sector points to the compressed extent on disk
 670  * bio->bi_io_vec points to all of the inode pages
 671  *
 672  * After the compressed pages are read, we copy the bytes into the
 673  * bio we were passed and then call the bio end_io calls
 674  */
 675 blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 676                                  int mirror_num, unsigned long bio_flags)
 677 {
 678         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 679         struct extent_map_tree *em_tree;
 680         struct compressed_bio *cb;
 681         unsigned int compressed_len;
 682         unsigned int nr_pages;
 683         unsigned int pg_index;
 684         struct page *page;
 685         struct bio *comp_bio;
 686         u64 cur_disk_byte = bio->bi_iter.bi_sector << 9;
 687         u64 file_offset;
 688         u64 em_len;
 689         u64 em_start;
 690         struct extent_map *em;
 691         blk_status_t ret = BLK_STS_RESOURCE;
 692         int faili = 0;
 693         u8 *sums;
 694
 695         em_tree = &BTRFS_I(inode)->extent_tree;
 696
 697         file_offset = bio_first_bvec_all(bio)->bv_offset +
 698                       page_offset(bio_first_page_all(bio));
 699
 700         /* we need the actual starting offset of this extent in the file */
 701         read_lock(&em_tree->lock);
 702         em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize);
 703         read_unlock(&em_tree->lock);
 704         if (!em)
 705                 return BLK_STS_IOERR;
 706
 707         ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
 708         compressed_len = em->block_len;
 709         cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
 710         if (!cb)
 711                 goto out;
 712
 713         refcount_set(&cb->pending_bios, 0);
 714         cb->errors = 0;
 715         cb->inode = inode;
 716         cb->mirror_num = mirror_num;
 717         sums = cb->sums;
 718
 719         cb->start = em->orig_start;
 720         em_len = em->len;
 721         em_start = em->start;
 722
 723         free_extent_map(em);
 724         em = NULL;
 725
 726         cb->len = bio->bi_iter.bi_size;
 727         cb->compressed_len = compressed_len;
 728         cb->compress_type = extent_compress_type(bio_flags);
 729         cb->orig_bio = bio;
 730
 731         nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
 732         cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
 733                                        GFP_NOFS);
 734         if (!cb->compressed_pages)
 735                 goto fail1;
 736
 737         for (pg_index = 0; pg_index < nr_pages; pg_index++) {
 738                 cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
 739                 if (!cb->compressed_pages[pg_index]) {
 740                         faili = pg_index - 1;
 741                         ret = BLK_STS_RESOURCE;
 742                         goto fail2;
 743                 }
 744         }
 745         faili = nr_pages - 1;
 746         cb->nr_pages = nr_pages;
 747
 748         add_ra_bio_pages(inode, em_start + em_len, cb);
 749
 750         /* include any pages we added in add_ra-bio_pages */
 751         cb->len = bio->bi_iter.bi_size;
 752
 753         comp_bio = btrfs_bio_alloc(cur_disk_byte);
 754         comp_bio->bi_opf = REQ_OP_READ;
 755         comp_bio->bi_private = cb;
 756         comp_bio->bi_end_io = end_compressed_bio_read;
 757         refcount_set(&cb->pending_bios, 1);
 758
 759         for (pg_index = 0; pg_index < nr_pages; pg_index++) {
 760                 u32 pg_len = PAGE_SIZE;
 761                 int submit = 0;
 762
 763                 /*
 764                  * To handle subpage case, we need to make sure the bio only
 765                  * covers the range we need.
 766                  *
 767                  * If we're at the last page, truncate the length to only cover
 768                  * the remaining part.
 769                  */
 770                 if (pg_index == nr_pages - 1)
 771                         pg_len = min_t(u32, PAGE_SIZE,
 772                                         compressed_len - pg_index * PAGE_SIZE);
 773
 774                 page = cb->compressed_pages[pg_index];
 775                 page->mapping = inode->i_mapping;
 776                 page->index = em_start >> PAGE_SHIFT;
 777
 778                 if (comp_bio->bi_iter.bi_size)
 779                         submit = btrfs_bio_fits_in_stripe(page, pg_len,
 780                                                           comp_bio, 0);
 781
 782                 page->mapping = NULL;
 783                 if (submit || bio_add_page(comp_bio, page, pg_len, 0) < pg_len) {
 784                         unsigned int nr_sectors;
 785
 786                         ret = btrfs_bio_wq_end_io(fs_info, comp_bio,
 787                                                   BTRFS_WQ_ENDIO_DATA);
 788                         BUG_ON(ret); /* -ENOMEM */
 789
 790                         /*
 791                          * inc the count before we submit the bio so
 792                          * we know the end IO handler won't happen before
 793                          * we inc the count.  Otherwise, the cb might get
 794                          * freed before we're done setting it up
 795                          */
 796                         refcount_inc(&cb->pending_bios);
 797
 798                         ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
 799                         BUG_ON(ret); /* -ENOMEM */
 800
 801                         nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size,
 802                                                   fs_info->sectorsize);
 803                         sums += fs_info->csum_size * nr_sectors;
 804
 805                         ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
 806                         if (ret) {
 807                                 comp_bio->bi_status = ret;
 808                                 bio_endio(comp_bio);
 809                         }
 810
 811                         comp_bio = btrfs_bio_alloc(cur_disk_byte);
 812                         comp_bio->bi_opf = REQ_OP_READ;
 813                         comp_bio->bi_private = cb;
 814                         comp_bio->bi_end_io = end_compressed_bio_read;
 815
 816                         bio_add_page(comp_bio, page, pg_len, 0);
 817                 }
 818                 cur_disk_byte += pg_len;
 819         }
 820
 821         ret = btrfs_bio_wq_end_io(fs_info, comp_bio, BTRFS_WQ_ENDIO_DATA);
 822         BUG_ON(ret); /* -ENOMEM */
 823
 824         ret = btrfs_lookup_bio_sums(inode, comp_bio, sums);
 825         BUG_ON(ret); /* -ENOMEM */
 826
 827         ret = btrfs_map_bio(fs_info, comp_bio, mirror_num);
 828         if (ret) {
 829                 comp_bio->bi_status = ret;
 830                 bio_endio(comp_bio);
 831         }
 832
 833         return 0;
 834
 835 fail2:
 836         while (faili >= 0) {
 837                 __free_page(cb->compressed_pages[faili]);
 838                 faili--;
 839         }
 840
 841         kfree(cb->compressed_pages);
 842 fail1:
 843         kfree(cb);
 844 out:
 845         free_extent_map(em);
 846         return ret;
 847 }
 848
 849 /*
 850  * Heuristic uses systematic sampling to collect data from the input data
 851  * range, the logic can be tuned by the following constants:
 852  *
 853  * @SAMPLING_READ_SIZE - how many bytes will be copied from for each sample
 854  * @SAMPLING_INTERVAL  - range from which the sampled data can be collected
 855  */
 856 #define SAMPLING_READ_SIZE      (16)
 857 #define SAMPLING_INTERVAL       (256)
 858
 859 /*
 860  * For statistical analysis of the input data we consider bytes that form a
 861  * Galois Field of 256 objects. Each object has an attribute count, ie. how
 862  * many times the object appeared in the sample.
 863  */
 864 #define BUCKET_SIZE             (256)
 865
 866 /*
 867  * The size of the sample is based on a statistical sampling rule of thumb.
 868  * The common way is to perform sampling tests as long as the number of
 869  * elements in each cell is at least 5.
 870  *
 871  * Instead of 5, we choose 32 to obtain more accurate results.
 872  * If the data contain the maximum number of symbols, which is 256, we obtain a
 873  * sample size bound by 8192.
 874  *
 875  * For a sample of at most 8KB of data per data range: 16 consecutive bytes
 876  * from up to 512 locations.
 877  */
 878 #define MAX_SAMPLE_SIZE         (BTRFS_MAX_UNCOMPRESSED *               \
 879                                  SAMPLING_READ_SIZE / SAMPLING_INTERVAL)
 880
 881 struct bucket_item {
 882         u32 count;
 883 };
 884
 885 struct heuristic_ws {
 886         /* Partial copy of input data */
 887         u8 *sample;
 888         u32 sample_size;
 889         /* Buckets store counters for each byte value */
 890         struct bucket_item *bucket;
 891         /* Sorting buffer */
 892         struct bucket_item *bucket_b;
 893         struct list_head list;
 894 };
 895
 896 static struct workspace_manager heuristic_wsm;
 897
 898 static void free_heuristic_ws(struct list_head *ws)
 899 {
 900         struct heuristic_ws *workspace;
 901
 902         workspace = list_entry(ws, struct heuristic_ws, list);
 903
 904         kvfree(workspace->sample);
 905         kfree(workspace->bucket);
 906         kfree(workspace->bucket_b);
 907         kfree(workspace);
 908 }
 909
 910 static struct list_head *alloc_heuristic_ws(unsigned int level)
 911 {
 912         struct heuristic_ws *ws;
 913
 914         ws = kzalloc(sizeof(*ws), GFP_KERNEL);
 915         if (!ws)
 916                 return ERR_PTR(-ENOMEM);
 917
 918         ws->sample = kvmalloc(MAX_SAMPLE_SIZE, GFP_KERNEL);
 919         if (!ws->sample)
 920                 goto fail;
 921
 922         ws->bucket = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket), GFP_KERNEL);
 923         if (!ws->bucket)
 924                 goto fail;
 925
 926         ws->bucket_b = kcalloc(BUCKET_SIZE, sizeof(*ws->bucket_b), GFP_KERNEL);
 927         if (!ws->bucket_b)
 928                 goto fail;
 929
 930         INIT_LIST_HEAD(&ws->list);
 931         return &ws->list;
 932 fail:
 933         free_heuristic_ws(&ws->list);
 934         return ERR_PTR(-ENOMEM);
 935 }
 936
 937 const struct btrfs_compress_op btrfs_heuristic_compress = {
 938         .workspace_manager = &heuristic_wsm,
 939 };
 940
 941 static const struct btrfs_compress_op * const btrfs_compress_op[] = {
 942         /* The heuristic is represented as compression type 0 */
 943         &btrfs_heuristic_compress,
 944         &btrfs_zlib_compress,
 945         &btrfs_lzo_compress,
 946         &btrfs_zstd_compress,
 947 };
 948
 949 static struct list_head *alloc_workspace(int type, unsigned int level)
 950 {
 951         switch (type) {
 952         case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(level);
 953         case BTRFS_COMPRESS_ZLIB: return zlib_alloc_workspace(level);
 954         case BTRFS_COMPRESS_LZO:  return lzo_alloc_workspace(level);
 955         case BTRFS_COMPRESS_ZSTD: return zstd_alloc_workspace(level);
 956         default:
 957                 /*
 958                  * This can't happen, the type is validated several times
 959                  * before we get here.
 960                  */
 961                 BUG();
 962         }
 963 }
 964
 965 static void free_workspace(int type, struct list_head *ws)
 966 {
 967         switch (type) {
 968         case BTRFS_COMPRESS_NONE: return free_heuristic_ws(ws);
 969         case BTRFS_COMPRESS_ZLIB: return zlib_free_workspace(ws);
 970         case BTRFS_COMPRESS_LZO:  return lzo_free_workspace(ws);
 971         case BTRFS_COMPRESS_ZSTD: return zstd_free_workspace(ws);
 972         default:
 973                 /*
 974                  * This can't happen, the type is validated several times
 975                  * before we get here.
 976                  */
 977                 BUG();
 978         }
 979 }
 980
 981 static void btrfs_init_workspace_manager(int type)
 982 {
 983         struct workspace_manager *wsm;
 984         struct list_head *workspace;
 985
 986         wsm = btrfs_compress_op[type]->workspace_manager;
 987         INIT_LIST_HEAD(&wsm->idle_ws);
 988         spin_lock_init(&wsm->ws_lock);
 989         atomic_set(&wsm->total_ws, 0);
 990         init_waitqueue_head(&wsm->ws_wait);
 991
 992         /*
 993          * Preallocate one workspace for each compression type so we can
 994          * guarantee forward progress in the worst case
 995          */
 996         workspace = alloc_workspace(type, 0);
 997         if (IS_ERR(workspace)) {
 998                 pr_warn(
 999         "BTRFS: cannot preallocate compression workspace, will try later\n");
1000         } else {
1001                 atomic_set(&wsm->total_ws, 1);
1002                 wsm->free_ws = 1;
1003                 list_add(workspace, &wsm->idle_ws);
1004         }
1005 }
1006
1007 static void btrfs_cleanup_workspace_manager(int type)
1008 {
1009         struct workspace_manager *wsman;
1010         struct list_head *ws;
1011
1012         wsman = btrfs_compress_op[type]->workspace_manager;
1013         while (!list_empty(&wsman->idle_ws)) {
1014                 ws = wsman->idle_ws.next;
1015                 list_del(ws);
1016                 free_workspace(type, ws);
1017                 atomic_dec(&wsman->total_ws);
1018         }
1019 }
1020
1021 /*
1022  * This finds an available workspace or allocates a new one.
1023  * If it's not possible to allocate a new one, waits until there's one.
1024  * Preallocation makes a forward progress guarantees and we do not return
1025  * errors.
1026  */
1027 struct list_head *btrfs_get_workspace(int type, unsigned int level)
1028 {
1029         struct workspace_manager *wsm;
1030         struct list_head *workspace;
1031         int cpus = num_online_cpus();
1032         unsigned nofs_flag;
1033         struct list_head *idle_ws;
1034         spinlock_t *ws_lock;
1035         atomic_t *total_ws;
1036         wait_queue_head_t *ws_wait;
1037         int *free_ws;
1038
1039         wsm = btrfs_compress_op[type]->workspace_manager;
1040         idle_ws  = &wsm->idle_ws;
1041         ws_lock  = &wsm->ws_lock;
1042         total_ws = &wsm->total_ws;
1043         ws_wait  = &wsm->ws_wait;
1044         free_ws  = &wsm->free_ws;
1045
1046 again:
1047         spin_lock(ws_lock);
1048         if (!list_empty(idle_ws)) {
1049                 workspace = idle_ws->next;
1050                 list_del(workspace);
1051                 (*free_ws)--;
1052                 spin_unlock(ws_lock);
1053                 return workspace;
1054
1055         }
1056         if (atomic_read(total_ws) > cpus) {
1057                 DEFINE_WAIT(wait);
1058
1059                 spin_unlock(ws_lock);
1060                 prepare_to_wait(ws_wait, &wait, TASK_UNINTERRUPTIBLE);
1061                 if (atomic_read(total_ws) > cpus && !*free_ws)
1062                         schedule();
1063                 finish_wait(ws_wait, &wait);
1064                 goto again;
1065         }
1066         atomic_inc(total_ws);
1067         spin_unlock(ws_lock);
1068
1069         /*
1070          * Allocation helpers call vmalloc that can't use GFP_NOFS, so we have
1071          * to turn it off here because we might get called from the restricted
1072          * context of btrfs_compress_bio/btrfs_compress_pages
1073          */
1074         nofs_flag = memalloc_nofs_save();
1075         workspace = alloc_workspace(type, level);
1076         memalloc_nofs_restore(nofs_flag);
1077
1078         if (IS_ERR(workspace)) {
1079                 atomic_dec(total_ws);
1080                 wake_up(ws_wait);
1081
1082                 /*
1083                  * Do not return the error but go back to waiting. There's a
1084                  * workspace preallocated for each type and the compression
1085                  * time is bounded so we get to a workspace eventually. This
1086                  * makes our caller's life easier.
1087                  *
1088                  * To prevent silent and low-probability deadlocks (when the
1089                  * initial preallocation fails), check if there are any
1090                  * workspaces at all.
1091                  */
1092                 if (atomic_read(total_ws) == 0) {
1093                         static DEFINE_RATELIMIT_STATE(_rs,
1094                                         /* once per minute */ 60 * HZ,
1095                                         /* no burst */ 1);
1096
1097                         if (__ratelimit(&_rs)) {
1098                                 pr_warn("BTRFS: no compression workspaces, low memory, retrying\n");
1099                         }
1100                 }
1101                 goto again;
1102         }
1103         return workspace;
1104 }
1105
1106 static struct list_head *get_workspace(int type, int level)
1107 {
1108         switch (type) {
1109         case BTRFS_COMPRESS_NONE: return btrfs_get_workspace(type, level);
1110         case BTRFS_COMPRESS_ZLIB: return zlib_get_workspace(level);
1111         case BTRFS_COMPRESS_LZO:  return btrfs_get_workspace(type, level);
1112         case BTRFS_COMPRESS_ZSTD: return zstd_get_workspace(level);
1113         default:
1114                 /*
1115                  * This can't happen, the type is validated several times
1116                  * before we get here.
1117                  */
1118                 BUG();
1119         }
1120 }
1121
1122 /*
1123  * put a workspace struct back on the list or free it if we have enough
1124  * idle ones sitting around
1125  */
1126 void btrfs_put_workspace(int type, struct list_head *ws)
1127 {
1128         struct workspace_manager *wsm;
1129         struct list_head *idle_ws;
1130         spinlock_t *ws_lock;
1131         atomic_t *total_ws;
1132         wait_queue_head_t *ws_wait;
1133         int *free_ws;
1134
1135         wsm = btrfs_compress_op[type]->workspace_manager;
1136         idle_ws  = &wsm->idle_ws;
1137         ws_lock  = &wsm->ws_lock;
1138         total_ws = &wsm->total_ws;
1139         ws_wait  = &wsm->ws_wait;
1140         free_ws  = &wsm->free_ws;
1141
1142         spin_lock(ws_lock);
1143         if (*free_ws <= num_online_cpus()) {
1144                 list_add(ws, idle_ws);
1145                 (*free_ws)++;
1146                 spin_unlock(ws_lock);
1147                 goto wake;
1148         }
1149         spin_unlock(ws_lock);
1150
1151         free_workspace(type, ws);
1152         atomic_dec(total_ws);
1153 wake:
1154         cond_wake_up(ws_wait);
1155 }
1156
1157 static void put_workspace(int type, struct list_head *ws)
1158 {
1159         switch (type) {
1160         case BTRFS_COMPRESS_NONE: return btrfs_put_workspace(type, ws);
1161         case BTRFS_COMPRESS_ZLIB: return btrfs_put_workspace(type, ws);
1162         case BTRFS_COMPRESS_LZO:  return btrfs_put_workspace(type, ws);
1163         case BTRFS_COMPRESS_ZSTD: return zstd_put_workspace(ws);
1164         default:
1165                 /*
1166                  * This can't happen, the type is validated several times
1167                  * before we get here.
1168                  */
1169                 BUG();
1170         }
1171 }
1172
1173 /*
1174  * Adjust @level according to the limits of the compression algorithm or
1175  * fallback to default
1176  */
1177 static unsigned int btrfs_compress_set_level(int type, unsigned level)
1178 {
1179         const struct btrfs_compress_op *ops = btrfs_compress_op[type];
1180
1181         if (level == 0)
1182                 level = ops->default_level;
1183         else
1184                 level = min(level, ops->max_level);
1185
1186         return level;
1187 }
1188
1189 /*
1190  * Given an address space and start and length, compress the bytes into @pages
1191  * that are allocated on demand.
1192  *
1193  * @type_level is encoded algorithm and level, where level 0 means whatever
1194  * default the algorithm chooses and is opaque here;
1195  * - compression algo are 0-3
1196  * - the level are bits 4-7
1197  *
1198  * @out_pages is an in/out parameter, holds maximum number of pages to allocate
1199  * and returns number of actually allocated pages
1200  *
1201  * @total_in is used to return the number of bytes actually read.  It
1202  * may be smaller than the input length if we had to exit early because we
1203  * ran out of room in the pages array or because we cross the
1204  * max_out threshold.
1205  *
1206  * @total_out is an in/out parameter, must be set to the input length and will
1207  * be also used to return the total number of compressed bytes
1208  */
1209 int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
1210                          u64 start, struct page **pages,
1211                          unsigned long *out_pages,
1212                          unsigned long *total_in,
1213                          unsigned long *total_out)
1214 {
1215         int type = btrfs_compress_type(type_level);
1216         int level = btrfs_compress_level(type_level);
1217         struct list_head *workspace;
1218         int ret;
1219
1220         level = btrfs_compress_set_level(type, level);
1221         workspace = get_workspace(type, level);
1222         ret = compression_compress_pages(type, workspace, mapping, start, pages,
1223                                          out_pages, total_in, total_out);
1224         put_workspace(type, workspace);
1225         return ret;
1226 }
1227
1228 static int btrfs_decompress_bio(struct compressed_bio *cb)
1229 {
1230         struct list_head *workspace;
1231         int ret;
1232         int type = cb->compress_type;
1233
1234         workspace = get_workspace(type, 0);
1235         ret = compression_decompress_bio(type, workspace, cb);
1236         put_workspace(type, workspace);
1237
1238         return ret;
1239 }
1240
1241 /*
1242  * a less complex decompression routine.  Our compressed data fits in a
1243  * single page, and we want to read a single page out of it.
1244  * start_byte tells us the offset into the compressed data we're interested in
1245  */
1246 int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
1247                      unsigned long start_byte, size_t srclen, size_t destlen)
1248 {
1249         struct list_head *workspace;
1250         int ret;
1251
1252         workspace = get_workspace(type, 0);
1253         ret = compression_decompress(type, workspace, data_in, dest_page,
1254                                      start_byte, srclen, destlen);
1255         put_workspace(type, workspace);
1256
1257         return ret;
1258 }
1259
1260 void __init btrfs_init_compress(void)
1261 {
1262         btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
1263         btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
1264         btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
1265         zstd_init_workspace_manager();
1266 }
1267
1268 void __cold btrfs_exit_compress(void)
1269 {
1270         btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
1271         btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
1272         btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
1273         zstd_cleanup_workspace_manager();
1274 }
1275
1276 /*
1277  * Copy decompressed data from working buffer to pages.
1278  *
1279  * @buf:                The decompressed data buffer
1280  * @buf_len:            The decompressed data length
1281  * @decompressed:       Number of bytes that are already decompressed inside the
1282  *                      compressed extent
1283  * @cb:                 The compressed extent descriptor
1284  * @orig_bio:           The original bio that the caller wants to read for
1285  *
1286  * An easier to understand graph is like below:
1287  *
1288  *              |<- orig_bio ->|     |<- orig_bio->|
1289  *      |<-------      full decompressed extent      ----->|
1290  *      |<-----------    @cb range   ---->|
1291  *      |                       |<-- @buf_len -->|
1292  *      |<--- @decompressed --->|
1293  *
1294  * Note that, @cb can be a subpage of the full decompressed extent, but
1295  * @cb->start always has the same as the orig_file_offset value of the full
1296  * decompressed extent.
1297  *
1298  * When reading compressed extent, we have to read the full compressed extent,
1299  * while @orig_bio may only want part of the range.
1300  * Thus this function will ensure only data covered by @orig_bio will be copied
1301  * to.
1302  *
1303  * Return 0 if we have copied all needed contents for @orig_bio.
1304  * Return >0 if we need continue decompress.
1305  */
1306 int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
1307                               struct compressed_bio *cb, u32 decompressed)
1308 {
1309         struct bio *orig_bio = cb->orig_bio;
1310         /* Offset inside the full decompressed extent */
1311         u32 cur_offset;
1312
1313         cur_offset = decompressed;
1314         /* The main loop to do the copy */
1315         while (cur_offset < decompressed + buf_len) {
1316                 struct bio_vec bvec;
1317                 size_t copy_len;
1318                 u32 copy_start;
1319                 /* Offset inside the full decompressed extent */
1320                 u32 bvec_offset;
1321
1322                 bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter);
1323                 /*
1324                  * cb->start may underflow, but subtracting that value can still
1325                  * give us correct offset inside the full decompressed extent.
1326                  */
1327                 bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start;
1328
1329                 /* Haven't reached the bvec range, exit */
1330                 if (decompressed + buf_len <= bvec_offset)
1331                         return 1;
1332
1333                 copy_start = max(cur_offset, bvec_offset);
1334                 copy_len = min(bvec_offset + bvec.bv_len,
1335                                decompressed + buf_len) - copy_start;
1336                 ASSERT(copy_len);
1337
1338                 /*
1339                  * Extra range check to ensure we didn't go beyond
1340                  * @buf + @buf_len.
1341                  */
1342                 ASSERT(copy_start - decompressed < buf_len);
1343                 memcpy_to_page(bvec.bv_page, bvec.bv_offset,
1344                                buf + copy_start - decompressed, copy_len);
1345                 flush_dcache_page(bvec.bv_page);
1346                 cur_offset += copy_len;
1347
1348                 bio_advance(orig_bio, copy_len);
1349                 /* Finished the bio */
1350                 if (!orig_bio->bi_iter.bi_size)
1351                         return 0;
1352         }
1353         return 1;
1354 }
1355
1356 /*
1357  * Shannon Entropy calculation
1358  *
1359  * Pure byte distribution analysis fails to determine compressibility of data.
1360  * Try calculating entropy to estimate the average minimum number of bits
1361  * needed to encode the sampled data.
1362  *
1363  * For convenience, return the percentage of needed bits, instead of amount of
1364  * bits directly.
1365  *
1366  * @ENTROPY_LVL_ACEPTABLE - below that threshold, sample has low byte entropy
1367  *                          and can be compressible with high probability
1368  *
1369  * @ENTROPY_LVL_HIGH - data are not compressible with high probability
1370  *
1371  * Use of ilog2() decreases precision, we lower the LVL to 5 to compensate.
1372  */
1373 #define ENTROPY_LVL_ACEPTABLE           (65)
1374 #define ENTROPY_LVL_HIGH                (80)
1375
1376 /*
1377  * For increasead precision in shannon_entropy calculation,
1378  * let's do pow(n, M) to save more digits after comma:
1379  *
1380  * - maximum int bit length is 64
1381  * - ilog2(MAX_SAMPLE_SIZE)     -> 13
1382  * - 13 * 4 = 52 < 64           -> M = 4
1383  *
1384  * So use pow(n, 4).
1385  */
1386 static inline u32 ilog2_w(u64 n)
1387 {
1388         return ilog2(n * n * n * n);
1389 }
1390
1391 static u32 shannon_entropy(struct heuristic_ws *ws)
1392 {
1393         const u32 entropy_max = 8 * ilog2_w(2);
1394         u32 entropy_sum = 0;
1395         u32 p, p_base, sz_base;
1396         u32 i;
1397
1398         sz_base = ilog2_w(ws->sample_size);
1399         for (i = 0; i < BUCKET_SIZE && ws->bucket[i].count > 0; i++) {
1400                 p = ws->bucket[i].count;
1401                 p_base = ilog2_w(p);
1402                 entropy_sum += p * (sz_base - p_base);
1403         }
1404
1405         entropy_sum /= ws->sample_size;
1406         return entropy_sum * 100 / entropy_max;
1407 }
1408
1409 #define RADIX_BASE              4U
1410 #define COUNTERS_SIZE           (1U << RADIX_BASE)
1411
1412 static u8 get4bits(u64 num, int shift) {
1413         u8 low4bits;
1414
1415         num >>= shift;
1416         /* Reverse order */
1417         low4bits = (COUNTERS_SIZE - 1) - (num % COUNTERS_SIZE);
1418         return low4bits;
1419 }
1420
1421 /*
1422  * Use 4 bits as radix base
1423  * Use 16 u32 counters for calculating new position in buf array
1424  *
1425  * @array     - array that will be sorted
1426  * @array_buf - buffer array to store sorting results
1427  *              must be equal in size to @array
1428  * @num       - array size
1429  */
1430 static void radix_sort(struct bucket_item *array, struct bucket_item *array_buf,
1431                        int num)
1432 {
1433         u64 max_num;
1434         u64 buf_num;
1435         u32 counters[COUNTERS_SIZE];
1436         u32 new_addr;
1437         u32 addr;
1438         int bitlen;
1439         int shift;
1440         int i;
1441
1442         /*
1443          * Try avoid useless loop iterations for small numbers stored in big
1444          * counters.  Example: 48 33 4 ... in 64bit array
1445          */
1446         max_num = array[0].count;
1447         for (i = 1; i < num; i++) {
1448                 buf_num = array[i].count;
1449                 if (buf_num > max_num)
1450                         max_num = buf_num;
1451         }
1452
1453         buf_num = ilog2(max_num);
1454         bitlen = ALIGN(buf_num, RADIX_BASE * 2);
1455
1456         shift = 0;
1457         while (shift < bitlen) {
1458                 memset(counters, 0, sizeof(counters));
1459
1460                 for (i = 0; i < num; i++) {
1461                         buf_num = array[i].count;
1462                         addr = get4bits(buf_num, shift);
1463                         counters[addr]++;
1464                 }
1465
1466                 for (i = 1; i < COUNTERS_SIZE; i++)
1467                         counters[i] += counters[i - 1];
1468
1469                 for (i = num - 1; i >= 0; i--) {
1470                         buf_num = array[i].count;
1471                         addr = get4bits(buf_num, shift);
1472                         counters[addr]--;
1473                         new_addr = counters[addr];
1474                         array_buf[new_addr] = array[i];
1475                 }
1476
1477                 shift += RADIX_BASE;
1478
1479                 /*
1480                  * Normal radix expects to move data from a temporary array, to
1481                  * the main one.  But that requires some CPU time. Avoid that
1482                  * by doing another sort iteration to original array instead of
1483                  * memcpy()
1484                  */
1485                 memset(counters, 0, sizeof(counters));
1486
1487                 for (i = 0; i < num; i ++) {
1488                         buf_num = array_buf[i].count;
1489                         addr = get4bits(buf_num, shift);
1490                         counters[addr]++;
1491                 }
1492
1493                 for (i = 1; i < COUNTERS_SIZE; i++)
1494                         counters[i] += counters[i - 1];
1495
1496                 for (i = num - 1; i >= 0; i--) {
1497                         buf_num = array_buf[i].count;
1498                         addr = get4bits(buf_num, shift);
1499                         counters[addr]--;
1500                         new_addr = counters[addr];
1501                         array[new_addr] = array_buf[i];
1502                 }
1503
1504                 shift += RADIX_BASE;
1505         }
1506 }
1507
1508 /*
1509  * Size of the core byte set - how many bytes cover 90% of the sample
1510  *
1511  * There are several types of structured binary data that use nearly all byte
1512  * values. The distribution can be uniform and counts in all buckets will be
1513  * nearly the same (eg. encrypted data). Unlikely to be compressible.
1514  *
1515  * Other possibility is normal (Gaussian) distribution, where the data could
1516  * be potentially compressible, but we have to take a few more steps to decide
1517  * how much.
1518  *
1519  * @BYTE_CORE_SET_LOW  - main part of byte values repeated frequently,
1520  *                       compression algo can easy fix that
1521  * @BYTE_CORE_SET_HIGH - data have uniform distribution and with high
1522  *                       probability is not compressible
1523  */
1524 #define BYTE_CORE_SET_LOW               (64)
1525 #define BYTE_CORE_SET_HIGH              (200)
1526
1527 static int byte_core_set_size(struct heuristic_ws *ws)
1528 {
1529         u32 i;
1530         u32 coreset_sum = 0;
1531         const u32 core_set_threshold = ws->sample_size * 90 / 100;
1532         struct bucket_item *bucket = ws->bucket;
1533
1534         /* Sort in reverse order */
1535         radix_sort(ws->bucket, ws->bucket_b, BUCKET_SIZE);
1536
1537         for (i = 0; i < BYTE_CORE_SET_LOW; i++)
1538                 coreset_sum += bucket[i].count;
1539
1540         if (coreset_sum > core_set_threshold)
1541                 return i;
1542
1543         for (; i < BYTE_CORE_SET_HIGH && bucket[i].count > 0; i++) {
1544                 coreset_sum += bucket[i].count;
1545                 if (coreset_sum > core_set_threshold)
1546                         break;
1547         }
1548
1549         return i;
1550 }
1551
1552 /*
1553  * Count byte values in buckets.
1554  * This heuristic can detect textual data (configs, xml, json, html, etc).
1555  * Because in most text-like data byte set is restricted to limited number of
1556  * possible characters, and that restriction in most cases makes data easy to
1557  * compress.
1558  *
1559  * @BYTE_SET_THRESHOLD - consider all data within this byte set size:
1560  *      less - compressible
1561  *      more - need additional analysis
1562  */
1563 #define BYTE_SET_THRESHOLD              (64)
1564
1565 static u32 byte_set_size(const struct heuristic_ws *ws)
1566 {
1567         u32 i;
1568         u32 byte_set_size = 0;
1569
1570         for (i = 0; i < BYTE_SET_THRESHOLD; i++) {
1571                 if (ws->bucket[i].count > 0)
1572                         byte_set_size++;
1573         }
1574
1575         /*
1576          * Continue collecting count of byte values in buckets.  If the byte
1577          * set size is bigger then the threshold, it's pointless to continue,
1578          * the detection technique would fail for this type of data.
1579          */
1580         for (; i < BUCKET_SIZE; i++) {
1581                 if (ws->bucket[i].count > 0) {
1582                         byte_set_size++;
1583                         if (byte_set_size > BYTE_SET_THRESHOLD)
1584                                 return byte_set_size;
1585                 }
1586         }
1587
1588         return byte_set_size;
1589 }
1590
1591 static bool sample_repeated_patterns(struct heuristic_ws *ws)
1592 {
1593         const u32 half_of_sample = ws->sample_size / 2;
1594         const u8 *data = ws->sample;
1595
1596         return memcmp(&data[0], &data[half_of_sample], half_of_sample) == 0;
1597 }
1598
1599 static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
1600                                      struct heuristic_ws *ws)
1601 {
1602         struct page *page;
1603         u64 index, index_end;
1604         u32 i, curr_sample_pos;
1605         u8 *in_data;
1606
1607         /*
1608          * Compression handles the input data by chunks of 128KiB
1609          * (defined by BTRFS_MAX_UNCOMPRESSED)
1610          *
1611          * We do the same for the heuristic and loop over the whole range.
1612          *
1613          * MAX_SAMPLE_SIZE - calculated under assumption that heuristic will
1614          * process no more than BTRFS_MAX_UNCOMPRESSED at a time.
1615          */
1616         if (end - start > BTRFS_MAX_UNCOMPRESSED)
1617                 end = start + BTRFS_MAX_UNCOMPRESSED;
1618
1619         index = start >> PAGE_SHIFT;
1620         index_end = end >> PAGE_SHIFT;
1621
1622         /* Don't miss unaligned end */
1623         if (!IS_ALIGNED(end, PAGE_SIZE))
1624                 index_end++;
1625
1626         curr_sample_pos = 0;
1627         while (index < index_end) {
1628                 page = find_get_page(inode->i_mapping, index);
1629                 in_data = kmap_local_page(page);
1630                 /* Handle case where the start is not aligned to PAGE_SIZE */
1631                 i = start % PAGE_SIZE;
1632                 while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
1633                         /* Don't sample any garbage from the last page */
1634                         if (start > end - SAMPLING_READ_SIZE)
1635                                 break;
1636                         memcpy(&ws->sample[curr_sample_pos], &in_data[i],
1637                                         SAMPLING_READ_SIZE);
1638                         i += SAMPLING_INTERVAL;
1639                         start += SAMPLING_INTERVAL;
1640                         curr_sample_pos += SAMPLING_READ_SIZE;
1641                 }
1642                 kunmap_local(in_data);
1643                 put_page(page);
1644
1645                 index++;
1646         }
1647
1648         ws->sample_size = curr_sample_pos;
1649 }
1650
1651 /*
1652  * Compression heuristic.
1653  *
1654  * For now is's a naive and optimistic 'return true', we'll extend the logic to
1655  * quickly (compared to direct compression) detect data characteristics
1656  * (compressible/uncompressible) to avoid wasting CPU time on uncompressible
1657  * data.
1658  *
1659  * The following types of analysis can be performed:
1660  * - detect mostly zero data
1661  * - detect data with low "byte set" size (text, etc)
1662  * - detect data with low/high "core byte" set
1663  *
1664  * Return non-zero if the compression should be done, 0 otherwise.
1665  */
1666 int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end)
1667 {
1668         struct list_head *ws_list = get_workspace(0, 0);
1669         struct heuristic_ws *ws;
1670         u32 i;
1671         u8 byte;
1672         int ret = 0;
1673
1674         ws = list_entry(ws_list, struct heuristic_ws, list);
1675
1676         heuristic_collect_sample(inode, start, end, ws);
1677
1678         if (sample_repeated_patterns(ws)) {
1679                 ret = 1;
1680                 goto out;
1681         }
1682
1683         memset(ws->bucket, 0, sizeof(*ws->bucket)*BUCKET_SIZE);
1684
1685         for (i = 0; i < ws->sample_size; i++) {
1686                 byte = ws->sample[i];
1687                 ws->bucket[byte].count++;
1688         }
1689
1690         i = byte_set_size(ws);
1691         if (i < BYTE_SET_THRESHOLD) {
1692                 ret = 2;
1693                 goto out;
1694         }
1695
1696         i = byte_core_set_size(ws);
1697         if (i <= BYTE_CORE_SET_LOW) {
1698                 ret = 3;
1699                 goto out;
1700         }
1701
1702         if (i >= BYTE_CORE_SET_HIGH) {
1703                 ret = 0;
1704                 goto out;
1705         }
1706
1707         i = shannon_entropy(ws);
1708         if (i <= ENTROPY_LVL_ACEPTABLE) {
1709                 ret = 4;
1710                 goto out;
1711         }
1712
1713         /*
1714          * For the levels below ENTROPY_LVL_HIGH, additional analysis would be
1715          * needed to give green light to compression.
1716          *
1717          * For now just assume that compression at that level is not worth the
1718          * resources because:
1719          *
1720          * 1. it is possible to defrag the data later
1721          *
1722          * 2. the data would turn out to be hardly compressible, eg. 150 byte
1723          * values, every bucket has counter at level ~54. The heuristic would
1724          * be confused. This can happen when data have some internal repeated
1725          * patterns like "abbacbbc...". This can be detected by analyzing
1726          * pairs of bytes, which is too costly.
1727          */
1728         if (i < ENTROPY_LVL_HIGH) {
1729                 ret = 5;
1730                 goto out;
1731         } else {
1732                 ret = 0;
1733                 goto out;
1734         }
1735
1736 out:
1737         put_workspace(0, ws_list);
1738         return ret;
1739 }
1740
1741 /*
1742  * Convert the compression suffix (eg. after "zlib" starting with ":") to
1743  * level, unrecognized string will set the default level
1744  */
1745 unsigned int btrfs_compress_str2level(unsigned int type, const char *str)
1746 {
1747         unsigned int level = 0;
1748         int ret;
1749
1750         if (!type)
1751                 return 0;
1752
1753         if (str[0] == ':') {
1754                 ret = kstrtouint(str + 1, 10, &level);
1755                 if (ret)
1756                         level = 0;
1757         }
1758
1759         level = btrfs_compress_set_level(type, level);
1760
1761         return level;
1762 }