module/os/linux/zfs/abd_os.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or https://opensource.org/licenses/CDDL-1.0.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
  23  * Copyright (c) 2019 by Delphix. All rights reserved.
  24  * Copyright (c) 2023, 2024, Klara Inc.
  25  */
  26
  27 /*
  28  * See abd.c for a general overview of the arc buffered data (ABD).
  29  *
  30  * Linear buffers act exactly like normal buffers and are always mapped into the
  31  * kernel's virtual memory space, while scattered ABD data chunks are allocated
  32  * as physical pages and then mapped in only while they are actually being
  33  * accessed through one of the abd_* library functions. Using scattered ABDs
  34  * provides several benefits:
  35  *
  36  *  (1) They avoid use of kmem_*, preventing performance problems where running
  37  *      kmem_reap on very large memory systems never finishes and causes
  38  *      constant TLB shootdowns.
  39  *
  40  *  (2) Fragmentation is less of an issue since when we are at the limit of
  41  *      allocatable space, we won't have to search around for a long free
  42  *      hole in the VA space for large ARC allocations. Each chunk is mapped in
  43  *      individually, so even if we are using HIGHMEM (see next point) we
  44  *      wouldn't need to worry about finding a contiguous address range.
  45  *
  46  *  (3) If we are not using HIGHMEM, then all physical memory is always
  47  *      mapped into the kernel's address space, so we also avoid the map /
  48  *      unmap costs on each ABD access.
  49  *
  50  * If we are not using HIGHMEM, scattered buffers which have only one chunk
  51  * can be treated as linear buffers, because they are contiguous in the
  52  * kernel's virtual address space.  See abd_alloc_chunks() for details.
  53  */
  54
  55 #include <sys/abd_impl.h>
  56 #include <sys/param.h>
  57 #include <sys/zio.h>
  58 #include <sys/arc.h>
  59 #include <sys/zfs_context.h>
  60 #include <sys/zfs_znode.h>
  61 #ifdef _KERNEL
  62 #include <linux/kmap_compat.h>
  63 #include <linux/mm_compat.h>
  64 #include <linux/scatterlist.h>
  65 #include <linux/version.h>
  66 #endif
  67
  68 #ifdef _KERNEL
  69 #if defined(MAX_ORDER)
  70 #define ABD_MAX_ORDER   (MAX_ORDER)
  71 #elif defined(MAX_PAGE_ORDER)
  72 #define ABD_MAX_ORDER   (MAX_PAGE_ORDER)
  73 #endif
  74 #else
  75 #define ABD_MAX_ORDER   (1)
  76 #endif
  77
  78 typedef struct abd_stats {
  79         kstat_named_t abdstat_struct_size;
  80         kstat_named_t abdstat_linear_cnt;
  81         kstat_named_t abdstat_linear_data_size;
  82         kstat_named_t abdstat_scatter_cnt;
  83         kstat_named_t abdstat_scatter_data_size;
  84         kstat_named_t abdstat_scatter_chunk_waste;
  85         kstat_named_t abdstat_scatter_orders[ABD_MAX_ORDER];
  86         kstat_named_t abdstat_scatter_page_multi_chunk;
  87         kstat_named_t abdstat_scatter_page_multi_zone;
  88         kstat_named_t abdstat_scatter_page_alloc_retry;
  89         kstat_named_t abdstat_scatter_sg_table_retry;
  90 } abd_stats_t;
  91
  92 static abd_stats_t abd_stats = {
  93         /* Amount of memory occupied by all of the abd_t struct allocations */
  94         { "struct_size",                        KSTAT_DATA_UINT64 },
  95         /*
  96          * The number of linear ABDs which are currently allocated, excluding
  97          * ABDs which don't own their data (for instance the ones which were
  98          * allocated through abd_get_offset() and abd_get_from_buf()). If an
  99          * ABD takes ownership of its buf then it will become tracked.
 100          */
 101         { "linear_cnt",                         KSTAT_DATA_UINT64 },
 102         /* Amount of data stored in all linear ABDs tracked by linear_cnt */
 103         { "linear_data_size",                   KSTAT_DATA_UINT64 },
 104         /*
 105          * The number of scatter ABDs which are currently allocated, excluding
 106          * ABDs which don't own their data (for instance the ones which were
 107          * allocated through abd_get_offset()).
 108          */
 109         { "scatter_cnt",                        KSTAT_DATA_UINT64 },
 110         /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
 111         { "scatter_data_size",                  KSTAT_DATA_UINT64 },
 112         /*
 113          * The amount of space wasted at the end of the last chunk across all
 114          * scatter ABDs tracked by scatter_cnt.
 115          */
 116         { "scatter_chunk_waste",                KSTAT_DATA_UINT64 },
 117         /*
 118          * The number of compound allocations of a given order.  These
 119          * allocations are spread over all currently allocated ABDs, and
 120          * act as a measure of memory fragmentation.
 121          */
 122         { { "scatter_order_N",                  KSTAT_DATA_UINT64 } },
 123         /*
 124          * The number of scatter ABDs which contain multiple chunks.
 125          * ABDs are preferentially allocated from the minimum number of
 126          * contiguous multi-page chunks, a single chunk is optimal.
 127          */
 128         { "scatter_page_multi_chunk",           KSTAT_DATA_UINT64 },
 129         /*
 130          * The number of scatter ABDs which are split across memory zones.
 131          * ABDs are preferentially allocated using pages from a single zone.
 132          */
 133         { "scatter_page_multi_zone",            KSTAT_DATA_UINT64 },
 134         /*
 135          *  The total number of retries encountered when attempting to
 136          *  allocate the pages to populate the scatter ABD.
 137          */
 138         { "scatter_page_alloc_retry",           KSTAT_DATA_UINT64 },
 139         /*
 140          *  The total number of retries encountered when attempting to
 141          *  allocate the sg table for an ABD.
 142          */
 143         { "scatter_sg_table_retry",             KSTAT_DATA_UINT64 },
 144 };
 145
 146 static struct {
 147         wmsum_t abdstat_struct_size;
 148         wmsum_t abdstat_linear_cnt;
 149         wmsum_t abdstat_linear_data_size;
 150         wmsum_t abdstat_scatter_cnt;
 151         wmsum_t abdstat_scatter_data_size;
 152         wmsum_t abdstat_scatter_chunk_waste;
 153         wmsum_t abdstat_scatter_orders[ABD_MAX_ORDER];
 154         wmsum_t abdstat_scatter_page_multi_chunk;
 155         wmsum_t abdstat_scatter_page_multi_zone;
 156         wmsum_t abdstat_scatter_page_alloc_retry;
 157         wmsum_t abdstat_scatter_sg_table_retry;
 158 } abd_sums;
 159
 160 #define abd_for_each_sg(abd, sg, n, i)  \
 161         for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
 162
 163 /*
 164  * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
 165  * ABD's.  Smaller allocations will use linear ABD's which uses
 166  * zio_[data_]buf_alloc().
 167  *
 168  * Scatter ABD's use at least one page each, so sub-page allocations waste
 169  * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
 170  * half of each page).  Using linear ABD's for small allocations means that
 171  * they will be put on slabs which contain many allocations.  This can
 172  * improve memory efficiency, but it also makes it much harder for ARC
 173  * evictions to actually free pages, because all the buffers on one slab need
 174  * to be freed in order for the slab (and underlying pages) to be freed.
 175  * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
 176  * possible for them to actually waste more memory than scatter (one page per
 177  * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
 178  *
 179  * Spill blocks are typically 512B and are heavily used on systems running
 180  * selinux with the default dnode size and the `xattr=sa` property set.
 181  *
 182  * By default we use linear allocations for 512B and 1KB, and scatter
 183  * allocations for larger (1.5KB and up).
 184  */
 185 static int zfs_abd_scatter_min_size = 512 * 3;
 186
 187 /*
 188  * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose pages are
 189  * just a single zero'd page. This allows us to conserve memory by
 190  * only using a single zero page for the scatterlist.
 191  */
 192 abd_t *abd_zero_scatter = NULL;
 193
 194 struct page;
 195 /*
 196  * _KERNEL   - Will point to ZERO_PAGE if it is available or it will be
 197  *             an allocated zero'd PAGESIZE buffer.
 198  * Userspace - Will be an allocated zero'ed PAGESIZE buffer.
 199  *
 200  * abd_zero_page is assigned to each of the pages of abd_zero_scatter.
 201  */
 202 static struct page *abd_zero_page = NULL;
 203
 204 static kmem_cache_t *abd_cache = NULL;
 205 static kstat_t *abd_ksp;
 206
 207 static uint_t
 208 abd_chunkcnt_for_bytes(size_t size)
 209 {
 210         return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
 211 }
 212
 213 abd_t *
 214 abd_alloc_struct_impl(size_t size)
 215 {
 216         /*
 217          * In Linux we do not use the size passed in during ABD
 218          * allocation, so we just ignore it.
 219          */
 220         (void) size;
 221         abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
 222         ASSERT3P(abd, !=, NULL);
 223         ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
 224
 225         return (abd);
 226 }
 227
 228 void
 229 abd_free_struct_impl(abd_t *abd)
 230 {
 231         kmem_cache_free(abd_cache, abd);
 232         ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t));
 233 }
 234
 235 #ifdef _KERNEL
 236 static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1;
 237
 238 /*
 239  * Mark zfs data pages so they can be excluded from kernel crash dumps
 240  */
 241 #ifdef _LP64
 242 #define ABD_FILE_CACHE_PAGE     0x2F5ABDF11ECAC4E
 243
 244 static inline void
 245 abd_mark_zfs_page(struct page *page)
 246 {
 247         get_page(page);
 248         SetPagePrivate(page);
 249         set_page_private(page, ABD_FILE_CACHE_PAGE);
 250 }
 251
 252 static inline void
 253 abd_unmark_zfs_page(struct page *page)
 254 {
 255         set_page_private(page, 0UL);
 256         ClearPagePrivate(page);
 257         put_page(page);
 258 }
 259 #else
 260 #define abd_mark_zfs_page(page)
 261 #define abd_unmark_zfs_page(page)
 262 #endif /* _LP64 */
 263
 264 #ifndef CONFIG_HIGHMEM
 265
 266 #ifndef __GFP_RECLAIM
 267 #define __GFP_RECLAIM           __GFP_WAIT
 268 #endif
 269
 270 /*
 271  * The goal is to minimize fragmentation by preferentially populating ABDs
 272  * with higher order compound pages from a single zone.  Allocation size is
 273  * progressively decreased until it can be satisfied without performing
 274  * reclaim or compaction.  When necessary this function will degenerate to
 275  * allocating individual pages and allowing reclaim to satisfy allocations.
 276  */
 277 void
 278 abd_alloc_chunks(abd_t *abd, size_t size)
 279 {
 280         struct list_head pages;
 281         struct sg_table table;
 282         struct scatterlist *sg;
 283         struct page *page, *tmp_page = NULL;
 284         gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
 285         gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
 286         unsigned int max_order = MIN(zfs_abd_scatter_max_order,
 287             ABD_MAX_ORDER - 1);
 288         unsigned int nr_pages = abd_chunkcnt_for_bytes(size);
 289         unsigned int chunks = 0, zones = 0;
 290         size_t remaining_size;
 291         int nid = NUMA_NO_NODE;
 292         unsigned int alloc_pages = 0;
 293
 294         INIT_LIST_HEAD(&pages);
 295
 296         ASSERT3U(alloc_pages, <, nr_pages);
 297
 298         while (alloc_pages < nr_pages) {
 299                 unsigned int chunk_pages;
 300                 unsigned int order;
 301
 302                 order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
 303                 chunk_pages = (1U << order);
 304
 305                 page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
 306                 if (page == NULL) {
 307                         if (order == 0) {
 308                                 ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
 309                                 schedule_timeout_interruptible(1);
 310                         } else {
 311                                 max_order = MAX(0, order - 1);
 312                         }
 313                         continue;
 314                 }
 315
 316                 list_add_tail(&page->lru, &pages);
 317
 318                 if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
 319                         zones++;
 320
 321                 nid = page_to_nid(page);
 322                 ABDSTAT_BUMP(abdstat_scatter_orders[order]);
 323                 chunks++;
 324                 alloc_pages += chunk_pages;
 325         }
 326
 327         ASSERT3S(alloc_pages, ==, nr_pages);
 328
 329         while (sg_alloc_table(&table, chunks, gfp)) {
 330                 ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
 331                 schedule_timeout_interruptible(1);
 332         }
 333
 334         sg = table.sgl;
 335         remaining_size = size;
 336         list_for_each_entry_safe(page, tmp_page, &pages, lru) {
 337                 size_t sg_size = MIN(PAGESIZE << compound_order(page),
 338                     remaining_size);
 339                 sg_set_page(sg, page, sg_size, 0);
 340                 abd_mark_zfs_page(page);
 341                 remaining_size -= sg_size;
 342
 343                 sg = sg_next(sg);
 344                 list_del(&page->lru);
 345         }
 346
 347         /*
 348          * These conditions ensure that a possible transformation to a linear
 349          * ABD would be valid.
 350          */
 351         ASSERT(!PageHighMem(sg_page(table.sgl)));
 352         ASSERT0(ABD_SCATTER(abd).abd_offset);
 353
 354         if (table.nents == 1) {
 355                 /*
 356                  * Since there is only one entry, this ABD can be represented
 357                  * as a linear buffer.  All single-page (4K) ABD's can be
 358                  * represented this way.  Some multi-page ABD's can also be
 359                  * represented this way, if we were able to allocate a single
 360                  * "chunk" (higher-order "page" which represents a power-of-2
 361                  * series of physically-contiguous pages).  This is often the
 362                  * case for 2-page (8K) ABD's.
 363                  *
 364                  * Representing a single-entry scatter ABD as a linear ABD
 365                  * has the performance advantage of avoiding the copy (and
 366                  * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
 367                  * A performance increase of around 5% has been observed for
 368                  * ARC-cached reads (of small blocks which can take advantage
 369                  * of this).
 370                  *
 371                  * Note that this optimization is only possible because the
 372                  * pages are always mapped into the kernel's address space.
 373                  * This is not the case for highmem pages, so the
 374                  * optimization can not be made there.
 375                  */
 376                 abd->abd_flags |= ABD_FLAG_LINEAR;
 377                 abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
 378                 abd->abd_u.abd_linear.abd_sgl = table.sgl;
 379                 ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl));
 380         } else if (table.nents > 1) {
 381                 ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 382                 abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 383
 384                 if (zones) {
 385                         ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
 386                         abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
 387                 }
 388
 389                 ABD_SCATTER(abd).abd_sgl = table.sgl;
 390                 ABD_SCATTER(abd).abd_nents = table.nents;
 391         }
 392 }
 393 #else
 394
 395 /*
 396  * Allocate N individual pages to construct a scatter ABD.  This function
 397  * makes no attempt to request contiguous pages and requires the minimal
 398  * number of kernel interfaces.  It's designed for maximum compatibility.
 399  */
 400 void
 401 abd_alloc_chunks(abd_t *abd, size_t size)
 402 {
 403         struct scatterlist *sg = NULL;
 404         struct sg_table table;
 405         struct page *page;
 406         gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
 407         int nr_pages = abd_chunkcnt_for_bytes(size);
 408         int i = 0;
 409
 410         while (sg_alloc_table(&table, nr_pages, gfp)) {
 411                 ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
 412                 schedule_timeout_interruptible(1);
 413         }
 414
 415         ASSERT3U(table.nents, ==, nr_pages);
 416         ABD_SCATTER(abd).abd_sgl = table.sgl;
 417         ABD_SCATTER(abd).abd_nents = nr_pages;
 418
 419         abd_for_each_sg(abd, sg, nr_pages, i) {
 420                 while ((page = __page_cache_alloc(gfp)) == NULL) {
 421                         ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
 422                         schedule_timeout_interruptible(1);
 423                 }
 424
 425                 ABDSTAT_BUMP(abdstat_scatter_orders[0]);
 426                 sg_set_page(sg, page, PAGESIZE, 0);
 427                 abd_mark_zfs_page(page);
 428         }
 429
 430         if (nr_pages > 1) {
 431                 ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 432                 abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 433         }
 434 }
 435 #endif /* !CONFIG_HIGHMEM */
 436
 437 /*
 438  * This must be called if any of the sg_table allocation functions
 439  * are called.
 440  */
 441 static void
 442 abd_free_sg_table(abd_t *abd)
 443 {
 444         struct sg_table table;
 445
 446         table.sgl = ABD_SCATTER(abd).abd_sgl;
 447         table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents;
 448         sg_free_table(&table);
 449 }
 450
 451 void
 452 abd_free_chunks(abd_t *abd)
 453 {
 454         struct scatterlist *sg = NULL;
 455         struct page *page;
 456         int nr_pages = ABD_SCATTER(abd).abd_nents;
 457         int order, i = 0;
 458
 459         if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
 460                 ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);
 461
 462         if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
 463                 ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
 464
 465         abd_for_each_sg(abd, sg, nr_pages, i) {
 466                 page = sg_page(sg);
 467                 abd_unmark_zfs_page(page);
 468                 order = compound_order(page);
 469                 __free_pages(page, order);
 470                 ASSERT3U(sg->length, <=, PAGE_SIZE << order);
 471                 ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
 472         }
 473         abd_free_sg_table(abd);
 474 }
 475
 476 /*
 477  * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in
 478  * the scatterlist will be set to the zero'd out buffer abd_zero_page.
 479  */
 480 static void
 481 abd_alloc_zero_scatter(void)
 482 {
 483         struct scatterlist *sg = NULL;
 484         struct sg_table table;
 485         gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
 486         int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
 487         int i = 0;
 488
 489 #if defined(HAVE_ZERO_PAGE_GPL_ONLY)
 490         gfp_t gfp_zero_page = gfp | __GFP_ZERO;
 491         while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) {
 492                 ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
 493                 schedule_timeout_interruptible(1);
 494         }
 495         abd_mark_zfs_page(abd_zero_page);
 496 #else
 497         abd_zero_page = ZERO_PAGE(0);
 498 #endif /* HAVE_ZERO_PAGE_GPL_ONLY */
 499
 500         while (sg_alloc_table(&table, nr_pages, gfp)) {
 501                 ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
 502                 schedule_timeout_interruptible(1);
 503         }
 504         ASSERT3U(table.nents, ==, nr_pages);
 505
 506         abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
 507         abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
 508         ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
 509         ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl;
 510         ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
 511         abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
 512         abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
 513
 514         abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
 515                 sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
 516         }
 517
 518         ABDSTAT_BUMP(abdstat_scatter_cnt);
 519         ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
 520         ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 521 }
 522
 523 #else /* _KERNEL */
 524
 525 #ifndef PAGE_SHIFT
 526 #define PAGE_SHIFT (highbit64(PAGESIZE)-1)
 527 #endif
 528
 529 #define zfs_kmap_atomic(chunk)          ((void *)chunk)
 530 #define zfs_kunmap_atomic(addr)         do { (void)(addr); } while (0)
 531 #define local_irq_save(flags)           do { (void)(flags); } while (0)
 532 #define local_irq_restore(flags)        do { (void)(flags); } while (0)
 533 #define nth_page(pg, i) \
 534         ((struct page *)((void *)(pg) + (i) * PAGESIZE))
 535
 536 struct scatterlist {
 537         struct page *page;
 538         int length;
 539         int end;
 540 };
 541
 542 static void
 543 sg_init_table(struct scatterlist *sg, int nr)
 544 {
 545         memset(sg, 0, nr * sizeof (struct scatterlist));
 546         sg[nr - 1].end = 1;
 547 }
 548
 549 /*
 550  * This must be called if any of the sg_table allocation functions
 551  * are called.
 552  */
 553 static void
 554 abd_free_sg_table(abd_t *abd)
 555 {
 556         int nents = ABD_SCATTER(abd).abd_nents;
 557         vmem_free(ABD_SCATTER(abd).abd_sgl,
 558             nents * sizeof (struct scatterlist));
 559 }
 560
 561 #define for_each_sg(sgl, sg, nr, i)     \
 562         for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
 563
 564 static inline void
 565 sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
 566     unsigned int offset)
 567 {
 568         /* currently we don't use offset */
 569         ASSERT(offset == 0);
 570         sg->page = page;
 571         sg->length = len;
 572 }
 573
 574 static inline struct page *
 575 sg_page(struct scatterlist *sg)
 576 {
 577         return (sg->page);
 578 }
 579
 580 static inline struct scatterlist *
 581 sg_next(struct scatterlist *sg)
 582 {
 583         if (sg->end)
 584                 return (NULL);
 585
 586         return (sg + 1);
 587 }
 588
 589 void
 590 abd_alloc_chunks(abd_t *abd, size_t size)
 591 {
 592         unsigned nr_pages = abd_chunkcnt_for_bytes(size);
 593         struct scatterlist *sg;
 594         int i;
 595
 596         ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
 597             sizeof (struct scatterlist), KM_SLEEP);
 598         sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
 599
 600         abd_for_each_sg(abd, sg, nr_pages, i) {
 601                 struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
 602                 sg_set_page(sg, p, PAGESIZE, 0);
 603         }
 604         ABD_SCATTER(abd).abd_nents = nr_pages;
 605 }
 606
 607 void
 608 abd_free_chunks(abd_t *abd)
 609 {
 610         int i, n = ABD_SCATTER(abd).abd_nents;
 611         struct scatterlist *sg;
 612
 613         abd_for_each_sg(abd, sg, n, i) {
 614                 struct page *p = nth_page(sg_page(sg), 0);
 615                 umem_free_aligned(p, PAGESIZE);
 616         }
 617         abd_free_sg_table(abd);
 618 }
 619
 620 static void
 621 abd_alloc_zero_scatter(void)
 622 {
 623         unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
 624         struct scatterlist *sg;
 625         int i;
 626
 627         abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
 628         memset(abd_zero_page, 0, PAGESIZE);
 629         abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
 630         abd_zero_scatter->abd_flags |= ABD_FLAG_OWNER;
 631         abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS;
 632         ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
 633         ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages;
 634         abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
 635         ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages *
 636             sizeof (struct scatterlist), KM_SLEEP);
 637
 638         sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages);
 639
 640         abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) {
 641                 sg_set_page(sg, abd_zero_page, PAGESIZE, 0);
 642         }
 643
 644         ABDSTAT_BUMP(abdstat_scatter_cnt);
 645         ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE);
 646         ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 647 }
 648
 649 #endif /* _KERNEL */
 650
 651 boolean_t
 652 abd_size_alloc_linear(size_t size)
 653 {
 654         return (!zfs_abd_scatter_enabled || size < zfs_abd_scatter_min_size);
 655 }
 656
 657 void
 658 abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op)
 659 {
 660         ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
 661         int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size;
 662         if (op == ABDSTAT_INCR) {
 663                 ABDSTAT_BUMP(abdstat_scatter_cnt);
 664                 ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size);
 665                 ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste);
 666                 arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE);
 667         } else {
 668                 ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
 669                 ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
 670                 ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste);
 671                 arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE);
 672         }
 673 }
 674
 675 void
 676 abd_update_linear_stats(abd_t *abd, abd_stats_op_t op)
 677 {
 678         ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR);
 679         if (op == ABDSTAT_INCR) {
 680                 ABDSTAT_BUMP(abdstat_linear_cnt);
 681                 ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
 682         } else {
 683                 ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
 684                 ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
 685         }
 686 }
 687
 688 void
 689 abd_verify_scatter(abd_t *abd)
 690 {
 691         size_t n;
 692         int i = 0;
 693         struct scatterlist *sg = NULL;
 694
 695         ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
 696         ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
 697             ABD_SCATTER(abd).abd_sgl->length);
 698         n = ABD_SCATTER(abd).abd_nents;
 699         abd_for_each_sg(abd, sg, n, i) {
 700                 ASSERT3P(sg_page(sg), !=, NULL);
 701         }
 702 }
 703
 704 static void
 705 abd_free_zero_scatter(void)
 706 {
 707         ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
 708         ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE);
 709         ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
 710
 711         abd_free_sg_table(abd_zero_scatter);
 712         abd_free_struct(abd_zero_scatter);
 713         abd_zero_scatter = NULL;
 714         ASSERT3P(abd_zero_page, !=, NULL);
 715 #if defined(_KERNEL)
 716 #if defined(HAVE_ZERO_PAGE_GPL_ONLY)
 717         abd_unmark_zfs_page(abd_zero_page);
 718         __free_page(abd_zero_page);
 719 #endif /* HAVE_ZERO_PAGE_GPL_ONLY */
 720 #else
 721         umem_free_aligned(abd_zero_page, PAGESIZE);
 722 #endif /* _KERNEL */
 723 }
 724
 725 static int
 726 abd_kstats_update(kstat_t *ksp, int rw)
 727 {
 728         abd_stats_t *as = ksp->ks_data;
 729
 730         if (rw == KSTAT_WRITE)
 731                 return (EACCES);
 732         as->abdstat_struct_size.value.ui64 =
 733             wmsum_value(&abd_sums.abdstat_struct_size);
 734         as->abdstat_linear_cnt.value.ui64 =
 735             wmsum_value(&abd_sums.abdstat_linear_cnt);
 736         as->abdstat_linear_data_size.value.ui64 =
 737             wmsum_value(&abd_sums.abdstat_linear_data_size);
 738         as->abdstat_scatter_cnt.value.ui64 =
 739             wmsum_value(&abd_sums.abdstat_scatter_cnt);
 740         as->abdstat_scatter_data_size.value.ui64 =
 741             wmsum_value(&abd_sums.abdstat_scatter_data_size);
 742         as->abdstat_scatter_chunk_waste.value.ui64 =
 743             wmsum_value(&abd_sums.abdstat_scatter_chunk_waste);
 744         for (int i = 0; i < ABD_MAX_ORDER; i++) {
 745                 as->abdstat_scatter_orders[i].value.ui64 =
 746                     wmsum_value(&abd_sums.abdstat_scatter_orders[i]);
 747         }
 748         as->abdstat_scatter_page_multi_chunk.value.ui64 =
 749             wmsum_value(&abd_sums.abdstat_scatter_page_multi_chunk);
 750         as->abdstat_scatter_page_multi_zone.value.ui64 =
 751             wmsum_value(&abd_sums.abdstat_scatter_page_multi_zone);
 752         as->abdstat_scatter_page_alloc_retry.value.ui64 =
 753             wmsum_value(&abd_sums.abdstat_scatter_page_alloc_retry);
 754         as->abdstat_scatter_sg_table_retry.value.ui64 =
 755             wmsum_value(&abd_sums.abdstat_scatter_sg_table_retry);
 756         return (0);
 757 }
 758
 759 void
 760 abd_init(void)
 761 {
 762         int i;
 763
 764         abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
 765             0, NULL, NULL, NULL, NULL, NULL, 0);
 766
 767         wmsum_init(&abd_sums.abdstat_struct_size, 0);
 768         wmsum_init(&abd_sums.abdstat_linear_cnt, 0);
 769         wmsum_init(&abd_sums.abdstat_linear_data_size, 0);
 770         wmsum_init(&abd_sums.abdstat_scatter_cnt, 0);
 771         wmsum_init(&abd_sums.abdstat_scatter_data_size, 0);
 772         wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0);
 773         for (i = 0; i < ABD_MAX_ORDER; i++)
 774                 wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0);
 775         wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0);
 776         wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0);
 777         wmsum_init(&abd_sums.abdstat_scatter_page_alloc_retry, 0);
 778         wmsum_init(&abd_sums.abdstat_scatter_sg_table_retry, 0);
 779
 780         abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
 781             sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
 782         if (abd_ksp != NULL) {
 783                 for (i = 0; i < ABD_MAX_ORDER; i++) {
 784                         snprintf(abd_stats.abdstat_scatter_orders[i].name,
 785                             KSTAT_STRLEN, "scatter_order_%d", i);
 786                         abd_stats.abdstat_scatter_orders[i].data_type =
 787                             KSTAT_DATA_UINT64;
 788                 }
 789                 abd_ksp->ks_data = &abd_stats;
 790                 abd_ksp->ks_update = abd_kstats_update;
 791                 kstat_install(abd_ksp);
 792         }
 793
 794         abd_alloc_zero_scatter();
 795 }
 796
 797 void
 798 abd_fini(void)
 799 {
 800         abd_free_zero_scatter();
 801
 802         if (abd_ksp != NULL) {
 803                 kstat_delete(abd_ksp);
 804                 abd_ksp = NULL;
 805         }
 806
 807         wmsum_fini(&abd_sums.abdstat_struct_size);
 808         wmsum_fini(&abd_sums.abdstat_linear_cnt);
 809         wmsum_fini(&abd_sums.abdstat_linear_data_size);
 810         wmsum_fini(&abd_sums.abdstat_scatter_cnt);
 811         wmsum_fini(&abd_sums.abdstat_scatter_data_size);
 812         wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste);
 813         for (int i = 0; i < ABD_MAX_ORDER; i++)
 814                 wmsum_fini(&abd_sums.abdstat_scatter_orders[i]);
 815         wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk);
 816         wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone);
 817         wmsum_fini(&abd_sums.abdstat_scatter_page_alloc_retry);
 818         wmsum_fini(&abd_sums.abdstat_scatter_sg_table_retry);
 819
 820         if (abd_cache) {
 821                 kmem_cache_destroy(abd_cache);
 822                 abd_cache = NULL;
 823         }
 824 }
 825
 826 void
 827 abd_free_linear_page(abd_t *abd)
 828 {
 829         /* Transform it back into a scatter ABD for freeing */
 830         struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
 831         abd->abd_flags &= ~ABD_FLAG_LINEAR;
 832         abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
 833         ABD_SCATTER(abd).abd_nents = 1;
 834         ABD_SCATTER(abd).abd_offset = 0;
 835         ABD_SCATTER(abd).abd_sgl = sg;
 836         abd_free_chunks(abd);
 837
 838         abd_update_scatter_stats(abd, ABDSTAT_DECR);
 839 }
 840
 841 /*
 842  * If we're going to use this ABD for doing I/O using the block layer, the
 843  * consumer of the ABD data doesn't care if it's scattered or not, and we don't
 844  * plan to store this ABD in memory for a long period of time, we should
 845  * allocate the ABD type that requires the least data copying to do the I/O.
 846  *
 847  * On Linux the optimal thing to do would be to use abd_get_offset() and
 848  * construct a new ABD which shares the original pages thereby eliminating
 849  * the copy.  But for the moment a new linear ABD is allocated until this
 850  * performance optimization can be implemented.
 851  */
 852 abd_t *
 853 abd_alloc_for_io(size_t size, boolean_t is_metadata)
 854 {
 855         return (abd_alloc(size, is_metadata));
 856 }
 857
 858 abd_t *
 859 abd_get_offset_scatter(abd_t *abd, abd_t *sabd, size_t off,
 860     size_t size)
 861 {
 862         (void) size;
 863         int i = 0;
 864         struct scatterlist *sg = NULL;
 865
 866         abd_verify(sabd);
 867         ASSERT3U(off, <=, sabd->abd_size);
 868
 869         size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
 870
 871         if (abd == NULL)
 872                 abd = abd_alloc_struct(0);
 873
 874         /*
 875          * Even if this buf is filesystem metadata, we only track that
 876          * if we own the underlying data buffer, which is not true in
 877          * this case. Therefore, we don't ever use ABD_FLAG_META here.
 878          */
 879
 880         abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
 881                 if (new_offset < sg->length)
 882                         break;
 883                 new_offset -= sg->length;
 884         }
 885
 886         ABD_SCATTER(abd).abd_sgl = sg;
 887         ABD_SCATTER(abd).abd_offset = new_offset;
 888         ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
 889
 890         return (abd);
 891 }
 892
 893 /*
 894  * Initialize the abd_iter.
 895  */
 896 void
 897 abd_iter_init(struct abd_iter *aiter, abd_t *abd)
 898 {
 899         ASSERT(!abd_is_gang(abd));
 900         abd_verify(abd);
 901         memset(aiter, 0, sizeof (struct abd_iter));
 902         aiter->iter_abd = abd;
 903         if (!abd_is_linear(abd)) {
 904                 aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
 905                 aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
 906         }
 907 }
 908
 909 /*
 910  * This is just a helper function to see if we have exhausted the
 911  * abd_iter and reached the end.
 912  */
 913 boolean_t
 914 abd_iter_at_end(struct abd_iter *aiter)
 915 {
 916         ASSERT3U(aiter->iter_pos, <=, aiter->iter_abd->abd_size);
 917         return (aiter->iter_pos == aiter->iter_abd->abd_size);
 918 }
 919
 920 /*
 921  * Advance the iterator by a certain amount. Cannot be called when a chunk is
 922  * in use. This can be safely called when the aiter has already exhausted, in
 923  * which case this does nothing.
 924  */
 925 void
 926 abd_iter_advance(struct abd_iter *aiter, size_t amount)
 927 {
 928         /*
 929          * Ensure that last chunk is not in use. abd_iterate_*() must clear
 930          * this state (directly or abd_iter_unmap()) before advancing.
 931          */
 932         ASSERT3P(aiter->iter_mapaddr, ==, NULL);
 933         ASSERT0(aiter->iter_mapsize);
 934         ASSERT3P(aiter->iter_page, ==, NULL);
 935         ASSERT0(aiter->iter_page_doff);
 936         ASSERT0(aiter->iter_page_dsize);
 937
 938         /* There's nothing left to advance to, so do nothing */
 939         if (abd_iter_at_end(aiter))
 940                 return;
 941
 942         aiter->iter_pos += amount;
 943         aiter->iter_offset += amount;
 944         if (!abd_is_linear(aiter->iter_abd)) {
 945                 while (aiter->iter_offset >= aiter->iter_sg->length) {
 946                         aiter->iter_offset -= aiter->iter_sg->length;
 947                         aiter->iter_sg = sg_next(aiter->iter_sg);
 948                         if (aiter->iter_sg == NULL) {
 949                                 ASSERT0(aiter->iter_offset);
 950                                 break;
 951                         }
 952                 }
 953         }
 954 }
 955
 956 /*
 957  * Map the current chunk into aiter. This can be safely called when the aiter
 958  * has already exhausted, in which case this does nothing.
 959  */
 960 void
 961 abd_iter_map(struct abd_iter *aiter)
 962 {
 963         void *paddr;
 964         size_t offset = 0;
 965
 966         ASSERT3P(aiter->iter_mapaddr, ==, NULL);
 967         ASSERT0(aiter->iter_mapsize);
 968
 969         /* There's nothing left to iterate over, so do nothing */
 970         if (abd_iter_at_end(aiter))
 971                 return;
 972
 973         if (abd_is_linear(aiter->iter_abd)) {
 974                 ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
 975                 offset = aiter->iter_offset;
 976                 aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
 977                 paddr = ABD_LINEAR_BUF(aiter->iter_abd);
 978         } else {
 979                 offset = aiter->iter_offset;
 980                 aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
 981                     aiter->iter_abd->abd_size - aiter->iter_pos);
 982
 983                 paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg));
 984         }
 985
 986         aiter->iter_mapaddr = (char *)paddr + offset;
 987 }
 988
 989 /*
 990  * Unmap the current chunk from aiter. This can be safely called when the aiter
 991  * has already exhausted, in which case this does nothing.
 992  */
 993 void
 994 abd_iter_unmap(struct abd_iter *aiter)
 995 {
 996         /* There's nothing left to unmap, so do nothing */
 997         if (abd_iter_at_end(aiter))
 998                 return;
 999
1000         if (!abd_is_linear(aiter->iter_abd)) {
1001                 /* LINTED E_FUNC_SET_NOT_USED */
1002                 zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset);
1003         }
1004
1005         ASSERT3P(aiter->iter_mapaddr, !=, NULL);
1006         ASSERT3U(aiter->iter_mapsize, >, 0);
1007
1008         aiter->iter_mapaddr = NULL;
1009         aiter->iter_mapsize = 0;
1010 }
1011
1012 void
1013 abd_cache_reap_now(void)
1014 {
1015 }
1016
1017 #if defined(_KERNEL)
1018 /*
1019  * Yield the next page struct and data offset and size within it, without
1020  * mapping it into the address space.
1021  */
1022 void
1023 abd_iter_page(struct abd_iter *aiter)
1024 {
1025         if (abd_iter_at_end(aiter)) {
1026                 aiter->iter_page = NULL;
1027                 aiter->iter_page_doff = 0;
1028                 aiter->iter_page_dsize = 0;
1029                 return;
1030         }
1031
1032         struct page *page;
1033         size_t doff, dsize;
1034
1035         if (abd_is_linear(aiter->iter_abd)) {
1036                 ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
1037
1038                 /* memory address at iter_pos */
1039                 void *paddr = ABD_LINEAR_BUF(aiter->iter_abd) + aiter->iter_pos;
1040
1041                 /* struct page for address */
1042                 page = is_vmalloc_addr(paddr) ?
1043                     vmalloc_to_page(paddr) : virt_to_page(paddr);
1044
1045                 /* offset of address within the page */
1046                 doff = offset_in_page(paddr);
1047
1048                 /* total data remaining in abd from this position */
1049                 dsize = aiter->iter_abd->abd_size - aiter->iter_offset;
1050         } else {
1051                 ASSERT(!abd_is_gang(aiter->iter_abd));
1052
1053                 /* current scatter page */
1054                 page = sg_page(aiter->iter_sg);
1055
1056                 /* position within page */
1057                 doff = aiter->iter_offset;
1058
1059                 /* remaining data in scatterlist */
1060                 dsize = MIN(aiter->iter_sg->length - aiter->iter_offset,
1061                     aiter->iter_abd->abd_size - aiter->iter_pos);
1062         }
1063         ASSERT(page);
1064
1065 #if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0)
1066         if (PageTail(page)) {
1067                 /*
1068                  * This page is part of a "compound page", which is a group of
1069                  * pages that can be referenced from a single struct page *.
1070                  * Its organised as a "head" page, followed by a series of
1071                  * "tail" pages.
1072                  *
1073                  * In OpenZFS, compound pages are allocated using the
1074                  * __GFP_COMP flag, which we get from scatter ABDs and SPL
1075                  * vmalloc slabs (ie >16K allocations). So a great many of the
1076                  * IO buffers we get are going to be of this type.
1077                  *
1078                  * The tail pages are just regular PAGE_SIZE pages, and can be
1079                  * safely used as-is. However, the head page has length
1080                  * covering itself and all the tail pages. If this ABD chunk
1081                  * spans multiple pages, then we can use the head page and a
1082                  * >PAGE_SIZE length, which is far more efficient.
1083                  *
1084                  * To do this, we need to adjust the offset to be counted from
1085                  * the head page. struct page for compound pages are stored
1086                  * contiguously, so we can just adjust by a simple offset.
1087                  *
1088                  * Before kernel 4.5, compound page heads were refcounted
1089                  * separately, such that moving back to the head page would
1090                  * require us to take a reference to it and releasing it once
1091                  * we're completely finished with it. In practice, that means
1092                  * when our caller is done with the ABD, which we have no
1093                  * insight into from here. Rather than contort this API to
1094                  * track head page references on such ancient kernels, we just
1095                  * compile this block out and use the tail pages directly. This
1096                  * is slightly less efficient, but makes everything far
1097                  * simpler.
1098                  */
1099                 struct page *head = compound_head(page);
1100                 doff += ((page - head) * PAGESIZE);
1101                 page = head;
1102         }
1103 #endif
1104
1105         /* final page and position within it */
1106         aiter->iter_page = page;
1107         aiter->iter_page_doff = doff;
1108
1109         /* amount of data in the chunk, up to the end of the page */
1110         aiter->iter_page_dsize = MIN(dsize, page_size(page) - doff);
1111 }
1112
1113 /*
1114  * Note: ABD BIO functions only needed to support vdev_classic. See comments in
1115  * vdev_disk.c.
1116  */
1117
1118 /*
1119  * bio_nr_pages for ABD.
1120  * @off is the offset in @abd
1121  */
1122 unsigned long
1123 abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
1124 {
1125         unsigned long pos;
1126
1127         if (abd_is_gang(abd)) {
1128                 unsigned long count = 0;
1129
1130                 for (abd_t *cabd = abd_gang_get_offset(abd, &off);
1131                     cabd != NULL && size != 0;
1132                     cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
1133                         ASSERT3U(off, <, cabd->abd_size);
1134                         int mysize = MIN(size, cabd->abd_size - off);
1135                         count += abd_nr_pages_off(cabd, mysize, off);
1136                         size -= mysize;
1137                         off = 0;
1138                 }
1139                 return (count);
1140         }
1141
1142         if (abd_is_linear(abd))
1143                 pos = (unsigned long)abd_to_buf(abd) + off;
1144         else
1145                 pos = ABD_SCATTER(abd).abd_offset + off;
1146
1147         return (((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
1148             (pos >> PAGE_SHIFT));
1149 }
1150
1151 static unsigned int
1152 bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size)
1153 {
1154         unsigned int offset, size, i;
1155         struct page *page;
1156
1157         offset = offset_in_page(buf_ptr);
1158         for (i = 0; i < bio->bi_max_vecs; i++) {
1159                 size = PAGE_SIZE - offset;
1160
1161                 if (bio_size <= 0)
1162                         break;
1163
1164                 if (size > bio_size)
1165                         size = bio_size;
1166
1167                 if (is_vmalloc_addr(buf_ptr))
1168                         page = vmalloc_to_page(buf_ptr);
1169                 else
1170                         page = virt_to_page(buf_ptr);
1171
1172                 /*
1173                  * Some network related block device uses tcp_sendpage, which
1174                  * doesn't behave well when using 0-count page, this is a
1175                  * safety net to catch them.
1176                  */
1177                 ASSERT3S(page_count(page), >, 0);
1178
1179                 if (bio_add_page(bio, page, size, offset) != size)
1180                         break;
1181
1182                 buf_ptr += size;
1183                 bio_size -= size;
1184                 offset = 0;
1185         }
1186
1187         return (bio_size);
1188 }
1189
1190 /*
1191  * bio_map for gang ABD.
1192  */
1193 static unsigned int
1194 abd_gang_bio_map_off(struct bio *bio, abd_t *abd,
1195     unsigned int io_size, size_t off)
1196 {
1197         ASSERT(abd_is_gang(abd));
1198
1199         for (abd_t *cabd = abd_gang_get_offset(abd, &off);
1200             cabd != NULL;
1201             cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
1202                 ASSERT3U(off, <, cabd->abd_size);
1203                 int size = MIN(io_size, cabd->abd_size - off);
1204                 int remainder = abd_bio_map_off(bio, cabd, size, off);
1205                 io_size -= (size - remainder);
1206                 if (io_size == 0 || remainder > 0)
1207                         return (io_size);
1208                 off = 0;
1209         }
1210         ASSERT0(io_size);
1211         return (io_size);
1212 }
1213
1214 /*
1215  * bio_map for ABD.
1216  * @off is the offset in @abd
1217  * Remaining IO size is returned
1218  */
1219 unsigned int
1220 abd_bio_map_off(struct bio *bio, abd_t *abd,
1221     unsigned int io_size, size_t off)
1222 {
1223         struct abd_iter aiter;
1224
1225         ASSERT3U(io_size, <=, abd->abd_size - off);
1226         if (abd_is_linear(abd))
1227                 return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size));
1228
1229         ASSERT(!abd_is_linear(abd));
1230         if (abd_is_gang(abd))
1231                 return (abd_gang_bio_map_off(bio, abd, io_size, off));
1232
1233         abd_iter_init(&aiter, abd);
1234         abd_iter_advance(&aiter, off);
1235
1236         for (int i = 0; i < bio->bi_max_vecs; i++) {
1237                 struct page *pg;
1238                 size_t len, sgoff, pgoff;
1239                 struct scatterlist *sg;
1240
1241                 if (io_size <= 0)
1242                         break;
1243
1244                 sg = aiter.iter_sg;
1245                 sgoff = aiter.iter_offset;
1246                 pgoff = sgoff & (PAGESIZE - 1);
1247                 len = MIN(io_size, PAGESIZE - pgoff);
1248                 ASSERT(len > 0);
1249
1250                 pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
1251                 if (bio_add_page(bio, pg, len, pgoff) != len)
1252                         break;
1253
1254                 io_size -= len;
1255                 abd_iter_advance(&aiter, len);
1256         }
1257
1258         return (io_size);
1259 }
1260
1261 /* Tunable Parameters */
1262 module_param(zfs_abd_scatter_enabled, int, 0644);
1263 MODULE_PARM_DESC(zfs_abd_scatter_enabled,
1264         "Toggle whether ABD allocations must be linear.");
1265 module_param(zfs_abd_scatter_min_size, int, 0644);
1266 MODULE_PARM_DESC(zfs_abd_scatter_min_size,
1267         "Minimum size of scatter allocations.");
1268 /* CSTYLED */
1269 module_param(zfs_abd_scatter_max_order, uint, 0644);
1270 MODULE_PARM_DESC(zfs_abd_scatter_max_order,
1271         "Maximum order allocation used for a scatter ABD.");
1272
1273 #endif /* _KERNEL */