1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
23 * Copyright (c) 2016 by Delphix. All rights reserved.
24 */
25
26 /*
27 * ARC buffer data (ABD).
28 *
29 * ABDs are an abstract data structure for the ARC which can use two
30 * different ways of storing the underlying data:
31 *
32 * (a) Linear buffer. In this case, all the data in the ABD is stored in one
33 * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache).
34 *
35 * +-------------------+
36 * | ABD (linear) |
37 * | abd_flags = ... |
38 * | abd_size = ... | +--------------------------------+
39 * | abd_buf ------------->| raw buffer of size abd_size |
40 * +-------------------+ +--------------------------------+
41 * no abd_chunks
42 *
43 * (b) Scattered buffer. In this case, the data in the ABD is split into
44 * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers
45 * to the chunks recorded in an array at the end of the ABD structure.
46 *
47 * +-------------------+
48 * | ABD (scattered) |
49 * | abd_flags = ... |
50 * | abd_size = ... |
51 * | abd_offset = 0 | +-----------+
52 * | abd_chunks[0] ----------------------------->| chunk 0 |
53 * | abd_chunks[1] ---------------------+ +-----------+
54 * | ... | | +-----------+
55 * | abd_chunks[N-1] ---------+ +------->| chunk 1 |
56 * +-------------------+ | +-----------+
57 * | ...
58 * | +-----------+
59 * +----------------->| chunk N-1 |
60 * +-----------+
61 *
62 * Linear buffers act exactly like normal buffers and are always mapped into the
63 * kernel's virtual memory space, while scattered ABD data chunks are allocated
64 * as physical pages and then mapped in only while they are actually being
65 * accessed through one of the abd_* library functions. Using scattered ABDs
66 * provides several benefits:
67 *
68 * (1) They avoid use of kmem_*, preventing performance problems where running
69 * kmem_reap on very large memory systems never finishes and causes
70 * constant TLB shootdowns.
71 *
72 * (2) Fragmentation is less of an issue since when we are at the limit of
73 * allocatable space, we won't have to search around for a long free
74 * hole in the VA space for large ARC allocations. Each chunk is mapped in
75 * individually, so even if we weren't using segkpm (see next point) we
76 * wouldn't need to worry about finding a contiguous address range.
77 *
78 * (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs
79 * on each ABD access. (If segkpm isn't available then we use all linear
80 * ABDs to avoid this penalty.) See seg_kpm.c for more details.
81 *
82 * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
83 * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not
84 * available, which is the case on all 32-bit systems and any 64-bit systems
85 * where kpm_enable is turned off.
86 *
87 * In addition to directly allocating a linear or scattered ABD, it is also
88 * possible to create an ABD by requesting the "sub-ABD" starting at an offset
89 * within an existing ABD. In linear buffers this is simple (set abd_buf of
90 * the new ABD to the starting point within the original raw buffer), but
91 * scattered ABDs are a little more complex. The new ABD makes a copy of the
92 * relevant abd_chunks pointers (but not the underlying data). However, to
93 * provide arbitrary rather than only chunk-aligned starting offsets, it also
94 * tracks an abd_offset field which represents the starting point of the data
95 * within the first chunk in abd_chunks. For both linear and scattered ABDs,
96 * creating an offset ABD marks the original ABD as the offset's parent, and the
97 * original ABD's abd_children refcount is incremented. This data allows us to
98 * ensure the root ABD isn't deleted before its children.
99 *
100 * Most consumers should never need to know what type of ABD they're using --
101 * the ABD public API ensures that it's possible to transparently switch from
102 * using a linear ABD to a scattered one when doing so would be beneficial.
103 *
104  * If you need to use the data within an ABD directly and you know it's linear
105  * (because you allocated it), you can use abd_to_buf() to access the
106  * underlying raw buffer. Otherwise, use one of the abd_borrow_buf* functions,
107 * which will allocate a raw buffer if necessary. Use the abd_return_buf*
108 * functions to return any raw buffers that are no longer necessary when you're
109 * done using them.
110 *
111 * There are a variety of ABD APIs that implement basic buffer operations:
112 * compare, copy, read, write, and fill with zeroes. If you need a custom
113 * function which progressively accesses the whole ABD, use the abd_iterate_*
114 * functions.
115 */
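/*
 * Illustrative usage sketch (an addition to this commentary, not upstream
 * code); the buffer size and the error-free flow are assumptions made for
 * the example:
 *
 *    abd_t *abd = abd_alloc(SPA_MAXBLOCKSIZE, B_FALSE);
 *    void *buf = abd_borrow_buf_copy(abd, abd->abd_size);
 *    ... modify buf like any raw buffer ...
 *    abd_return_buf_copy(abd, buf, abd->abd_size);
 *    abd_free(abd);
 */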
116
117 #include <sys/abd.h>
118 #include <sys/param.h>
119 #include <sys/zio.h>
120 #include <sys/zfs_context.h>
121 #include <sys/zfs_znode.h>
122 #ifdef _KERNEL
123 #include <linux/scatterlist.h>
124 #include <linux/kmap_compat.h>
125 #else
126 #define MAX_ORDER 1
127 #endif
128
129 typedef struct abd_stats {
130 kstat_named_t abdstat_struct_size;
131 kstat_named_t abdstat_linear_cnt;
132 kstat_named_t abdstat_linear_data_size;
133 kstat_named_t abdstat_scatter_cnt;
134 kstat_named_t abdstat_scatter_data_size;
135 kstat_named_t abdstat_scatter_chunk_waste;
136 kstat_named_t abdstat_scatter_orders[MAX_ORDER];
137 kstat_named_t abdstat_scatter_page_multi_chunk;
138 kstat_named_t abdstat_scatter_page_multi_zone;
139 kstat_named_t abdstat_scatter_page_alloc_retry;
140 kstat_named_t abdstat_scatter_sg_table_retry;
141 } abd_stats_t;
142
143 static abd_stats_t abd_stats = {
144 /* Amount of memory occupied by all of the abd_t struct allocations */
145 { "struct_size", KSTAT_DATA_UINT64 },
146 /*
147 * The number of linear ABDs which are currently allocated, excluding
148 * ABDs which don't own their data (for instance the ones which were
149 * allocated through abd_get_offset() and abd_get_from_buf()). If an
150 * ABD takes ownership of its buf then it will become tracked.
151 */
152 { "linear_cnt", KSTAT_DATA_UINT64 },
153 /* Amount of data stored in all linear ABDs tracked by linear_cnt */
154 { "linear_data_size", KSTAT_DATA_UINT64 },
155 /*
156 * The number of scatter ABDs which are currently allocated, excluding
157 * ABDs which don't own their data (for instance the ones which were
158 * allocated through abd_get_offset()).
159 */
160 { "scatter_cnt", KSTAT_DATA_UINT64 },
161 /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
162 { "scatter_data_size", KSTAT_DATA_UINT64 },
163 /*
164 * The amount of space wasted at the end of the last chunk across all
165 * scatter ABDs tracked by scatter_cnt.
166 */
167 { "scatter_chunk_waste", KSTAT_DATA_UINT64 },
168 /*
169 * The number of compound allocations of a given order. These
170 * allocations are spread over all currently allocated ABDs, and
171 * act as a measure of memory fragmentation.
172 */
173 { { "scatter_order_N", KSTAT_DATA_UINT64 } },
174 /*
175 * The number of scatter ABDs which contain multiple chunks.
176 * ABDs are preferentially allocated from the minimum number of
177  * contiguous multi-page chunks; a single chunk is optimal.
178 */
179 { "scatter_page_multi_chunk", KSTAT_DATA_UINT64 },
180 /*
181 * The number of scatter ABDs which are split across memory zones.
182 * ABDs are preferentially allocated using pages from a single zone.
183 */
184 { "scatter_page_multi_zone", KSTAT_DATA_UINT64 },
185 /*
186 * The total number of retries encountered when attempting to
187 * allocate the pages to populate the scatter ABD.
188 */
189 { "scatter_page_alloc_retry", KSTAT_DATA_UINT64 },
190 /*
191 * The total number of retries encountered when attempting to
192 * allocate the sg table for an ABD.
193 */
194 { "scatter_sg_table_retry", KSTAT_DATA_UINT64 },
195 };
196
197 #define ABDSTAT(stat) (abd_stats.stat.value.ui64)
198 #define ABDSTAT_INCR(stat, val) \
199 atomic_add_64(&abd_stats.stat.value.ui64, (val))
200 #define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1)
201 #define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1)
202
203 #define ABD_SCATTER(abd) (abd->abd_u.abd_scatter)
204 #define ABD_BUF(abd) (abd->abd_u.abd_linear.abd_buf)
205 #define abd_for_each_sg(abd, sg, n, i) \
206 for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
207
208 /* see block comment above for description */
209 int zfs_abd_scatter_enabled = B_TRUE;
210 unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;
211
212 static kmem_cache_t *abd_cache = NULL;
213 static kstat_t *abd_ksp;
214
215 static inline size_t
216 abd_chunkcnt_for_bytes(size_t size)
217 {
218 return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE);
219 }
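/*
 * Worked example (illustrative, assuming a 4K PAGESIZE): a 5000-byte request
 * rounds up to 8192 bytes, i.e. 2 pages, leaving 3192 bytes of tail waste
 * (accounted in scatter_chunk_waste); a 128K request maps to exactly 32
 * pages with no waste.
 */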
220
221 #ifdef _KERNEL
222 #ifndef CONFIG_HIGHMEM
223
224 #ifndef __GFP_RECLAIM
225 #define __GFP_RECLAIM __GFP_WAIT
226 #endif
227
228 static unsigned long
229 abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order)
230 {
231 struct page *page;
232
233 page = alloc_pages_node(nid, gfp, order);
234 if (!page)
235 return (0);
236
237 return ((unsigned long) page_address(page));
238 }
239
240 /*
241 * The goal is to minimize fragmentation by preferentially populating ABDs
242 * with higher order compound pages from a single zone. Allocation size is
243 * progressively decreased until it can be satisfied without performing
244 * reclaim or compaction. When necessary this function will degenerate to
245 * allocating individual pages and allowing reclaim to satisfy allocations.
246 */
247 static void
248 abd_alloc_pages(abd_t *abd, size_t size)
249 {
250 struct list_head pages;
251 struct sg_table table;
252 struct scatterlist *sg;
253 struct page *page, *tmp_page;
254 gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
255 gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM;
256 int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1);
257 int nr_pages = abd_chunkcnt_for_bytes(size);
258 int chunks = 0, zones = 0;
259 size_t remaining_size;
260 int nid = NUMA_NO_NODE;
261 int alloc_pages = 0;
262 int order;
263
264 INIT_LIST_HEAD(&pages);
265
266 while (alloc_pages < nr_pages) {
267 unsigned long paddr;
268 unsigned chunk_pages;
269
270 order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
271 chunk_pages = (1U << order);
272
273 paddr = abd_alloc_chunk(nid, order ? gfp_comp : gfp, order);
274 if (paddr == 0) {
275 if (order == 0) {
276 ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
277 schedule_timeout_interruptible(1);
278 } else {
279 max_order = MAX(0, order - 1);
280 }
281 continue;
282 }
283
284 page = virt_to_page(paddr);
285 list_add_tail(&page->lru, &pages);
286
287 if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
288 zones++;
289
290 nid = page_to_nid(page);
291 ABDSTAT_BUMP(abdstat_scatter_orders[order]);
292 chunks++;
293 alloc_pages += chunk_pages;
294 }
295
296 ASSERT3S(alloc_pages, ==, nr_pages);
297
298 while (sg_alloc_table(&table, chunks, gfp)) {
299 ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
300 schedule_timeout_interruptible(1);
301 }
302
303 sg = table.sgl;
304 remaining_size = size;
305 list_for_each_entry_safe(page, tmp_page, &pages, lru) {
306 size_t sg_size = MIN(PAGESIZE << compound_order(page),
307 remaining_size);
308 sg_set_page(sg, page, sg_size, 0);
309 remaining_size -= sg_size;
310
311 sg = sg_next(sg);
312 list_del(&page->lru);
313 }
314
315 if (chunks > 1) {
316 ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
317 abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
318
319 if (zones) {
320 ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
321 abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
322 }
323 }
324
325 ABD_SCATTER(abd).abd_sgl = table.sgl;
326 ABD_SCATTER(abd).abd_nents = table.nents;
327 }
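/*
 * Worked example of the sizing loop above (illustrative; assumes a 4K
 * PAGESIZE and the default zfs_abd_scatter_max_order): a 128K ABD needs
 * nr_pages = 32, so the first pass tries order = MIN(highbit64(32) - 1,
 * max_order) = 5, i.e. one 128K compound chunk. If that allocation fails,
 * max_order drops to 4 and the ABD is built from two order-4 (64K) chunks
 * instead; in the worst case it degenerates to 32 order-0 pages, with any
 * retries counted in scatter_page_alloc_retry.
 */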
328 #else
329 /*
330 * Allocate N individual pages to construct a scatter ABD. This function
331 * makes no attempt to request contiguous pages and requires the minimal
332 * number of kernel interfaces. It's designed for maximum compatibility.
333 */
334 static void
335 abd_alloc_pages(abd_t *abd, size_t size)
336 {
337 struct scatterlist *sg;
338 struct sg_table table;
339 struct page *page;
340 gfp_t gfp = __GFP_NOWARN | GFP_NOIO;
341 int nr_pages = abd_chunkcnt_for_bytes(size);
342 int i;
343
344 while (sg_alloc_table(&table, nr_pages, gfp)) {
345 ABDSTAT_BUMP(abdstat_scatter_sg_table_retry);
346 schedule_timeout_interruptible(1);
347 }
348
349 ASSERT3U(table.nents, ==, nr_pages);
350 ABD_SCATTER(abd).abd_sgl = table.sgl;
351 ABD_SCATTER(abd).abd_nents = nr_pages;
352
353 abd_for_each_sg(abd, sg, nr_pages, i) {
354 while ((page = __page_cache_alloc(gfp)) == NULL) {
355 ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
356 schedule_timeout_interruptible(1);
357 }
358
359 ABDSTAT_BUMP(abdstat_scatter_orders[0]);
360 sg_set_page(sg, page, PAGESIZE, 0);
361 }
362
363 if (nr_pages > 1) {
364 ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
365 abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
366 }
367 }
368 #endif /* !CONFIG_HIGHMEM */
369
370 static void
371 abd_free_pages(abd_t *abd)
372 {
373 struct scatterlist *sg;
374 struct sg_table table;
375 struct page *page;
376 int nr_pages = ABD_SCATTER(abd).abd_nents;
377 int order, i;
378
379 if (abd->abd_flags & ABD_FLAG_MULTI_ZONE)
380 ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone);
381
382 if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK)
383 ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk);
384
385 abd_for_each_sg(abd, sg, nr_pages, i) {
386 page = sg_page(sg);
387 order = compound_order(page);
388 __free_pages(page, order);
389 ASSERT3U(sg->length, <=, PAGE_SIZE << order);
390 ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]);
391 }
392
393 table.sgl = ABD_SCATTER(abd).abd_sgl;
394 table.nents = table.orig_nents = nr_pages;
395 sg_free_table(&table);
396 }
397
398 #else /* _KERNEL */
399
400 #ifndef PAGE_SHIFT
401 #define PAGE_SHIFT (highbit64(PAGESIZE)-1)
402 #endif
403
404 struct page;
405
406 #define kpm_enable 1
407 #define abd_alloc_chunk(o) \
408 ((struct page *)umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP))
409 #define abd_free_chunk(chunk, o) umem_free(chunk, PAGESIZE << (o))
410 #define zfs_kmap_atomic(chunk, km) ((void *)chunk)
411 #define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0)
412 #define local_irq_save(flags) do { (void)(flags); } while (0)
413 #define local_irq_restore(flags) do { (void)(flags); } while (0)
414 #define nth_page(pg, i) \
415 ((struct page *)((void *)(pg) + (i) * PAGESIZE))
416
417 struct scatterlist {
418 struct page *page;
419 int length;
420 int end;
421 };
422
423 static void
424 sg_init_table(struct scatterlist *sg, int nr)
425 {
426 memset(sg, 0, nr * sizeof (struct scatterlist));
427 sg[nr - 1].end = 1;
428 }
429
430 #define for_each_sg(sgl, sg, nr, i) \
431 for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg))
432
433 static inline void
434 sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len,
435 unsigned int offset)
436 {
437 /* currently we don't use offset */
438 ASSERT(offset == 0);
439 sg->page = page;
440 sg->length = len;
441 }
442
443 static inline struct page *
444 sg_page(struct scatterlist *sg)
445 {
446 return (sg->page);
447 }
448
449 static inline struct scatterlist *
450 sg_next(struct scatterlist *sg)
451 {
452 if (sg->end)
453 return (NULL);
454
455 return (sg + 1);
456 }
457
458 static void
459 abd_alloc_pages(abd_t *abd, size_t size)
460 {
461 unsigned nr_pages = abd_chunkcnt_for_bytes(size);
462 struct scatterlist *sg;
463 int i;
464
465 ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages *
466 sizeof (struct scatterlist), KM_SLEEP);
467 sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
468
469 abd_for_each_sg(abd, sg, nr_pages, i) {
470 struct page *p = abd_alloc_chunk(0);
471 sg_set_page(sg, p, PAGESIZE, 0);
472 }
473 ABD_SCATTER(abd).abd_nents = nr_pages;
474 }
475
476 static void
477 abd_free_pages(abd_t *abd)
478 {
479 int i, n = ABD_SCATTER(abd).abd_nents;
480 struct scatterlist *sg;
481 int j;
482
483 abd_for_each_sg(abd, sg, n, i) {
484 for (j = 0; j < sg->length; j += PAGESIZE) {
485 struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT);
486 abd_free_chunk(p, 0);
487 }
488 }
489
490 vmem_free(ABD_SCATTER(abd).abd_sgl, n * sizeof (struct scatterlist));
491 }
492
493 #endif /* _KERNEL */
494
495 void
496 abd_init(void)
497 {
498 int i;
499
500 abd_cache = kmem_cache_create("abd_t", sizeof (abd_t),
501 0, NULL, NULL, NULL, NULL, NULL, 0);
502
503 abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
504 sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
505 if (abd_ksp != NULL) {
506 abd_ksp->ks_data = &abd_stats;
507 kstat_install(abd_ksp);
508
509 for (i = 0; i < MAX_ORDER; i++) {
510 snprintf(abd_stats.abdstat_scatter_orders[i].name,
511 KSTAT_STRLEN, "scatter_order_%d", i);
512 abd_stats.abdstat_scatter_orders[i].data_type =
513 KSTAT_DATA_UINT64;
514 }
515 }
516 }
517
518 void
519 abd_fini(void)
520 {
521 if (abd_ksp != NULL) {
522 kstat_delete(abd_ksp);
523 abd_ksp = NULL;
524 }
525
526 if (abd_cache) {
527 kmem_cache_destroy(abd_cache);
528 abd_cache = NULL;
529 }
530 }
531
532 static inline void
533 abd_verify(abd_t *abd)
534 {
535 ASSERT3U(abd->abd_size, >, 0);
536 ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
537 ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
538 ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
539 ABD_FLAG_MULTI_CHUNK));
540 IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
541 IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
542 if (abd_is_linear(abd)) {
543 ASSERT3P(abd->abd_u.abd_linear.abd_buf, !=, NULL);
544 } else {
545 size_t n;
546 int i;
547 struct scatterlist *sg;
548
549 ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0);
550 ASSERT3U(ABD_SCATTER(abd).abd_offset, <,
551 ABD_SCATTER(abd).abd_sgl->length);
552 n = ABD_SCATTER(abd).abd_nents;
553 abd_for_each_sg(abd, sg, n, i) {
554 ASSERT3P(sg_page(sg), !=, NULL);
555 }
556 }
557 }
558
559 static inline abd_t *
560 abd_alloc_struct(void)
561 {
562 abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE);
563
564 ASSERT3P(abd, !=, NULL);
565 ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t));
566
567 return (abd);
568 }
569
570 static inline void
571 abd_free_struct(abd_t *abd)
572 {
573 kmem_cache_free(abd_cache, abd);
574 ABDSTAT_INCR(abdstat_struct_size, -sizeof (abd_t));
575 }
576
577 /*
578 * Allocate an ABD, along with its own underlying data buffers. Use this if you
579 * don't care whether the ABD is linear or not.
580 */
581 abd_t *
582 abd_alloc(size_t size, boolean_t is_metadata)
583 {
584 abd_t *abd;
585
586 if (!zfs_abd_scatter_enabled || size <= PAGESIZE)
587 return (abd_alloc_linear(size, is_metadata));
588
589 VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
590
591 abd = abd_alloc_struct();
592 abd->abd_flags = ABD_FLAG_OWNER;
593 abd_alloc_pages(abd, size);
594
595 if (is_metadata) {
596 abd->abd_flags |= ABD_FLAG_META;
597 }
598 abd->abd_size = size;
599 abd->abd_parent = NULL;
600 refcount_create(&abd->abd_children);
601
602 abd->abd_u.abd_scatter.abd_offset = 0;
603
604 ABDSTAT_BUMP(abdstat_scatter_cnt);
605 ABDSTAT_INCR(abdstat_scatter_data_size, size);
606 ABDSTAT_INCR(abdstat_scatter_chunk_waste,
607 P2ROUNDUP(size, PAGESIZE) - size);
608
609 return (abd);
610 }
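/*
 * Illustrative sketch (not upstream code; assumes a 4K PAGESIZE). Requests
 * of at most PAGESIZE, or any request while zfs_abd_scatter_enabled is off,
 * fall back to abd_alloc_linear():
 *
 *    abd_t *small = abd_alloc(512, B_FALSE);    linear, zio_data_buf backed
 *    abd_t *meta = abd_alloc(16384, B_TRUE);    scatter, ABD_FLAG_META set
 *    abd_free(meta);
 *    abd_free(small);
 */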
611
612 static void
613 abd_free_scatter(abd_t *abd)
614 {
615 abd_free_pages(abd);
616
617 refcount_destroy(&abd->abd_children);
618 ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
619 ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size);
620 ABDSTAT_INCR(abdstat_scatter_chunk_waste,
621 abd->abd_size - P2ROUNDUP(abd->abd_size, PAGESIZE));
622
623 abd_free_struct(abd);
624 }
625
626 /*
627 * Allocate an ABD that must be linear, along with its own underlying data
628 * buffer. Only use this when it would be very annoying to write your ABD
629 * consumer with a scattered ABD.
630 */
631 abd_t *
632 abd_alloc_linear(size_t size, boolean_t is_metadata)
633 {
634 abd_t *abd = abd_alloc_struct();
635
636 VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
637
638 abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER;
639 if (is_metadata) {
640 abd->abd_flags |= ABD_FLAG_META;
641 }
642 abd->abd_size = size;
643 abd->abd_parent = NULL;
644 refcount_create(&abd->abd_children);
645
646 if (is_metadata) {
647 abd->abd_u.abd_linear.abd_buf = zio_buf_alloc(size);
648 } else {
649 abd->abd_u.abd_linear.abd_buf = zio_data_buf_alloc(size);
650 }
651
652 ABDSTAT_BUMP(abdstat_linear_cnt);
653 ABDSTAT_INCR(abdstat_linear_data_size, size);
654
655 return (abd);
656 }
657
658 static void
659 abd_free_linear(abd_t *abd)
660 {
661 if (abd->abd_flags & ABD_FLAG_META) {
662 zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
663 } else {
664 zio_data_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
665 }
666
667 refcount_destroy(&abd->abd_children);
668 ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
669 ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
670
671 abd_free_struct(abd);
672 }
673
674 /*
675 * Free an ABD. Only use this on ABDs allocated with abd_alloc() or
676 * abd_alloc_linear().
677 */
678 void
679 abd_free(abd_t *abd)
680 {
681 abd_verify(abd);
682 ASSERT3P(abd->abd_parent, ==, NULL);
683 ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
684 if (abd_is_linear(abd))
685 abd_free_linear(abd);
686 else
687 abd_free_scatter(abd);
688 }
689
690 /*
691 * Allocate an ABD of the same format (same metadata flag, same scatterize
692 * setting) as another ABD.
693 */
694 abd_t *
695 abd_alloc_sametype(abd_t *sabd, size_t size)
696 {
697 boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
698 if (abd_is_linear(sabd)) {
699 return (abd_alloc_linear(size, is_metadata));
700 } else {
701 return (abd_alloc(size, is_metadata));
702 }
703 }
704
705 /*
706 * If we're going to use this ABD for doing I/O using the block layer, the
707 * consumer of the ABD data doesn't care if it's scattered or not, and we don't
708 * plan to store this ABD in memory for a long period of time, we should
709 * allocate the ABD type that requires the least data copying to do the I/O.
710 *
711  * On Illumos this is linear ABDs; however, if ldi_strategy() can ever issue
712  * I/Os using a scatter/gather list, we should switch to that and replace this
713  * call with vanilla abd_alloc().
714 *
715 * On Linux the optimal thing to do would be to use abd_get_offset() and
716 * construct a new ABD which shares the original pages thereby eliminating
717 * the copy. But for the moment a new linear ABD is allocated until this
718 * performance optimization can be implemented.
719 */
720 abd_t *
721 abd_alloc_for_io(size_t size, boolean_t is_metadata)
722 {
723 return (abd_alloc(size, is_metadata));
724 }
725
726 /*
727 * Allocate a new ABD to point to offset off of sabd. It shares the underlying
728 * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
729 * any derived ABDs exist.
730 */
731 static inline abd_t *
732 abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
733 {
734 abd_t *abd;
735
736 abd_verify(sabd);
737 ASSERT3U(off, <=, sabd->abd_size);
738
739 if (abd_is_linear(sabd)) {
740 abd = abd_alloc_struct();
741
742 /*
743 * Even if this buf is filesystem metadata, we only track that
744 * if we own the underlying data buffer, which is not true in
745 * this case. Therefore, we don't ever use ABD_FLAG_META here.
746 */
747 abd->abd_flags = ABD_FLAG_LINEAR;
748
749 abd->abd_u.abd_linear.abd_buf =
750 (char *)sabd->abd_u.abd_linear.abd_buf + off;
751 } else {
752 int i;
753 struct scatterlist *sg;
754 size_t new_offset = sabd->abd_u.abd_scatter.abd_offset + off;
755
756 abd = abd_alloc_struct();
757
758 /*
759 * Even if this buf is filesystem metadata, we only track that
760 * if we own the underlying data buffer, which is not true in
761 * this case. Therefore, we don't ever use ABD_FLAG_META here.
762 */
763 abd->abd_flags = 0;
764
765 abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) {
766 if (new_offset < sg->length)
767 break;
768 new_offset -= sg->length;
769 }
770
771 ABD_SCATTER(abd).abd_sgl = sg;
772 ABD_SCATTER(abd).abd_offset = new_offset;
773 ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i;
774 }
775
776 abd->abd_size = size;
777 abd->abd_parent = sabd;
778 refcount_create(&abd->abd_children);
779 (void) refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
780
781 return (abd);
782 }
783
784 abd_t *
785 abd_get_offset(abd_t *sabd, size_t off)
786 {
787 size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
788
789 VERIFY3U(size, >, 0);
790
791 return (abd_get_offset_impl(sabd, off, size));
792 }
793
794 abd_t *
795 abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
796 {
797 ASSERT3U(off + size, <=, sabd->abd_size);
798
799 return (abd_get_offset_impl(sabd, off, size));
800 }
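/*
 * Illustrative sketch (not upstream code): viewing one 4K block in the
 * middle of a larger ABD without copying it. The offsets assume the parent
 * ABD is at least 12K:
 *
 *    abd_t *sub = abd_get_offset_size(abd, 8192, 4096);
 *    ... read or write through sub ...
 *    abd_put(sub);    the parent abd remains valid throughout
 */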
801
802 /*
803 * Allocate a linear ABD structure for buf. You must free this with abd_put()
804 * since the resulting ABD doesn't own its own buffer.
805 */
806 abd_t *
807 abd_get_from_buf(void *buf, size_t size)
808 {
809 abd_t *abd = abd_alloc_struct();
810
811 VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
812
813 /*
814 * Even if this buf is filesystem metadata, we only track that if we
815 * own the underlying data buffer, which is not true in this case.
816 * Therefore, we don't ever use ABD_FLAG_META here.
817 */
818 abd->abd_flags = ABD_FLAG_LINEAR;
819 abd->abd_size = size;
820 abd->abd_parent = NULL;
821 refcount_create(&abd->abd_children);
822
823 abd->abd_u.abd_linear.abd_buf = buf;
824
825 return (abd);
826 }
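/*
 * Illustrative sketch (not upstream code): temporarily presenting an
 * existing raw buffer to an interface that expects an abd_t. The buffer
 * size is an assumption for the example:
 *
 *    void *buf = zio_data_buf_alloc(SPA_MAXBLOCKSIZE);
 *    abd_t *abd = abd_get_from_buf(buf, SPA_MAXBLOCKSIZE);
 *    ... pass abd to ABD-based code ...
 *    abd_put(abd);    buf itself is not freed by abd_put()
 *    zio_data_buf_free(buf, SPA_MAXBLOCKSIZE);
 */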
827
828 /*
829 * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not
830 * free the underlying scatterlist or buffer.
831 */
832 void
833 abd_put(abd_t *abd)
834 {
835 abd_verify(abd);
836 ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
837
838 if (abd->abd_parent != NULL) {
839 (void) refcount_remove_many(&abd->abd_parent->abd_children,
840 abd->abd_size, abd);
841 }
842
843 refcount_destroy(&abd->abd_children);
844 abd_free_struct(abd);
845 }
846
847 /*
848 * Get the raw buffer associated with a linear ABD.
849 */
850 void *
851 abd_to_buf(abd_t *abd)
852 {
853 ASSERT(abd_is_linear(abd));
854 abd_verify(abd);
855 return (abd->abd_u.abd_linear.abd_buf);
856 }
857
858 /*
859 * Borrow a raw buffer from an ABD without copying the contents of the ABD
860 * into the buffer. If the ABD is scattered, this will allocate a raw buffer
861 * whose contents are undefined. To copy over the existing data in the ABD, use
862 * abd_borrow_buf_copy() instead.
863 */
864 void *
865 abd_borrow_buf(abd_t *abd, size_t n)
866 {
867 void *buf;
868 abd_verify(abd);
869 ASSERT3U(abd->abd_size, >=, n);
870 if (abd_is_linear(abd)) {
871 buf = abd_to_buf(abd);
872 } else {
873 buf = zio_buf_alloc(n);
874 }
875 (void) refcount_add_many(&abd->abd_children, n, buf);
876
877 return (buf);
878 }
879
880 void *
881 abd_borrow_buf_copy(abd_t *abd, size_t n)
882 {
883 void *buf = abd_borrow_buf(abd, n);
884 if (!abd_is_linear(abd)) {
885 abd_copy_to_buf(buf, abd, n);
886 }
887 return (buf);
888 }
889
890 /*
891 * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
892 * not change the contents of the ABD and will ASSERT that you didn't modify
893 * the buffer since it was borrowed. If you want any changes you made to buf to
894 * be copied back to abd, use abd_return_buf_copy() instead.
895 */
896 void
897 abd_return_buf(abd_t *abd, void *buf, size_t n)
898 {
899 abd_verify(abd);
900 ASSERT3U(abd->abd_size, >=, n);
901 if (abd_is_linear(abd)) {
902 ASSERT3P(buf, ==, abd_to_buf(abd));
903 } else {
904 ASSERT0(abd_cmp_buf(abd, buf, n));
905 zio_buf_free(buf, n);
906 }
907 (void) refcount_remove_many(&abd->abd_children, n, buf);
908 }
909
910 void
911 abd_return_buf_copy(abd_t *abd, void *buf, size_t n)
912 {
913 if (!abd_is_linear(abd)) {
914 abd_copy_from_buf(abd, buf, n);
915 }
916 abd_return_buf(abd, buf, n);
917 }
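/*
 * Illustrative sketch (not upstream code) of the two borrow flavors: use
 * abd_borrow_buf() when the current contents will simply be overwritten,
 * and abd_borrow_buf_copy() when they must be read first; pick the return
 * variant based on whether the buffer's changes should be copied back.
 *
 *    void *b = abd_borrow_buf(abd, abd->abd_size);
 *    ... generate new data into b ...
 *    abd_return_buf_copy(abd, b, abd->abd_size);
 */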
918
919 /*
920 * Give this ABD ownership of the buffer that it's storing. Can only be used on
921 * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated
922 * with abd_alloc_linear() which subsequently released ownership of their buf
923 * with abd_release_ownership_of_buf().
924 */
925 void
926 abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata)
927 {
928 ASSERT(abd_is_linear(abd));
929 ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER));
930 abd_verify(abd);
931
932 abd->abd_flags |= ABD_FLAG_OWNER;
933 if (is_metadata) {
934 abd->abd_flags |= ABD_FLAG_META;
935 }
936
937 ABDSTAT_BUMP(abdstat_linear_cnt);
938 ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size);
939 }
940
941 void
942 abd_release_ownership_of_buf(abd_t *abd)
943 {
944 ASSERT(abd_is_linear(abd));
945 ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
946 abd_verify(abd);
947
948 abd->abd_flags &= ~ABD_FLAG_OWNER;
949 /* Disable this flag since we no longer own the data buffer */
950 abd->abd_flags &= ~ABD_FLAG_META;
951
952 ABDSTAT_BUMPDOWN(abdstat_linear_cnt);
953 ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size);
954 }
955
956 #ifndef HAVE_1ARG_KMAP_ATOMIC
957 #define NR_KM_TYPE (6)
958 #ifdef _KERNEL
959 int km_table[NR_KM_TYPE] = {
960 KM_USER0,
961 KM_USER1,
962 KM_BIO_SRC_IRQ,
963 KM_BIO_DST_IRQ,
964 KM_PTE0,
965 KM_PTE1,
966 };
967 #endif
968 #endif
969
970 struct abd_iter {
971 /* public interface */
972 void *iter_mapaddr; /* addr corresponding to iter_pos */
973 size_t iter_mapsize; /* length of data valid at mapaddr */
974
975 /* private */
976 abd_t *iter_abd; /* ABD being iterated through */
977 size_t iter_pos;
978 size_t iter_offset; /* offset in current sg/abd_buf, */
979 /* abd_offset included */
980 struct scatterlist *iter_sg; /* current sg */
981 #ifndef HAVE_1ARG_KMAP_ATOMIC
982 int iter_km; /* KM_* for kmap_atomic */
983 #endif
984 };
985
986 /*
987 * Initialize the abd_iter.
988 */
989 static void
990 abd_iter_init(struct abd_iter *aiter, abd_t *abd, int km_type)
991 {
992 abd_verify(abd);
993 aiter->iter_abd = abd;
994 aiter->iter_mapaddr = NULL;
995 aiter->iter_mapsize = 0;
996 aiter->iter_pos = 0;
997 if (abd_is_linear(abd)) {
998 aiter->iter_offset = 0;
999 aiter->iter_sg = NULL;
1000 } else {
1001 aiter->iter_offset = ABD_SCATTER(abd).abd_offset;
1002 aiter->iter_sg = ABD_SCATTER(abd).abd_sgl;
1003 }
1004 #ifndef HAVE_1ARG_KMAP_ATOMIC
1005 ASSERT3U(km_type, <, NR_KM_TYPE);
1006 aiter->iter_km = km_type;
1007 #endif
1008 }
1009
1010 /*
1011 * Advance the iterator by a certain amount. Cannot be called when a chunk is
1012  * in use. This can be safely called when the aiter has already been
1013  * exhausted, in which case this does nothing.
1014 */
1015 static void
1016 abd_iter_advance(struct abd_iter *aiter, size_t amount)
1017 {
1018 ASSERT3P(aiter->iter_mapaddr, ==, NULL);
1019 ASSERT0(aiter->iter_mapsize);
1020
1021 /* There's nothing left to advance to, so do nothing */
1022 if (aiter->iter_pos == aiter->iter_abd->abd_size)
1023 return;
1024
1025 aiter->iter_pos += amount;
1026 aiter->iter_offset += amount;
1027 if (!abd_is_linear(aiter->iter_abd)) {
1028 while (aiter->iter_offset >= aiter->iter_sg->length) {
1029 aiter->iter_offset -= aiter->iter_sg->length;
1030 aiter->iter_sg = sg_next(aiter->iter_sg);
1031 if (aiter->iter_sg == NULL) {
1032 ASSERT0(aiter->iter_offset);
1033 break;
1034 }
1035 }
1036 }
1037 }
1038
1039 /*
1040 * Map the current chunk into aiter. This can be safely called when the aiter
1041  * has already been exhausted, in which case this does nothing.
1042 */
1043 static void
1044 abd_iter_map(struct abd_iter *aiter)
1045 {
1046 void *paddr;
1047 size_t offset = 0;
1048
1049 ASSERT3P(aiter->iter_mapaddr, ==, NULL);
1050 ASSERT0(aiter->iter_mapsize);
1051
1052 /* There's nothing left to iterate over, so do nothing */
1053 if (aiter->iter_pos == aiter->iter_abd->abd_size)
1054 return;
1055
1056 if (abd_is_linear(aiter->iter_abd)) {
1057 ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset);
1058 offset = aiter->iter_offset;
1059 aiter->iter_mapsize = aiter->iter_abd->abd_size - offset;
1060 paddr = aiter->iter_abd->abd_u.abd_linear.abd_buf;
1061 } else {
1062 offset = aiter->iter_offset;
1063 aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset,
1064 aiter->iter_abd->abd_size - aiter->iter_pos);
1065
1066 paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg),
1067 km_table[aiter->iter_km]);
1068 }
1069
1070 aiter->iter_mapaddr = (char *)paddr + offset;
1071 }
1072
1073 /*
1074 * Unmap the current chunk from aiter. This can be safely called when the aiter
1075  * has already been exhausted, in which case this does nothing.
1076 */
1077 static void
1078 abd_iter_unmap(struct abd_iter *aiter)
1079 {
1080 /* There's nothing left to unmap, so do nothing */
1081 if (aiter->iter_pos == aiter->iter_abd->abd_size)
1082 return;
1083
1084 if (!abd_is_linear(aiter->iter_abd)) {
1085 /* LINTED E_FUNC_SET_NOT_USED */
1086 zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset,
1087 km_table[aiter->iter_km]);
1088 }
1089
1090 ASSERT3P(aiter->iter_mapaddr, !=, NULL);
1091 ASSERT3U(aiter->iter_mapsize, >, 0);
1092
1093 aiter->iter_mapaddr = NULL;
1094 aiter->iter_mapsize = 0;
1095 }
1096
1097 int
1098 abd_iterate_func(abd_t *abd, size_t off, size_t size,
1099 abd_iter_func_t *func, void *private)
1100 {
1101 int ret = 0;
1102 struct abd_iter aiter;
1103
1104 abd_verify(abd);
1105 ASSERT3U(off + size, <=, abd->abd_size);
1106
1107 abd_iter_init(&aiter, abd, 0);
1108 abd_iter_advance(&aiter, off);
1109
1110 while (size > 0) {
1111 size_t len;
1112 abd_iter_map(&aiter);
1113
1114 len = MIN(aiter.iter_mapsize, size);
1115 ASSERT3U(len, >, 0);
1116
1117 ret = func(aiter.iter_mapaddr, len, private);
1118
1119 abd_iter_unmap(&aiter);
1120
1121 if (ret != 0)
1122 break;
1123
1124 size -= len;
1125 abd_iter_advance(&aiter, len);
1126 }
1127
1128 return (ret);
1129 }
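/*
 * Illustrative sketch (not upstream code): a caller-defined callback that
 * counts non-zero bytes across an ABD. The names are hypothetical.
 *
 *    static int
 *    count_nonzero_cb(void *buf, size_t size, void *private)
 *    {
 *        uint64_t *cnt = private;
 *        size_t i;
 *
 *        for (i = 0; i < size; i++)
 *            if (((char *)buf)[i] != 0)
 *                (*cnt)++;
 *        return (0);    a non-zero return stops the iteration early
 *    }
 *
 *    uint64_t nonzero = 0;
 *    (void) abd_iterate_func(abd, 0, abd->abd_size, count_nonzero_cb,
 *        &nonzero);
 */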
1130
1131 struct buf_arg {
1132 void *arg_buf;
1133 };
1134
1135 static int
1136 abd_copy_to_buf_off_cb(void *buf, size_t size, void *private)
1137 {
1138 struct buf_arg *ba_ptr = private;
1139
1140 (void) memcpy(ba_ptr->arg_buf, buf, size);
1141 ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
1142
1143 return (0);
1144 }
1145
1146 /*
1147 * Copy abd to buf. (off is the offset in abd.)
1148 */
1149 void
1150 abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size)
1151 {
1152 struct buf_arg ba_ptr = { buf };
1153
1154 (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb,
1155 &ba_ptr);
1156 }
1157
1158 static int
1159 abd_cmp_buf_off_cb(void *buf, size_t size, void *private)
1160 {
1161 int ret;
1162 struct buf_arg *ba_ptr = private;
1163
1164 ret = memcmp(buf, ba_ptr->arg_buf, size);
1165 ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
1166
1167 return (ret);
1168 }
1169
1170 /*
1171 * Compare the contents of abd to buf. (off is the offset in abd.)
1172 */
1173 int
1174 abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
1175 {
1176 struct buf_arg ba_ptr = { (void *) buf };
1177
1178 return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr));
1179 }
1180
1181 static int
1182 abd_copy_from_buf_off_cb(void *buf, size_t size, void *private)
1183 {
1184 struct buf_arg *ba_ptr = private;
1185
1186 (void) memcpy(buf, ba_ptr->arg_buf, size);
1187 ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size;
1188
1189 return (0);
1190 }
1191
1192 /*
1193 * Copy from buf to abd. (off is the offset in abd.)
1194 */
1195 void
1196 abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size)
1197 {
1198 struct buf_arg ba_ptr = { (void *) buf };
1199
1200 (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb,
1201 &ba_ptr);
1202 }
1203
1204 /*ARGSUSED*/
1205 static int
1206 abd_zero_off_cb(void *buf, size_t size, void *private)
1207 {
1208 (void) memset(buf, 0, size);
1209 return (0);
1210 }
1211
1212 /*
1213 * Zero out the abd from a particular offset to the end.
1214 */
1215 void
1216 abd_zero_off(abd_t *abd, size_t off, size_t size)
1217 {
1218 (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL);
1219 }
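/*
 * Illustrative sketch (not upstream code) combining the offset variants
 * above; the 512-byte region and offsets are assumptions for the example:
 *
 *    char tmp[512];
 *
 *    abd_copy_to_buf_off(tmp, abd, 4096, sizeof (tmp));     abd -> tmp
 *    abd_copy_from_buf_off(abd, tmp, 8192, sizeof (tmp));   tmp -> abd
 *    abd_zero_off(abd, 4096, sizeof (tmp));
 *    VERIFY0(abd_cmp_buf_off(abd, tmp, 8192, sizeof (tmp)));
 */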
1220
1221 /*
1222 * Iterate over two ABDs and call func incrementally on the two ABDs' data in
1223 * equal-sized chunks (passed to func as raw buffers). func could be called many
1224 * times during this iteration.
1225 */
1226 int
1227 abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff,
1228 size_t size, abd_iter_func2_t *func, void *private)
1229 {
1230 int ret = 0;
1231 struct abd_iter daiter, saiter;
1232
1233 abd_verify(dabd);
1234 abd_verify(sabd);
1235
1236 ASSERT3U(doff + size, <=, dabd->abd_size);
1237 ASSERT3U(soff + size, <=, sabd->abd_size);
1238
1239 abd_iter_init(&daiter, dabd, 0);
1240 abd_iter_init(&saiter, sabd, 1);
1241 abd_iter_advance(&daiter, doff);
1242 abd_iter_advance(&saiter, soff);
1243
1244 while (size > 0) {
1245 size_t dlen, slen, len;
1246 abd_iter_map(&daiter);
1247 abd_iter_map(&saiter);
1248
1249 dlen = MIN(daiter.iter_mapsize, size);
1250 slen = MIN(saiter.iter_mapsize, size);
1251 len = MIN(dlen, slen);
1252 ASSERT(dlen > 0 || slen > 0);
1253
1254 ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len,
1255 private);
1256
1257 abd_iter_unmap(&saiter);
1258 abd_iter_unmap(&daiter);
1259
1260 if (ret != 0)
1261 break;
1262
1263 size -= len;
1264 abd_iter_advance(&daiter, len);
1265 abd_iter_advance(&saiter, len);
1266 }
1267
1268 return (ret);
1269 }
1270
1271 /*ARGSUSED*/
1272 static int
1273 abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private)
1274 {
1275 (void) memcpy(dbuf, sbuf, size);
1276 return (0);
1277 }
1278
1279 /*
1280 * Copy from sabd to dabd starting from soff and doff.
1281 */
1282 void
1283 abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size)
1284 {
1285 (void) abd_iterate_func2(dabd, sabd, doff, soff, size,
1286 abd_copy_off_cb, NULL);
1287 }
1288
1289 /*ARGSUSED*/
1290 static int
1291 abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private)
1292 {
1293 return (memcmp(bufa, bufb, size));
1294 }
1295
1296 /*
1297 * Compares the contents of two ABDs.
1298 */
1299 int
1300 abd_cmp(abd_t *dabd, abd_t *sabd)
1301 {
1302 ASSERT3U(dabd->abd_size, ==, sabd->abd_size);
1303 return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size,
1304 abd_cmp_cb, NULL));
1305 }
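/*
 * Illustrative sketch (not upstream code): duplicating an ABD's contents
 * into a second ABD of the same format and verifying the copy.
 *
 *    abd_t *copy = abd_alloc_sametype(abd, abd->abd_size);
 *
 *    abd_copy_off(copy, abd, 0, 0, abd->abd_size);
 *    VERIFY0(abd_cmp(copy, abd));
 *    abd_free(copy);
 */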
1306
1307 /*
1308 * Iterate over code ABDs and a data ABD and call @func_raidz_gen.
1309 *
1310 * @cabds parity ABDs, must have equal size
1311 * @dabd data ABD. Can be NULL (in this case @dsize = 0)
1312  * @func_raidz_gen should be implemented so that it behaves the same
1313  * whether it is handed linear or scatter buffers
1314 */
1315 void
1316 abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd,
1317 ssize_t csize, ssize_t dsize, const unsigned parity,
1318 void (*func_raidz_gen)(void **, const void *, size_t, size_t))
1319 {
1320 int i;
1321 ssize_t len, dlen;
1322 struct abd_iter caiters[3];
1323 struct abd_iter daiter = {0};
1324 void *caddrs[3];
1325 unsigned long flags;
1326
1327 ASSERT3U(parity, <=, 3);
1328
1329 for (i = 0; i < parity; i++)
1330 abd_iter_init(&caiters[i], cabds[i], i);
1331
1332 if (dabd)
1333 abd_iter_init(&daiter, dabd, i);
1334
1335 ASSERT3S(dsize, >=, 0);
1336
1337 local_irq_save(flags);
1338 while (csize > 0) {
1339 len = csize;
1340
1341 if (dabd && dsize > 0)
1342 abd_iter_map(&daiter);
1343
1344 for (i = 0; i < parity; i++) {
1345 abd_iter_map(&caiters[i]);
1346 caddrs[i] = caiters[i].iter_mapaddr;
1347 }
1348
1349 switch (parity) {
1350 case 3:
1351 len = MIN(caiters[2].iter_mapsize, len);
1352 case 2:
1353 len = MIN(caiters[1].iter_mapsize, len);
1354 case 1:
1355 len = MIN(caiters[0].iter_mapsize, len);
1356 }
1357
1358 /* must be progressive */
1359 ASSERT3S(len, >, 0);
1360
1361 if (dabd && dsize > 0) {
1362 /* this needs precise iter.length */
1363 len = MIN(daiter.iter_mapsize, len);
1364 dlen = len;
1365 } else
1366 dlen = 0;
1367
1368 /* must be progressive */
1369 ASSERT3S(len, >, 0);
1370 /*
1371  * The iterated function likely will not do well unless each
1372  * segment, except possibly the last one, is a multiple of 512 (raidz).
1373 */
1374 ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
1375
1376 func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen);
1377
1378 for (i = parity-1; i >= 0; i--) {
1379 abd_iter_unmap(&caiters[i]);
1380 abd_iter_advance(&caiters[i], len);
1381 }
1382
1383 if (dabd && dsize > 0) {
1384 abd_iter_unmap(&daiter);
1385 abd_iter_advance(&daiter, dlen);
1386 dsize -= dlen;
1387 }
1388
1389 csize -= len;
1390
1391 ASSERT3S(dsize, >=, 0);
1392 ASSERT3S(csize, >=, 0);
1393 }
1394 local_irq_restore(flags);
1395 }
1396
1397 /*
1398 * Iterate over code ABDs and data reconstruction target ABDs and call
1399 * @func_raidz_rec. Function maps at most 6 pages atomically.
1400 *
1401 * @cabds parity ABDs, must have equal size
1402 * @tabds rec target ABDs, at most 3
1403 * @tsize size of data target columns
1404 * @func_raidz_rec expects syndrome data in target columns. Function
1405 * reconstructs data and overwrites target columns.
1406 */
1407 void
1408 abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds,
1409 ssize_t tsize, const unsigned parity,
1410 void (*func_raidz_rec)(void **t, const size_t tsize, void **c,
1411 const unsigned *mul),
1412 const unsigned *mul)
1413 {
1414 int i;
1415 ssize_t len;
1416 struct abd_iter citers[3];
1417 struct abd_iter xiters[3];
1418 void *caddrs[3], *xaddrs[3];
1419 unsigned long flags;
1420
1421 ASSERT3U(parity, <=, 3);
1422
1423 for (i = 0; i < parity; i++) {
1424 abd_iter_init(&citers[i], cabds[i], 2*i);
1425 abd_iter_init(&xiters[i], tabds[i], 2*i+1);
1426 }
1427
1428 local_irq_save(flags);
1429 while (tsize > 0) {
1430
1431 for (i = 0; i < parity; i++) {
1432 abd_iter_map(&citers[i]);
1433 abd_iter_map(&xiters[i]);
1434 caddrs[i] = citers[i].iter_mapaddr;
1435 xaddrs[i] = xiters[i].iter_mapaddr;
1436 }
1437
1438 len = tsize;
1439 switch (parity) {
1440 case 3:
1441 len = MIN(xiters[2].iter_mapsize, len);
1442 len = MIN(citers[2].iter_mapsize, len);
1443 case 2:
1444 len = MIN(xiters[1].iter_mapsize, len);
1445 len = MIN(citers[1].iter_mapsize, len);
1446 case 1:
1447 len = MIN(xiters[0].iter_mapsize, len);
1448 len = MIN(citers[0].iter_mapsize, len);
1449 }
1450 /* must be progressive */
1451 ASSERT3S(len, >, 0);
1452 /*
1453  * The iterated function likely will not do well unless each
1454  * segment, except possibly the last one, is a multiple of 512 (raidz).
1455 */
1456 ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
1457
1458 func_raidz_rec(xaddrs, len, caddrs, mul);
1459
1460 for (i = parity-1; i >= 0; i--) {
1461 abd_iter_unmap(&xiters[i]);
1462 abd_iter_unmap(&citers[i]);
1463 abd_iter_advance(&xiters[i], len);
1464 abd_iter_advance(&citers[i], len);
1465 }
1466
1467 tsize -= len;
1468 ASSERT3S(tsize, >=, 0);
1469 }
1470 local_irq_restore(flags);
1471 }
1472
1473 #if defined(_KERNEL) && defined(HAVE_SPL)
1474 /*
1475 * bio_nr_pages for ABD.
1476 * @off is the offset in @abd
1477 */
1478 unsigned long
1479 abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off)
1480 {
1481 unsigned long pos;
1482
1483 if (abd_is_linear(abd))
1484 pos = (unsigned long)abd_to_buf(abd) + off;
1485 else
1486 pos = abd->abd_u.abd_scatter.abd_offset + off;
1487
1488 return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) -
1489 (pos >> PAGE_SHIFT);
1490 }
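/*
 * Worked example (illustrative, assuming a 4K PAGESIZE and abd_offset = 0):
 * a 4096-byte I/O at off = 100 spans positions 100 through 4195, touching
 * pages 0 and 1, so this returns 2; the same I/O at off = 0 returns 1.
 */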
1491
1492 /*
1493 * bio_map for scatter ABD.
1494 * @off is the offset in @abd
1495 * Remaining IO size is returned
1496 */
1497 unsigned int
1498 abd_scatter_bio_map_off(struct bio *bio, abd_t *abd,
1499 unsigned int io_size, size_t off)
1500 {
1501 int i;
1502 struct abd_iter aiter;
1503
1504 ASSERT(!abd_is_linear(abd));
1505 ASSERT3U(io_size, <=, abd->abd_size - off);
1506
1507 abd_iter_init(&aiter, abd, 0);
1508 abd_iter_advance(&aiter, off);
1509
1510 for (i = 0; i < bio->bi_max_vecs; i++) {
1511 struct page *pg;
1512 size_t len, sgoff, pgoff;
1513 struct scatterlist *sg;
1514
1515 if (io_size <= 0)
1516 break;
1517
1518 sg = aiter.iter_sg;
1519 sgoff = aiter.iter_offset;
1520 pgoff = sgoff & (PAGESIZE - 1);
1521 len = MIN(io_size, PAGESIZE - pgoff);
1522 ASSERT(len > 0);
1523
1524 pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT);
1525 if (bio_add_page(bio, pg, len, pgoff) != len)
1526 break;
1527
1528 io_size -= len;
1529 abd_iter_advance(&aiter, len);
1530 }
1531
1532 return (io_size);
1533 }
1534
1535 /* Tunable Parameters */
1536 module_param(zfs_abd_scatter_enabled, int, 0644);
1537 MODULE_PARM_DESC(zfs_abd_scatter_enabled,
1538 "Toggle whether ABD allocations must be linear.");
1539 /* CSTYLED */
1540 module_param(zfs_abd_scatter_max_order, uint, 0644);
1541 MODULE_PARM_DESC(zfs_abd_scatter_max_order,
1542 "Maximum order allocation used for a scatter ABD.");
1543 #endif