module/zfs/vdev_raidz.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  25  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
  26  */
  27
  28 #include <sys/zfs_context.h>
  29 #include <sys/spa.h>
  30 #include <sys/vdev_impl.h>
  31 #include <sys/zio.h>
  32 #include <sys/zio_checksum.h>
  33 #include <sys/abd.h>
  34 #include <sys/fs/zfs.h>
  35 #include <sys/fm/fs/zfs.h>
  36 #include <sys/vdev_raidz.h>
  37 #include <sys/vdev_raidz_impl.h>
  38
  39 /*
  40  * Virtual device vector for RAID-Z.
  41  *
  42  * This vdev supports single, double, and triple parity. For single parity,
  43  * we use a simple XOR of all the data columns. For double or triple parity,
  44  * we use a special case of Reed-Solomon coding. This extends the
  45  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  46  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  47  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  48  * former is also based. The latter is designed to provide higher performance
  49  * for writes.
  50  *
  51  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  52  * amended six years later identifying a critical flaw that invalidates its
  53  * claims. Nevertheless, the technique can be adapted to work for up to
  54  * triple parity. For additional parity, the amendment "Note: Correction to
  55  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  56  * is viable, but the additional complexity means that write performance will
  57  * suffer.
  58  *
  59  * All of the methods above operate on a Galois field, defined over the
  60  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
  61  * can be expressed with a single byte. Briefly, the operations on the
  62  * field are defined as follows:
  63  *
  64  *   o addition (+) is represented by a bitwise XOR
  65  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  66  *   o multiplication of A by 2 is defined by the following bitwise expression:
  67  *
  68  *      (A * 2)_7 = A_6
  69  *      (A * 2)_6 = A_5
  70  *      (A * 2)_5 = A_4
  71  *      (A * 2)_4 = A_3 + A_7
  72  *      (A * 2)_3 = A_2 + A_7
  73  *      (A * 2)_2 = A_1 + A_7
  74  *      (A * 2)_1 = A_0
  75  *      (A * 2)_0 = A_7
  76  *
  77  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  78  * As an aside, this multiplication is derived from the error correcting
  79  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  80  *
  81  * Observe that any number in the field (except for 0) can be expressed as a
  82  * power of 2 -- a generator for the field. We store a table of the powers of
  83  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  84  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  85  * than field addition). The inverse of a field element A (A^-1) is therefore
  86  * A ^ (255 - 1) = A^254.
  87  *
  88  * The up-to-three parity columns, P, Q, R over several data columns,
  89  * D_0, ... D_n-1, can be expressed by field operations:
  90  *
  91  *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
  92  *      Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  93  *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  94  *      R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
  95  *        = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  96  *
  97  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
  98  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
  99  * independent coefficients. (There are no additional coefficients that have
 100  * this property which is why the uncorrected Plank method breaks down.)
 101  *
 102  * See the reconstruction code below for how P, Q and R can be used individually
 103  * or in concert to recover missing data columns.
 104  */
 105
 106 #define VDEV_RAIDZ_P            0
 107 #define VDEV_RAIDZ_Q            1
 108 #define VDEV_RAIDZ_R            2
 109
 110 #define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 111 #define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 112
 113 /*
 114  * We provide a mechanism to perform the field multiplication operation on a
 115  * 64-bit value all at once rather than a byte at a time. This works by
 116  * creating a mask from the top bit in each byte and using that to
 117  * conditionally apply the XOR of 0x1d.
 118  */
 119 #define VDEV_RAIDZ_64MUL_2(x, mask) \
 120 { \
 121         (mask) = (x) & 0x8080808080808080ULL; \
 122         (mask) = ((mask) << 1) - ((mask) >> 7); \
 123         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 124             ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
 125 }
 126
 127 #define VDEV_RAIDZ_64MUL_4(x, mask) \
 128 { \
 129         VDEV_RAIDZ_64MUL_2((x), mask); \
 130         VDEV_RAIDZ_64MUL_2((x), mask); \
 131 }
 132
 133 void
 134 vdev_raidz_map_free(raidz_map_t *rm)
 135 {
 136         int c;
 137
 138         for (c = 0; c < rm->rm_firstdatacol; c++) {
 139                 abd_free(rm->rm_col[c].rc_abd);
 140
 141                 if (rm->rm_col[c].rc_gdata != NULL)
 142                         abd_free(rm->rm_col[c].rc_gdata);
 143         }
 144
 145         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 146                 abd_put(rm->rm_col[c].rc_abd);
 147
 148         if (rm->rm_abd_copy != NULL)
 149                 abd_free(rm->rm_abd_copy);
 150
 151         kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 152 }
 153
 154 static void
 155 vdev_raidz_map_free_vsd(zio_t *zio)
 156 {
 157         raidz_map_t *rm = zio->io_vsd;
 158
 159         ASSERT0(rm->rm_freed);
 160         rm->rm_freed = 1;
 161
 162         if (rm->rm_reports == 0)
 163                 vdev_raidz_map_free(rm);
 164 }
 165
 166 /*ARGSUSED*/
 167 static void
 168 vdev_raidz_cksum_free(void *arg, size_t ignored)
 169 {
 170         raidz_map_t *rm = arg;
 171
 172         ASSERT3U(rm->rm_reports, >, 0);
 173
 174         if (--rm->rm_reports == 0 && rm->rm_freed != 0)
 175                 vdev_raidz_map_free(rm);
 176 }
 177
/*
 * zcr_finish callback (installed by vdev_raidz_cksum_report()): complete a
 * checksum ereport for column zcr->zcr_cbinfo of the raidz map stored in
 * zcr->zcr_cbdata.
 *
 * good_data is the known-good logical data, or NULL when none is available,
 * in which case the report is finished without a good/bad comparison.  For a
 * data column the matching slice of good_data is compared against the
 * column's buffer.  For a parity column the expected parity is first
 * generated from good_data and cached in rc_gdata.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
{
        raidz_map_t *rm = zcr->zcr_cbdata;
        const size_t c = zcr->zcr_cbinfo;
        size_t x, offset;

        const abd_t *good = NULL;
        const abd_t *bad = rm->rm_col[c].rc_abd;

        if (good_data == NULL) {
                zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
                return;
        }

        if (c < rm->rm_firstdatacol) {
                /*
                 * The first time through, calculate the parity blocks for
                 * the good data (this relies on the fact that the good
                 * data never changes for a given logical ZIO)
                 */
                if (rm->rm_col[0].rc_gdata == NULL) {
                        abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];

                        /*
                         * Set up the rm_col[]s to generate the parity for
                         * good_data, first saving the parity bufs and
                         * replacing them with buffers to hold the result.
                         * The new buffers stay referenced by rc_gdata.
                         */
                        for (x = 0; x < rm->rm_firstdatacol; x++) {
                                bad_parity[x] = rm->rm_col[x].rc_abd;
                                rm->rm_col[x].rc_abd =
                                    rm->rm_col[x].rc_gdata =
                                    abd_alloc_sametype(rm->rm_col[x].rc_abd,
                                    rm->rm_col[x].rc_size);
                        }

                        /*
                         * fill in the data columns from good_data; x
                         * continues from rm_firstdatacol here
                         */
                        offset = 0;
                        for (; x < rm->rm_cols; x++) {
                                abd_put(rm->rm_col[x].rc_abd);

                                rm->rm_col[x].rc_abd =
                                    abd_get_offset_size((abd_t *)good_data,
                                    offset, rm->rm_col[x].rc_size);
                                offset += rm->rm_col[x].rc_size;
                        }

                        /*
                         * Construct the parity from the good data.
                         */
                        vdev_raidz_generate_parity(rm);

                        /* restore everything back to its original state */
                        for (x = 0; x < rm->rm_firstdatacol; x++)
                                rm->rm_col[x].rc_abd = bad_parity[x];

                        /*
                         * Point the data columns back at the private copy
                         * of what was read (rm_abd_copy).
                         */
                        offset = 0;
                        for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
                                abd_put(rm->rm_col[x].rc_abd);
                                rm->rm_col[x].rc_abd = abd_get_offset_size(
                                    rm->rm_abd_copy, offset,
                                    rm->rm_col[x].rc_size);
                                offset += rm->rm_col[x].rc_size;
                        }
                }

                ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
                good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0,
                    rm->rm_col[c].rc_size);
        } else {
                /* adjust good_data to point at the start of our column */
                offset = 0;
                for (x = rm->rm_firstdatacol; x < c; x++)
                        offset += rm->rm_col[x].rc_size;

                good = abd_get_offset_size((abd_t *)good_data, offset,
                    rm->rm_col[c].rc_size);
        }

        /* we drop the ereport if it ends up that the data was good */
        zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
        /* release the temporary view created above */
        abd_put((abd_t *)good);
}
 262
 263 /*
 264  * Invoked indirectly by zfs_ereport_start_checksum(), called
 265  * below when our read operation fails completely.  The main point
 266  * is to keep a copy of everything we read from disk, so that at
 267  * vdev_raidz_cksum_finish() time we can compare it with the good data.
 268  */
 269 static void
 270 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
 271 {
 272         size_t c = (size_t)(uintptr_t)arg;
 273         size_t offset;
 274
 275         raidz_map_t *rm = zio->io_vsd;
 276         size_t size;
 277
 278         /* set up the report and bump the refcount  */
 279         zcr->zcr_cbdata = rm;
 280         zcr->zcr_cbinfo = c;
 281         zcr->zcr_finish = vdev_raidz_cksum_finish;
 282         zcr->zcr_free = vdev_raidz_cksum_free;
 283
 284         rm->rm_reports++;
 285         ASSERT3U(rm->rm_reports, >, 0);
 286
 287         if (rm->rm_abd_copy != NULL)
 288                 return;
 289
 290         /*
 291          * It's the first time we're called for this raidz_map_t, so we need
 292          * to copy the data aside; there's no guarantee that our zio's buffer
 293          * won't be re-used for something else.
 294          *
 295          * Our parity data is already in separate buffers, so there's no need
 296          * to copy them.
 297          */
 298
 299         size = 0;
 300         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 301                 size += rm->rm_col[c].rc_size;
 302
 303         rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE);
 304
 305         for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 306                 raidz_col_t *col = &rm->rm_col[c];
 307                 abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset,
 308                     col->rc_size);
 309
 310                 abd_copy(tmp, col->rc_abd, col->rc_size);
 311
 312                 abd_put(col->rc_abd);
 313                 col->rc_abd = tmp;
 314
 315                 offset += col->rc_size;
 316         }
 317         ASSERT3U(offset, ==, size);
 318 }
 319
 320 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 321         .vsd_free = vdev_raidz_map_free_vsd,
 322         .vsd_cksum_report = vdev_raidz_cksum_report
 323 };
 324
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Builds the raidz_map_t describing which child (rc_devidx), at which
 * offset (rc_offset) and with what size (rc_size) each column of the zio
 * lands, attaches it to zio->io_vsd and returns it.  ashift is the log2
 * of the sector size used for the layout, and nparity the number of parity
 * columns.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
        raidz_map_t *rm;
        /* The starting RAIDZ (parent) vdev sector of the block. */
        uint64_t b = zio->io_offset >> ashift;
        /* The zio's size in units of the vdev's minimum sector size. */
        uint64_t s = zio->io_size >> ashift;
        /* The first column for this stripe. */
        uint64_t f = b % dcols;
        /* The starting byte offset on each child vdev. */
        uint64_t o = (b / dcols) << ashift;
        uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
        uint64_t off = 0;

        /*
         * "Quotient": The number of data sectors for this stripe on all but
         * the "big column" child vdevs that also contain "remainder" data.
         */
        q = s / (dcols - nparity);

        /*
         * "Remainder": The number of partial stripe data sectors in this I/O.
         * This will add a sector to some, but not all, child vdevs.
         */
        r = s - q * (dcols - nparity);

        /* The number of "big columns" - those which contain remainder data. */
        bc = (r == 0 ? 0 : r + nparity);

        /*
         * The total number of data and parity sectors associated with
         * this I/O.
         */
        tot = s + nparity * (q + (r == 0 ? 0 : 1));

        /* acols: The columns that will be accessed. */
        /* scols: The columns that will be accessed or skipped. */
        if (q == 0) {
                /* Our I/O request doesn't span all child vdevs. */
                acols = bc;
                scols = MIN(dcols, roundup(bc, nparity + 1));
        } else {
                acols = dcols;
                scols = dcols;
        }

        ASSERT3U(acols, <=, scols);

        /* The map ends in an array with one entry per scols column. */
        rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

        rm->rm_cols = acols;
        rm->rm_scols = scols;
        rm->rm_bigcols = bc;
        rm->rm_skipstart = bc;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;
        rm->rm_firstdatacol = nparity;
        rm->rm_abd_copy = NULL;
        rm->rm_reports = 0;
        rm->rm_freed = 0;
        rm->rm_ecksuminjected = 0;

        asize = 0;

        for (c = 0; c < scols; c++) {
                col = f + c;
                coff = o;
                /* Wrapped past the last child: continue on the next row. */
                if (col >= dcols) {
                        col -= dcols;
                        coff += 1ULL << ashift;
                }
                rm->rm_col[c].rc_devidx = col;
                rm->rm_col[c].rc_offset = coff;
                rm->rm_col[c].rc_abd = NULL;
                rm->rm_col[c].rc_gdata = NULL;
                rm->rm_col[c].rc_error = 0;
                rm->rm_col[c].rc_tried = 0;
                rm->rm_col[c].rc_skipped = 0;

                /*
                 * Skipped columns carry no data, big columns carry one
                 * extra sector, all others carry q sectors.
                 */
                if (c >= acols)
                        rm->rm_col[c].rc_size = 0;
                else if (c < bc)
                        rm->rm_col[c].rc_size = (q + 1) << ashift;
                else
                        rm->rm_col[c].rc_size = q << ashift;

                asize += rm->rm_col[c].rc_size;
        }

        ASSERT3U(asize, ==, tot << ashift);
        /* Allocated size is rounded up to a multiple of (nparity + 1). */
        rm->rm_asize = roundup(asize, (nparity + 1) << ashift);
        rm->rm_nskip = roundup(tot, nparity + 1) - tot;
        ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift);
        ASSERT3U(rm->rm_nskip, <=, nparity);

        /* Parity columns get freshly allocated linear buffers. */
        for (c = 0; c < rm->rm_firstdatacol; c++)
                rm->rm_col[c].rc_abd =
                    abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE);

        /*
         * Data columns are consecutive slices of the zio's own buffer;
         * c is rm_firstdatacol at this point.
         */
        rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0,
            rm->rm_col[c].rc_size);
        off = rm->rm_col[c].rc_size;

        for (c = c + 1; c < acols; c++) {
                rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off,
                    rm->rm_col[c].rc_size);
                off += rm->rm_col[c].rc_size;
        }

        /*
         * If all data stored spans all columns, there's a danger that parity
         * will always be on the same device and, since parity isn't read
         * during normal operation, that that device's I/O bandwidth won't be
         * used effectively. We therefore switch the parity every 1MB.
         *
         * ... at least that was, ostensibly, the theory. As a practical
         * matter unless we juggle the parity between all devices evenly, we
         * won't see any benefit. Further, occasional writes that aren't a
         * multiple of the LCM of the number of children and the minimum
         * stripe width are sufficient to avoid pessimal behavior.
         * Unfortunately, this decision created an implicit on-disk format
         * requirement that we need to support for all eternity, but only
         * for single-parity RAID-Z.
         *
         * If we intend to skip a sector in the zeroth column for padding
         * we must make sure to note this swap. We will never intend to
         * skip the first column since at least one data and one parity
         * column must appear in each row.
         */
        ASSERT(rm->rm_cols >= 2);
        ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

        /* Swap the device/offset of columns 0 and 1 when bit 20 is set. */
        if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
                devidx = rm->rm_col[0].rc_devidx;
                o = rm->rm_col[0].rc_offset;
                rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
                rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
                rm->rm_col[1].rc_devidx = devidx;
                rm->rm_col[1].rc_offset = o;

                if (rm->rm_skipstart == 0)
                        rm->rm_skipstart = 1;
        }

        zio->io_vsd = rm;
        zio->io_vsd_ops = &vdev_raidz_vsd_ops;

        /* init RAIDZ parity ops */
        rm->rm_ops = vdev_raidz_math_get_ops();

        return (rm);
}
 486
/*
 * Cursor state handed to the vdev_raidz_p/pq/pqr_func() iterator callbacks.
 * Each non-NULL pointer is the current write position in the corresponding
 * parity buffer and is advanced by the callback as it consumes source words.
 * Initialised positionally as { p, q, r }, so the field order matters.
 */
struct pqr_struct {
        uint64_t *p;    /* P parity cursor; always set */
        uint64_t *q;    /* Q parity cursor; NULL for P-only generation */
        uint64_t *r;    /* R parity cursor; NULL unless generating P+Q+R */
};
 492
 493 static int
 494 vdev_raidz_p_func(void *buf, size_t size, void *private)
 495 {
 496         struct pqr_struct *pqr = private;
 497         const uint64_t *src = buf;
 498         int i, cnt = size / sizeof (src[0]);
 499
 500         ASSERT(pqr->p && !pqr->q && !pqr->r);
 501
 502         for (i = 0; i < cnt; i++, src++, pqr->p++)
 503                 *pqr->p ^= *src;
 504
 505         return (0);
 506 }
 507
 508 static int
 509 vdev_raidz_pq_func(void *buf, size_t size, void *private)
 510 {
 511         struct pqr_struct *pqr = private;
 512         const uint64_t *src = buf;
 513         uint64_t mask;
 514         int i, cnt = size / sizeof (src[0]);
 515
 516         ASSERT(pqr->p && pqr->q && !pqr->r);
 517
 518         for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
 519                 *pqr->p ^= *src;
 520                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 521                 *pqr->q ^= *src;
 522         }
 523
 524         return (0);
 525 }
 526
 527 static int
 528 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
 529 {
 530         struct pqr_struct *pqr = private;
 531         const uint64_t *src = buf;
 532         uint64_t mask;
 533         int i, cnt = size / sizeof (src[0]);
 534
 535         ASSERT(pqr->p && pqr->q && pqr->r);
 536
 537         for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
 538                 *pqr->p ^= *src;
 539                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 540                 *pqr->q ^= *src;
 541                 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
 542                 *pqr->r ^= *src;
 543         }
 544
 545         return (0);
 546 }
 547
 548 static void
 549 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 550 {
 551         uint64_t *p;
 552         int c;
 553         abd_t *src;
 554
 555         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 556                 src = rm->rm_col[c].rc_abd;
 557                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 558
 559                 if (c == rm->rm_firstdatacol) {
 560                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 561                 } else {
 562                         struct pqr_struct pqr = { p, NULL, NULL };
 563                         (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 564                             vdev_raidz_p_func, &pqr);
 565                 }
 566         }
 567 }
 568
 569 static void
 570 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 571 {
 572         uint64_t *p, *q, pcnt, ccnt, mask, i;
 573         int c;
 574         abd_t *src;
 575
 576         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 577         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 578             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 579
 580         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 581                 src = rm->rm_col[c].rc_abd;
 582                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 583                 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
 584
 585                 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
 586
 587                 if (c == rm->rm_firstdatacol) {
 588                         ASSERT(ccnt == pcnt || ccnt == 0);
 589                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 590                         (void) memcpy(q, p, rm->rm_col[c].rc_size);
 591
 592                         for (i = ccnt; i < pcnt; i++) {
 593                                 p[i] = 0;
 594                                 q[i] = 0;
 595                         }
 596                 } else {
 597                         struct pqr_struct pqr = { p, q, NULL };
 598
 599                         ASSERT(ccnt <= pcnt);
 600                         (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 601                             vdev_raidz_pq_func, &pqr);
 602
 603                         /*
 604                          * Treat short columns as though they are full of 0s.
 605                          * Note that there's therefore nothing needed for P.
 606                          */
 607                         for (i = ccnt; i < pcnt; i++) {
 608                                 VDEV_RAIDZ_64MUL_2(q[i], mask);
 609                         }
 610                 }
 611         }
 612 }
 613
 614 static void
 615 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
 616 {
 617         uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
 618         int c;
 619         abd_t *src;
 620
 621         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 622         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 623             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 624         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 625             rm->rm_col[VDEV_RAIDZ_R].rc_size);
 626
 627         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 628                 src = rm->rm_col[c].rc_abd;
 629                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 630                 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
 631                 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
 632
 633                 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
 634
 635                 if (c == rm->rm_firstdatacol) {
 636                         ASSERT(ccnt == pcnt || ccnt == 0);
 637                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 638                         (void) memcpy(q, p, rm->rm_col[c].rc_size);
 639                         (void) memcpy(r, p, rm->rm_col[c].rc_size);
 640
 641                         for (i = ccnt; i < pcnt; i++) {
 642                                 p[i] = 0;
 643                                 q[i] = 0;
 644                                 r[i] = 0;
 645                         }
 646                 } else {
 647                         struct pqr_struct pqr = { p, q, r };
 648
 649                         ASSERT(ccnt <= pcnt);
 650                         (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 651                             vdev_raidz_pqr_func, &pqr);
 652
 653                         /*
 654                          * Treat short columns as though they are full of 0s.
 655                          * Note that there's therefore nothing needed for P.
 656                          */
 657                         for (i = ccnt; i < pcnt; i++) {
 658                                 VDEV_RAIDZ_64MUL_2(q[i], mask);
 659                                 VDEV_RAIDZ_64MUL_4(r[i], mask);
 660                         }
 661                 }
 662         }
 663 }
 664
 665 /*
 666  * Generate RAID parity in the first virtual columns according to the number of
 667  * parity columns available.
 668  */
 669 void
 670 vdev_raidz_generate_parity(raidz_map_t *rm)
 671 {
 672         /* Generate using the new math implementation */
 673         if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL)
 674                 return;
 675
 676         switch (rm->rm_firstdatacol) {
 677         case 1:
 678                 vdev_raidz_generate_parity_p(rm);
 679                 break;
 680         case 2:
 681                 vdev_raidz_generate_parity_pq(rm);
 682                 break;
 683         case 3:
 684                 vdev_raidz_generate_parity_pqr(rm);
 685                 break;
 686         default:
 687                 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 688         }
 689 }
 690
 691 /* ARGSUSED */
 692 static int
 693 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
 694 {
 695         uint64_t *dst = dbuf;
 696         uint64_t *src = sbuf;
 697         int cnt = size / sizeof (src[0]);
 698         int i;
 699
 700         for (i = 0; i < cnt; i++) {
 701                 dst[i] ^= src[i];
 702         }
 703
 704         return (0);
 705 }
 706
 707 /* ARGSUSED */
 708 static int
 709 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
 710     void *private)
 711 {
 712         uint64_t *dst = dbuf;
 713         uint64_t *src = sbuf;
 714         uint64_t mask;
 715         int cnt = size / sizeof (dst[0]);
 716         int i;
 717
 718         for (i = 0; i < cnt; i++, dst++, src++) {
 719                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 720                 *dst ^= *src;
 721         }
 722
 723         return (0);
 724 }
 725
 726 /* ARGSUSED */
 727 static int
 728 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
 729 {
 730         uint64_t *dst = buf;
 731         uint64_t mask;
 732         int cnt = size / sizeof (dst[0]);
 733         int i;
 734
 735         for (i = 0; i < cnt; i++, dst++) {
 736                 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
 737                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 738         }
 739
 740         return (0);
 741 }
 742
/*
 * State handed to vdev_raidz_reconst_q_post_func().
 */
struct reconst_q_struct {
        uint64_t *q;    /* read cursor in the Q buffer; advanced per word */
        int exp;        /* exponent passed to vdev_raidz_exp2() per byte */
};
 747
 748 static int
 749 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
 750 {
 751         struct reconst_q_struct *rq = private;
 752         uint64_t *dst = buf;
 753         int cnt = size / sizeof (dst[0]);
 754         int i;
 755
 756         for (i = 0; i < cnt; i++, dst++, rq->q++) {
 757                 int j;
 758                 uint8_t *b;
 759
 760                 *dst ^= *rq->q;
 761                 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
 762                         *b = vdev_raidz_exp2(*b, rq->exp);
 763                 }
 764         }
 765
 766         return (0);
 767 }
 768
/*
 * State handed to vdev_raidz_reconst_pq_func() and
 * vdev_raidz_reconst_pq_tail_func().  The four byte cursors are advanced in
 * lock-step by those callbacks, one byte per reconstructed byte.
 */
struct reconst_pq_struct {
        uint8_t *p;     /* read cursor, XORed with pxy */
        uint8_t *q;     /* read cursor, XORed with qxy */
        uint8_t *pxy;   /* read cursor, XORed with p */
        uint8_t *qxy;   /* read cursor, XORed with q */
        int aexp;       /* exponent applied to (p ^ pxy) */
        int bexp;       /* exponent applied to (q ^ qxy) */
};
 777
 778 static int
 779 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
 780 {
 781         struct reconst_pq_struct *rpq = private;
 782         uint8_t *xd = xbuf;
 783         uint8_t *yd = ybuf;
 784         int i;
 785
 786         for (i = 0; i < size;
 787             i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
 788                 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
 789                     vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
 790                 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
 791         }
 792
 793         return (0);
 794 }
 795
 796 static int
 797 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
 798 {
 799         struct reconst_pq_struct *rpq = private;
 800         uint8_t *xd = xbuf;
 801         int i;
 802
 803         for (i = 0; i < size;
 804             i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
 805                 /* same operation as vdev_raidz_reconst_pq_func() on xd */
 806                 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
 807                     vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
 808         }
 809
 810         return (0);
 811 }
 812
 813 static int
 814 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 815 {
 816         int x = tgts[0];
 817         int c;
 818         abd_t *dst, *src;
 819
 820         ASSERT(ntgts == 1);
 821         ASSERT(x >= rm->rm_firstdatacol);
 822         ASSERT(x < rm->rm_cols);
 823
 824         ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
 825         ASSERT(rm->rm_col[x].rc_size > 0);
 826
 827         src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
 828         dst = rm->rm_col[x].rc_abd;
 829
 830         abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size);
 831
 832         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 833                 uint64_t size = MIN(rm->rm_col[x].rc_size,
 834                     rm->rm_col[c].rc_size);
 835
 836                 src = rm->rm_col[c].rc_abd;
 837                 dst = rm->rm_col[x].rc_abd;
 838
 839                 if (c == x)
 840                         continue;
 841
 842                 (void) abd_iterate_func2(dst, src, 0, 0, size,
 843                     vdev_raidz_reconst_p_func, NULL);
 844         }
 845
 846         return (1 << VDEV_RAIDZ_P);
 847 }
 848
 849 static int
 850 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
 851 {
 852         int x = tgts[0];
 853         int c, exp;
 854         abd_t *dst, *src;
 855         struct reconst_q_struct rq;
 856
 857         ASSERT(ntgts == 1);
 858
 859         ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 860
 861         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 862                 uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
 863                     rm->rm_col[c].rc_size);
 864
 865                 src = rm->rm_col[c].rc_abd;
 866                 dst = rm->rm_col[x].rc_abd;
 867
 868                 if (c == rm->rm_firstdatacol) {
 869                         abd_copy(dst, src, size);
 870                         if (rm->rm_col[x].rc_size > size)
 871                                 abd_zero_off(dst, size,
 872                                     rm->rm_col[x].rc_size - size);
 873
 874                 } else {
 875                         ASSERT3U(size, <=, rm->rm_col[x].rc_size);
 876                         (void) abd_iterate_func2(dst, src, 0, 0, size,
 877                             vdev_raidz_reconst_q_pre_func, NULL);
 878                         (void) abd_iterate_func(dst,
 879                             size, rm->rm_col[x].rc_size - size,
 880                             vdev_raidz_reconst_q_pre_tail_func, NULL);
 881                 }
 882         }
 883
 884         src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
 885         dst = rm->rm_col[x].rc_abd;
 886         exp = 255 - (rm->rm_cols - 1 - x);
 887         rq.q = abd_to_buf(src);
 888         rq.exp = exp;
 889
 890         (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
 891             vdev_raidz_reconst_q_post_func, &rq);
 892
 893         return (1 << VDEV_RAIDZ_Q);
 894 }
 895
 896 static int
 897 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
 898 {
 899         uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
 900         abd_t *pdata, *qdata;
 901         uint64_t xsize, ysize;
 902         int x = tgts[0];
 903         int y = tgts[1];
 904         abd_t *xd, *yd;
 905         struct reconst_pq_struct rpq;
 906
 907         ASSERT(ntgts == 2);
 908         ASSERT(x < y);
 909         ASSERT(x >= rm->rm_firstdatacol);
 910         ASSERT(y < rm->rm_cols);
 911
 912         ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
 913
 914         /*
 915          * Move the parity data aside -- we're going to compute parity as
 916          * though columns x and y were full of zeros -- Pxy and Qxy. We want to
 917          * reuse the parity generation mechanism without trashing the actual
 918          * parity so we make those columns appear to be full of zeros by
 919          * setting their lengths to zero.
 920          */
 921         pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
 922         qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
 923         xsize = rm->rm_col[x].rc_size;
 924         ysize = rm->rm_col[y].rc_size;
 925
 926         rm->rm_col[VDEV_RAIDZ_P].rc_abd =
 927             abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
 928         rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
 929             abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
 930         rm->rm_col[x].rc_size = 0;
 931         rm->rm_col[y].rc_size = 0;
 932
 933         vdev_raidz_generate_parity_pq(rm);
 934
 935         rm->rm_col[x].rc_size = xsize;
 936         rm->rm_col[y].rc_size = ysize;
 937
 938         p = abd_to_buf(pdata);
 939         q = abd_to_buf(qdata);
 940         pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 941         qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
 942         xd = rm->rm_col[x].rc_abd;
 943         yd = rm->rm_col[y].rc_abd;
 944
 945         /*
 946          * We now have:
 947          *      Pxy = P + D_x + D_y
 948          *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
 949          *
 950          * We can then solve for D_x:
 951          *      D_x = A * (P + Pxy) + B * (Q + Qxy)
 952          * where
 953          *      A = 2^(x - y) * (2^(x - y) + 1)^-1
 954          *      B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
 955          *
 956          * With D_x in hand, we can easily solve for D_y:
 957          *      D_y = P + Pxy + D_x
 958          */
 959
 960         a = vdev_raidz_pow2[255 + x - y];
 961         b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
 962         tmp = 255 - vdev_raidz_log2[a ^ 1];
 963
 964         aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
 965         bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
 966
 967         ASSERT3U(xsize, >=, ysize);
 968         rpq.p = p;
 969         rpq.q = q;
 970         rpq.pxy = pxy;
 971         rpq.qxy = qxy;
 972         rpq.aexp = aexp;
 973         rpq.bexp = bexp;
 974
 975         (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
 976             vdev_raidz_reconst_pq_func, &rpq);
 977         (void) abd_iterate_func(xd, ysize, xsize - ysize,
 978             vdev_raidz_reconst_pq_tail_func, &rpq);
 979
 980         abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 981         abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
 982
 983         /*
 984          * Restore the saved parity data.
 985          */
 986         rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
 987         rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;
 988
 989         return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
 990 }
 991
 992 /* BEGIN CSTYLED */
 993 /*
 994  * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 996  * the contents of the data and parity disks. This can be expressed with
 997  * vectors for the original data (D) and the actual data (d) and parity (p)
 998  * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 999  *
1000  *            __   __                     __     __
1001  *            |     |         __     __   |  p_0  |
1002  *            |  V  |         |  D_0  |   | p_m-1 |
1003  *            |     |    x    |   :   | = |  d_0  |
1004  *            |  I  |         | D_n-1 |   |   :   |
1005  *            |     |         ~~     ~~   | d_n-1 |
1006  *            ~~   ~~                     ~~     ~~
1007  *
1008  * I is simply a square identity matrix of size n, and V is a vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
1010  * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
1011  * computation as well as linear separability.
1012  *
1013  *      __               __               __     __
1014  *      |   1   ..  1 1 1 |               |  p_0  |
1015  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1016  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1017  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1018  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1019  *      |   :       : : : |   |   :   |   |  d_2  |
1020  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1021  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1022  *      |   0   ..  0 0 1 |               | d_n-1 |
1023  *      ~~               ~~               ~~     ~~
1024  *
1025  * Note that I, V, d, and p are known. To compute D, we must invert the
1026  * matrix and use the known data and parity values to reconstruct the unknown
1027  * data values. We begin by removing the rows in V|I and d|p that correspond
1028  * to failed or missing columns; we then make V|I square (n x n) and d|p
1029  * sized n by removing rows corresponding to unused parity from the bottom up
1030  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1031  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1032  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1033  *           __                               __
1034  *           |  1   1   1   1   1   1   1   1  |
1035  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1036  *           |  19 205 116  29  64  16  4   1  |      / /
1037  *           |  1   0   0   0   0   0   0   0  |     / /
1038  *           |  0   1   0   0   0   0   0   0  | <--' /
1039  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1040  *           |  0   0   0   1   0   0   0   0  |
1041  *           |  0   0   0   0   1   0   0   0  |
1042  *           |  0   0   0   0   0   1   0   0  |
1043  *           |  0   0   0   0   0   0   1   0  |
1044  *           |  0   0   0   0   0   0   0   1  |
1045  *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
1059  *
1060  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1061  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1062  * matrix is not singular.
1063  * __                                                                 __
1064  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1065  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1066  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1067  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1068  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1069  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1070  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1071  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1072  * ~~                                                                 ~~
1073  * __                                                                 __
1074  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1075  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1076  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1077  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1078  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1079  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1080  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1081  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1082  * ~~                                                                 ~~
1083  * __                                                                 __
1084  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1085  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1086  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1087  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1088  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1089  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1090  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1091  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1092  * ~~                                                                 ~~
1093  * __                                                                 __
1094  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1095  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1096  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1097  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1098  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1099  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1100  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1101  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1102  * ~~                                                                 ~~
1103  * __                                                                 __
1104  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1105  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1106  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1107  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1108  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1109  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1110  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1111  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1112  * ~~                                                                 ~~
1113  * __                                                                 __
1114  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1115  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1116  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1117  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1118  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1119  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1120  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1121  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1122  * ~~                                                                 ~~
1123  *                   __                               __
1124  *                   |  0   0   1   0   0   0   0   0  |
1125  *                   | 167 100  5   41 159 169 217 208 |
1126  *                   | 166 100  4   40 158 168 216 209 |
1127  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1128  *                   |  0   0   0   0   1   0   0   0  |
1129  *                   |  0   0   0   0   0   1   0   0  |
1130  *                   |  0   0   0   0   0   0   1   0  |
1131  *                   |  0   0   0   0   0   0   0   1  |
1132  *                   ~~                               ~~
1133  *
1134  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1135  * of the missing data.
1136  *
1137  * As is apparent from the example above, the only non-trivial rows in the
1138  * inverse matrix correspond to the data disks that we're trying to
1139  * reconstruct. Indeed, those are the only rows we need as the others would
1140  * only be useful for reconstructing data known or assumed to be valid. For
1141  * that reason, we only build the coefficients in the rows that correspond to
1142  * targeted columns.
1143  */
1144 /* END CSTYLED */
1145
1146 static void
1147 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1148     uint8_t **rows)
1149 {
1150         int i, j;
1151         int pow;
1152
1153         ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1154
1155         /*
1156          * Fill in the missing rows of interest.
1157          */
1158         for (i = 0; i < nmap; i++) {
1159                 ASSERT3S(0, <=, map[i]);
1160                 ASSERT3S(map[i], <=, 2);
1161
1162                 pow = map[i] * n;
1163                 if (pow > 255)
1164                         pow -= 255;
1165                 ASSERT(pow <= 255);
1166
1167                 for (j = 0; j < n; j++) {
1168                         pow -= map[i];
1169                         if (pow < 0)
1170                                 pow += 255;
1171                         rows[i][j] = vdev_raidz_pow2[pow];
1172                 }
1173         }
1174 }
1175
1176 static void
1177 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1178     uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1179 {
1180         int i, j, ii, jj;
1181         uint8_t log;
1182
1183         /*
1184          * Assert that the first nmissing entries from the array of used
1185          * columns correspond to parity columns and that subsequent entries
1186          * correspond to data columns.
1187          */
1188         for (i = 0; i < nmissing; i++) {
1189                 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1190         }
1191         for (; i < n; i++) {
1192                 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1193         }
1194
1195         /*
1196          * First initialize the storage where we'll compute the inverse rows.
1197          */
1198         for (i = 0; i < nmissing; i++) {
1199                 for (j = 0; j < n; j++) {
1200                         invrows[i][j] = (i == j) ? 1 : 0;
1201                 }
1202         }
1203
1204         /*
1205          * Subtract all trivial rows from the rows of consequence.
1206          */
1207         for (i = 0; i < nmissing; i++) {
1208                 for (j = nmissing; j < n; j++) {
1209                         ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1210                         jj = used[j] - rm->rm_firstdatacol;
1211                         ASSERT3S(jj, <, n);
1212                         invrows[i][j] = rows[i][jj];
1213                         rows[i][jj] = 0;
1214                 }
1215         }
1216
1217         /*
1218          * For each of the rows of interest, we must normalize it and subtract
1219          * a multiple of it from the other rows.
1220          */
1221         for (i = 0; i < nmissing; i++) {
1222                 for (j = 0; j < missing[i]; j++) {
1223                         ASSERT0(rows[i][j]);
1224                 }
1225                 ASSERT3U(rows[i][missing[i]], !=, 0);
1226
1227                 /*
1228                  * Compute the inverse of the first element and multiply each
1229                  * element in the row by that value.
1230                  */
1231                 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1232
1233                 for (j = 0; j < n; j++) {
1234                         rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1235                         invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1236                 }
1237
1238                 for (ii = 0; ii < nmissing; ii++) {
1239                         if (i == ii)
1240                                 continue;
1241
1242                         ASSERT3U(rows[ii][missing[i]], !=, 0);
1243
1244                         log = vdev_raidz_log2[rows[ii][missing[i]]];
1245
1246                         for (j = 0; j < n; j++) {
1247                                 rows[ii][j] ^=
1248                                     vdev_raidz_exp2(rows[i][j], log);
1249                                 invrows[ii][j] ^=
1250                                     vdev_raidz_exp2(invrows[i][j], log);
1251                         }
1252                 }
1253         }
1254
1255         /*
1256          * Verify that the data that is left in the rows are properly part of
1257          * an identity matrix.
1258          */
1259         for (i = 0; i < nmissing; i++) {
1260                 for (j = 0; j < n; j++) {
1261                         if (j == missing[i]) {
1262                                 ASSERT3U(rows[i][j], ==, 1);
1263                         } else {
1264                                 ASSERT0(rows[i][j]);
1265                         }
1266                 }
1267         }
1268 }
1269
1270 static void
1271 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1272     int *missing, uint8_t **invrows, const uint8_t *used)
1273 {
1274         int i, j, x, cc, c;
1275         uint8_t *src;
1276         uint64_t ccount;
1277         uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1278         uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1279         uint8_t log = 0;
1280         uint8_t val;
1281         int ll;
1282         uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1283         uint8_t *p, *pp;
1284         size_t psize;
1285
1286         psize = sizeof (invlog[0][0]) * n * nmissing;
1287         p = kmem_alloc(psize, KM_SLEEP);
1288
1289         for (pp = p, i = 0; i < nmissing; i++) {
1290                 invlog[i] = pp;
1291                 pp += n;
1292         }
1293
1294         for (i = 0; i < nmissing; i++) {
1295                 for (j = 0; j < n; j++) {
1296                         ASSERT3U(invrows[i][j], !=, 0);
1297                         invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1298                 }
1299         }
1300
1301         for (i = 0; i < n; i++) {
1302                 c = used[i];
1303                 ASSERT3U(c, <, rm->rm_cols);
1304
1305                 src = abd_to_buf(rm->rm_col[c].rc_abd);
1306                 ccount = rm->rm_col[c].rc_size;
1307                 for (j = 0; j < nmissing; j++) {
1308                         cc = missing[j] + rm->rm_firstdatacol;
1309                         ASSERT3U(cc, >=, rm->rm_firstdatacol);
1310                         ASSERT3U(cc, <, rm->rm_cols);
1311                         ASSERT3U(cc, !=, c);
1312
1313                         dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1314                         dcount[j] = rm->rm_col[cc].rc_size;
1315                 }
1316
1317                 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1318
1319                 for (x = 0; x < ccount; x++, src++) {
1320                         if (*src != 0)
1321                                 log = vdev_raidz_log2[*src];
1322
1323                         for (cc = 0; cc < nmissing; cc++) {
1324                                 if (x >= dcount[cc])
1325                                         continue;
1326
1327                                 if (*src == 0) {
1328                                         val = 0;
1329                                 } else {
1330                                         if ((ll = log + invlog[cc][i]) >= 255)
1331                                                 ll -= 255;
1332                                         val = vdev_raidz_pow2[ll];
1333                                 }
1334
1335                                 if (i == 0)
1336                                         dst[cc][x] = val;
1337                                 else
1338                                         dst[cc][x] ^= val;
1339                         }
1340                 }
1341         }
1342
1343         kmem_free(p, psize);
1344 }
1345
1346 static int
1347 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1348 {
1349         int n, i, c, t, tt;
1350         int nmissing_rows;
1351         int missing_rows[VDEV_RAIDZ_MAXPARITY];
1352         int parity_map[VDEV_RAIDZ_MAXPARITY];
1353
1354         uint8_t *p, *pp;
1355         size_t psize;
1356
1357         uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1358         uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1359         uint8_t *used;
1360
1361         abd_t **bufs = NULL;
1362
1363         int code = 0;
1364
1365         /*
1366          * Matrix reconstruction can't use scatter ABDs yet, so we allocate
1367          * temporary linear ABDs.
1368          */
1369         if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
1370                 bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);
1371
1372                 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1373                         raidz_col_t *col = &rm->rm_col[c];
1374
1375                         bufs[c] = col->rc_abd;
1376                         col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
1377                         abd_copy(col->rc_abd, bufs[c], col->rc_size);
1378                 }
1379         }
1380
1381         n = rm->rm_cols - rm->rm_firstdatacol;
1382
1383         /*
1384          * Figure out which data columns are missing.
1385          */
1386         nmissing_rows = 0;
1387         for (t = 0; t < ntgts; t++) {
1388                 if (tgts[t] >= rm->rm_firstdatacol) {
1389                         missing_rows[nmissing_rows++] =
1390                             tgts[t] - rm->rm_firstdatacol;
1391                 }
1392         }
1393
1394         /*
1395          * Figure out which parity columns to use to help generate the missing
1396          * data columns.
1397          */
1398         for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1399                 ASSERT(tt < ntgts);
1400                 ASSERT(c < rm->rm_firstdatacol);
1401
1402                 /*
1403                  * Skip any targeted parity columns.
1404                  */
1405                 if (c == tgts[tt]) {
1406                         tt++;
1407                         continue;
1408                 }
1409
1410                 code |= 1 << c;
1411
1412                 parity_map[i] = c;
1413                 i++;
1414         }
1415
1416         ASSERT(code != 0);
1417         ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1418
1419         psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1420             nmissing_rows * n + sizeof (used[0]) * n;
1421         p = kmem_alloc(psize, KM_SLEEP);
1422
1423         for (pp = p, i = 0; i < nmissing_rows; i++) {
1424                 rows[i] = pp;
1425                 pp += n;
1426                 invrows[i] = pp;
1427                 pp += n;
1428         }
1429         used = pp;
1430
1431         for (i = 0; i < nmissing_rows; i++) {
1432                 used[i] = parity_map[i];
1433         }
1434
1435         for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1436                 if (tt < nmissing_rows &&
1437                     c == missing_rows[tt] + rm->rm_firstdatacol) {
1438                         tt++;
1439                         continue;
1440                 }
1441
1442                 ASSERT3S(i, <, n);
1443                 used[i] = c;
1444                 i++;
1445         }
1446
1447         /*
1448          * Initialize the interesting rows of the matrix.
1449          */
1450         vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1451
1452         /*
1453          * Invert the matrix.
1454          */
1455         vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1456             invrows, used);
1457
1458         /*
1459          * Reconstruct the missing data using the generated matrix.
1460          */
1461         vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1462             invrows, used);
1463
1464         kmem_free(p, psize);
1465
1466         /*
1467          * copy back from temporary linear abds and free them
1468          */
1469         if (bufs) {
1470                 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1471                         raidz_col_t *col = &rm->rm_col[c];
1472
1473                         abd_copy(bufs[c], col->rc_abd, col->rc_size);
1474                         abd_free(col->rc_abd);
1475                         col->rc_abd = bufs[c];
1476                 }
1477                 kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
1478         }
1479
1480         return (code);
1481 }
1482
1483 int
1484 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
1485 {
1486         int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1487         int ntgts;
1488         int i, c, ret;
1489         int code;
1490         int nbadparity, nbaddata;
1491         int parity_valid[VDEV_RAIDZ_MAXPARITY];
1492
1493         /*
1494          * The tgts list must already be sorted.
1495          */
1496         for (i = 1; i < nt; i++) {
1497                 ASSERT(t[i] > t[i - 1]);
1498         }
1499
1500         nbadparity = rm->rm_firstdatacol;
1501         nbaddata = rm->rm_cols - nbadparity;
1502         ntgts = 0;
1503         for (i = 0, c = 0; c < rm->rm_cols; c++) {
1504                 if (c < rm->rm_firstdatacol)
1505                         parity_valid[c] = B_FALSE;
1506
1507                 if (i < nt && c == t[i]) {
1508                         tgts[ntgts++] = c;
1509                         i++;
1510                 } else if (rm->rm_col[c].rc_error != 0) {
1511                         tgts[ntgts++] = c;
1512                 } else if (c >= rm->rm_firstdatacol) {
1513                         nbaddata--;
1514                 } else {
1515                         parity_valid[c] = B_TRUE;
1516                         nbadparity--;
1517                 }
1518         }
1519
1520         ASSERT(ntgts >= nt);
1521         ASSERT(nbaddata >= 0);
1522         ASSERT(nbaddata + nbadparity == ntgts);
1523
1524         dt = &tgts[nbadparity];
1525
1526         /* Reconstruct using the new math implementation */
1527         ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata);
1528         if (ret != RAIDZ_ORIGINAL_IMPL)
1529                 return (ret);
1530
1531         /*
1532          * See if we can use any of our optimized reconstruction routines.
1533          */
1534         switch (nbaddata) {
1535         case 1:
1536                 if (parity_valid[VDEV_RAIDZ_P])
1537                         return (vdev_raidz_reconstruct_p(rm, dt, 1));
1538
1539                 ASSERT(rm->rm_firstdatacol > 1);
1540
1541                 if (parity_valid[VDEV_RAIDZ_Q])
1542                         return (vdev_raidz_reconstruct_q(rm, dt, 1));
1543
1544                 ASSERT(rm->rm_firstdatacol > 2);
1545                 break;
1546
1547         case 2:
1548                 ASSERT(rm->rm_firstdatacol > 1);
1549
1550                 if (parity_valid[VDEV_RAIDZ_P] &&
1551                     parity_valid[VDEV_RAIDZ_Q])
1552                         return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1553
1554                 ASSERT(rm->rm_firstdatacol > 2);
1555
1556                 break;
1557         }
1558
1559         code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1560         ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1561         ASSERT(code > 0);
1562         return (code);
1563 }
1564
1565 static int
1566 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1567     uint64_t *ashift)
1568 {
1569         vdev_t *cvd;
1570         uint64_t nparity = vd->vdev_nparity;
1571         int c;
1572         int lasterror = 0;
1573         int numerrors = 0;
1574
1575         ASSERT(nparity > 0);
1576
1577         if (nparity > VDEV_RAIDZ_MAXPARITY ||
1578             vd->vdev_children < nparity + 1) {
1579                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1580                 return (SET_ERROR(EINVAL));
1581         }
1582
1583         vdev_open_children(vd);
1584
1585         for (c = 0; c < vd->vdev_children; c++) {
1586                 cvd = vd->vdev_child[c];
1587
1588                 if (cvd->vdev_open_error != 0) {
1589                         lasterror = cvd->vdev_open_error;
1590                         numerrors++;
1591                         continue;
1592                 }
1593
1594                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1595                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1596                 *ashift = MAX(*ashift, cvd->vdev_ashift);
1597         }
1598
1599         *asize *= vd->vdev_children;
1600         *max_asize *= vd->vdev_children;
1601
1602         if (numerrors > nparity) {
1603                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1604                 return (lasterror);
1605         }
1606
1607         return (0);
1608 }
1609
1610 static void
1611 vdev_raidz_close(vdev_t *vd)
1612 {
1613         int c;
1614
1615         for (c = 0; c < vd->vdev_children; c++)
1616                 vdev_close(vd->vdev_child[c]);
1617 }
1618
1619 static uint64_t
1620 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1621 {
1622         uint64_t asize;
1623         uint64_t ashift = vd->vdev_top->vdev_ashift;
1624         uint64_t cols = vd->vdev_children;
1625         uint64_t nparity = vd->vdev_nparity;
1626
1627         asize = ((psize - 1) >> ashift) + 1;
1628         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1629         asize = roundup(asize, nparity + 1) << ashift;
1630
1631         return (asize);
1632 }
1633
1634 static void
1635 vdev_raidz_child_done(zio_t *zio)
1636 {
1637         raidz_col_t *rc = zio->io_private;
1638
1639         rc->rc_error = zio->io_error;
1640         rc->rc_tried = 1;
1641         rc->rc_skipped = 0;
1642 }
1643
1644 /*
1645  * Start an IO operation on a RAIDZ VDev
1646  *
1647  * Outline:
1648  * - For write operations:
1649  *   1. Generate the parity data
1650  *   2. Create child zio write operations to each column's vdev, for both
1651  *      data and parity.
1652  *   3. If the column skips any sectors for padding, create optional dummy
1653  *      write zio children for those areas to improve aggregation continuity.
1654  * - For read operations:
1655  *   1. Create child zio read operations to each data column's vdev to read
1656  *      the range of data required for zio.
1657  *   2. If this is a scrub or resilver operation, or if any of the data
1658  *      vdevs have had errors, then create zio read operations to the parity
1659  *      columns' VDevs as well.
1660  */
1661 static void
1662 vdev_raidz_io_start(zio_t *zio)
1663 {
1664         vdev_t *vd = zio->io_vd;
1665         vdev_t *tvd = vd->vdev_top;
1666         vdev_t *cvd;
1667         raidz_map_t *rm;
1668         raidz_col_t *rc;
1669         int c, i;
1670
1671         rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1672             vd->vdev_nparity);
1673
1674         ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1675
1676         if (zio->io_type == ZIO_TYPE_WRITE) {
1677                 vdev_raidz_generate_parity(rm);
1678
1679                 for (c = 0; c < rm->rm_cols; c++) {
1680                         rc = &rm->rm_col[c];
1681                         cvd = vd->vdev_child[rc->rc_devidx];
1682                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1683                             rc->rc_offset, rc->rc_abd, rc->rc_size,
1684                             zio->io_type, zio->io_priority, 0,
1685                             vdev_raidz_child_done, rc));
1686                 }
1687
1688                 /*
1689                  * Generate optional I/Os for any skipped sectors to improve
1690                  * aggregation contiguity.
1691                  */
1692                 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1693                         ASSERT(c <= rm->rm_scols);
1694                         if (c == rm->rm_scols)
1695                                 c = 0;
1696                         rc = &rm->rm_col[c];
1697                         cvd = vd->vdev_child[rc->rc_devidx];
1698                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1699                             rc->rc_offset + rc->rc_size, NULL,
1700                             1 << tvd->vdev_ashift,
1701                             zio->io_type, zio->io_priority,
1702                             ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1703                 }
1704
1705                 zio_execute(zio);
1706                 return;
1707         }
1708
1709         ASSERT(zio->io_type == ZIO_TYPE_READ);
1710
1711         /*
1712          * Iterate over the columns in reverse order so that we hit the parity
1713          * last -- any errors along the way will force us to read the parity.
1714          */
1715         for (c = rm->rm_cols - 1; c >= 0; c--) {
1716                 rc = &rm->rm_col[c];
1717                 cvd = vd->vdev_child[rc->rc_devidx];
1718                 if (!vdev_readable(cvd)) {
1719                         if (c >= rm->rm_firstdatacol)
1720                                 rm->rm_missingdata++;
1721                         else
1722                                 rm->rm_missingparity++;
1723                         rc->rc_error = SET_ERROR(ENXIO);
1724                         rc->rc_tried = 1;       /* don't even try */
1725                         rc->rc_skipped = 1;
1726                         continue;
1727                 }
1728                 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1729                         if (c >= rm->rm_firstdatacol)
1730                                 rm->rm_missingdata++;
1731                         else
1732                                 rm->rm_missingparity++;
1733                         rc->rc_error = SET_ERROR(ESTALE);
1734                         rc->rc_skipped = 1;
1735                         continue;
1736                 }
1737                 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1738                     (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1739                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1740                             rc->rc_offset, rc->rc_abd, rc->rc_size,
1741                             zio->io_type, zio->io_priority, 0,
1742                             vdev_raidz_child_done, rc));
1743                 }
1744         }
1745
1746         zio_execute(zio);
1747 }
1748
1749
1750 /*
1751  * Report a checksum error for a child of a RAID-Z device.
1752  */
1753 static void
1754 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
1755 {
1756         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1757
1758         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1759                 zio_bad_cksum_t zbc;
1760                 raidz_map_t *rm = zio->io_vsd;
1761
1762                 mutex_enter(&vd->vdev_stat_lock);
1763                 vd->vdev_stat.vs_checksum_errors++;
1764                 mutex_exit(&vd->vdev_stat_lock);
1765
1766                 zbc.zbc_has_cksum = 0;
1767                 zbc.zbc_injected = rm->rm_ecksuminjected;
1768
1769                 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1770                     rc->rc_offset, rc->rc_size, rc->rc_abd, bad_data,
1771                     &zbc);
1772         }
1773 }
1774
1775 /*
1776  * We keep track of whether or not there were any injected errors, so that
1777  * any ereports we generate can note it.
1778  */
1779 static int
1780 raidz_checksum_verify(zio_t *zio)
1781 {
1782         zio_bad_cksum_t zbc;
1783         raidz_map_t *rm = zio->io_vsd;
1784         int ret;
1785
1786         bzero(&zbc, sizeof (zio_bad_cksum_t));
1787
1788         ret = zio_checksum_error(zio, &zbc);
1789         if (ret != 0 && zbc.zbc_injected != 0)
1790                 rm->rm_ecksuminjected = 1;
1791
1792         return (ret);
1793 }
1794
1795 /*
1796  * Generate the parity from the data columns. If we tried and were able to
1797  * read the parity without error, verify that the generated parity matches the
1798  * data we read. If it doesn't, we fire off a checksum error. Return the
1799  * number such failures.
1800  */
1801 static int
1802 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1803 {
1804         abd_t *orig[VDEV_RAIDZ_MAXPARITY];
1805         int c, ret = 0;
1806         raidz_col_t *rc;
1807
1808         blkptr_t *bp = zio->io_bp;
1809         enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1810             (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1811
1812         if (checksum == ZIO_CHECKSUM_NOPARITY)
1813                 return (ret);
1814
1815         for (c = 0; c < rm->rm_firstdatacol; c++) {
1816                 rc = &rm->rm_col[c];
1817                 if (!rc->rc_tried || rc->rc_error != 0)
1818                         continue;
1819
1820                 orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
1821                 abd_copy(orig[c], rc->rc_abd, rc->rc_size);
1822         }
1823
1824         vdev_raidz_generate_parity(rm);
1825
1826         for (c = 0; c < rm->rm_firstdatacol; c++) {
1827                 rc = &rm->rm_col[c];
1828                 if (!rc->rc_tried || rc->rc_error != 0)
1829                         continue;
1830                 if (abd_cmp(orig[c], rc->rc_abd) != 0) {
1831                         raidz_checksum_error(zio, rc, orig[c]);
1832                         rc->rc_error = SET_ERROR(ECKSUM);
1833                         ret++;
1834                 }
1835                 abd_free(orig[c]);
1836         }
1837
1838         return (ret);
1839 }
1840
1841 static int
1842 vdev_raidz_worst_error(raidz_map_t *rm)
1843 {
1844         int c, error = 0;
1845
1846         for (c = 0; c < rm->rm_cols; c++)
1847                 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1848
1849         return (error);
1850 }
1851
/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * Arguments:
 *   zio          - the read being repaired; its raidz map is zio->io_vsd
 *   total_errors - number of columns already carrying an error; must be
 *                  less than rm_firstdatacol
 *   data_errors  - how many of those are data columns; when zero, the
 *                  initial target set for each n is forced to include a
 *                  data column
 *
 * For n = 1 .. (rm_firstdatacol - total_errors), ascending sets of n
 * columns with rc_error == 0 are tried as reconstruction targets. The first
 * set whose reconstruction passes raidz_checksum_verify() wins: its columns
 * are marked ECKSUM (and reported through raidz_checksum_error() if they
 * had been tried) and the value returned by vdev_raidz_reconstruct() is
 * returned. After every failed attempt the targeted columns are restored
 * from the saved copies; if nothing works, 0 is returned.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
        raidz_map_t *rm = zio->io_vsd;
        raidz_col_t *rc;
        abd_t *orig[VDEV_RAIDZ_MAXPARITY];
        /*
         * tgts points one element into tstore so that both tgts[-1] and
         * tgts[n] are addressable; they are used as sentinels below.
         */
        int tstore[VDEV_RAIDZ_MAXPARITY + 2];
        int *tgts = &tstore[1];
        int curr, next, i, c, n;
        int code, ret = 0;

        ASSERT(total_errors < rm->rm_firstdatacol);

        /*
         * This simplifies one edge condition.
         */
        tgts[-1] = -1;

        for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
                /*
                 * Initialize the targets array by finding the first n columns
                 * that contain no error.
                 *
                 * If there were no data errors, we need to ensure that we're
                 * always explicitly attempting to reconstruct at least one
                 * data column. To do this, we simply push the highest target
                 * up into the data columns.
                 */
                for (c = 0, i = 0; i < n; i++) {
                        if (i == n - 1 && data_errors == 0 &&
                            c < rm->rm_firstdatacol) {
                                c = rm->rm_firstdatacol;
                        }

                        while (rm->rm_col[c].rc_error != 0) {
                                c++;
                                ASSERT3S(c, <, rm->rm_cols);
                        }

                        tgts[i] = c++;
                }

                /*
                 * Setting tgts[n] simplifies the other edge condition.
                 */
                tgts[n] = rm->rm_cols;

                /*
                 * These buffers were allocated in previous iterations.
                 */
                for (i = 0; i < n - 1; i++) {
                        ASSERT(orig[i] != NULL);
                }

                /*
                 * One more save buffer is needed for this value of n. It is
                 * sized from column 0.
                 * NOTE(review): presumably column 0 is at least as large as
                 * any column that may be saved here -- confirm.
                 */
                orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd,
                    rm->rm_col[0].rc_size);

                curr = 0;
                next = tgts[curr];

                /*
                 * Each pass commits the pending move (tgts[curr] = next),
                 * tries the resulting target set, and then computes the
                 * following set. curr reaching n means no position could be
                 * advanced any further, i.e. all sets of size n were tried.
                 */
                while (curr != n) {
                        tgts[curr] = next;
                        curr = 0;

                        /*
                         * Save off the original data that we're going to
                         * attempt to reconstruct.
                         */
                        for (i = 0; i < n; i++) {
                                ASSERT(orig[i] != NULL);
                                c = tgts[i];
                                ASSERT3S(c, >=, 0);
                                ASSERT3S(c, <, rm->rm_cols);
                                rc = &rm->rm_col[c];
                                abd_copy(orig[i], rc->rc_abd, rc->rc_size);
                        }

                        /*
                         * Attempt a reconstruction and exit the outer loop on
                         * success.
                         */
                        code = vdev_raidz_reconstruct(rm, tgts, n);
                        if (raidz_checksum_verify(zio) == 0) {

                                /*
                                 * The targeted columns held bad data: report
                                 * the ones that were actually read, passing
                                 * the saved pre-reconstruction contents, and
                                 * mark them all ECKSUM.
                                 */
                                for (i = 0; i < n; i++) {
                                        c = tgts[i];
                                        rc = &rm->rm_col[c];
                                        ASSERT(rc->rc_error == 0);
                                        if (rc->rc_tried)
                                                raidz_checksum_error(zio, rc,
                                                    orig[i]);
                                        rc->rc_error = SET_ERROR(ECKSUM);
                                }

                                ret = code;
                                goto done;
                        }

                        /*
                         * Restore the original data.
                         */
                        for (i = 0; i < n; i++) {
                                c = tgts[i];
                                rc = &rm->rm_col[c];
                                abd_copy(rc->rc_abd, orig[i], rc->rc_size);
                        }

                        do {
                                /*
                                 * Find the next valid column after the curr
                                 * position.
                                 */
                                for (next = tgts[curr] + 1;
                                    next < rm->rm_cols &&
                                    rm->rm_col[next].rc_error != 0; next++)
                                        continue;

                                ASSERT(next <= tgts[curr + 1]);

                                /*
                                 * If that spot is available, we're done here.
                                 */
                                if (next != tgts[curr + 1])
                                        break;

                                /*
                                 * Otherwise, find the next valid column after
                                 * the previous position.
                                 */
                                for (c = tgts[curr - 1] + 1;
                                    rm->rm_col[c].rc_error != 0; c++)
                                        continue;

                                /*
                                 * Reset this position to its lowest valid
                                 * column and try to advance the next one.
                                 */
                                tgts[curr] = c;
                                curr++;

                        } while (curr != n);
                }
        }
        /*
         * The for loop exits with n one past the number of buffers that were
         * allocated; step back so the cleanup below frees exactly those.
         */
        n--;
done:
        for (i = 0; i < n; i++)
                abd_free(orig[i]);

        return (ret);
}
2006
/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        vdev_t *cvd;
        raidz_map_t *rm = zio->io_vsd;
        raidz_col_t *rc = NULL;
        int unexpected_errors = 0;
        int parity_errors = 0;
        int parity_untried = 0;
        int data_errors = 0;
        int total_errors = 0;
        int n, c;
        int tgts[VDEV_RAIDZ_MAXPARITY];
        int code;

        ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

        ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
        ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

        /*
         * Classify every column: errors are split into parity and data
         * errors, errors on columns that were not deliberately skipped count
         * as unexpected, and error-free parity columns that were never read
         * are counted as untried.
         */
        for (c = 0; c < rm->rm_cols; c++) {
                rc = &rm->rm_col[c];

                if (rc->rc_error) {
                        ASSERT(rc->rc_error != ECKSUM); /* child has no bp */

                        if (c < rm->rm_firstdatacol)
                                parity_errors++;
                        else
                                data_errors++;

                        if (!rc->rc_skipped)
                                unexpected_errors++;

                        total_errors++;
                } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
                        parity_untried++;
                }
        }

        if (zio->io_type == ZIO_TYPE_WRITE) {
                /*
                 * XXX -- for now, treat partial writes as a success.
                 * (If we couldn't write enough columns to reconstruct
                 * the data, the I/O failed.  Otherwise, good enough.)
                 *
                 * Now that we support write reallocation, it would be better
                 * to treat partial failure as real failure unless there are
                 * no non-degraded top-level vdevs left, and not update DTLs
                 * if we intend to reallocate.
                 */
                /* XXPOLICY */
                if (total_errors > rm->rm_firstdatacol)
                        zio->io_error = vdev_raidz_worst_error(rm);

                return;
        }

        ASSERT(zio->io_type == ZIO_TYPE_READ);
        /*
         * There are three potential phases for a read:
         *      1. produce valid data from the columns read
         *      2. read all disks and try again
         *      3. perform combinatorial reconstruction
         *
         * Each phase is progressively both more expensive and less likely to
         * occur. If we encounter more errors than we can repair or all phases
         * fail, we have no choice but to return an error.
         */

        /*
         * If the number of errors we saw was correctable -- less than or equal
         * to the number of parity disks read -- attempt to produce data that
         * has a valid checksum. Naturally, this case applies in the absence of
         * any errors.
         */
        if (total_errors <= rm->rm_firstdatacol - parity_untried) {
                if (data_errors == 0) {
                        if (raidz_checksum_verify(zio) == 0) {
                                /*
                                 * If we read parity information (unnecessarily
                                 * as it happens since no reconstruction was
                                 * needed) regenerate and verify the parity.
                                 * We also regenerate parity when resilvering
                                 * so we can write it out to the failed device
                                 * later.
                                 */
                                if (parity_errors + parity_untried <
                                    rm->rm_firstdatacol ||
                                    (zio->io_flags & ZIO_FLAG_RESILVER)) {
                                        n = raidz_parity_verify(zio, rm);
                                        unexpected_errors += n;
                                        ASSERT(parity_errors + n <=
                                            rm->rm_firstdatacol);
                                }
                                goto done;
                        }
                } else {
                        /*
                         * We either attempt to read all the parity columns or
                         * none of them. If we didn't try to read parity, we
                         * wouldn't be here in the correctable case. There must
                         * also have been fewer parity errors than parity
                         * columns or, again, we wouldn't be in this code path.
                         */
                        ASSERT(parity_untried == 0);
                        ASSERT(parity_errors < rm->rm_firstdatacol);

                        /*
                         * Identify the data columns that reported an error.
                         */
                        n = 0;
                        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
                                if (rc->rc_error != 0) {
                                        ASSERT(n < VDEV_RAIDZ_MAXPARITY);
                                        tgts[n++] = c;
                                }
                        }

                        ASSERT(rm->rm_firstdatacol >= n);

                        code = vdev_raidz_reconstruct(rm, tgts, n);

                        if (raidz_checksum_verify(zio) == 0) {
                                /*
                                 * If we read more parity disks than were used
                                 * for reconstruction, confirm that the other
                                 * parity disks produced correct data. This
                                 * routine is suboptimal in that it regenerates
                                 * the parity that we already used in addition
                                 * to the parity that we're attempting to
                                 * verify, but this should be a relatively
                                 * uncommon case, and can be optimized if it
                                 * becomes a problem. Note that we regenerate
                                 * parity when resilvering so we can write it
                                 * out to failed devices later.
                                 */
                                if (parity_errors < rm->rm_firstdatacol - n ||
                                    (zio->io_flags & ZIO_FLAG_RESILVER)) {
                                        n = raidz_parity_verify(zio, rm);
                                        unexpected_errors += n;
                                        ASSERT(parity_errors + n <=
                                            rm->rm_firstdatacol);
                                }

                                goto done;
                        }
                }
        }

        /*
         * This isn't a typical situation -- either we got a read error or
         * a child silently returned bad data. Read every block so we can
         * try again with as much data and parity as we can track down. If
         * we've already been through once before, all children will be marked
         * as tried so we'll proceed to combinatorial reconstruction.
         *
         * Setting unexpected_errors here makes any later success run the
         * repair pass at "done" and tag its writes with ZIO_FLAG_SELF_HEAL.
         */
        unexpected_errors = 1;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;

        for (c = 0; c < rm->rm_cols; c++) {
                if (rm->rm_col[c].rc_tried)
                        continue;

                /*
                 * Found the first untried column: call zio_vdev_io_redone()
                 * once, then issue a read for this and every later untried
                 * column. The "continue" inside the do/while jumps to the
                 * "++c < rm->rm_cols" test, so it only skips columns that
                 * were already tried. We then return without finishing the
                 * zio.
                 * NOTE(review): presumably this function runs again once
                 * those child reads complete -- confirm against the zio
                 * pipeline.
                 */
                zio_vdev_io_redone(zio);
                do {
                        rc = &rm->rm_col[c];
                        if (rc->rc_tried)
                                continue;
                        zio_nowait(zio_vdev_child_io(zio, NULL,
                            vd->vdev_child[rc->rc_devidx],
                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            zio->io_type, zio->io_priority, 0,
                            vdev_raidz_child_done, rc));
                } while (++c < rm->rm_cols);

                return;
        }

        /*
         * At this point we've attempted to reconstruct the data given the
         * errors we detected, and we've attempted to read all columns. There
         * must, therefore, be one or more additional problems -- silent errors
         * resulting in invalid data rather than explicit I/O errors resulting
         * in absent data. We check if there is enough additional data to
         * possibly reconstruct the data and then perform combinatorial
         * reconstruction over all possible combinations. If that fails,
         * we're cooked.
         */
        if (total_errors > rm->rm_firstdatacol) {
                zio->io_error = vdev_raidz_worst_error(rm);

        } else if (total_errors < rm->rm_firstdatacol &&
            (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
                /*
                 * If we didn't use all the available parity for the
                 * combinatorial reconstruction, verify that the remaining
                 * parity is correct.
                 *
                 * NOTE(review): code is compared against a value with the
                 * low rm_firstdatacol bits set, so it looks like a bitmask
                 * with one bit per parity column used -- confirm against
                 * vdev_raidz_reconstruct().
                 */
                if (code != (1 << rm->rm_firstdatacol) - 1)
                        (void) raidz_parity_verify(zio, rm);
        } else {
                /*
                 * We're here because either:
                 *
                 *      total_errors == rm_firstdatacol, or
                 *      vdev_raidz_combrec() failed
                 *
                 * In either case, there is enough bad data to prevent
                 * reconstruction.
                 *
                 * Start checksum ereports for all children which haven't
                 * failed, and the IO wasn't speculative.
                 */
                zio->io_error = SET_ERROR(ECKSUM);

                if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
                        for (c = 0; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
                                if (rc->rc_error == 0) {
                                        zio_bad_cksum_t zbc;
                                        zbc.zbc_has_cksum = 0;
                                        zbc.zbc_injected =
                                            rm->rm_ecksuminjected;

                                        /*
                                         * The column index is passed along
                                         * as an opaque pointer-sized value.
                                         */
                                        zfs_ereport_start_checksum(
                                            zio->io_spa,
                                            vd->vdev_child[rc->rc_devidx],
                                            zio, rc->rc_offset, rc->rc_size,
                                            (void *)(uintptr_t)c, &zbc);
                                }
                        }
                }
        }

done:
        zio_checksum_verified(zio);

        /*
         * Repair only when the read ultimately succeeded, the pool is
         * writeable, and either something unexpected went wrong or this is a
         * resilver.
         */
        if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
            (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
                /*
                 * Use the good data we have in hand to repair damaged children.
                 */
                for (c = 0; c < rm->rm_cols; c++) {
                        rc = &rm->rm_col[c];
                        cvd = vd->vdev_child[rc->rc_devidx];

                        if (rc->rc_error == 0)
                                continue;

                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
                            ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
                            ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
                }
        }
}
2289
2290 static void
2291 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2292 {
2293         if (faulted > vd->vdev_nparity)
2294                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2295                     VDEV_AUX_NO_REPLICAS);
2296         else if (degraded + faulted != 0)
2297                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2298         else
2299                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2300 }
2301
2302 /*
2303  * Determine if any portion of the provided block resides on a child vdev
2304  * with a dirty DTL and therefore needs to be resilvered.  The function
2305  * assumes that at least one DTL is dirty which imples that full stripe
2306  * width blocks must be resilvered.
2307  */
2308 static boolean_t
2309 vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
2310 {
2311         uint64_t dcols = vd->vdev_children;
2312         uint64_t nparity = vd->vdev_nparity;
2313         uint64_t ashift = vd->vdev_top->vdev_ashift;
2314         /* The starting RAIDZ (parent) vdev sector of the block. */
2315         uint64_t b = offset >> ashift;
2316         /* The zio's size in units of the vdev's minimum sector size. */
2317         uint64_t s = ((psize - 1) >> ashift) + 1;
2318         /* The first column for this stripe. */
2319         uint64_t f = b % dcols;
2320
2321         if (s + nparity >= dcols)
2322                 return (B_TRUE);
2323
2324         for (uint64_t c = 0; c < s + nparity; c++) {
2325                 uint64_t devidx = (f + c) % dcols;
2326                 vdev_t *cvd = vd->vdev_child[devidx];
2327
2328                 /*
2329                  * dsl_scan_need_resilver() already checked vd with
2330                  * vdev_dtl_contains(). So here just check cvd with
2331                  * vdev_dtl_empty(), cheaper and a good approximation.
2332                  */
2333                 if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
2334                         return (B_TRUE);
2335         }
2336
2337         return (B_FALSE);
2338 }
2339
/*
 * Operations vector for the RAID-Z vdev type. The initializer is positional,
 * so the entries must stay in the member order of vdev_ops_t.
 */
vdev_ops_t vdev_raidz_ops = {
        vdev_raidz_open,
        vdev_raidz_close,
        vdev_raidz_asize,
        vdev_raidz_io_start,
        vdev_raidz_io_done,
        vdev_raidz_state_change,
        vdev_raidz_need_resilver,
        /*
         * NOTE(review): the next two slots are left unset; which optional
         * operations they correspond to must be checked against vdev_ops_t.
         */
        NULL,
        NULL,
        VDEV_TYPE_RAIDZ,        /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
};