module/zfs/vdev_raidz.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  25  * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
  26  */
  27
  28 #include <sys/zfs_context.h>
  29 #include <sys/spa.h>
  30 #include <sys/vdev_impl.h>
  31 #include <sys/zio.h>
  32 #include <sys/zio_checksum.h>
  33 #include <sys/abd.h>
  34 #include <sys/fs/zfs.h>
  35 #include <sys/fm/fs/zfs.h>
  36 #include <sys/vdev_raidz.h>
  37 #include <sys/vdev_raidz_impl.h>
  38
  39 /*
  40  * Virtual device vector for RAID-Z.
  41  *
  42  * This vdev supports single, double, and triple parity. For single parity,
  43  * we use a simple XOR of all the data columns. For double or triple parity,
  44  * we use a special case of Reed-Solomon coding. This extends the
  45  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  46  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  47  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  48  * former is also based. The latter is designed to provide higher performance
  49  * for writes.
  50  *
  51  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  52  * amended six years later identifying a critical flaw that invalidates its
  53  * claims. Nevertheless, the technique can be adapted to work for up to
  54  * triple parity. For additional parity, the amendment "Note: Correction to
  55  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  56  * is viable, but the additional complexity means that write performance will
  57  * suffer.
  58  *
  59  * All of the methods above operate on a Galois field, defined over the
   60  * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
  61  * can be expressed with a single byte. Briefly, the operations on the
  62  * field are defined as follows:
  63  *
  64  *   o addition (+) is represented by a bitwise XOR
  65  *   o subtraction (-) is therefore identical to addition: A + B = A - B
  66  *   o multiplication of A by 2 is defined by the following bitwise expression:
  67  *
  68  *      (A * 2)_7 = A_6
  69  *      (A * 2)_6 = A_5
  70  *      (A * 2)_5 = A_4
  71  *      (A * 2)_4 = A_3 + A_7
  72  *      (A * 2)_3 = A_2 + A_7
  73  *      (A * 2)_2 = A_1 + A_7
  74  *      (A * 2)_1 = A_0
  75  *      (A * 2)_0 = A_7
  76  *
  77  * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
  78  * As an aside, this multiplication is derived from the error correcting
  79  * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
  80  *
  81  * Observe that any number in the field (except for 0) can be expressed as a
  82  * power of 2 -- a generator for the field. We store a table of the powers of
  83  * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
  84  * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
  85  * than field addition). The inverse of a field element A (A^-1) is therefore
  86  * A ^ (255 - 1) = A^254.
  87  *
  88  * The up-to-three parity columns, P, Q, R over several data columns,
  89  * D_0, ... D_n-1, can be expressed by field operations:
  90  *
  91  *      P = D_0 + D_1 + ... + D_n-2 + D_n-1
  92  *      Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
  93  *        = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
  94  *      R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
  95  *        = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
  96  *
   97  * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
  98  * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
  99  * independent coefficients. (There are no additional coefficients that have
 100  * this property which is why the uncorrected Plank method breaks down.)
 101  *
  102  * See the reconstruction code below for how P, Q and R can be used
  103  * individually or in concert to recover missing data columns.
 104  */
 105
 106 #define VDEV_RAIDZ_P            0
 107 #define VDEV_RAIDZ_Q            1
 108 #define VDEV_RAIDZ_R            2
 109
 110 #define VDEV_RAIDZ_MUL_2(x)     (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
 111 #define VDEV_RAIDZ_MUL_4(x)     (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
 112
 113 /*
 114  * We provide a mechanism to perform the field multiplication operation on a
 115  * 64-bit value all at once rather than a byte at a time. This works by
 116  * creating a mask from the top bit in each byte and using that to
 117  * conditionally apply the XOR of 0x1d.
 118  */
 119 #define VDEV_RAIDZ_64MUL_2(x, mask) \
 120 { \
 121         (mask) = (x) & 0x8080808080808080ULL; \
 122         (mask) = ((mask) << 1) - ((mask) >> 7); \
 123         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 124             ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
 125 }
 126
 127 #define VDEV_RAIDZ_64MUL_4(x, mask) \
 128 { \
 129         VDEV_RAIDZ_64MUL_2((x), mask); \
 130         VDEV_RAIDZ_64MUL_2((x), mask); \
 131 }
 132
 133 void
 134 vdev_raidz_map_free(raidz_map_t *rm)
 135 {
 136         int c;
 137
 138         for (c = 0; c < rm->rm_firstdatacol; c++) {
 139                 abd_free(rm->rm_col[c].rc_abd);
 140
 141                 if (rm->rm_col[c].rc_gdata != NULL)
 142                         abd_free(rm->rm_col[c].rc_gdata);
 143         }
 144
 145         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 146                 abd_put(rm->rm_col[c].rc_abd);
 147
 148         if (rm->rm_abd_copy != NULL)
 149                 abd_free(rm->rm_abd_copy);
 150
 151         kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 152 }
 153
 154 static void
 155 vdev_raidz_map_free_vsd(zio_t *zio)
 156 {
 157         raidz_map_t *rm = zio->io_vsd;
 158
 159         ASSERT0(rm->rm_freed);
 160         rm->rm_freed = 1;
 161
 162         if (rm->rm_reports == 0)
 163                 vdev_raidz_map_free(rm);
 164 }
 165
 166 /*ARGSUSED*/
 167 static void
 168 vdev_raidz_cksum_free(void *arg, size_t ignored)
 169 {
 170         raidz_map_t *rm = arg;
 171
 172         ASSERT3U(rm->rm_reports, >, 0);
 173
 174         if (--rm->rm_reports == 0 && rm->rm_freed != 0)
 175                 vdev_raidz_map_free(rm);
 176 }
 177
/*
 * zcr_finish callback installed by vdev_raidz_cksum_report().
 *
 * Completes the checksum ereport for the column stored in zcr->zcr_cbinfo
 * by passing zfs_ereport_finish_checksum() the buffer that was read for
 * that column ("bad") and the matching known-good buffer ("good").
 *
 *   zcr       - the report; zcr_cbdata is the raidz map, zcr_cbinfo the
 *               column index.
 *   good_data - the good logical data, or NULL. When NULL the report is
 *               finished without any buffers.
 *
 * For a data column, "good" is a view into good_data at that column's
 * offset. For a parity column, parity is regenerated from good_data the
 * first time through and kept in rc_gdata of each parity column for later
 * calls.
 */
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data)
{
        raidz_map_t *rm = zcr->zcr_cbdata;
        const size_t c = zcr->zcr_cbinfo;
        size_t x, offset;

        const abd_t *good = NULL;
        const abd_t *bad = rm->rm_col[c].rc_abd;

        if (good_data == NULL) {
                zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
                return;
        }

        if (c < rm->rm_firstdatacol) {
                /*
                 * The first time through, calculate the parity blocks for
                 * the good data (this relies on the fact that the good
                 * data never changes for a given logical ZIO)
                 */
                if (rm->rm_col[0].rc_gdata == NULL) {
                        abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY];

                        /*
                         * Set up the rm_col[]s to generate the parity for
                         * good_data, first saving the parity bufs and
                         * replacing them with buffers to hold the result.
                         */
                        for (x = 0; x < rm->rm_firstdatacol; x++) {
                                bad_parity[x] = rm->rm_col[x].rc_abd;
                                rm->rm_col[x].rc_abd =
                                    rm->rm_col[x].rc_gdata =
                                    abd_alloc_sametype(rm->rm_col[x].rc_abd,
                                    rm->rm_col[x].rc_size);
                        }

                        /*
                         * fill in the data columns from good_data; x carries
                         * on from rm_firstdatacol where the loop above
                         * stopped
                         */
                        offset = 0;
                        for (; x < rm->rm_cols; x++) {
                                abd_put(rm->rm_col[x].rc_abd);

                                rm->rm_col[x].rc_abd =
                                    abd_get_offset_size((abd_t *)good_data,
                                    offset, rm->rm_col[x].rc_size);
                                offset += rm->rm_col[x].rc_size;
                        }

                        /*
                         * Construct the parity from the good data.
                         */
                        vdev_raidz_generate_parity(rm);

                        /*
                         * restore everything back to its original state:
                         * the parity columns get their saved buffers back
                         * (the generated parity stays reachable through
                         * rc_gdata) ...
                         */
                        for (x = 0; x < rm->rm_firstdatacol; x++)
                                rm->rm_col[x].rc_abd = bad_parity[x];

                        /*
                         * ... and the data columns drop their views of
                         * good_data and point into rm_abd_copy again.
                         */
                        offset = 0;
                        for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
                                abd_put(rm->rm_col[x].rc_abd);
                                rm->rm_col[x].rc_abd = abd_get_offset_size(
                                    rm->rm_abd_copy, offset,
                                    rm->rm_col[x].rc_size);
                                offset += rm->rm_col[x].rc_size;
                        }
                }

                ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
                good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0,
                    rm->rm_col[c].rc_size);
        } else {
                /* adjust good_data to point at the start of our column */
                offset = 0;
                for (x = rm->rm_firstdatacol; x < c; x++)
                        offset += rm->rm_col[x].rc_size;

                good = abd_get_offset_size((abd_t *)good_data, offset,
                    rm->rm_col[c].rc_size);
        }

        /* we drop the ereport if it ends up that the data was good */
        zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
        /* both branches above obtained good via abd_get_offset_size() */
        abd_put((abd_t *)good);
}
 262
 263 /*
 264  * Invoked indirectly by zfs_ereport_start_checksum(), called
 265  * below when our read operation fails completely.  The main point
 266  * is to keep a copy of everything we read from disk, so that at
 267  * vdev_raidz_cksum_finish() time we can compare it with the good data.
 268  */
 269 static void
 270 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
 271 {
 272         size_t c = (size_t)(uintptr_t)arg;
 273         size_t offset;
 274
 275         raidz_map_t *rm = zio->io_vsd;
 276         size_t size;
 277
 278         /* set up the report and bump the refcount  */
 279         zcr->zcr_cbdata = rm;
 280         zcr->zcr_cbinfo = c;
 281         zcr->zcr_finish = vdev_raidz_cksum_finish;
 282         zcr->zcr_free = vdev_raidz_cksum_free;
 283
 284         rm->rm_reports++;
 285         ASSERT3U(rm->rm_reports, >, 0);
 286
 287         if (rm->rm_abd_copy != NULL)
 288                 return;
 289
 290         /*
 291          * It's the first time we're called for this raidz_map_t, so we need
 292          * to copy the data aside; there's no guarantee that our zio's buffer
 293          * won't be re-used for something else.
 294          *
 295          * Our parity data is already in separate buffers, so there's no need
 296          * to copy them.
 297          */
 298
 299         size = 0;
 300         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
 301                 size += rm->rm_col[c].rc_size;
 302
 303         rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE);
 304
 305         for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 306                 raidz_col_t *col = &rm->rm_col[c];
 307                 abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset,
 308                     col->rc_size);
 309
 310                 abd_copy(tmp, col->rc_abd, col->rc_size);
 311
 312                 abd_put(col->rc_abd);
 313                 col->rc_abd = tmp;
 314
 315                 offset += col->rc_size;
 316         }
 317         ASSERT3U(offset, ==, size);
 318 }
 319
 320 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 321         .vsd_free = vdev_raidz_map_free_vsd,
 322         .vsd_cksum_report = vdev_raidz_cksum_report
 323 };
 324
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 *
 *   zio     - the I/O to map; io_offset, io_size and io_abd are read, and
 *             io_vsd / io_vsd_ops are set to the new map and its callbacks.
 *   ashift  - shift used to convert between bytes and sectors.
 *   dcols   - number of columns the I/O is spread across.
 *   nparity - number of parity columns; stored as rm_firstdatacol.
 *
 * Returns the newly allocated raidz_map_t. Parity columns get freshly
 * allocated linear buffers, data columns are obtained from zio->io_abd via
 * abd_get_offset_size().
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
        raidz_map_t *rm;
        /* The starting RAIDZ (parent) vdev sector of the block. */
        uint64_t b = zio->io_offset >> ashift;
        /* The zio's size in units of the vdev's minimum sector size. */
        uint64_t s = zio->io_size >> ashift;
        /* The first column for this stripe. */
        uint64_t f = b % dcols;
        /* The starting byte offset on each child vdev. */
        uint64_t o = (b / dcols) << ashift;
        uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
        uint64_t off = 0;

        /*
         * "Quotient": The number of data sectors for this stripe on all but
         * the "big column" child vdevs that also contain "remainder" data.
         */
        q = s / (dcols - nparity);

        /*
         * "Remainder": The number of partial stripe data sectors in this I/O.
         * This will add a sector to some, but not all, child vdevs.
         */
        r = s - q * (dcols - nparity);

        /* The number of "big columns" - those which contain remainder data. */
        bc = (r == 0 ? 0 : r + nparity);

        /*
         * The total number of data and parity sectors associated with
         * this I/O.
         */
        tot = s + nparity * (q + (r == 0 ? 0 : 1));

        /* acols: The columns that will be accessed. */
        /* scols: The columns that will be accessed or skipped. */
        if (q == 0) {
                /* Our I/O request doesn't span all child vdevs. */
                acols = bc;
                scols = MIN(dcols, roundup(bc, nparity + 1));
        } else {
                acols = dcols;
                scols = dcols;
        }

        ASSERT3U(acols, <=, scols);

        /* The map is sized for scols columns; see vdev_raidz_map_free(). */
        rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

        rm->rm_cols = acols;
        rm->rm_scols = scols;
        rm->rm_bigcols = bc;
        rm->rm_skipstart = bc;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;
        rm->rm_firstdatacol = nparity;
        rm->rm_abd_copy = NULL;
        rm->rm_reports = 0;
        rm->rm_freed = 0;
        rm->rm_ecksuminjected = 0;

        asize = 0;

        for (c = 0; c < scols; c++) {
                /*
                 * Columns are assigned to consecutive children starting at
                 * f; wrapping past the last child moves on to child 0 and
                 * one sector further into each child.
                 */
                col = f + c;
                coff = o;
                if (col >= dcols) {
                        col -= dcols;
                        coff += 1ULL << ashift;
                }
                rm->rm_col[c].rc_devidx = col;
                rm->rm_col[c].rc_offset = coff;
                rm->rm_col[c].rc_abd = NULL;
                rm->rm_col[c].rc_gdata = NULL;
                rm->rm_col[c].rc_error = 0;
                rm->rm_col[c].rc_tried = 0;
                rm->rm_col[c].rc_skipped = 0;

                /*
                 * Columns at or beyond acols carry no data; the first bc
                 * columns hold one sector more than the remaining ones.
                 */
                if (c >= acols)
                        rm->rm_col[c].rc_size = 0;
                else if (c < bc)
                        rm->rm_col[c].rc_size = (q + 1) << ashift;
                else
                        rm->rm_col[c].rc_size = q << ashift;

                asize += rm->rm_col[c].rc_size;
        }

        ASSERT3U(asize, ==, tot << ashift);
        rm->rm_asize = roundup(asize, (nparity + 1) << ashift);
        rm->rm_nskip = roundup(tot, nparity + 1) - tot;
        ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift);
        ASSERT3U(rm->rm_nskip, <=, nparity);

        /* Parity columns get their own linear buffers. */
        for (c = 0; c < rm->rm_firstdatacol; c++)
                rm->rm_col[c].rc_abd =
                    abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE);

        /*
         * Data columns reference consecutive ranges of the zio's buffer;
         * c is rm_firstdatacol here, so the first data column starts at
         * offset 0.
         */
        rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0,
            rm->rm_col[c].rc_size);
        off = rm->rm_col[c].rc_size;

        for (c = c + 1; c < acols; c++) {
                rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off,
                    rm->rm_col[c].rc_size);
                off += rm->rm_col[c].rc_size;
        }

        /*
         * If all data stored spans all columns, there's a danger that parity
         * will always be on the same device and, since parity isn't read
         * during normal operation, that that device's I/O bandwidth won't be
         * used effectively. We therefore switch the parity every 1MB.
         *
         * ... at least that was, ostensibly, the theory. As a practical
         * matter unless we juggle the parity between all devices evenly, we
         * won't see any benefit. Further, occasional writes that aren't a
         * multiple of the LCM of the number of children and the minimum
         * stripe width are sufficient to avoid pessimal behavior.
         * Unfortunately, this decision created an implicit on-disk format
         * requirement that we need to support for all eternity, but only
         * for single-parity RAID-Z.
         *
         * If we intend to skip a sector in the zeroth column for padding
         * we must make sure to note this swap. We will never intend to
         * skip the first column since at least one data and one parity
         * column must appear in each row.
         */
        ASSERT(rm->rm_cols >= 2);
        ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

        if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
                /* Exchange the device and offset of columns 0 and 1. */
                devidx = rm->rm_col[0].rc_devidx;
                o = rm->rm_col[0].rc_offset;
                rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
                rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
                rm->rm_col[1].rc_devidx = devidx;
                rm->rm_col[1].rc_offset = o;

                if (rm->rm_skipstart == 0)
                        rm->rm_skipstart = 1;
        }

        zio->io_vsd = rm;
        zio->io_vsd_ops = &vdev_raidz_vsd_ops;

        /* init RAIDZ parity ops */
        rm->rm_ops = vdev_raidz_math_get_ops();

        return (rm);
}
 486
 487 struct pqr_struct {
 488         uint64_t *p;
 489         uint64_t *q;
 490         uint64_t *r;
 491 };
 492
 493 static int
 494 vdev_raidz_p_func(void *buf, size_t size, void *private)
 495 {
 496         struct pqr_struct *pqr = private;
 497         const uint64_t *src = buf;
 498         int i, cnt = size / sizeof (src[0]);
 499
 500         ASSERT(pqr->p && !pqr->q && !pqr->r);
 501
 502         for (i = 0; i < cnt; i++, src++, pqr->p++)
 503                 *pqr->p ^= *src;
 504
 505         return (0);
 506 }
 507
 508 static int
 509 vdev_raidz_pq_func(void *buf, size_t size, void *private)
 510 {
 511         struct pqr_struct *pqr = private;
 512         const uint64_t *src = buf;
 513         uint64_t mask;
 514         int i, cnt = size / sizeof (src[0]);
 515
 516         ASSERT(pqr->p && pqr->q && !pqr->r);
 517
 518         for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
 519                 *pqr->p ^= *src;
 520                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 521                 *pqr->q ^= *src;
 522         }
 523
 524         return (0);
 525 }
 526
 527 static int
 528 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
 529 {
 530         struct pqr_struct *pqr = private;
 531         const uint64_t *src = buf;
 532         uint64_t mask;
 533         int i, cnt = size / sizeof (src[0]);
 534
 535         ASSERT(pqr->p && pqr->q && pqr->r);
 536
 537         for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
 538                 *pqr->p ^= *src;
 539                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 540                 *pqr->q ^= *src;
 541                 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
 542                 *pqr->r ^= *src;
 543         }
 544
 545         return (0);
 546 }
 547
 548 static void
 549 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 550 {
 551         uint64_t *p;
 552         int c;
 553         abd_t *src;
 554
 555         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 556                 src = rm->rm_col[c].rc_abd;
 557                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 558
 559                 if (c == rm->rm_firstdatacol) {
 560                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 561                 } else {
 562                         struct pqr_struct pqr = { p, NULL, NULL };
 563                         (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 564                             vdev_raidz_p_func, &pqr);
 565                 }
 566         }
 567 }
 568
 569 static void
 570 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 571 {
 572         uint64_t *p, *q, pcnt, ccnt, mask, i;
 573         int c;
 574         abd_t *src;
 575
 576         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 577         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 578             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 579
 580         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 581                 src = rm->rm_col[c].rc_abd;
 582                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 583                 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
 584
 585                 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
 586
 587                 if (c == rm->rm_firstdatacol) {
 588                         ASSERT(ccnt == pcnt || ccnt == 0);
 589                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 590                         (void) memcpy(q, p, rm->rm_col[c].rc_size);
 591
 592                         for (i = ccnt; i < pcnt; i++) {
 593                                 p[i] = 0;
 594                                 q[i] = 0;
 595                         }
 596                 } else {
 597                         struct pqr_struct pqr = { p, q, NULL };
 598
 599                         ASSERT(ccnt <= pcnt);
 600                         (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 601                             vdev_raidz_pq_func, &pqr);
 602
 603                         /*
 604                          * Treat short columns as though they are full of 0s.
 605                          * Note that there's therefore nothing needed for P.
 606                          */
 607                         for (i = ccnt; i < pcnt; i++) {
 608                                 VDEV_RAIDZ_64MUL_2(q[i], mask);
 609                         }
 610                 }
 611         }
 612 }
 613
 614 static void
 615 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
 616 {
 617         uint64_t *p, *q, *r, pcnt, ccnt, mask, i;
 618         int c;
 619         abd_t *src;
 620
 621         pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
 622         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 623             rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 624         ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
 625             rm->rm_col[VDEV_RAIDZ_R].rc_size);
 626
 627         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 628                 src = rm->rm_col[c].rc_abd;
 629                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 630                 q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
 631                 r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd);
 632
 633                 ccnt = rm->rm_col[c].rc_size / sizeof (p[0]);
 634
 635                 if (c == rm->rm_firstdatacol) {
 636                         ASSERT(ccnt == pcnt || ccnt == 0);
 637                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 638                         (void) memcpy(q, p, rm->rm_col[c].rc_size);
 639                         (void) memcpy(r, p, rm->rm_col[c].rc_size);
 640
 641                         for (i = ccnt; i < pcnt; i++) {
 642                                 p[i] = 0;
 643                                 q[i] = 0;
 644                                 r[i] = 0;
 645                         }
 646                 } else {
 647                         struct pqr_struct pqr = { p, q, r };
 648
 649                         ASSERT(ccnt <= pcnt);
 650                         (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 651                             vdev_raidz_pqr_func, &pqr);
 652
 653                         /*
 654                          * Treat short columns as though they are full of 0s.
 655                          * Note that there's therefore nothing needed for P.
 656                          */
 657                         for (i = ccnt; i < pcnt; i++) {
 658                                 VDEV_RAIDZ_64MUL_2(q[i], mask);
 659                                 VDEV_RAIDZ_64MUL_4(r[i], mask);
 660                         }
 661                 }
 662         }
 663 }
 664
 665 /*
 666  * Generate RAID parity in the first virtual columns according to the number of
 667  * parity columns available.
 668  */
 669 void
 670 vdev_raidz_generate_parity(raidz_map_t *rm)
 671 {
 672         /* Generate using the new math implementation */
 673         if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL)
 674                 return;
 675
 676         switch (rm->rm_firstdatacol) {
 677         case 1:
 678                 vdev_raidz_generate_parity_p(rm);
 679                 break;
 680         case 2:
 681                 vdev_raidz_generate_parity_pq(rm);
 682                 break;
 683         case 3:
 684                 vdev_raidz_generate_parity_pqr(rm);
 685                 break;
 686         default:
 687                 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
 688         }
 689 }
 690
 691 /* ARGSUSED */
 692 static int
 693 vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
 694 {
 695         uint64_t *dst = dbuf;
 696         uint64_t *src = sbuf;
 697         int cnt = size / sizeof (src[0]);
 698
 699         for (int i = 0; i < cnt; i++) {
 700                 dst[i] ^= src[i];
 701         }
 702
 703         return (0);
 704 }
 705
 706 /* ARGSUSED */
 707 static int
 708 vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
 709     void *private)
 710 {
 711         uint64_t *dst = dbuf;
 712         uint64_t *src = sbuf;
 713         uint64_t mask;
 714         int cnt = size / sizeof (dst[0]);
 715
 716         for (int i = 0; i < cnt; i++, dst++, src++) {
 717                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 718                 *dst ^= *src;
 719         }
 720
 721         return (0);
 722 }
 723
 724 /* ARGSUSED */
 725 static int
 726 vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
 727 {
 728         uint64_t *dst = buf;
 729         uint64_t mask;
 730         int cnt = size / sizeof (dst[0]);
 731
 732         for (int i = 0; i < cnt; i++, dst++) {
 733                 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
 734                 VDEV_RAIDZ_64MUL_2(*dst, mask);
 735         }
 736
 737         return (0);
 738 }
 739
 740 struct reconst_q_struct {
 741         uint64_t *q;
 742         int exp;
 743 };
 744
 745 static int
 746 vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
 747 {
 748         struct reconst_q_struct *rq = private;
 749         uint64_t *dst = buf;
 750         int cnt = size / sizeof (dst[0]);
 751
 752         for (int i = 0; i < cnt; i++, dst++, rq->q++) {
 753                 int j;
 754                 uint8_t *b;
 755
 756                 *dst ^= *rq->q;
 757                 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
 758                         *b = vdev_raidz_exp2(*b, rq->exp);
 759                 }
 760         }
 761
 762         return (0);
 763 }
 764
 765 struct reconst_pq_struct {
 766         uint8_t *p;
 767         uint8_t *q;
 768         uint8_t *pxy;
 769         uint8_t *qxy;
 770         int aexp;
 771         int bexp;
 772 };
 773
 774 static int
 775 vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
 776 {
 777         struct reconst_pq_struct *rpq = private;
 778         uint8_t *xd = xbuf;
 779         uint8_t *yd = ybuf;
 780
 781         for (int i = 0; i < size;
 782             i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
 783                 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
 784                     vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
 785                 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
 786         }
 787
 788         return (0);
 789 }
 790
 791 static int
 792 vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
 793 {
 794         struct reconst_pq_struct *rpq = private;
 795         uint8_t *xd = xbuf;
 796
 797         for (int i = 0; i < size;
 798             i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
 799                 /* same operation as vdev_raidz_reconst_pq_func() on xd */
 800                 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
 801                     vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
 802         }
 803
 804         return (0);
 805 }
 806
 807 static int
 808 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
 809 {
 810         int x = tgts[0];
 811         int c;
 812         abd_t *dst, *src;
 813
 814         ASSERT(ntgts == 1);
 815         ASSERT(x >= rm->rm_firstdatacol);
 816         ASSERT(x < rm->rm_cols);
 817
 818         ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size);
 819         ASSERT(rm->rm_col[x].rc_size > 0);
 820
 821         src = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
 822         dst = rm->rm_col[x].rc_abd;
 823
 824         abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size);
 825
 826         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 827                 uint64_t size = MIN(rm->rm_col[x].rc_size,
 828                     rm->rm_col[c].rc_size);
 829
 830                 src = rm->rm_col[c].rc_abd;
 831                 dst = rm->rm_col[x].rc_abd;
 832
 833                 if (c == x)
 834                         continue;
 835
 836                 (void) abd_iterate_func2(dst, src, 0, 0, size,
 837                     vdev_raidz_reconst_p_func, NULL);
 838         }
 839
 840         return (1 << VDEV_RAIDZ_P);
 841 }
 842
 843 static int
 844 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
 845 {
 846         int x = tgts[0];
 847         int c, exp;
 848         abd_t *dst, *src;
 849
 850         ASSERT(ntgts == 1);
 851
 852         ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size);
 853
 854         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 855                 uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size,
 856                     rm->rm_col[c].rc_size);
 857
 858                 src = rm->rm_col[c].rc_abd;
 859                 dst = rm->rm_col[x].rc_abd;
 860
 861                 if (c == rm->rm_firstdatacol) {
 862                         abd_copy(dst, src, size);
 863                         if (rm->rm_col[x].rc_size > size)
 864                                 abd_zero_off(dst, size,
 865                                     rm->rm_col[x].rc_size - size);
 866
 867                 } else {
 868                         ASSERT3U(size, <=, rm->rm_col[x].rc_size);
 869                         (void) abd_iterate_func2(dst, src, 0, 0, size,
 870                             vdev_raidz_reconst_q_pre_func, NULL);
 871                         (void) abd_iterate_func(dst,
 872                             size, rm->rm_col[x].rc_size - size,
 873                             vdev_raidz_reconst_q_pre_tail_func, NULL);
 874                 }
 875         }
 876
 877         src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
 878         dst = rm->rm_col[x].rc_abd;
 879         exp = 255 - (rm->rm_cols - 1 - x);
 880
 881         struct reconst_q_struct rq = { abd_to_buf(src), exp };
 882         (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size,
 883             vdev_raidz_reconst_q_post_func, &rq);
 884
 885         return (1 << VDEV_RAIDZ_Q);
 886 }
 887
/*
 * Rebuild two missing data columns, x = tgts[0] and y = tgts[1] (x < y),
 * from the P and Q parity columns.
 *
 * While it runs, this function temporarily replaces the P and Q ABDs in the
 * map with scratch buffers and temporarily sets the sizes of columns x and y
 * to zero; the original sizes and parity ABDs are put back before returning
 * and the scratch buffers are freed.
 *
 * Returns a bitmask naming the parity columns that were used (P and Q).
 */
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
        uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
        abd_t *pdata, *qdata;
        uint64_t xsize, ysize;
        int x = tgts[0];
        int y = tgts[1];
        abd_t *xd, *yd;

        ASSERT(ntgts == 2);
        ASSERT(x < y);
        ASSERT(x >= rm->rm_firstdatacol);
        ASSERT(y < rm->rm_cols);

        ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

        /*
         * Move the parity data aside -- we're going to compute parity as
         * though columns x and y were full of zeros -- Pxy and Qxy. We want to
         * reuse the parity generation mechanism without trashing the actual
         * parity so we make those columns appear to be full of zeros by
         * setting their lengths to zero.
         */
        pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd;
        qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd;
        xsize = rm->rm_col[x].rc_size;
        ysize = rm->rm_col[y].rc_size;

        rm->rm_col[VDEV_RAIDZ_P].rc_abd =
            abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
        rm->rm_col[VDEV_RAIDZ_Q].rc_abd =
            abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
        rm->rm_col[x].rc_size = 0;
        rm->rm_col[y].rc_size = 0;

        /* Writes Pxy and Qxy into the scratch ABDs installed above. */
        vdev_raidz_generate_parity_pq(rm);

        rm->rm_col[x].rc_size = xsize;
        rm->rm_col[y].rc_size = ysize;

        p = abd_to_buf(pdata);
        q = abd_to_buf(qdata);
        pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
        qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);
        xd = rm->rm_col[x].rc_abd;
        yd = rm->rm_col[y].rc_abd;

        /*
         * We now have:
         *      Pxy = P + D_x + D_y
         *      Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
         *
         * We can then solve for D_x:
         *      D_x = A * (P + Pxy) + B * (Q + Qxy)
         * where
         *      A = 2^(x - y) * (2^(x - y) + 1)^-1
         *      B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
         *
         * (Exponents are taken modulo 255, so the negative powers are the
         * table entries 255 + x - y and 255 - (ndevs - 1 - x) used below.)
         *
         * With D_x in hand, we can easily solve for D_y:
         *      D_y = P + Pxy + D_x
         */

        a = vdev_raidz_pow2[255 + x - y];
        b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
        /* tmp is the log of (2^(x - y) + 1)^-1. */
        tmp = 255 - vdev_raidz_log2[a ^ 1];

        aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
        bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

        ASSERT3U(xsize, >=, ysize);
        struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };

        /*
         * Both columns are rebuilt over the first ysize bytes; the remaining
         * xsize - ysize bytes exist only in column x.  The second call
         * continues from the cursors left in rpq by the first.
         */
        (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
            vdev_raidz_reconst_pq_func, &rpq);
        (void) abd_iterate_func(xd, ysize, xsize - ysize,
            vdev_raidz_reconst_pq_tail_func, &rpq);

        abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
        abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd);

        /*
         * Restore the saved parity data.
         */
        rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata;
        rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata;

        return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
 977
 978 /* BEGIN CSTYLED */
 979 /*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 985  *
 986  *            __   __                     __     __
 987  *            |     |         __     __   |  p_0  |
 988  *            |  V  |         |  D_0  |   | p_m-1 |
 989  *            |     |    x    |   :   | = |  d_0  |
 990  *            |  I  |         | D_n-1 |   |   :   |
 991  *            |     |         ~~     ~~   | d_n-1 |
 992  *            ~~   ~~                     ~~     ~~
 993  *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity and speedy
 * computation as well as linear separability.
 998  *
 999  *      __               __               __     __
1000  *      |   1   ..  1 1 1 |               |  p_0  |
1001  *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
1002  *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
1003  *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
1004  *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
1005  *      |   :       : : : |   |   :   |   |  d_2  |
1006  *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
1007  *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
1008  *      |   0   ..  0 0 1 |               | d_n-1 |
1009  *      ~~               ~~               ~~     ~~
1010  *
1011  * Note that I, V, d, and p are known. To compute D, we must invert the
1012  * matrix and use the known data and parity values to reconstruct the unknown
1013  * data values. We begin by removing the rows in V|I and d|p that correspond
1014  * to failed or missing columns; we then make V|I square (n x n) and d|p
1015  * sized n by removing rows corresponding to unused parity from the bottom up
1016  * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1017  * using Gauss-Jordan elimination. In the example below we use m=3 parity
1018  * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1019  *           __                               __
1020  *           |  1   1   1   1   1   1   1   1  |
1021  *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
1022  *           |  19 205 116  29  64  16  4   1  |      / /
1023  *           |  1   0   0   0   0   0   0   0  |     / /
1024  *           |  0   1   0   0   0   0   0   0  | <--' /
1025  *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
1026  *           |  0   0   0   1   0   0   0   0  |
1027  *           |  0   0   0   0   1   0   0   0  |
1028  *           |  0   0   0   0   0   1   0   0  |
1029  *           |  0   0   0   0   0   0   1   0  |
1030  *           |  0   0   0   0   0   0   0   1  |
1031  *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
1045  *
1046  * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1047  * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1048  * matrix is not singular.
1049  * __                                                                 __
1050  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1051  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1052  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1053  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1054  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1055  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1056  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1057  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1058  * ~~                                                                 ~~
1059  * __                                                                 __
1060  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1061  * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
1062  * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
1063  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1064  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1065  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1066  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1067  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1068  * ~~                                                                 ~~
1069  * __                                                                 __
1070  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1071  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1072  * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
1073  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1074  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1075  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1076  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1077  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1078  * ~~                                                                 ~~
1079  * __                                                                 __
1080  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1081  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1082  * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204 |
1083  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1084  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1085  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1086  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1087  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1088  * ~~                                                                 ~~
1089  * __                                                                 __
1090  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1091  * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
1092  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1093  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1094  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1095  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1096  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1097  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1098  * ~~                                                                 ~~
1099  * __                                                                 __
1100  * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
1101  * |  0   1   0   0   0   0   0   0    167 100  5   41 159 169 217 208 |
1102  * |  0   0   1   0   0   0   0   0    166 100  4   40 158 168 216 209 |
1103  * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
1104  * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
1105  * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
1106  * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
1107  * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
1108  * ~~                                                                 ~~
1109  *                   __                               __
1110  *                   |  0   0   1   0   0   0   0   0  |
1111  *                   | 167 100  5   41 159 169 217 208 |
1112  *                   | 166 100  4   40 158 168 216 209 |
1113  *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
1114  *                   |  0   0   0   0   1   0   0   0  |
1115  *                   |  0   0   0   0   0   1   0   0  |
1116  *                   |  0   0   0   0   0   0   1   0  |
1117  *                   |  0   0   0   0   0   0   0   1  |
1118  *                   ~~                               ~~
1119  *
1120  * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1121  * of the missing data.
1122  *
1123  * As is apparent from the example above, the only non-trivial rows in the
1124  * inverse matrix correspond to the data disks that we're trying to
1125  * reconstruct. Indeed, those are the only rows we need as the others would
1126  * only be useful for reconstructing data known or assumed to be valid. For
1127  * that reason, we only build the coefficients in the rows that correspond to
1128  * targeted columns.
1129  */
1130 /* END CSTYLED */
1131
1132 static void
1133 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1134     uint8_t **rows)
1135 {
1136         int i, j;
1137         int pow;
1138
1139         ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1140
1141         /*
1142          * Fill in the missing rows of interest.
1143          */
1144         for (i = 0; i < nmap; i++) {
1145                 ASSERT3S(0, <=, map[i]);
1146                 ASSERT3S(map[i], <=, 2);
1147
1148                 pow = map[i] * n;
1149                 if (pow > 255)
1150                         pow -= 255;
1151                 ASSERT(pow <= 255);
1152
1153                 for (j = 0; j < n; j++) {
1154                         pow -= map[i];
1155                         if (pow < 0)
1156                                 pow += 255;
1157                         rows[i][j] = vdev_raidz_pow2[pow];
1158                 }
1159         }
1160 }
1161
/*
 * Compute, by Gauss-Jordan elimination, the rows of the inverse matrix that
 * are needed to rebuild the missing data columns.
 *
 * rm       - the RAID-Z map; used only for rm_firstdatacol
 * n        - number of data columns, i.e. the width of every row
 * nmissing - number of missing data columns, i.e. the number of rows
 * missing  - for each row, the data-relative index (0 .. n-1) of the missing
 *            column that row solves for
 * rows     - in: coefficients from vdev_raidz_matrix_init(); modified in
 *            place and, on return, reduced to rows of an identity matrix
 * invrows  - out: the corresponding rows of the inverse
 * used     - n column numbers; the first nmissing are parity columns and the
 *            rest are the surviving data columns
 */
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
        int i, j, ii, jj;
        uint8_t log;

        /*
         * Assert that the first nmissing entries from the array of used
         * columns correspond to parity columns and that subsequent entries
         * correspond to data columns.
         */
        for (i = 0; i < nmissing; i++) {
                ASSERT3S(used[i], <, rm->rm_firstdatacol);
        }
        for (; i < n; i++) {
                ASSERT3S(used[i], >=, rm->rm_firstdatacol);
        }

        /*
         * First initialize the storage where we'll compute the inverse rows.
         */
        for (i = 0; i < nmissing; i++) {
                for (j = 0; j < n; j++) {
                        invrows[i][j] = (i == j) ? 1 : 0;
                }
        }

        /*
         * Subtract all trivial rows from the rows of consequence.
         *
         * Entry j of an inverse row corresponds to used[j].  For the
         * surviving data columns the coefficient is moved from rows[] into
         * that slot (which was initialized to zero above) and cleared in
         * rows[], leaving non-zero entries only in the missing columns.
         */
        for (i = 0; i < nmissing; i++) {
                for (j = nmissing; j < n; j++) {
                        ASSERT3U(used[j], >=, rm->rm_firstdatacol);
                        jj = used[j] - rm->rm_firstdatacol;
                        ASSERT3S(jj, <, n);
                        invrows[i][j] = rows[i][jj];
                        rows[i][jj] = 0;
                }
        }

        /*
         * For each of the rows of interest, we must normalize it and subtract
         * a multiple of it from the other rows.
         */
        for (i = 0; i < nmissing; i++) {
                /* Everything left of the pivot must already be eliminated. */
                for (j = 0; j < missing[i]; j++) {
                        ASSERT0(rows[i][j]);
                }
                ASSERT3U(rows[i][missing[i]], !=, 0);

                /*
                 * Compute the inverse of the first element and multiply each
                 * element in the row by that value.
                 */
                log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

                for (j = 0; j < n; j++) {
                        rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
                        invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
                }

                /*
                 * Clear the pivot column in every other row by adding (XOR)
                 * the normalized pivot row scaled by that row's entry.
                 */
                for (ii = 0; ii < nmissing; ii++) {
                        if (i == ii)
                                continue;

                        ASSERT3U(rows[ii][missing[i]], !=, 0);

                        log = vdev_raidz_log2[rows[ii][missing[i]]];

                        for (j = 0; j < n; j++) {
                                rows[ii][j] ^=
                                    vdev_raidz_exp2(rows[i][j], log);
                                invrows[ii][j] ^=
                                    vdev_raidz_exp2(invrows[i][j], log);
                        }
                }
        }

        /*
         * Verify that the data that is left in the rows are properly part of
         * an identity matrix.
         */
        for (i = 0; i < nmissing; i++) {
                for (j = 0; j < n; j++) {
                        if (j == missing[i]) {
                                ASSERT3U(rows[i][j], ==, 1);
                        } else {
                                ASSERT0(rows[i][j]);
                        }
                }
        }
}
1255
1256 static void
1257 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1258     int *missing, uint8_t **invrows, const uint8_t *used)
1259 {
1260         int i, j, x, cc, c;
1261         uint8_t *src;
1262         uint64_t ccount;
1263         uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1264         uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
1265         uint8_t log = 0;
1266         uint8_t val;
1267         int ll;
1268         uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1269         uint8_t *p, *pp;
1270         size_t psize;
1271
1272         psize = sizeof (invlog[0][0]) * n * nmissing;
1273         p = kmem_alloc(psize, KM_SLEEP);
1274
1275         for (pp = p, i = 0; i < nmissing; i++) {
1276                 invlog[i] = pp;
1277                 pp += n;
1278         }
1279
1280         for (i = 0; i < nmissing; i++) {
1281                 for (j = 0; j < n; j++) {
1282                         ASSERT3U(invrows[i][j], !=, 0);
1283                         invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1284                 }
1285         }
1286
1287         for (i = 0; i < n; i++) {
1288                 c = used[i];
1289                 ASSERT3U(c, <, rm->rm_cols);
1290
1291                 src = abd_to_buf(rm->rm_col[c].rc_abd);
1292                 ccount = rm->rm_col[c].rc_size;
1293                 for (j = 0; j < nmissing; j++) {
1294                         cc = missing[j] + rm->rm_firstdatacol;
1295                         ASSERT3U(cc, >=, rm->rm_firstdatacol);
1296                         ASSERT3U(cc, <, rm->rm_cols);
1297                         ASSERT3U(cc, !=, c);
1298
1299                         dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd);
1300                         dcount[j] = rm->rm_col[cc].rc_size;
1301                 }
1302
1303                 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1304
1305                 for (x = 0; x < ccount; x++, src++) {
1306                         if (*src != 0)
1307                                 log = vdev_raidz_log2[*src];
1308
1309                         for (cc = 0; cc < nmissing; cc++) {
1310                                 if (x >= dcount[cc])
1311                                         continue;
1312
1313                                 if (*src == 0) {
1314                                         val = 0;
1315                                 } else {
1316                                         if ((ll = log + invlog[cc][i]) >= 255)
1317                                                 ll -= 255;
1318                                         val = vdev_raidz_pow2[ll];
1319                                 }
1320
1321                                 if (i == 0)
1322                                         dst[cc][x] = val;
1323                                 else
1324                                         dst[cc][x] ^= val;
1325                         }
1326                 }
1327         }
1328
1329         kmem_free(p, psize);
1330 }
1331
/*
 * Matrix-based reconstruction of the targeted columns, used when none of
 * the specialized routines applies.
 *
 * rm    - the RAID-Z map
 * tgts  - column numbers (parity and/or data) that must not be trusted; the
 *         parity selection loop below walks this list in step with the
 *         column number, so it is expected to be in ascending order
 * ntgts - number of entries in tgts
 *
 * Only the targeted data columns are rebuilt.  For each of them one
 * non-targeted parity column is selected, lowest numbered first.
 *
 * Returns a bitmask with bit c set for every parity column c that was used.
 */
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
        int n, i, c, t, tt;
        int nmissing_rows;
        int missing_rows[VDEV_RAIDZ_MAXPARITY];
        int parity_map[VDEV_RAIDZ_MAXPARITY];

        uint8_t *p, *pp;
        size_t psize;

        uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
        uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
        uint8_t *used;

        abd_t **bufs = NULL;

        int code = 0;

        /*
         * Matrix reconstruction can't use scatter ABDs yet, so we allocate
         * temporary linear ABDs.
         *
         * Only the first data column is inspected to make the decision, and
         * only the data columns are swapped; bufs[] is sized for all columns
         * but the parity slots are left unused.
         */
        if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) {
                bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE);

                for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                        raidz_col_t *col = &rm->rm_col[c];

                        bufs[c] = col->rc_abd;
                        col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE);
                        abd_copy(col->rc_abd, bufs[c], col->rc_size);
                }
        }

        n = rm->rm_cols - rm->rm_firstdatacol;

        /*
         * Figure out which data columns are missing.
         * The indices stored in missing_rows[] are relative to the first
         * data column.
         */
        nmissing_rows = 0;
        for (t = 0; t < ntgts; t++) {
                if (tgts[t] >= rm->rm_firstdatacol) {
                        missing_rows[nmissing_rows++] =
                            tgts[t] - rm->rm_firstdatacol;
                }
        }

        /*
         * Figure out which parity columns to use to help generate the missing
         * data columns.
         */
        for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
                ASSERT(tt < ntgts);
                ASSERT(c < rm->rm_firstdatacol);

                /*
                 * Skip any targeted parity columns.
                 */
                if (c == tgts[tt]) {
                        tt++;
                        continue;
                }

                code |= 1 << c;

                parity_map[i] = c;
                i++;
        }

        ASSERT(code != 0);
        ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

        /*
         * A single allocation holds, in order: rows[0], invrows[0], rows[1],
         * invrows[1], ... (n bytes each) followed by the n-entry used[]
         * array.
         */
        psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
            nmissing_rows * n + sizeof (used[0]) * n;
        p = kmem_alloc(psize, KM_SLEEP);

        for (pp = p, i = 0; i < nmissing_rows; i++) {
                rows[i] = pp;
                pp += n;
                invrows[i] = pp;
                pp += n;
        }
        used = pp;

        /* used[] starts with the chosen parity columns ... */
        for (i = 0; i < nmissing_rows; i++) {
                used[i] = parity_map[i];
        }

        /*
         * ... followed by every data column that is not missing.  i carries
         * over from the loop above and ends up at n.
         */
        for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                if (tt < nmissing_rows &&
                    c == missing_rows[tt] + rm->rm_firstdatacol) {
                        tt++;
                        continue;
                }

                ASSERT3S(i, <, n);
                used[i] = c;
                i++;
        }

        /*
         * Initialize the interesting rows of the matrix.
         */
        vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

        /*
         * Invert the matrix.
         */
        vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
            invrows, used);

        /*
         * Reconstruct the missing data using the generated matrix.
         */
        vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
            invrows, used);

        kmem_free(p, psize);

        /*
         * copy back from temporary linear abds and free them
         */
        if (bufs) {
                for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                        raidz_col_t *col = &rm->rm_col[c];

                        abd_copy(bufs[c], col->rc_abd, col->rc_size);
                        abd_free(col->rc_abd);
                        col->rc_abd = bufs[c];
                }
                kmem_free(bufs, rm->rm_cols * sizeof (abd_t *));
        }

        return (code);
}
1468
1469 int
1470 vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
1471 {
1472         int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1473         int ntgts;
1474         int i, c, ret;
1475         int code;
1476         int nbadparity, nbaddata;
1477         int parity_valid[VDEV_RAIDZ_MAXPARITY];
1478
1479         /*
1480          * The tgts list must already be sorted.
1481          */
1482         for (i = 1; i < nt; i++) {
1483                 ASSERT(t[i] > t[i - 1]);
1484         }
1485
1486         nbadparity = rm->rm_firstdatacol;
1487         nbaddata = rm->rm_cols - nbadparity;
1488         ntgts = 0;
1489         for (i = 0, c = 0; c < rm->rm_cols; c++) {
1490                 if (c < rm->rm_firstdatacol)
1491                         parity_valid[c] = B_FALSE;
1492
1493                 if (i < nt && c == t[i]) {
1494                         tgts[ntgts++] = c;
1495                         i++;
1496                 } else if (rm->rm_col[c].rc_error != 0) {
1497                         tgts[ntgts++] = c;
1498                 } else if (c >= rm->rm_firstdatacol) {
1499                         nbaddata--;
1500                 } else {
1501                         parity_valid[c] = B_TRUE;
1502                         nbadparity--;
1503                 }
1504         }
1505
1506         ASSERT(ntgts >= nt);
1507         ASSERT(nbaddata >= 0);
1508         ASSERT(nbaddata + nbadparity == ntgts);
1509
1510         dt = &tgts[nbadparity];
1511
1512         /* Reconstruct using the new math implementation */
1513         ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata);
1514         if (ret != RAIDZ_ORIGINAL_IMPL)
1515                 return (ret);
1516
1517         /*
1518          * See if we can use any of our optimized reconstruction routines.
1519          */
1520         switch (nbaddata) {
1521         case 1:
1522                 if (parity_valid[VDEV_RAIDZ_P])
1523                         return (vdev_raidz_reconstruct_p(rm, dt, 1));
1524
1525                 ASSERT(rm->rm_firstdatacol > 1);
1526
1527                 if (parity_valid[VDEV_RAIDZ_Q])
1528                         return (vdev_raidz_reconstruct_q(rm, dt, 1));
1529
1530                 ASSERT(rm->rm_firstdatacol > 2);
1531                 break;
1532
1533         case 2:
1534                 ASSERT(rm->rm_firstdatacol > 1);
1535
1536                 if (parity_valid[VDEV_RAIDZ_P] &&
1537                     parity_valid[VDEV_RAIDZ_Q])
1538                         return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1539
1540                 ASSERT(rm->rm_firstdatacol > 2);
1541
1542                 break;
1543         }
1544
1545         code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1546         ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1547         ASSERT(code > 0);
1548         return (code);
1549 }
1550
1551 static int
1552 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1553     uint64_t *ashift)
1554 {
1555         vdev_t *cvd;
1556         uint64_t nparity = vd->vdev_nparity;
1557         int c;
1558         int lasterror = 0;
1559         int numerrors = 0;
1560
1561         ASSERT(nparity > 0);
1562
1563         if (nparity > VDEV_RAIDZ_MAXPARITY ||
1564             vd->vdev_children < nparity + 1) {
1565                 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1566                 return (SET_ERROR(EINVAL));
1567         }
1568
1569         vdev_open_children(vd);
1570
1571         for (c = 0; c < vd->vdev_children; c++) {
1572                 cvd = vd->vdev_child[c];
1573
1574                 if (cvd->vdev_open_error != 0) {
1575                         lasterror = cvd->vdev_open_error;
1576                         numerrors++;
1577                         continue;
1578                 }
1579
1580                 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1581                 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1582                 *ashift = MAX(*ashift, cvd->vdev_ashift);
1583         }
1584
1585         *asize *= vd->vdev_children;
1586         *max_asize *= vd->vdev_children;
1587
1588         if (numerrors > nparity) {
1589                 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1590                 return (lasterror);
1591         }
1592
1593         return (0);
1594 }
1595
1596 static void
1597 vdev_raidz_close(vdev_t *vd)
1598 {
1599         int c;
1600
1601         for (c = 0; c < vd->vdev_children; c++)
1602                 vdev_close(vd->vdev_child[c]);
1603 }
1604
1605 static uint64_t
1606 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1607 {
1608         uint64_t asize;
1609         uint64_t ashift = vd->vdev_top->vdev_ashift;
1610         uint64_t cols = vd->vdev_children;
1611         uint64_t nparity = vd->vdev_nparity;
1612
1613         asize = ((psize - 1) >> ashift) + 1;
1614         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1615         asize = roundup(asize, nparity + 1) << ashift;
1616
1617         return (asize);
1618 }
1619
1620 static void
1621 vdev_raidz_child_done(zio_t *zio)
1622 {
1623         raidz_col_t *rc = zio->io_private;
1624
1625         rc->rc_error = zio->io_error;
1626         rc->rc_tried = 1;
1627         rc->rc_skipped = 0;
1628 }
1629
1630 /*
1631  * Start an IO operation on a RAIDZ VDev
1632  *
1633  * Outline:
1634  * - For write operations:
1635  *   1. Generate the parity data
1636  *   2. Create child zio write operations to each column's vdev, for both
1637  *      data and parity.
1638  *   3. If the column skips any sectors for padding, create optional dummy
1639  *      write zio children for those areas to improve aggregation continuity.
1640  * - For read operations:
1641  *   1. Create child zio read operations to each data column's vdev to read
1642  *      the range of data required for zio.
1643  *   2. If this is a scrub or resilver operation, or if any of the data
1644  *      vdevs have had errors, then create zio read operations to the parity
1645  *      columns' VDevs as well.
1646  */
1647 static void
1648 vdev_raidz_io_start(zio_t *zio)
1649 {
1650         vdev_t *vd = zio->io_vd;
1651         vdev_t *tvd = vd->vdev_top;
1652         vdev_t *cvd;
1653         raidz_map_t *rm;
1654         raidz_col_t *rc;
1655         int c, i;
1656
1657         rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1658             vd->vdev_nparity);
1659
1660         ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1661
1662         if (zio->io_type == ZIO_TYPE_WRITE) {
1663                 vdev_raidz_generate_parity(rm);
1664
1665                 for (c = 0; c < rm->rm_cols; c++) {
1666                         rc = &rm->rm_col[c];
1667                         cvd = vd->vdev_child[rc->rc_devidx];
1668                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1669                             rc->rc_offset, rc->rc_abd, rc->rc_size,
1670                             zio->io_type, zio->io_priority, 0,
1671                             vdev_raidz_child_done, rc));
1672                 }
1673
1674                 /*
1675                  * Generate optional I/Os for any skipped sectors to improve
1676                  * aggregation contiguity.
1677                  */
1678                 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1679                         ASSERT(c <= rm->rm_scols);
1680                         if (c == rm->rm_scols)
1681                                 c = 0;
1682                         rc = &rm->rm_col[c];
1683                         cvd = vd->vdev_child[rc->rc_devidx];
1684                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1685                             rc->rc_offset + rc->rc_size, NULL,
1686                             1 << tvd->vdev_ashift,
1687                             zio->io_type, zio->io_priority,
1688                             ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1689                 }
1690
1691                 zio_execute(zio);
1692                 return;
1693         }
1694
1695         ASSERT(zio->io_type == ZIO_TYPE_READ);
1696
1697         /*
1698          * Iterate over the columns in reverse order so that we hit the parity
1699          * last -- any errors along the way will force us to read the parity.
1700          */
1701         for (c = rm->rm_cols - 1; c >= 0; c--) {
1702                 rc = &rm->rm_col[c];
1703                 cvd = vd->vdev_child[rc->rc_devidx];
1704                 if (!vdev_readable(cvd)) {
1705                         if (c >= rm->rm_firstdatacol)
1706                                 rm->rm_missingdata++;
1707                         else
1708                                 rm->rm_missingparity++;
1709                         rc->rc_error = SET_ERROR(ENXIO);
1710                         rc->rc_tried = 1;       /* don't even try */
1711                         rc->rc_skipped = 1;
1712                         continue;
1713                 }
1714                 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1715                         if (c >= rm->rm_firstdatacol)
1716                                 rm->rm_missingdata++;
1717                         else
1718                                 rm->rm_missingparity++;
1719                         rc->rc_error = SET_ERROR(ESTALE);
1720                         rc->rc_skipped = 1;
1721                         continue;
1722                 }
1723                 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1724                     (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1725                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1726                             rc->rc_offset, rc->rc_abd, rc->rc_size,
1727                             zio->io_type, zio->io_priority, 0,
1728                             vdev_raidz_child_done, rc));
1729                 }
1730         }
1731
1732         zio_execute(zio);
1733 }
1734
1735
1736 /*
1737  * Report a checksum error for a child of a RAID-Z device.
1738  */
1739 static void
1740 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
1741 {
1742         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1743
1744         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1745                 zio_bad_cksum_t zbc;
1746                 raidz_map_t *rm = zio->io_vsd;
1747
1748                 mutex_enter(&vd->vdev_stat_lock);
1749                 vd->vdev_stat.vs_checksum_errors++;
1750                 mutex_exit(&vd->vdev_stat_lock);
1751
1752                 zbc.zbc_has_cksum = 0;
1753                 zbc.zbc_injected = rm->rm_ecksuminjected;
1754
1755                 zfs_ereport_post_checksum(zio->io_spa, vd,
1756                     &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
1757                     rc->rc_abd, bad_data, &zbc);
1758         }
1759 }
1760
1761 /*
1762  * We keep track of whether or not there were any injected errors, so that
1763  * any ereports we generate can note it.
1764  */
1765 static int
1766 raidz_checksum_verify(zio_t *zio)
1767 {
1768         zio_bad_cksum_t zbc;
1769         raidz_map_t *rm = zio->io_vsd;
1770
1771         bzero(&zbc, sizeof (zio_bad_cksum_t));
1772
1773         int ret = zio_checksum_error(zio, &zbc);
1774         if (ret != 0 && zbc.zbc_injected != 0)
1775                 rm->rm_ecksuminjected = 1;
1776
1777         return (ret);
1778 }
1779
1780 /*
1781  * Generate the parity from the data columns. If we tried and were able to
1782  * read the parity without error, verify that the generated parity matches the
1783  * data we read. If it doesn't, we fire off a checksum error. Return the
1784  * number such failures.
1785  */
1786 static int
1787 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1788 {
1789         abd_t *orig[VDEV_RAIDZ_MAXPARITY];
1790         int c, ret = 0;
1791         raidz_col_t *rc;
1792
1793         blkptr_t *bp = zio->io_bp;
1794         enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
1795             (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
1796
1797         if (checksum == ZIO_CHECKSUM_NOPARITY)
1798                 return (ret);
1799
1800         for (c = 0; c < rm->rm_firstdatacol; c++) {
1801                 rc = &rm->rm_col[c];
1802                 if (!rc->rc_tried || rc->rc_error != 0)
1803                         continue;
1804
1805                 orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size);
1806                 abd_copy(orig[c], rc->rc_abd, rc->rc_size);
1807         }
1808
1809         vdev_raidz_generate_parity(rm);
1810
1811         for (c = 0; c < rm->rm_firstdatacol; c++) {
1812                 rc = &rm->rm_col[c];
1813                 if (!rc->rc_tried || rc->rc_error != 0)
1814                         continue;
1815                 if (abd_cmp(orig[c], rc->rc_abd) != 0) {
1816                         raidz_checksum_error(zio, rc, orig[c]);
1817                         rc->rc_error = SET_ERROR(ECKSUM);
1818                         ret++;
1819                 }
1820                 abd_free(orig[c]);
1821         }
1822
1823         return (ret);
1824 }
1825
1826 static int
1827 vdev_raidz_worst_error(raidz_map_t *rm)
1828 {
1829         int error = 0;
1830
1831         for (int c = 0; c < rm->rm_cols; c++)
1832                 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1833
1834         return (error);
1835 }
1836
/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * zio          - the RAID-Z zio; its io_vsd is the raidz map
 * total_errors - number of columns already known to have failed; must be
 *                smaller than rm_firstdatacol
 * data_errors  - how many of those failures are on data columns
 *
 * Returns the value produced by vdev_raidz_reconstruct() for the first
 * combination whose result passes raidz_checksum_verify(), or 0 if no
 * combination does. On success every targeted column is marked ECKSUM, and
 * a checksum error is reported for those that had actually been read.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
        raidz_map_t *rm = zio->io_vsd;
        raidz_col_t *rc;
        abd_t *orig[VDEV_RAIDZ_MAXPARITY];
        /*
         * tgts[] is the current combination of target columns. It is backed
         * by tstore[] with one extra slot on each side so that tgts[-1] and
         * tgts[n] can hold sentinels.
         */
        int tstore[VDEV_RAIDZ_MAXPARITY + 2];
        int *tgts = &tstore[1];
        int curr, next, i, c, n;
        int code, ret = 0;

        ASSERT(total_errors < rm->rm_firstdatacol);

        /*
         * This simplifies one edge condition: "the column after tgts[-1]"
         * is column 0.
         */
        tgts[-1] = -1;

        /*
         * n is the number of columns targeted at once; it may not exceed
         * the number of parity columns minus the errors already known.
         */
        for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
                /*
                 * Initialize the targets array by finding the first n columns
                 * that contain no error.
                 *
                 * If there were no data errors, we need to ensure that we're
                 * always explicitly attempting to reconstruct at least one
                 * data column. To do this, we simply push the highest target
                 * up into the data columns.
                 */
                for (c = 0, i = 0; i < n; i++) {
                        if (i == n - 1 && data_errors == 0 &&
                            c < rm->rm_firstdatacol) {
                                c = rm->rm_firstdatacol;
                        }

                        while (rm->rm_col[c].rc_error != 0) {
                                c++;
                                ASSERT3S(c, <, rm->rm_cols);
                        }

                        tgts[i] = c++;
                }

                /*
                 * Setting tgts[n] simplifies the other edge condition: the
                 * last target can advance until it reaches rm_cols.
                 */
                tgts[n] = rm->rm_cols;

                /*
                 * These buffers were allocated in previous iterations.
                 */
                for (i = 0; i < n - 1; i++) {
                        ASSERT(orig[i] != NULL);
                }

                /*
                 * One more save buffer is needed for this value of n. It is
                 * sized after column 0 -- presumably no column is larger than
                 * column 0; confirm against vdev_raidz_map_alloc().
                 */
                orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd,
                    rm->rm_col[0].rc_size);

                curr = 0;
                next = tgts[curr];

                /*
                 * Each pass tries one combination; curr reaches n once every
                 * combination of n columns has been tried.
                 */
                while (curr != n) {
                        tgts[curr] = next;
                        curr = 0;

                        /*
                         * Save off the original data that we're going to
                         * attempt to reconstruct.
                         */
                        for (i = 0; i < n; i++) {
                                ASSERT(orig[i] != NULL);
                                c = tgts[i];
                                ASSERT3S(c, >=, 0);
                                ASSERT3S(c, <, rm->rm_cols);
                                rc = &rm->rm_col[c];
                                abd_copy(orig[i], rc->rc_abd, rc->rc_size);
                        }

                        /*
                         * Attempt a reconstruction and exit the outer loop on
                         * success.
                         */
                        code = vdev_raidz_reconstruct(rm, tgts, n);
                        if (raidz_checksum_verify(zio) == 0) {

                                /*
                                 * The targeted columns held bad data. Report
                                 * a checksum error only for columns that
                                 * were actually read, but mark them all.
                                 */
                                for (i = 0; i < n; i++) {
                                        c = tgts[i];
                                        rc = &rm->rm_col[c];
                                        ASSERT(rc->rc_error == 0);
                                        if (rc->rc_tried)
                                                raidz_checksum_error(zio, rc,
                                                    orig[i]);
                                        rc->rc_error = SET_ERROR(ECKSUM);
                                }

                                ret = code;
                                goto done;
                        }

                        /*
                         * Restore the original data.
                         */
                        for (i = 0; i < n; i++) {
                                c = tgts[i];
                                rc = &rm->rm_col[c];
                                abd_copy(rc->rc_abd, orig[i], rc->rc_size);
                        }

                        /*
                         * Step to the next combination: advance the lowest
                         * position that can still move; every position that
                         * cannot is reset to the first valid column after
                         * its predecessor.
                         */
                        do {
                                /*
                                 * Find the next valid column after the curr
                                 * position.
                                 */
                                for (next = tgts[curr] + 1;
                                    next < rm->rm_cols &&
                                    rm->rm_col[next].rc_error != 0; next++)
                                        continue;

                                ASSERT(next <= tgts[curr + 1]);

                                /*
                                 * If that spot is available, we're done here.
                                 */
                                if (next != tgts[curr + 1])
                                        break;

                                /*
                                 * Otherwise, find the next valid column after
                                 * the previous position.
                                 */
                                for (c = tgts[curr - 1] + 1;
                                    rm->rm_col[c].rc_error != 0; c++)
                                        continue;

                                tgts[curr] = c;
                                curr++;

                        } while (curr != n);
                }
        }
        /*
         * The loop above exits with n one greater than the number of save
         * buffers that were allocated; on the "goto done" path n is already
         * that number.
         */
        n--;
done:
        for (i = 0; i < n; i++)
                abd_free(orig[i]);

        return (ret);
}
1991
/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 *
 * The result is reported through zio->io_error; the function itself
 * returns nothing. When untried columns remain, it issues reads for them
 * and returns early without reaching the "done" label.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
        vdev_t *vd = zio->io_vd;
        vdev_t *cvd;
        raidz_map_t *rm = zio->io_vsd;
        raidz_col_t *rc = NULL;
        int unexpected_errors = 0;
        int parity_errors = 0;
        int parity_untried = 0;
        int data_errors = 0;
        int total_errors = 0;
        int n, c;
        int tgts[VDEV_RAIDZ_MAXPARITY];
        int code;

        ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

        ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
        ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

        /*
         * Tally the per-column outcome: errors are split into parity and
         * data errors, errors on columns that were not deliberately skipped
         * count as unexpected, and error-free parity columns that were
         * never read are counted as untried.
         */
        for (c = 0; c < rm->rm_cols; c++) {
                rc = &rm->rm_col[c];

                if (rc->rc_error) {
                        ASSERT(rc->rc_error != ECKSUM); /* child has no bp */

                        if (c < rm->rm_firstdatacol)
                                parity_errors++;
                        else
                                data_errors++;

                        if (!rc->rc_skipped)
                                unexpected_errors++;

                        total_errors++;
                } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
                        parity_untried++;
                }
        }

        if (zio->io_type == ZIO_TYPE_WRITE) {
                /*
                 * XXX -- for now, treat partial writes as a success.
                 * (If we couldn't write enough columns to reconstruct
                 * the data, the I/O failed.  Otherwise, good enough.)
                 *
                 * Now that we support write reallocation, it would be better
                 * to treat partial failure as real failure unless there are
                 * no non-degraded top-level vdevs left, and not update DTLs
                 * if we intend to reallocate.
                 */
                /* XXPOLICY */
                if (total_errors > rm->rm_firstdatacol)
                        zio->io_error = vdev_raidz_worst_error(rm);

                return;
        }

        ASSERT(zio->io_type == ZIO_TYPE_READ);
        /*
         * There are three potential phases for a read:
         *      1. produce valid data from the columns read
         *      2. read all disks and try again
         *      3. perform combinatorial reconstruction
         *
         * Each phase is progressively both more expensive and less likely to
         * occur. If we encounter more errors than we can repair or all phases
         * fail, we have no choice but to return an error.
         */

        /*
         * If the number of errors we saw was correctable -- less than or equal
         * to the number of parity disks read -- attempt to produce data that
         * has a valid checksum. Naturally, this case applies in the absence of
         * any errors.
         */
        if (total_errors <= rm->rm_firstdatacol - parity_untried) {
                if (data_errors == 0) {
                        /*
                         * All data columns were read without error. If the
                         * checksum nevertheless fails, fall through to the
                         * re-read phase below.
                         */
                        if (raidz_checksum_verify(zio) == 0) {
                                /*
                                 * If we read parity information (unnecessarily
                                 * as it happens since no reconstruction was
                                 * needed) regenerate and verify the parity.
                                 * We also regenerate parity when resilvering
                                 * so we can write it out to the failed device
                                 * later.
                                 */
                                if (parity_errors + parity_untried <
                                    rm->rm_firstdatacol ||
                                    (zio->io_flags & ZIO_FLAG_RESILVER)) {
                                        n = raidz_parity_verify(zio, rm);
                                        unexpected_errors += n;
                                        ASSERT(parity_errors + n <=
                                            rm->rm_firstdatacol);
                                }
                                goto done;
                        }
                } else {
                        /*
                         * We either attempt to read all the parity columns or
                         * none of them. If we didn't try to read parity, we
                         * wouldn't be here in the correctable case. There must
                         * also have been fewer parity errors than parity
                         * columns or, again, we wouldn't be in this code path.
                         */
                        ASSERT(parity_untried == 0);
                        ASSERT(parity_errors < rm->rm_firstdatacol);

                        /*
                         * Identify the data columns that reported an error.
                         */
                        n = 0;
                        for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
                                if (rc->rc_error != 0) {
                                        ASSERT(n < VDEV_RAIDZ_MAXPARITY);
                                        tgts[n++] = c;
                                }
                        }

                        ASSERT(rm->rm_firstdatacol >= n);

                        code = vdev_raidz_reconstruct(rm, tgts, n);

                        if (raidz_checksum_verify(zio) == 0) {
                                /*
                                 * If we read more parity disks than were used
                                 * for reconstruction, confirm that the other
                                 * parity disks produced correct data. This
                                 * routine is suboptimal in that it regenerates
                                 * the parity that we already used in addition
                                 * to the parity that we're attempting to
                                 * verify, but this should be a relatively
                                 * uncommon case, and can be optimized if it
                                 * becomes a problem. Note that we regenerate
                                 * parity when resilvering so we can write it
                                 * out to failed devices later.
                                 */
                                if (parity_errors < rm->rm_firstdatacol - n ||
                                    (zio->io_flags & ZIO_FLAG_RESILVER)) {
                                        n = raidz_parity_verify(zio, rm);
                                        unexpected_errors += n;
                                        ASSERT(parity_errors + n <=
                                            rm->rm_firstdatacol);
                                }

                                goto done;
                        }
                }
        }

        /*
         * This isn't a typical situation -- either we got a read error or
         * a child silently returned bad data. Read every block so we can
         * try again with as much data and parity as we can track down. If
         * we've already been through once before, all children will be marked
         * as tried so we'll proceed to combinatorial reconstruction.
         */
        unexpected_errors = 1;
        rm->rm_missingdata = 0;
        rm->rm_missingparity = 0;

        for (c = 0; c < rm->rm_cols; c++) {
                if (rm->rm_col[c].rc_tried)
                        continue;

                /*
                 * Found the first untried column: issue a read for it and
                 * for every later untried column, then return.
                 * NOTE(review): zio_vdev_io_redone() presumably re-arms the
                 * zio so that this function runs again once those reads
                 * complete -- confirm against its definition.
                 */
                zio_vdev_io_redone(zio);
                do {
                        rc = &rm->rm_col[c];
                        /*
                         * Inside a do-while, "continue" jumps to the loop
                         * condition, so ++c is still evaluated.
                         */
                        if (rc->rc_tried)
                                continue;
                        zio_nowait(zio_vdev_child_io(zio, NULL,
                            vd->vdev_child[rc->rc_devidx],
                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            zio->io_type, zio->io_priority, 0,
                            vdev_raidz_child_done, rc));
                } while (++c < rm->rm_cols);

                return;
        }

        /*
         * At this point we've attempted to reconstruct the data given the
         * errors we detected, and we've attempted to read all columns. There
         * must, therefore, be one or more additional problems -- silent errors
         * resulting in invalid data rather than explicit I/O errors resulting
         * in absent data. We check if there is enough additional data to
         * possibly reconstruct the data and then perform combinatorial
         * reconstruction over all possible combinations. If that fails,
         * we're cooked.
         */
        if (total_errors > rm->rm_firstdatacol) {
                zio->io_error = vdev_raidz_worst_error(rm);

        } else if (total_errors < rm->rm_firstdatacol &&
            (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
                /*
                 * If we didn't use all the available parity for the
                 * combinatorial reconstruction, verify that the remaining
                 * parity is correct. (code is compared against the mask
                 * with one bit set per parity column.)
                 */
                if (code != (1 << rm->rm_firstdatacol) - 1)
                        (void) raidz_parity_verify(zio, rm);
        } else {
                /*
                 * We're here because either:
                 *
                 *      total_errors == rm_firstdatacol, or
                 *      vdev_raidz_combrec() failed
                 *
                 * In either case, there is enough bad data to prevent
                 * reconstruction.
                 *
                 * Start checksum ereports for all children which haven't
                 * failed, and the IO wasn't speculative.
                 */
                zio->io_error = SET_ERROR(ECKSUM);

                if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
                        for (c = 0; c < rm->rm_cols; c++) {
                                rc = &rm->rm_col[c];
                                if (rc->rc_error == 0) {
                                        zio_bad_cksum_t zbc;
                                        zbc.zbc_has_cksum = 0;
                                        zbc.zbc_injected =
                                            rm->rm_ecksuminjected;

                                        /*
                                         * The column index is passed along
                                         * as the opaque pointer argument.
                                         */
                                        zfs_ereport_start_checksum(
                                            zio->io_spa,
                                            vd->vdev_child[rc->rc_devidx],
                                            &zio->io_bookmark, zio,
                                            rc->rc_offset, rc->rc_size,
                                            (void *)(uintptr_t)c, &zbc);
                                }
                        }
                }
        }

done:
        zio_checksum_verified(zio);

        /*
         * Repair is attempted only when the read succeeded, the pool is
         * writeable, and either something unexpected went wrong or this is
         * a resilver.
         */
        if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
            (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
                /*
                 * Use the good data we have in hand to repair damaged children.
                 */
                for (c = 0; c < rm->rm_cols; c++) {
                        rc = &rm->rm_col[c];
                        cvd = vd->vdev_child[rc->rc_devidx];

                        if (rc->rc_error == 0)
                                continue;

                        /*
                         * The repair write is additionally flagged as
                         * self-healing when there were unexpected errors.
                         */
                        zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
                            rc->rc_offset, rc->rc_abd, rc->rc_size,
                            ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
                            ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
                            ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
                }
        }
}
2275
2276 static void
2277 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2278 {
2279         if (faulted > vd->vdev_nparity)
2280                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2281                     VDEV_AUX_NO_REPLICAS);
2282         else if (degraded + faulted != 0)
2283                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2284         else
2285                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2286 }
2287
2288 /*
2289  * Determine if any portion of the provided block resides on a child vdev
2290  * with a dirty DTL and therefore needs to be resilvered.  The function
2291  * assumes that at least one DTL is dirty which imples that full stripe
2292  * width blocks must be resilvered.
2293  */
2294 static boolean_t
2295 vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
2296 {
2297         uint64_t dcols = vd->vdev_children;
2298         uint64_t nparity = vd->vdev_nparity;
2299         uint64_t ashift = vd->vdev_top->vdev_ashift;
2300         /* The starting RAIDZ (parent) vdev sector of the block. */
2301         uint64_t b = offset >> ashift;
2302         /* The zio's size in units of the vdev's minimum sector size. */
2303         uint64_t s = ((psize - 1) >> ashift) + 1;
2304         /* The first column for this stripe. */
2305         uint64_t f = b % dcols;
2306
2307         if (s + nparity >= dcols)
2308                 return (B_TRUE);
2309
2310         for (uint64_t c = 0; c < s + nparity; c++) {
2311                 uint64_t devidx = (f + c) % dcols;
2312                 vdev_t *cvd = vd->vdev_child[devidx];
2313
2314                 /*
2315                  * dsl_scan_need_resilver() already checked vd with
2316                  * vdev_dtl_contains(). So here just check cvd with
2317                  * vdev_dtl_empty(), cheaper and a good approximation.
2318                  */
2319                 if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
2320                         return (B_TRUE);
2321         }
2322
2323         return (B_FALSE);
2324 }
2325
/*
 * Operations vector for the RAID-Z vdev type. The initializer is
 * positional, so the entries must stay in the order in which the members
 * of vdev_ops_t are declared.
 *
 * NOTE(review): the three NULL entries leave their slots unset; check the
 * vdev_ops_t declaration to see which operations those are.
 */
vdev_ops_t vdev_raidz_ops = {
        vdev_raidz_open,
        vdev_raidz_close,
        vdev_raidz_asize,
        vdev_raidz_io_start,
        vdev_raidz_io_done,
        vdev_raidz_state_change,
        vdev_raidz_need_resilver,
        NULL,
        NULL,
        NULL,
        VDEV_TYPE_RAIDZ,        /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
};