4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
27 #include <sys/zfs_context.h>
29 #include <sys/vdev_impl.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
36 * Virtual device vector for RAID-Z.
38 * This vdev supports single, double, and triple parity. For single parity,
39 * we use a simple XOR of all the data columns. For double or triple parity,
40 * we use a special case of Reed-Solomon coding. This extends the
41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 * former is also based. The latter is designed to provide higher performance
47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 * amended six years later identifying a critical flaw that invalidates its
49 * claims. Nevertheless, the technique can be adapted to work for up to
50 * triple parity. For additional parity, the amendment "Note: Correction to
51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 * is viable, but the additional complexity means that write performance will
55 * All of the methods above operate on a Galois field, defined over the
56 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
57 * can be expressed with a single byte. Briefly, the operations on the
58 * field are defined as follows:
60 * o addition (+) is represented by a bitwise XOR
61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 * o multiplication of A by 2 is defined by the following bitwise expression:
67 * (A * 2)_4 = A_3 + A_7
68 * (A * 2)_3 = A_2 + A_7
69 * (A * 2)_2 = A_1 + A_7
73 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
74 * As an aside, this multiplication is derived from the error correcting
75 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
77 * Observe that any number in the field (except for 0) can be expressed as a
78 * power of 2 -- a generator for the field. We store a table of the powers of
79 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
80 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
81 * than field addition). The inverse of a field element A (A^-1) is therefore
82 * A ^ (255 - 1) = A^254.
84 * The up-to-three parity columns, P, Q, R over several data columns,
85 * D_0, ... D_n-1, can be expressed by field operations:
87 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
88 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
89 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
90 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
91 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
93 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
94 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
95 * independent coefficients. (There are no additional coefficients that have
96 * this property which is why the uncorrected Plank method breaks down.)
98 * See the reconstruction code below for how P, Q and R can be used individually
99 * or in concert to recover missing data columns.
102 typedef struct raidz_col
{
103 uint64_t rc_devidx
; /* child device index for I/O */
104 uint64_t rc_offset
; /* device offset */
105 uint64_t rc_size
; /* I/O size */
106 void *rc_data
; /* I/O data */
107 void *rc_gdata
; /* used to store the "good" version */
108 int rc_error
; /* I/O error for this device */
109 uint8_t rc_tried
; /* Did we attempt this I/O column? */
110 uint8_t rc_skipped
; /* Did we skip this I/O column? */
113 typedef struct raidz_map
{
114 uint64_t rm_cols
; /* Regular column count */
115 uint64_t rm_scols
; /* Count including skipped columns */
116 uint64_t rm_bigcols
; /* Number of oversized columns */
117 uint64_t rm_asize
; /* Actual total I/O size */
118 uint64_t rm_missingdata
; /* Count of missing data devices */
119 uint64_t rm_missingparity
; /* Count of missing parity devices */
120 uint64_t rm_firstdatacol
; /* First data column/parity count */
121 uint64_t rm_nskip
; /* Skipped sectors for padding */
122 uint64_t rm_skipstart
; /* Column index of padding start */
123 void *rm_datacopy
; /* rm_asize-buffer of copied data */
124 uintptr_t rm_reports
; /* # of referencing checksum reports */
125 uint8_t rm_freed
; /* map no longer has referencing ZIO */
126 uint8_t rm_ecksuminjected
; /* checksum error was injected */
127 raidz_col_t rm_col
[1]; /* Flexible array of I/O columns */
130 #define VDEV_RAIDZ_P 0
131 #define VDEV_RAIDZ_Q 1
132 #define VDEV_RAIDZ_R 2
134 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
135 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
138 * We provide a mechanism to perform the field multiplication operation on a
139 * 64-bit value all at once rather than a byte at a time. This works by
140 * creating a mask from the top bit in each byte and using that to
141 * conditionally apply the XOR of 0x1d.
143 #define VDEV_RAIDZ_64MUL_2(x, mask) \
145 (mask) = (x) & 0x8080808080808080ULL; \
146 (mask) = ((mask) << 1) - ((mask) >> 7); \
147 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
148 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
151 #define VDEV_RAIDZ_64MUL_4(x, mask) \
153 VDEV_RAIDZ_64MUL_2((x), mask); \
154 VDEV_RAIDZ_64MUL_2((x), mask); \
158 * Force reconstruction to use the general purpose method.
160 int vdev_raidz_default_to_general
;
162 /* Powers of 2 in the Galois field defined above. */
163 static const uint8_t vdev_raidz_pow2
[256] = {
164 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
165 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
166 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
167 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
168 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
169 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
170 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
171 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
172 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
173 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
174 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
175 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
176 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
177 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
178 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
179 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
180 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
181 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
182 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
183 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
184 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
185 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
186 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
187 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
188 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
189 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
190 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
191 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
192 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
193 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
194 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
195 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
197 /* Logs of 2 in the Galois field defined above. */
198 static const uint8_t vdev_raidz_log2
[256] = {
199 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
200 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
201 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
202 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
203 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
204 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
205 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
206 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
207 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
208 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
209 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
210 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
211 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
212 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
213 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
214 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
215 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
216 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
217 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
218 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
219 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
220 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
221 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
222 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
223 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
224 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
225 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
226 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
227 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
228 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
229 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
230 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
233 static void vdev_raidz_generate_parity(raidz_map_t
*rm
);
236 * Multiply a given number by 2 raised to the given power.
239 vdev_raidz_exp2(uint_t a
, int exp
)
245 ASSERT(vdev_raidz_log2
[a
] > 0 || a
== 1);
247 exp
+= vdev_raidz_log2
[a
];
251 return (vdev_raidz_pow2
[exp
]);
255 vdev_raidz_map_free(raidz_map_t
*rm
)
260 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++) {
261 zio_buf_free(rm
->rm_col
[c
].rc_data
, rm
->rm_col
[c
].rc_size
);
263 if (rm
->rm_col
[c
].rc_gdata
!= NULL
)
264 zio_buf_free(rm
->rm_col
[c
].rc_gdata
,
265 rm
->rm_col
[c
].rc_size
);
269 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++)
270 size
+= rm
->rm_col
[c
].rc_size
;
272 if (rm
->rm_datacopy
!= NULL
)
273 zio_buf_free(rm
->rm_datacopy
, size
);
275 kmem_free(rm
, offsetof(raidz_map_t
, rm_col
[rm
->rm_scols
]));
279 vdev_raidz_map_free_vsd(zio_t
*zio
)
281 raidz_map_t
*rm
= zio
->io_vsd
;
283 ASSERT0(rm
->rm_freed
);
286 if (rm
->rm_reports
== 0)
287 vdev_raidz_map_free(rm
);
292 vdev_raidz_cksum_free(void *arg
, size_t ignored
)
294 raidz_map_t
*rm
= arg
;
296 ASSERT3U(rm
->rm_reports
, >, 0);
298 if (--rm
->rm_reports
== 0 && rm
->rm_freed
!= 0)
299 vdev_raidz_map_free(rm
);
303 vdev_raidz_cksum_finish(zio_cksum_report_t
*zcr
, const void *good_data
)
305 raidz_map_t
*rm
= zcr
->zcr_cbdata
;
306 size_t c
= zcr
->zcr_cbinfo
;
309 const char *good
= NULL
;
310 const char *bad
= rm
->rm_col
[c
].rc_data
;
312 if (good_data
== NULL
) {
313 zfs_ereport_finish_checksum(zcr
, NULL
, NULL
, B_FALSE
);
317 if (c
< rm
->rm_firstdatacol
) {
319 * The first time through, calculate the parity blocks for
320 * the good data (this relies on the fact that the good
321 * data never changes for a given logical ZIO)
323 if (rm
->rm_col
[0].rc_gdata
== NULL
) {
324 char *bad_parity
[VDEV_RAIDZ_MAXPARITY
];
328 * Set up the rm_col[]s to generate the parity for
329 * good_data, first saving the parity bufs and
330 * replacing them with buffers to hold the result.
332 for (x
= 0; x
< rm
->rm_firstdatacol
; x
++) {
333 bad_parity
[x
] = rm
->rm_col
[x
].rc_data
;
334 rm
->rm_col
[x
].rc_data
= rm
->rm_col
[x
].rc_gdata
=
335 zio_buf_alloc(rm
->rm_col
[x
].rc_size
);
338 /* fill in the data columns from good_data */
339 buf
= (char *)good_data
;
340 for (; x
< rm
->rm_cols
; x
++) {
341 rm
->rm_col
[x
].rc_data
= buf
;
342 buf
+= rm
->rm_col
[x
].rc_size
;
346 * Construct the parity from the good data.
348 vdev_raidz_generate_parity(rm
);
350 /* restore everything back to its original state */
351 for (x
= 0; x
< rm
->rm_firstdatacol
; x
++)
352 rm
->rm_col
[x
].rc_data
= bad_parity
[x
];
354 buf
= rm
->rm_datacopy
;
355 for (x
= rm
->rm_firstdatacol
; x
< rm
->rm_cols
; x
++) {
356 rm
->rm_col
[x
].rc_data
= buf
;
357 buf
+= rm
->rm_col
[x
].rc_size
;
361 ASSERT3P(rm
->rm_col
[c
].rc_gdata
, !=, NULL
);
362 good
= rm
->rm_col
[c
].rc_gdata
;
364 /* adjust good_data to point at the start of our column */
367 for (x
= rm
->rm_firstdatacol
; x
< c
; x
++)
368 good
+= rm
->rm_col
[x
].rc_size
;
371 /* we drop the ereport if it ends up that the data was good */
372 zfs_ereport_finish_checksum(zcr
, good
, bad
, B_TRUE
);
376 * Invoked indirectly by zfs_ereport_start_checksum(), called
377 * below when our read operation fails completely. The main point
378 * is to keep a copy of everything we read from disk, so that at
379 * vdev_raidz_cksum_finish() time we can compare it with the good data.
382 vdev_raidz_cksum_report(zio_t
*zio
, zio_cksum_report_t
*zcr
, void *arg
)
384 size_t c
= (size_t)(uintptr_t)arg
;
387 raidz_map_t
*rm
= zio
->io_vsd
;
390 /* set up the report and bump the refcount */
391 zcr
->zcr_cbdata
= rm
;
393 zcr
->zcr_finish
= vdev_raidz_cksum_finish
;
394 zcr
->zcr_free
= vdev_raidz_cksum_free
;
397 ASSERT3U(rm
->rm_reports
, >, 0);
399 if (rm
->rm_datacopy
!= NULL
)
403 * It's the first time we're called for this raidz_map_t, so we need
404 * to copy the data aside; there's no guarantee that our zio's buffer
405 * won't be re-used for something else.
407 * Our parity data is already in separate buffers, so there's no need
412 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++)
413 size
+= rm
->rm_col
[c
].rc_size
;
415 buf
= rm
->rm_datacopy
= zio_buf_alloc(size
);
417 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
418 raidz_col_t
*col
= &rm
->rm_col
[c
];
420 bcopy(col
->rc_data
, buf
, col
->rc_size
);
425 ASSERT3P(buf
- (caddr_t
)rm
->rm_datacopy
, ==, size
);
428 static const zio_vsd_ops_t vdev_raidz_vsd_ops
= {
429 vdev_raidz_map_free_vsd
,
430 vdev_raidz_cksum_report
434 * Divides the IO evenly across all child vdevs; usually, dcols is
435 * the number of children in the target vdev.
437 * Avoid inlining the function to keep vdev_raidz_io_start(), which
438 * is this functions only caller, as small as possible on the stack.
440 noinline
static raidz_map_t
*
441 vdev_raidz_map_alloc(zio_t
*zio
, uint64_t unit_shift
, uint64_t dcols
,
445 /* The starting RAIDZ (parent) vdev sector of the block. */
446 uint64_t b
= zio
->io_offset
>> unit_shift
;
447 /* The zio's size in units of the vdev's minimum sector size. */
448 uint64_t s
= zio
->io_size
>> unit_shift
;
449 /* The first column for this stripe. */
450 uint64_t f
= b
% dcols
;
451 /* The starting byte offset on each child vdev. */
452 uint64_t o
= (b
/ dcols
) << unit_shift
;
453 uint64_t q
, r
, c
, bc
, col
, acols
, scols
, coff
, devidx
, asize
, tot
;
456 * "Quotient": The number of data sectors for this stripe on all but
457 * the "big column" child vdevs that also contain "remainder" data.
459 q
= s
/ (dcols
- nparity
);
462 * "Remainder": The number of partial stripe data sectors in this I/O.
463 * This will add a sector to some, but not all, child vdevs.
465 r
= s
- q
* (dcols
- nparity
);
467 /* The number of "big columns" - those which contain remainder data. */
468 bc
= (r
== 0 ? 0 : r
+ nparity
);
471 * The total number of data and parity sectors associated with
474 tot
= s
+ nparity
* (q
+ (r
== 0 ? 0 : 1));
476 /* acols: The columns that will be accessed. */
477 /* scols: The columns that will be accessed or skipped. */
479 /* Our I/O request doesn't span all child vdevs. */
481 scols
= MIN(dcols
, roundup(bc
, nparity
+ 1));
487 ASSERT3U(acols
, <=, scols
);
489 rm
= kmem_alloc(offsetof(raidz_map_t
, rm_col
[scols
]), KM_SLEEP
);
492 rm
->rm_scols
= scols
;
494 rm
->rm_skipstart
= bc
;
495 rm
->rm_missingdata
= 0;
496 rm
->rm_missingparity
= 0;
497 rm
->rm_firstdatacol
= nparity
;
498 rm
->rm_datacopy
= NULL
;
501 rm
->rm_ecksuminjected
= 0;
505 for (c
= 0; c
< scols
; c
++) {
510 coff
+= 1ULL << unit_shift
;
512 rm
->rm_col
[c
].rc_devidx
= col
;
513 rm
->rm_col
[c
].rc_offset
= coff
;
514 rm
->rm_col
[c
].rc_data
= NULL
;
515 rm
->rm_col
[c
].rc_gdata
= NULL
;
516 rm
->rm_col
[c
].rc_error
= 0;
517 rm
->rm_col
[c
].rc_tried
= 0;
518 rm
->rm_col
[c
].rc_skipped
= 0;
521 rm
->rm_col
[c
].rc_size
= 0;
523 rm
->rm_col
[c
].rc_size
= (q
+ 1) << unit_shift
;
525 rm
->rm_col
[c
].rc_size
= q
<< unit_shift
;
527 asize
+= rm
->rm_col
[c
].rc_size
;
530 ASSERT3U(asize
, ==, tot
<< unit_shift
);
531 rm
->rm_asize
= roundup(asize
, (nparity
+ 1) << unit_shift
);
532 rm
->rm_nskip
= roundup(tot
, nparity
+ 1) - tot
;
533 ASSERT3U(rm
->rm_asize
- asize
, ==, rm
->rm_nskip
<< unit_shift
);
534 ASSERT3U(rm
->rm_nskip
, <=, nparity
);
536 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++)
537 rm
->rm_col
[c
].rc_data
= zio_buf_alloc(rm
->rm_col
[c
].rc_size
);
539 rm
->rm_col
[c
].rc_data
= zio
->io_data
;
541 for (c
= c
+ 1; c
< acols
; c
++)
542 rm
->rm_col
[c
].rc_data
= (char *)rm
->rm_col
[c
- 1].rc_data
+
543 rm
->rm_col
[c
- 1].rc_size
;
546 * If all data stored spans all columns, there's a danger that parity
547 * will always be on the same device and, since parity isn't read
548 * during normal operation, that that device's I/O bandwidth won't be
549 * used effectively. We therefore switch the parity every 1MB.
551 * ... at least that was, ostensibly, the theory. As a practical
552 * matter unless we juggle the parity between all devices evenly, we
553 * won't see any benefit. Further, occasional writes that aren't a
554 * multiple of the LCM of the number of children and the minimum
555 * stripe width are sufficient to avoid pessimal behavior.
556 * Unfortunately, this decision created an implicit on-disk format
557 * requirement that we need to support for all eternity, but only
558 * for single-parity RAID-Z.
560 * If we intend to skip a sector in the zeroth column for padding
561 * we must make sure to note this swap. We will never intend to
562 * skip the first column since at least one data and one parity
563 * column must appear in each row.
565 ASSERT(rm
->rm_cols
>= 2);
566 ASSERT(rm
->rm_col
[0].rc_size
== rm
->rm_col
[1].rc_size
);
568 if (rm
->rm_firstdatacol
== 1 && (zio
->io_offset
& (1ULL << 20))) {
569 devidx
= rm
->rm_col
[0].rc_devidx
;
570 o
= rm
->rm_col
[0].rc_offset
;
571 rm
->rm_col
[0].rc_devidx
= rm
->rm_col
[1].rc_devidx
;
572 rm
->rm_col
[0].rc_offset
= rm
->rm_col
[1].rc_offset
;
573 rm
->rm_col
[1].rc_devidx
= devidx
;
574 rm
->rm_col
[1].rc_offset
= o
;
576 if (rm
->rm_skipstart
== 0)
577 rm
->rm_skipstart
= 1;
581 zio
->io_vsd_ops
= &vdev_raidz_vsd_ops
;
586 vdev_raidz_generate_parity_p(raidz_map_t
*rm
)
588 uint64_t *p
, *src
, pcount
, ccount
, i
;
591 pcount
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]);
593 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
594 src
= rm
->rm_col
[c
].rc_data
;
595 p
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
596 ccount
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
598 if (c
== rm
->rm_firstdatacol
) {
599 ASSERT(ccount
== pcount
);
600 for (i
= 0; i
< ccount
; i
++, src
++, p
++) {
604 ASSERT(ccount
<= pcount
);
605 for (i
= 0; i
< ccount
; i
++, src
++, p
++) {
613 vdev_raidz_generate_parity_pq(raidz_map_t
*rm
)
615 uint64_t *p
, *q
, *src
, pcnt
, ccnt
, mask
, i
;
618 pcnt
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]);
619 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
620 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
622 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
623 src
= rm
->rm_col
[c
].rc_data
;
624 p
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
625 q
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
627 ccnt
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
629 if (c
== rm
->rm_firstdatacol
) {
630 ASSERT(ccnt
== pcnt
|| ccnt
== 0);
631 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++) {
635 for (; i
< pcnt
; i
++, src
++, p
++, q
++) {
640 ASSERT(ccnt
<= pcnt
);
643 * Apply the algorithm described above by multiplying
644 * the previous result and adding in the new value.
646 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++) {
649 VDEV_RAIDZ_64MUL_2(*q
, mask
);
654 * Treat short columns as though they are full of 0s.
655 * Note that there's therefore nothing needed for P.
657 for (; i
< pcnt
; i
++, q
++) {
658 VDEV_RAIDZ_64MUL_2(*q
, mask
);
665 vdev_raidz_generate_parity_pqr(raidz_map_t
*rm
)
667 uint64_t *p
, *q
, *r
, *src
, pcnt
, ccnt
, mask
, i
;
670 pcnt
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]);
671 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
672 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
673 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
674 rm
->rm_col
[VDEV_RAIDZ_R
].rc_size
);
676 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
677 src
= rm
->rm_col
[c
].rc_data
;
678 p
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
679 q
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
680 r
= rm
->rm_col
[VDEV_RAIDZ_R
].rc_data
;
682 ccnt
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
684 if (c
== rm
->rm_firstdatacol
) {
685 ASSERT(ccnt
== pcnt
|| ccnt
== 0);
686 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++, r
++) {
691 for (; i
< pcnt
; i
++, src
++, p
++, q
++, r
++) {
697 ASSERT(ccnt
<= pcnt
);
700 * Apply the algorithm described above by multiplying
701 * the previous result and adding in the new value.
703 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++, r
++) {
706 VDEV_RAIDZ_64MUL_2(*q
, mask
);
709 VDEV_RAIDZ_64MUL_4(*r
, mask
);
714 * Treat short columns as though they are full of 0s.
715 * Note that there's therefore nothing needed for P.
717 for (; i
< pcnt
; i
++, q
++, r
++) {
718 VDEV_RAIDZ_64MUL_2(*q
, mask
);
719 VDEV_RAIDZ_64MUL_4(*r
, mask
);
726 * Generate RAID parity in the first virtual columns according to the number of
727 * parity columns available.
730 vdev_raidz_generate_parity(raidz_map_t
*rm
)
732 switch (rm
->rm_firstdatacol
) {
734 vdev_raidz_generate_parity_p(rm
);
737 vdev_raidz_generate_parity_pq(rm
);
740 vdev_raidz_generate_parity_pqr(rm
);
743 cmn_err(CE_PANIC
, "invalid RAID-Z configuration");
748 vdev_raidz_reconstruct_p(raidz_map_t
*rm
, int *tgts
, int ntgts
)
750 uint64_t *dst
, *src
, xcount
, ccount
, count
, i
;
755 ASSERT(x
>= rm
->rm_firstdatacol
);
756 ASSERT(x
< rm
->rm_cols
);
758 xcount
= rm
->rm_col
[x
].rc_size
/ sizeof (src
[0]);
759 ASSERT(xcount
<= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]));
762 src
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
763 dst
= rm
->rm_col
[x
].rc_data
;
764 for (i
= 0; i
< xcount
; i
++, dst
++, src
++) {
768 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
769 src
= rm
->rm_col
[c
].rc_data
;
770 dst
= rm
->rm_col
[x
].rc_data
;
775 ccount
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
776 count
= MIN(ccount
, xcount
);
778 for (i
= 0; i
< count
; i
++, dst
++, src
++) {
783 return (1 << VDEV_RAIDZ_P
);
787 vdev_raidz_reconstruct_q(raidz_map_t
*rm
, int *tgts
, int ntgts
)
789 uint64_t *dst
, *src
, xcount
, ccount
, count
, mask
, i
;
796 xcount
= rm
->rm_col
[x
].rc_size
/ sizeof (src
[0]);
797 ASSERT(xcount
<= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
/ sizeof (src
[0]));
799 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
800 src
= rm
->rm_col
[c
].rc_data
;
801 dst
= rm
->rm_col
[x
].rc_data
;
806 ccount
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
808 count
= MIN(ccount
, xcount
);
810 if (c
== rm
->rm_firstdatacol
) {
811 for (i
= 0; i
< count
; i
++, dst
++, src
++) {
814 for (; i
< xcount
; i
++, dst
++) {
819 for (i
= 0; i
< count
; i
++, dst
++, src
++) {
820 VDEV_RAIDZ_64MUL_2(*dst
, mask
);
824 for (; i
< xcount
; i
++, dst
++) {
825 VDEV_RAIDZ_64MUL_2(*dst
, mask
);
830 src
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
831 dst
= rm
->rm_col
[x
].rc_data
;
832 exp
= 255 - (rm
->rm_cols
- 1 - x
);
834 for (i
= 0; i
< xcount
; i
++, dst
++, src
++) {
836 for (j
= 0, b
= (uint8_t *)dst
; j
< 8; j
++, b
++) {
837 *b
= vdev_raidz_exp2(*b
, exp
);
841 return (1 << VDEV_RAIDZ_Q
);
845 vdev_raidz_reconstruct_pq(raidz_map_t
*rm
, int *tgts
, int ntgts
)
847 uint8_t *p
, *q
, *pxy
, *qxy
, *xd
, *yd
, tmp
, a
, b
, aexp
, bexp
;
849 uint64_t xsize
, ysize
, i
;
855 ASSERT(x
>= rm
->rm_firstdatacol
);
856 ASSERT(y
< rm
->rm_cols
);
858 ASSERT(rm
->rm_col
[x
].rc_size
>= rm
->rm_col
[y
].rc_size
);
861 * Move the parity data aside -- we're going to compute parity as
862 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
863 * reuse the parity generation mechanism without trashing the actual
864 * parity so we make those columns appear to be full of zeros by
865 * setting their lengths to zero.
867 pdata
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
868 qdata
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
869 xsize
= rm
->rm_col
[x
].rc_size
;
870 ysize
= rm
->rm_col
[y
].rc_size
;
872 rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
=
873 zio_buf_alloc(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
);
874 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
=
875 zio_buf_alloc(rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
876 rm
->rm_col
[x
].rc_size
= 0;
877 rm
->rm_col
[y
].rc_size
= 0;
879 vdev_raidz_generate_parity_pq(rm
);
881 rm
->rm_col
[x
].rc_size
= xsize
;
882 rm
->rm_col
[y
].rc_size
= ysize
;
886 pxy
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
887 qxy
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
888 xd
= rm
->rm_col
[x
].rc_data
;
889 yd
= rm
->rm_col
[y
].rc_data
;
893 * Pxy = P + D_x + D_y
894 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
896 * We can then solve for D_x:
897 * D_x = A * (P + Pxy) + B * (Q + Qxy)
899 * A = 2^(x - y) * (2^(x - y) + 1)^-1
900 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
902 * With D_x in hand, we can easily solve for D_y:
903 * D_y = P + Pxy + D_x
906 a
= vdev_raidz_pow2
[255 + x
- y
];
907 b
= vdev_raidz_pow2
[255 - (rm
->rm_cols
- 1 - x
)];
908 tmp
= 255 - vdev_raidz_log2
[a
^ 1];
910 aexp
= vdev_raidz_log2
[vdev_raidz_exp2(a
, tmp
)];
911 bexp
= vdev_raidz_log2
[vdev_raidz_exp2(b
, tmp
)];
913 for (i
= 0; i
< xsize
; i
++, p
++, q
++, pxy
++, qxy
++, xd
++, yd
++) {
914 *xd
= vdev_raidz_exp2(*p
^ *pxy
, aexp
) ^
915 vdev_raidz_exp2(*q
^ *qxy
, bexp
);
918 *yd
= *p
^ *pxy
^ *xd
;
921 zio_buf_free(rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
,
922 rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
);
923 zio_buf_free(rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
,
924 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
927 * Restore the saved parity data.
929 rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
= pdata
;
930 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
= qdata
;
932 return ((1 << VDEV_RAIDZ_P
) | (1 << VDEV_RAIDZ_Q
));
937 * In the general case of reconstruction, we must solve the system of linear
938 * equations defined by the coefficients used to generate parity as well as
939 * the contents of the data and parity disks. This can be expressed with
940 * vectors for the original data (D) and the actual data (d) and parity (p)
941 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
945 * | V | | D_0 | | p_m-1 |
946 * | | x | : | = | d_0 |
947 * | I | | D_n-1 | | : |
948 * | | ~~ ~~ | d_n-1 |
951 * I is simply a square identity matrix of size n, and V is a vandermonde
952 * matrix defined by the coefficients we chose for the various parity columns
953 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
954 * computation as well as linear separability.
957 * | 1 .. 1 1 1 | | p_0 |
958 * | 2^n-1 .. 4 2 1 | __ __ | : |
959 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
960 * | 1 .. 0 0 0 | | D_1 | | d_0 |
961 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
962 * | : : : : | | : | | d_2 |
963 * | 0 .. 1 0 0 | | D_n-1 | | : |
964 * | 0 .. 0 1 0 | ~~ ~~ | : |
965 * | 0 .. 0 0 1 | | d_n-1 |
968 * Note that I, V, d, and p are known. To compute D, we must invert the
969 * matrix and use the known data and parity values to reconstruct the unknown
970 * data values. We begin by removing the rows in V|I and d|p that correspond
971 * to failed or missing columns; we then make V|I square (n x n) and d|p
972 * sized n by removing rows corresponding to unused parity from the bottom up
973 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
974 * using Gauss-Jordan elimination. In the example below we use m=3 parity
975 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
977 * | 1 1 1 1 1 1 1 1 |
978 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
979 * | 19 205 116 29 64 16 4 1 | / /
980 * | 1 0 0 0 0 0 0 0 | / /
981 * | 0 1 0 0 0 0 0 0 | <--' /
982 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
983 * | 0 0 0 1 0 0 0 0 |
984 * | 0 0 0 0 1 0 0 0 |
985 * | 0 0 0 0 0 1 0 0 |
986 * | 0 0 0 0 0 0 1 0 |
987 * | 0 0 0 0 0 0 0 1 |
990 * | 1 1 1 1 1 1 1 1 |
991 * | 128 64 32 16 8 4 2 1 |
992 * | 19 205 116 29 64 16 4 1 |
993 * | 1 0 0 0 0 0 0 0 |
994 * | 0 1 0 0 0 0 0 0 |
995 * (V|I)' = | 0 0 1 0 0 0 0 0 |
996 * | 0 0 0 1 0 0 0 0 |
997 * | 0 0 0 0 1 0 0 0 |
998 * | 0 0 0 0 0 1 0 0 |
999 * | 0 0 0 0 0 0 1 0 |
1000 * | 0 0 0 0 0 0 0 1 |
1003 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1004 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1005 * matrix is not singular.
1007 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1008 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1009 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1010 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1011 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1012 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1013 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1014 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1017 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1018 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1019 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1020 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1021 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1022 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1023 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1024 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1027 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1028 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1029 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1030 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1031 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1032 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1033 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1034 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1037 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1038 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1039 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1040 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1041 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1042 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1043 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1044 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1047 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1048 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1049 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1050 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1051 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1052 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1053 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1054 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1057 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1058 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1059 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1060 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1061 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1062 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1063 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1064 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1067 * | 0 0 1 0 0 0 0 0 |
1068 * | 167 100 5 41 159 169 217 208 |
1069 * | 166 100 4 40 158 168 216 209 |
1070 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1071 * | 0 0 0 0 1 0 0 0 |
1072 * | 0 0 0 0 0 1 0 0 |
1073 * | 0 0 0 0 0 0 1 0 |
1074 * | 0 0 0 0 0 0 0 1 |
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
1090 vdev_raidz_matrix_init(raidz_map_t
*rm
, int n
, int nmap
, int *map
,
1096 ASSERT(n
== rm
->rm_cols
- rm
->rm_firstdatacol
);
1099 * Fill in the missing rows of interest.
1101 for (i
= 0; i
< nmap
; i
++) {
1102 ASSERT3S(0, <=, map
[i
]);
1103 ASSERT3S(map
[i
], <=, 2);
1110 for (j
= 0; j
< n
; j
++) {
1114 rows
[i
][j
] = vdev_raidz_pow2
[pow
];
1120 vdev_raidz_matrix_invert(raidz_map_t
*rm
, int n
, int nmissing
, int *missing
,
1121 uint8_t **rows
, uint8_t **invrows
, const uint8_t *used
)
1127 * Assert that the first nmissing entries from the array of used
1128 * columns correspond to parity columns and that subsequent entries
1129 * correspond to data columns.
1131 for (i
= 0; i
< nmissing
; i
++) {
1132 ASSERT3S(used
[i
], <, rm
->rm_firstdatacol
);
1134 for (; i
< n
; i
++) {
1135 ASSERT3S(used
[i
], >=, rm
->rm_firstdatacol
);
1139 * First initialize the storage where we'll compute the inverse rows.
1141 for (i
= 0; i
< nmissing
; i
++) {
1142 for (j
= 0; j
< n
; j
++) {
1143 invrows
[i
][j
] = (i
== j
) ? 1 : 0;
1148 * Subtract all trivial rows from the rows of consequence.
1150 for (i
= 0; i
< nmissing
; i
++) {
1151 for (j
= nmissing
; j
< n
; j
++) {
1152 ASSERT3U(used
[j
], >=, rm
->rm_firstdatacol
);
1153 jj
= used
[j
] - rm
->rm_firstdatacol
;
1155 invrows
[i
][j
] = rows
[i
][jj
];
1161 * For each of the rows of interest, we must normalize it and subtract
1162 * a multiple of it from the other rows.
1164 for (i
= 0; i
< nmissing
; i
++) {
1165 for (j
= 0; j
< missing
[i
]; j
++) {
1166 ASSERT0(rows
[i
][j
]);
1168 ASSERT3U(rows
[i
][missing
[i
]], !=, 0);
1171 * Compute the inverse of the first element and multiply each
1172 * element in the row by that value.
1174 log
= 255 - vdev_raidz_log2
[rows
[i
][missing
[i
]]];
1176 for (j
= 0; j
< n
; j
++) {
1177 rows
[i
][j
] = vdev_raidz_exp2(rows
[i
][j
], log
);
1178 invrows
[i
][j
] = vdev_raidz_exp2(invrows
[i
][j
], log
);
1181 for (ii
= 0; ii
< nmissing
; ii
++) {
1185 ASSERT3U(rows
[ii
][missing
[i
]], !=, 0);
1187 log
= vdev_raidz_log2
[rows
[ii
][missing
[i
]]];
1189 for (j
= 0; j
< n
; j
++) {
1191 vdev_raidz_exp2(rows
[i
][j
], log
);
1193 vdev_raidz_exp2(invrows
[i
][j
], log
);
1199 * Verify that the data that is left in the rows are properly part of
1200 * an identity matrix.
1202 for (i
= 0; i
< nmissing
; i
++) {
1203 for (j
= 0; j
< n
; j
++) {
1204 if (j
== missing
[i
]) {
1205 ASSERT3U(rows
[i
][j
], ==, 1);
1207 ASSERT0(rows
[i
][j
]);
1214 vdev_raidz_matrix_reconstruct(raidz_map_t
*rm
, int n
, int nmissing
,
1215 int *missing
, uint8_t **invrows
, const uint8_t *used
)
1220 uint8_t *dst
[VDEV_RAIDZ_MAXPARITY
];
1221 uint64_t dcount
[VDEV_RAIDZ_MAXPARITY
];
1225 uint8_t *invlog
[VDEV_RAIDZ_MAXPARITY
];
1229 psize
= sizeof (invlog
[0][0]) * n
* nmissing
;
1230 p
= kmem_alloc(psize
, KM_SLEEP
);
1232 for (pp
= p
, i
= 0; i
< nmissing
; i
++) {
1237 for (i
= 0; i
< nmissing
; i
++) {
1238 for (j
= 0; j
< n
; j
++) {
1239 ASSERT3U(invrows
[i
][j
], !=, 0);
1240 invlog
[i
][j
] = vdev_raidz_log2
[invrows
[i
][j
]];
1244 for (i
= 0; i
< n
; i
++) {
1246 ASSERT3U(c
, <, rm
->rm_cols
);
1248 src
= rm
->rm_col
[c
].rc_data
;
1249 ccount
= rm
->rm_col
[c
].rc_size
;
1250 for (j
= 0; j
< nmissing
; j
++) {
1251 cc
= missing
[j
] + rm
->rm_firstdatacol
;
1252 ASSERT3U(cc
, >=, rm
->rm_firstdatacol
);
1253 ASSERT3U(cc
, <, rm
->rm_cols
);
1254 ASSERT3U(cc
, !=, c
);
1256 dst
[j
] = rm
->rm_col
[cc
].rc_data
;
1257 dcount
[j
] = rm
->rm_col
[cc
].rc_size
;
1260 ASSERT(ccount
>= rm
->rm_col
[missing
[0]].rc_size
|| i
> 0);
1262 for (x
= 0; x
< ccount
; x
++, src
++) {
1264 log
= vdev_raidz_log2
[*src
];
1266 for (cc
= 0; cc
< nmissing
; cc
++) {
1267 if (x
>= dcount
[cc
])
1273 if ((ll
= log
+ invlog
[cc
][i
]) >= 255)
1275 val
= vdev_raidz_pow2
[ll
];
1286 kmem_free(p
, psize
);
1290 vdev_raidz_reconstruct_general(raidz_map_t
*rm
, int *tgts
, int ntgts
)
1294 int missing_rows
[VDEV_RAIDZ_MAXPARITY
];
1295 int parity_map
[VDEV_RAIDZ_MAXPARITY
];
1300 uint8_t *rows
[VDEV_RAIDZ_MAXPARITY
];
1301 uint8_t *invrows
[VDEV_RAIDZ_MAXPARITY
];
1307 n
= rm
->rm_cols
- rm
->rm_firstdatacol
;
1310 * Figure out which data columns are missing.
1313 for (t
= 0; t
< ntgts
; t
++) {
1314 if (tgts
[t
] >= rm
->rm_firstdatacol
) {
1315 missing_rows
[nmissing_rows
++] =
1316 tgts
[t
] - rm
->rm_firstdatacol
;
1321 * Figure out which parity columns to use to help generate the missing
1324 for (tt
= 0, c
= 0, i
= 0; i
< nmissing_rows
; c
++) {
1326 ASSERT(c
< rm
->rm_firstdatacol
);
1329 * Skip any targeted parity columns.
1331 if (c
== tgts
[tt
]) {
1343 ASSERT3U(code
, <, 1 << VDEV_RAIDZ_MAXPARITY
);
1345 psize
= (sizeof (rows
[0][0]) + sizeof (invrows
[0][0])) *
1346 nmissing_rows
* n
+ sizeof (used
[0]) * n
;
1347 p
= kmem_alloc(psize
, KM_SLEEP
);
1349 for (pp
= p
, i
= 0; i
< nmissing_rows
; i
++) {
1357 for (i
= 0; i
< nmissing_rows
; i
++) {
1358 used
[i
] = parity_map
[i
];
1361 for (tt
= 0, c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
1362 if (tt
< nmissing_rows
&&
1363 c
== missing_rows
[tt
] + rm
->rm_firstdatacol
) {
1374 * Initialize the interesting rows of the matrix.
1376 vdev_raidz_matrix_init(rm
, n
, nmissing_rows
, parity_map
, rows
);
1379 * Invert the matrix.
1381 vdev_raidz_matrix_invert(rm
, n
, nmissing_rows
, missing_rows
, rows
,
1385 * Reconstruct the missing data using the generated matrix.
1387 vdev_raidz_matrix_reconstruct(rm
, n
, nmissing_rows
, missing_rows
,
1390 kmem_free(p
, psize
);
1396 vdev_raidz_reconstruct(raidz_map_t
*rm
, int *t
, int nt
)
1398 int tgts
[VDEV_RAIDZ_MAXPARITY
], *dt
;
1402 int nbadparity
, nbaddata
;
1403 int parity_valid
[VDEV_RAIDZ_MAXPARITY
];
1406 * The tgts list must already be sorted.
1408 for (i
= 1; i
< nt
; i
++) {
1409 ASSERT(t
[i
] > t
[i
- 1]);
1412 nbadparity
= rm
->rm_firstdatacol
;
1413 nbaddata
= rm
->rm_cols
- nbadparity
;
1415 for (i
= 0, c
= 0; c
< rm
->rm_cols
; c
++) {
1416 if (c
< rm
->rm_firstdatacol
)
1417 parity_valid
[c
] = B_FALSE
;
1419 if (i
< nt
&& c
== t
[i
]) {
1422 } else if (rm
->rm_col
[c
].rc_error
!= 0) {
1424 } else if (c
>= rm
->rm_firstdatacol
) {
1427 parity_valid
[c
] = B_TRUE
;
1432 ASSERT(ntgts
>= nt
);
1433 ASSERT(nbaddata
>= 0);
1434 ASSERT(nbaddata
+ nbadparity
== ntgts
);
1436 dt
= &tgts
[nbadparity
];
1439 * See if we can use any of our optimized reconstruction routines.
1441 if (!vdev_raidz_default_to_general
) {
1444 if (parity_valid
[VDEV_RAIDZ_P
])
1445 return (vdev_raidz_reconstruct_p(rm
, dt
, 1));
1447 ASSERT(rm
->rm_firstdatacol
> 1);
1449 if (parity_valid
[VDEV_RAIDZ_Q
])
1450 return (vdev_raidz_reconstruct_q(rm
, dt
, 1));
1452 ASSERT(rm
->rm_firstdatacol
> 2);
1456 ASSERT(rm
->rm_firstdatacol
> 1);
1458 if (parity_valid
[VDEV_RAIDZ_P
] &&
1459 parity_valid
[VDEV_RAIDZ_Q
])
1460 return (vdev_raidz_reconstruct_pq(rm
, dt
, 2));
1462 ASSERT(rm
->rm_firstdatacol
> 2);
1468 code
= vdev_raidz_reconstruct_general(rm
, tgts
, ntgts
);
1469 ASSERT(code
< (1 << VDEV_RAIDZ_MAXPARITY
));
1475 vdev_raidz_open(vdev_t
*vd
, uint64_t *asize
, uint64_t *max_asize
,
1479 uint64_t nparity
= vd
->vdev_nparity
;
1484 ASSERT(nparity
> 0);
1486 if (nparity
> VDEV_RAIDZ_MAXPARITY
||
1487 vd
->vdev_children
< nparity
+ 1) {
1488 vd
->vdev_stat
.vs_aux
= VDEV_AUX_BAD_LABEL
;
1489 return (SET_ERROR(EINVAL
));
1492 vdev_open_children(vd
);
1494 for (c
= 0; c
< vd
->vdev_children
; c
++) {
1495 cvd
= vd
->vdev_child
[c
];
1497 if (cvd
->vdev_open_error
!= 0) {
1498 lasterror
= cvd
->vdev_open_error
;
1503 *asize
= MIN(*asize
- 1, cvd
->vdev_asize
- 1) + 1;
1504 *max_asize
= MIN(*max_asize
- 1, cvd
->vdev_max_asize
- 1) + 1;
1505 *ashift
= MAX(*ashift
, cvd
->vdev_ashift
);
1508 *asize
*= vd
->vdev_children
;
1509 *max_asize
*= vd
->vdev_children
;
1511 if (numerrors
> nparity
) {
1512 vd
->vdev_stat
.vs_aux
= VDEV_AUX_NO_REPLICAS
;
1520 vdev_raidz_close(vdev_t
*vd
)
1524 for (c
= 0; c
< vd
->vdev_children
; c
++)
1525 vdev_close(vd
->vdev_child
[c
]);
1529 vdev_raidz_asize(vdev_t
*vd
, uint64_t psize
)
1532 uint64_t ashift
= vd
->vdev_top
->vdev_ashift
;
1533 uint64_t cols
= vd
->vdev_children
;
1534 uint64_t nparity
= vd
->vdev_nparity
;
1536 asize
= ((psize
- 1) >> ashift
) + 1;
1537 asize
+= nparity
* ((asize
+ cols
- nparity
- 1) / (cols
- nparity
));
1538 asize
= roundup(asize
, nparity
+ 1) << ashift
;
1544 vdev_raidz_child_done(zio_t
*zio
)
1546 raidz_col_t
*rc
= zio
->io_private
;
1548 rc
->rc_error
= zio
->io_error
;
1554 * Start an IO operation on a RAIDZ VDev
1557 * - For write operations:
1558 * 1. Generate the parity data
1559 * 2. Create child zio write operations to each column's vdev, for both
1561 * 3. If the column skips any sectors for padding, create optional dummy
1562 * write zio children for those areas to improve aggregation continuity.
1563 * - For read operations:
1564 * 1. Create child zio read operations to each data column's vdev to read
1565 * the range of data required for zio.
1566 * 2. If this is a scrub or resilver operation, or if any of the data
1567 * vdevs have had errors, then create zio read operations to the parity
1568 * columns' VDevs as well.
1571 vdev_raidz_io_start(zio_t
*zio
)
1573 vdev_t
*vd
= zio
->io_vd
;
1574 vdev_t
*tvd
= vd
->vdev_top
;
1580 rm
= vdev_raidz_map_alloc(zio
, tvd
->vdev_ashift
, vd
->vdev_children
,
1583 ASSERT3U(rm
->rm_asize
, ==, vdev_psize_to_asize(vd
, zio
->io_size
));
1585 if (zio
->io_type
== ZIO_TYPE_WRITE
) {
1586 vdev_raidz_generate_parity(rm
);
1588 for (c
= 0; c
< rm
->rm_cols
; c
++) {
1589 rc
= &rm
->rm_col
[c
];
1590 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
1591 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
1592 rc
->rc_offset
, rc
->rc_data
, rc
->rc_size
,
1593 zio
->io_type
, zio
->io_priority
, 0,
1594 vdev_raidz_child_done
, rc
));
1598 * Generate optional I/Os for any skipped sectors to improve
1599 * aggregation contiguity.
1601 for (c
= rm
->rm_skipstart
, i
= 0; i
< rm
->rm_nskip
; c
++, i
++) {
1602 ASSERT(c
<= rm
->rm_scols
);
1603 if (c
== rm
->rm_scols
)
1605 rc
= &rm
->rm_col
[c
];
1606 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
1607 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
1608 rc
->rc_offset
+ rc
->rc_size
, NULL
,
1609 1 << tvd
->vdev_ashift
,
1610 zio
->io_type
, zio
->io_priority
,
1611 ZIO_FLAG_NODATA
| ZIO_FLAG_OPTIONAL
, NULL
, NULL
));
1618 ASSERT(zio
->io_type
== ZIO_TYPE_READ
);
1621 * Iterate over the columns in reverse order so that we hit the parity
1622 * last -- any errors along the way will force us to read the parity.
1624 for (c
= rm
->rm_cols
- 1; c
>= 0; c
--) {
1625 rc
= &rm
->rm_col
[c
];
1626 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
1627 if (!vdev_readable(cvd
)) {
1628 if (c
>= rm
->rm_firstdatacol
)
1629 rm
->rm_missingdata
++;
1631 rm
->rm_missingparity
++;
1632 rc
->rc_error
= SET_ERROR(ENXIO
);
1633 rc
->rc_tried
= 1; /* don't even try */
1637 if (vdev_dtl_contains(cvd
, DTL_MISSING
, zio
->io_txg
, 1)) {
1638 if (c
>= rm
->rm_firstdatacol
)
1639 rm
->rm_missingdata
++;
1641 rm
->rm_missingparity
++;
1642 rc
->rc_error
= SET_ERROR(ESTALE
);
1646 if (c
>= rm
->rm_firstdatacol
|| rm
->rm_missingdata
> 0 ||
1647 (zio
->io_flags
& (ZIO_FLAG_SCRUB
| ZIO_FLAG_RESILVER
))) {
1648 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
1649 rc
->rc_offset
, rc
->rc_data
, rc
->rc_size
,
1650 zio
->io_type
, zio
->io_priority
, 0,
1651 vdev_raidz_child_done
, rc
));
1660 * Report a checksum error for a child of a RAID-Z device.
1663 raidz_checksum_error(zio_t
*zio
, raidz_col_t
*rc
, void *bad_data
)
1665 vdev_t
*vd
= zio
->io_vd
->vdev_child
[rc
->rc_devidx
];
1667 if (!(zio
->io_flags
& ZIO_FLAG_SPECULATIVE
)) {
1668 zio_bad_cksum_t zbc
;
1669 raidz_map_t
*rm
= zio
->io_vsd
;
1671 mutex_enter(&vd
->vdev_stat_lock
);
1672 vd
->vdev_stat
.vs_checksum_errors
++;
1673 mutex_exit(&vd
->vdev_stat_lock
);
1675 zbc
.zbc_has_cksum
= 0;
1676 zbc
.zbc_injected
= rm
->rm_ecksuminjected
;
1678 zfs_ereport_post_checksum(zio
->io_spa
, vd
, zio
,
1679 rc
->rc_offset
, rc
->rc_size
, rc
->rc_data
, bad_data
,
1685 * We keep track of whether or not there were any injected errors, so that
1686 * any ereports we generate can note it.
1689 raidz_checksum_verify(zio_t
*zio
)
1691 zio_bad_cksum_t zbc
;
1692 raidz_map_t
*rm
= zio
->io_vsd
;
1695 bzero(&zbc
, sizeof (zio_bad_cksum_t
));
1697 ret
= zio_checksum_error(zio
, &zbc
);
1698 if (ret
!= 0 && zbc
.zbc_injected
!= 0)
1699 rm
->rm_ecksuminjected
= 1;
1705 * Generate the parity from the data columns. If we tried and were able to
1706 * read the parity without error, verify that the generated parity matches the
1707 * data we read. If it doesn't, we fire off a checksum error. Return the
1708 * number such failures.
1711 raidz_parity_verify(zio_t
*zio
, raidz_map_t
*rm
)
1713 void *orig
[VDEV_RAIDZ_MAXPARITY
];
1717 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++) {
1718 rc
= &rm
->rm_col
[c
];
1719 if (!rc
->rc_tried
|| rc
->rc_error
!= 0)
1721 orig
[c
] = zio_buf_alloc(rc
->rc_size
);
1722 bcopy(rc
->rc_data
, orig
[c
], rc
->rc_size
);
1725 vdev_raidz_generate_parity(rm
);
1727 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++) {
1728 rc
= &rm
->rm_col
[c
];
1729 if (!rc
->rc_tried
|| rc
->rc_error
!= 0)
1731 if (bcmp(orig
[c
], rc
->rc_data
, rc
->rc_size
) != 0) {
1732 raidz_checksum_error(zio
, rc
, orig
[c
]);
1733 rc
->rc_error
= SET_ERROR(ECKSUM
);
1736 zio_buf_free(orig
[c
], rc
->rc_size
);
1743 * Keep statistics on all the ways that we used parity to correct data.
1745 static uint64_t raidz_corrected
[1 << VDEV_RAIDZ_MAXPARITY
];
1748 vdev_raidz_worst_error(raidz_map_t
*rm
)
1752 for (c
= 0; c
< rm
->rm_cols
; c
++)
1753 error
= zio_worst_error(error
, rm
->rm_col
[c
].rc_error
);
1759 * Iterate over all combinations of bad data and attempt a reconstruction.
1760 * Note that the algorithm below is non-optimal because it doesn't take into
1761 * account how reconstruction is actually performed. For example, with
1762 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1763 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1764 * cases we'd only use parity information in column 0.
1767 vdev_raidz_combrec(zio_t
*zio
, int total_errors
, int data_errors
)
1769 raidz_map_t
*rm
= zio
->io_vsd
;
1771 void *orig
[VDEV_RAIDZ_MAXPARITY
];
1772 int tstore
[VDEV_RAIDZ_MAXPARITY
+ 2];
1773 int *tgts
= &tstore
[1];
1774 int curr
, next
, i
, c
, n
;
1777 ASSERT(total_errors
< rm
->rm_firstdatacol
);
1780 * This simplifies one edge condition.
1784 for (n
= 1; n
<= rm
->rm_firstdatacol
- total_errors
; n
++) {
1786 * Initialize the targets array by finding the first n columns
1787 * that contain no error.
1789 * If there were no data errors, we need to ensure that we're
1790 * always explicitly attempting to reconstruct at least one
1791 * data column. To do this, we simply push the highest target
1792 * up into the data columns.
1794 for (c
= 0, i
= 0; i
< n
; i
++) {
1795 if (i
== n
- 1 && data_errors
== 0 &&
1796 c
< rm
->rm_firstdatacol
) {
1797 c
= rm
->rm_firstdatacol
;
1800 while (rm
->rm_col
[c
].rc_error
!= 0) {
1802 ASSERT3S(c
, <, rm
->rm_cols
);
1809 * Setting tgts[n] simplifies the other edge condition.
1811 tgts
[n
] = rm
->rm_cols
;
1814 * These buffers were allocated in previous iterations.
1816 for (i
= 0; i
< n
- 1; i
++) {
1817 ASSERT(orig
[i
] != NULL
);
1820 orig
[n
- 1] = zio_buf_alloc(rm
->rm_col
[0].rc_size
);
1830 * Save off the original data that we're going to
1831 * attempt to reconstruct.
1833 for (i
= 0; i
< n
; i
++) {
1834 ASSERT(orig
[i
] != NULL
);
1837 ASSERT3S(c
, <, rm
->rm_cols
);
1838 rc
= &rm
->rm_col
[c
];
1839 bcopy(rc
->rc_data
, orig
[i
], rc
->rc_size
);
1843 * Attempt a reconstruction and exit the outer loop on
1846 code
= vdev_raidz_reconstruct(rm
, tgts
, n
);
1847 if (raidz_checksum_verify(zio
) == 0) {
1848 atomic_inc_64(&raidz_corrected
[code
]);
1850 for (i
= 0; i
< n
; i
++) {
1852 rc
= &rm
->rm_col
[c
];
1853 ASSERT(rc
->rc_error
== 0);
1855 raidz_checksum_error(zio
, rc
,
1857 rc
->rc_error
= SET_ERROR(ECKSUM
);
1865 * Restore the original data.
1867 for (i
= 0; i
< n
; i
++) {
1869 rc
= &rm
->rm_col
[c
];
1870 bcopy(orig
[i
], rc
->rc_data
, rc
->rc_size
);
1875 * Find the next valid column after the curr
1878 for (next
= tgts
[curr
] + 1;
1879 next
< rm
->rm_cols
&&
1880 rm
->rm_col
[next
].rc_error
!= 0; next
++)
1883 ASSERT(next
<= tgts
[curr
+ 1]);
1886 * If that spot is available, we're done here.
1888 if (next
!= tgts
[curr
+ 1])
1892 * Otherwise, find the next valid column after
1893 * the previous position.
1895 for (c
= tgts
[curr
- 1] + 1;
1896 rm
->rm_col
[c
].rc_error
!= 0; c
++)
1902 } while (curr
!= n
);
1907 for (i
= 0; i
< n
; i
++) {
1908 zio_buf_free(orig
[i
], rm
->rm_col
[0].rc_size
);
1915 * Complete an IO operation on a RAIDZ VDev
1918 * - For write operations:
1919 * 1. Check for errors on the child IOs.
1920 * 2. Return, setting an error code if too few child VDevs were written
1921 * to reconstruct the data later. Note that partial writes are
1922 * considered successful if they can be reconstructed at all.
1923 * - For read operations:
1924 * 1. Check for errors on the child IOs.
1925 * 2. If data errors occurred:
1926 * a. Try to reassemble the data from the parity available.
1927 * b. If we haven't yet read the parity drives, read them now.
1928 * c. If all parity drives have been read but the data still doesn't
1929 * reassemble with a correct checksum, then try combinatorial
1931 * d. If that doesn't work, return an error.
1932 * 3. If there were unexpected errors or this is a resilver operation,
1933 * rewrite the vdevs that had errors.
1936 vdev_raidz_io_done(zio_t
*zio
)
1938 vdev_t
*vd
= zio
->io_vd
;
1940 raidz_map_t
*rm
= zio
->io_vsd
;
1941 raidz_col_t
*rc
= NULL
;
1942 int unexpected_errors
= 0;
1943 int parity_errors
= 0;
1944 int parity_untried
= 0;
1945 int data_errors
= 0;
1946 int total_errors
= 0;
1948 int tgts
[VDEV_RAIDZ_MAXPARITY
];
1951 ASSERT(zio
->io_bp
!= NULL
); /* XXX need to add code to enforce this */
1953 ASSERT(rm
->rm_missingparity
<= rm
->rm_firstdatacol
);
1954 ASSERT(rm
->rm_missingdata
<= rm
->rm_cols
- rm
->rm_firstdatacol
);
1956 for (c
= 0; c
< rm
->rm_cols
; c
++) {
1957 rc
= &rm
->rm_col
[c
];
1960 ASSERT(rc
->rc_error
!= ECKSUM
); /* child has no bp */
1962 if (c
< rm
->rm_firstdatacol
)
1967 if (!rc
->rc_skipped
)
1968 unexpected_errors
++;
1971 } else if (c
< rm
->rm_firstdatacol
&& !rc
->rc_tried
) {
1976 if (zio
->io_type
== ZIO_TYPE_WRITE
) {
1978 * XXX -- for now, treat partial writes as a success.
1979 * (If we couldn't write enough columns to reconstruct
1980 * the data, the I/O failed. Otherwise, good enough.)
1982 * Now that we support write reallocation, it would be better
1983 * to treat partial failure as real failure unless there are
1984 * no non-degraded top-level vdevs left, and not update DTLs
1985 * if we intend to reallocate.
1988 if (total_errors
> rm
->rm_firstdatacol
)
1989 zio
->io_error
= vdev_raidz_worst_error(rm
);
1994 ASSERT(zio
->io_type
== ZIO_TYPE_READ
);
1996 * There are three potential phases for a read:
1997 * 1. produce valid data from the columns read
1998 * 2. read all disks and try again
1999 * 3. perform combinatorial reconstruction
2001 * Each phase is progressively both more expensive and less likely to
2002 * occur. If we encounter more errors than we can repair or all phases
2003 * fail, we have no choice but to return an error.
2007 * If the number of errors we saw was correctable -- less than or equal
2008 * to the number of parity disks read -- attempt to produce data that
2009 * has a valid checksum. Naturally, this case applies in the absence of
2012 if (total_errors
<= rm
->rm_firstdatacol
- parity_untried
) {
2013 if (data_errors
== 0) {
2014 if (raidz_checksum_verify(zio
) == 0) {
2016 * If we read parity information (unnecessarily
2017 * as it happens since no reconstruction was
2018 * needed) regenerate and verify the parity.
2019 * We also regenerate parity when resilvering
2020 * so we can write it out to the failed device
2023 if (parity_errors
+ parity_untried
<
2024 rm
->rm_firstdatacol
||
2025 (zio
->io_flags
& ZIO_FLAG_RESILVER
)) {
2026 n
= raidz_parity_verify(zio
, rm
);
2027 unexpected_errors
+= n
;
2028 ASSERT(parity_errors
+ n
<=
2029 rm
->rm_firstdatacol
);
2035 * We either attempt to read all the parity columns or
2036 * none of them. If we didn't try to read parity, we
2037 * wouldn't be here in the correctable case. There must
2038 * also have been fewer parity errors than parity
2039 * columns or, again, we wouldn't be in this code path.
2041 ASSERT(parity_untried
== 0);
2042 ASSERT(parity_errors
< rm
->rm_firstdatacol
);
2045 * Identify the data columns that reported an error.
2048 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
2049 rc
= &rm
->rm_col
[c
];
2050 if (rc
->rc_error
!= 0) {
2051 ASSERT(n
< VDEV_RAIDZ_MAXPARITY
);
2056 ASSERT(rm
->rm_firstdatacol
>= n
);
2058 code
= vdev_raidz_reconstruct(rm
, tgts
, n
);
2060 if (raidz_checksum_verify(zio
) == 0) {
2061 atomic_inc_64(&raidz_corrected
[code
]);
2064 * If we read more parity disks than were used
2065 * for reconstruction, confirm that the other
2066 * parity disks produced correct data. This
2067 * routine is suboptimal in that it regenerates
2068 * the parity that we already used in addition
2069 * to the parity that we're attempting to
2070 * verify, but this should be a relatively
2071 * uncommon case, and can be optimized if it
2072 * becomes a problem. Note that we regenerate
2073 * parity when resilvering so we can write it
2074 * out to failed devices later.
2076 if (parity_errors
< rm
->rm_firstdatacol
- n
||
2077 (zio
->io_flags
& ZIO_FLAG_RESILVER
)) {
2078 n
= raidz_parity_verify(zio
, rm
);
2079 unexpected_errors
+= n
;
2080 ASSERT(parity_errors
+ n
<=
2081 rm
->rm_firstdatacol
);
2090 * This isn't a typical situation -- either we got a read error or
2091 * a child silently returned bad data. Read every block so we can
2092 * try again with as much data and parity as we can track down. If
2093 * we've already been through once before, all children will be marked
2094 * as tried so we'll proceed to combinatorial reconstruction.
2096 unexpected_errors
= 1;
2097 rm
->rm_missingdata
= 0;
2098 rm
->rm_missingparity
= 0;
2100 for (c
= 0; c
< rm
->rm_cols
; c
++) {
2101 if (rm
->rm_col
[c
].rc_tried
)
2104 zio_vdev_io_redone(zio
);
2106 rc
= &rm
->rm_col
[c
];
2109 zio_nowait(zio_vdev_child_io(zio
, NULL
,
2110 vd
->vdev_child
[rc
->rc_devidx
],
2111 rc
->rc_offset
, rc
->rc_data
, rc
->rc_size
,
2112 zio
->io_type
, zio
->io_priority
, 0,
2113 vdev_raidz_child_done
, rc
));
2114 } while (++c
< rm
->rm_cols
);
2120 * At this point we've attempted to reconstruct the data given the
2121 * errors we detected, and we've attempted to read all columns. There
2122 * must, therefore, be one or more additional problems -- silent errors
2123 * resulting in invalid data rather than explicit I/O errors resulting
2124 * in absent data. We check if there is enough additional data to
2125 * possibly reconstruct the data and then perform combinatorial
2126 * reconstruction over all possible combinations. If that fails,
2129 if (total_errors
> rm
->rm_firstdatacol
) {
2130 zio
->io_error
= vdev_raidz_worst_error(rm
);
2132 } else if (total_errors
< rm
->rm_firstdatacol
&&
2133 (code
= vdev_raidz_combrec(zio
, total_errors
, data_errors
)) != 0) {
2135 * If we didn't use all the available parity for the
2136 * combinatorial reconstruction, verify that the remaining
2137 * parity is correct.
2139 if (code
!= (1 << rm
->rm_firstdatacol
) - 1)
2140 (void) raidz_parity_verify(zio
, rm
);
2143 * We're here because either:
2145 * total_errors == rm_first_datacol, or
2146 * vdev_raidz_combrec() failed
2148 * In either case, there is enough bad data to prevent
2151 * Start checksum ereports for all children which haven't
2152 * failed, and the IO wasn't speculative.
2154 zio
->io_error
= SET_ERROR(ECKSUM
);
2156 if (!(zio
->io_flags
& ZIO_FLAG_SPECULATIVE
)) {
2157 for (c
= 0; c
< rm
->rm_cols
; c
++) {
2158 rc
= &rm
->rm_col
[c
];
2159 if (rc
->rc_error
== 0) {
2160 zio_bad_cksum_t zbc
;
2161 zbc
.zbc_has_cksum
= 0;
2163 rm
->rm_ecksuminjected
;
2165 zfs_ereport_start_checksum(
2167 vd
->vdev_child
[rc
->rc_devidx
],
2168 zio
, rc
->rc_offset
, rc
->rc_size
,
2169 (void *)(uintptr_t)c
, &zbc
);
2176 zio_checksum_verified(zio
);
2178 if (zio
->io_error
== 0 && spa_writeable(zio
->io_spa
) &&
2179 (unexpected_errors
|| (zio
->io_flags
& ZIO_FLAG_RESILVER
))) {
2181 * Use the good data we have in hand to repair damaged children.
2183 for (c
= 0; c
< rm
->rm_cols
; c
++) {
2184 rc
= &rm
->rm_col
[c
];
2185 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
2187 if (rc
->rc_error
== 0)
2190 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
2191 rc
->rc_offset
, rc
->rc_data
, rc
->rc_size
,
2192 ZIO_TYPE_WRITE
, ZIO_PRIORITY_ASYNC_WRITE
,
2193 ZIO_FLAG_IO_REPAIR
| (unexpected_errors
?
2194 ZIO_FLAG_SELF_HEAL
: 0), NULL
, NULL
));
2200 vdev_raidz_state_change(vdev_t
*vd
, int faulted
, int degraded
)
2202 if (faulted
> vd
->vdev_nparity
)
2203 vdev_set_state(vd
, B_FALSE
, VDEV_STATE_CANT_OPEN
,
2204 VDEV_AUX_NO_REPLICAS
);
2205 else if (degraded
+ faulted
!= 0)
2206 vdev_set_state(vd
, B_FALSE
, VDEV_STATE_DEGRADED
, VDEV_AUX_NONE
);
2208 vdev_set_state(vd
, B_FALSE
, VDEV_STATE_HEALTHY
, VDEV_AUX_NONE
);
2211 vdev_ops_t vdev_raidz_ops
= {
2215 vdev_raidz_io_start
,
2217 vdev_raidz_state_change
,
2220 VDEV_TYPE_RAIDZ
, /* name of this vdev type */
2221 B_FALSE
/* not a leaf vdev */