module/zfs/vdev_queue.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  */
  25
  26 /*
  27  * Copyright (c) 2012 by Delphix. All rights reserved.
  28  */
  29
  30 #include <sys/zfs_context.h>
  31 #include <sys/vdev_impl.h>
  32 #include <sys/spa_impl.h>
  33 #include <sys/zio.h>
  34 #include <sys/avl.h>
  35 #include <sys/kstat.h>
  36
  37 /*
  38  * These tunables are for performance analysis.
  39  */
  40
  41 /* The maximum number of I/Os concurrently pending to each device. */
  42 int zfs_vdev_max_pending = 10;
  43
  44 /*
  45  * The initial number of I/Os pending to each device, before it starts ramping
  46  * up to zfs_vdev_max_pending.
  47  */
  48 int zfs_vdev_min_pending = 4;
  49
  50 /*
  51  * The deadlines are grouped into buckets based on zfs_vdev_time_shift:
  52  * deadline = pri + gethrtime() >> time_shift)
  53  */
  54 int zfs_vdev_time_shift = 29; /* each bucket is 0.537 seconds */
  55
  56 /* exponential I/O issue ramp-up rate */
  57 int zfs_vdev_ramp_rate = 2;
  58
  59 /*
  60  * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
  61  * For read I/Os, we also aggregate across small adjacency gaps; for writes
  62  * we include spans of optional I/Os to aid aggregation at the disk even when
  63  * they aren't able to help us aggregate at this level.
  64  */
  65 int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
  66 int zfs_vdev_read_gap_limit = 32 << 10;
  67 int zfs_vdev_write_gap_limit = 4 << 10;
  68
  69 /*
  70  * Virtual device vector for disk I/O scheduling.
  71  */
  72 int
  73 vdev_queue_deadline_compare(const void *x1, const void *x2)
  74 {
  75         const zio_t *z1 = x1;
  76         const zio_t *z2 = x2;
  77
  78         if (z1->io_deadline < z2->io_deadline)
  79                 return (-1);
  80         if (z1->io_deadline > z2->io_deadline)
  81                 return (1);
  82
  83         if (z1->io_offset < z2->io_offset)
  84                 return (-1);
  85         if (z1->io_offset > z2->io_offset)
  86                 return (1);
  87
  88         if (z1 < z2)
  89                 return (-1);
  90         if (z1 > z2)
  91                 return (1);
  92
  93         return (0);
  94 }
  95
  96 int
  97 vdev_queue_offset_compare(const void *x1, const void *x2)
  98 {
  99         const zio_t *z1 = x1;
 100         const zio_t *z2 = x2;
 101
 102         if (z1->io_offset < z2->io_offset)
 103                 return (-1);
 104         if (z1->io_offset > z2->io_offset)
 105                 return (1);
 106
 107         if (z1 < z2)
 108                 return (-1);
 109         if (z1 > z2)
 110                 return (1);
 111
 112         return (0);
 113 }
 114
 115 void
 116 vdev_queue_init(vdev_t *vd)
 117 {
 118         vdev_queue_t *vq = &vd->vdev_queue;
 119         int i;
 120
 121         mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
 122
 123         avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
 124             sizeof (zio_t), offsetof(struct zio, io_deadline_node));
 125
 126         avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
 127             sizeof (zio_t), offsetof(struct zio, io_offset_node));
 128
 129         avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
 130             sizeof (zio_t), offsetof(struct zio, io_offset_node));
 131
 132         avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
 133             sizeof (zio_t), offsetof(struct zio, io_offset_node));
 134
 135         /*
 136          * A list of buffers which can be used for aggregate I/O, this
 137          * avoids the need to allocate them on demand when memory is low.
 138          */
 139         list_create(&vq->vq_io_list, sizeof (vdev_io_t),
 140             offsetof(vdev_io_t, vi_node));
 141
 142         for (i = 0; i < zfs_vdev_max_pending; i++)
 143                 list_insert_tail(&vq->vq_io_list, zio_vdev_alloc());
 144 }
 145
 146 void
 147 vdev_queue_fini(vdev_t *vd)
 148 {
 149         vdev_queue_t *vq = &vd->vdev_queue;
 150         vdev_io_t *vi;
 151
 152         avl_destroy(&vq->vq_deadline_tree);
 153         avl_destroy(&vq->vq_read_tree);
 154         avl_destroy(&vq->vq_write_tree);
 155         avl_destroy(&vq->vq_pending_tree);
 156
 157         while ((vi = list_head(&vq->vq_io_list)) != NULL) {
 158                 list_remove(&vq->vq_io_list, vi);
 159                 zio_vdev_free(vi);
 160         }
 161
 162         list_destroy(&vq->vq_io_list);
 163
 164         mutex_destroy(&vq->vq_lock);
 165 }
 166
 167 static void
 168 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 169 {
 170         spa_t *spa = zio->io_spa;
 171         spa_stats_history_t *ssh = &spa->spa_stats.io_history;
 172
 173         avl_add(&vq->vq_deadline_tree, zio);
 174         avl_add(zio->io_vdev_tree, zio);
 175
 176         if (ssh->kstat != NULL) {
 177                 mutex_enter(&ssh->lock);
 178                 kstat_waitq_enter(ssh->kstat->ks_data);
 179                 mutex_exit(&ssh->lock);
 180         }
 181 }
 182
 183 static void
 184 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 185 {
 186         spa_t *spa = zio->io_spa;
 187         spa_stats_history_t *ssh = &spa->spa_stats.io_history;
 188
 189         avl_remove(&vq->vq_deadline_tree, zio);
 190         avl_remove(zio->io_vdev_tree, zio);
 191
 192         if (ssh->kstat != NULL) {
 193                 mutex_enter(&ssh->lock);
 194                 kstat_waitq_exit(ssh->kstat->ks_data);
 195                 mutex_exit(&ssh->lock);
 196         }
 197 }
 198
 199 static void
 200 vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 201 {
 202         spa_t *spa = zio->io_spa;
 203         spa_stats_history_t *ssh = &spa->spa_stats.io_history;
 204
 205         avl_add(&vq->vq_pending_tree, zio);
 206
 207         if (ssh->kstat != NULL) {
 208                 mutex_enter(&ssh->lock);
 209                 kstat_runq_enter(ssh->kstat->ks_data);
 210                 mutex_exit(&ssh->lock);
 211         }
 212 }
 213
 214 static void
 215 vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 216 {
 217         spa_t *spa = zio->io_spa;
 218         spa_stats_history_t *ssh = &spa->spa_stats.io_history;
 219
 220         avl_remove(&vq->vq_pending_tree, zio);
 221
 222         if (ssh->kstat != NULL) {
 223                 kstat_io_t *ksio = ssh->kstat->ks_data;
 224
 225                 mutex_enter(&ssh->lock);
 226                 kstat_runq_exit(ksio);
 227                 if (zio->io_type == ZIO_TYPE_READ) {
 228                         ksio->reads++;
 229                         ksio->nread += zio->io_size;
 230                 } else if (zio->io_type == ZIO_TYPE_WRITE) {
 231                         ksio->writes++;
 232                         ksio->nwritten += zio->io_size;
 233                 }
 234                 mutex_exit(&ssh->lock);
 235         }
 236 }
 237
 238 static void
 239 vdev_queue_agg_io_done(zio_t *aio)
 240 {
 241         vdev_queue_t *vq = &aio->io_vd->vdev_queue;
 242         vdev_io_t *vi = aio->io_data;
 243         zio_t *pio;
 244
 245         while ((pio = zio_walk_parents(aio)) != NULL)
 246                 if (aio->io_type == ZIO_TYPE_READ)
 247                         bcopy((char *)aio->io_data + (pio->io_offset -
 248                             aio->io_offset), pio->io_data, pio->io_size);
 249
 250         mutex_enter(&vq->vq_lock);
 251         list_insert_tail(&vq->vq_io_list, vi);
 252         mutex_exit(&vq->vq_lock);
 253 }
 254
 255 /*
 256  * Compute the range spanned by two i/os, which is the endpoint of the last
 257  * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
 258  * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
 259  * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
 260  */
 261 #define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
 262 #define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
 263
 264 static zio_t *
 265 vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
 266 {
 267         zio_t *fio, *lio, *aio, *dio, *nio, *mio;
 268         avl_tree_t *t;
 269         vdev_io_t *vi;
 270         int flags;
 271         uint64_t maxspan = MIN(zfs_vdev_aggregation_limit, SPA_MAXBLOCKSIZE);
 272         uint64_t maxgap;
 273         int stretch;
 274
 275 again:
 276         ASSERT(MUTEX_HELD(&vq->vq_lock));
 277
 278         if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
 279             avl_numnodes(&vq->vq_deadline_tree) == 0)
 280                 return (NULL);
 281
 282         fio = lio = avl_first(&vq->vq_deadline_tree);
 283
 284         t = fio->io_vdev_tree;
 285         flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
 286         maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;
 287
 288         vi = list_head(&vq->vq_io_list);
 289         if (vi == NULL) {
 290                 vi = zio_vdev_alloc();
 291                 list_insert_head(&vq->vq_io_list, vi);
 292         }
 293
 294         if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
 295                 /*
 296                  * We can aggregate I/Os that are sufficiently adjacent and of
 297                  * the same flavor, as expressed by the AGG_INHERIT flags.
 298                  * The latter requirement is necessary so that certain
 299                  * attributes of the I/O, such as whether it's a normal I/O
 300                  * or a scrub/resilver, can be preserved in the aggregate.
 301                  * We can include optional I/Os, but don't allow them
 302                  * to begin a range as they add no benefit in that situation.
 303                  */
 304
 305                 /*
 306                  * We keep track of the last non-optional I/O.
 307                  */
 308                 mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;
 309
 310                 /*
 311                  * Walk backwards through sufficiently contiguous I/Os
 312                  * recording the last non-option I/O.
 313                  */
 314                 while ((dio = AVL_PREV(t, fio)) != NULL &&
 315                     (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 316                     IO_SPAN(dio, lio) <= maxspan &&
 317                     IO_GAP(dio, fio) <= maxgap) {
 318                         fio = dio;
 319                         if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
 320                                 mio = fio;
 321                 }
 322
 323                 /*
 324                  * Skip any initial optional I/Os.
 325                  */
 326                 while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
 327                         fio = AVL_NEXT(t, fio);
 328                         ASSERT(fio != NULL);
 329                 }
 330
 331                 /*
 332                  * Walk forward through sufficiently contiguous I/Os.
 333                  */
 334                 while ((dio = AVL_NEXT(t, lio)) != NULL &&
 335                     (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 336                     IO_SPAN(fio, dio) <= maxspan &&
 337                     IO_GAP(lio, dio) <= maxgap) {
 338                         lio = dio;
 339                         if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
 340                                 mio = lio;
 341                 }
 342
 343                 /*
 344                  * Now that we've established the range of the I/O aggregation
 345                  * we must decide what to do with trailing optional I/Os.
 346                  * For reads, there's nothing to do. While we are unable to
 347                  * aggregate further, it's possible that a trailing optional
 348                  * I/O would allow the underlying device to aggregate with
 349                  * subsequent I/Os. We must therefore determine if the next
 350                  * non-optional I/O is close enough to make aggregation
 351                  * worthwhile.
 352                  */
 353                 stretch = B_FALSE;
 354                 if (t != &vq->vq_read_tree && mio != NULL) {
 355                         nio = lio;
 356                         while ((dio = AVL_NEXT(t, nio)) != NULL &&
 357                             IO_GAP(nio, dio) == 0 &&
 358                             IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
 359                                 nio = dio;
 360                                 if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
 361                                         stretch = B_TRUE;
 362                                         break;
 363                                 }
 364                         }
 365                 }
 366
 367                 if (stretch) {
 368                         /* This may be a no-op. */
 369                         VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
 370                         dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
 371                 } else {
 372                         while (lio != mio && lio != fio) {
 373                                 ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
 374                                 lio = AVL_PREV(t, lio);
 375                                 ASSERT(lio != NULL);
 376                         }
 377                 }
 378         }
 379
 380         if (fio != lio) {
 381                 uint64_t size = IO_SPAN(fio, lio);
 382                 ASSERT(size <= maxspan);
 383                 ASSERT(vi != NULL);
 384
 385                 aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
 386                     vi, size, fio->io_type, ZIO_PRIORITY_AGG,
 387                     flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
 388                     vdev_queue_agg_io_done, NULL);
 389                 aio->io_timestamp = fio->io_timestamp;
 390
 391                 nio = fio;
 392                 do {
 393                         dio = nio;
 394                         nio = AVL_NEXT(t, dio);
 395                         ASSERT(dio->io_type == aio->io_type);
 396                         ASSERT(dio->io_vdev_tree == t);
 397
 398                         if (dio->io_flags & ZIO_FLAG_NODATA) {
 399                                 ASSERT(dio->io_type == ZIO_TYPE_WRITE);
 400                                 bzero((char *)aio->io_data + (dio->io_offset -
 401                                     aio->io_offset), dio->io_size);
 402                         } else if (dio->io_type == ZIO_TYPE_WRITE) {
 403                                 bcopy(dio->io_data, (char *)aio->io_data +
 404                                     (dio->io_offset - aio->io_offset),
 405                                     dio->io_size);
 406                         }
 407
 408                         zio_add_child(dio, aio);
 409                         vdev_queue_io_remove(vq, dio);
 410                         zio_vdev_io_bypass(dio);
 411                         zio_execute(dio);
 412                 } while (dio != lio);
 413
 414                 vdev_queue_pending_add(vq, aio);
 415                 list_remove(&vq->vq_io_list, vi);
 416
 417                 return (aio);
 418         }
 419
 420         ASSERT(fio->io_vdev_tree == t);
 421         vdev_queue_io_remove(vq, fio);
 422
 423         /*
 424          * If the I/O is or was optional and therefore has no data, we need to
 425          * simply discard it. We need to drop the vdev queue's lock to avoid a
 426          * deadlock that we could encounter since this I/O will complete
 427          * immediately.
 428          */
 429         if (fio->io_flags & ZIO_FLAG_NODATA) {
 430                 mutex_exit(&vq->vq_lock);
 431                 zio_vdev_io_bypass(fio);
 432                 zio_execute(fio);
 433                 mutex_enter(&vq->vq_lock);
 434                 goto again;
 435         }
 436
 437         vdev_queue_pending_add(vq, fio);
 438
 439         return (fio);
 440 }
 441
 442 zio_t *
 443 vdev_queue_io(zio_t *zio)
 444 {
 445         vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 446         zio_t *nio;
 447
 448         ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
 449
 450         if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
 451                 return (zio);
 452
 453         zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
 454
 455         if (zio->io_type == ZIO_TYPE_READ)
 456                 zio->io_vdev_tree = &vq->vq_read_tree;
 457         else
 458                 zio->io_vdev_tree = &vq->vq_write_tree;
 459
 460         mutex_enter(&vq->vq_lock);
 461
 462         zio->io_timestamp = gethrtime();
 463         zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
 464             zio->io_priority;
 465
 466         vdev_queue_io_add(vq, zio);
 467
 468         nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);
 469
 470         mutex_exit(&vq->vq_lock);
 471
 472         if (nio == NULL)
 473                 return (NULL);
 474
 475         if (nio->io_done == vdev_queue_agg_io_done) {
 476                 zio_nowait(nio);
 477                 return (NULL);
 478         }
 479
 480         return (nio);
 481 }
 482
 483 void
 484 vdev_queue_io_done(zio_t *zio)
 485 {
 486         vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 487         int i;
 488
 489         if (zio_injection_enabled)
 490                 delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
 491
 492         mutex_enter(&vq->vq_lock);
 493
 494         vdev_queue_pending_remove(vq, zio);
 495
 496         zio->io_delta = gethrtime() - zio->io_timestamp;
 497         vq->vq_io_complete_ts = gethrtime();
 498         vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
 499
 500         for (i = 0; i < zfs_vdev_ramp_rate; i++) {
 501                 zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
 502                 if (nio == NULL)
 503                         break;
 504                 mutex_exit(&vq->vq_lock);
 505                 if (nio->io_done == vdev_queue_agg_io_done) {
 506                         zio_nowait(nio);
 507                 } else {
 508                         zio_vdev_io_reissue(nio);
 509                         zio_execute(nio);
 510                 }
 511                 mutex_enter(&vq->vq_lock);
 512         }
 513
 514         mutex_exit(&vq->vq_lock);
 515 }
 516
 517 #if defined(_KERNEL) && defined(HAVE_SPL)
 518 module_param(zfs_vdev_max_pending, int, 0644);
 519 MODULE_PARM_DESC(zfs_vdev_max_pending, "Max pending per-vdev I/Os");
 520
 521 module_param(zfs_vdev_min_pending, int, 0644);
 522 MODULE_PARM_DESC(zfs_vdev_min_pending, "Min pending per-vdev I/Os");
 523
 524 module_param(zfs_vdev_aggregation_limit, int, 0644);
 525 MODULE_PARM_DESC(zfs_vdev_aggregation_limit, "Max vdev I/O aggregation size");
 526
 527 module_param(zfs_vdev_time_shift, int, 0644);
 528 MODULE_PARM_DESC(zfs_vdev_time_shift, "Deadline time shift for vdev I/O");
 529
 530 module_param(zfs_vdev_ramp_rate, int, 0644);
 531 MODULE_PARM_DESC(zfs_vdev_ramp_rate, "Exponential I/O issue ramp-up rate");
 532
 533 module_param(zfs_vdev_read_gap_limit, int, 0644);
 534 MODULE_PARM_DESC(zfs_vdev_read_gap_limit, "Aggregate read I/O over gap");
 535
 536 module_param(zfs_vdev_write_gap_limit, int, 0644);
 537 MODULE_PARM_DESC(zfs_vdev_write_gap_limit, "Aggregate write I/O over gap");
 538 #endif