]> git.proxmox.com Git - mirror_zfs.git/blobdiff - module/zfs/dmu_zfetch.c
Fix typo/etc in module/zfs/zfs_ctldir.c
[mirror_zfs.git] / module / zfs / dmu_zfetch.c
index b25b0e00772e418f159630a5e37f67ef7c4c83cc..2133518ff4ef5a97e42e28061468aecc45d214c9 100644 (file)
@@ -24,7 +24,7 @@
  */
 
 /*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -50,7 +50,9 @@ unsigned int  zfetch_max_streams = 8;
 unsigned int   zfetch_min_sec_reap = 2;
 /* max bytes to prefetch per stream (default 8MB) */
 unsigned int   zfetch_max_distance = 8 * 1024 * 1024;
-/* number of bytes in a array_read at which we stop prefetching (1MB) */
+/* max bytes to prefetch indirects for per stream (default 64MB) */
+unsigned int   zfetch_max_idistance = 64 * 1024 * 1024;
+/* max number of bytes in an array_read in which we allow prefetching (1MB) */
 unsigned long  zfetch_array_rd_sz = 1024 * 1024;
 
 typedef struct zfetch_stats {
@@ -150,17 +152,15 @@ dmu_zfetch_fini(zfetch_t *zf)
 static void
 dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 {
-       zstream_t *zs;
        zstream_t *zs_next;
        int numstreams = 0;
-       uint32_t max_streams;
 
        ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
 
        /*
         * Clean up old streams.
         */
-       for (zs = list_head(&zf->zf_stream);
+       for (zstream_t *zs = list_head(&zf->zf_stream);
            zs != NULL; zs = zs_next) {
                zs_next = list_next(&zf->zf_stream, zs);
                if (((gethrtime() - zs->zs_atime) / NANOSEC) >
@@ -178,7 +178,7 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
         * If we are already at the maximum number of streams for this file,
         * even after removing old streams, then don't create this stream.
         */
-       max_streams = MAX(1, MIN(zfetch_max_streams,
+       uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
            zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
            zfetch_max_distance));
        if (numstreams >= max_streams) {
@@ -186,9 +186,10 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
                return;
        }
 
-       zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
+       zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
        zs->zs_blkid = blkid;
        zs->zs_pf_blkid = blkid;
+       zs->zs_ipf_blkid = blkid;
        zs->zs_atime = gethrtime();
        mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
 
@@ -196,19 +197,35 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 }
 
 /*
- * This is the prefetch entry point.  It calls all of the other dmu_zfetch
- * routines to create, delete, find, or operate upon prefetch streams.
+ * This is the predictive prefetch entry point.  It associates dnode access
+ * specified with blkid and nblks arguments with prefetch stream, predicts
+ * further accesses based on those stats and initiates speculative prefetch.
+ * fetch_data argument specifies whether actual data blocks should be fetched:
+ *   FALSE -- prefetch only indirect blocks for predicted data blocks;
+ *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
  */
 void
-dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
 {
        zstream_t *zs;
-       int64_t pf_start;
-       int pf_nblks;
-       int i;
+       int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+       int64_t pf_ahead_blks, max_blks;
+       int epbs, max_dist_blks, pf_nblks, ipf_nblks;
+       uint64_t end_of_access_blkid;
+       end_of_access_blkid = blkid + nblks;
+       spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
 
        if (zfs_prefetch_disable)
                return;
+       /*
+        * If we haven't yet loaded the indirect vdevs' mappings, we
+        * can only read from blocks that we carefully ensure are on
+        * concrete vdevs (or previously-loaded indirect vdevs).  So we
+        * can't allow the predictive prefetcher to attempt reads of other
+        * blocks (e.g. of the MOS's dnode object).
+        */
+       if (!spa_indirect_vdevs_loaded(spa))
+               return;
 
        /*
         * As a fast path for small (single-block) files, ignore access
@@ -219,19 +236,33 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
 
        rw_enter(&zf->zf_rwlock, RW_READER);
 
+       /*
+        * Find matching prefetch stream.  Depending on whether the accesses
+        * are block-aligned, first block of the new access may either follow
+        * the last block of the previous access, or be equal to it.
+        */
        for (zs = list_head(&zf->zf_stream); zs != NULL;
            zs = list_next(&zf->zf_stream, zs)) {
-               if (blkid == zs->zs_blkid) {
+               if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
                        mutex_enter(&zs->zs_lock);
                        /*
                         * zs_blkid could have changed before we
                         * acquired zs_lock; re-check them here.
                         */
-                       if (blkid != zs->zs_blkid) {
-                               mutex_exit(&zs->zs_lock);
-                               continue;
+                       if (blkid == zs->zs_blkid) {
+                               break;
+                       } else if (blkid + 1 == zs->zs_blkid) {
+                               blkid++;
+                               nblks--;
+                               if (nblks == 0) {
+                                       /* Already prefetched this before. */
+                                       mutex_exit(&zs->zs_lock);
+                                       rw_exit(&zf->zf_rwlock);
+                                       return;
+                               }
+                               break;
                        }
-                       break;
+                       mutex_exit(&zs->zs_lock);
                }
        }
 
@@ -242,7 +273,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
                 */
                ZFETCHSTAT_BUMP(zfetchstat_misses);
                if (rw_tryupgrade(&zf->zf_rwlock))
-                       dmu_zfetch_stream_create(zf, blkid + nblks);
+                       dmu_zfetch_stream_create(zf, end_of_access_blkid);
                rw_exit(&zf->zf_rwlock);
                return;
        }
@@ -254,40 +285,80 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
         * Normally, we start prefetching where we stopped
         * prefetching last (zs_pf_blkid).  But when we get our first
         * hit on this stream, zs_pf_blkid == zs_blkid, we don't
-        * want to prefetch to block we just accessed.  In this case,
+        * want to prefetch the block we just accessed.  In this case,
         * start just after the block we just accessed.
         */
-       pf_start = MAX(zs->zs_pf_blkid, blkid + nblks);
+       pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
 
        /*
         * Double our amount of prefetched data, but don't let the
         * prefetch get further ahead than zfetch_max_distance.
         */
-       pf_nblks =
-           MIN((int64_t)zs->zs_pf_blkid - zs->zs_blkid + nblks,
-           zs->zs_blkid + nblks +
-           (zfetch_max_distance >> zf->zf_dnode->dn_datablkshift) - pf_start);
+       if (fetch_data) {
+               max_dist_blks =
+                   zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
+               /*
+                * Previously, we were (zs_pf_blkid - blkid) ahead.  We
+                * want to now be double that, so read that amount again,
+                * plus the amount we are catching up by (i.e. the amount
+                * read just now).
+                */
+               pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
+               max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
+               pf_nblks = MIN(pf_ahead_blks, max_blks);
+       } else {
+               pf_nblks = 0;
+       }
 
        zs->zs_pf_blkid = pf_start + pf_nblks;
-       zs->zs_atime = gethrtime();
-       zs->zs_blkid = blkid + nblks;
 
        /*
-        * dbuf_prefetch() issues the prefetch i/o
-        * asynchronously, but it may need to wait for an
-        * indirect block to be read from disk.  Therefore
-        * we do not want to hold any locks while we call it.
+        * Do the same for indirects, starting from where we stopped last,
+        * or where we will stop reading data blocks (and the indirects
+        * that point to them).
         */
+       ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
+       max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
+       /*
+        * We want to double our distance ahead of the data prefetch
+        * (or reader, if we are not prefetching data).  Previously, we
+        * were (zs_ipf_blkid - blkid) ahead.  To double that, we read
+        * that amount again, plus the amount we are catching up by
+        * (i.e. the amount read now + the amount of data prefetched now).
+        */
+       pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
+       max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
+       ipf_nblks = MIN(pf_ahead_blks, max_blks);
+       zs->zs_ipf_blkid = ipf_start + ipf_nblks;
+
+       epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+       ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+       ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
+
+       zs->zs_atime = gethrtime();
+       zs->zs_blkid = end_of_access_blkid;
        mutex_exit(&zs->zs_lock);
        rw_exit(&zf->zf_rwlock);
-       for (i = 0; i < pf_nblks; i++) {
+
+       /*
+        * dbuf_prefetch() is asynchronous (even when it needs to read
+        * indirect blocks), but we still prefer to drop our locks before
+        * calling it to reduce the time we hold them.
+        */
+
+       for (int i = 0; i < pf_nblks; i++) {
                dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
                    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
        }
+       for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+               dbuf_prefetch(zf->zf_dnode, 1, iblk,
+                   ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+       }
        ZFETCHSTAT_BUMP(zfetchstat_hits);
 }
 
-#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_KERNEL)
+/* BEGIN CSTYLED */
 module_param(zfs_prefetch_disable, int, 0644);
 MODULE_PARM_DESC(zfs_prefetch_disable, "Disable all ZFS prefetching");
 
@@ -303,4 +374,5 @@ MODULE_PARM_DESC(zfetch_max_distance,
 
 module_param(zfetch_array_rd_sz, ulong, 0644);
 MODULE_PARM_DESC(zfetch_array_rd_sz, "Number of bytes in a array_read");
+/* END CSTYLED */
 #endif