*/
/*
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
unsigned int zfetch_min_sec_reap = 2;
/* max bytes to prefetch per stream (default 8MB) */
unsigned int zfetch_max_distance = 8 * 1024 * 1024;
-/* number of bytes in a array_read at which we stop prefetching (1MB) */
+/* max bytes to prefetch indirects for per stream (default 64MB) */
+unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
+/* max number of bytes in an array_read in which we allow prefetching (1MB) */
unsigned long zfetch_array_rd_sz = 1024 * 1024;
typedef struct zfetch_stats {
static void
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{
- zstream_t *zs;
zstream_t *zs_next;
int numstreams = 0;
- uint32_t max_streams;
ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
/*
* Clean up old streams.
*/
- for (zs = list_head(&zf->zf_stream);
+ for (zstream_t *zs = list_head(&zf->zf_stream);
zs != NULL; zs = zs_next) {
zs_next = list_next(&zf->zf_stream, zs);
if (((gethrtime() - zs->zs_atime) / NANOSEC) >
* If we are already at the maximum number of streams for this file,
* even after removing old streams, then don't create this stream.
*/
- max_streams = MAX(1, MIN(zfetch_max_streams,
+ uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
zfetch_max_distance));
if (numstreams >= max_streams) {
return;
}
- zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
+ zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
zs->zs_blkid = blkid;
zs->zs_pf_blkid = blkid;
+ zs->zs_ipf_blkid = blkid;
zs->zs_atime = gethrtime();
mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
}
/*
- * This is the prefetch entry point. It calls all of the other dmu_zfetch
- * routines to create, delete, find, or operate upon prefetch streams.
+ * This is the predictive prefetch entry point. It associates dnode access
+ * specified with blkid and nblks arguments with prefetch stream, predicts
+ * further accesses based on those stats and initiates speculative prefetch.
+ * fetch_data argument specifies whether actual data blocks should be fetched:
+ * FALSE -- prefetch only indirect blocks for predicted data blocks;
+ * TRUE -- prefetch predicted data blocks plus following indirect blocks.
*/
void
-dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
{
zstream_t *zs;
- int64_t pf_start;
- int pf_nblks;
- int i;
+ int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+ int64_t pf_ahead_blks, max_blks;
+ int epbs, max_dist_blks, pf_nblks, ipf_nblks;
+ uint64_t end_of_access_blkid;
+ end_of_access_blkid = blkid + nblks;
+ spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
if (zfs_prefetch_disable)
return;
+ /*
+ * If we haven't yet loaded the indirect vdevs' mappings, we
+ * can only read from blocks that we carefully ensure are on
+ * concrete vdevs (or previously-loaded indirect vdevs). So we
+ * can't allow the predictive prefetcher to attempt reads of other
+ * blocks (e.g. of the MOS's dnode object).
+ */
+ if (!spa_indirect_vdevs_loaded(spa))
+ return;
/*
* As a fast path for small (single-block) files, ignore access
rw_enter(&zf->zf_rwlock, RW_READER);
+ /*
+ * Find matching prefetch stream. Depending on whether the accesses
+ * are block-aligned, first block of the new access may either follow
+ * the last block of the previous access, or be equal to it.
+ */
for (zs = list_head(&zf->zf_stream); zs != NULL;
zs = list_next(&zf->zf_stream, zs)) {
- if (blkid == zs->zs_blkid) {
+ if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
mutex_enter(&zs->zs_lock);
/*
* zs_blkid could have changed before we
* acquired zs_lock; re-check them here.
*/
- if (blkid != zs->zs_blkid) {
- mutex_exit(&zs->zs_lock);
- continue;
+ if (blkid == zs->zs_blkid) {
+ break;
+ } else if (blkid + 1 == zs->zs_blkid) {
+ blkid++;
+ nblks--;
+ if (nblks == 0) {
+ /* Already prefetched this before. */
+ mutex_exit(&zs->zs_lock);
+ rw_exit(&zf->zf_rwlock);
+ return;
+ }
+ break;
}
- break;
+ mutex_exit(&zs->zs_lock);
}
}
*/
ZFETCHSTAT_BUMP(zfetchstat_misses);
if (rw_tryupgrade(&zf->zf_rwlock))
- dmu_zfetch_stream_create(zf, blkid + nblks);
+ dmu_zfetch_stream_create(zf, end_of_access_blkid);
rw_exit(&zf->zf_rwlock);
return;
}
* Normally, we start prefetching where we stopped
* prefetching last (zs_pf_blkid). But when we get our first
* hit on this stream, zs_pf_blkid == zs_blkid, we don't
- * want to prefetch to block we just accessed. In this case,
+ * want to prefetch the block we just accessed. In this case,
* start just after the block we just accessed.
*/
- pf_start = MAX(zs->zs_pf_blkid, blkid + nblks);
+ pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
/*
* Double our amount of prefetched data, but don't let the
* prefetch get further ahead than zfetch_max_distance.
*/
- pf_nblks =
- MIN((int64_t)zs->zs_pf_blkid - zs->zs_blkid + nblks,
- zs->zs_blkid + nblks +
- (zfetch_max_distance >> zf->zf_dnode->dn_datablkshift) - pf_start);
+ if (fetch_data) {
+ max_dist_blks =
+ zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
+ /*
+ * Previously, we were (zs_pf_blkid - blkid) ahead. We
+ * want to now be double that, so read that amount again,
+ * plus the amount we are catching up by (i.e. the amount
+ * read just now).
+ */
+ pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
+ max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
+ pf_nblks = MIN(pf_ahead_blks, max_blks);
+ } else {
+ pf_nblks = 0;
+ }
zs->zs_pf_blkid = pf_start + pf_nblks;
- zs->zs_atime = gethrtime();
- zs->zs_blkid = blkid + nblks;
/*
- * dbuf_prefetch() issues the prefetch i/o
- * asynchronously, but it may need to wait for an
- * indirect block to be read from disk. Therefore
- * we do not want to hold any locks while we call it.
+ * Do the same for indirects, starting from where we stopped last,
+ * or where we will stop reading data blocks (and the indirects
+ * that point to them).
*/
+ ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
+ max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
+ /*
+ * We want to double our distance ahead of the data prefetch
+ * (or reader, if we are not prefetching data). Previously, we
+ * were (zs_ipf_blkid - blkid) ahead. To double that, we read
+ * that amount again, plus the amount we are catching up by
+ * (i.e. the amount read now + the amount of data prefetched now).
+ */
+ pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
+ max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
+ ipf_nblks = MIN(pf_ahead_blks, max_blks);
+ zs->zs_ipf_blkid = ipf_start + ipf_nblks;
+
+ epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+ ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
+
+ zs->zs_atime = gethrtime();
+ zs->zs_blkid = end_of_access_blkid;
mutex_exit(&zs->zs_lock);
rw_exit(&zf->zf_rwlock);
- for (i = 0; i < pf_nblks; i++) {
+
+ /*
+ * dbuf_prefetch() is asynchronous (even when it needs to read
+ * indirect blocks), but we still prefer to drop our locks before
+ * calling it to reduce the time we hold them.
+ */
+
+ for (int i = 0; i < pf_nblks; i++) {
dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
}
+ for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+ dbuf_prefetch(zf->zf_dnode, 1, iblk,
+ ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+ }
ZFETCHSTAT_BUMP(zfetchstat_hits);
}
-#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_KERNEL)
+/* BEGIN CSTYLED */
module_param(zfs_prefetch_disable, int, 0644);
MODULE_PARM_DESC(zfs_prefetch_disable, "Disable all ZFS prefetching");
module_param(zfetch_array_rd_sz, ulong, 0644);
MODULE_PARM_DESC(zfetch_array_rd_sz, "Number of bytes in a array_read");
+/* END CSTYLED */
#endif