4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
27 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
30 #include <sys/zfs_context.h>
31 #include <sys/dnode.h>
32 #include <sys/dmu_objset.h>
33 #include <sys/dmu_zfetch.h>
36 #include <sys/kstat.h>
37 #include <sys/wmsum.h>
40 * This tunable disables predictive prefetch. Note that it leaves "prescient"
41 * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
42 * prescient prefetch never issues i/os that end up not being needed,
43 * so it can't hurt performance.
46 static int zfs_prefetch_disable
= B_FALSE
;
48 /* max # of streams per zfetch */
49 static unsigned int zfetch_max_streams
= 8;
50 /* min time before stream reclaim */
51 static unsigned int zfetch_min_sec_reap
= 2;
52 /* max bytes to prefetch per stream (default 8MB) */
53 unsigned int zfetch_max_distance
= 8 * 1024 * 1024;
54 /* max bytes to prefetch indirects for per stream (default 64MB) */
55 unsigned int zfetch_max_idistance
= 64 * 1024 * 1024;
56 /* max number of bytes in an array_read in which we allow prefetching (1MB) */
57 unsigned long zfetch_array_rd_sz
= 1024 * 1024;
59 typedef struct zfetch_stats
{
60 kstat_named_t zfetchstat_hits
;
61 kstat_named_t zfetchstat_misses
;
62 kstat_named_t zfetchstat_max_streams
;
63 kstat_named_t zfetchstat_io_issued
;
66 static zfetch_stats_t zfetch_stats
= {
67 { "hits", KSTAT_DATA_UINT64
},
68 { "misses", KSTAT_DATA_UINT64
},
69 { "max_streams", KSTAT_DATA_UINT64
},
70 { "io_issued", KSTAT_DATA_UINT64
},
74 wmsum_t zfetchstat_hits
;
75 wmsum_t zfetchstat_misses
;
76 wmsum_t zfetchstat_max_streams
;
77 wmsum_t zfetchstat_io_issued
;
80 #define ZFETCHSTAT_BUMP(stat) \
81 wmsum_add(&zfetch_sums.stat, 1)
82 #define ZFETCHSTAT_ADD(stat, val) \
83 wmsum_add(&zfetch_sums.stat, val)
86 static kstat_t
*zfetch_ksp
;
89 zfetch_kstats_update(kstat_t
*ksp
, int rw
)
91 zfetch_stats_t
*zs
= ksp
->ks_data
;
93 if (rw
== KSTAT_WRITE
)
95 zs
->zfetchstat_hits
.value
.ui64
=
96 wmsum_value(&zfetch_sums
.zfetchstat_hits
);
97 zs
->zfetchstat_misses
.value
.ui64
=
98 wmsum_value(&zfetch_sums
.zfetchstat_misses
);
99 zs
->zfetchstat_max_streams
.value
.ui64
=
100 wmsum_value(&zfetch_sums
.zfetchstat_max_streams
);
101 zs
->zfetchstat_io_issued
.value
.ui64
=
102 wmsum_value(&zfetch_sums
.zfetchstat_io_issued
);
109 wmsum_init(&zfetch_sums
.zfetchstat_hits
, 0);
110 wmsum_init(&zfetch_sums
.zfetchstat_misses
, 0);
111 wmsum_init(&zfetch_sums
.zfetchstat_max_streams
, 0);
112 wmsum_init(&zfetch_sums
.zfetchstat_io_issued
, 0);
114 zfetch_ksp
= kstat_create("zfs", 0, "zfetchstats", "misc",
115 KSTAT_TYPE_NAMED
, sizeof (zfetch_stats
) / sizeof (kstat_named_t
),
118 if (zfetch_ksp
!= NULL
) {
119 zfetch_ksp
->ks_data
= &zfetch_stats
;
120 zfetch_ksp
->ks_update
= zfetch_kstats_update
;
121 kstat_install(zfetch_ksp
);
128 if (zfetch_ksp
!= NULL
) {
129 kstat_delete(zfetch_ksp
);
133 wmsum_fini(&zfetch_sums
.zfetchstat_hits
);
134 wmsum_fini(&zfetch_sums
.zfetchstat_misses
);
135 wmsum_fini(&zfetch_sums
.zfetchstat_max_streams
);
136 wmsum_fini(&zfetch_sums
.zfetchstat_io_issued
);
140 * This takes a pointer to a zfetch structure and a dnode. It performs the
141 * necessary setup for the zfetch structure, grokking data from the
145 dmu_zfetch_init(zfetch_t
*zf
, dnode_t
*dno
)
150 zf
->zf_numstreams
= 0;
152 list_create(&zf
->zf_stream
, sizeof (zstream_t
),
153 offsetof(zstream_t
, zs_node
));
155 mutex_init(&zf
->zf_lock
, NULL
, MUTEX_DEFAULT
, NULL
);
159 dmu_zfetch_stream_fini(zstream_t
*zs
)
161 ASSERT(!list_link_active(&zs
->zs_node
));
162 zfs_refcount_destroy(&zs
->zs_callers
);
163 zfs_refcount_destroy(&zs
->zs_refs
);
164 kmem_free(zs
, sizeof (*zs
));
168 dmu_zfetch_stream_remove(zfetch_t
*zf
, zstream_t
*zs
)
170 ASSERT(MUTEX_HELD(&zf
->zf_lock
));
171 list_remove(&zf
->zf_stream
, zs
);
174 if (zfs_refcount_remove(&zs
->zs_refs
, NULL
) == 0)
175 dmu_zfetch_stream_fini(zs
);
179 * Clean-up state associated with a zfetch structure (e.g. destroy the
180 * streams). This doesn't free the zfetch_t itself, that's left to the caller.
183 dmu_zfetch_fini(zfetch_t
*zf
)
187 mutex_enter(&zf
->zf_lock
);
188 while ((zs
= list_head(&zf
->zf_stream
)) != NULL
)
189 dmu_zfetch_stream_remove(zf
, zs
);
190 mutex_exit(&zf
->zf_lock
);
191 list_destroy(&zf
->zf_stream
);
192 mutex_destroy(&zf
->zf_lock
);
198 * If there aren't too many streams already, create a new stream.
199 * The "blkid" argument is the next block that we expect this stream to access.
200 * While we're here, clean up old streams (which haven't been
201 * accessed for at least zfetch_min_sec_reap seconds).
204 dmu_zfetch_stream_create(zfetch_t
*zf
, uint64_t blkid
)
207 hrtime_t now
= gethrtime();
209 ASSERT(MUTEX_HELD(&zf
->zf_lock
));
212 * Clean up old streams.
214 for (zstream_t
*zs
= list_head(&zf
->zf_stream
);
215 zs
!= NULL
; zs
= zs_next
) {
216 zs_next
= list_next(&zf
->zf_stream
, zs
);
218 * Skip if still active. 1 -- zf_stream reference.
220 if (zfs_refcount_count(&zs
->zs_refs
) != 1)
222 if (((now
- zs
->zs_atime
) / NANOSEC
) >
224 dmu_zfetch_stream_remove(zf
, zs
);
228 * The maximum number of streams is normally zfetch_max_streams,
229 * but for small files we lower it such that it's at least possible
230 * for all the streams to be non-overlapping.
232 * If we are already at the maximum number of streams for this file,
233 * even after removing old streams, then don't create this stream.
235 uint32_t max_streams
= MAX(1, MIN(zfetch_max_streams
,
236 zf
->zf_dnode
->dn_maxblkid
* zf
->zf_dnode
->dn_datablksz
/
237 zfetch_max_distance
));
238 if (zf
->zf_numstreams
>= max_streams
) {
239 ZFETCHSTAT_BUMP(zfetchstat_max_streams
);
243 zstream_t
*zs
= kmem_zalloc(sizeof (*zs
), KM_SLEEP
);
244 zs
->zs_blkid
= blkid
;
245 zs
->zs_pf_blkid1
= blkid
;
246 zs
->zs_pf_blkid
= blkid
;
247 zs
->zs_ipf_blkid1
= blkid
;
248 zs
->zs_ipf_blkid
= blkid
;
251 zs
->zs_missed
= B_FALSE
;
252 zfs_refcount_create(&zs
->zs_callers
);
253 zfs_refcount_create(&zs
->zs_refs
);
254 /* One reference for zf_stream. */
255 zfs_refcount_add(&zs
->zs_refs
, NULL
);
257 list_insert_head(&zf
->zf_stream
, zs
);
261 dmu_zfetch_stream_done(void *arg
, boolean_t io_issued
)
266 if (zfs_refcount_remove(&zs
->zs_refs
, NULL
) == 0)
267 dmu_zfetch_stream_fini(zs
);
271 * This is the predictive prefetch entry point. dmu_zfetch_prepare()
272 * associates dnode access specified with blkid and nblks arguments with
273 * prefetch stream, predicts further accesses based on that stats and returns
274 * the stream pointer on success. That pointer must later be passed to
275 * dmu_zfetch_run() to initiate the speculative prefetch for the stream and
276 * release it. dmu_zfetch() is a wrapper for simple cases when window between
277 * prediction and prefetch initiation is not needed.
278 * fetch_data argument specifies whether actual data blocks should be fetched:
279 * FALSE -- prefetch only indirect blocks for predicted data blocks;
280 * TRUE -- prefetch predicted data blocks plus following indirect blocks.
283 dmu_zfetch_prepare(zfetch_t
*zf
, uint64_t blkid
, uint64_t nblks
,
284 boolean_t fetch_data
, boolean_t have_lock
)
287 int64_t pf_start
, ipf_start
;
288 int64_t pf_ahead_blks
, max_blks
;
289 int max_dist_blks
, pf_nblks
, ipf_nblks
;
290 uint64_t end_of_access_blkid
, maxblkid
;
291 end_of_access_blkid
= blkid
+ nblks
;
292 spa_t
*spa
= zf
->zf_dnode
->dn_objset
->os_spa
;
294 if (zfs_prefetch_disable
)
297 * If we haven't yet loaded the indirect vdevs' mappings, we
298 * can only read from blocks that we carefully ensure are on
299 * concrete vdevs (or previously-loaded indirect vdevs). So we
300 * can't allow the predictive prefetcher to attempt reads of other
301 * blocks (e.g. of the MOS's dnode object).
303 if (!spa_indirect_vdevs_loaded(spa
))
307 * As a fast path for small (single-block) files, ignore access
308 * to the first block.
310 if (!have_lock
&& blkid
== 0)
314 rw_enter(&zf
->zf_dnode
->dn_struct_rwlock
, RW_READER
);
317 * A fast path for small files for which no prefetch will
320 maxblkid
= zf
->zf_dnode
->dn_maxblkid
;
323 rw_exit(&zf
->zf_dnode
->dn_struct_rwlock
);
326 mutex_enter(&zf
->zf_lock
);
329 * Find matching prefetch stream. Depending on whether the accesses
330 * are block-aligned, first block of the new access may either follow
331 * the last block of the previous access, or be equal to it.
333 for (zs
= list_head(&zf
->zf_stream
); zs
!= NULL
;
334 zs
= list_next(&zf
->zf_stream
, zs
)) {
335 if (blkid
== zs
->zs_blkid
) {
337 } else if (blkid
+ 1 == zs
->zs_blkid
) {
345 * If the file is ending, remove the matching stream if found.
346 * If not found then it is too late to create a new one now.
348 if (end_of_access_blkid
>= maxblkid
) {
350 dmu_zfetch_stream_remove(zf
, zs
);
351 mutex_exit(&zf
->zf_lock
);
353 rw_exit(&zf
->zf_dnode
->dn_struct_rwlock
);
357 /* Exit if we already prefetched this block before. */
359 mutex_exit(&zf
->zf_lock
);
361 rw_exit(&zf
->zf_dnode
->dn_struct_rwlock
);
367 * This access is not part of any existing stream. Create
368 * a new stream for it.
370 dmu_zfetch_stream_create(zf
, end_of_access_blkid
);
371 mutex_exit(&zf
->zf_lock
);
373 rw_exit(&zf
->zf_dnode
->dn_struct_rwlock
);
374 ZFETCHSTAT_BUMP(zfetchstat_misses
);
379 * This access was to a block that we issued a prefetch for on
380 * behalf of this stream. Issue further prefetches for this stream.
382 * Normally, we start prefetching where we stopped
383 * prefetching last (zs_pf_blkid). But when we get our first
384 * hit on this stream, zs_pf_blkid == zs_blkid, we don't
385 * want to prefetch the block we just accessed. In this case,
386 * start just after the block we just accessed.
388 pf_start
= MAX(zs
->zs_pf_blkid
, end_of_access_blkid
);
389 if (zs
->zs_pf_blkid1
< end_of_access_blkid
)
390 zs
->zs_pf_blkid1
= end_of_access_blkid
;
391 if (zs
->zs_ipf_blkid1
< end_of_access_blkid
)
392 zs
->zs_ipf_blkid1
= end_of_access_blkid
;
395 * Double our amount of prefetched data, but don't let the
396 * prefetch get further ahead than zfetch_max_distance.
400 zfetch_max_distance
>> zf
->zf_dnode
->dn_datablkshift
;
402 * Previously, we were (zs_pf_blkid - blkid) ahead. We
403 * want to now be double that, so read that amount again,
404 * plus the amount we are catching up by (i.e. the amount
407 pf_ahead_blks
= zs
->zs_pf_blkid
- blkid
+ nblks
;
408 max_blks
= max_dist_blks
- (pf_start
- end_of_access_blkid
);
409 pf_nblks
= MIN(pf_ahead_blks
, max_blks
);
414 zs
->zs_pf_blkid
= pf_start
+ pf_nblks
;
417 * Do the same for indirects, starting from where we stopped last,
418 * or where we will stop reading data blocks (and the indirects
419 * that point to them).
421 ipf_start
= MAX(zs
->zs_ipf_blkid
, zs
->zs_pf_blkid
);
422 max_dist_blks
= zfetch_max_idistance
>> zf
->zf_dnode
->dn_datablkshift
;
424 * We want to double our distance ahead of the data prefetch
425 * (or reader, if we are not prefetching data). Previously, we
426 * were (zs_ipf_blkid - blkid) ahead. To double that, we read
427 * that amount again, plus the amount we are catching up by
428 * (i.e. the amount read now + the amount of data prefetched now).
430 pf_ahead_blks
= zs
->zs_ipf_blkid
- blkid
+ nblks
+ pf_nblks
;
431 max_blks
= max_dist_blks
- (ipf_start
- zs
->zs_pf_blkid
);
432 ipf_nblks
= MIN(pf_ahead_blks
, max_blks
);
433 zs
->zs_ipf_blkid
= ipf_start
+ ipf_nblks
;
435 zs
->zs_blkid
= end_of_access_blkid
;
436 /* Protect the stream from reclamation. */
437 zs
->zs_atime
= gethrtime();
438 zfs_refcount_add(&zs
->zs_refs
, NULL
);
439 /* Count concurrent callers. */
440 zfs_refcount_add(&zs
->zs_callers
, NULL
);
441 mutex_exit(&zf
->zf_lock
);
444 rw_exit(&zf
->zf_dnode
->dn_struct_rwlock
);
446 ZFETCHSTAT_BUMP(zfetchstat_hits
);
451 dmu_zfetch_run(zstream_t
*zs
, boolean_t missed
, boolean_t have_lock
)
453 zfetch_t
*zf
= zs
->zs_fetch
;
454 int64_t pf_start
, pf_end
, ipf_start
, ipf_end
;
458 zs
->zs_missed
= missed
;
461 * Postpone the prefetch if there are more concurrent callers.
462 * It happens when multiple requests are waiting for the same
463 * indirect block. The last one will run the prefetch for all.
465 if (zfs_refcount_remove(&zs
->zs_callers
, NULL
) != 0) {
466 /* Drop reference taken in dmu_zfetch_prepare(). */
467 if (zfs_refcount_remove(&zs
->zs_refs
, NULL
) == 0)
468 dmu_zfetch_stream_fini(zs
);
472 mutex_enter(&zf
->zf_lock
);
474 pf_start
= zs
->zs_pf_blkid1
;
475 pf_end
= zs
->zs_pf_blkid1
= zs
->zs_pf_blkid
;
477 pf_start
= pf_end
= 0;
479 ipf_start
= MAX(zs
->zs_pf_blkid1
, zs
->zs_ipf_blkid1
);
480 ipf_end
= zs
->zs_ipf_blkid1
= zs
->zs_ipf_blkid
;
481 mutex_exit(&zf
->zf_lock
);
482 ASSERT3S(pf_start
, <=, pf_end
);
483 ASSERT3S(ipf_start
, <=, ipf_end
);
485 epbs
= zf
->zf_dnode
->dn_indblkshift
- SPA_BLKPTRSHIFT
;
486 ipf_start
= P2ROUNDUP(ipf_start
, 1 << epbs
) >> epbs
;
487 ipf_end
= P2ROUNDUP(ipf_end
, 1 << epbs
) >> epbs
;
488 ASSERT3S(ipf_start
, <=, ipf_end
);
489 issued
= pf_end
- pf_start
+ ipf_end
- ipf_start
;
491 /* More references on top of taken in dmu_zfetch_prepare(). */
492 for (int i
= 0; i
< issued
- 1; i
++)
493 zfs_refcount_add(&zs
->zs_refs
, NULL
);
494 } else if (issued
== 0) {
495 /* Some other thread has done our work, so drop the ref. */
496 if (zfs_refcount_remove(&zs
->zs_refs
, NULL
) == 0)
497 dmu_zfetch_stream_fini(zs
);
502 rw_enter(&zf
->zf_dnode
->dn_struct_rwlock
, RW_READER
);
505 for (int64_t blk
= pf_start
; blk
< pf_end
; blk
++) {
506 issued
+= dbuf_prefetch_impl(zf
->zf_dnode
, 0, blk
,
507 ZIO_PRIORITY_ASYNC_READ
, ARC_FLAG_PREDICTIVE_PREFETCH
,
508 dmu_zfetch_stream_done
, zs
);
510 for (int64_t iblk
= ipf_start
; iblk
< ipf_end
; iblk
++) {
511 issued
+= dbuf_prefetch_impl(zf
->zf_dnode
, 1, iblk
,
512 ZIO_PRIORITY_ASYNC_READ
, ARC_FLAG_PREDICTIVE_PREFETCH
,
513 dmu_zfetch_stream_done
, zs
);
517 rw_exit(&zf
->zf_dnode
->dn_struct_rwlock
);
520 ZFETCHSTAT_ADD(zfetchstat_io_issued
, issued
);
524 dmu_zfetch(zfetch_t
*zf
, uint64_t blkid
, uint64_t nblks
, boolean_t fetch_data
,
525 boolean_t missed
, boolean_t have_lock
)
529 zs
= dmu_zfetch_prepare(zf
, blkid
, nblks
, fetch_data
, have_lock
);
531 dmu_zfetch_run(zs
, missed
, have_lock
);
535 ZFS_MODULE_PARAM(zfs_prefetch
, zfs_prefetch_
, disable
, INT
, ZMOD_RW
,
536 "Disable all ZFS prefetching");
538 ZFS_MODULE_PARAM(zfs_prefetch
, zfetch_
, max_streams
, UINT
, ZMOD_RW
,
539 "Max number of streams per zfetch");
541 ZFS_MODULE_PARAM(zfs_prefetch
, zfetch_
, min_sec_reap
, UINT
, ZMOD_RW
,
542 "Min time before stream reclaim");
544 ZFS_MODULE_PARAM(zfs_prefetch
, zfetch_
, max_distance
, UINT
, ZMOD_RW
,
545 "Max bytes to prefetch per stream");
547 ZFS_MODULE_PARAM(zfs_prefetch
, zfetch_
, max_idistance
, UINT
, ZMOD_RW
,
548 "Max bytes to prefetch indirects for per stream");
550 ZFS_MODULE_PARAM(zfs_prefetch
, zfetch_
, array_rd_sz
, ULONG
, ZMOD_RW
,
551 "Number of bytes in a array_read");