/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
 */

#ifndef _SYS_METASLAB_IMPL_H
#define	_SYS_METASLAB_IMPL_H

#include <sys/metaslab.h>
#include <sys/space_map.h>
#include <sys/range_tree.h>
#include <sys/vdev.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/multilist.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Metaslab allocation tracing record.
 */
typedef struct metaslab_alloc_trace {
	list_node_t		mat_list_node;
	metaslab_group_t	*mat_mg;
	metaslab_t		*mat_msp;
	uint64_t		mat_size;
	uint64_t		mat_weight;
	uint32_t		mat_dva_id;
	uint64_t		mat_offset;
	int			mat_allocator;
} metaslab_alloc_trace_t;

/*
 * Used by the metaslab allocation tracing facility to indicate
 * error conditions. These errors are stored to the offset member
 * of the metaslab_alloc_trace_t record and displayed by mdb.
 */
typedef enum trace_alloc_type {
	TRACE_ALLOC_FAILURE	= -1ULL,
	TRACE_TOO_SMALL		= -2ULL,
	TRACE_FORCE_GANG	= -3ULL,
	TRACE_NOT_ALLOCATABLE	= -4ULL,
	TRACE_GROUP_FAILURE	= -5ULL,
	TRACE_ENOSPC		= -6ULL,
	TRACE_CONDENSING	= -7ULL,
	TRACE_VDEV_ERROR	= -8ULL,
	TRACE_DISABLED		= -9ULL,
} trace_alloc_type_t;

#define	METASLAB_WEIGHT_PRIMARY		(1ULL << 63)
#define	METASLAB_WEIGHT_SECONDARY	(1ULL << 62)
#define	METASLAB_WEIGHT_CLAIM		(1ULL << 61)
#define	METASLAB_WEIGHT_TYPE		(1ULL << 60)
#define	METASLAB_ACTIVE_MASK		\
	(METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \
	METASLAB_WEIGHT_CLAIM)

/*
 * The metaslab weight is used to encode the amount of free space in a
 * metaslab, such that the "best" metaslab appears first when sorting the
 * metaslabs by weight. The weight (and therefore the "best" metaslab) can
 * be determined in two different ways: by computing a weighted sum of all
 * the free space in the metaslab (a space-based weight) or by counting only
 * the free segments of the largest size (a segment-based weight). We prefer
 * the segment-based weight because it reflects how the free space is
 * composed, but we cannot always use it -- legacy pools do not have the
 * space map histogram information necessary to determine the largest
 * contiguous regions. Pools that have the space map histogram determine
 * the segment weight by looking at each bucket in the histogram and
 * determining the free space whose size in bytes is in the range:
 *	[2^i, 2^(i+1))
 * We then encode the largest index, i, that contains regions into the
 * segment-weighted value.
 *
 * Space-based weight:
 *
 *      64      56      48      40      32      24      16      8       0
 *      +-------+-------+-------+-------+-------+-------+-------+-------+
 *      |PSC1|                 weighted-free space                      |
 *      +-------+-------+-------+-------+-------+-------+-------+-------+
 *
 *	PS - indicates primary and secondary activation
 *	C - indicates activation for claimed block zio
 *	space - the fragmentation-weighted space
 *
 * Segment-based weight:
 *
 *      64      56      48      40      32      24      16      8       0
 *      +-------+-------+-------+-------+-------+-------+-------+-------+
 *      |PSC0| idx|             count of segments in region             |
 *      +-------+-------+-------+-------+-------+-------+-------+-------+
 *
 *	PS - indicates primary and secondary activation
 *	C - indicates activation for claimed block zio
 *	idx - index for the highest bucket in the histogram
 *	count - number of segments in the specified bucket
 */
#define	WEIGHT_GET_ACTIVE(weight)	BF64_GET((weight), 61, 3)
#define	WEIGHT_SET_ACTIVE(weight, x)	BF64_SET((weight), 61, 3, x)

#define	WEIGHT_IS_SPACEBASED(weight)	\
	((weight) == 0 || BF64_GET((weight), 60, 1))
#define	WEIGHT_SET_SPACEBASED(weight)	BF64_SET((weight), 60, 1, 1)

/*
 * These macros are only applicable to segment-based weighting.
 */
#define	WEIGHT_GET_INDEX(weight)	BF64_GET((weight), 54, 6)
#define	WEIGHT_SET_INDEX(weight, x)	BF64_SET((weight), 54, 6, x)
#define	WEIGHT_GET_COUNT(weight)	BF64_GET((weight), 0, 54)
#define	WEIGHT_SET_COUNT(weight, x)	BF64_SET((weight), 0, 54, x)
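
/*
 * Illustrative sketch (not part of the interface) of how a segment-based
 * weight is packed and unpacked with the macros above; the variable name
 * below is hypothetical.
 *
 *	uint64_t w = 0;
 *	WEIGHT_SET_INDEX(w, 16);	// largest bucket with free
 *					// segments: [2^16, 2^17)
 *	WEIGHT_SET_COUNT(w, 12);	// 12 segments in that bucket
 *	WEIGHT_SET_ACTIVE(w, 0);	// not activated by any allocator
 *	// WEIGHT_IS_SPACEBASED(w) is B_FALSE here: bit 60 is clear and
 *	// w is nonzero, so w sorts as a segment-based weight.
 */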

/*
 * Per-allocator data structure.
 */
typedef struct metaslab_class_allocator {
	metaslab_group_t	*mca_rotor;
	uint64_t		mca_aliquot;

	/*
	 * The allocation throttle works on a reservation system. Whenever
	 * an asynchronous zio wants to perform an allocation it must
	 * first reserve the number of blocks that it wants to allocate.
	 * If there aren't sufficient slots available for the pending zio
	 * then that I/O is throttled until more slots free up. The current
	 * number of reserved allocations is maintained by the mca_alloc_slots
	 * refcount. The mca_alloc_max_slots value determines the maximum
	 * number of allocations that the system allows. Gang blocks are
	 * allowed to reserve slots even if we've reached the maximum
	 * number of allocations allowed.
	 */
	uint64_t		mca_alloc_max_slots;
	zfs_refcount_t		mca_alloc_slots;
} ____cacheline_aligned metaslab_class_allocator_t;
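
/*
 * A minimal sketch of the reservation check described above, assuming the
 * caller holds the class lock; this is illustrative, not the actual
 * reservation routine (see metaslab_class_throttle_reserve()).
 *
 *	uint64_t reserved = zfs_refcount_count(&mca->mca_alloc_slots);
 *	if (reserved + slots <= mca->mca_alloc_max_slots || is_gang) {
 *		// Gang blocks may exceed the maximum; each slot is
 *		// tracked against the zio so it can be released when
 *		// the allocation completes.
 *		for (int d = 0; d < slots; d++)
 *			zfs_refcount_add(&mca->mca_alloc_slots, zio);
 *	} else {
 *		// throttled: the zio must wait until slots free up
 *	}
 */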

/*
 * A metaslab class encompasses a category of allocatable top-level vdevs.
 * Each top-level vdev is associated with a metaslab group which defines
 * the allocatable region for that vdev. Examples of these categories include
 * "normal" for data block allocations (i.e. main pool allocations) or "log"
 * for allocations designated for intent log devices (i.e. slog devices).
 * When a block allocation is requested from the SPA it is associated with a
 * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
 * to the class can be used to satisfy that request. Allocations are done
 * by traversing the metaslab groups that are linked off of the mca_rotor
 * field. This rotor points to the next metaslab group where allocations
 * will be attempted. Allocating a block is a 3-step process -- select the
 * metaslab group, select the metaslab, and then allocate the block. The
 * metaslab class defines the low-level block allocator that will be used
 * as the final step in allocation. These allocators are pluggable, allowing
 * each class to use a block allocator that best suits that class.
 */
struct metaslab_class {
	kmutex_t		mc_lock;
	spa_t			*mc_spa;
	const metaslab_ops_t	*mc_ops;

	/*
	 * Track the number of metaslab groups that have been initialized
	 * and can accept allocations. An initialized metaslab group is
	 * one that has been completely added to the config (i.e. we have
	 * updated the MOS config and the space has been added to the pool).
	 */
	uint64_t		mc_groups;

	/*
	 * Toggle to enable/disable the allocation throttle.
	 */
	boolean_t		mc_alloc_throttle_enabled;

	uint64_t		mc_alloc_groups; /* # of allocatable groups */

	uint64_t		mc_alloc;	/* total allocated space */
	uint64_t		mc_deferred;	/* total deferred frees */
	uint64_t		mc_space;	/* total space (alloc + free) */
	uint64_t		mc_dspace;	/* total deflated space */
	uint64_t		mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];

	/*
	 * List of all loaded metaslabs in the class, sorted in order of most
	 * recent use.
	 */
	multilist_t		mc_metaslab_txg_list;

	metaslab_class_allocator_t	mc_allocator[];
};
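
/*
 * A minimal sketch, for illustration only, of how an allocation walks the
 * rotor described above: start at the group the rotor points to and advance
 * through the circular list until a group accepts the allocation. Names
 * other than the struct fields are hypothetical.
 *
 *	metaslab_group_t *rotor = mca->mca_rotor;
 *	metaslab_group_t *mg = rotor;
 *	do {
 *		if (try_allocate_from_group(mg))	// hypothetical
 *			break;				// allocation done
 *		mg = mg->mg_next;	// circular list of groups
 *	} while (mg != rotor);
 */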

/*
 * Per-allocator data structure.
 */
typedef struct metaslab_group_allocator {
	uint64_t	mga_cur_max_alloc_queue_depth;
	zfs_refcount_t	mga_alloc_queue_depth;
	metaslab_t	*mga_primary;
	metaslab_t	*mga_secondary;
} metaslab_group_allocator_t;

/*
 * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
 * of a top-level vdev. They are linked together to form a circular linked
 * list and can belong to only one metaslab class. Metaslab groups may become
 * ineligible for allocations for a number of reasons such as limited free
 * space, fragmentation, or going offline. When this happens the allocator will
 * simply find the next metaslab group in the linked list and attempt
 * to allocate from that group instead.
 */
struct metaslab_group {
	kmutex_t		mg_lock;
	avl_tree_t		mg_metaslab_tree;
	uint64_t		mg_aliquot;
	boolean_t		mg_allocatable;		/* can we allocate? */
	uint64_t		mg_ms_ready;

	/*
	 * A metaslab group is considered to be initialized only after
	 * we have updated the MOS config and added the space to the pool.
	 * We only allow allocation attempts to a metaslab group if it
	 * has been initialized.
	 */
	boolean_t		mg_initialized;

	uint64_t		mg_free_capacity;	/* percentage free */
	int64_t			mg_bias;
	int64_t			mg_activation_count;
	metaslab_class_t	*mg_class;
	vdev_t			*mg_vd;
	taskq_t			*mg_taskq;
	metaslab_group_t	*mg_prev;
	metaslab_group_t	*mg_next;

	/*
	 * In order for the allocation throttle to function properly, we cannot
	 * have too many IOs going to each disk by default; the throttle
	 * operates by allocating more work to disks that finish quickly, so
	 * allocating larger chunks to each disk reduces its effectiveness.
	 * However, if the number of IOs going to each allocator is too small,
	 * we will not perform proper aggregation at the vdev_queue layer,
	 * also resulting in decreased performance. Therefore, we will use a
	 * ramp-up strategy.
	 *
	 * Each allocator in each metaslab group has a current queue depth
	 * (mga_alloc_queue_depth) and a current max queue depth
	 * (mga_cur_max_alloc_queue_depth), and each metaslab group has an
	 * absolute max queue depth (mg_max_alloc_queue_depth). We add IOs
	 * to an allocator until the mga_alloc_queue_depth for that allocator
	 * hits the cur_max. Every time an IO completes for a given allocator
	 * on a given metaslab group, we increment its cur_max until it
	 * reaches mg_max_alloc_queue_depth. The cur_max resets every txg to
	 * help protect against disks that decrease in performance over time.
	 *
	 * It's possible for an allocator to handle more allocations than
	 * its max. This can occur when gang blocks are required or when other
	 * groups are unable to handle their share of allocations.
	 */
	uint64_t		mg_max_alloc_queue_depth;

	/*
	 * A metaslab group that can no longer allocate the minimum block
	 * size will set mg_no_free_space. Once a metaslab group is out
	 * of space then its share of work must be distributed to other
	 * groups.
	 */
	boolean_t		mg_no_free_space;

	uint64_t		mg_allocations;
	uint64_t		mg_failed_allocations;
	uint64_t		mg_fragmentation;
	uint64_t		mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];

	int			mg_ms_disabled;
	boolean_t		mg_disabled_updating;
	kmutex_t		mg_ms_disabled_lock;
	kcondvar_t		mg_ms_disabled_cv;

	int			mg_allocators;
	metaslab_group_allocator_t	mg_allocator[];
};
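
/*
 * Illustrative sketch of the ramp-up described above; this mirrors the
 * logic rather than quoting the actual IO completion path. Names other
 * than the struct fields are hypothetical.
 *
 *	// On IO completion for allocator `a` of group `mg`:
 *	metaslab_group_allocator_t *mga = &mg->mg_allocator[a];
 *	uint64_t cur = mga->mga_cur_max_alloc_queue_depth;
 *	if (cur < mg->mg_max_alloc_queue_depth)
 *		mga->mga_cur_max_alloc_queue_depth = cur + 1;
 *	// New IOs are admitted only while the allocator's queue depth
 *	// (the mga_alloc_queue_depth refcount) is below cur_max; cur_max
 *	// is reset at the start of every txg.
 */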

/*
 * This value defines the number of elements in the ms_lbas array. The value
 * of 64 was chosen as it covers all power-of-2 buckets up to UINT64_MAX.
 * This is the equivalent of highbit(UINT64_MAX).
 */
#define	MAX_LBAS	64

/*
 * Each metaslab maintains a set of in-core trees to track metaslab
 * operations. The in-core free tree (ms_allocatable) contains the list of
 * free segments which are eligible for allocation. As blocks are
 * allocated, the allocated segments are removed from the ms_allocatable and
 * added to a per-txg allocation tree (ms_allocating). As blocks are
 * freed, they are added to the free tree (ms_freeing). These trees
 * allow us to process all allocations and frees in syncing context
 * where it is safe to update the on-disk space maps. An additional set
 * of in-core trees is maintained to track deferred frees
 * (ms_defer). Once a block is freed it will move from the
 * ms_freed to the ms_defer tree. A deferred free means that a block
 * has been freed but cannot be used by the pool until TXG_DEFER_SIZE
 * transaction groups later. For example, a block that is freed in txg
 * 50 will not be available for reallocation until txg 52 (50 +
 * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback.
 * A pool could be safely rolled back TXG_DEFER_SIZE transaction
 * groups and ensure that no block has been reallocated.
 *
 * The simplified transition diagram looks like this:
 *
 *
 *      ALLOCATE
 *         |
 *         V
 *    free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map)
 *         ^
 *         |                          ms_freeing <--- FREE
 *         |                               |
 *         |                               v
 *         |                           ms_freed
 *         |                               |
 *         +-------- ms_defer[2] <---------+-------> (write to space map)
 *
 *
 * Each metaslab's space is tracked in a single space map in the MOS,
 * which is only updated in syncing context. Each time we sync a txg,
 * we append the allocs and frees from that txg to the space map. The
 * pool space is only updated once all metaslabs have finished syncing.
 *
 * To load the in-core free tree we read the space map from disk. This
 * object contains a series of alloc and free records that are combined
 * to make up the list of all free segments in this metaslab. These
 * segments are represented in-core by the ms_allocatable and are stored
 * in an AVL tree.
 *
 * As the space map grows (as a result of the appends) it will
 * eventually become space-inefficient. When the metaslab's in-core
 * free tree is zfs_condense_pct/100 times the size of the minimal
 * on-disk representation, we rewrite it in its minimized form. If a
 * metaslab needs to condense then we must set the ms_condensing flag to
 * ensure that allocations are not performed on the metaslab that is
 * being written.
 */
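/*
 * A minimal sketch of the condense test described above, assuming the
 * minimal on-disk size can be estimated from the in-core free tree; see
 * metaslab_should_condense() for the real check. The `optimal_size`
 * estimate below is illustrative only.
 *
 *	uint64_t optimal_size = 2 * sizeof (uint64_t) *
 *	    range_tree_numsegs(msp->ms_allocatable);
 *	if (space_map_length(msp->ms_sm) >
 *	    optimal_size * zfs_condense_pct / 100) {
 *		msp->ms_condensing = B_TRUE;	// block allocations
 *		// ... rewrite the space map in its minimal form ...
 *	}
 */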
struct metaslab {
	/*
	 * This is the main lock of the metaslab and its purpose is to
	 * coordinate our allocations and frees [e.g., metaslab_block_alloc(),
	 * metaslab_free_concrete(), etc.] with our various syncing
	 * procedures [e.g., metaslab_sync(), metaslab_sync_done(), etc.].
	 *
	 * The lock is also used during some miscellaneous operations like
	 * using the metaslab's histogram for the metaslab group's histogram
	 * aggregation, or marking the metaslab for initialization.
	 */
	kmutex_t	ms_lock;

	/*
	 * Acquired together with the ms_lock whenever we expect to
	 * write to metaslab data on-disk (i.e., flushing entries to
	 * the metaslab's space map). It helps coordinate readers of
	 * the metaslab's space map [see spa_vdev_remove_thread()]
	 * with writers [see metaslab_sync() or metaslab_flush()].
	 *
	 * Note that metaslab_load(), even though a reader, uses
	 * a completely different mechanism to deal with the reading
	 * of the metaslab's space map based on ms_synced_length. That
	 * said, the function still uses the ms_sync_lock after it
	 * has read the ms_sm [see relevant comment in metaslab_load()
	 * as to why].
	 */
	kmutex_t	ms_sync_lock;

	kcondvar_t	ms_load_cv;
	space_map_t	*ms_sm;
	uint64_t	ms_id;
	uint64_t	ms_start;
	uint64_t	ms_size;
	uint64_t	ms_fragmentation;

	range_tree_t	*ms_allocating[TXG_SIZE];
	range_tree_t	*ms_allocatable;
	uint64_t	ms_allocated_this_txg;
	uint64_t	ms_allocating_total;

	/*
	 * The following range trees are accessed only from syncing context.
	 * ms_free*tree only have entries while syncing, and are empty
	 * between syncs.
	 */
	range_tree_t	*ms_freeing;	/* to free this syncing txg */
	range_tree_t	*ms_freed;	/* already freed this syncing txg */
	range_tree_t	*ms_defer[TXG_DEFER_SIZE];
	range_tree_t	*ms_checkpointing; /* to add to the checkpoint */

	/*
	 * The ms_trim tree is the set of allocatable segments which are
	 * eligible for trimming. (When the metaslab is loaded, it's a
	 * subset of ms_allocatable.) It's kept in-core as long as the
	 * autotrim property is set and is not vacated when the metaslab
	 * is unloaded. Its purpose is to aggregate freed ranges to
	 * facilitate efficient trimming.
	 */
	range_tree_t	*ms_trim;

	boolean_t	ms_condensing;	/* condensing? */
	boolean_t	ms_condense_wanted;

	/*
	 * The number of consumers which have disabled the metaslab.
	 */
	uint64_t	ms_disabled;

	/*
	 * We must always hold the ms_lock when modifying ms_loaded
	 * and ms_loading.
	 */
	boolean_t	ms_loaded;
	boolean_t	ms_loading;
	kcondvar_t	ms_flush_cv;
	boolean_t	ms_flushing;

	/*
	 * The following histograms count entries that are in the
	 * metaslab's space map (and its histogram) but are not in
	 * ms_allocatable yet, because they are in ms_freed, ms_freeing,
	 * or ms_defer[].
	 *
	 * When the metaslab is not loaded, its ms_weight needs to
	 * reflect what is allocatable (i.e. what will be part of
	 * ms_allocatable if it is loaded). The weight is computed from
	 * the spacemap histogram, but that includes ranges that are
	 * not yet allocatable (because they are in ms_freed,
	 * ms_freeing, or ms_defer[]). Therefore, when calculating the
	 * weight, we need to remove those ranges.
	 *
	 * The ranges in the ms_freed and ms_defer[] range trees are all
	 * present in the spacemap. However, the spacemap may have
	 * multiple entries to represent a contiguous range, because it
	 * is written across multiple sync passes, but the changes of
	 * all sync passes are consolidated into the range trees.
	 * Adjacent ranges that are freed in different sync passes of
	 * one txg will be represented separately (as 2 or more entries)
	 * in the space map (and its histogram), but these adjacent
	 * ranges will be consolidated (represented as one entry) in the
	 * ms_freed/ms_defer[] range trees (and their histograms).
	 *
	 * When calculating the weight, we cannot simply subtract the
	 * range trees' histograms from the spacemap's histogram,
	 * because the range trees' histograms may have entries in
	 * higher buckets than the spacemap, due to consolidation.
	 * Instead we must subtract the exact entries that were added to
	 * the spacemap's histogram. ms_synchist and ms_deferhist[]
	 * represent these exact entries, so we can subtract them from
	 * the spacemap's histogram when calculating ms_weight.
	 *
	 * ms_synchist represents the same ranges as ms_freeing +
	 * ms_freed, but without consolidation across sync passes.
	 *
	 * ms_deferhist[i] represents the same ranges as ms_defer[i],
	 * but without consolidation across sync passes.
	 */
	uint64_t	ms_synchist[SPACE_MAP_HISTOGRAM_SIZE];
	uint64_t	ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE];
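
	/*
	 * Illustrative sketch of the subtraction described above, not the
	 * actual weighting code (see metaslab_weight_from_spacemap()):
	 *
	 *	uint64_t hist[SPACE_MAP_HISTOGRAM_SIZE];
	 *	for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
	 *		hist[i] = msp->ms_sm->sm_phys->smp_histogram[i];
	 *		hist[i] -= msp->ms_synchist[i];
	 *		for (int t = 0; t < TXG_DEFER_SIZE; t++)
	 *			hist[i] -= msp->ms_deferhist[t][i];
	 *	}
	 *	// `hist` now reflects only the allocatable ranges and can
	 *	// be used to compute the segment-based weight.
	 */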

	/*
	 * Tracks the exact amount of allocated space of this metaslab
	 * (and specifically the metaslab's space map) up to the most
	 * recently completed sync pass [see usage in metaslab_sync()].
	 */
	uint64_t	ms_allocated_space;
	int64_t		ms_deferspace;	/* sum of ms_defermap[] space */
	uint64_t	ms_weight;	/* weight vs. others in group */
	uint64_t	ms_activation_weight;	/* activation weight */

	/*
	 * Tracks whenever a metaslab is selected for loading or allocation.
	 * We use this value to determine how long the metaslab should
	 * stay cached.
	 */
	uint64_t	ms_selected_txg;
	/*
	 * ms_load/unload_time can be used for performance monitoring
	 * (e.g. by dtrace or mdb).
	 */
	hrtime_t	ms_load_time;	/* time last loaded */
	hrtime_t	ms_unload_time;	/* time last unloaded */
	hrtime_t	ms_selected_time; /* time last allocated from */

	uint64_t	ms_alloc_txg;	/* last successful alloc (debug only) */
	uint64_t	ms_max_size;	/* maximum allocatable size */

	/*
	 * -1 if it's not active in an allocator, otherwise set to the
	 * allocator this metaslab is active for.
	 */
	int		ms_allocator;
	boolean_t	ms_primary;	/* Only valid if ms_allocator != -1 */

	/*
	 * The metaslab block allocators can optionally use a size-ordered
	 * range tree and/or an array of LBAs. Not all allocators use
	 * this functionality. The ms_allocatable_by_size should always
	 * contain the same number of segments as the ms_allocatable. The
	 * only difference is that the ms_allocatable_by_size is ordered by
	 * segment sizes.
	 */
	zfs_btree_t	ms_allocatable_by_size;
	zfs_btree_t	ms_unflushed_frees_by_size;
	uint64_t	ms_lbas[MAX_LBAS];

	metaslab_group_t *ms_group;	/* metaslab group */
	avl_node_t	ms_group_node;	/* node in metaslab group tree */
	txg_node_t	ms_txg_node;	/* per-txg dirty metaslab links */
	avl_node_t	ms_spa_txg_node; /* node in spa_metaslabs_by_txg */
	/*
	 * Node in metaslab class's selected txg list
	 */
	multilist_node_t	ms_class_txg_node;

	/*
	 * Allocs and frees that are committed to the vdev log spacemap but
	 * not yet to this metaslab's spacemap.
	 */
	range_tree_t	*ms_unflushed_allocs;
	range_tree_t	*ms_unflushed_frees;

	/*
	 * We have flushed entries up to but not including this TXG. In
	 * other words, all changes from this TXG and onward should not
	 * be in this metaslab's space map and must be read from the
	 * log space maps.
	 */
	uint64_t	ms_unflushed_txg;
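
	/*
	 * Illustrative sketch of how the unflushed trees combine with the
	 * space map on load; see metaslab_load() for the real sequence:
	 *
	 *	// 1. Populate ms_allocatable from the on-disk space map,
	 *	//    which only holds changes from txgs earlier than
	 *	//    ms_unflushed_txg.
	 *	// 2. Apply the log space map deltas on top of it:
	 *	range_tree_walk(msp->ms_unflushed_allocs,
	 *	    range_tree_remove, msp->ms_allocatable);
	 *	range_tree_walk(msp->ms_unflushed_frees,
	 *	    range_tree_add, msp->ms_allocatable);
	 */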

	/* updated every time we are done syncing the metaslab's space map */
	uint64_t	ms_synced_length;

	boolean_t	ms_new;
};

typedef struct metaslab_unflushed_phys {
	/* on-disk counterpart of ms_unflushed_txg */
	uint64_t	msp_unflushed_txg;
} metaslab_unflushed_phys_t;

#ifdef __cplusplus
}
#endif

#endif /* _SYS_METASLAB_IMPL_H */