]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
9babb374 | 22 | * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f | 23 | * Use is subject to license terms. |
55d85d5a GW |
24 | */ |
25 | ||
26 | /* | |
d2734cce | 27 | * Copyright (c) 2011, 2017 by Delphix. All rights reserved. |
34dc7c2f BB |
28 | */ |
29 | ||
30 | #ifndef _SYS_METASLAB_IMPL_H | |
31 | #define _SYS_METASLAB_IMPL_H | |
32 | ||
34dc7c2f BB |
33 | #include <sys/metaslab.h> |
34 | #include <sys/space_map.h> | |
93cf2076 | 35 | #include <sys/range_tree.h> |
34dc7c2f BB |
36 | #include <sys/vdev.h> |
37 | #include <sys/txg.h> | |
38 | #include <sys/avl.h> | |
39 | ||
40 | #ifdef __cplusplus | |
41 | extern "C" { | |
42 | #endif | |
43 | ||
4e21fd06 DB |
44 | /* |
45 | * Metaslab allocation tracing record. | |
46 | */ | |
47 | typedef struct metaslab_alloc_trace { | |
48 | list_node_t mat_list_node; | |
49 | metaslab_group_t *mat_mg; | |
50 | metaslab_t *mat_msp; | |
51 | uint64_t mat_size; | |
52 | uint64_t mat_weight; | |
53 | uint32_t mat_dva_id; | |
54 | uint64_t mat_offset; /* may hold a trace_alloc_type_t error value */ | |
55 | } metaslab_alloc_trace_t; | |
56 | ||
57 | /* | |
58 | * Used by the metaslab allocation tracing facility to indicate | |
59 | * error conditions. These errors are stored in the mat_offset member | |
60 | * of the metaslab_alloc_trace_t record and displayed by mdb. | |
61 | */ | |
62 | typedef enum trace_alloc_type { | |
63 | TRACE_ALLOC_FAILURE = -1ULL, | |
64 | TRACE_TOO_SMALL = -2ULL, | |
65 | TRACE_FORCE_GANG = -3ULL, | |
66 | TRACE_NOT_ALLOCATABLE = -4ULL, | |
67 | TRACE_GROUP_FAILURE = -5ULL, | |
68 | TRACE_ENOSPC = -6ULL, | |
69 | TRACE_CONDENSING = -7ULL, | |
70 | TRACE_VDEV_ERROR = -8ULL | |
71 | } trace_alloc_type_t; | |
72 | ||
73 | #define METASLAB_WEIGHT_PRIMARY (1ULL << 63) /* bit 63: primary activation */ | |
74 | #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) /* bit 62: secondary activation */ | |
75 | #define METASLAB_WEIGHT_TYPE (1ULL << 61) /* bit 61: set for space-based weights */ | |
76 | #define METASLAB_ACTIVE_MASK \ | |
77 | (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) | |
78 | ||
79 | /* | |
80 | * The metaslab weight is used to encode the amount of free space in a | |
81 | * metaslab, such that the "best" metaslab appears first when sorting the | |
82 | * metaslabs by weight. The weight (and therefore the "best" metaslab) can | |
83 | * be determined in two different ways: by computing a weighted sum of all | |
84 | * the free space in the metaslab (a space based weight) or by counting only | |
85 | * the free segments of the largest size (a segment based weight). We prefer | |
86 | * the segment based weight because it reflects how the free space is | |
87 | * comprised, but we cannot always use it -- legacy pools do not have the | |
88 | * space map histogram information necessary to determine the largest | |
89 | * contiguous regions. Pools that have the space map histogram determine | |
90 | * the segment weight by looking at each bucket in the histogram and | |
91 | * determining the free space whose size in bytes is in the range: | |
92 | * [2^i, 2^(i+1)) | |
93 | * We then encode the largest index, i, that contains regions into the | |
94 | * segment-weighted value. | |
95 | * | |
96 | * Space-based weight: | |
97 | * | |
98 | * 64 56 48 40 32 24 16 8 0 | |
99 | * +-------+-------+-------+-------+-------+-------+-------+-------+ | |
100 | * |PS1| weighted-free space | | |
101 | * +-------+-------+-------+-------+-------+-------+-------+-------+ | |
102 | * | |
103 | * PS - indicates primary and secondary activation | |
104 | * space - the fragmentation-weighted space | |
105 | * | |
106 | * Segment-based weight: | |
107 | * | |
108 | * 64 56 48 40 32 24 16 8 0 | |
109 | * +-------+-------+-------+-------+-------+-------+-------+-------+ | |
110 | * |PS0| idx| count of segments in region | | |
111 | * +-------+-------+-------+-------+-------+-------+-------+-------+ | |
112 | * | |
113 | * PS - indicates primary and secondary activation | |
114 | * idx - index for the highest bucket in the histogram | |
115 | * count - number of segments in the specified bucket | |
116 | */ | |
117 | #define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 62, 2) | |
118 | #define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 62, 2, x) | |
119 | ||
120 | #define WEIGHT_IS_SPACEBASED(weight) \ | |
121 | ((weight) == 0 || BF64_GET((weight), 61, 1)) | |
122 | #define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 61, 1, 1) | |
123 | ||
124 | /* | |
125 | * These macros are only applicable to segment-based weighting; they access the idx and count fields of the weight. | |
126 | */ | |
127 | #define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 55, 6) | |
128 | #define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 55, 6, x) | |
129 | #define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 55) | |
130 | #define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 55, x) | |
131 | ||
f3a7f661 GW |
132 | /* |
133 | * A metaslab class encompasses a category of allocatable top-level vdevs. | |
134 | * Each top-level vdev is associated with a metaslab group which defines | |
135 | * the allocatable region for that vdev. Examples of these categories include | |
136 | * "normal" for data block allocations (i.e. main pool allocations) or "log" | |
137 | * for allocations designated for intent log devices (i.e. slog devices). | |
138 | * When a block allocation is requested from the SPA it is associated with a | |
139 | * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging | |
140 | * to the class can be used to satisfy that request. Allocations are done | |
141 | * by traversing the metaslab groups that are linked off of the mc_rotor field. | |
142 | * This rotor points to the next metaslab group where allocations will be | |
143 | * attempted. Allocating a block is a 3 step process -- select the metaslab | |
144 | * group, select the metaslab, and then allocate the block. The metaslab | |
145 | * class defines the low-level block allocator that will be used as the | |
146 | * final step in allocation. These allocators are pluggable allowing each class | |
147 | * to use a block allocator that best suits that class. | |
148 | */ | |
34dc7c2f | 149 | struct metaslab_class {
3dfb57a3 | 150 | kmutex_t mc_lock;
428870ff | 151 | spa_t *mc_spa;
34dc7c2f | 152 | metaslab_group_t *mc_rotor; /* next group where allocations are attempted */
93cf2076 | 153 | metaslab_ops_t *mc_ops;
428870ff | 154 | uint64_t mc_aliquot;
3dfb57a3 DB |
155 | 
156 | /* | |
157 | * Track the number of metaslab groups that have been initialized | |
158 | * and can accept allocations. An initialized metaslab group is | |
159 | * one that has been completely added to the config (i.e. we have | |
160 | * updated the MOS config and the space has been added to the pool). | |
161 | */ | |
162 | uint64_t mc_groups; | |
163 | ||
164 | /* | |
165 | * Toggle to enable/disable the allocation throttle. | |
166 | */ | |
167 | boolean_t mc_alloc_throttle_enabled; | |
168 | ||
169 | /* | |
170 | * The allocation throttle works on a reservation system. Whenever | |
171 | * an asynchronous zio wants to perform an allocation it must | |
172 | * first reserve the number of blocks that it wants to allocate. | |
173 | * If there aren't sufficient slots available for the pending zio | |
174 | * then that I/O is throttled until more slots free up. The current | |
175 | * number of reserved allocations is maintained by the mc_alloc_slots | |
176 | * refcount. The mc_alloc_max_slots value determines the maximum | |
177 | * number of allocations that the system allows. Gang blocks are | |
178 | * allowed to reserve slots even if we've reached the maximum | |
179 | * number of allocations allowed. | |
180 | */ | |
181 | uint64_t mc_alloc_max_slots; | |
182 | refcount_t mc_alloc_slots; | |
183 | ||
ac72fac3 | 184 | uint64_t mc_alloc_groups; /* # of allocatable groups */
3dfb57a3 | 185 | 
428870ff BB |
186 | uint64_t mc_alloc; /* total allocated space */ |
187 | uint64_t mc_deferred; /* total deferred frees */ | |
188 | uint64_t mc_space; /* total space (alloc + free) */ | |
189 | uint64_t mc_dspace; /* total deflated space */ | |
f3a7f661 | 190 | uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
34dc7c2f BB |
191 | }; |
192 | ||
f3a7f661 GW |
193 | /* |
194 | * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs) | |
4e33ba4c | 195 | * of a top-level vdev. They are linked together to form a circular linked |
f3a7f661 GW |
196 | * list and can belong to only one metaslab class. Metaslab groups may become |
197 | * ineligible for allocations for a number of reasons such as limited free | |
198 | * space, fragmentation, or going offline. When this happens the allocator will | |
199 | * simply find the next metaslab group in the linked list and attempt | |
200 | * to allocate from that group instead. | |
201 | */ | |
34dc7c2f BB |
202 | struct metaslab_group { |
203 | kmutex_t mg_lock; | |
204 | avl_tree_t mg_metaslab_tree; | |
205 | uint64_t mg_aliquot; | |
ac72fac3 | 206 | boolean_t mg_allocatable; /* can we allocate? */
3dfb57a3 DB |
207 | 
208 | /* | |
209 | * A metaslab group is considered to be initialized only after | |
210 | * we have updated the MOS config and added the space to the pool. | |
211 | * We only allow allocation attempts to a metaslab group if it | |
212 | * has been initialized. | |
213 | */ | |
214 | boolean_t mg_initialized; | |
215 | ||
ac72fac3 | 216 | uint64_t mg_free_capacity; /* percentage free */
34dc7c2f | 217 | int64_t mg_bias;
428870ff | 218 | int64_t mg_activation_count;
34dc7c2f BB |
219 | metaslab_class_t *mg_class; |
220 | vdev_t *mg_vd; | |
93cf2076 | 221 | taskq_t *mg_taskq;
34dc7c2f BB |
222 | metaslab_group_t *mg_prev; |
223 | metaslab_group_t *mg_next; | |
3dfb57a3 DB |
224 | 
225 | /* | |
226 | * Each metaslab group can handle mg_max_alloc_queue_depth allocations | |
227 | * which are tracked by mg_alloc_queue_depth. It's possible for a | |
228 | * metaslab group to handle more allocations than its max. This | |
229 | * can occur when gang blocks are required or when other groups | |
230 | * are unable to handle their share of allocations. | |
231 | */ | |
232 | uint64_t mg_max_alloc_queue_depth; | |
233 | refcount_t mg_alloc_queue_depth; | |
234 | ||
235 | /* | |
236 | * A metaslab group that can no longer allocate the minimum block | |
237 | * size will set mg_no_free_space. Once a metaslab group is out | |
238 | * of space then its share of work must be distributed to other | |
239 | * groups. | |
240 | */ | |
241 | boolean_t mg_no_free_space; | |
242 | ||
243 | uint64_t mg_allocations; | |
244 | uint64_t mg_failed_allocations; | |
f3a7f661 GW |
245 | uint64_t mg_fragmentation; |
246 | uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; | |
34dc7c2f BB |
247 | }; |
248 | ||
249 | /* | |
93cf2076 | 250 | * This value defines the number of elements in the ms_lbas array. The value |
f3a7f661 GW |
251 | * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. |
252 | * This is the equivalent of highbit(UINT64_MAX). | |
93cf2076 GW |
253 | */ |
254 | #define MAX_LBAS 64 | |
255 | ||
256 | /* | |
258553d3 | 257 | * Each metaslab maintains a set of in-core trees to track metaslab |
d2734cce | 258 | * operations. The in-core free tree (ms_allocatable) contains the list of |
258553d3 | 259 | * free segments which are eligible for allocation. As blocks are |
d2734cce SD |
260 | * allocated, the allocated segments are removed from the ms_allocatable and |
261 | * added to a per txg allocation tree (ms_allocating). As blocks are | |
262 | * freed, they are added to the free tree (ms_freeing). These trees | |
a1d477c2 MA |
263 | * allow us to process all allocations and frees in syncing context |
264 | * where it is safe to update the on-disk space maps. An additional set | |
265 | * of in-core trees is maintained to track deferred frees | |
d2734cce SD |
266 | * (ms_defer). Once a block is freed it will move from the |
267 | * ms_freed to the ms_defer tree. A deferred free means that a block | |
258553d3 TC |
268 | * has been freed but cannot be used by the pool until TXG_DEFER_SIZE |
269 | * transaction groups later. For example, a block that is freed in txg | |
270 | * 50 will not be available for reallocation until txg 52 (50 + | |
271 | * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback. | |
272 | * A pool could be safely rolled back TXG_DEFER_SIZE transaction | |
273 | * groups and ensure that no block has been reallocated. | |
93cf2076 GW |
274 | * |
275 | * The simplified transition diagram looks like this: | |
276 | * | |
277 | * | |
278 | * ALLOCATE | |
279 | * | | |
280 | * V | |
d2734cce | 281 | * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map) |
93cf2076 | 282 | * ^ |
d2734cce SD |
283 | * | ms_freeing <--- FREE |
284 | * | | | |
285 | * | v | |
286 | * | ms_freed | |
287 | * | | | |
288 | * +-------- ms_defer[2] <-------+-------> (write to space map) | |
e51be066 | 289 | * |
93cf2076 GW |
290 | * |
291 | * Each metaslab's space is tracked in a single space map in the MOS, | |
258553d3 TC |
292 | * which is only updated in syncing context. Each time we sync a txg, |
293 | * we append the allocs and frees from that txg to the space map. The | |
294 | * pool space is only updated once all metaslabs have finished syncing. | |
e51be066 | 295 | * |
258553d3 TC |
296 | * To load the in-core free tree we read the space map from disk. This |
297 | * object contains a series of alloc and free records that are combined | |
298 | * to make up the list of all free segments in this metaslab. These | |
d2734cce SD |
299 | * segments are represented in-core by the ms_allocatable and are stored |
300 | * in an AVL tree. | |
e51be066 | 301 | * |
93cf2076 | 302 | * As the space map grows (as a result of the appends) it will |
258553d3 TC |
303 | * eventually become space-inefficient. When the metaslab's in-core |
304 | * free tree is zfs_condense_pct/100 times the size of the minimal | |
305 | * on-disk representation, we rewrite it in its minimized form. If a | |
306 | * metaslab needs to condense then we must set the ms_condensing flag to | |
307 | * ensure that allocations are not performed on the metaslab that is | |
308 | * being written. | |
34dc7c2f BB |
309 | */ |
310 | struct metaslab { | |
93cf2076 | 311 | kmutex_t ms_lock;
a1d477c2 | 312 | kmutex_t ms_sync_lock;
93cf2076 GW |
313 | kcondvar_t ms_load_cv; |
314 | space_map_t *ms_sm; | |
93cf2076 GW |
315 | uint64_t ms_id; |
316 | uint64_t ms_start; | |
317 | uint64_t ms_size; | |
f3a7f661 | 318 | uint64_t ms_fragmentation;
93cf2076 | 319 | 
d2734cce SD |
320 | range_tree_t *ms_allocating[TXG_SIZE]; |
321 | range_tree_t *ms_allocatable; | |
93cf2076 | 322 | 
258553d3 TC |
323 | /* |
324 | * The following range trees are accessed only from syncing context. | |
325 | * ms_freeing and ms_freed only have entries while syncing, and are empty | |
326 | * between syncs. | |
327 | */ | |
d2734cce SD |
328 | range_tree_t *ms_freeing; /* to free this syncing txg */ |
329 | range_tree_t *ms_freed; /* already freed this syncing txg */ | |
330 | range_tree_t *ms_defer[TXG_DEFER_SIZE]; | |
331 | range_tree_t *ms_checkpointing; /* to add to the checkpoint */ | |
258553d3 | 332 | 
93cf2076 | 333 | boolean_t ms_condensing; /* condensing? */
f3a7f661 | 334 | boolean_t ms_condense_wanted;
d2734cce | 335 | uint64_t ms_condense_checked_txg;
4e21fd06 DB |
336 | 
337 | /* | |
338 | * We must hold both ms_lock and ms_group->mg_lock in order to | |
339 | * modify ms_loaded. | |
340 | */ | |
93cf2076 GW |
341 | boolean_t ms_loaded; |
342 | boolean_t ms_loading; | |
343 | ||
428870ff | 344 | int64_t ms_deferspace; /* sum of ms_defer[] space */
34dc7c2f | 345 | uint64_t ms_weight; /* weight vs. others in group */
4e21fd06 DB |
346 | uint64_t ms_activation_weight; /* activation weight */ |
347 | ||
348 | /* | |
349 | * Keep track of whenever a metaslab is selected for loading or allocation. | |
350 | * We use this value to determine how long the metaslab should | |
351 | * stay cached. | |
352 | */ | |
353 | uint64_t ms_selected_txg; | |
354 | ||
355 | uint64_t ms_alloc_txg; /* last successful alloc (debug only) */ | |
356 | uint64_t ms_max_size; /* maximum allocatable size */ | |
93cf2076 GW |
357 | 
358 | /* | |
359 | * The metaslab block allocators can optionally use a size-ordered | |
360 | * range tree and/or an array of LBAs. Not all allocators use | |
d2734cce SD |
361 | * this functionality. The ms_allocatable_by_size should always |
362 | * contain the same number of segments as the ms_allocatable. The | |
363 | * only difference is that the ms_allocatable_by_size is ordered by | |
364 | * segment sizes. | |
93cf2076 | 365 | */
d2734cce | 366 | avl_tree_t ms_allocatable_by_size;
93cf2076 GW |
367 | uint64_t ms_lbas[MAX_LBAS]; |
368 | ||
34dc7c2f BB |
369 | metaslab_group_t *ms_group; /* metaslab group */ |
370 | avl_node_t ms_group_node; /* node in metaslab group tree */ | |
371 | txg_node_t ms_txg_node; /* per-txg dirty metaslab links */ | |
372 | }; | |
373 | ||
374 | #ifdef __cplusplus | |
375 | } | |
376 | #endif | |
377 | ||
378 | #endif /* _SYS_METASLAB_IMPL_H */ |