]>
Commit | Line | Data |
---|---|---|
93e28d66 SD |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | /* | |
23 | * Copyright (c) 2018, 2019 by Delphix. All rights reserved. | |
24 | */ | |
25 | ||
26 | #include <sys/dmu_objset.h> | |
27 | #include <sys/metaslab.h> | |
28 | #include <sys/metaslab_impl.h> | |
29 | #include <sys/spa.h> | |
30 | #include <sys/spa_impl.h> | |
31 | #include <sys/spa_log_spacemap.h> | |
32 | #include <sys/vdev_impl.h> | |
33 | #include <sys/zap.h> | |
34 | ||
35 | /* | |
36 | * Log Space Maps | |
37 | * | |
38 | * Log space maps are an optimization in ZFS metadata allocations for pools | |
39 | * whose workloads are primarily random-writes. Random-write workloads are also | |
40 | * typically random-free, meaning that they are freeing from locations scattered | |
41 | * throughout the pool. This means that each TXG we will have to append some | |
42 | * FREE records to almost every metaslab. With log space maps, we hold their | |
43 | * changes in memory and log them altogether in one pool-wide space map on-disk | |
44 | * for persistence. As more blocks are accumulated in the log space maps and | |
45 | * more unflushed changes are accounted in memory, we flush a selected group | |
46 | * of metaslabs every TXG to relieve memory pressure and potential overheads | |
47 | * when loading the pool. Flushing a metaslab to disk relieves memory as we | |
48 | * flush any unflushed changes from memory to disk (i.e. the metaslab's space | |
49 | * map) and saves import time by making old log space maps obsolete and | |
50 | * eventually destroying them. [A log space map is said to be obsolete when all | |
51 | * its entries have made it to their corresponding metaslab space maps]. | |
52 | * | |
53 | * == On disk data structures used == | |
54 | * | |
55 | * - The pool has a new feature flag and a new entry in the MOS. The feature | |
56 | * is activated when we create the first log space map and remains active | |
57 | * for the lifetime of the pool. The new entry in the MOS Directory [refer | |
58 | * to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value | |
59 | * pairs are of the form <key: txg, value: log space map object for that txg>. | |
60 | * This entry is our on-disk reference of the log space maps that exist in | |
61 | * the pool for each TXG and it is used during import to load all the | |
62 | * metaslab unflushed changes in memory. To see how this structure is first | |
63 | * created and later populated refer to spa_generate_syncing_log_sm(). To see | |
64 | * how it is used during import time refer to spa_ld_log_sm_metadata(). | |
65 | * | |
66 | * - Each vdev has a new entry in its vdev_top_zap (see field | |
67 | * VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of | |
68 | * each metaslab in this vdev. This field is the on-disk counterpart of the | |
69 | * in-memory field ms_unflushed_txg which tells us from which TXG and onwards | |
70 | * the metaslab hasn't had its changes flushed. During import, we use this | |
71 | * to ignore any entries in the space map log that are for this metaslab but | |
72 | * from a TXG before msp_unflushed_txg. At that point, we also populate its | |
73 | * in-memory counterpart and from there both fields are updated every time | |
74 | * we flush that metaslab. | |
75 | * | |
76 | * - A space map is created every TXG and, during that TXG, it is used to log | |
77 | * all incoming changes (the log space map). When created, the log space map | |
78 | * is referenced in memory by spa_syncing_log_sm and its object ID is inserted | |
79 | * to the space map ZAP mentioned above. The log space map is closed at the | |
80 | * end of the TXG and will be destroyed when it becomes fully obsolete. We | |
81 | * know when a log space map has become obsolete by looking at the oldest | |
82 | * (and smallest) ms_unflushed_txg in the pool. If the value of that is bigger | |
83 | * than the log space map's TXG, then it means that there is no metaslab who | |
84 | * doesn't have the changes from that log and we can therefore destroy it. | |
85 | * [see spa_cleanup_old_sm_logs()]. | |
86 | * | |
87 | * == Important in-memory structures == | |
88 | * | |
89 | * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in | |
90 | * the pool by their ms_unflushed_txg field. It is primarily used for three | |
91 | * reasons. First of all, it is used during flushing where we try to flush | |
92 | * metaslabs in-order from the oldest-flushed to the most recently flushed | |
93 | * every TXG. Secondly, it helps us to lookup the ms_unflushed_txg of the | |
94 | * oldest flushed metaslab to distinguish which log space maps have become | |
95 | * obsolete and which ones are still relevant. Finally it tells us which | |
96 | * metaslabs have unflushed changes in a pool where this feature was just | |
97 | * enabled, as we don't immediately add all of the pool's metaslabs but we | |
98 | * add them over time as they go through metaslab_sync(). The reason that | |
99 | * we do that is to ease these pools into the behavior of the flushing | |
100 | * algorithm (described later on). | |
101 | * | |
102 | * - The per-spa field spa_sm_logs_by_txg can be thought as the in-memory | |
103 | * counterpart of the space map ZAP mentioned above. It's an AVL tree whose | |
104 | * nodes represent the log space maps in the pool. This in-memory | |
105 | * representation of log space maps in the pool sorts the log space maps by | |
106 | * the TXG that they were created (which is also the TXG of their unflushed | |
107 | * changes). It also contains the following extra information for each | |
108 | * space map: | |
109 | * [1] The number of metaslabs that were last flushed on that TXG. This is | |
110 | * important because if that counter is zero and this is the oldest | |
111 | * log then it means that it is also obsolete. | |
112 | * [2] The number of blocks of that space map. This field is used by the | |
113 | * block heuristic of our flushing algorithm (described later on). | |
114 | * It represents how many blocks of metadata changes ZFS had to write | |
115 | * to disk for that TXG. | |
116 | * | |
117 | * - The per-spa field spa_log_summary is a list of entries that summarizes | |
118 | * the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg | |
119 | * AVL tree mentioned above. The reason this exists is that our flushing | |
120 | * algorithm (described later) tries to estimate how many metaslabs to flush | |
121 | * in each TXG by iterating over all the log space maps and looking at their | |
122 | * block counts. Summarizing that information means that we don't have to | |
123 | * iterate through each space map, minimizing the runtime overhead of the | |
124 | * flushing algorithm which would be induced in syncing context. In terms of | |
125 | * implementation the log summary is used as a queue: | |
126 | * * we modify or pop entries from its head when we flush metaslabs | |
127 | * * we modify or append entries to its tail when we sync changes. | |
128 | * | |
129 | * - Each metaslab has two new range trees that hold its unflushed changes, | |
130 | * ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint. | |
131 | * | |
132 | * == Flushing algorithm == | |
133 | * | |
134 | * The decision of how many metaslabs to flush on a given TXG is guided by | |
135 | * two heuristics: | |
136 | * | |
137 | * [1] The memory heuristic - | |
138 | * We keep track of the memory used by the unflushed trees from all the | |
139 | * metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it | |
140 | * stays below a certain threshold which is determined by an arbitrary hard | |
141 | * limit and an arbitrary percentage of the system's memory [see | |
142 | * spa_log_exceeds_memlimit()]. When we see that the memory usage of the | |
143 | * unflushed changes are passing that threshold, we flush metaslabs, which | |
144 | * empties their unflushed range trees, reducing the memory used. | |
145 | * | |
146 | * [2] The block heuristic - | |
147 | * We try to keep the total number of blocks in the log space maps in check | |
148 | * so the log doesn't grow indefinitely and we don't induce a lot of overhead | |
149 | * when loading the pool. At the same time we don't want to flush a lot of | |
150 | * metaslabs too often as this would defeat the purpose of the log space map. | |
151 | * As a result we set a limit in the amount of blocks that we think it's | |
152 | * acceptable for the log space maps to have and try not to cross it. | |
153 | * [see sus_blocklimit from spa_unflushed_stats]. | |
154 | * | |
155 | * In order to stay below the block limit every TXG we have to estimate how | |
156 | * many metaslabs we need to flush based on the current rate of incoming blocks | |
157 | * and our history of log space map blocks. The main idea here is to answer | |
158 | * the question of how many metaslabs do we need to flush in order to get rid | |
159 | * of at least an X amount of log space map blocks. We can answer this question | |
160 | * by iterating backwards from the oldest log space map to the newest one | |
161 | * and looking at their metaslab and block counts. At this point the log summary | |
162 | * mentioned above comes handy as it reduces the amount of things that we have | |
163 | * to iterate (even though it may reduce the preciseness of our estimates due | |
164 | * to its aggregation of data). So with that in mind, we project the incoming | |
165 | * rate of the current TXG into the future and attempt to approximate how many | |
166 | * metaslabs would we need to flush from now in order to avoid exceeding our | |
167 | * block limit in different points in the future (granted that we would keep | |
168 | * flushing the same number of metaslabs for every TXG). Then we take the | |
169 | * maximum number from all these estimates to be on the safe side. For the | |
170 | * exact implementation details of the algorithm refer to | |
171 | * spa_estimate_metaslabs_to_flush. | |
172 | */ | |
173 | ||
174 | /* | |
175 | * This is used as the block size for the space maps used for the | |
176 | * log space map feature. These space maps benefit from a bigger | |
177 | * block size as we expect to be writing a lot of data to them at | |
178 | * once. The default is 1ULL << 17 (i.e. 131072). | |
179 | */ | |
180 | unsigned long zfs_log_sm_blksz = 1ULL << 17; | |
181 | ||
182 | /* | |
183 | * Percentage of the overall system's memory that ZFS allows to be | |
184 | * used for unflushed changes (e.g. the sum of size of all the nodes | |
185 | * in the unflushed trees). | |
186 | * | |
187 | * Note that this value is calculated over 1000000 for finer granularity | |
188 | * (thus the _ppm suffix; reads as "parts per million"). As an example, | |
189 | * the default of 1000 allows 0.1% of memory to be used. | |
190 | */ | |
191 | unsigned long zfs_unflushed_max_mem_ppm = 1000; | |
192 | ||
193 | /* | |
194 | * Specific hard-limit in memory that ZFS allows to be used for | |
195 | * unflushed changes. The default is 1ULL << 30. | |
196 | */ | |
197 | unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30; | |
198 | ||
199 | /* | |
200 | * The following tunable determines the number of blocks that can be used for | |
201 | * the log space maps. It is expressed as a percentage of the total number of | |
202 | * metaslabs in the pool (i.e. the default of 400 means that the number of log | |
203 | * blocks is capped at 4 times the number of metaslabs). | |
204 | * | |
205 | * This value exists to tune our flushing algorithm, with higher values | |
206 | * flushing metaslabs less often (doing less I/Os) per TXG versus lower values | |
207 | * flushing metaslabs more aggressively with the upside of saving overheads | |
208 | * when loading the pool. Another factor in this tradeoff is that flushing | |
209 | * less often can potentially lead to better utilization of the metaslab space | |
210 | * map's block size as we accumulate more changes per flush. | |
211 | * | |
212 | * This tunable indirectly controls the flush rate (metaslabs flushed per | |
213 | * txg), which is why expressing it as a percentage of the number of | |
214 | * metaslabs in the pool makes sense here. | |
215 | * | |
216 | * As a rule of thumb we default this tunable to 400% based on the following: | |
217 | * | |
218 | * 1] Assuming a constant flush rate and a constant incoming rate of log blocks | |
219 | * it is reasonable to expect that the amount of obsolete entries changes | |
220 | * linearly from txg to txg (e.g. the oldest log should have the most | |
221 | * obsolete entries, and the most recent one the least). With this we could | |
222 | * say that, at any given time, about half of the entries in the whole space | |
223 | * map log are obsolete. Thus for every two entries for a metaslab in the | |
224 | * log space map, only one of them is valid and actually makes it to the | |
225 | * metaslab's space map. | |
226 | * [factor of 2] | |
227 | * 2] Each entry in the log space map is guaranteed to be two words while | |
228 | * entries in metaslab space maps are generally single-word. | |
229 | * [an extra factor of 2 - 400% overall] | |
230 | * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into | |
231 | * account any consolidation of segments from the log space map to the | |
232 | * unflushed range trees nor their history (e.g. a segment being allocated, | |
233 | * then freed, then allocated again means 3 log space map entries but 0 | |
234 | * metaslab space map entries). Depending on the workload, we've seen ~1.8 | |
235 | * non-obsolete log space map entries per metaslab entry, for a total of | |
236 | * ~600%. Since most of these estimates though are workload dependent, we | |
237 | * default on 400% to be conservative. | |
238 | * | |
239 | * Thus we could say that even in the worst | |
240 | * case of [1] and [2], the factor should end up being 4. | |
241 | * | |
242 | * That said, regardless of the number of metaslabs in the pool we need to | |
243 | * provide upper and lower bounds for the log block limit. | |
244 | * [see zfs_unflushed_log_block_{min,max}] | |
245 | */ | |
246 | unsigned long zfs_unflushed_log_block_pct = 400; | |
247 | ||
248 | /* | |
249 | * If the number of metaslabs is small and our incoming rate is high, we could | |
250 | * get into a situation that we are flushing all our metaslabs every TXG. Thus | |
251 | * we always allow at least this many log blocks. | |
252 | */ | |
253 | unsigned long zfs_unflushed_log_block_min = 1000; | |
254 | ||
255 | /* | |
256 | * If the log becomes too big, the import time of the pool can take a hit in | |
257 | * terms of performance. Thus we have a hard limit in the size of the log in | |
258 | * terms of blocks (the default is 1ULL << 18, i.e. 262144 blocks). | |
259 | */ | |
260 | unsigned long zfs_unflushed_log_block_max = (1ULL << 18); | |
261 | ||
262 | /* | |
263 | * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and | |
264 | * stability of the flushing algorithm (longer summary) vs its runtime overhead | |
265 | * (smaller summary is faster to traverse). | |
266 | */ | |
267 | unsigned long zfs_max_logsm_summary_length = 10; | |
268 | ||
269 | /* | |
270 | * Tunable that sets the lower bound on the metaslabs to flush every TXG. | |
271 | * | |
272 | * Setting this to 0 has no effect since if the pool is idle we won't even be | |
273 | * creating log space maps and therefore we won't be flushing. On the other | |
274 | * hand if the pool has any incoming workload our block heuristic will start | |
275 | * flushing metaslabs anyway. | |
276 | * | |
277 | * The point of this tunable is to be used in extreme cases where we really | |
278 | * want to flush more metaslabs than our adaptable heuristic plans to flush. | |
279 | */ | |
280 | unsigned long zfs_min_metaslabs_to_flush = 1; | |
281 | ||
282 | /* | |
283 | * Tunable that specifies how far in the past do we want to look when trying to | |
284 | * estimate the incoming log blocks for the current TXG. | |
285 | * | |
286 | * Setting this too high may not only increase runtime but also minimize the | |
287 | * effect of the incoming rates from the most recent TXGs as we take the | |
288 | * average over all the blocks that we walk | |
289 | * [see spa_estimate_incoming_log_blocks]. | |
290 | */ | |
291 | unsigned long zfs_max_log_walking = 5; | |
292 | ||
293 | /* | |
294 | * This tunable exists solely for testing purposes. It ensures that the log | |
295 | * spacemaps are not flushed and destroyed during export in order for the | |
296 | * relevant log spacemap import code paths to be tested (effectively simulating | |
297 | * a crash). | |
298 | */ | |
299 | int zfs_keep_log_spacemaps_at_export = 0; | |
300 | ||
301 | static uint64_t | |
302 | spa_estimate_incoming_log_blocks(spa_t *spa) | |
303 | { | |
304 | ASSERT3U(spa_sync_pass(spa), ==, 1); | |
305 | uint64_t steps = 0, sum = 0; | |
306 | for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg); | |
307 | sls != NULL && steps < zfs_max_log_walking; | |
308 | sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) { | |
309 | if (sls->sls_txg == spa_syncing_txg(spa)) { | |
310 | /* | |
311 | * skip the log created in this TXG as this would | |
312 | * make our estimations inaccurate. | |
313 | */ | |
314 | continue; | |
315 | } | |
316 | sum += sls->sls_nblocks; | |
317 | steps++; | |
318 | } | |
319 | return ((steps > 0) ? DIV_ROUND_UP(sum, steps) : 0); | |
320 | } | |
321 | ||
322 | uint64_t | |
323 | spa_log_sm_blocklimit(spa_t *spa) | |
324 | { | |
325 | return (spa->spa_unflushed_stats.sus_blocklimit); | |
326 | } | |
327 | ||
328 | void | |
329 | spa_log_sm_set_blocklimit(spa_t *spa) | |
330 | { | |
331 | if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { | |
332 | ASSERT0(spa_log_sm_blocklimit(spa)); | |
333 | return; | |
334 | } | |
335 | ||
336 | uint64_t calculated_limit = | |
337 | (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100; | |
338 | spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit, | |
339 | zfs_unflushed_log_block_min), zfs_unflushed_log_block_max); | |
340 | } | |
341 | ||
342 | uint64_t | |
343 | spa_log_sm_nblocks(spa_t *spa) | |
344 | { | |
345 | return (spa->spa_unflushed_stats.sus_nblocks); | |
346 | } | |
347 | ||
348 | /* | |
349 | * Ensure that the in-memory log space map structures and the summary | |
350 | * have the same block and metaslab counts. | |
351 | */ | |
352 | static void | |
353 | spa_log_summary_verify_counts(spa_t *spa) | |
354 | { | |
355 | ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); | |
356 | ||
357 | if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0) | |
358 | return; | |
359 | ||
360 | uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed); | |
361 | ||
362 | uint64_t ms_in_summary = 0, blk_in_summary = 0; | |
363 | for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); | |
364 | e; e = list_next(&spa->spa_log_summary, e)) { | |
365 | ms_in_summary += e->lse_mscount; | |
366 | blk_in_summary += e->lse_blkcount; | |
367 | } | |
368 | ||
369 | uint64_t ms_in_logs = 0, blk_in_logs = 0; | |
370 | for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); | |
371 | sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { | |
372 | ms_in_logs += sls->sls_mscount; | |
373 | blk_in_logs += sls->sls_nblocks; | |
374 | } | |
375 | ||
376 | VERIFY3U(ms_in_logs, ==, ms_in_summary); | |
377 | VERIFY3U(ms_in_logs, ==, ms_in_avl); | |
378 | VERIFY3U(blk_in_logs, ==, blk_in_summary); | |
379 | VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa)); | |
380 | } | |
381 | ||
382 | static boolean_t | |
383 | summary_entry_is_full(spa_t *spa, log_summary_entry_t *e) | |
384 | { | |
385 | uint64_t blocks_per_row = MAX(1, | |
386 | DIV_ROUND_UP(spa_log_sm_blocklimit(spa), | |
387 | zfs_max_logsm_summary_length)); | |
388 | return (blocks_per_row <= e->lse_blkcount); | |
389 | } | |
390 | ||
391 | /* | |
392 | * Update the log summary information to reflect the fact that a metaslab | |
393 | * was flushed or destroyed (e.g due to device removal or pool export/destroy). | |
394 | * | |
395 | * We typically flush the oldest flushed metaslab so the first (and olderst) | |
396 | * entry of the summary is updated. However if that metaslab is getting loaded | |
397 | * we may flush the second oldest one which may be part of an entry later in | |
398 | * the summary. Moreover, if we call into this function from metaslab_fini() | |
399 | * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask | |
400 | * for a txg as an argument so we can locate the appropriate summary entry for | |
401 | * the metaslab. | |
402 | */ | |
403 | void | |
404 | spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) | |
405 | { | |
406 | /* | |
407 | * We don't track summary data for read-only pools and this function | |
408 | * can be called from metaslab_fini(). In that case return immediately. | |
409 | */ | |
410 | if (!spa_writeable(spa)) | |
411 | return; | |
412 | ||
413 | log_summary_entry_t *target = NULL; | |
414 | for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); | |
415 | e != NULL; e = list_next(&spa->spa_log_summary, e)) { | |
416 | if (e->lse_start > txg) | |
417 | break; | |
418 | target = e; | |
419 | } | |
420 | ||
421 | if (target == NULL || target->lse_mscount == 0) { | |
422 | /* | |
423 | * We didn't find a summary entry for this metaslab. We must be | |
424 | * at the teardown of a spa_load() attempt that got an error | |
425 | * while reading the log space maps. | |
426 | */ | |
427 | VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR); | |
428 | return; | |
429 | } | |
430 | ||
431 | target->lse_mscount--; | |
432 | } | |
433 | ||
434 | /* | |
435 | * Update the log summary information to reflect the fact that we destroyed | |
436 | * old log space maps. Since we can only destroy the oldest log space maps, | |
437 | * we decrement the block count of the oldest summary entry and potentially | |
438 | * destroy it when that count hits 0. | |
439 | * | |
440 | * This function is called after a metaslab is flushed and typically that | |
441 | * metaslab is the oldest flushed, which means that this function will | |
442 | * typically decrement the block count of the first entry of the summary and | |
443 | * potentially free it if the block count gets to zero (its metaslab count | |
444 | * should be zero too at that point). | |
445 | * | |
446 | * There are certain scenarios though that don't work exactly like that so we | |
447 | * need to account for them: | |
448 | * | |
449 | * Scenario [1]: It is possible that after we flushed the oldest flushed | |
450 | * metaslab and we destroyed the oldest log space map, more recent logs had 0 | |
451 | * metaslabs pointing to them so we got rid of them too. This can happen due | |
452 | * to metaslabs being destroyed through device removal, or because the oldest | |
453 | * flushed metaslab was loading but we kept flushing more recently flushed | |
454 | * metaslabs due to the memory pressure of unflushed changes. Because of that, | |
455 | * we always iterate from the beginning of the summary and if blocks_gone is | |
456 | * bigger than the block_count of the current entry we free that entry (we | |
457 | * expect its metaslab count to be zero), we decrement blocks_gone and on to | |
458 | * the next entry repeating this procedure until blocks_gone gets decremented | |
459 | * to 0. Doing this also works for the typical case mentioned above. | |
460 | * | |
461 | * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by | |
462 | * the first (and oldest) entry in the summary. If the first few entries of | |
463 | * the summary were only accounting metaslabs from a device that was just | |
464 | * removed, then the current oldest flushed metaslab could be accounted by an | |
465 | * entry somewhere in the middle of the summary. Moreover flushing that | |
466 | * metaslab will destroy all the log space maps older than its ms_unflushed_txg | |
467 | * because they became obsolete after the removal. Thus, iterating as we did | |
468 | * for scenario [1] works out for this case too. | |
469 | * | |
470 | * Scenario [3]: At times we decide to flush all the metaslabs in the pool | |
471 | * in one TXG (either because we are exporting the pool or because our flushing | |
472 | * heuristics decided to do so). When that happens all the log space maps get | |
473 | * destroyed except the one created for the current TXG which doesn't have | |
474 | * any log blocks yet. As log space maps get destroyed with every metaslab that | |
475 | * we flush, entries in the summary are also destroyed. This brings a weird | |
476 | * corner-case when we flush the last metaslab and the log space map of the | |
477 | * current TXG is in the same summary entry with other log space maps that | |
478 | * are older. When that happens we are eventually left with this one last | |
479 | * summary entry whose blocks are gone (blocks_gone equals the entry's block | |
480 | * count) but its metaslab count is non-zero (because it accounts all the | |
481 | * metaslabs in the pool as they all got flushed). Under this scenario we can't | |
482 | * free this last summary entry as it's referencing all the metaslabs in the | |
483 | * pool and its block count will get incremented at the end of this sync (when | |
484 | * we close the syncing log space map). Thus we just decrement its current | |
485 | * block count and leave it alone. In the case that the pool gets exported, | |
486 | * its metaslab count will be decremented over time as we call metaslab_fini() | |
487 | * for all the metaslabs in the pool and the entry will be freed at | |
488 | * spa_unload_log_sm_metadata(). | |
489 | */ | |
490 | void | |
491 | spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone) | |
492 | { | |
493 | for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); | |
494 | e != NULL; e = list_head(&spa->spa_log_summary)) { | |
495 | if (e->lse_blkcount > blocks_gone) { | |
496 | /* | |
497 | * Assert that we stopped at an entry that is not | |
498 | * obsolete. | |
499 | */ | |
500 | ASSERT(e->lse_mscount != 0); | |
501 | ||
502 | e->lse_blkcount -= blocks_gone; | |
503 | blocks_gone = 0; | |
504 | break; | |
505 | } else if (e->lse_mscount == 0) { | |
506 | /* remove obsolete entry */ | |
507 | blocks_gone -= e->lse_blkcount; | |
508 | list_remove(&spa->spa_log_summary, e); | |
509 | kmem_free(e, sizeof (log_summary_entry_t)); | |
510 | } else { | |
511 | /* Verify that this is scenario [3] mentioned above. */ | |
512 | VERIFY3U(blocks_gone, ==, e->lse_blkcount); | |
513 | ||
514 | /* | |
515 | * Assert that this is scenario [3] further by ensuring | |
516 | * that this is the only entry in the summary. | |
517 | */ | |
518 | VERIFY3P(e, ==, list_tail(&spa->spa_log_summary)); | |
519 | ASSERT3P(e, ==, list_head(&spa->spa_log_summary)); | |
520 | ||
521 | blocks_gone = e->lse_blkcount = 0; | |
522 | break; | |
523 | } | |
524 | } | |
525 | ||
526 | /* | |
527 | * Ensure that there is no way we are trying to remove more blocks | |
528 | * than the # of blocks in the summary. | |
529 | */ | |
530 | ASSERT0(blocks_gone); | |
531 | } | |
532 | ||
533 | void | |
534 | spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg) | |
535 | { | |
536 | spa_log_sm_t target = { .sls_txg = txg }; | |
537 | spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg, | |
538 | &target, NULL); | |
539 | ||
540 | if (sls == NULL) { | |
541 | /* | |
542 | * We must be at the teardown of a spa_load() attempt that | |
543 | * got an error while reading the log space maps. | |
544 | */ | |
545 | VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR); | |
546 | return; | |
547 | } | |
548 | ||
549 | ASSERT(sls->sls_mscount > 0); | |
550 | sls->sls_mscount--; | |
551 | } | |
552 | ||
553 | void | |
554 | spa_log_sm_increment_current_mscount(spa_t *spa) | |
555 | { | |
556 | spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg); | |
557 | ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa)); | |
558 | last_sls->sls_mscount++; | |
559 | } | |
560 | ||
561 | static void | |
562 | summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed, | |
563 | uint64_t nblocks) | |
564 | { | |
565 | log_summary_entry_t *e = list_tail(&spa->spa_log_summary); | |
566 | ||
567 | if (e == NULL || summary_entry_is_full(spa, e)) { | |
568 | e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP); | |
569 | e->lse_start = txg; | |
570 | list_insert_tail(&spa->spa_log_summary, e); | |
571 | } | |
572 | ||
573 | ASSERT3U(e->lse_start, <=, txg); | |
574 | e->lse_mscount += metaslabs_flushed; | |
575 | e->lse_blkcount += nblocks; | |
576 | } | |
577 | ||
578 | static void | |
579 | spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks) | |
580 | { | |
581 | summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks); | |
582 | } | |
583 | ||
584 | void | |
585 | spa_log_summary_add_flushed_metaslab(spa_t *spa) | |
586 | { | |
587 | summary_add_data(spa, spa_syncing_txg(spa), 1, 0); | |
588 | } | |
589 | ||
590 | /* | |
591 | * This function attempts to estimate how many metaslabs should | |
592 | * we flush to satisfy our block heuristic for the log spacemap | |
593 | * for the upcoming TXGs. | |
594 | * | |
595 | * Specifically, it first tries to estimate the number of incoming | |
596 | * blocks in this TXG. Then by projecting that incoming rate to | |
597 | * future TXGs and using the log summary, it figures out how many | |
598 | * flushes we would need to do for future TXGs individually to | |
599 | * stay below our block limit and returns the maximum number of | |
600 | * flushes from those estimates. | |
601 | */ | |
602 | static uint64_t | |
603 | spa_estimate_metaslabs_to_flush(spa_t *spa) | |
604 | { | |
605 | ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); | |
606 | ASSERT3U(spa_sync_pass(spa), ==, 1); | |
607 | ASSERT(spa_log_sm_blocklimit(spa) != 0); | |
608 | ||
609 | /* | |
610 | * This variable contains the incoming rate that will be projected | |
611 | * and used for our flushing estimates in the future. | |
612 | */ | |
613 | uint64_t incoming = spa_estimate_incoming_log_blocks(spa); | |
614 | ||
615 | /* | |
616 | * At any point in time this variable tells us how many | |
617 | * TXGs in the future we are so we can make our estimations. | |
618 | */ | |
619 | uint64_t txgs_in_future = 1; | |
620 | ||
621 | /* | |
622 | * This variable tells us how much room do we have until we hit | |
623 | * our limit. When it goes negative, it means that we've exceeded | |
624 | * our limit and we need to flush. | |
625 | * | |
626 | * Note that since we start at the first TXG in the future (i.e. | |
627 | * txgs_in_future starts from 1) we already decrement this | |
628 | * variable by the incoming rate. | |
629 | */ | |
630 | int64_t available_blocks = | |
631 | spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming; | |
632 | ||
633 | /* | |
634 | * This variable tells us the total number of flushes needed to | |
635 | * keep the log size within the limit when we reach txgs_in_future. | |
636 | */ | |
637 | uint64_t total_flushes = 0; | |
638 | ||
639 | /* Holds the current maximum of our estimates so far. */ | |
640 | uint64_t max_flushes_pertxg = | |
641 | MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed), | |
642 | zfs_min_metaslabs_to_flush); | |
643 | ||
644 | /* | |
645 | * For our estimations we only look as far in the future | |
646 | * as the summary allows us. | |
647 | */ | |
648 | for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); | |
649 | e; e = list_next(&spa->spa_log_summary, e)) { | |
650 | ||
651 | /* | |
652 | * If there is still room before we exceed our limit | |
653 | * then keep skipping TXGs accumulating more blocks | |
654 | * based on the incoming rate until we exceed it. | |
655 | */ | |
656 | if (available_blocks >= 0) { | |
657 | uint64_t skip_txgs = (available_blocks / incoming) + 1; | |
658 | available_blocks -= (skip_txgs * incoming); | |
659 | txgs_in_future += skip_txgs; | |
660 | ASSERT3S(available_blocks, >=, -incoming); | |
661 | } | |
662 | ||
663 | /* | |
664 | * At this point we're far enough into the future where | |
665 | * the limit was just exceeded and we flush metaslabs | |
666 | * based on the current entry in the summary, updating | |
667 | * our available_blocks. | |
668 | */ | |
669 | ASSERT3S(available_blocks, <, 0); | |
670 | available_blocks += e->lse_blkcount; | |
671 | total_flushes += e->lse_mscount; | |
672 | ||
673 | /* | |
674 | * Keep the running maximum of the total_flushes that | |
675 | * we've done so far over the number of TXGs in the | |
676 | * future that we are. The idea here is to estimate | |
677 | * the average number of flushes that we should do | |
678 | * every TXG so that when we are that many TXGs in the | |
679 | * future we stay under the limit. | |
680 | */ | |
681 | max_flushes_pertxg = MAX(max_flushes_pertxg, | |
682 | DIV_ROUND_UP(total_flushes, txgs_in_future)); | |
683 | ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, | |
684 | max_flushes_pertxg); | |
685 | } | |
686 | return (max_flushes_pertxg); | |
687 | } | |
688 | ||
689 | uint64_t | |
690 | spa_log_sm_memused(spa_t *spa) | |
691 | { | |
692 | return (spa->spa_unflushed_stats.sus_memused); | |
693 | } | |
694 | ||
695 | static boolean_t | |
696 | spa_log_exceeds_memlimit(spa_t *spa) | |
697 | { | |
698 | if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt) | |
699 | return (B_TRUE); | |
700 | ||
701 | uint64_t system_mem_allowed = ((physmem * PAGESIZE) * | |
702 | zfs_unflushed_max_mem_ppm) / 1000000; | |
703 | if (spa_log_sm_memused(spa) > system_mem_allowed) | |
704 | return (B_TRUE); | |
705 | ||
706 | return (B_FALSE); | |
707 | } | |
708 | ||
709 | boolean_t | |
710 | spa_flush_all_logs_requested(spa_t *spa) | |
711 | { | |
712 | return (spa->spa_log_flushall_txg != 0); | |
713 | } | |
714 | ||
/*
 * Flush unflushed metaslab changes for the TXG of the given tx.
 *
 * This is a no-op unless we are in sync pass 1, the log space map
 * feature is active, and spa_metaslabs_by_flushed is non-empty.
 * Metaslabs are visited starting from the head of
 * spa_metaslabs_by_flushed and flushed until both the block heuristic
 * quota (want_to_flush) is used up and the memory limit is no longer
 * exceeded, or until we reach a metaslab that was already flushed in
 * this TXG.
 */
void
spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
{
	uint64_t txg = dmu_tx_get_txg(tx);

	/* Flushing only takes place in the first sync pass. */
	if (spa_sync_pass(spa) != 1)
		return;

	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
		return;

	/*
	 * If we don't have any metaslabs with unflushed changes
	 * return immediately.
	 */
	if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0)
		return;

	/*
	 * During SPA export we leave a few empty TXGs to go by [see
	 * spa_final_dirty_txg() to understand why]. For this specific
	 * case, it is important to not flush any metaslabs as that
	 * would dirty this TXG.
	 *
	 * That said, during one of these dirty TXGs that is less than or
	 * equal to spa_final_dirty_txg(), spa_unload() will request that
	 * we try to flush all the metaslabs for that TXG before
	 * exporting the pool, thus we ensure that we didn't get a
	 * request to flush everything before we attempt to return
	 * immediately.
	 */
	if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
	    !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
	    !spa_flush_all_logs_requested(spa))
		return;

	/*
	 * We need to generate a log space map before flushing because this
	 * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg)
	 * for this TXG's flushed metaslab count (aka sls_mscount which is
	 * manipulated in many ways down the metaslab_flush() codepath).
	 *
	 * That is not to say that we may generate a log space map when we
	 * don't need it. If we are flushing metaslabs, that means that we
	 * were going to write changes to disk anyway, so even if we were
	 * not flushing, a log space map would have been created anyway in
	 * metaslab_sync().
	 */
	spa_generate_syncing_log_sm(spa, tx);

	/*
	 * This variable tells us how many metaslabs we want to flush based
	 * on the block-heuristic of our flushing algorithm (see block comment
	 * of log space map feature). We also decrement this as we flush
	 * metaslabs and attempt to destroy old log space maps.
	 */
	uint64_t want_to_flush;
	if (spa_flush_all_logs_requested(spa)) {
		/* A flush-all request wants every unflushed metaslab. */
		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
		want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
	} else {
		want_to_flush = spa_estimate_metaslabs_to_flush(spa);
	}

	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
	    want_to_flush);

	/* Used purely for verification purposes */
	uint64_t visited = 0;

	/*
	 * Ideally we would only iterate through spa_metaslabs_by_flushed
	 * using only one variable (curr). We can't do that because
	 * metaslab_flush() mutates the position of curr in the AVL when
	 * it flushes that metaslab by moving it to the end of the tree.
	 * Thus we always keep track of the original next node of the
	 * current node (curr) in another variable (next).
	 */
	metaslab_t *next = NULL;
	for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed);
	    curr != NULL; curr = next) {
		/* Capture the successor before curr can be repositioned. */
		next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr);

		/*
		 * If this metaslab has been flushed this txg then we've done
		 * a full circle over the metaslabs.
		 */
		if (metaslab_unflushed_txg(curr) == txg)
			break;

		/*
		 * If we are done flushing for the block heuristic and the
		 * unflushed changes don't exceed the memory limit just stop.
		 */
		if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
			break;

		/* ms_sync_lock is taken before ms_lock around the flush. */
		mutex_enter(&curr->ms_sync_lock);
		mutex_enter(&curr->ms_lock);
		boolean_t flushed = metaslab_flush(curr, tx);
		mutex_exit(&curr->ms_lock);
		mutex_exit(&curr->ms_sync_lock);

		/*
		 * If we failed to flush a metaslab (because it was loading),
		 * then we are done with the block heuristic as it's not
		 * possible to destroy any log space maps once you've skipped
		 * a metaslab. In that case we just set our counter to 0 but
		 * we continue looping in case there is still memory pressure
		 * due to unflushed changes. Note that flushing a metaslab
		 * that is not the oldest flushed in the pool will never
		 * destroy any log space maps [see spa_cleanup_old_sm_logs()].
		 */
		if (!flushed) {
			want_to_flush = 0;
		} else if (want_to_flush > 0) {
			want_to_flush--;
		}

		visited++;
	}
	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
}
838 | ||
839 | /* | |
840 | * Close the log space map for this TXG and update the block counts | |
841 | * for the the log's in-memory structure and the summary. | |
842 | */ | |
843 | void | |
844 | spa_sync_close_syncing_log_sm(spa_t *spa) | |
845 | { | |
846 | if (spa_syncing_log_sm(spa) == NULL) | |
847 | return; | |
848 | ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); | |
849 | ||
850 | spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg); | |
851 | ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa)); | |
852 | ||
853 | sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa)); | |
854 | spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; | |
855 | ||
856 | /* | |
857 | * Note that we can't assert that sls_mscount is not 0, | |
858 | * because there is the case where the first metaslab | |
859 | * in spa_metaslabs_by_flushed is loading and we were | |
860 | * not able to flush any metaslabs the current TXG. | |
861 | */ | |
862 | ASSERT(sls->sls_nblocks != 0); | |
863 | ||
864 | spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks); | |
865 | spa_log_summary_verify_counts(spa); | |
866 | ||
867 | space_map_close(spa->spa_syncing_log_sm); | |
868 | spa->spa_syncing_log_sm = NULL; | |
869 | ||
870 | /* | |
871 | * At this point we tried to flush as many metaslabs as we | |
872 | * can as the pool is getting exported. Reset the "flush all" | |
873 | * so the last few TXGs before closing the pool can be empty | |
874 | * (e.g. not dirty). | |
875 | */ | |
876 | if (spa_flush_all_logs_requested(spa)) { | |
877 | ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED); | |
878 | spa->spa_log_flushall_txg = 0; | |
879 | } | |
880 | } | |
881 | ||
882 | void | |
883 | spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx) | |
884 | { | |
885 | objset_t *mos = spa_meta_objset(spa); | |
886 | ||
887 | uint64_t spacemap_zap; | |
888 | int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, | |
889 | DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); | |
890 | if (error == ENOENT) { | |
891 | ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); | |
892 | return; | |
893 | } | |
894 | VERIFY0(error); | |
895 | ||
896 | metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed); | |
897 | uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest); | |
898 | ||
899 | /* Free all log space maps older than the oldest_flushed_txg. */ | |
900 | for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); | |
901 | sls && sls->sls_txg < oldest_flushed_txg; | |
902 | sls = avl_first(&spa->spa_sm_logs_by_txg)) { | |
903 | ASSERT0(sls->sls_mscount); | |
904 | avl_remove(&spa->spa_sm_logs_by_txg, sls); | |
905 | space_map_free_obj(mos, sls->sls_sm_obj, tx); | |
906 | VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx)); | |
907 | spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks; | |
908 | kmem_free(sls, sizeof (spa_log_sm_t)); | |
909 | } | |
910 | } | |
911 | ||
912 | static spa_log_sm_t * | |
913 | spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg) | |
914 | { | |
915 | spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP); | |
916 | sls->sls_sm_obj = sm_obj; | |
917 | sls->sls_txg = txg; | |
918 | return (sls); | |
919 | } | |
920 | ||
921 | void | |
922 | spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx) | |
923 | { | |
924 | uint64_t txg = dmu_tx_get_txg(tx); | |
925 | objset_t *mos = spa_meta_objset(spa); | |
926 | ||
927 | if (spa_syncing_log_sm(spa) != NULL) | |
928 | return; | |
929 | ||
930 | if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)) | |
931 | return; | |
932 | ||
933 | uint64_t spacemap_zap; | |
934 | int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, | |
935 | DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); | |
936 | if (error == ENOENT) { | |
937 | ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); | |
938 | ||
939 | error = 0; | |
940 | spacemap_zap = zap_create(mos, | |
941 | DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); | |
942 | VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, | |
943 | DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, | |
944 | &spacemap_zap, tx)); | |
945 | spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx); | |
946 | } | |
947 | VERIFY0(error); | |
948 | ||
949 | uint64_t sm_obj; | |
950 | ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj), | |
951 | ==, ENOENT); | |
952 | sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx); | |
953 | VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx)); | |
954 | avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg)); | |
955 | ||
956 | /* | |
957 | * We pass UINT64_MAX as the space map's representation size | |
958 | * and SPA_MINBLOCKSHIFT as the shift, to make the space map | |
959 | * accept any sorts of segments since there's no real advantage | |
960 | * to being more restrictive (given that we're already going | |
961 | * to be using 2-word entries). | |
962 | */ | |
963 | VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj, | |
964 | 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); | |
965 | ||
966 | /* | |
967 | * If the log space map feature was just enabled, the blocklimit | |
968 | * has not yet been set. | |
969 | */ | |
970 | if (spa_log_sm_blocklimit(spa) == 0) | |
971 | spa_log_sm_set_blocklimit(spa); | |
972 | } | |
973 | ||
974 | /* | |
975 | * Find all the log space maps stored in the space map ZAP and sort | |
976 | * them by their TXG in spa_sm_logs_by_txg. | |
977 | */ | |
978 | static int | |
979 | spa_ld_log_sm_metadata(spa_t *spa) | |
980 | { | |
981 | int error; | |
982 | uint64_t spacemap_zap; | |
983 | ||
984 | ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); | |
985 | ||
986 | error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, | |
987 | DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); | |
988 | if (error == ENOENT) { | |
989 | /* the space map ZAP doesn't exist yet */ | |
990 | return (0); | |
991 | } else if (error != 0) { | |
992 | spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at " | |
993 | "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]", | |
994 | error); | |
995 | return (error); | |
996 | } | |
997 | ||
998 | zap_cursor_t zc; | |
999 | zap_attribute_t za; | |
1000 | for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap); | |
1001 | zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { | |
1002 | uint64_t log_txg = zfs_strtonum(za.za_name, NULL); | |
1003 | spa_log_sm_t *sls = | |
1004 | spa_log_sm_alloc(za.za_first_integer, log_txg); | |
1005 | avl_add(&spa->spa_sm_logs_by_txg, sls); | |
1006 | } | |
1007 | zap_cursor_fini(&zc); | |
1008 | ||
1009 | for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed); | |
1010 | m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) { | |
1011 | spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) }; | |
1012 | spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg, | |
1013 | &target, NULL); | |
1014 | sls->sls_mscount++; | |
1015 | } | |
1016 | ||
1017 | return (0); | |
1018 | } | |
1019 | ||
/*
 * Argument handed to spa_ld_log_sm_cb() through space_map_iterate()
 * while the entries of a single log space map are being read.
 */
typedef struct spa_ld_log_sm_arg {
	spa_t *slls_spa;	/* pool whose log space maps are being read */
	uint64_t slls_txg;	/* TXG of the log space map being iterated */
} spa_ld_log_sm_arg_t;
1024 | ||
1025 | static int | |
1026 | spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg) | |
1027 | { | |
1028 | uint64_t offset = sme->sme_offset; | |
1029 | uint64_t size = sme->sme_run; | |
1030 | uint32_t vdev_id = sme->sme_vdev; | |
1031 | ||
1032 | spa_ld_log_sm_arg_t *slls = arg; | |
1033 | spa_t *spa = slls->slls_spa; | |
1034 | ||
1035 | vdev_t *vd = vdev_lookup_top(spa, vdev_id); | |
1036 | ||
1037 | /* | |
1038 | * If the vdev has been removed (i.e. it is indirect or a hole) | |
1039 | * skip this entry. The contents of this vdev have already moved | |
1040 | * elsewhere. | |
1041 | */ | |
1042 | if (!vdev_is_concrete(vd)) | |
1043 | return (0); | |
1044 | ||
1045 | metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; | |
1046 | ASSERT(!ms->ms_loaded); | |
1047 | ||
1048 | /* | |
1049 | * If we have already flushed entries for this TXG to this | |
1050 | * metaslab's space map, then ignore it. Note that we flush | |
1051 | * before processing any allocations/frees for that TXG, so | |
1052 | * the metaslab's space map only has entries from *before* | |
1053 | * the unflushed TXG. | |
1054 | */ | |
1055 | if (slls->slls_txg < metaslab_unflushed_txg(ms)) | |
1056 | return (0); | |
1057 | ||
1058 | switch (sme->sme_type) { | |
1059 | case SM_ALLOC: | |
1060 | range_tree_remove_xor_add_segment(offset, offset + size, | |
1061 | ms->ms_unflushed_frees, ms->ms_unflushed_allocs); | |
1062 | break; | |
1063 | case SM_FREE: | |
1064 | range_tree_remove_xor_add_segment(offset, offset + size, | |
1065 | ms->ms_unflushed_allocs, ms->ms_unflushed_frees); | |
1066 | break; | |
1067 | default: | |
1068 | panic("invalid maptype_t"); | |
1069 | break; | |
1070 | } | |
1071 | return (0); | |
1072 | } | |
1073 | ||
1074 | static int | |
1075 | spa_ld_log_sm_data(spa_t *spa) | |
1076 | { | |
1077 | int error = 0; | |
1078 | ||
1079 | /* | |
1080 | * If we are not going to do any writes there is no need | |
1081 | * to read the log space maps. | |
1082 | */ | |
1083 | if (!spa_writeable(spa)) | |
1084 | return (0); | |
1085 | ||
1086 | ASSERT0(spa->spa_unflushed_stats.sus_nblocks); | |
1087 | ASSERT0(spa->spa_unflushed_stats.sus_memused); | |
1088 | ||
1089 | hrtime_t read_logs_starttime = gethrtime(); | |
1090 | /* this is a no-op when we don't have space map logs */ | |
1091 | for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); | |
1092 | sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { | |
1093 | space_map_t *sm = NULL; | |
1094 | error = space_map_open(&sm, spa_meta_objset(spa), | |
1095 | sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT); | |
1096 | if (error != 0) { | |
1097 | spa_load_failed(spa, "spa_ld_log_sm_data(): failed at " | |
1098 | "space_map_open(obj=%llu) [error %d]", | |
1099 | (u_longlong_t)sls->sls_sm_obj, error); | |
1100 | goto out; | |
1101 | } | |
1102 | ||
1103 | struct spa_ld_log_sm_arg vla = { | |
1104 | .slls_spa = spa, | |
1105 | .slls_txg = sls->sls_txg | |
1106 | }; | |
1107 | error = space_map_iterate(sm, space_map_length(sm), | |
1108 | spa_ld_log_sm_cb, &vla); | |
1109 | if (error != 0) { | |
1110 | space_map_close(sm); | |
1111 | spa_load_failed(spa, "spa_ld_log_sm_data(): failed " | |
1112 | "at space_map_iterate(obj=%llu) [error %d]", | |
1113 | (u_longlong_t)sls->sls_sm_obj, error); | |
1114 | goto out; | |
1115 | } | |
1116 | ||
1117 | ASSERT0(sls->sls_nblocks); | |
1118 | sls->sls_nblocks = space_map_nblocks(sm); | |
1119 | spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; | |
1120 | summary_add_data(spa, sls->sls_txg, | |
1121 | sls->sls_mscount, sls->sls_nblocks); | |
1122 | ||
1123 | space_map_close(sm); | |
1124 | } | |
1125 | hrtime_t read_logs_endtime = gethrtime(); | |
1126 | spa_load_note(spa, | |
1127 | "read %llu log space maps (%llu total blocks - blksz = %llu bytes) " | |
1128 | "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg), | |
1129 | (u_longlong_t)spa_log_sm_nblocks(spa), | |
1130 | (u_longlong_t)zfs_log_sm_blksz, | |
1131 | (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000)); | |
1132 | ||
1133 | out: | |
1134 | /* | |
1135 | * Now that the metaslabs contain their unflushed changes: | |
1136 | * [1] recalculate their actual allocated space | |
1137 | * [2] recalculate their weights | |
1138 | * [3] sum up the memory usage of their unflushed range trees | |
1139 | * [4] optionally load them, if debug_load is set | |
1140 | * | |
1141 | * Note that even in the case where we get here because of an | |
1142 | * error (e.g. error != 0), we still want to update the fields | |
1143 | * below in order to have a proper teardown in spa_unload(). | |
1144 | */ | |
1145 | for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed); | |
1146 | m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) { | |
1147 | mutex_enter(&m->ms_lock); | |
1148 | m->ms_allocated_space = space_map_allocated(m->ms_sm) + | |
1149 | range_tree_space(m->ms_unflushed_allocs) - | |
1150 | range_tree_space(m->ms_unflushed_frees); | |
1151 | ||
1152 | vdev_t *vd = m->ms_group->mg_vd; | |
1153 | metaslab_space_update(vd, m->ms_group->mg_class, | |
1154 | range_tree_space(m->ms_unflushed_allocs), 0, 0); | |
1155 | metaslab_space_update(vd, m->ms_group->mg_class, | |
1156 | -range_tree_space(m->ms_unflushed_frees), 0, 0); | |
1157 | ||
1158 | ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK); | |
1159 | metaslab_recalculate_weight_and_sort(m); | |
1160 | ||
1161 | spa->spa_unflushed_stats.sus_memused += | |
1162 | metaslab_unflushed_changes_memused(m); | |
1163 | ||
1164 | if (metaslab_debug_load && m->ms_sm != NULL) { | |
1165 | VERIFY0(metaslab_load(m)); | |
1166 | } | |
1167 | mutex_exit(&m->ms_lock); | |
1168 | } | |
1169 | ||
1170 | return (error); | |
1171 | } | |
1172 | ||
1173 | static int | |
1174 | spa_ld_unflushed_txgs(vdev_t *vd) | |
1175 | { | |
1176 | spa_t *spa = vd->vdev_spa; | |
1177 | objset_t *mos = spa_meta_objset(spa); | |
1178 | ||
1179 | if (vd->vdev_top_zap == 0) | |
1180 | return (0); | |
1181 | ||
1182 | uint64_t object = 0; | |
1183 | int error = zap_lookup(mos, vd->vdev_top_zap, | |
1184 | VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, | |
1185 | sizeof (uint64_t), 1, &object); | |
1186 | if (error == ENOENT) | |
1187 | return (0); | |
1188 | else if (error != 0) { | |
1189 | spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at " | |
1190 | "zap_lookup(vdev_top_zap=%llu) [error %d]", | |
1191 | (u_longlong_t)vd->vdev_top_zap, error); | |
1192 | return (error); | |
1193 | } | |
1194 | ||
1195 | for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { | |
1196 | metaslab_t *ms = vd->vdev_ms[m]; | |
1197 | ASSERT(ms != NULL); | |
1198 | ||
1199 | metaslab_unflushed_phys_t entry; | |
1200 | uint64_t entry_size = sizeof (entry); | |
1201 | uint64_t entry_offset = ms->ms_id * entry_size; | |
1202 | ||
1203 | error = dmu_read(mos, object, | |
1204 | entry_offset, entry_size, &entry, 0); | |
1205 | if (error != 0) { | |
1206 | spa_load_failed(spa, "spa_ld_unflushed_txgs(): " | |
1207 | "failed at dmu_read(obj=%llu) [error %d]", | |
1208 | (u_longlong_t)object, error); | |
1209 | return (error); | |
1210 | } | |
1211 | ||
1212 | ms->ms_unflushed_txg = entry.msp_unflushed_txg; | |
1213 | if (ms->ms_unflushed_txg != 0) { | |
1214 | mutex_enter(&spa->spa_flushed_ms_lock); | |
1215 | avl_add(&spa->spa_metaslabs_by_flushed, ms); | |
1216 | mutex_exit(&spa->spa_flushed_ms_lock); | |
1217 | } | |
1218 | } | |
1219 | return (0); | |
1220 | } | |
1221 | ||
1222 | /* | |
1223 | * Read all the log space map entries into their respective | |
1224 | * metaslab unflushed trees and keep them sorted by TXG in the | |
1225 | * SPA's metadata. In addition, setup all the metadata for the | |
1226 | * memory and the block heuristics. | |
1227 | */ | |
1228 | int | |
1229 | spa_ld_log_spacemaps(spa_t *spa) | |
1230 | { | |
1231 | int error; | |
1232 | ||
1233 | spa_log_sm_set_blocklimit(spa); | |
1234 | ||
1235 | for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) { | |
1236 | vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; | |
1237 | error = spa_ld_unflushed_txgs(vd); | |
1238 | if (error != 0) | |
1239 | return (error); | |
1240 | } | |
1241 | ||
1242 | error = spa_ld_log_sm_metadata(spa); | |
1243 | if (error != 0) | |
1244 | return (error); | |
1245 | ||
1246 | /* | |
1247 | * Note: we don't actually expect anything to change at this point | |
1248 | * but we grab the config lock so we don't fail any assertions | |
1249 | * when using vdev_lookup_top(). | |
1250 | */ | |
1251 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
1252 | error = spa_ld_log_sm_data(spa); | |
1253 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
1254 | ||
1255 | return (error); | |
1256 | } | |
1257 | ||
#if defined(_KERNEL)
/*
 * Kernel module parameters for the log space map tunables, each
 * registered with mode 0644 and a description string.
 */
/* BEGIN CSTYLED */
module_param(zfs_unflushed_max_mem_amt, ulong, 0644);
MODULE_PARM_DESC(zfs_unflushed_max_mem_amt,
    "Specific hard-limit in memory that ZFS allows to be used for "
    "unflushed changes");

module_param(zfs_unflushed_max_mem_ppm, ulong, 0644);
MODULE_PARM_DESC(zfs_unflushed_max_mem_ppm,
    "Percentage of the overall system memory that ZFS allows to be "
    "used for unflushed changes (value is calculated over 1000000 for "
    "finer granularity)");

module_param(zfs_unflushed_log_block_max, ulong, 0644);
MODULE_PARM_DESC(zfs_unflushed_log_block_max,
    "Hard limit (upper-bound) in the size of the space map log "
    "in terms of blocks.");

module_param(zfs_unflushed_log_block_min, ulong, 0644);
MODULE_PARM_DESC(zfs_unflushed_log_block_min,
    "Lower-bound limit for the maximum amount of blocks allowed in "
    "log spacemap (see zfs_unflushed_log_block_max)");

module_param(zfs_unflushed_log_block_pct, ulong, 0644);
MODULE_PARM_DESC(zfs_unflushed_log_block_pct,
    "Tunable used to determine the number of blocks that can be "
    "used for the spacemap log, expressed as a percentage of the "
    "total number of metaslabs in the pool (e.g. 400 means the "
    "number of log blocks is capped at 4 times the number of "
    "metaslabs)");

module_param(zfs_max_log_walking, ulong, 0644);
MODULE_PARM_DESC(zfs_max_log_walking,
    "The number of past TXGs that the flushing algorithm of the log "
    "spacemap feature uses to estimate incoming log blocks");

module_param(zfs_max_logsm_summary_length, ulong, 0644);
MODULE_PARM_DESC(zfs_max_logsm_summary_length,
    "Maximum number of rows allowed in the summary of "
    "the spacemap log");

module_param(zfs_min_metaslabs_to_flush, ulong, 0644);
MODULE_PARM_DESC(zfs_min_metaslabs_to_flush,
    "Minimum number of metaslabs to flush per dirty TXG");

module_param(zfs_keep_log_spacemaps_at_export, int, 0644);
MODULE_PARM_DESC(zfs_keep_log_spacemaps_at_export,
    "Prevent the log spacemaps from being flushed and destroyed "
    "during pool export/destroy");
/* END CSTYLED */
#endif