]>
Commit | Line | Data |
---|---|---|
67a1b037 PJD |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
3a03c963 | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
67a1b037 PJD |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | /* | |
23 | * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek | |
24 | */ | |
25 | ||
26 | #include <sys/zfs_context.h> | |
27 | #include <sys/spa.h> | |
28 | #include <sys/spa_impl.h> | |
29 | #include <sys/zio.h> | |
30 | #include <sys/brt.h> | |
803a9c12 | 31 | #include <sys/brt_impl.h> |
67a1b037 PJD |
32 | #include <sys/ddt.h> |
33 | #include <sys/bitmap.h> | |
34 | #include <sys/zap.h> | |
35 | #include <sys/dmu_tx.h> | |
36 | #include <sys/arc.h> | |
37 | #include <sys/dsl_pool.h> | |
38 | #include <sys/dsl_scan.h> | |
39 | #include <sys/vdev_impl.h> | |
40 | #include <sys/kstat.h> | |
41 | #include <sys/wmsum.h> | |
42 | ||
43 | /* | |
44 | * Block Cloning design. | |
45 | * | |
46 | * Block Cloning lets one manually clone a file (or a subset of its blocks) | |
47 | * into another (or the same) file by just creating additional references to | |
48 | * the data blocks without copying the data itself. Those references are kept | |
49 | * in the Block Reference Tables (BRTs). | |
50 | * | |
51 | * In many ways this is similar to the existing deduplication, but there are | |
52 | * some important differences: | |
53 | * | |
54 | * - Deduplication is automatic and Block Cloning is not - one has to use a | |
55 | * dedicated system call(s) to clone the given file/blocks. | |
56 | * - Deduplication keeps all data blocks in its table, even those referenced | |
57 | * just once. Block Cloning creates an entry in its tables only when there | |
58 | * are at least two references to the given data block. If the block was | |
59 | * never explicitly cloned or the second to last reference was dropped, | |
60 | * there will be neither space nor performance overhead. | |
61 | * - Deduplication needs data to work - one needs to pass real data to the | |
62 | * write(2) syscall, so hash can be calculated. Block Cloning doesn't require | |
63 | * data, just block pointers to the data, so it is extremely fast, as we pay | |
64 | * neither the cost of reading the data, nor the cost of writing the data - | |
65 | * we operate exclusively on metadata. | |
66 | * - If the D (dedup) bit is not set in the block pointer, it means that | |
67 | * the block is not in the dedup table (DDT) and we won't consult the DDT | |
68 | * when we need to free the block. Block Cloning must be consulted on every | |
69 | * free, because we cannot modify the source BP (eg. by setting something | |
70 | * similar to the D bit), thus we have no hint if the block is in the | |
71 | * Block Reference Table (BRT), so we need to look into the BRT. There is | |
72 | * an optimization in place that allows us to eliminate the majority of BRT | |
73 | * lookups which is described below in the "Minimizing free penalty" section. | |
74 | * - The BRT entry is much smaller than the DDT entry - for BRT we only store | |
75 | * 64bit offset and 64bit reference counter. | |
76 | * - Dedup keys are cryptographic hashes, so two blocks that are close to each | |
77 | * other on disk are most likely in totally different parts of the DDT. | |
78 | * The BRT entry keys are offsets into a single top-level VDEV, so data blocks | |
79 | * from one file should have BRT entries close to each other. | |
80 | * - Scrub will only do a single pass over a block that is referenced multiple | |
81 | * times in the DDT. Unfortunately it is not currently (if at all) possible | |
82 | * with Block Cloning and block referenced multiple times will be scrubbed | |
83 | * multiple times. The new, sorted scrub should be able to eliminate | |
84 | * duplicated reads given enough memory. | |
85 | * - Deduplication requires cryptographically strong hash as a checksum or | |
86 | * additional data verification. Block Cloning works with any checksum | |
87 | * algorithm or even with checksumming disabled. | |
88 | * | |
89 | * As mentioned above, the BRT entries are much smaller than the DDT entries. | |
90 | * To uniquely identify a block we just need its vdev id and offset. We also | |
91 | * need to maintain a reference counter. The vdev id will often repeat, as there | |
92 | * is a small number of top-level VDEVs and a large number of blocks stored in | |
93 | * each VDEV. We take advantage of that to reduce the BRT entry size further by | |
94 | * maintaining one BRT for each top-level VDEV, so we can then have only offset | |
95 | * and counter as the BRT entry. | |
96 | * | |
97 | * Minimizing free penalty. | |
98 | * | |
99 | * Block Cloning allows creating additional references to any existing block. | |
100 | * When we free a block there is no hint in the block pointer whether the block | |
101 | * was cloned or not, so on each free we have to check if there is a | |
102 | * corresponding entry in the BRT or not. If there is, we need to decrease | |
103 | * the reference counter. Doing BRT lookup on every free can potentially be | |
104 | * expensive by requiring additional I/Os if the BRT doesn't fit into memory. | |
105 | * This is the main problem with deduplication, so we've learned our lesson and | |
106 | * try not to repeat the same mistake here. How do we do that? We divide each | |
107 | * top-level VDEV into 16MB regions. For each region we maintain a counter that | |
108 | * is a sum of all the BRT entries that have offsets within the region. This | |
109 | * creates the entries count array of 16bit numbers for each top-level VDEV. | |
110 | * The entries count array is always kept in memory and updated on disk in the | |
111 | * same transaction group as the BRT updates to keep everything in-sync. We can | |
112 | * keep the array in memory, because it is very small. With 16MB regions and | |
113 | * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease | |
114 | * the region size even further in the future). Now, when we want to free | |
115 | * a block, we first consult the array. If the counter for the whole region is | |
116 | * zero, there is no need to look for the BRT entry, as there isn't one for | |
117 | * sure. If the counter for the region is greater than zero, only then we will | |
118 | * do a BRT lookup and if an entry is found we will decrease the reference | |
119 | * counter in the BRT entry and in the entry counters array. | |
120 | * | |
121 | * The entry counters array is small, but can potentially be larger for very | |
122 | * large VDEVs or smaller regions. In this case we don't want to rewrite entire | |
123 | * array on every change. We then divide the array into 32kB blocks and keep | |
124 | * a bitmap of dirty blocks within a transaction group. When we sync the | |
125 | * transaction group we can only update the parts of the entry counters array | |
126 | * that were modified. Note: Keeping track of the dirty parts of the entry | |
127 | * counters array is implemented, but updating only parts of the array on disk | |
128 | * is not yet implemented - for now we will update entire array if there was | |
129 | * any change. | |
130 | * | |
131 | * The implementation tries to be economic: if BRT is not used, or no longer | |
132 | * used, there will be no entries in the MOS and no additional memory used (eg. | |
133 | * the entry counters array is only allocated if needed). | |
134 | * | |
135 | * Interaction between Deduplication and Block Cloning. | |
136 | * | |
137 | * If both functionalities are in use, we could end up with a block that is | |
138 | * referenced multiple times in both DDT and BRT. When we free one of the | |
139 | * references we couldn't tell where it belongs, so we would have to decide | |
140 | * what table takes the precedence: do we first clear DDT references or BRT | |
141 | * references? To avoid this dilemma BRT cooperates with DDT - if a given block | |
142 | * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will | |
143 | * lookup DDT entry instead and increase the counter there. No BRT entry | |
144 | * will be created for a block which has the D (dedup) bit set. | |
145 | * BRT may be more efficient for manual deduplication, but if the block is | |
146 | * already in the DDT, then creating additional BRT entry would be less | |
147 | * efficient. This clever idea was proposed by Allan Jude. | |
148 | * | |
149 | * Block Cloning across datasets. | |
150 | * | |
151 | * Block Cloning is not limited to cloning blocks within the same dataset. | |
152 | * It is possible (and very useful) to clone blocks between different datasets. | |
153 | * One use case is recovering files from snapshots. By cloning the files into | |
154 | * dataset we need no additional storage. Without Block Cloning we would need | |
155 | * additional space for those files. | |
156 | * Another interesting use case is moving the files between datasets | |
157 | * (copying the file content to the new dataset and removing the source file). | |
158 | * In that case Block Cloning will only be used briefly, because the BRT entries | |
159 | * will be removed when the source is removed. | |
160 | * Note: currently it is not possible to clone blocks between encrypted | |
161 | * datasets, even if those datasets use the same encryption key (this includes | |
162 | * snapshots of encrypted datasets). Cloning blocks between datasets that use | |
163 | * the same keys should be possible and should be implemented in the future. | |
164 | * | |
165 | * Block Cloning flow through ZFS layers. | |
166 | * | |
167 | * Note: Block Cloning can be used both for cloning file system blocks and ZVOL | |
168 | * blocks. As of this writing no interface is implemented that allows for block | |
169 | * cloning within a ZVOL. | |
170 | * FreeBSD and Linux provide the copy_file_range(2) system call and we will | |
171 | * use it for block cloning. | |
172 | * | |
173 | * ssize_t | |
174 | * copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp, | |
175 | * size_t len, unsigned int flags); | |
176 | * | |
177 | * Even though offsets and length represent bytes, they have to be | |
019dea0a | 178 | * block-aligned or we will return an error so the upper layer can |
67a1b037 PJD |
179 | * fallback to the generic mechanism that will just copy the data. |
180 | * Using copy_file_range(2) will call OS-independent zfs_clone_range() function. | |
181 | * This function was implemented based on zfs_write(), but instead of writing | |
182 | * the given data we first read block pointers using the new dmu_read_l0_bps() | |
183 | * function from the source file. Once we have BPs from the source file we call | |
184 | * the dmu_brt_clone() function on the destination file. This function | |
185 | * allocates BPs for us. We iterate over all source BPs. If the given BP is | |
186 | * a hole or an embedded block, we just copy BP as-is. If it points to a real | |
187 | * data we place this BP on a BRT pending list using the brt_pending_add() | |
188 | * function. | |
189 | * | |
190 | * We use this pending list to keep track of all BPs that got new references | |
191 | * within this transaction group. | |
192 | * | |
193 | * Some special cases to consider and how we address them: | |
194 | * - The block we want to clone may have been created within the same | |
195 | * transaction group that we are trying to clone. Such block has no BP | |
019dea0a | 196 | * allocated yet, so cannot be immediately cloned. We return EAGAIN. |
67a1b037 | 197 | * - The block we want to clone may have been modified within the same |
019dea0a | 198 | * transaction group. We return EAGAIN. |
67a1b037 PJD |
199 | * - A block may be cloned multiple times during one transaction group (that's |
200 | * why pending list is actually a tree and not an append-only list - this | |
201 | * way we can figure out faster if this block is cloned for the first time | |
202 | * in this txg or consecutive time). | |
203 | * - A block may be cloned and freed within the same transaction group | |
204 | * (see dbuf_undirty()). | |
205 | * - A block may be cloned and within the same transaction group the clone | |
206 | * can be cloned again (see dmu_read_l0_bps()). | |
207 | * - A file might have been deleted, but the caller still has a file descriptor | |
208 | * open to this file and clones it. | |
209 | * | |
210 | * When we free a block we have an additional step in the ZIO pipeline where we | |
211 | * call the zio_brt_free() function. We then call the brt_entry_decref() | |
212 | * that loads the corresponding BRT entry (if one exists) and decreases | |
213 | * reference counter. If this is not the last reference we will stop ZIO | |
214 | * pipeline here. If this is the last reference or the block is not in the | |
215 | * BRT, we continue the pipeline and free the block as usual. | |
216 | * | |
217 | * At the beginning of spa_sync() where there can be no more block cloning, | |
218 | * but before issuing frees we call brt_pending_apply(). This function applies | |
219 | * all the new clones to the BRT table - we load BRT entries and update | |
220 | * reference counters. To sync new BRT entries to disk, we use brt_sync() | |
221 | * function. This function will sync all dirty per-top-level-vdev BRTs, | |
222 | * the entry counters arrays, etc. | |
223 | * | |
224 | * Block Cloning and ZIL. | |
225 | * | |
226 | * Every clone operation is divided into chunks (similar to write) and each | |
227 | * chunk is cloned in a separate transaction. The chunk size is determined by | |
228 | * how many BPs we can fit into a single ZIL entry. | |
229 | * Replaying clone operation is different from the regular clone operation, | |
230 | * as when we log clone operations we cannot use the source object - it may | |
231 | * reside on a different dataset, so we log BPs we want to clone. | |
232 | * The ZIL is replayed when we mount the given dataset, not when the pool is | |
233 | * imported. Taking this into account it is possible that the pool is imported | |
234 | * without mounting datasets and the source dataset is destroyed before the | |
235 | * destination dataset is mounted and its ZIL replayed. | |
236 | * To address this situation we leverage zil_claim() mechanism where ZFS will | |
237 | * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE | |
a03ebd9b AM |
238 | * entries, we will bump reference counters for their BPs in the BRT. Then |
239 | * on mount and ZIL replay we bump the reference counters once more, while the | |
240 | * first references are dropped during ZIL destroy by zil_free_clone_range(). | |
241 | * It is possible that after zil_claim() we never mount the destination, so | |
242 | * we never replay its ZIL and just destroy it. In this case the only taken | |
243 | * references will be dropped by zil_free_clone_range(), since the cloning is | |
244 | * not going to ever take place. | |
67a1b037 PJD |
245 | */ |
246 | ||
67a1b037 PJD |
247 | static kmem_cache_t *brt_entry_cache; |
248 | static kmem_cache_t *brt_pending_entry_cache; | |
249 | ||
250 | /* | |
251 | * Enable/disable prefetching of BRT entries that we are going to modify. | |
252 | */ | |
253 | int zfs_brt_prefetch = 1; | |
254 | ||
/*
 * BRT_DEBUG(fmt, ...): debug logging for the BRT code.
 *
 * Without ZFS_DEBUG the macro expands to an empty statement. With ZFS_DEBUG
 * the message is handed to __dprintf() only when the ZFS_DEBUG_BRT bit is
 * set in zfs_flags.
 */
#ifndef ZFS_DEBUG
#define	BRT_DEBUG(...)	do { } while (0)
#else
#define	BRT_DEBUG(...)	do {						\
	if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {				\
		__dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
	}								\
} while (0)
#endif
264 | ||
265 | int brt_zap_leaf_blockshift = 12; | |
266 | int brt_zap_indirect_blockshift = 12; | |
267 | ||
268 | static kstat_t *brt_ksp; | |
269 | ||
270 | typedef struct brt_stats { | |
271 | kstat_named_t brt_addref_entry_in_memory; | |
272 | kstat_named_t brt_addref_entry_not_on_disk; | |
273 | kstat_named_t brt_addref_entry_on_disk; | |
274 | kstat_named_t brt_addref_entry_read_lost_race; | |
275 | kstat_named_t brt_decref_entry_in_memory; | |
276 | kstat_named_t brt_decref_entry_loaded_from_disk; | |
277 | kstat_named_t brt_decref_entry_not_in_memory; | |
278 | kstat_named_t brt_decref_entry_not_on_disk; | |
279 | kstat_named_t brt_decref_entry_read_lost_race; | |
280 | kstat_named_t brt_decref_entry_still_referenced; | |
281 | kstat_named_t brt_decref_free_data_later; | |
282 | kstat_named_t brt_decref_free_data_now; | |
283 | kstat_named_t brt_decref_no_entry; | |
284 | } brt_stats_t; | |
285 | ||
286 | static brt_stats_t brt_stats = { | |
287 | { "addref_entry_in_memory", KSTAT_DATA_UINT64 }, | |
288 | { "addref_entry_not_on_disk", KSTAT_DATA_UINT64 }, | |
289 | { "addref_entry_on_disk", KSTAT_DATA_UINT64 }, | |
290 | { "addref_entry_read_lost_race", KSTAT_DATA_UINT64 }, | |
291 | { "decref_entry_in_memory", KSTAT_DATA_UINT64 }, | |
292 | { "decref_entry_loaded_from_disk", KSTAT_DATA_UINT64 }, | |
293 | { "decref_entry_not_in_memory", KSTAT_DATA_UINT64 }, | |
294 | { "decref_entry_not_on_disk", KSTAT_DATA_UINT64 }, | |
295 | { "decref_entry_read_lost_race", KSTAT_DATA_UINT64 }, | |
296 | { "decref_entry_still_referenced", KSTAT_DATA_UINT64 }, | |
297 | { "decref_free_data_later", KSTAT_DATA_UINT64 }, | |
298 | { "decref_free_data_now", KSTAT_DATA_UINT64 }, | |
299 | { "decref_no_entry", KSTAT_DATA_UINT64 } | |
300 | }; | |
301 | ||
302 | struct { | |
303 | wmsum_t brt_addref_entry_in_memory; | |
304 | wmsum_t brt_addref_entry_not_on_disk; | |
305 | wmsum_t brt_addref_entry_on_disk; | |
306 | wmsum_t brt_addref_entry_read_lost_race; | |
307 | wmsum_t brt_decref_entry_in_memory; | |
308 | wmsum_t brt_decref_entry_loaded_from_disk; | |
309 | wmsum_t brt_decref_entry_not_in_memory; | |
310 | wmsum_t brt_decref_entry_not_on_disk; | |
311 | wmsum_t brt_decref_entry_read_lost_race; | |
312 | wmsum_t brt_decref_entry_still_referenced; | |
313 | wmsum_t brt_decref_free_data_later; | |
314 | wmsum_t brt_decref_free_data_now; | |
315 | wmsum_t brt_decref_no_entry; | |
316 | } brt_sums; | |
317 | ||
/* Increment the brt_sums counter called `stat' by one. */
#define	BRTSTAT_BUMP(stat)	wmsum_add(&brt_sums.stat, 1)

/*
 * Comparators defined later in the file; brt_entry_compare() is needed here
 * already because brt_vdev_realloc() passes it to avl_create().
 */
static int brt_entry_compare(const void *x1, const void *x2);
static int brt_pending_entry_compare(const void *x1, const void *x2);
322 | ||
323 | static void | |
324 | brt_rlock(brt_t *brt) | |
325 | { | |
326 | rw_enter(&brt->brt_lock, RW_READER); | |
327 | } | |
328 | ||
329 | static void | |
330 | brt_wlock(brt_t *brt) | |
331 | { | |
332 | rw_enter(&brt->brt_lock, RW_WRITER); | |
333 | } | |
334 | ||
335 | static void | |
336 | brt_unlock(brt_t *brt) | |
337 | { | |
338 | rw_exit(&brt->brt_lock); | |
339 | } | |
340 | ||
341 | static uint16_t | |
342 | brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) | |
343 | { | |
344 | ||
345 | ASSERT3U(idx, <, brtvd->bv_size); | |
346 | ||
347 | if (brtvd->bv_need_byteswap) { | |
348 | return (BSWAP_16(brtvd->bv_entcount[idx])); | |
349 | } else { | |
350 | return (brtvd->bv_entcount[idx]); | |
351 | } | |
352 | } | |
353 | ||
354 | static void | |
355 | brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt) | |
356 | { | |
357 | ||
358 | ASSERT3U(idx, <, brtvd->bv_size); | |
359 | ||
360 | if (brtvd->bv_need_byteswap) { | |
361 | brtvd->bv_entcount[idx] = BSWAP_16(entcnt); | |
362 | } else { | |
363 | brtvd->bv_entcount[idx] = entcnt; | |
364 | } | |
365 | } | |
366 | ||
367 | static void | |
368 | brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx) | |
369 | { | |
370 | uint16_t entcnt; | |
371 | ||
372 | ASSERT3U(idx, <, brtvd->bv_size); | |
373 | ||
374 | entcnt = brt_vdev_entcount_get(brtvd, idx); | |
375 | ASSERT(entcnt < UINT16_MAX); | |
376 | ||
377 | brt_vdev_entcount_set(brtvd, idx, entcnt + 1); | |
378 | } | |
379 | ||
380 | static void | |
381 | brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx) | |
382 | { | |
383 | uint16_t entcnt; | |
384 | ||
385 | ASSERT3U(idx, <, brtvd->bv_size); | |
386 | ||
387 | entcnt = brt_vdev_entcount_get(brtvd, idx); | |
388 | ASSERT(entcnt > 0); | |
389 | ||
390 | brt_vdev_entcount_set(brtvd, idx, entcnt - 1); | |
391 | } | |
392 | ||
#ifdef ZFS_DEBUG
/*
 * Write the in-memory state of every per-vdev BRT to the ZFS debug log.
 *
 * Only built with ZFS_DEBUG, and a no-op unless ZFS_DEBUG_BRT is set in
 * zfs_flags. For each vdev slot we log the summary fields, then every
 * non-zero entry counter (if bv_totalcount > 0), then the dirty-block
 * bitmap rendered as a string of 'x' (bit set) and '.' (bit clear) when
 * bv_entcount_dirty is set.
 */
static void
brt_vdev_dump(brt_t *brt)
{
	brt_vdev_t *brtvd;
	uint64_t vdevid;

	if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
		return;
	}

	if (brt->brt_nvdevs == 0) {
		zfs_dbgmsg("BRT empty");
		return;
	}

	zfs_dbgmsg("BRT vdev dump:");
	for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
		uint64_t idx;

		brtvd = &brt->brt_vdevs[vdevid];
		/*
		 * No trailing newline here: none of the other zfs_dbgmsg()
		 * calls in this function carry one.
		 */
		zfs_dbgmsg("  vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
		    "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu",
		    (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
		    brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
		    (u_longlong_t)brtvd->bv_size,
		    (u_longlong_t)brtvd->bv_totalcount,
		    (u_longlong_t)brtvd->bv_nblocks,
		    (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
		if (brtvd->bv_totalcount > 0) {
			zfs_dbgmsg("    entcounts:");
			for (idx = 0; idx < brtvd->bv_size; idx++) {
				if (brt_vdev_entcount_get(brtvd, idx) > 0) {
					zfs_dbgmsg("      [%04llu] %hu",
					    (u_longlong_t)idx,
					    brt_vdev_entcount_get(brtvd, idx));
				}
			}
		}
		if (brtvd->bv_entcount_dirty) {
			char *bitmap;

			/* One character per bitmap bit plus the terminator. */
			bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
			for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
				bitmap[idx] =
				    BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
			}
			bitmap[idx] = '\0';
			zfs_dbgmsg("    bitmap: %s", bitmap);
			kmem_free(bitmap, brtvd->bv_nblocks + 1);
		}
	}
}
#endif
447 | ||
448 | static brt_vdev_t * | |
449 | brt_vdev(brt_t *brt, uint64_t vdevid) | |
450 | { | |
451 | brt_vdev_t *brtvd; | |
452 | ||
453 | ASSERT(RW_LOCK_HELD(&brt->brt_lock)); | |
454 | ||
455 | if (vdevid < brt->brt_nvdevs) { | |
456 | brtvd = &brt->brt_vdevs[vdevid]; | |
457 | } else { | |
458 | brtvd = NULL; | |
459 | } | |
460 | ||
461 | return (brtvd); | |
462 | } | |
463 | ||
464 | static void | |
465 | brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) | |
466 | { | |
467 | char name[64]; | |
468 | ||
469 | ASSERT(RW_WRITE_HELD(&brt->brt_lock)); | |
470 | ASSERT0(brtvd->bv_mos_brtvdev); | |
471 | ASSERT0(brtvd->bv_mos_entries); | |
472 | ASSERT(brtvd->bv_entcount != NULL); | |
473 | ASSERT(brtvd->bv_size > 0); | |
474 | ASSERT(brtvd->bv_bitmap != NULL); | |
475 | ASSERT(brtvd->bv_nblocks > 0); | |
476 | ||
477 | brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0, | |
478 | ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA, | |
479 | brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE, | |
480 | 0, tx); | |
481 | VERIFY(brtvd->bv_mos_entries != 0); | |
482 | BRT_DEBUG("MOS entries created, object=%llu", | |
483 | (u_longlong_t)brtvd->bv_mos_entries); | |
484 | ||
485 | /* | |
486 | * We allocate DMU buffer to store the bv_entcount[] array. | |
487 | * We will keep array size (bv_size) and cummulative count for all | |
488 | * bv_entcount[]s (bv_totalcount) in the bonus buffer. | |
489 | */ | |
490 | brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos, | |
491 | DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE, | |
492 | DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx); | |
493 | VERIFY(brtvd->bv_mos_brtvdev != 0); | |
494 | BRT_DEBUG("MOS BRT VDEV created, object=%llu", | |
495 | (u_longlong_t)brtvd->bv_mos_brtvdev); | |
496 | ||
497 | snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, | |
498 | (u_longlong_t)brtvd->bv_vdevid); | |
499 | VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, | |
500 | sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx)); | |
501 | BRT_DEBUG("Pool directory object created, object=%s", name); | |
502 | ||
503 | spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); | |
504 | } | |
505 | ||
506 | static void | |
507 | brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd) | |
508 | { | |
509 | vdev_t *vd; | |
510 | uint16_t *entcount; | |
511 | ulong_t *bitmap; | |
512 | uint64_t nblocks, size; | |
513 | ||
514 | ASSERT(RW_WRITE_HELD(&brt->brt_lock)); | |
515 | ||
516 | spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER); | |
517 | vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid); | |
518 | size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1; | |
519 | spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG); | |
520 | ||
87a6e135 | 521 | entcount = vmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP); |
67a1b037 PJD |
522 | nblocks = BRT_RANGESIZE_TO_NBLOCKS(size); |
523 | bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP); | |
524 | ||
525 | if (!brtvd->bv_initiated) { | |
526 | ASSERT0(brtvd->bv_size); | |
527 | ASSERT(brtvd->bv_entcount == NULL); | |
528 | ASSERT(brtvd->bv_bitmap == NULL); | |
529 | ASSERT0(brtvd->bv_nblocks); | |
530 | ||
531 | avl_create(&brtvd->bv_tree, brt_entry_compare, | |
532 | sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node)); | |
533 | } else { | |
534 | ASSERT(brtvd->bv_size > 0); | |
535 | ASSERT(brtvd->bv_entcount != NULL); | |
536 | ASSERT(brtvd->bv_bitmap != NULL); | |
537 | ASSERT(brtvd->bv_nblocks > 0); | |
538 | /* | |
539 | * TODO: Allow vdev shrinking. We only need to implement | |
540 | * shrinking the on-disk BRT VDEV object. | |
541 | * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset, | |
542 | * size, tx); | |
543 | */ | |
544 | ASSERT3U(brtvd->bv_size, <=, size); | |
545 | ||
546 | memcpy(entcount, brtvd->bv_entcount, | |
547 | sizeof (entcount[0]) * MIN(size, brtvd->bv_size)); | |
548 | memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks), | |
549 | BT_SIZEOFMAP(brtvd->bv_nblocks))); | |
87a6e135 | 550 | vmem_free(brtvd->bv_entcount, |
67a1b037 PJD |
551 | sizeof (entcount[0]) * brtvd->bv_size); |
552 | kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); | |
553 | } | |
554 | ||
555 | brtvd->bv_size = size; | |
556 | brtvd->bv_entcount = entcount; | |
557 | brtvd->bv_bitmap = bitmap; | |
558 | brtvd->bv_nblocks = nblocks; | |
559 | if (!brtvd->bv_initiated) { | |
560 | brtvd->bv_need_byteswap = FALSE; | |
561 | brtvd->bv_initiated = TRUE; | |
562 | BRT_DEBUG("BRT VDEV %llu initiated.", | |
563 | (u_longlong_t)brtvd->bv_vdevid); | |
564 | } | |
565 | } | |
566 | ||
567 | static void | |
568 | brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd) | |
569 | { | |
570 | char name[64]; | |
571 | dmu_buf_t *db; | |
572 | brt_vdev_phys_t *bvphys; | |
573 | int error; | |
574 | ||
575 | snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, | |
576 | (u_longlong_t)brtvd->bv_vdevid); | |
577 | error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, | |
578 | sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev); | |
579 | if (error != 0) | |
580 | return; | |
581 | ASSERT(brtvd->bv_mos_brtvdev != 0); | |
582 | ||
583 | error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db); | |
584 | ASSERT0(error); | |
585 | if (error != 0) | |
586 | return; | |
587 | ||
588 | bvphys = db->db_data; | |
589 | if (brt->brt_rangesize == 0) { | |
590 | brt->brt_rangesize = bvphys->bvp_rangesize; | |
591 | } else { | |
592 | ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize); | |
593 | } | |
594 | ||
595 | ASSERT(!brtvd->bv_initiated); | |
596 | brt_vdev_realloc(brt, brtvd); | |
597 | ||
598 | /* TODO: We don't support VDEV shrinking. */ | |
599 | ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size); | |
600 | ||
601 | /* | |
602 | * If VDEV grew, we will leave new bv_entcount[] entries zeroed out. | |
603 | */ | |
604 | error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, | |
605 | MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t), | |
606 | brtvd->bv_entcount, DMU_READ_NO_PREFETCH); | |
607 | ASSERT0(error); | |
608 | ||
609 | brtvd->bv_mos_entries = bvphys->bvp_mos_entries; | |
610 | ASSERT(brtvd->bv_mos_entries != 0); | |
611 | brtvd->bv_need_byteswap = | |
612 | (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER); | |
613 | brtvd->bv_totalcount = bvphys->bvp_totalcount; | |
614 | brtvd->bv_usedspace = bvphys->bvp_usedspace; | |
615 | brtvd->bv_savedspace = bvphys->bvp_savedspace; | |
616 | brt->brt_usedspace += brtvd->bv_usedspace; | |
617 | brt->brt_savedspace += brtvd->bv_savedspace; | |
618 | ||
619 | dmu_buf_rele(db, FTAG); | |
620 | ||
621 | BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu", | |
622 | name, (u_longlong_t)brtvd->bv_mos_brtvdev, | |
623 | (u_longlong_t)brtvd->bv_mos_entries); | |
624 | } | |
625 | ||
626 | static void | |
627 | brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd) | |
628 | { | |
629 | ||
630 | ASSERT(RW_WRITE_HELD(&brt->brt_lock)); | |
631 | ASSERT(brtvd->bv_initiated); | |
632 | ||
87a6e135 | 633 | vmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size); |
67a1b037 PJD |
634 | brtvd->bv_entcount = NULL; |
635 | kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks)); | |
636 | brtvd->bv_bitmap = NULL; | |
637 | ASSERT0(avl_numnodes(&brtvd->bv_tree)); | |
638 | avl_destroy(&brtvd->bv_tree); | |
639 | ||
640 | brtvd->bv_size = 0; | |
641 | brtvd->bv_nblocks = 0; | |
642 | ||
643 | brtvd->bv_initiated = FALSE; | |
644 | BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid); | |
645 | } | |
646 | ||
647 | static void | |
648 | brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) | |
649 | { | |
650 | char name[64]; | |
651 | uint64_t count; | |
652 | dmu_buf_t *db; | |
653 | brt_vdev_phys_t *bvphys; | |
654 | ||
655 | ASSERT(RW_WRITE_HELD(&brt->brt_lock)); | |
656 | ASSERT(brtvd->bv_mos_brtvdev != 0); | |
657 | ASSERT(brtvd->bv_mos_entries != 0); | |
658 | ||
659 | VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count)); | |
660 | VERIFY0(count); | |
661 | VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx)); | |
662 | BRT_DEBUG("MOS entries destroyed, object=%llu", | |
663 | (u_longlong_t)brtvd->bv_mos_entries); | |
664 | brtvd->bv_mos_entries = 0; | |
665 | ||
666 | VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); | |
667 | bvphys = db->db_data; | |
668 | ASSERT0(bvphys->bvp_totalcount); | |
669 | ASSERT0(bvphys->bvp_usedspace); | |
670 | ASSERT0(bvphys->bvp_savedspace); | |
671 | dmu_buf_rele(db, FTAG); | |
672 | ||
673 | VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx)); | |
674 | BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu", | |
675 | (u_longlong_t)brtvd->bv_mos_brtvdev); | |
676 | brtvd->bv_mos_brtvdev = 0; | |
677 | ||
678 | snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX, | |
679 | (u_longlong_t)brtvd->bv_vdevid); | |
680 | VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx)); | |
681 | BRT_DEBUG("Pool directory object removed, object=%s", name); | |
682 | ||
683 | brt_vdev_dealloc(brt, brtvd); | |
684 | ||
685 | spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx); | |
686 | } | |
687 | ||
688 | static void | |
689 | brt_vdevs_expand(brt_t *brt, uint64_t nvdevs) | |
690 | { | |
691 | brt_vdev_t *brtvd, *vdevs; | |
692 | uint64_t vdevid; | |
693 | ||
694 | ASSERT(RW_WRITE_HELD(&brt->brt_lock)); | |
695 | ASSERT3U(nvdevs, >, brt->brt_nvdevs); | |
696 | ||
697 | vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP); | |
698 | if (brt->brt_nvdevs > 0) { | |
699 | ASSERT(brt->brt_vdevs != NULL); | |
700 | ||
701 | memcpy(vdevs, brt->brt_vdevs, | |
702 | sizeof (brt_vdev_t) * brt->brt_nvdevs); | |
703 | kmem_free(brt->brt_vdevs, | |
704 | sizeof (brt_vdev_t) * brt->brt_nvdevs); | |
705 | } | |
706 | for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) { | |
707 | brtvd = &vdevs[vdevid]; | |
708 | ||
709 | brtvd->bv_vdevid = vdevid; | |
710 | brtvd->bv_initiated = FALSE; | |
711 | } | |
712 | ||
713 | BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.", | |
714 | (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs); | |
715 | ||
716 | brt->brt_vdevs = vdevs; | |
717 | brt->brt_nvdevs = nvdevs; | |
718 | } | |
719 | ||
720 | static boolean_t | |
721 | brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre) | |
722 | { | |
723 | uint64_t idx; | |
724 | ||
725 | ASSERT(RW_LOCK_HELD(&brt->brt_lock)); | |
726 | ||
727 | idx = bre->bre_offset / brt->brt_rangesize; | |
728 | if (brtvd->bv_entcount != NULL && idx < brtvd->bv_size) { | |
729 | /* VDEV wasn't expanded. */ | |
730 | return (brt_vdev_entcount_get(brtvd, idx) > 0); | |
731 | } | |
732 | ||
733 | return (FALSE); | |
734 | } | |
735 | ||
736 | static void | |
737 | brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, | |
738 | uint64_t dsize) | |
739 | { | |
740 | uint64_t idx; | |
741 | ||
742 | ASSERT(RW_LOCK_HELD(&brt->brt_lock)); | |
743 | ASSERT(brtvd != NULL); | |
744 | ASSERT(brtvd->bv_entcount != NULL); | |
745 | ||
746 | brt->brt_savedspace += dsize; | |
747 | brtvd->bv_savedspace += dsize; | |
748 | brtvd->bv_meta_dirty = TRUE; | |
749 | ||
750 | if (bre->bre_refcount > 1) { | |
751 | return; | |
752 | } | |
753 | ||
754 | brt->brt_usedspace += dsize; | |
755 | brtvd->bv_usedspace += dsize; | |
756 | ||
757 | idx = bre->bre_offset / brt->brt_rangesize; | |
758 | if (idx >= brtvd->bv_size) { | |
759 | /* VDEV has been expanded. */ | |
760 | brt_vdev_realloc(brt, brtvd); | |
761 | } | |
762 | ||
763 | ASSERT3U(idx, <, brtvd->bv_size); | |
764 | ||
765 | brtvd->bv_totalcount++; | |
766 | brt_vdev_entcount_inc(brtvd, idx); | |
767 | brtvd->bv_entcount_dirty = TRUE; | |
768 | idx = idx / BRT_BLOCKSIZE / 8; | |
769 | BT_SET(brtvd->bv_bitmap, idx); | |
770 | ||
771 | #ifdef ZFS_DEBUG | |
772 | brt_vdev_dump(brt); | |
773 | #endif | |
774 | } | |
775 | ||
776 | static void | |
777 | brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, | |
778 | uint64_t dsize) | |
779 | { | |
780 | uint64_t idx; | |
781 | ||
782 | ASSERT(RW_WRITE_HELD(&brt->brt_lock)); | |
783 | ASSERT(brtvd != NULL); | |
784 | ASSERT(brtvd->bv_entcount != NULL); | |
785 | ||
786 | brt->brt_savedspace -= dsize; | |
787 | brtvd->bv_savedspace -= dsize; | |
788 | brtvd->bv_meta_dirty = TRUE; | |
789 | ||
790 | if (bre->bre_refcount > 0) { | |
791 | return; | |
792 | } | |
793 | ||
794 | brt->brt_usedspace -= dsize; | |
795 | brtvd->bv_usedspace -= dsize; | |
796 | ||
797 | idx = bre->bre_offset / brt->brt_rangesize; | |
798 | ASSERT3U(idx, <, brtvd->bv_size); | |
799 | ||
800 | ASSERT(brtvd->bv_totalcount > 0); | |
801 | brtvd->bv_totalcount--; | |
802 | brt_vdev_entcount_dec(brtvd, idx); | |
803 | brtvd->bv_entcount_dirty = TRUE; | |
804 | idx = idx / BRT_BLOCKSIZE / 8; | |
805 | BT_SET(brtvd->bv_bitmap, idx); | |
806 | ||
807 | #ifdef ZFS_DEBUG | |
808 | brt_vdev_dump(brt); | |
809 | #endif | |
810 | } | |
811 | ||
812 | static void | |
813 | brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx) | |
814 | { | |
815 | dmu_buf_t *db; | |
816 | brt_vdev_phys_t *bvphys; | |
817 | ||
818 | ASSERT(brtvd->bv_meta_dirty); | |
819 | ASSERT(brtvd->bv_mos_brtvdev != 0); | |
820 | ASSERT(dmu_tx_is_syncing(tx)); | |
821 | ||
822 | VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db)); | |
823 | ||
824 | if (brtvd->bv_entcount_dirty) { | |
825 | /* | |
826 | * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks. | |
827 | */ | |
828 | dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0, | |
829 | brtvd->bv_size * sizeof (brtvd->bv_entcount[0]), | |
830 | brtvd->bv_entcount, tx); | |
831 | memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks)); | |
832 | brtvd->bv_entcount_dirty = FALSE; | |
833 | } | |
834 | ||
835 | dmu_buf_will_dirty(db, tx); | |
836 | bvphys = db->db_data; | |
837 | bvphys->bvp_mos_entries = brtvd->bv_mos_entries; | |
838 | bvphys->bvp_size = brtvd->bv_size; | |
839 | if (brtvd->bv_need_byteswap) { | |
840 | bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER; | |
841 | } else { | |
842 | bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER; | |
843 | } | |
844 | bvphys->bvp_totalcount = brtvd->bv_totalcount; | |
845 | bvphys->bvp_rangesize = brt->brt_rangesize; | |
846 | bvphys->bvp_usedspace = brtvd->bv_usedspace; | |
847 | bvphys->bvp_savedspace = brtvd->bv_savedspace; | |
848 | dmu_buf_rele(db, FTAG); | |
849 | ||
850 | brtvd->bv_meta_dirty = FALSE; | |
851 | } | |
852 | ||
853 | static void | |
854 | brt_vdevs_alloc(brt_t *brt, boolean_t load) | |
855 | { | |
856 | brt_vdev_t *brtvd; | |
857 | uint64_t vdevid; | |
858 | ||
859 | brt_wlock(brt); | |
860 | ||
861 | brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children); | |
862 | ||
863 | if (load) { | |
864 | for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { | |
865 | brtvd = &brt->brt_vdevs[vdevid]; | |
866 | ASSERT(brtvd->bv_entcount == NULL); | |
867 | ||
868 | brt_vdev_load(brt, brtvd); | |
869 | } | |
870 | } | |
871 | ||
872 | if (brt->brt_rangesize == 0) { | |
873 | brt->brt_rangesize = BRT_RANGESIZE; | |
874 | } | |
875 | ||
876 | brt_unlock(brt); | |
877 | } | |
878 | ||
879 | static void | |
880 | brt_vdevs_free(brt_t *brt) | |
881 | { | |
882 | brt_vdev_t *brtvd; | |
883 | uint64_t vdevid; | |
884 | ||
885 | brt_wlock(brt); | |
886 | ||
887 | for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { | |
888 | brtvd = &brt->brt_vdevs[vdevid]; | |
889 | if (brtvd->bv_initiated) | |
890 | brt_vdev_dealloc(brt, brtvd); | |
891 | } | |
892 | kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs); | |
893 | ||
894 | brt_unlock(brt); | |
895 | } | |
896 | ||
897 | static void | |
898 | brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp) | |
899 | { | |
900 | ||
901 | bre->bre_offset = DVA_GET_OFFSET(&bp->blk_dva[0]); | |
902 | bre->bre_refcount = 0; | |
903 | ||
904 | *vdevidp = DVA_GET_VDEV(&bp->blk_dva[0]); | |
905 | } | |
906 | ||
907 | static int | |
908 | brt_entry_compare(const void *x1, const void *x2) | |
909 | { | |
910 | const brt_entry_t *bre1 = x1; | |
911 | const brt_entry_t *bre2 = x2; | |
912 | ||
913 | return (TREE_CMP(bre1->bre_offset, bre2->bre_offset)); | |
914 | } | |
915 | ||
916 | static int | |
917 | brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre) | |
918 | { | |
919 | uint64_t mos_entries; | |
920 | uint64_t one, physsize; | |
921 | int error; | |
922 | ||
923 | ASSERT(RW_LOCK_HELD(&brt->brt_lock)); | |
924 | ||
925 | if (!brt_vdev_lookup(brt, brtvd, bre)) | |
926 | return (SET_ERROR(ENOENT)); | |
927 | ||
928 | /* | |
929 | * Remember mos_entries object number. After we reacquire the BRT lock, | |
930 | * the brtvd pointer may be invalid. | |
931 | */ | |
932 | mos_entries = brtvd->bv_mos_entries; | |
933 | if (mos_entries == 0) | |
934 | return (SET_ERROR(ENOENT)); | |
935 | ||
936 | brt_unlock(brt); | |
937 | ||
938 | error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset, | |
939 | BRT_KEY_WORDS, &one, &physsize); | |
940 | if (error == 0) { | |
941 | ASSERT3U(one, ==, 1); | |
942 | ASSERT3U(physsize, ==, sizeof (bre->bre_refcount)); | |
943 | ||
944 | error = zap_lookup_uint64(brt->brt_mos, mos_entries, | |
945 | &bre->bre_offset, BRT_KEY_WORDS, 1, | |
946 | sizeof (bre->bre_refcount), &bre->bre_refcount); | |
947 | BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu " | |
948 | "count=%llu error=%d", (u_longlong_t)mos_entries, | |
949 | (u_longlong_t)brtvd->bv_vdevid, | |
950 | (u_longlong_t)bre->bre_offset, | |
951 | error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error); | |
952 | } | |
953 | ||
954 | brt_wlock(brt); | |
955 | ||
956 | return (error); | |
957 | } | |
958 | ||
959 | static void | |
960 | brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre) | |
961 | { | |
962 | brt_vdev_t *brtvd; | |
963 | uint64_t mos_entries = 0; | |
964 | ||
965 | brt_rlock(brt); | |
966 | brtvd = brt_vdev(brt, vdevid); | |
967 | if (brtvd != NULL) | |
968 | mos_entries = brtvd->bv_mos_entries; | |
969 | brt_unlock(brt); | |
970 | ||
971 | if (mos_entries == 0) | |
972 | return; | |
973 | ||
974 | BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu", | |
975 | (u_longlong_t)mos_entries, (u_longlong_t)vdevid, | |
976 | (u_longlong_t)bre->bre_offset); | |
977 | (void) zap_prefetch_uint64(brt->brt_mos, mos_entries, | |
978 | (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS); | |
979 | } | |
980 | ||
981 | static int | |
982 | brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) | |
983 | { | |
984 | int error; | |
985 | ||
986 | ASSERT(RW_LOCK_HELD(&brt->brt_lock)); | |
987 | ASSERT(brtvd->bv_mos_entries != 0); | |
988 | ASSERT(bre->bre_refcount > 0); | |
989 | ||
990 | error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries, | |
991 | (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1, | |
992 | sizeof (bre->bre_refcount), &bre->bre_refcount, tx); | |
993 | BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu " | |
994 | "error=%d", (u_longlong_t)brtvd->bv_mos_entries, | |
995 | (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, | |
996 | (u_longlong_t)bre->bre_refcount, error); | |
997 | ||
998 | return (error); | |
999 | } | |
1000 | ||
1001 | static int | |
1002 | brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) | |
1003 | { | |
1004 | int error; | |
1005 | ||
1006 | ASSERT(RW_LOCK_HELD(&brt->brt_lock)); | |
1007 | ASSERT(brtvd->bv_mos_entries != 0); | |
1008 | ASSERT0(bre->bre_refcount); | |
1009 | ||
1010 | error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries, | |
1011 | (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx); | |
1012 | BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu " | |
1013 | "error=%d", (u_longlong_t)brtvd->bv_mos_entries, | |
1014 | (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset, | |
1015 | (u_longlong_t)bre->bre_refcount, error); | |
1016 | ||
1017 | return (error); | |
1018 | } | |
1019 | ||
1020 | /* | |
1021 | * Return TRUE if we _can_ have BRT entry for this bp. It might be false | |
1022 | * positive, but gives us quick answer if we should look into BRT, which | |
1023 | * may require reads and thus will be more expensive. | |
1024 | */ | |
1025 | boolean_t | |
1026 | brt_maybe_exists(spa_t *spa, const blkptr_t *bp) | |
1027 | { | |
1028 | brt_t *brt = spa->spa_brt; | |
1029 | brt_vdev_t *brtvd; | |
1030 | brt_entry_t bre_search; | |
1031 | boolean_t mayexists = FALSE; | |
1032 | uint64_t vdevid; | |
1033 | ||
1034 | brt_entry_fill(bp, &bre_search, &vdevid); | |
1035 | ||
1036 | brt_rlock(brt); | |
1037 | ||
1038 | brtvd = brt_vdev(brt, vdevid); | |
1039 | if (brtvd != NULL && brtvd->bv_initiated) { | |
1040 | if (!avl_is_empty(&brtvd->bv_tree) || | |
1041 | brt_vdev_lookup(brt, brtvd, &bre_search)) { | |
1042 | mayexists = TRUE; | |
1043 | } | |
1044 | } | |
1045 | ||
1046 | brt_unlock(brt); | |
1047 | ||
1048 | return (mayexists); | |
1049 | } | |
1050 | ||
1051 | uint64_t | |
1052 | brt_get_dspace(spa_t *spa) | |
1053 | { | |
1054 | brt_t *brt = spa->spa_brt; | |
1055 | ||
1056 | if (brt == NULL) | |
1057 | return (0); | |
1058 | ||
1059 | return (brt->brt_savedspace); | |
1060 | } | |
1061 | ||
1062 | uint64_t | |
1063 | brt_get_used(spa_t *spa) | |
1064 | { | |
1065 | brt_t *brt = spa->spa_brt; | |
1066 | ||
1067 | if (brt == NULL) | |
1068 | return (0); | |
1069 | ||
1070 | return (brt->brt_usedspace); | |
1071 | } | |
1072 | ||
1073 | uint64_t | |
1074 | brt_get_saved(spa_t *spa) | |
1075 | { | |
1076 | brt_t *brt = spa->spa_brt; | |
1077 | ||
1078 | if (brt == NULL) | |
1079 | return (0); | |
1080 | ||
1081 | return (brt->brt_savedspace); | |
1082 | } | |
1083 | ||
1084 | uint64_t | |
1085 | brt_get_ratio(spa_t *spa) | |
1086 | { | |
1087 | brt_t *brt = spa->spa_brt; | |
1088 | ||
1089 | if (brt->brt_usedspace == 0) | |
1090 | return (100); | |
1091 | ||
1092 | return ((brt->brt_usedspace + brt->brt_savedspace) * 100 / | |
1093 | brt->brt_usedspace); | |
1094 | } | |
1095 | ||
1096 | static int | |
1097 | brt_kstats_update(kstat_t *ksp, int rw) | |
1098 | { | |
1099 | brt_stats_t *bs = ksp->ks_data; | |
1100 | ||
1101 | if (rw == KSTAT_WRITE) | |
1102 | return (EACCES); | |
1103 | ||
1104 | bs->brt_addref_entry_in_memory.value.ui64 = | |
1105 | wmsum_value(&brt_sums.brt_addref_entry_in_memory); | |
1106 | bs->brt_addref_entry_not_on_disk.value.ui64 = | |
1107 | wmsum_value(&brt_sums.brt_addref_entry_not_on_disk); | |
1108 | bs->brt_addref_entry_on_disk.value.ui64 = | |
1109 | wmsum_value(&brt_sums.brt_addref_entry_on_disk); | |
1110 | bs->brt_addref_entry_read_lost_race.value.ui64 = | |
1111 | wmsum_value(&brt_sums.brt_addref_entry_read_lost_race); | |
1112 | bs->brt_decref_entry_in_memory.value.ui64 = | |
1113 | wmsum_value(&brt_sums.brt_decref_entry_in_memory); | |
1114 | bs->brt_decref_entry_loaded_from_disk.value.ui64 = | |
1115 | wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk); | |
1116 | bs->brt_decref_entry_not_in_memory.value.ui64 = | |
1117 | wmsum_value(&brt_sums.brt_decref_entry_not_in_memory); | |
1118 | bs->brt_decref_entry_not_on_disk.value.ui64 = | |
1119 | wmsum_value(&brt_sums.brt_decref_entry_not_on_disk); | |
1120 | bs->brt_decref_entry_read_lost_race.value.ui64 = | |
1121 | wmsum_value(&brt_sums.brt_decref_entry_read_lost_race); | |
1122 | bs->brt_decref_entry_still_referenced.value.ui64 = | |
1123 | wmsum_value(&brt_sums.brt_decref_entry_still_referenced); | |
1124 | bs->brt_decref_free_data_later.value.ui64 = | |
1125 | wmsum_value(&brt_sums.brt_decref_free_data_later); | |
1126 | bs->brt_decref_free_data_now.value.ui64 = | |
1127 | wmsum_value(&brt_sums.brt_decref_free_data_now); | |
1128 | bs->brt_decref_no_entry.value.ui64 = | |
1129 | wmsum_value(&brt_sums.brt_decref_no_entry); | |
1130 | ||
1131 | return (0); | |
1132 | } | |
1133 | ||
1134 | static void | |
1135 | brt_stat_init(void) | |
1136 | { | |
1137 | ||
1138 | wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0); | |
1139 | wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0); | |
1140 | wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0); | |
1141 | wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0); | |
1142 | wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0); | |
1143 | wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0); | |
1144 | wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0); | |
1145 | wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0); | |
1146 | wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0); | |
1147 | wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0); | |
1148 | wmsum_init(&brt_sums.brt_decref_free_data_later, 0); | |
1149 | wmsum_init(&brt_sums.brt_decref_free_data_now, 0); | |
1150 | wmsum_init(&brt_sums.brt_decref_no_entry, 0); | |
1151 | ||
1152 | brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED, | |
1153 | sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); | |
1154 | if (brt_ksp != NULL) { | |
1155 | brt_ksp->ks_data = &brt_stats; | |
1156 | brt_ksp->ks_update = brt_kstats_update; | |
1157 | kstat_install(brt_ksp); | |
1158 | } | |
1159 | } | |
1160 | ||
1161 | static void | |
1162 | brt_stat_fini(void) | |
1163 | { | |
1164 | if (brt_ksp != NULL) { | |
1165 | kstat_delete(brt_ksp); | |
1166 | brt_ksp = NULL; | |
1167 | } | |
1168 | ||
1169 | wmsum_fini(&brt_sums.brt_addref_entry_in_memory); | |
1170 | wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk); | |
1171 | wmsum_fini(&brt_sums.brt_addref_entry_on_disk); | |
1172 | wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race); | |
1173 | wmsum_fini(&brt_sums.brt_decref_entry_in_memory); | |
1174 | wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk); | |
1175 | wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory); | |
1176 | wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk); | |
1177 | wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race); | |
1178 | wmsum_fini(&brt_sums.brt_decref_entry_still_referenced); | |
1179 | wmsum_fini(&brt_sums.brt_decref_free_data_later); | |
1180 | wmsum_fini(&brt_sums.brt_decref_free_data_now); | |
1181 | wmsum_fini(&brt_sums.brt_decref_no_entry); | |
1182 | } | |
1183 | ||
1184 | void | |
1185 | brt_init(void) | |
1186 | { | |
1187 | brt_entry_cache = kmem_cache_create("brt_entry_cache", | |
1188 | sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); | |
1189 | brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache", | |
1190 | sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); | |
1191 | ||
1192 | brt_stat_init(); | |
1193 | } | |
1194 | ||
1195 | void | |
1196 | brt_fini(void) | |
1197 | { | |
1198 | brt_stat_fini(); | |
1199 | ||
1200 | kmem_cache_destroy(brt_entry_cache); | |
1201 | kmem_cache_destroy(brt_pending_entry_cache); | |
1202 | } | |
1203 | ||
1204 | static brt_entry_t * | |
1205 | brt_entry_alloc(const brt_entry_t *bre_init) | |
1206 | { | |
1207 | brt_entry_t *bre; | |
1208 | ||
1209 | bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP); | |
1210 | bre->bre_offset = bre_init->bre_offset; | |
1211 | bre->bre_refcount = bre_init->bre_refcount; | |
1212 | ||
1213 | return (bre); | |
1214 | } | |
1215 | ||
1216 | static void | |
1217 | brt_entry_free(brt_entry_t *bre) | |
1218 | { | |
1219 | ||
1220 | kmem_cache_free(brt_entry_cache, bre); | |
1221 | } | |
1222 | ||
1223 | static void | |
1224 | brt_entry_addref(brt_t *brt, const blkptr_t *bp) | |
1225 | { | |
1226 | brt_vdev_t *brtvd; | |
1227 | brt_entry_t *bre, *racebre; | |
1228 | brt_entry_t bre_search; | |
1229 | avl_index_t where; | |
1230 | uint64_t vdevid; | |
1231 | int error; | |
1232 | ||
1233 | ASSERT(!RW_WRITE_HELD(&brt->brt_lock)); | |
1234 | ||
1235 | brt_entry_fill(bp, &bre_search, &vdevid); | |
1236 | ||
1237 | brt_wlock(brt); | |
1238 | ||
1239 | brtvd = brt_vdev(brt, vdevid); | |
1240 | if (brtvd == NULL) { | |
1241 | ASSERT3U(vdevid, >=, brt->brt_nvdevs); | |
1242 | ||
1243 | /* New VDEV was added. */ | |
1244 | brt_vdevs_expand(brt, vdevid + 1); | |
1245 | brtvd = brt_vdev(brt, vdevid); | |
1246 | } | |
1247 | ASSERT(brtvd != NULL); | |
1248 | if (!brtvd->bv_initiated) | |
1249 | brt_vdev_realloc(brt, brtvd); | |
1250 | ||
1251 | bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); | |
1252 | if (bre != NULL) { | |
1253 | BRTSTAT_BUMP(brt_addref_entry_in_memory); | |
1254 | } else { | |
1255 | /* | |
1256 | * brt_entry_lookup() may drop the BRT (read) lock and | |
1257 | * reacquire it (write). | |
1258 | */ | |
1259 | error = brt_entry_lookup(brt, brtvd, &bre_search); | |
1260 | /* bre_search now contains correct bre_refcount */ | |
1261 | ASSERT(error == 0 || error == ENOENT); | |
1262 | if (error == 0) | |
1263 | BRTSTAT_BUMP(brt_addref_entry_on_disk); | |
1264 | else | |
1265 | BRTSTAT_BUMP(brt_addref_entry_not_on_disk); | |
1266 | /* | |
1267 | * When the BRT lock was dropped, brt_vdevs[] may have been | |
1268 | * expanded and reallocated, we need to update brtvd's pointer. | |
1269 | */ | |
1270 | brtvd = brt_vdev(brt, vdevid); | |
1271 | ASSERT(brtvd != NULL); | |
1272 | ||
1273 | racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); | |
1274 | if (racebre == NULL) { | |
1275 | bre = brt_entry_alloc(&bre_search); | |
1276 | ASSERT(RW_WRITE_HELD(&brt->brt_lock)); | |
1277 | avl_insert(&brtvd->bv_tree, bre, where); | |
1278 | brt->brt_nentries++; | |
1279 | } else { | |
1280 | /* | |
1281 | * The entry was added when the BRT lock was dropped in | |
1282 | * brt_entry_lookup(). | |
1283 | */ | |
1284 | BRTSTAT_BUMP(brt_addref_entry_read_lost_race); | |
1285 | bre = racebre; | |
1286 | } | |
1287 | } | |
1288 | bre->bre_refcount++; | |
1289 | brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); | |
1290 | ||
1291 | brt_unlock(brt); | |
1292 | } | |
1293 | ||
1294 | /* Return TRUE if block should be freed immediately. */ | |
1295 | boolean_t | |
1296 | brt_entry_decref(spa_t *spa, const blkptr_t *bp) | |
1297 | { | |
1298 | brt_t *brt = spa->spa_brt; | |
1299 | brt_vdev_t *brtvd; | |
1300 | brt_entry_t *bre, *racebre; | |
1301 | brt_entry_t bre_search; | |
1302 | avl_index_t where; | |
1303 | uint64_t vdevid; | |
1304 | int error; | |
1305 | ||
1306 | brt_entry_fill(bp, &bre_search, &vdevid); | |
1307 | ||
1308 | brt_wlock(brt); | |
1309 | ||
1310 | brtvd = brt_vdev(brt, vdevid); | |
1311 | ASSERT(brtvd != NULL); | |
1312 | ||
1313 | bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); | |
1314 | if (bre != NULL) { | |
1315 | BRTSTAT_BUMP(brt_decref_entry_in_memory); | |
1316 | goto out; | |
1317 | } else { | |
1318 | BRTSTAT_BUMP(brt_decref_entry_not_in_memory); | |
1319 | } | |
1320 | ||
1321 | /* | |
1322 | * brt_entry_lookup() may drop the BRT lock and reacquire it. | |
1323 | */ | |
1324 | error = brt_entry_lookup(brt, brtvd, &bre_search); | |
1325 | /* bre_search now contains correct bre_refcount */ | |
1326 | ASSERT(error == 0 || error == ENOENT); | |
1327 | /* | |
1328 | * When the BRT lock was dropped, brt_vdevs[] may have been expanded | |
1329 | * and reallocated, we need to update brtvd's pointer. | |
1330 | */ | |
1331 | brtvd = brt_vdev(brt, vdevid); | |
1332 | ASSERT(brtvd != NULL); | |
1333 | ||
1334 | if (error == ENOENT) { | |
1335 | BRTSTAT_BUMP(brt_decref_entry_not_on_disk); | |
1336 | bre = NULL; | |
1337 | goto out; | |
1338 | } | |
1339 | ||
1340 | racebre = avl_find(&brtvd->bv_tree, &bre_search, &where); | |
1341 | if (racebre != NULL) { | |
1342 | /* | |
1343 | * The entry was added when the BRT lock was dropped in | |
1344 | * brt_entry_lookup(). | |
1345 | */ | |
1346 | BRTSTAT_BUMP(brt_decref_entry_read_lost_race); | |
1347 | bre = racebre; | |
1348 | goto out; | |
1349 | } | |
1350 | ||
1351 | BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk); | |
1352 | bre = brt_entry_alloc(&bre_search); | |
1353 | ASSERT(RW_WRITE_HELD(&brt->brt_lock)); | |
1354 | avl_insert(&brtvd->bv_tree, bre, where); | |
1355 | brt->brt_nentries++; | |
1356 | ||
1357 | out: | |
1358 | if (bre == NULL) { | |
1359 | /* | |
1360 | * This is a free of a regular (not cloned) block. | |
1361 | */ | |
1362 | brt_unlock(brt); | |
1363 | BRTSTAT_BUMP(brt_decref_no_entry); | |
1364 | return (B_TRUE); | |
1365 | } | |
1366 | if (bre->bre_refcount == 0) { | |
1367 | brt_unlock(brt); | |
1368 | BRTSTAT_BUMP(brt_decref_free_data_now); | |
1369 | return (B_TRUE); | |
1370 | } | |
1371 | ||
1372 | ASSERT(bre->bre_refcount > 0); | |
1373 | bre->bre_refcount--; | |
1374 | if (bre->bre_refcount == 0) | |
1375 | BRTSTAT_BUMP(brt_decref_free_data_later); | |
1376 | else | |
1377 | BRTSTAT_BUMP(brt_decref_entry_still_referenced); | |
1378 | brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp)); | |
1379 | ||
1380 | brt_unlock(brt); | |
1381 | ||
1382 | return (B_FALSE); | |
1383 | } | |
1384 | ||
114a3996 RN |
1385 | uint64_t |
1386 | brt_entry_get_refcount(spa_t *spa, const blkptr_t *bp) | |
1387 | { | |
1388 | brt_t *brt = spa->spa_brt; | |
1389 | brt_vdev_t *brtvd; | |
1390 | brt_entry_t bre_search, *bre; | |
1391 | uint64_t vdevid, refcnt; | |
1392 | int error; | |
1393 | ||
1394 | brt_entry_fill(bp, &bre_search, &vdevid); | |
1395 | ||
1396 | brt_rlock(brt); | |
1397 | ||
1398 | brtvd = brt_vdev(brt, vdevid); | |
1399 | ASSERT(brtvd != NULL); | |
1400 | ||
1401 | bre = avl_find(&brtvd->bv_tree, &bre_search, NULL); | |
1402 | if (bre == NULL) { | |
1403 | error = brt_entry_lookup(brt, brtvd, &bre_search); | |
1404 | ASSERT(error == 0 || error == ENOENT); | |
1405 | if (error == ENOENT) | |
1406 | refcnt = 0; | |
1407 | else | |
1408 | refcnt = bre_search.bre_refcount; | |
1409 | } else | |
1410 | refcnt = bre->bre_refcount; | |
1411 | ||
1412 | brt_unlock(brt); | |
1413 | return (refcnt); | |
1414 | } | |
1415 | ||
67a1b037 PJD |
1416 | static void |
1417 | brt_prefetch(brt_t *brt, const blkptr_t *bp) | |
1418 | { | |
1419 | brt_entry_t bre; | |
1420 | uint64_t vdevid; | |
1421 | ||
1422 | ASSERT(bp != NULL); | |
1423 | ||
1424 | if (!zfs_brt_prefetch) | |
1425 | return; | |
1426 | ||
1427 | brt_entry_fill(bp, &bre, &vdevid); | |
1428 | ||
1429 | brt_entry_prefetch(brt, vdevid, &bre); | |
1430 | } | |
1431 | ||
1432 | static int | |
1433 | brt_pending_entry_compare(const void *x1, const void *x2) | |
1434 | { | |
1435 | const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2; | |
1436 | const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp; | |
1437 | int cmp; | |
1438 | ||
1439 | cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2)); | |
1440 | if (cmp == 0) { | |
1441 | cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]), | |
1442 | DVA_GET_VDEV(&bp2->blk_dva[0])); | |
1443 | if (cmp == 0) { | |
1444 | cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]), | |
1445 | DVA_GET_OFFSET(&bp2->blk_dva[0])); | |
1446 | } | |
1447 | } | |
1448 | ||
1449 | return (cmp); | |
1450 | } | |
1451 | ||
1452 | void | |
1453 | brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) | |
1454 | { | |
1455 | brt_t *brt; | |
1456 | avl_tree_t *pending_tree; | |
1457 | kmutex_t *pending_lock; | |
1458 | brt_pending_entry_t *bpe, *newbpe; | |
1459 | avl_index_t where; | |
1460 | uint64_t txg; | |
1461 | ||
1462 | brt = spa->spa_brt; | |
1463 | txg = dmu_tx_get_txg(tx); | |
1464 | ASSERT3U(txg, !=, 0); | |
1465 | pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; | |
1466 | pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; | |
1467 | ||
1468 | newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP); | |
1469 | newbpe->bpe_bp = *bp; | |
1470 | newbpe->bpe_count = 1; | |
1471 | ||
1472 | mutex_enter(pending_lock); | |
1473 | ||
1474 | bpe = avl_find(pending_tree, newbpe, &where); | |
1475 | if (bpe == NULL) { | |
1476 | avl_insert(pending_tree, newbpe, where); | |
1477 | newbpe = NULL; | |
1478 | } else { | |
1479 | bpe->bpe_count++; | |
1480 | } | |
1481 | ||
1482 | mutex_exit(pending_lock); | |
1483 | ||
1484 | if (newbpe != NULL) { | |
1485 | ASSERT(bpe != NULL); | |
1486 | ASSERT(bpe != newbpe); | |
1487 | kmem_cache_free(brt_pending_entry_cache, newbpe); | |
1488 | } else { | |
1489 | ASSERT(bpe == NULL); | |
1490 | } | |
1491 | ||
1492 | /* Prefetch BRT entry, as we will need it in the syncing context. */ | |
1493 | brt_prefetch(brt, bp); | |
1494 | } | |
1495 | ||
1496 | void | |
1497 | brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx) | |
1498 | { | |
1499 | brt_t *brt; | |
1500 | avl_tree_t *pending_tree; | |
1501 | kmutex_t *pending_lock; | |
1502 | brt_pending_entry_t *bpe, bpe_search; | |
1503 | uint64_t txg; | |
1504 | ||
1505 | brt = spa->spa_brt; | |
1506 | txg = dmu_tx_get_txg(tx); | |
1507 | ASSERT3U(txg, !=, 0); | |
1508 | pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; | |
1509 | pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; | |
1510 | ||
1511 | bpe_search.bpe_bp = *bp; | |
1512 | ||
1513 | mutex_enter(pending_lock); | |
1514 | ||
1515 | bpe = avl_find(pending_tree, &bpe_search, NULL); | |
1516 | /* I believe we should always find bpe when this function is called. */ | |
1517 | if (bpe != NULL) { | |
1518 | ASSERT(bpe->bpe_count > 0); | |
1519 | ||
1520 | bpe->bpe_count--; | |
1521 | if (bpe->bpe_count == 0) { | |
1522 | avl_remove(pending_tree, bpe); | |
1523 | kmem_cache_free(brt_pending_entry_cache, bpe); | |
1524 | } | |
1525 | } | |
1526 | ||
1527 | mutex_exit(pending_lock); | |
1528 | } | |
1529 | ||
1530 | void | |
1531 | brt_pending_apply(spa_t *spa, uint64_t txg) | |
1532 | { | |
1533 | brt_t *brt; | |
1534 | brt_pending_entry_t *bpe; | |
1535 | avl_tree_t *pending_tree; | |
1536 | kmutex_t *pending_lock; | |
1537 | void *c; | |
1538 | ||
1539 | ASSERT3U(txg, !=, 0); | |
1540 | ||
1541 | brt = spa->spa_brt; | |
1542 | pending_tree = &brt->brt_pending_tree[txg & TXG_MASK]; | |
1543 | pending_lock = &brt->brt_pending_lock[txg & TXG_MASK]; | |
1544 | ||
1545 | mutex_enter(pending_lock); | |
1546 | ||
1547 | c = NULL; | |
1548 | while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) { | |
1549 | boolean_t added_to_ddt; | |
1550 | ||
1551 | mutex_exit(pending_lock); | |
1552 | ||
1553 | for (int i = 0; i < bpe->bpe_count; i++) { | |
1554 | /* | |
1555 | * If the block has DEDUP bit set, it means that it | |
1556 | * already exists in the DEDUP table, so we can just | |
1557 | * use that instead of creating new entry in | |
1558 | * the BRT table. | |
1559 | */ | |
1560 | if (BP_GET_DEDUP(&bpe->bpe_bp)) { | |
1561 | added_to_ddt = ddt_addref(spa, &bpe->bpe_bp); | |
1562 | } else { | |
1563 | added_to_ddt = B_FALSE; | |
1564 | } | |
1565 | if (!added_to_ddt) | |
1566 | brt_entry_addref(brt, &bpe->bpe_bp); | |
1567 | } | |
1568 | ||
1569 | kmem_cache_free(brt_pending_entry_cache, bpe); | |
1570 | mutex_enter(pending_lock); | |
1571 | } | |
1572 | ||
1573 | mutex_exit(pending_lock); | |
1574 | } | |
1575 | ||
1576 | static void | |
1577 | brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx) | |
1578 | { | |
1579 | ||
1580 | ASSERT(RW_WRITE_HELD(&brt->brt_lock)); | |
1581 | ASSERT(brtvd->bv_mos_entries != 0); | |
1582 | ||
1583 | if (bre->bre_refcount == 0) { | |
1584 | int error; | |
1585 | ||
1586 | error = brt_entry_remove(brt, brtvd, bre, tx); | |
1587 | ASSERT(error == 0 || error == ENOENT); | |
1588 | /* | |
1589 | * If error == ENOENT then zfs_clone_range() was done from a | |
1590 | * removed (but opened) file (open(), unlink()). | |
1591 | */ | |
1592 | ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT); | |
1593 | } else { | |
1594 | VERIFY0(brt_entry_update(brt, brtvd, bre, tx)); | |
1595 | } | |
1596 | } | |
1597 | ||
1598 | static void | |
1599 | brt_sync_table(brt_t *brt, dmu_tx_t *tx) | |
1600 | { | |
1601 | brt_vdev_t *brtvd; | |
1602 | brt_entry_t *bre; | |
1603 | uint64_t vdevid; | |
1604 | void *c; | |
1605 | ||
1606 | brt_wlock(brt); | |
1607 | ||
1608 | for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { | |
1609 | brtvd = &brt->brt_vdevs[vdevid]; | |
1610 | ||
1611 | if (!brtvd->bv_initiated) | |
1612 | continue; | |
1613 | ||
1614 | if (!brtvd->bv_meta_dirty) { | |
1615 | ASSERT(!brtvd->bv_entcount_dirty); | |
1616 | ASSERT0(avl_numnodes(&brtvd->bv_tree)); | |
1617 | continue; | |
1618 | } | |
1619 | ||
1620 | ASSERT(!brtvd->bv_entcount_dirty || | |
1621 | avl_numnodes(&brtvd->bv_tree) != 0); | |
1622 | ||
1623 | if (brtvd->bv_mos_brtvdev == 0) | |
1624 | brt_vdev_create(brt, brtvd, tx); | |
1625 | ||
1626 | c = NULL; | |
1627 | while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) { | |
1628 | brt_sync_entry(brt, brtvd, bre, tx); | |
1629 | brt_entry_free(bre); | |
1630 | ASSERT(brt->brt_nentries > 0); | |
1631 | brt->brt_nentries--; | |
1632 | } | |
1633 | ||
1634 | brt_vdev_sync(brt, brtvd, tx); | |
1635 | ||
1636 | if (brtvd->bv_totalcount == 0) | |
1637 | brt_vdev_destroy(brt, brtvd, tx); | |
1638 | } | |
1639 | ||
1640 | ASSERT0(brt->brt_nentries); | |
1641 | ||
1642 | brt_unlock(brt); | |
1643 | } | |
1644 | ||
1645 | void | |
1646 | brt_sync(spa_t *spa, uint64_t txg) | |
1647 | { | |
1648 | dmu_tx_t *tx; | |
1649 | brt_t *brt; | |
1650 | ||
1651 | ASSERT(spa_syncing_txg(spa) == txg); | |
1652 | ||
1653 | brt = spa->spa_brt; | |
1654 | brt_rlock(brt); | |
1655 | if (brt->brt_nentries == 0) { | |
1656 | /* No changes. */ | |
1657 | brt_unlock(brt); | |
1658 | return; | |
1659 | } | |
1660 | brt_unlock(brt); | |
1661 | ||
1662 | tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); | |
1663 | ||
1664 | brt_sync_table(brt, tx); | |
1665 | ||
1666 | dmu_tx_commit(tx); | |
1667 | } | |
1668 | ||
1669 | static void | |
1670 | brt_table_alloc(brt_t *brt) | |
1671 | { | |
1672 | ||
1673 | for (int i = 0; i < TXG_SIZE; i++) { | |
1674 | avl_create(&brt->brt_pending_tree[i], | |
1675 | brt_pending_entry_compare, | |
1676 | sizeof (brt_pending_entry_t), | |
1677 | offsetof(brt_pending_entry_t, bpe_node)); | |
1678 | mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT, | |
1679 | NULL); | |
1680 | } | |
1681 | } | |
1682 | ||
1683 | static void | |
1684 | brt_table_free(brt_t *brt) | |
1685 | { | |
1686 | ||
1687 | for (int i = 0; i < TXG_SIZE; i++) { | |
1688 | ASSERT(avl_is_empty(&brt->brt_pending_tree[i])); | |
1689 | ||
1690 | avl_destroy(&brt->brt_pending_tree[i]); | |
1691 | mutex_destroy(&brt->brt_pending_lock[i]); | |
1692 | } | |
1693 | } | |
1694 | ||
1695 | static void | |
1696 | brt_alloc(spa_t *spa) | |
1697 | { | |
1698 | brt_t *brt; | |
1699 | ||
1700 | ASSERT(spa->spa_brt == NULL); | |
1701 | ||
1702 | brt = kmem_zalloc(sizeof (*brt), KM_SLEEP); | |
1703 | rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL); | |
1704 | brt->brt_spa = spa; | |
1705 | brt->brt_rangesize = 0; | |
1706 | brt->brt_nentries = 0; | |
1707 | brt->brt_vdevs = NULL; | |
1708 | brt->brt_nvdevs = 0; | |
1709 | brt_table_alloc(brt); | |
1710 | ||
1711 | spa->spa_brt = brt; | |
1712 | } | |
1713 | ||
1714 | void | |
1715 | brt_create(spa_t *spa) | |
1716 | { | |
1717 | ||
1718 | brt_alloc(spa); | |
1719 | brt_vdevs_alloc(spa->spa_brt, B_FALSE); | |
1720 | } | |
1721 | ||
1722 | int | |
1723 | brt_load(spa_t *spa) | |
1724 | { | |
1725 | ||
1726 | brt_alloc(spa); | |
1727 | brt_vdevs_alloc(spa->spa_brt, B_TRUE); | |
1728 | ||
1729 | return (0); | |
1730 | } | |
1731 | ||
1732 | void | |
1733 | brt_unload(spa_t *spa) | |
1734 | { | |
1735 | brt_t *brt = spa->spa_brt; | |
1736 | ||
1737 | if (brt == NULL) | |
1738 | return; | |
1739 | ||
1740 | brt_vdevs_free(brt); | |
1741 | brt_table_free(brt); | |
1742 | rw_destroy(&brt->brt_lock); | |
1743 | kmem_free(brt, sizeof (*brt)); | |
1744 | spa->spa_brt = NULL; | |
1745 | } | |
1746 | ||
1747 | /* BEGIN CSTYLED */ | |
1748 | ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW, | |
1749 | "Enable prefetching of BRT entries"); | |
1750 | #ifdef ZFS_BRT_DEBUG | |
1751 | ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug"); | |
1752 | #endif | |
1753 | /* END CSTYLED */ |