]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright 2008 Sun Microsystems, Inc. All rights reserved. | |
23 | * Use is subject to license terms. | |
24 | */ | |
25 | ||
26 | #pragma ident "@(#)dsl_pool.c 1.12 08/03/20 SMI" | |
27 | ||
28 | #include <sys/dsl_pool.h> | |
29 | #include <sys/dsl_dataset.h> | |
30 | #include <sys/dsl_dir.h> | |
31 | #include <sys/dsl_synctask.h> | |
32 | #include <sys/dmu_tx.h> | |
33 | #include <sys/dmu_objset.h> | |
34 | #include <sys/arc.h> | |
35 | #include <sys/zap.h> | |
36 | #include <sys/zio.h> | |
37 | #include <sys/zfs_context.h> | |
38 | #include <sys/fs/zfs.h> | |
39 | ||
40 | int zfs_no_write_throttle = 0; | |
41 | uint64_t zfs_write_limit_override = 0; | |
42 | ||
43 | static int | |
44 | dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp) | |
45 | { | |
46 | uint64_t obj; | |
47 | int err; | |
48 | ||
49 | err = zap_lookup(dp->dp_meta_objset, | |
50 | dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, | |
51 | MOS_DIR_NAME, sizeof (obj), 1, &obj); | |
52 | if (err) | |
53 | return (err); | |
54 | ||
55 | return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp)); | |
56 | } | |
57 | ||
58 | static dsl_pool_t * | |
59 | dsl_pool_open_impl(spa_t *spa, uint64_t txg) | |
60 | { | |
61 | dsl_pool_t *dp; | |
62 | blkptr_t *bp = spa_get_rootblkptr(spa); | |
63 | extern uint64_t zfs_write_limit_min; | |
64 | ||
65 | dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); | |
66 | dp->dp_spa = spa; | |
67 | dp->dp_meta_rootbp = *bp; | |
68 | rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL); | |
69 | dp->dp_write_limit = zfs_write_limit_min; | |
70 | txg_init(dp, txg); | |
71 | ||
72 | txg_list_create(&dp->dp_dirty_datasets, | |
73 | offsetof(dsl_dataset_t, ds_dirty_link)); | |
74 | txg_list_create(&dp->dp_dirty_dirs, | |
75 | offsetof(dsl_dir_t, dd_dirty_link)); | |
76 | txg_list_create(&dp->dp_sync_tasks, | |
77 | offsetof(dsl_sync_task_group_t, dstg_node)); | |
78 | list_create(&dp->dp_synced_datasets, sizeof (dsl_dataset_t), | |
79 | offsetof(dsl_dataset_t, ds_synced_link)); | |
80 | ||
81 | mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); | |
82 | ||
83 | return (dp); | |
84 | } | |
85 | ||
/*
 * Open an existing pool's DSL state at the given txg: open the MOS
 * (meta objset) from the pool's root block pointer, then look up and
 * open the root dataset's dsl_dir and the $MOS dir.  On success *dpp
 * receives the new pool; on failure the partially-constructed pool is
 * torn down via dsl_pool_close() and the errno is returned.
 */
int
dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	objset_impl_t *osi;

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	/* Open the MOS from the root bp copied in dsl_pool_open_impl(). */
	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
	if (err)
		goto out;
	dp->dp_meta_objset = &osi->os;

	/* Find the root dataset's object number in the pool directory. */
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	/* $MOS dir lookup requires dp_root_dir to be open (see above). */
	err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir);
	if (err)
		goto out;

out:
	rw_exit(&dp->dp_config_rwlock);
	/* dsl_pool_close() skips whatever was not yet initialized. */
	if (err)
		dsl_pool_close(dp);
	else
		*dpp = dp;

	return (err);
}
123 | ||
/*
 * Tear down and free a pool's DSL state; inverse of dsl_pool_open() /
 * dsl_pool_create().  Tolerates a partially-constructed pool (NULL
 * fields are skipped), so dsl_pool_open() can call it on its error
 * path.  Teardown order matters: dirs first, then the MOS, then the
 * txg machinery and locks.
 */
void
dsl_pool_close(dsl_pool_t *dp)
{
	/* drop our reference from dsl_pool_open() */
	if (dp->dp_mos_dir)
		dsl_dir_close(dp->dp_mos_dir, dp);
	if (dp->dp_root_dir)
		dsl_dir_close(dp->dp_root_dir, dp);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset)
		dmu_objset_evict(NULL, dp->dp_meta_objset->os);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_dirs);
	list_destroy(&dp->dp_synced_datasets);

	/* Flush this pool's cached data from the ARC before freeing. */
	arc_flush(dp->dp_spa);
	txg_fini(dp);
	rw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	kmem_free(dp, sizeof (dsl_pool_t));
}
147 | ||
/*
 * Create the DSL state for a brand-new pool at the given txg: the MOS,
 * the pool object directory, the root dataset/dir, and the $MOS dir,
 * all within a single assigned transaction.  Creation cannot fail here
 * (failures would indicate programmer error), hence ASSERT/VERIFY
 * rather than error returns.
 */
dsl_pool_t *
dsl_pool_create(spa_t *spa, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	dp->dp_meta_objset = &dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os;

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT3U(err, ==, 0);

	/* create and open the root dir */
	dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir));

	dmu_tx_commit(tx);

	return (dp);
}
175 | ||
/*
 * Sync out all dirty state for the given txg, in order: dirty datasets
 * (written in parallel under one root zio), then sync tasks queued for
 * this txg, then dirty dsl_dirs, and finally the MOS itself — whose new
 * root bp becomes the pool's root block pointer.  Runs in syncing
 * context (the tx_sync_thread).
 */
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *zio;
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	dsl_sync_task_group_t *dstg;
	objset_impl_t *mosi = dp->dp_meta_objset->os;
	int err;

	tx = dmu_tx_create_assigned(dp, txg);

	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
		/*
		 * Move the dataset to the synced list (for later zil
		 * cleaning); if it is already there, just drop the hold
		 * taken when it was dirtied.
		 */
		if (!list_link_active(&ds->ds_synced_link))
			list_insert_tail(&dp->dp_synced_datasets, ds);
		else
			dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	err = zio_wait(zio);
	ASSERT(err == 0);

	/* Sync tasks run after datasets so they see this txg's data. */
	while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg))
		dsl_sync_task_group_sync(dstg, tx);
	while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
		dsl_dir_sync(dd, tx);

	/* Sync the MOS only if the above dirtied any of its dnodes. */
	if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
	    list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) {
		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		dmu_objset_sync(mosi, zio, tx);
		err = zio_wait(zio);
		ASSERT(err == 0);
		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
	}

	dmu_tx_commit(tx);
}
217 | ||
218 | void | |
219 | dsl_pool_zil_clean(dsl_pool_t *dp) | |
220 | { | |
221 | dsl_dataset_t *ds; | |
222 | ||
223 | while (ds = list_head(&dp->dp_synced_datasets)) { | |
224 | list_remove(&dp->dp_synced_datasets, ds); | |
225 | ASSERT(ds->ds_user_ptr != NULL); | |
226 | zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil); | |
227 | dmu_buf_rele(ds->ds_dbuf, ds); | |
228 | } | |
229 | } | |
230 | ||
231 | /* | |
232 | * TRUE if the current thread is the tx_sync_thread or if we | |
233 | * are being called from SPA context during pool initialization. | |
234 | */ | |
235 | int | |
236 | dsl_pool_sync_context(dsl_pool_t *dp) | |
237 | { | |
238 | return (curthread == dp->dp_tx.tx_sync_thread || | |
239 | spa_get_dsl(dp->dp_spa) == NULL); | |
240 | } | |
241 | ||
242 | uint64_t | |
243 | dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) | |
244 | { | |
245 | uint64_t space, resv; | |
246 | ||
247 | /* | |
248 | * Reserve about 1.6% (1/64), or at least 32MB, for allocation | |
249 | * efficiency. | |
250 | * XXX The intent log is not accounted for, so it must fit | |
251 | * within this slop. | |
252 | * | |
253 | * If we're trying to assess whether it's OK to do a free, | |
254 | * cut the reservation in half to allow forward progress | |
255 | * (e.g. make it possible to rm(1) files from a full pool). | |
256 | */ | |
257 | space = spa_get_dspace(dp->dp_spa); | |
258 | resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1); | |
259 | if (netfree) | |
260 | resv >>= 1; | |
261 | ||
262 | return (space - resv); | |
263 | } | |
264 | ||
265 | int | |
266 | dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx) | |
267 | { | |
268 | uint64_t reserved = 0; | |
269 | uint64_t write_limit = (zfs_write_limit_override ? | |
270 | zfs_write_limit_override : dp->dp_write_limit); | |
271 | ||
272 | if (zfs_no_write_throttle) { | |
273 | dp->dp_tempreserved[tx->tx_txg & TXG_MASK] += space; | |
274 | return (0); | |
275 | } | |
276 | ||
277 | /* | |
278 | * Check to see if we have exceeded the maximum allowed IO for | |
279 | * this transaction group. We can do this without locks since | |
280 | * a little slop here is ok. Note that we do the reserved check | |
281 | * with only half the requested reserve: this is because the | |
282 | * reserve requests are worst-case, and we really don't want to | |
283 | * throttle based off of worst-case estimates. | |
284 | */ | |
285 | if (write_limit > 0) { | |
286 | reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK] | |
287 | + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2; | |
288 | ||
289 | if (reserved && reserved > write_limit) | |
290 | return (ERESTART); | |
291 | } | |
292 | ||
293 | atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space); | |
294 | ||
295 | /* | |
296 | * If this transaction group is over 7/8ths capacity, delay | |
297 | * the caller 1 clock tick. This will slow down the "fill" | |
298 | * rate until the sync process can catch up with us. | |
299 | */ | |
300 | if (reserved && reserved > (write_limit - write_limit << 3)) | |
301 | txg_delay(dp, tx->tx_txg, 1); | |
302 | ||
303 | return (0); | |
304 | } | |
305 | ||
306 | void | |
307 | dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) | |
308 | { | |
309 | ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space); | |
310 | atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space); | |
311 | } | |
312 | ||
313 | void | |
314 | dsl_pool_memory_pressure(dsl_pool_t *dp) | |
315 | { | |
316 | extern uint64_t zfs_write_limit_min; | |
317 | uint64_t space_inuse = 0; | |
318 | int i; | |
319 | ||
320 | if (dp->dp_write_limit == zfs_write_limit_min) | |
321 | return; | |
322 | ||
323 | for (i = 0; i < TXG_SIZE; i++) { | |
324 | space_inuse += dp->dp_space_towrite[i]; | |
325 | space_inuse += dp->dp_tempreserved[i]; | |
326 | } | |
327 | dp->dp_write_limit = MAX(zfs_write_limit_min, | |
328 | MIN(dp->dp_write_limit, space_inuse / 4)); | |
329 | } | |
330 | ||
331 | void | |
332 | dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) | |
333 | { | |
334 | if (space > 0) { | |
335 | mutex_enter(&dp->dp_lock); | |
336 | dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space; | |
337 | mutex_exit(&dp->dp_lock); | |
338 | } | |
339 | } |