/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

/*
 * This file contains the code to implement file range locking in
 * ZFS, although there isn't much specific to ZFS (all that comes to mind is
 * support for growing the blocksize).
 *
 * Interface
 * ---------
 * Defined in zfs_rlock.h but essentially:
 *	lr = rangelock_enter(zp, off, len, lock_type);
 *	rangelock_reduce(lr, off, len); // optional
 *	rangelock_exit(lr);
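 *
 * A non-blocking variant, zfs_rangelock_tryenter(), is also defined
 * below; it returns NULL instead of sleeping when the lock cannot be
 * granted immediately.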
 *
 * Range locking rules
 * --------------------
 * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole
 *    file range needs to be locked as RL_WRITER. Only then can the pages
 *    be freed etc and zp_size reset. zp_size must be set while holding
 *    the range lock.
 * 2. For writes and punching holes (zfs_write & zfs_space) just the range
 *    being written or freed needs to be locked as RL_WRITER.
 *    Multiple writes at the end of the file must coordinate zp_size
 *    updates to ensure data isn't lost. A compare and swap loop is
 *    currently used to ensure the file size is at least the offset last
 *    written (see the sketch below).
 * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being
 *    read needs to be locked as RL_READER. A check against zp_size can then
 *    be made for reading beyond end of file.
 *
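 * A sketch of that compare-and-swap loop (illustrative only; zp and
 * z_size stand in for the caller's znode and its size field):
 *
 *	uint64_t end = off + len;
 *	uint64_t size;
 *
 *	while ((size = zp->z_size) < end) {
 *		if (atomic_cas_64(&zp->z_size, size, end) == size)
 *			break;
 *	}
 *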
 * AVL tree
 * --------
 * An AVL tree is used to maintain the state of the existing ranges
 * that are locked for exclusive (writer) or shared (reader) use.
 * The starting range offset is used for searching and sorting the tree.
 *
 * Common case
 * -----------
 * The (hopefully) usual case is of no overlaps or contention for locks. On
 * entry to rangelock_enter(), a locked_range_t is allocated; the tree is
 * searched, no overlap is found, and *this* locked_range_t is placed in
 * the tree.
 *
 * Overlaps/Reference counting/Proxy locks
 * ---------------------------------------
 * The avl code only allows one node at a particular offset. Also it's very
 * inefficient to search through all previous entries looking for overlaps
 * (because the very 1st in the ordered list might be at offset 0 but
 * cover the whole file).
 * So this implementation uses reference counts and proxy range locks.
 * Firstly, only reader locks use reference counts and proxy locks,
 * because writer locks are exclusive.
 * When a reader lock overlaps with another then a proxy lock is created
 * for that range and replaces the original lock. If the overlap
 * is exact then the reference count of the proxy is simply incremented.
 * Otherwise, the proxy lock is split into smaller lock ranges and
 * new proxy locks created for non overlapping ranges.
 * The reference counts are adjusted accordingly.
 * Meanwhile, the original lock is kept around (this is the caller's handle)
 * and its offset and length are used when releasing the lock.
 *
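 * A worked example (illustrative): reader A locks [0, 100) and reader B
 * then locks [50, 150). A's entry is replaced by proxies [0, 50) with
 * count 1 and [50, 100) with count 2, and a new proxy [100, 150) with
 * count 1 is added. B's own lock stays out of the tree (lr_count == 0)
 * and serves only as the handle B passes back when releasing.
 *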
 * Thread coordination
 * -------------------
 * In order to make wakeups efficient and to ensure multiple continuous
 * readers on a range don't starve a writer for the same range lock,
 * two condition variables are used in each zfs_locked_range_t.
 * If a writer (or reader) can't get a range it initialises the writer
 * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
 * and waits on that cv. When a thread unlocks that range it wakes up all
 * writers then all readers before destroying the lock.
 *
 * Append mode writes
 * ------------------
 * Append mode writes need to lock a range at the end of a file.
 * The offset of the end of the file is determined under the
 * range locking mutex, and the lock type converted from RL_APPEND to
 * RL_WRITER and the range locked.
 *
 * Grow block handling
 * -------------------
 * ZFS supports multiple block sizes, up to 16MB. The smallest
 * block size is used for the file which is grown as needed. During this
 * growth all other writers and readers must be excluded.
 * So if the block size needs to be grown then the whole file is
 * exclusively locked, then later the caller will reduce the lock
 * range to just the range to be written using rangelock_reduce().
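 *
 * A sketch of that pattern (illustrative; in practice the ZPL write
 * path drives it):
 *
 *	lr = zfs_rangelock_enter(rl, 0, UINT64_MAX, RL_WRITER);
 *	... grow the block size ...
 *	zfs_rangelock_reduce(lr, off, len);
 *	... write the data ...
 *	zfs_rangelock_exit(lr);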
 */

#include <sys/zfs_context.h>
#include <sys/zfs_rlock.h>

/*
 * AVL comparison function used to order range locks
 * Locks are ordered on the start offset of the range.
 */
static int
zfs_rangelock_compare(const void *arg1, const void *arg2)
{
	const zfs_locked_range_t *rl1 = (const zfs_locked_range_t *)arg1;
	const zfs_locked_range_t *rl2 = (const zfs_locked_range_t *)arg2;

	return (TREE_CMP(rl1->lr_offset, rl2->lr_offset));
}

/*
 * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock.
 * It must convert RL_APPEND to RL_WRITER (starting at the end of the file),
 * and may increase the range that's locked for RL_WRITER.
 */
void
zfs_rangelock_init(zfs_rangelock_t *rl, zfs_rangelock_cb_t *cb, void *arg)
{
	mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&rl->rl_tree, zfs_rangelock_compare,
	    sizeof (zfs_locked_range_t), offsetof(zfs_locked_range_t, lr_node));
	rl->rl_cb = cb;
	rl->rl_arg = arg;
}
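
/*
 * A sketch of such a callback (illustrative; the ZPL installs its own
 * version, and znode_t/z_size are used here only for exposition):
 *
 *	static void
 *	my_rangelock_cb(zfs_locked_range_t *new, void *arg)
 *	{
 *		znode_t *zp = arg;
 *
 *		if (new->lr_type == RL_APPEND) {
 *			new->lr_offset = zp->z_size;
 *			new->lr_type = RL_WRITER;
 *		}
 *	}
 */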

void
zfs_rangelock_fini(zfs_rangelock_t *rl)
{
	mutex_destroy(&rl->rl_lock);
	avl_destroy(&rl->rl_tree);
}

/*
 * Check if a write lock can be grabbed. If not, fail immediately or sleep and
 * recheck until available, depending on the value of the "nonblock" parameter.
 */
static boolean_t
zfs_rangelock_enter_writer(zfs_rangelock_t *rl, zfs_locked_range_t *new,
    boolean_t nonblock)
{
	avl_tree_t *tree = &rl->rl_tree;
	zfs_locked_range_t *lr;
	avl_index_t where;
	uint64_t orig_off = new->lr_offset;
	uint64_t orig_len = new->lr_length;
	zfs_rangelock_type_t orig_type = new->lr_type;

	for (;;) {
		/*
		 * Call callback which can modify new->lr_offset, lr_length,
		 * and lr_type.
		 * Note, the callback is used by the ZPL to handle appending
		 * and changing blocksizes. It isn't needed for zvols.
		 */
		if (rl->rl_cb != NULL) {
			rl->rl_cb(new, rl->rl_arg);
		}

		/*
		 * If the type was APPEND, the callback must convert it to
		 * WRITER.
		 */
		ASSERT3U(new->lr_type, ==, RL_WRITER);

		/*
		 * First check for the usual case of no locks
		 */
		if (avl_numnodes(tree) == 0) {
			avl_add(tree, new);
			return (B_TRUE);
		}

		/*
		 * Look for any locks in the range.
		 */
		lr = avl_find(tree, new, &where);
		if (lr != NULL)
			goto wait; /* already locked at same offset */

		lr = avl_nearest(tree, where, AVL_AFTER);
		if (lr != NULL &&
		    lr->lr_offset < new->lr_offset + new->lr_length)
			goto wait;

		lr = avl_nearest(tree, where, AVL_BEFORE);
		if (lr != NULL &&
		    lr->lr_offset + lr->lr_length > new->lr_offset)
			goto wait;

		avl_insert(tree, new, where);
		return (B_TRUE);
wait:
		if (nonblock)
			return (B_FALSE);
		if (!lr->lr_write_wanted) {
			cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
			lr->lr_write_wanted = B_TRUE;
		}
		cv_wait(&lr->lr_write_cv, &rl->rl_lock);

		/* reset to original */
		new->lr_offset = orig_off;
		new->lr_length = orig_len;
		new->lr_type = orig_type;
	}
}

/*
 * If this is an original (non-proxy) lock then replace it by
 * a proxy and return the proxy.
 */
static zfs_locked_range_t *
zfs_rangelock_proxify(avl_tree_t *tree, zfs_locked_range_t *lr)
{
	zfs_locked_range_t *proxy;

	if (lr->lr_proxy)
		return (lr); /* already a proxy */

	ASSERT3U(lr->lr_count, ==, 1);
	ASSERT(lr->lr_write_wanted == B_FALSE);
	ASSERT(lr->lr_read_wanted == B_FALSE);
	avl_remove(tree, lr);
	lr->lr_count = 0;

	/* create a proxy range lock */
	proxy = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
	proxy->lr_offset = lr->lr_offset;
	proxy->lr_length = lr->lr_length;
	proxy->lr_count = 1;
	proxy->lr_type = RL_READER;
	proxy->lr_proxy = B_TRUE;
	proxy->lr_write_wanted = B_FALSE;
	proxy->lr_read_wanted = B_FALSE;
	avl_add(tree, proxy);

	return (proxy);
}

/*
 * Split the range lock at the supplied offset
 * returning the *front* proxy.
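 *
 * For example (illustrative): splitting a proxy covering [0, 100) at
 * offset 60 leaves a front proxy [0, 60) and a rear proxy [60, 100),
 * each keeping the original reference count.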
 */
static zfs_locked_range_t *
zfs_rangelock_split(avl_tree_t *tree, zfs_locked_range_t *lr, uint64_t off)
{
	zfs_locked_range_t *rear;

	ASSERT3U(lr->lr_length, >, 1);
	ASSERT3U(off, >, lr->lr_offset);
	ASSERT3U(off, <, lr->lr_offset + lr->lr_length);
	ASSERT(lr->lr_write_wanted == B_FALSE);
	ASSERT(lr->lr_read_wanted == B_FALSE);

	/* create the rear proxy range lock */
	rear = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
	rear->lr_offset = off;
	rear->lr_length = lr->lr_offset + lr->lr_length - off;
	rear->lr_count = lr->lr_count;
	rear->lr_type = RL_READER;
	rear->lr_proxy = B_TRUE;
	rear->lr_write_wanted = B_FALSE;
	rear->lr_read_wanted = B_FALSE;

	zfs_locked_range_t *front = zfs_rangelock_proxify(tree, lr);
	front->lr_length = off - lr->lr_offset;

	avl_insert_here(tree, rear, front, AVL_AFTER);
	return (front);
}

/*
 * Create and add a new proxy range lock for the supplied range.
 */
static void
zfs_rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
{
	zfs_locked_range_t *lr;

	ASSERT(len != 0);
	lr = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
	lr->lr_offset = off;
	lr->lr_length = len;
	lr->lr_count = 1;
	lr->lr_type = RL_READER;
	lr->lr_proxy = B_TRUE;
	lr->lr_write_wanted = B_FALSE;
	lr->lr_read_wanted = B_FALSE;
	avl_add(tree, lr);
}

static void
zfs_rangelock_add_reader(avl_tree_t *tree, zfs_locked_range_t *new,
    zfs_locked_range_t *prev, avl_index_t where)
{
	zfs_locked_range_t *next;
	uint64_t off = new->lr_offset;
	uint64_t len = new->lr_length;

	/*
	 * prev arrives either:
	 * - pointing to an entry at the same offset
	 * - pointing to the entry with the closest previous offset whose
	 *   range may overlap with the new range
	 * - null, if there were no ranges starting before the new one
	 */
	if (prev != NULL) {
		if (prev->lr_offset + prev->lr_length <= off) {
			prev = NULL;
		} else if (prev->lr_offset != off) {
			/*
			 * convert to proxy if needed then
			 * split this entry and bump ref count
			 */
			prev = zfs_rangelock_split(tree, prev, off);
			prev = AVL_NEXT(tree, prev); /* move to rear range */
		}
	}
	ASSERT((prev == NULL) || (prev->lr_offset == off));

	if (prev != NULL)
		next = prev;
	else
		next = avl_nearest(tree, where, AVL_AFTER);

	if (next == NULL || off + len <= next->lr_offset) {
		/* no overlaps, use the original new zfs_locked_range_t */
		avl_insert(tree, new, where);
		return;
	}

	if (off < next->lr_offset) {
		/* Add a proxy for initial range before the overlap */
		zfs_rangelock_new_proxy(tree, off, next->lr_offset - off);
	}

	new->lr_count = 0; /* will use proxies in tree */
	/*
	 * We now search forward through the ranges, until we go past the end
	 * of the new range. For each entry we make it a proxy if it
	 * isn't already, then bump its reference count. If there are any
	 * gaps between the ranges then we create a new proxy range.
	 */
	for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
		if (off + len <= next->lr_offset)
			break;
		if (prev != NULL && prev->lr_offset + prev->lr_length <
		    next->lr_offset) {
			/* there's a gap */
			ASSERT3U(next->lr_offset, >,
			    prev->lr_offset + prev->lr_length);
			zfs_rangelock_new_proxy(tree,
			    prev->lr_offset + prev->lr_length,
			    next->lr_offset -
			    (prev->lr_offset + prev->lr_length));
		}
		if (off + len == next->lr_offset + next->lr_length) {
			/* exact overlap with end */
			next = zfs_rangelock_proxify(tree, next);
			next->lr_count++;
			return;
		}
		if (off + len < next->lr_offset + next->lr_length) {
			/* new range ends in the middle of this block */
			next = zfs_rangelock_split(tree, next, off + len);
			next->lr_count++;
			return;
		}
		ASSERT3U(off + len, >, next->lr_offset + next->lr_length);
		next = zfs_rangelock_proxify(tree, next);
		next->lr_count++;
	}

	/* Add the remaining end range. */
	zfs_rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length,
	    (off + len) - (prev->lr_offset + prev->lr_length));
}

/*
 * Check if a reader lock can be grabbed. If not, fail immediately or sleep and
 * recheck until available, depending on the value of the "nonblock" parameter.
 */
static boolean_t
zfs_rangelock_enter_reader(zfs_rangelock_t *rl, zfs_locked_range_t *new,
    boolean_t nonblock)
{
	avl_tree_t *tree = &rl->rl_tree;
	zfs_locked_range_t *prev, *next;
	avl_index_t where;
	uint64_t off = new->lr_offset;
	uint64_t len = new->lr_length;

	/*
	 * Look for any writer locks in the range.
	 */
retry:
	prev = avl_find(tree, new, &where);
	if (prev == NULL)
		prev = avl_nearest(tree, where, AVL_BEFORE);

	/*
	 * Check the previous range for a writer lock overlap.
	 */
	if (prev && (off < prev->lr_offset + prev->lr_length)) {
		if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
			if (nonblock)
				return (B_FALSE);
			if (!prev->lr_read_wanted) {
				cv_init(&prev->lr_read_cv,
				    NULL, CV_DEFAULT, NULL);
				prev->lr_read_wanted = B_TRUE;
			}
			cv_wait(&prev->lr_read_cv, &rl->rl_lock);
			goto retry;
		}
		if (off + len < prev->lr_offset + prev->lr_length)
			goto got_lock;
	}

	/*
	 * Search through the following ranges to see if there's an
	 * overlapping write lock.
	 */
	if (prev != NULL)
		next = AVL_NEXT(tree, prev);
	else
		next = avl_nearest(tree, where, AVL_AFTER);
	for (; next != NULL; next = AVL_NEXT(tree, next)) {
		if (off + len <= next->lr_offset)
			goto got_lock;
		if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
			if (nonblock)
				return (B_FALSE);
			if (!next->lr_read_wanted) {
				cv_init(&next->lr_read_cv,
				    NULL, CV_DEFAULT, NULL);
				next->lr_read_wanted = B_TRUE;
			}
			cv_wait(&next->lr_read_cv, &rl->rl_lock);
			goto retry;
		}
		if (off + len <= next->lr_offset + next->lr_length)
			goto got_lock;
	}

got_lock:
	/*
	 * Add the read lock, which may involve splitting existing
	 * locks and bumping ref counts (lr_count).
	 */
	zfs_rangelock_add_reader(tree, new, prev, where);
	return (B_TRUE);
}

/*
 * Lock a range (offset, length) as either shared (RL_READER) or exclusive
 * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert
 * it to a RL_WRITER lock (with the offset at the end of the file). Returns
 * the range lock structure for later unlocking (or reduce range if the
 * entire file is locked as RL_WRITER), or NULL if nonblock is true and the
 * lock could not be acquired immediately.
 */
static zfs_locked_range_t *
zfs_rangelock_enter_impl(zfs_rangelock_t *rl, uint64_t off, uint64_t len,
    zfs_rangelock_type_t type, boolean_t nonblock)
{
	zfs_locked_range_t *new;

	ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);

	new = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP);
	new->lr_rangelock = rl;
	new->lr_offset = off;
	if (len + off < off)	/* overflow */
		len = UINT64_MAX - off;
	new->lr_length = len;
	new->lr_count = 1; /* assume it's going to be in the tree */
	new->lr_type = type;
	new->lr_proxy = B_FALSE;
	new->lr_write_wanted = B_FALSE;
	new->lr_read_wanted = B_FALSE;

	mutex_enter(&rl->rl_lock);
	if (type == RL_READER) {
		/*
		 * First check for the usual case of no locks
		 */
		if (avl_numnodes(&rl->rl_tree) == 0) {
			avl_add(&rl->rl_tree, new);
		} else if (!zfs_rangelock_enter_reader(rl, new, nonblock)) {
			kmem_free(new, sizeof (*new));
			new = NULL;
		}
	} else if (!zfs_rangelock_enter_writer(rl, new, nonblock)) {
		kmem_free(new, sizeof (*new));
		new = NULL;
	}
	mutex_exit(&rl->rl_lock);
	return (new);
}

zfs_locked_range_t *
zfs_rangelock_enter(zfs_rangelock_t *rl, uint64_t off, uint64_t len,
    zfs_rangelock_type_t type)
{
	return (zfs_rangelock_enter_impl(rl, off, len, type, B_FALSE));
}

zfs_locked_range_t *
zfs_rangelock_tryenter(zfs_rangelock_t *rl, uint64_t off, uint64_t len,
    zfs_rangelock_type_t type)
{
	return (zfs_rangelock_enter_impl(rl, off, len, type, B_TRUE));
}
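
/*
 * Example use of the non-blocking variant (an illustrative sketch):
 *
 *	lr = zfs_rangelock_tryenter(rl, off, len, RL_READER);
 *	if (lr == NULL) {
 *		... the range is busy; back off, or block instead ...
 *		lr = zfs_rangelock_enter(rl, off, len, RL_READER);
 *	}
 */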

/*
 * Safely free the zfs_locked_range_t.
 */
static void
zfs_rangelock_free(zfs_locked_range_t *lr)
{
	if (lr->lr_write_wanted)
		cv_destroy(&lr->lr_write_cv);

	if (lr->lr_read_wanted)
		cv_destroy(&lr->lr_read_cv);

	kmem_free(lr, sizeof (zfs_locked_range_t));
}

/*
 * Unlock a reader lock
 */
static void
zfs_rangelock_exit_reader(zfs_rangelock_t *rl, zfs_locked_range_t *remove,
    list_t *free_list)
{
	avl_tree_t *tree = &rl->rl_tree;
	uint64_t len;

	/*
	 * The common case is when the remove entry is in the tree
	 * (lr_count == 1) meaning no other reader locks have overlapped
	 * with this one. Otherwise the remove entry will have been
	 * removed from the tree and replaced by proxies (one or
	 * more ranges mapping to the entire range).
	 */
	if (remove->lr_count == 1) {
		avl_remove(tree, remove);
		if (remove->lr_write_wanted)
			cv_broadcast(&remove->lr_write_cv);
		if (remove->lr_read_wanted)
			cv_broadcast(&remove->lr_read_cv);
		list_insert_tail(free_list, remove);
	} else {
		ASSERT0(remove->lr_count);
		ASSERT0(remove->lr_write_wanted);
		ASSERT0(remove->lr_read_wanted);
		/*
		 * Find start proxy representing this reader lock,
		 * then decrement ref count on all proxies
		 * that make up this range, freeing them as needed.
		 */
		zfs_locked_range_t *lr = avl_find(tree, remove, NULL);
		ASSERT3P(lr, !=, NULL);
		ASSERT3U(lr->lr_count, !=, 0);
		ASSERT3U(lr->lr_type, ==, RL_READER);
		zfs_locked_range_t *next = NULL;
		for (len = remove->lr_length; len != 0; lr = next) {
			len -= lr->lr_length;
			if (len != 0) {
				next = AVL_NEXT(tree, lr);
				ASSERT3P(next, !=, NULL);
				ASSERT3U(lr->lr_offset + lr->lr_length, ==,
				    next->lr_offset);
				ASSERT3U(next->lr_count, !=, 0);
				ASSERT3U(next->lr_type, ==, RL_READER);
			}
			lr->lr_count--;
			if (lr->lr_count == 0) {
				avl_remove(tree, lr);
				if (lr->lr_write_wanted)
					cv_broadcast(&lr->lr_write_cv);
				if (lr->lr_read_wanted)
					cv_broadcast(&lr->lr_read_cv);
				list_insert_tail(free_list, lr);
			}
		}
		kmem_free(remove, sizeof (zfs_locked_range_t));
	}
}
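
/*
 * Continuing the worked example from the header comment (illustrative):
 * when reader B ([50, 150), lr_count == 0) exits, the proxies [50, 100)
 * and [100, 150) are each decremented; [100, 150) drops to zero and is
 * freed, while [50, 100) survives with count 1 for reader A.
 */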

/*
 * Unlock range and destroy range lock structure.
 */
void
zfs_rangelock_exit(zfs_locked_range_t *lr)
{
	zfs_rangelock_t *rl = lr->lr_rangelock;
	list_t free_list;
	zfs_locked_range_t *free_lr;

	ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER);
	ASSERT(lr->lr_count == 1 || lr->lr_count == 0);
	ASSERT(!lr->lr_proxy);

	/*
	 * The free list is used to defer the cv_destroy() and
	 * subsequent kmem_free until after the mutex is dropped.
	 */
	list_create(&free_list, sizeof (zfs_locked_range_t),
	    offsetof(zfs_locked_range_t, lr_node));

	mutex_enter(&rl->rl_lock);
	if (lr->lr_type == RL_WRITER) {
		/* writer locks can't be shared or split */
		avl_remove(&rl->rl_tree, lr);
		if (lr->lr_write_wanted)
			cv_broadcast(&lr->lr_write_cv);
		if (lr->lr_read_wanted)
			cv_broadcast(&lr->lr_read_cv);
		list_insert_tail(&free_list, lr);
	} else {
		/*
		 * lock may be shared, let rangelock_exit_reader()
		 * release the lock and free the zfs_locked_range_t.
		 */
		zfs_rangelock_exit_reader(rl, lr, &free_list);
	}
	mutex_exit(&rl->rl_lock);

	while ((free_lr = list_remove_head(&free_list)) != NULL)
		zfs_rangelock_free(free_lr);

	list_destroy(&free_list);
}

/*
 * Reduce range locked as RL_WRITER from whole file to specified range.
 * Asserts the whole file is exclusively locked and so there's only one
 * entry in the tree.
 */
void
zfs_rangelock_reduce(zfs_locked_range_t *lr, uint64_t off, uint64_t len)
{
	zfs_rangelock_t *rl = lr->lr_rangelock;

	/* Ensure there are no other locks */
	ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1);
	ASSERT3U(lr->lr_offset, ==, 0);
	ASSERT3U(lr->lr_type, ==, RL_WRITER);
	ASSERT(!lr->lr_proxy);
	ASSERT3U(lr->lr_length, ==, UINT64_MAX);
	ASSERT3U(lr->lr_count, ==, 1);

	mutex_enter(&rl->rl_lock);
	lr->lr_offset = off;
	lr->lr_length = len;
	mutex_exit(&rl->rl_lock);
	if (lr->lr_write_wanted)
		cv_broadcast(&lr->lr_write_cv);
	if (lr->lr_read_wanted)
		cv_broadcast(&lr->lr_read_cv);
}

#if defined(_KERNEL)
EXPORT_SYMBOL(zfs_rangelock_init);
EXPORT_SYMBOL(zfs_rangelock_fini);
EXPORT_SYMBOL(zfs_rangelock_enter);
EXPORT_SYMBOL(zfs_rangelock_tryenter);
EXPORT_SYMBOL(zfs_rangelock_exit);
EXPORT_SYMBOL(zfs_rangelock_reduce);
#endif