/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

/*
 * This file contains the code to implement file range locking in
 * ZFS, although there isn't much specific to ZFS (all that comes to mind is
 * support for growing the blocksize).
 *
 * Interface
 * ---------
 * Defined in zfs_rlock.h but essentially:
 *	lr = rangelock_enter(zp, off, len, lock_type);
 *	rangelock_reduce(lr, off, len); // optional
 *	rangelock_exit(lr);
 *
 * AVL tree
 * --------
 * An AVL tree is used to maintain the state of the existing ranges
 * that are locked for exclusive (writer) or shared (reader) use.
 * The starting range offset is used for searching and sorting the tree.
 *
 * Common case
 * -----------
 * The (hopefully) usual case is of no overlaps or contention for locks. On
 * entry to rangelock_enter(), a locked_range_t is allocated; the tree
 * searched that finds no overlap, and *this* locked_range_t is placed in the
 * tree.
 *
 * Overlaps/Reference counting/Proxy locks
 * ---------------------------------------
 * The avl code only allows one node at a particular offset. Also it's very
 * inefficient to search through all previous entries looking for overlaps
 * (because the very 1st in the ordered list might be at offset 0 but
 * cover the whole file).
 * So this implementation uses reference counts and proxy range locks.
 * Firstly, only reader locks use reference counts and proxy locks,
 * because writer locks are exclusive.
 * When a reader lock overlaps with another then a proxy lock is created
 * for that range and replaces the original lock. If the overlap
 * is exact then the reference count of the proxy is simply incremented.
 * Otherwise, the proxy lock is split into smaller lock ranges and
 * new proxy locks created for non overlapping ranges.
 * The reference counts are adjusted accordingly.
 * Meanwhile, the original lock is kept around (this is the callers handle)
 * and its offset and length are used when releasing the lock.
 *
 * Thread coordination
 * -------------------
 * In order to make wakeups efficient and to ensure multiple continuous
 * readers on a range don't starve a writer for the same range lock,
 * two condition variables are allocated in each rl_t.
 * If a writer (or reader) can't get a range it initialises the writer
 * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
 * and waits on that cv. When a thread unlocks that range it wakes up all
 * writers then all readers before destroying the lock.
 *
 * Append mode writes
 * ------------------
 * Append mode writes need to lock a range at the end of a file.
 * The offset of the end of the file is determined under the
 * range locking mutex, and the lock type converted from RL_APPEND to
 * RL_WRITER and the range locked.
 *
 * Grow block handling
 * -------------------
 * ZFS supports multiple block sizes, up to 16MB. The smallest
 * block size is used for the file which is grown as needed. During this
 * growth all other writers and readers must be excluded.
 * So if the block size needs to be grown then the whole file is
 * exclusively locked, then later the caller will reduce the lock
 * range to just the range to be written using rangelock_reduce().
 */
98 | ||
5d43cc9a | 99 | #include <sys/zfs_context.h> |
34dc7c2f | 100 | #include <sys/zfs_rlock.h> |
5d43cc9a MA |
101 | |
102 | /* | |
103 | * AVL comparison function used to order range locks | |
104 | * Locks are ordered on the start offset of the range. | |
105 | */ | |
106 | static int | |
2cc479d0 | 107 | zfs_rangelock_compare(const void *arg1, const void *arg2) |
5d43cc9a | 108 | { |
bd4dde8e MM |
109 | const zfs_locked_range_t *rl1 = (const zfs_locked_range_t *)arg1; |
110 | const zfs_locked_range_t *rl2 = (const zfs_locked_range_t *)arg2; | |
5d43cc9a | 111 | |
ca577779 | 112 | return (TREE_CMP(rl1->lr_offset, rl2->lr_offset)); |
5d43cc9a MA |
113 | } |
114 | ||
115 | /* | |
116 | * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock. | |
117 | * It must convert RL_APPEND to RL_WRITER (starting at the end of the file), | |
118 | * and may increase the range that's locked for RL_WRITER. | |
119 | */ | |
120 | void | |
bd4dde8e | 121 | zfs_rangelock_init(zfs_rangelock_t *rl, zfs_rangelock_cb_t *cb, void *arg) |
5d43cc9a MA |
122 | { |
123 | mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL); | |
2cc479d0 | 124 | avl_create(&rl->rl_tree, zfs_rangelock_compare, |
bd4dde8e | 125 | sizeof (zfs_locked_range_t), offsetof(zfs_locked_range_t, lr_node)); |
5d43cc9a MA |
126 | rl->rl_cb = cb; |
127 | rl->rl_arg = arg; | |
128 | } | |
129 | ||
130 | void | |
bd4dde8e | 131 | zfs_rangelock_fini(zfs_rangelock_t *rl) |
5d43cc9a MA |
132 | { |
133 | mutex_destroy(&rl->rl_lock); | |
134 | avl_destroy(&rl->rl_tree); | |
135 | } | |
34dc7c2f BB |
136 | |
137 | /* | |
138 | * Check if a write lock can be grabbed, or wait and recheck until available. | |
139 | */ | |
140 | static void | |
bd4dde8e | 141 | zfs_rangelock_enter_writer(zfs_rangelock_t *rl, zfs_locked_range_t *new) |
34dc7c2f | 142 | { |
5d43cc9a | 143 | avl_tree_t *tree = &rl->rl_tree; |
bd4dde8e | 144 | zfs_locked_range_t *lr; |
34dc7c2f | 145 | avl_index_t where; |
5d43cc9a MA |
146 | uint64_t orig_off = new->lr_offset; |
147 | uint64_t orig_len = new->lr_length; | |
bd4dde8e | 148 | zfs_rangelock_type_t orig_type = new->lr_type; |
34dc7c2f BB |
149 | |
150 | for (;;) { | |
151 | /* | |
5d43cc9a MA |
152 | * Call callback which can modify new->r_off,len,type. |
153 | * Note, the callback is used by the ZPL to handle appending | |
154 | * and changing blocksizes. It isn't needed for zvols. | |
34dc7c2f | 155 | */ |
5d43cc9a MA |
156 | if (rl->rl_cb != NULL) { |
157 | rl->rl_cb(new, rl->rl_arg); | |
34dc7c2f BB |
158 | } |
159 | ||
5d43cc9a MA |
160 | /* |
161 | * If the type was APPEND, the callback must convert it to | |
162 | * WRITER. | |
163 | */ | |
164 | ASSERT3U(new->lr_type, ==, RL_WRITER); | |
165 | ||
34dc7c2f BB |
166 | /* |
167 | * First check for the usual case of no locks | |
168 | */ | |
169 | if (avl_numnodes(tree) == 0) { | |
34dc7c2f BB |
170 | avl_add(tree, new); |
171 | return; | |
172 | } | |
173 | ||
174 | /* | |
175 | * Look for any locks in the range. | |
176 | */ | |
5d43cc9a MA |
177 | lr = avl_find(tree, new, &where); |
178 | if (lr != NULL) | |
34dc7c2f BB |
179 | goto wait; /* already locked at same offset */ |
180 | ||
bd4dde8e | 181 | lr = avl_nearest(tree, where, AVL_AFTER); |
5d43cc9a MA |
182 | if (lr != NULL && |
183 | lr->lr_offset < new->lr_offset + new->lr_length) | |
34dc7c2f BB |
184 | goto wait; |
185 | ||
bd4dde8e | 186 | lr = avl_nearest(tree, where, AVL_BEFORE); |
5d43cc9a MA |
187 | if (lr != NULL && |
188 | lr->lr_offset + lr->lr_length > new->lr_offset) | |
34dc7c2f BB |
189 | goto wait; |
190 | ||
34dc7c2f BB |
191 | avl_insert(tree, new, where); |
192 | return; | |
193 | wait: | |
5d43cc9a MA |
194 | if (!lr->lr_write_wanted) { |
195 | cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL); | |
196 | lr->lr_write_wanted = B_TRUE; | |
34dc7c2f | 197 | } |
5d43cc9a | 198 | cv_wait(&lr->lr_write_cv, &rl->rl_lock); |
34dc7c2f BB |
199 | |
200 | /* reset to original */ | |
5d43cc9a MA |
201 | new->lr_offset = orig_off; |
202 | new->lr_length = orig_len; | |
203 | new->lr_type = orig_type; | |
34dc7c2f BB |
204 | } |
205 | } | |
206 | ||
207 | /* | |
208 | * If this is an original (non-proxy) lock then replace it by | |
209 | * a proxy and return the proxy. | |
210 | */ | |
bd4dde8e MM |
211 | static zfs_locked_range_t * |
212 | zfs_rangelock_proxify(avl_tree_t *tree, zfs_locked_range_t *lr) | |
34dc7c2f | 213 | { |
bd4dde8e | 214 | zfs_locked_range_t *proxy; |
34dc7c2f | 215 | |
5d43cc9a MA |
216 | if (lr->lr_proxy) |
217 | return (lr); /* already a proxy */ | |
34dc7c2f | 218 | |
5d43cc9a MA |
219 | ASSERT3U(lr->lr_count, ==, 1); |
220 | ASSERT(lr->lr_write_wanted == B_FALSE); | |
221 | ASSERT(lr->lr_read_wanted == B_FALSE); | |
222 | avl_remove(tree, lr); | |
223 | lr->lr_count = 0; | |
34dc7c2f BB |
224 | |
225 | /* create a proxy range lock */ | |
bd4dde8e | 226 | proxy = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP); |
5d43cc9a MA |
227 | proxy->lr_offset = lr->lr_offset; |
228 | proxy->lr_length = lr->lr_length; | |
229 | proxy->lr_count = 1; | |
230 | proxy->lr_type = RL_READER; | |
231 | proxy->lr_proxy = B_TRUE; | |
232 | proxy->lr_write_wanted = B_FALSE; | |
233 | proxy->lr_read_wanted = B_FALSE; | |
34dc7c2f BB |
234 | avl_add(tree, proxy); |
235 | ||
236 | return (proxy); | |
237 | } | |
238 | ||
239 | /* | |
240 | * Split the range lock at the supplied offset | |
241 | * returning the *front* proxy. | |
242 | */ | |
bd4dde8e MM |
243 | static zfs_locked_range_t * |
244 | zfs_rangelock_split(avl_tree_t *tree, zfs_locked_range_t *lr, uint64_t off) | |
34dc7c2f | 245 | { |
bd4dde8e MM |
246 | zfs_locked_range_t *rear; |
247 | ||
5d43cc9a MA |
248 | ASSERT3U(lr->lr_length, >, 1); |
249 | ASSERT3U(off, >, lr->lr_offset); | |
250 | ASSERT3U(off, <, lr->lr_offset + lr->lr_length); | |
251 | ASSERT(lr->lr_write_wanted == B_FALSE); | |
252 | ASSERT(lr->lr_read_wanted == B_FALSE); | |
34dc7c2f BB |
253 | |
254 | /* create the rear proxy range lock */ | |
bd4dde8e | 255 | rear = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP); |
5d43cc9a MA |
256 | rear->lr_offset = off; |
257 | rear->lr_length = lr->lr_offset + lr->lr_length - off; | |
258 | rear->lr_count = lr->lr_count; | |
259 | rear->lr_type = RL_READER; | |
260 | rear->lr_proxy = B_TRUE; | |
261 | rear->lr_write_wanted = B_FALSE; | |
262 | rear->lr_read_wanted = B_FALSE; | |
263 | ||
bd4dde8e | 264 | zfs_locked_range_t *front = zfs_rangelock_proxify(tree, lr); |
5d43cc9a | 265 | front->lr_length = off - lr->lr_offset; |
34dc7c2f BB |
266 | |
267 | avl_insert_here(tree, rear, front, AVL_AFTER); | |
268 | return (front); | |
269 | } | |
270 | ||
271 | /* | |
272 | * Create and add a new proxy range lock for the supplied range. | |
273 | */ | |
274 | static void | |
2cc479d0 | 275 | zfs_rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) |
34dc7c2f | 276 | { |
bd4dde8e MM |
277 | zfs_locked_range_t *lr; |
278 | ||
5d43cc9a | 279 | ASSERT(len != 0); |
bd4dde8e | 280 | lr = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP); |
5d43cc9a MA |
281 | lr->lr_offset = off; |
282 | lr->lr_length = len; | |
283 | lr->lr_count = 1; | |
284 | lr->lr_type = RL_READER; | |
285 | lr->lr_proxy = B_TRUE; | |
286 | lr->lr_write_wanted = B_FALSE; | |
287 | lr->lr_read_wanted = B_FALSE; | |
288 | avl_add(tree, lr); | |
34dc7c2f BB |
289 | } |
290 | ||
291 | static void | |
bd4dde8e MM |
292 | zfs_rangelock_add_reader(avl_tree_t *tree, zfs_locked_range_t *new, |
293 | zfs_locked_range_t *prev, avl_index_t where) | |
34dc7c2f | 294 | { |
bd4dde8e | 295 | zfs_locked_range_t *next; |
5d43cc9a MA |
296 | uint64_t off = new->lr_offset; |
297 | uint64_t len = new->lr_length; | |
34dc7c2f BB |
298 | |
299 | /* | |
300 | * prev arrives either: | |
301 | * - pointing to an entry at the same offset | |
302 | * - pointing to the entry with the closest previous offset whose | |
303 | * range may overlap with the new range | |
304 | * - null, if there were no ranges starting before the new one | |
305 | */ | |
5d43cc9a MA |
306 | if (prev != NULL) { |
307 | if (prev->lr_offset + prev->lr_length <= off) { | |
34dc7c2f | 308 | prev = NULL; |
5d43cc9a | 309 | } else if (prev->lr_offset != off) { |
34dc7c2f BB |
310 | /* |
311 | * convert to proxy if needed then | |
312 | * split this entry and bump ref count | |
313 | */ | |
2cc479d0 | 314 | prev = zfs_rangelock_split(tree, prev, off); |
34dc7c2f BB |
315 | prev = AVL_NEXT(tree, prev); /* move to rear range */ |
316 | } | |
317 | } | |
5d43cc9a | 318 | ASSERT((prev == NULL) || (prev->lr_offset == off)); |
34dc7c2f | 319 | |
5d43cc9a | 320 | if (prev != NULL) |
34dc7c2f BB |
321 | next = prev; |
322 | else | |
5d43cc9a | 323 | next = avl_nearest(tree, where, AVL_AFTER); |
34dc7c2f | 324 | |
5d43cc9a | 325 | if (next == NULL || off + len <= next->lr_offset) { |
34dc7c2f BB |
326 | /* no overlaps, use the original new rl_t in the tree */ |
327 | avl_insert(tree, new, where); | |
328 | return; | |
329 | } | |
330 | ||
5d43cc9a | 331 | if (off < next->lr_offset) { |
34dc7c2f | 332 | /* Add a proxy for initial range before the overlap */ |
2cc479d0 | 333 | zfs_rangelock_new_proxy(tree, off, next->lr_offset - off); |
34dc7c2f BB |
334 | } |
335 | ||
5d43cc9a | 336 | new->lr_count = 0; /* will use proxies in tree */ |
34dc7c2f BB |
337 | /* |
338 | * We now search forward through the ranges, until we go past the end | |
339 | * of the new range. For each entry we make it a proxy if it | |
340 | * isn't already, then bump its reference count. If there's any | |
341 | * gaps between the ranges then we create a new proxy range. | |
342 | */ | |
343 | for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { | |
5d43cc9a | 344 | if (off + len <= next->lr_offset) |
34dc7c2f | 345 | break; |
5d43cc9a MA |
346 | if (prev != NULL && prev->lr_offset + prev->lr_length < |
347 | next->lr_offset) { | |
34dc7c2f | 348 | /* there's a gap */ |
5d43cc9a MA |
349 | ASSERT3U(next->lr_offset, >, |
350 | prev->lr_offset + prev->lr_length); | |
2cc479d0 | 351 | zfs_rangelock_new_proxy(tree, |
5d43cc9a MA |
352 | prev->lr_offset + prev->lr_length, |
353 | next->lr_offset - | |
354 | (prev->lr_offset + prev->lr_length)); | |
34dc7c2f | 355 | } |
5d43cc9a | 356 | if (off + len == next->lr_offset + next->lr_length) { |
34dc7c2f | 357 | /* exact overlap with end */ |
2cc479d0 | 358 | next = zfs_rangelock_proxify(tree, next); |
5d43cc9a | 359 | next->lr_count++; |
34dc7c2f BB |
360 | return; |
361 | } | |
5d43cc9a | 362 | if (off + len < next->lr_offset + next->lr_length) { |
34dc7c2f | 363 | /* new range ends in the middle of this block */ |
2cc479d0 | 364 | next = zfs_rangelock_split(tree, next, off + len); |
5d43cc9a | 365 | next->lr_count++; |
34dc7c2f BB |
366 | return; |
367 | } | |
5d43cc9a | 368 | ASSERT3U(off + len, >, next->lr_offset + next->lr_length); |
2cc479d0 | 369 | next = zfs_rangelock_proxify(tree, next); |
5d43cc9a | 370 | next->lr_count++; |
34dc7c2f BB |
371 | } |
372 | ||
373 | /* Add the remaining end range. */ | |
2cc479d0 | 374 | zfs_rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length, |
5d43cc9a | 375 | (off + len) - (prev->lr_offset + prev->lr_length)); |
34dc7c2f BB |
376 | } |
377 | ||
378 | /* | |
379 | * Check if a reader lock can be grabbed, or wait and recheck until available. | |
380 | */ | |
381 | static void | |
bd4dde8e | 382 | zfs_rangelock_enter_reader(zfs_rangelock_t *rl, zfs_locked_range_t *new) |
34dc7c2f | 383 | { |
5d43cc9a | 384 | avl_tree_t *tree = &rl->rl_tree; |
bd4dde8e | 385 | zfs_locked_range_t *prev, *next; |
34dc7c2f | 386 | avl_index_t where; |
5d43cc9a MA |
387 | uint64_t off = new->lr_offset; |
388 | uint64_t len = new->lr_length; | |
34dc7c2f BB |
389 | |
390 | /* | |
391 | * Look for any writer locks in the range. | |
392 | */ | |
393 | retry: | |
394 | prev = avl_find(tree, new, &where); | |
395 | if (prev == NULL) | |
bd4dde8e | 396 | prev = avl_nearest(tree, where, AVL_BEFORE); |
34dc7c2f BB |
397 | |
398 | /* | |
399 | * Check the previous range for a writer lock overlap. | |
400 | */ | |
5d43cc9a MA |
401 | if (prev && (off < prev->lr_offset + prev->lr_length)) { |
402 | if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) { | |
403 | if (!prev->lr_read_wanted) { | |
404 | cv_init(&prev->lr_read_cv, | |
405 | NULL, CV_DEFAULT, NULL); | |
406 | prev->lr_read_wanted = B_TRUE; | |
34dc7c2f | 407 | } |
5d43cc9a | 408 | cv_wait(&prev->lr_read_cv, &rl->rl_lock); |
34dc7c2f BB |
409 | goto retry; |
410 | } | |
5d43cc9a | 411 | if (off + len < prev->lr_offset + prev->lr_length) |
34dc7c2f BB |
412 | goto got_lock; |
413 | } | |
414 | ||
415 | /* | |
416 | * Search through the following ranges to see if there's | |
417 | * write lock any overlap. | |
418 | */ | |
5d43cc9a | 419 | if (prev != NULL) |
34dc7c2f BB |
420 | next = AVL_NEXT(tree, prev); |
421 | else | |
bd4dde8e | 422 | next = avl_nearest(tree, where, AVL_AFTER); |
5d43cc9a MA |
423 | for (; next != NULL; next = AVL_NEXT(tree, next)) { |
424 | if (off + len <= next->lr_offset) | |
34dc7c2f | 425 | goto got_lock; |
5d43cc9a MA |
426 | if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) { |
427 | if (!next->lr_read_wanted) { | |
428 | cv_init(&next->lr_read_cv, | |
429 | NULL, CV_DEFAULT, NULL); | |
430 | next->lr_read_wanted = B_TRUE; | |
34dc7c2f | 431 | } |
5d43cc9a | 432 | cv_wait(&next->lr_read_cv, &rl->rl_lock); |
34dc7c2f BB |
433 | goto retry; |
434 | } | |
5d43cc9a | 435 | if (off + len <= next->lr_offset + next->lr_length) |
34dc7c2f BB |
436 | goto got_lock; |
437 | } | |
438 | ||
439 | got_lock: | |
440 | /* | |
441 | * Add the read lock, which may involve splitting existing | |
5d43cc9a | 442 | * locks and bumping ref counts (r_count). |
34dc7c2f | 443 | */ |
2cc479d0 | 444 | zfs_rangelock_add_reader(tree, new, prev, where); |
34dc7c2f BB |
445 | } |
446 | ||
447 | /* | |
5d43cc9a MA |
448 | * Lock a range (offset, length) as either shared (RL_READER) or exclusive |
449 | * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert | |
450 | * it to a RL_WRITER lock (with the offset at the end of the file). Returns | |
451 | * the range lock structure for later unlocking (or reduce range if the | |
452 | * entire file is locked as RL_WRITER). | |
34dc7c2f | 453 | */ |
bd4dde8e MM |
454 | zfs_locked_range_t * |
455 | zfs_rangelock_enter(zfs_rangelock_t *rl, uint64_t off, uint64_t len, | |
456 | zfs_rangelock_type_t type) | |
34dc7c2f | 457 | { |
bd4dde8e MM |
458 | zfs_locked_range_t *new; |
459 | ||
34dc7c2f BB |
460 | ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); |
461 | ||
bd4dde8e | 462 | new = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP); |
5d43cc9a MA |
463 | new->lr_rangelock = rl; |
464 | new->lr_offset = off; | |
d164b209 BB |
465 | if (len + off < off) /* overflow */ |
466 | len = UINT64_MAX - off; | |
5d43cc9a MA |
467 | new->lr_length = len; |
468 | new->lr_count = 1; /* assume it's going to be in the tree */ | |
469 | new->lr_type = type; | |
470 | new->lr_proxy = B_FALSE; | |
471 | new->lr_write_wanted = B_FALSE; | |
472 | new->lr_read_wanted = B_FALSE; | |
473 | ||
474 | mutex_enter(&rl->rl_lock); | |
34dc7c2f BB |
475 | if (type == RL_READER) { |
476 | /* | |
477 | * First check for the usual case of no locks | |
478 | */ | |
5d43cc9a MA |
479 | if (avl_numnodes(&rl->rl_tree) == 0) |
480 | avl_add(&rl->rl_tree, new); | |
34dc7c2f | 481 | else |
2cc479d0 MM |
482 | zfs_rangelock_enter_reader(rl, new); |
483 | } else { | |
484 | /* RL_WRITER or RL_APPEND */ | |
485 | zfs_rangelock_enter_writer(rl, new); | |
486 | } | |
5d43cc9a | 487 | mutex_exit(&rl->rl_lock); |
34dc7c2f BB |
488 | return (new); |
489 | } | |
490 | ||
5d43cc9a | 491 | /* |
bd4dde8e | 492 | * Safely free the zfs_locked_range_t. |
5d43cc9a | 493 | */ |
8926ab7a | 494 | static void |
bd4dde8e | 495 | zfs_rangelock_free(zfs_locked_range_t *lr) |
8926ab7a | 496 | { |
5d43cc9a MA |
497 | if (lr->lr_write_wanted) |
498 | cv_destroy(&lr->lr_write_cv); | |
8926ab7a | 499 | |
5d43cc9a MA |
500 | if (lr->lr_read_wanted) |
501 | cv_destroy(&lr->lr_read_cv); | |
8926ab7a | 502 | |
bd4dde8e | 503 | kmem_free(lr, sizeof (zfs_locked_range_t)); |
8926ab7a BB |
504 | } |
505 | ||
34dc7c2f BB |
506 | /* |
507 | * Unlock a reader lock | |
508 | */ | |
509 | static void | |
bd4dde8e | 510 | zfs_rangelock_exit_reader(zfs_rangelock_t *rl, zfs_locked_range_t *remove, |
5d43cc9a | 511 | list_t *free_list) |
34dc7c2f | 512 | { |
5d43cc9a | 513 | avl_tree_t *tree = &rl->rl_tree; |
34dc7c2f BB |
514 | uint64_t len; |
515 | ||
516 | /* | |
517 | * The common case is when the remove entry is in the tree | |
518 | * (cnt == 1) meaning there's been no other reader locks overlapping | |
519 | * with this one. Otherwise the remove entry will have been | |
520 | * removed from the tree and replaced by proxies (one or | |
521 | * more ranges mapping to the entire range). | |
522 | */ | |
5d43cc9a | 523 | if (remove->lr_count == 1) { |
34dc7c2f | 524 | avl_remove(tree, remove); |
5d43cc9a MA |
525 | if (remove->lr_write_wanted) |
526 | cv_broadcast(&remove->lr_write_cv); | |
527 | if (remove->lr_read_wanted) | |
528 | cv_broadcast(&remove->lr_read_cv); | |
450dc149 | 529 | list_insert_tail(free_list, remove); |
34dc7c2f | 530 | } else { |
5d43cc9a MA |
531 | ASSERT0(remove->lr_count); |
532 | ASSERT0(remove->lr_write_wanted); | |
533 | ASSERT0(remove->lr_read_wanted); | |
34dc7c2f BB |
534 | /* |
535 | * Find start proxy representing this reader lock, | |
536 | * then decrement ref count on all proxies | |
537 | * that make up this range, freeing them as needed. | |
538 | */ | |
bd4dde8e | 539 | zfs_locked_range_t *lr = avl_find(tree, remove, NULL); |
5d43cc9a MA |
540 | ASSERT3P(lr, !=, NULL); |
541 | ASSERT3U(lr->lr_count, !=, 0); | |
542 | ASSERT3U(lr->lr_type, ==, RL_READER); | |
bd4dde8e | 543 | zfs_locked_range_t *next = NULL; |
5d43cc9a MA |
544 | for (len = remove->lr_length; len != 0; lr = next) { |
545 | len -= lr->lr_length; | |
546 | if (len != 0) { | |
547 | next = AVL_NEXT(tree, lr); | |
548 | ASSERT3P(next, !=, NULL); | |
549 | ASSERT3U(lr->lr_offset + lr->lr_length, ==, | |
550 | next->lr_offset); | |
551 | ASSERT3U(next->lr_count, !=, 0); | |
552 | ASSERT3U(next->lr_type, ==, RL_READER); | |
34dc7c2f | 553 | } |
5d43cc9a MA |
554 | lr->lr_count--; |
555 | if (lr->lr_count == 0) { | |
556 | avl_remove(tree, lr); | |
557 | if (lr->lr_write_wanted) | |
558 | cv_broadcast(&lr->lr_write_cv); | |
559 | if (lr->lr_read_wanted) | |
560 | cv_broadcast(&lr->lr_read_cv); | |
561 | list_insert_tail(free_list, lr); | |
34dc7c2f BB |
562 | } |
563 | } | |
bd4dde8e | 564 | kmem_free(remove, sizeof (zfs_locked_range_t)); |
34dc7c2f | 565 | } |
34dc7c2f BB |
566 | } |
567 | ||
568 | /* | |
569 | * Unlock range and destroy range lock structure. | |
570 | */ | |
571 | void | |
bd4dde8e | 572 | zfs_rangelock_exit(zfs_locked_range_t *lr) |
34dc7c2f | 573 | { |
bd4dde8e | 574 | zfs_rangelock_t *rl = lr->lr_rangelock; |
450dc149 | 575 | list_t free_list; |
bd4dde8e | 576 | zfs_locked_range_t *free_lr; |
34dc7c2f | 577 | |
5d43cc9a MA |
578 | ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER); |
579 | ASSERT(lr->lr_count == 1 || lr->lr_count == 0); | |
580 | ASSERT(!lr->lr_proxy); | |
8926ab7a | 581 | |
5d43cc9a MA |
582 | /* |
583 | * The free list is used to defer the cv_destroy() and | |
584 | * subsequent kmem_free until after the mutex is dropped. | |
585 | */ | |
bd4dde8e MM |
586 | list_create(&free_list, sizeof (zfs_locked_range_t), |
587 | offsetof(zfs_locked_range_t, lr_node)); | |
8926ab7a | 588 | |
5d43cc9a MA |
589 | mutex_enter(&rl->rl_lock); |
590 | if (lr->lr_type == RL_WRITER) { | |
591 | /* writer locks can't be shared or split */ | |
592 | avl_remove(&rl->rl_tree, lr); | |
593 | if (lr->lr_write_wanted) | |
594 | cv_broadcast(&lr->lr_write_cv); | |
595 | if (lr->lr_read_wanted) | |
596 | cv_broadcast(&lr->lr_read_cv); | |
597 | list_insert_tail(&free_list, lr); | |
34dc7c2f BB |
598 | } else { |
599 | /* | |
5d43cc9a | 600 | * lock may be shared, let rangelock_exit_reader() |
bd4dde8e | 601 | * release the lock and free the zfs_locked_range_t. |
34dc7c2f | 602 | */ |
2cc479d0 | 603 | zfs_rangelock_exit_reader(rl, lr, &free_list); |
34dc7c2f | 604 | } |
5d43cc9a | 605 | mutex_exit(&rl->rl_lock); |
450dc149 | 606 | |
5d43cc9a | 607 | while ((free_lr = list_remove_head(&free_list)) != NULL) |
2cc479d0 | 608 | zfs_rangelock_free(free_lr); |
450dc149 BB |
609 | |
610 | list_destroy(&free_list); | |
34dc7c2f BB |
611 | } |
612 | ||
613 | /* | |
614 | * Reduce range locked as RL_WRITER from whole file to specified range. | |
5d43cc9a | 615 | * Asserts the whole file is exclusively locked and so there's only one |
34dc7c2f BB |
616 | * entry in the tree. |
617 | */ | |
618 | void | |
bd4dde8e | 619 | zfs_rangelock_reduce(zfs_locked_range_t *lr, uint64_t off, uint64_t len) |
34dc7c2f | 620 | { |
bd4dde8e | 621 | zfs_rangelock_t *rl = lr->lr_rangelock; |
34dc7c2f BB |
622 | |
623 | /* Ensure there are no other locks */ | |
5d43cc9a MA |
624 | ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1); |
625 | ASSERT3U(lr->lr_offset, ==, 0); | |
626 | ASSERT3U(lr->lr_type, ==, RL_WRITER); | |
627 | ASSERT(!lr->lr_proxy); | |
628 | ASSERT3U(lr->lr_length, ==, UINT64_MAX); | |
629 | ASSERT3U(lr->lr_count, ==, 1); | |
630 | ||
631 | mutex_enter(&rl->rl_lock); | |
632 | lr->lr_offset = off; | |
633 | lr->lr_length = len; | |
634 | mutex_exit(&rl->rl_lock); | |
635 | if (lr->lr_write_wanted) | |
636 | cv_broadcast(&lr->lr_write_cv); | |
637 | if (lr->lr_read_wanted) | |
638 | cv_broadcast(&lr->lr_read_cv); | |
34dc7c2f | 639 | } |
d88895a0 | 640 | |
#if defined(_KERNEL)
EXPORT_SYMBOL(zfs_rangelock_init);
EXPORT_SYMBOL(zfs_rangelock_fini);
EXPORT_SYMBOL(zfs_rangelock_enter);
EXPORT_SYMBOL(zfs_rangelock_exit);
EXPORT_SYMBOL(zfs_rangelock_reduce);
#endif