]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright 2010 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f BB |
23 | * Use is subject to license terms. |
24 | */ | |
c99c9001 | 25 | /* |
5d43cc9a | 26 | * Copyright (c) 2012, 2018 by Delphix. All rights reserved. |
c99c9001 | 27 | */ |
34dc7c2f | 28 | |
34dc7c2f BB |
29 | /* |
30 | * This file contains the code to implement file range locking in | |
d3cc8b15 | 31 | * ZFS, although there isn't much specific to ZFS (all that comes to mind is |
34dc7c2f BB |
32 | * support for growing the blocksize). |
33 | * | |
34 | * Interface | |
35 | * --------- | |
36 | * Defined in zfs_rlock.h but essentially: | |
5d43cc9a MA |
37 | * lr = rangelock_enter(zp, off, len, lock_type); |
38 | * rangelock_reduce(lr, off, len); // optional | |
39 | * rangelock_exit(lr); | |
34dc7c2f BB |
40 | * |
41 | * AVL tree | |
42 | * -------- | |
43 | * An AVL tree is used to maintain the state of the existing ranges | |
44 | * that are locked for exclusive (writer) or shared (reader) use. | |
45 | * The starting range offset is used for searching and sorting the tree. | |
46 | * | |
47 | * Common case | |
48 | * ----------- | |
5d43cc9a MA |
49 | * The (hopefully) usual case is of no overlaps or contention for locks. On |
50 | * entry to rangelock_enter(), a locked_range_t is allocated; the tree | |
51 | * searched that finds no overlap, and *this* locked_range_t is placed in the | |
52 | * tree. | |
34dc7c2f BB |
53 | * |
54 | * Overlaps/Reference counting/Proxy locks | |
55 | * --------------------------------------- | |
56 | * The avl code only allows one node at a particular offset. Also it's very | |
57 | * inefficient to search through all previous entries looking for overlaps | |
58 | * (because the very 1st in the ordered list might be at offset 0 but | |
59 | * cover the whole file). | |
60 | * So this implementation uses reference counts and proxy range locks. | |
61 | * Firstly, only reader locks use reference counts and proxy locks, | |
62 | * because writer locks are exclusive. | |
63 | * When a reader lock overlaps with another then a proxy lock is created | |
64 | * for that range and replaces the original lock. If the overlap | |
65 | * is exact then the reference count of the proxy is simply incremented. | |
66 | * Otherwise, the proxy lock is split into smaller lock ranges and | |
67 | * new proxy locks created for non overlapping ranges. | |
68 | * The reference counts are adjusted accordingly. | |
4e33ba4c | 69 | * Meanwhile, the original lock is kept around (this is the caller's handle) |
34dc7c2f BB |
70 | * and its offset and length are used when releasing the lock. |
71 | * | |
72 | * Thread coordination | |
73 | * ------------------- | |
74 | * In order to make wakeups efficient and to ensure multiple continuous | |
75 | * readers on a range don't starve a writer for the same range lock, | |
76 | * two condition variables are allocated in each rl_t. | |
77 | * If a writer (or reader) can't get a range it initialises the writer | |
78 | * (or reader) cv; sets a flag saying there's a writer (or reader) waiting; | |
79 | * and waits on that cv. When a thread unlocks that range it wakes up all | |
80 | * writers then all readers before destroying the lock. | |
81 | * | |
82 | * Append mode writes | |
83 | * ------------------ | |
84 | * Append mode writes need to lock a range at the end of a file. | |
85 | * The offset of the end of the file is determined under the | |
86 | * range locking mutex, and the lock type converted from RL_APPEND to | |
87 | * RL_WRITER and the range locked. | |
88 | * | |
89 | * Grow block handling | |
90 | * ------------------- | |
5d43cc9a | 91 | * ZFS supports multiple block sizes, up to 16MB. The smallest |
34dc7c2f BB |
92 | * block size is used for the file which is grown as needed. During this |
93 | * growth all other writers and readers must be excluded. | |
94 | * So if the block size needs to be grown then the whole file is | |
95 | * exclusively locked, then later the caller will reduce the lock | |
5d43cc9a | 96 | * range to just the range to be written using rangelock_reduce(). |
34dc7c2f BB |
97 | */ |
98 | ||
5d43cc9a | 99 | #include <sys/zfs_context.h> |
34dc7c2f | 100 | #include <sys/zfs_rlock.h> |
5d43cc9a MA |
101 | |
102 | /* | |
103 | * AVL comparison function used to order range locks | |
104 | * Locks are ordered on the start offset of the range. | |
105 | */ | |
106 | static int | |
107 | rangelock_compare(const void *arg1, const void *arg2) | |
108 | { | |
109 | const locked_range_t *rl1 = (const locked_range_t *)arg1; | |
110 | const locked_range_t *rl2 = (const locked_range_t *)arg2; | |
111 | ||
112 | return (AVL_CMP(rl1->lr_offset, rl2->lr_offset)); | |
113 | } | |
114 | ||
115 | /* | |
116 | * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock. | |
117 | * It must convert RL_APPEND to RL_WRITER (starting at the end of the file), | |
118 | * and may increase the range that's locked for RL_WRITER. | |
119 | */ | |
120 | void | |
121 | rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg) | |
122 | { | |
123 | mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL); | |
124 | avl_create(&rl->rl_tree, rangelock_compare, | |
125 | sizeof (locked_range_t), offsetof(locked_range_t, lr_node)); | |
126 | rl->rl_cb = cb; | |
127 | rl->rl_arg = arg; | |
128 | } | |
129 | ||
130 | void | |
131 | rangelock_fini(rangelock_t *rl) | |
132 | { | |
133 | mutex_destroy(&rl->rl_lock); | |
134 | avl_destroy(&rl->rl_tree); | |
135 | } | |
34dc7c2f BB |
136 | |
/*
 * Check if a write lock can be grabbed, or wait and recheck until available.
 * Called with rl->rl_lock held; cv_wait() below drops and re-acquires it.
 */
static void
rangelock_enter_writer(rangelock_t *rl, locked_range_t *new)
{
	avl_tree_t *tree = &rl->rl_tree;
	locked_range_t *lr;
	avl_index_t where;
	/* Saved so the range can be restored after the callback mutates it. */
	uint64_t orig_off = new->lr_offset;
	uint64_t orig_len = new->lr_length;
	rangelock_type_t orig_type = new->lr_type;

	for (;;) {
		/*
		 * Call callback which can modify new->lr_offset, lr_length,
		 * and lr_type.  Note, the callback is used by the ZPL to
		 * handle appending and changing blocksizes.  It isn't needed
		 * for zvols.
		 */
		if (rl->rl_cb != NULL) {
			rl->rl_cb(new, rl->rl_arg);
		}

		/*
		 * If the type was APPEND, the callback must convert it to
		 * WRITER.
		 */
		ASSERT3U(new->lr_type, ==, RL_WRITER);

		/*
		 * First check for the usual case of no locks.
		 */
		if (avl_numnodes(tree) == 0) {
			avl_add(tree, new);
			return;
		}

		/*
		 * Look for any locks in the range.
		 */
		lr = avl_find(tree, new, &where);
		if (lr != NULL)
			goto wait; /* already locked at same offset */

		/* Nearest range starting after us must not begin inside us. */
		lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
		if (lr != NULL &&
		    lr->lr_offset < new->lr_offset + new->lr_length)
			goto wait;

		/* Nearest range starting before us must not extend into us. */
		lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
		if (lr != NULL &&
		    lr->lr_offset + lr->lr_length > new->lr_offset)
			goto wait;

		/* No overlap: insert at the position located by avl_find(). */
		avl_insert(tree, new, where);
		return;
wait:
		/* Lazily create the writer cv the first time it is needed. */
		if (!lr->lr_write_wanted) {
			cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
			lr->lr_write_wanted = B_TRUE;
		}
		cv_wait(&lr->lr_write_cv, &rl->rl_lock);

		/* reset to original request and retry from scratch */
		new->lr_offset = orig_off;
		new->lr_length = orig_len;
		new->lr_type = orig_type;
	}
}
206 | ||
207 | /* | |
208 | * If this is an original (non-proxy) lock then replace it by | |
209 | * a proxy and return the proxy. | |
210 | */ | |
5d43cc9a MA |
211 | static locked_range_t * |
212 | rangelock_proxify(avl_tree_t *tree, locked_range_t *lr) | |
34dc7c2f | 213 | { |
5d43cc9a | 214 | locked_range_t *proxy; |
34dc7c2f | 215 | |
5d43cc9a MA |
216 | if (lr->lr_proxy) |
217 | return (lr); /* already a proxy */ | |
34dc7c2f | 218 | |
5d43cc9a MA |
219 | ASSERT3U(lr->lr_count, ==, 1); |
220 | ASSERT(lr->lr_write_wanted == B_FALSE); | |
221 | ASSERT(lr->lr_read_wanted == B_FALSE); | |
222 | avl_remove(tree, lr); | |
223 | lr->lr_count = 0; | |
34dc7c2f BB |
224 | |
225 | /* create a proxy range lock */ | |
5d43cc9a MA |
226 | proxy = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); |
227 | proxy->lr_offset = lr->lr_offset; | |
228 | proxy->lr_length = lr->lr_length; | |
229 | proxy->lr_count = 1; | |
230 | proxy->lr_type = RL_READER; | |
231 | proxy->lr_proxy = B_TRUE; | |
232 | proxy->lr_write_wanted = B_FALSE; | |
233 | proxy->lr_read_wanted = B_FALSE; | |
34dc7c2f BB |
234 | avl_add(tree, proxy); |
235 | ||
236 | return (proxy); | |
237 | } | |
238 | ||
/*
 * Split the range lock at the supplied offset
 * returning the *front* proxy.  The rear half becomes a new proxy that
 * inherits the reference count of the range being split.
 */
static locked_range_t *
rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off)
{
	/* The split point must fall strictly inside the existing range. */
	ASSERT3U(lr->lr_length, >, 1);
	ASSERT3U(off, >, lr->lr_offset);
	ASSERT3U(off, <, lr->lr_offset + lr->lr_length);
	/* Only reader ranges with no waiters are ever split. */
	ASSERT(lr->lr_write_wanted == B_FALSE);
	ASSERT(lr->lr_read_wanted == B_FALSE);

	/* create the rear proxy range lock */
	locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
	rear->lr_offset = off;
	rear->lr_length = lr->lr_offset + lr->lr_length - off;
	rear->lr_count = lr->lr_count;
	rear->lr_type = RL_READER;
	rear->lr_proxy = B_TRUE;
	rear->lr_write_wanted = B_FALSE;
	rear->lr_read_wanted = B_FALSE;

	/* Turn the original into the front proxy and trim its length. */
	locked_range_t *front = rangelock_proxify(tree, lr);
	front->lr_length = off - lr->lr_offset;

	/* The rear proxy sorts immediately after the front one. */
	avl_insert_here(tree, rear, front, AVL_AFTER);
	return (front);
}
268 | ||
269 | /* | |
270 | * Create and add a new proxy range lock for the supplied range. | |
271 | */ | |
272 | static void | |
5d43cc9a | 273 | rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) |
34dc7c2f | 274 | { |
5d43cc9a MA |
275 | ASSERT(len != 0); |
276 | locked_range_t *lr = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); | |
277 | lr->lr_offset = off; | |
278 | lr->lr_length = len; | |
279 | lr->lr_count = 1; | |
280 | lr->lr_type = RL_READER; | |
281 | lr->lr_proxy = B_TRUE; | |
282 | lr->lr_write_wanted = B_FALSE; | |
283 | lr->lr_read_wanted = B_FALSE; | |
284 | avl_add(tree, lr); | |
34dc7c2f BB |
285 | } |
286 | ||
/*
 * Add the reader lock 'new' covering [new->lr_offset, +lr_length) to the
 * tree.  Overlaps with existing reader ranges are resolved by converting
 * them to reference-counted proxies, splitting them and creating gap
 * proxies as needed, so that the whole requested range ends up covered by
 * proxies whose counts have been bumped.
 */
static void
rangelock_add_reader(avl_tree_t *tree, locked_range_t *new,
    locked_range_t *prev, avl_index_t where)
{
	locked_range_t *next;
	uint64_t off = new->lr_offset;
	uint64_t len = new->lr_length;

	/*
	 * prev arrives either:
	 * - pointing to an entry at the same offset
	 * - pointing to the entry with the closest previous offset whose
	 *   range may overlap with the new range
	 * - null, if there were no ranges starting before the new one
	 */
	if (prev != NULL) {
		if (prev->lr_offset + prev->lr_length <= off) {
			/* previous range ends before us: no overlap */
			prev = NULL;
		} else if (prev->lr_offset != off) {
			/*
			 * convert to proxy if needed then
			 * split this entry and bump ref count
			 */
			prev = rangelock_split(tree, prev, off);
			prev = AVL_NEXT(tree, prev); /* move to rear range */
		}
	}
	ASSERT((prev == NULL) || (prev->lr_offset == off));

	/* First candidate overlapping range starting at or after 'off'. */
	if (prev != NULL)
		next = prev;
	else
		next = avl_nearest(tree, where, AVL_AFTER);

	if (next == NULL || off + len <= next->lr_offset) {
		/* no overlaps, use the original new lock in the tree */
		avl_insert(tree, new, where);
		return;
	}

	if (off < next->lr_offset) {
		/* Add a proxy for initial range before the overlap */
		rangelock_new_proxy(tree, off, next->lr_offset - off);
	}

	new->lr_count = 0; /* will use proxies in tree */
	/*
	 * We now search forward through the ranges, until we go past the end
	 * of the new range.  For each entry we make it a proxy if it
	 * isn't already, then bump its reference count.  If there's any
	 * gaps between the ranges then we create a new proxy range.
	 */
	for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
		if (off + len <= next->lr_offset)
			break;
		if (prev != NULL && prev->lr_offset + prev->lr_length <
		    next->lr_offset) {
			/* there's a gap between prev and next: fill it */
			ASSERT3U(next->lr_offset, >,
			    prev->lr_offset + prev->lr_length);
			rangelock_new_proxy(tree,
			    prev->lr_offset + prev->lr_length,
			    next->lr_offset -
			    (prev->lr_offset + prev->lr_length));
		}
		if (off + len == next->lr_offset + next->lr_length) {
			/* exact overlap with end */
			next = rangelock_proxify(tree, next);
			next->lr_count++;
			return;
		}
		if (off + len < next->lr_offset + next->lr_length) {
			/* new range ends in the middle of this block */
			next = rangelock_split(tree, next, off + len);
			next->lr_count++;
			return;
		}
		/* new range extends past this entry: take it whole */
		ASSERT3U(off + len, >, next->lr_offset + next->lr_length);
		next = rangelock_proxify(tree, next);
		next->lr_count++;
	}

	/* Add the remaining end range. */
	rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length,
	    (off + len) - (prev->lr_offset + prev->lr_length));
}
373 | ||
/*
 * Check if a reader lock can be grabbed, or wait and recheck until available.
 * Called with rl->rl_lock held; cv_wait() below drops and re-acquires it.
 */
static void
rangelock_enter_reader(rangelock_t *rl, locked_range_t *new)
{
	avl_tree_t *tree = &rl->rl_tree;
	locked_range_t *prev, *next;
	avl_index_t where;
	uint64_t off = new->lr_offset;
	uint64_t len = new->lr_length;

	/*
	 * Look for any writer locks in the range.
	 */
retry:
	prev = avl_find(tree, new, &where);
	if (prev == NULL)
		prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);

	/*
	 * Check the previous range for a writer lock overlap.
	 */
	if (prev && (off < prev->lr_offset + prev->lr_length)) {
		if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
			/* Writer holds or wants this range: wait our turn. */
			if (!prev->lr_read_wanted) {
				cv_init(&prev->lr_read_cv,
				    NULL, CV_DEFAULT, NULL);
				prev->lr_read_wanted = B_TRUE;
			}
			cv_wait(&prev->lr_read_cv, &rl->rl_lock);
			goto retry;
		}
		if (off + len < prev->lr_offset + prev->lr_length)
			/* new range fully inside this reader range */
			goto got_lock;
	}

	/*
	 * Search through the following ranges to see if there's
	 * any overlapping write lock (held or wanted).
	 */
	if (prev != NULL)
		next = AVL_NEXT(tree, prev);
	else
		next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
	for (; next != NULL; next = AVL_NEXT(tree, next)) {
		if (off + len <= next->lr_offset)
			goto got_lock;
		if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
			if (!next->lr_read_wanted) {
				cv_init(&next->lr_read_cv,
				    NULL, CV_DEFAULT, NULL);
				next->lr_read_wanted = B_TRUE;
			}
			cv_wait(&next->lr_read_cv, &rl->rl_lock);
			goto retry;
		}
		if (off + len <= next->lr_offset + next->lr_length)
			goto got_lock;
	}

got_lock:
	/*
	 * Add the read lock, which may involve splitting existing
	 * locks and bumping ref counts (lr_count).
	 */
	rangelock_add_reader(tree, new, prev, where);
}
442 | ||
443 | /* | |
5d43cc9a MA |
444 | * Lock a range (offset, length) as either shared (RL_READER) or exclusive |
445 | * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert | |
446 | * it to a RL_WRITER lock (with the offset at the end of the file). Returns | |
447 | * the range lock structure for later unlocking (or reduce range if the | |
448 | * entire file is locked as RL_WRITER). | |
34dc7c2f | 449 | */ |
5d43cc9a MA |
450 | locked_range_t * |
451 | rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len, | |
452 | rangelock_type_t type) | |
34dc7c2f | 453 | { |
34dc7c2f BB |
454 | ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); |
455 | ||
5d43cc9a MA |
456 | locked_range_t *new = kmem_alloc(sizeof (locked_range_t), KM_SLEEP); |
457 | new->lr_rangelock = rl; | |
458 | new->lr_offset = off; | |
d164b209 BB |
459 | if (len + off < off) /* overflow */ |
460 | len = UINT64_MAX - off; | |
5d43cc9a MA |
461 | new->lr_length = len; |
462 | new->lr_count = 1; /* assume it's going to be in the tree */ | |
463 | new->lr_type = type; | |
464 | new->lr_proxy = B_FALSE; | |
465 | new->lr_write_wanted = B_FALSE; | |
466 | new->lr_read_wanted = B_FALSE; | |
467 | ||
468 | mutex_enter(&rl->rl_lock); | |
34dc7c2f BB |
469 | if (type == RL_READER) { |
470 | /* | |
471 | * First check for the usual case of no locks | |
472 | */ | |
5d43cc9a MA |
473 | if (avl_numnodes(&rl->rl_tree) == 0) |
474 | avl_add(&rl->rl_tree, new); | |
34dc7c2f | 475 | else |
5d43cc9a MA |
476 | rangelock_enter_reader(rl, new); |
477 | } else | |
478 | rangelock_enter_writer(rl, new); /* RL_WRITER or RL_APPEND */ | |
479 | mutex_exit(&rl->rl_lock); | |
34dc7c2f BB |
480 | return (new); |
481 | } | |
482 | ||
5d43cc9a MA |
483 | /* |
484 | * Safely free the locked_range_t. | |
485 | */ | |
8926ab7a | 486 | static void |
5d43cc9a | 487 | rangelock_free(locked_range_t *lr) |
8926ab7a | 488 | { |
5d43cc9a MA |
489 | if (lr->lr_write_wanted) |
490 | cv_destroy(&lr->lr_write_cv); | |
8926ab7a | 491 | |
5d43cc9a MA |
492 | if (lr->lr_read_wanted) |
493 | cv_destroy(&lr->lr_read_cv); | |
8926ab7a | 494 | |
5d43cc9a | 495 | kmem_free(lr, sizeof (locked_range_t)); |
8926ab7a BB |
496 | } |
497 | ||
34dc7c2f BB |
/*
 * Unlock a reader lock.  Ranges whose reference count drops to zero are
 * removed from the tree, have any waiters woken, and are queued on
 * free_list for the caller to destroy after dropping rl->rl_lock.
 */
static void
rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove,
    list_t *free_list)
{
	avl_tree_t *tree = &rl->rl_tree;
	uint64_t len;

	/*
	 * The common case is when the remove entry is in the tree
	 * (lr_count == 1) meaning there's been no other reader locks
	 * overlapping with this one.  Otherwise the remove entry will have
	 * been removed from the tree and replaced by proxies (one or
	 * more ranges mapping to the entire range).
	 */
	if (remove->lr_count == 1) {
		avl_remove(tree, remove);
		/* Wake writers first, then readers. */
		if (remove->lr_write_wanted)
			cv_broadcast(&remove->lr_write_cv);
		if (remove->lr_read_wanted)
			cv_broadcast(&remove->lr_read_cv);
		list_insert_tail(free_list, remove);
	} else {
		/* The handle itself never has waiters or references. */
		ASSERT0(remove->lr_count);
		ASSERT0(remove->lr_write_wanted);
		ASSERT0(remove->lr_read_wanted);
		/*
		 * Find start proxy representing this reader lock,
		 * then decrement ref count on all proxies
		 * that make up this range, freeing them as needed.
		 */
		locked_range_t *lr = avl_find(tree, remove, NULL);
		ASSERT3P(lr, !=, NULL);
		ASSERT3U(lr->lr_count, !=, 0);
		ASSERT3U(lr->lr_type, ==, RL_READER);
		locked_range_t *next = NULL;
		/* Walk the contiguous run of proxies covering the range. */
		for (len = remove->lr_length; len != 0; lr = next) {
			len -= lr->lr_length;
			if (len != 0) {
				/* Proxies must tile the range with no gaps. */
				next = AVL_NEXT(tree, lr);
				ASSERT3P(next, !=, NULL);
				ASSERT3U(lr->lr_offset + lr->lr_length, ==,
				    next->lr_offset);
				ASSERT3U(next->lr_count, !=, 0);
				ASSERT3U(next->lr_type, ==, RL_READER);
			}
			lr->lr_count--;
			if (lr->lr_count == 0) {
				/* Last reference: remove, wake, defer free. */
				avl_remove(tree, lr);
				if (lr->lr_write_wanted)
					cv_broadcast(&lr->lr_write_cv);
				if (lr->lr_read_wanted)
					cv_broadcast(&lr->lr_read_cv);
				list_insert_tail(free_list, lr);
			}
		}
		/* The caller's handle was never in the tree: free directly. */
		kmem_free(remove, sizeof (locked_range_t));
	}
}
559 | ||
/*
 * Unlock range and destroy range lock structure.
 */
void
rangelock_exit(locked_range_t *lr)
{
	rangelock_t *rl = lr->lr_rangelock;
	list_t free_list;
	locked_range_t *free_lr;

	/* Only caller-held (non-proxy) reader/writer handles are exited. */
	ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER);
	ASSERT(lr->lr_count == 1 || lr->lr_count == 0);
	ASSERT(!lr->lr_proxy);

	/*
	 * The free list is used to defer the cv_destroy() and
	 * subsequent kmem_free until after the mutex is dropped.
	 */
	list_create(&free_list, sizeof (locked_range_t),
	    offsetof(locked_range_t, lr_node));

	mutex_enter(&rl->rl_lock);
	if (lr->lr_type == RL_WRITER) {
		/* writer locks can't be shared or split */
		avl_remove(&rl->rl_tree, lr);
		/* Wake writers first, then readers. */
		if (lr->lr_write_wanted)
			cv_broadcast(&lr->lr_write_cv);
		if (lr->lr_read_wanted)
			cv_broadcast(&lr->lr_read_cv);
		list_insert_tail(&free_list, lr);
	} else {
		/*
		 * lock may be shared, let rangelock_exit_reader()
		 * release the lock and free the locked_range_t.
		 */
		rangelock_exit_reader(rl, lr, &free_list);
	}
	mutex_exit(&rl->rl_lock);

	/* Now that the mutex is dropped, destroy the deferred entries. */
	while ((free_lr = list_remove_head(&free_list)) != NULL)
		rangelock_free(free_lr);

	list_destroy(&free_list);
}
604 | ||
605 | /* | |
606 | * Reduce range locked as RL_WRITER from whole file to specified range. | |
5d43cc9a | 607 | * Asserts the whole file is exclusively locked and so there's only one |
34dc7c2f BB |
608 | * entry in the tree. |
609 | */ | |
610 | void | |
5d43cc9a | 611 | rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len) |
34dc7c2f | 612 | { |
5d43cc9a | 613 | rangelock_t *rl = lr->lr_rangelock; |
34dc7c2f BB |
614 | |
615 | /* Ensure there are no other locks */ | |
5d43cc9a MA |
616 | ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1); |
617 | ASSERT3U(lr->lr_offset, ==, 0); | |
618 | ASSERT3U(lr->lr_type, ==, RL_WRITER); | |
619 | ASSERT(!lr->lr_proxy); | |
620 | ASSERT3U(lr->lr_length, ==, UINT64_MAX); | |
621 | ASSERT3U(lr->lr_count, ==, 1); | |
622 | ||
623 | mutex_enter(&rl->rl_lock); | |
624 | lr->lr_offset = off; | |
625 | lr->lr_length = len; | |
626 | mutex_exit(&rl->rl_lock); | |
627 | if (lr->lr_write_wanted) | |
628 | cv_broadcast(&lr->lr_write_cv); | |
629 | if (lr->lr_read_wanted) | |
630 | cv_broadcast(&lr->lr_read_cv); | |
34dc7c2f | 631 | } |
d88895a0 | 632 | |
5d43cc9a MA |
#if defined(_KERNEL)
/* Make the rangelock entry points visible to other kernel modules. */
EXPORT_SYMBOL(rangelock_init);
EXPORT_SYMBOL(rangelock_fini);
EXPORT_SYMBOL(rangelock_enter);
EXPORT_SYMBOL(rangelock_exit);
EXPORT_SYMBOL(rangelock_reduce);
#endif