/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
 */

/*
 * This file contains the code to implement file range locking in
 * ZFS, although there isn't much specific to ZFS (all that comes to mind is
 * support for growing the blocksize).
 *
 * Interface
 * ---------
 * Defined in zfs_rlock.h but essentially:
 *	lr = rangelock_enter(rl, off, len, lock_type);
 *	rangelock_reduce(lr, off, len); // optional
 *	rangelock_exit(lr);
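 *
 * For example, a hypothetical caller write-locking the first 4K of a
 * file (assuming a znode field named z_rangelock; the surrounding I/O
 * is elided):
 *
 *	locked_range_t *lr = rangelock_enter(&zp->z_rangelock, 0, 4096,
 *	    RL_WRITER);
 *	... modify the range ...
 *	rangelock_exit(lr);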
 *
 * AVL tree
 * --------
 * An AVL tree is used to maintain the state of the existing ranges
 * that are locked for exclusive (writer) or shared (reader) use.
 * The starting range offset is used for searching and sorting the tree.
 *
 * Common case
 * -----------
 * The (hopefully) usual case is of no overlaps or contention for locks. On
 * entry to rangelock_enter(), a locked_range_t is allocated; the tree is
 * searched and, if no overlap is found, *this* locked_range_t is placed
 * in the tree.
 *
 * Overlaps/Reference counting/Proxy locks
 * ---------------------------------------
 * The AVL code only allows one node at a particular offset. Also it's very
 * inefficient to search through all previous entries looking for overlaps
 * (because the very first entry in the ordered list might be at offset 0
 * but cover the whole file).
 * So this implementation uses reference counts and proxy range locks.
 * Firstly, only reader locks use reference counts and proxy locks,
 * because writer locks are exclusive.
 * When a reader lock overlaps with another then a proxy lock is created
 * for that range and replaces the original lock. If the overlap
 * is exact then the reference count of the proxy is simply incremented.
 * Otherwise, the proxy lock is split into smaller lock ranges and
 * new proxy locks created for non-overlapping ranges.
 * The reference counts are adjusted accordingly.
 * Meanwhile, the original lock is kept around (this is the caller's handle)
 * and its offset and length are used when releasing the lock.
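 *
 * For example (a sketch of the mechanism): if reader A holds [0, 10)
 * and reader B then requests [5, 15), A's entry is replaced in the
 * tree by proxies [0, 5) (count 1) and [5, 10) (count 2), and a new
 * proxy [10, 15) (count 1) is added. A's and B's original
 * locked_range_t structures live on outside the tree as the handles
 * passed to rangelock_exit().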
 *
 * Thread coordination
 * -------------------
 * In order to make wakeups efficient and to ensure multiple continuous
 * readers on a range don't starve a writer for the same range lock,
 * two condition variables are allocated in each locked_range_t.
 * If a writer (or reader) can't get a range it initialises the writer
 * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
 * and waits on that cv. When a thread unlocks that range it wakes up all
 * writers then all readers before destroying the lock.
 *
 * Append mode writes
 * ------------------
 * Append mode writes need to lock a range at the end of a file.
 * The end-of-file offset is determined under the range locking mutex,
 * the lock type is converted from RL_APPEND to RL_WRITER, and the
 * range is locked.
 *
 * Grow block handling
 * -------------------
 * ZFS supports multiple block sizes, up to 16MB. A file initially uses
 * the smallest block size, which is grown as needed. During this
 * growth all other writers and readers must be excluded.
 * So if the block size needs to be grown then the whole file is
 * exclusively locked, then later the caller will reduce the lock
 * range to just the range to be written using rangelock_reduce().
 */

#include <sys/zfs_context.h>
#include <sys/zfs_rlock.h>

/*
 * AVL comparison function used to order range locks.
 * Locks are ordered on the start offset of the range.
 */
static int
rangelock_compare(const void *arg1, const void *arg2)
{
	const locked_range_t *rl1 = (const locked_range_t *)arg1;
	const locked_range_t *rl2 = (const locked_range_t *)arg2;

	return (AVL_CMP(rl1->lr_offset, rl2->lr_offset));
}

/*
 * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock.
 * It must convert RL_APPEND to RL_WRITER (starting at the end of the file),
 * and may increase the range that's locked for RL_WRITER.
 */
void
rangelock_init(rangelock_t *rl, rangelock_cb_t *cb, void *arg)
{
	mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&rl->rl_tree, rangelock_compare,
	    sizeof (locked_range_t), offsetof(locked_range_t, lr_node));
	rl->rl_cb = cb;
	rl->rl_arg = arg;
}
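
/*
 * A minimal sketch of such a callback, assuming a hypothetical consumer
 * type with a file-size field f_size (the real ZPL callback also handles
 * growing the blocksize):
 *
 *	static void
 *	my_rangelock_cb(locked_range_t *new, void *arg)
 *	{
 *		my_file_t *f = arg;	// hypothetical consumer type
 *
 *		if (new->lr_type == RL_APPEND) {
 *			new->lr_offset = f->f_size;	// start at EOF
 *			new->lr_type = RL_WRITER;
 *		}
 *	}
 */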

void
rangelock_fini(rangelock_t *rl)
{
	mutex_destroy(&rl->rl_lock);
	avl_destroy(&rl->rl_tree);
}

/*
 * Check if a write lock can be grabbed, or wait and recheck until available.
 */
static void
rangelock_enter_writer(rangelock_t *rl, locked_range_t *new)
{
	avl_tree_t *tree = &rl->rl_tree;
	locked_range_t *lr;
	avl_index_t where;
	uint64_t orig_off = new->lr_offset;
	uint64_t orig_len = new->lr_length;
	rangelock_type_t orig_type = new->lr_type;

	for (;;) {
		/*
		 * Call the callback, which may modify new->lr_offset,
		 * lr_length, and lr_type. Note, the callback is used by
		 * the ZPL to handle appending and changing blocksizes.
		 * It isn't needed for zvols.
		 */
		if (rl->rl_cb != NULL) {
			rl->rl_cb(new, rl->rl_arg);
		}

		/*
		 * If the type was APPEND, the callback must convert it to
		 * WRITER.
		 */
		ASSERT3U(new->lr_type, ==, RL_WRITER);

		/*
		 * First check for the usual case of no locks.
		 */
		if (avl_numnodes(tree) == 0) {
			avl_add(tree, new);
			return;
		}

		/*
		 * Look for any locks in the range.
		 */
		lr = avl_find(tree, new, &where);
		if (lr != NULL)
			goto wait; /* already locked at same offset */

		lr = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
		if (lr != NULL &&
		    lr->lr_offset < new->lr_offset + new->lr_length)
			goto wait;

		lr = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);
		if (lr != NULL &&
		    lr->lr_offset + lr->lr_length > new->lr_offset)
			goto wait;

		avl_insert(tree, new, where);
		return;
wait:
		if (!lr->lr_write_wanted) {
			cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL);
			lr->lr_write_wanted = B_TRUE;
		}
		cv_wait(&lr->lr_write_cv, &rl->rl_lock);

		/* reset to original */
		new->lr_offset = orig_off;
		new->lr_length = orig_len;
		new->lr_type = orig_type;
	}
}

/*
 * If this is an original (non-proxy) lock then replace it by
 * a proxy and return the proxy.
 */
static locked_range_t *
rangelock_proxify(avl_tree_t *tree, locked_range_t *lr)
{
	locked_range_t *proxy;

	if (lr->lr_proxy)
		return (lr); /* already a proxy */

	ASSERT3U(lr->lr_count, ==, 1);
	ASSERT(lr->lr_write_wanted == B_FALSE);
	ASSERT(lr->lr_read_wanted == B_FALSE);
	avl_remove(tree, lr);
	lr->lr_count = 0;

	/* create a proxy range lock */
	proxy = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
	proxy->lr_offset = lr->lr_offset;
	proxy->lr_length = lr->lr_length;
	proxy->lr_count = 1;
	proxy->lr_type = RL_READER;
	proxy->lr_proxy = B_TRUE;
	proxy->lr_write_wanted = B_FALSE;
	proxy->lr_read_wanted = B_FALSE;
	avl_add(tree, proxy);

	return (proxy);
}

/*
 * Split the range lock at the supplied offset
 * returning the *front* proxy.
 */
static locked_range_t *
rangelock_split(avl_tree_t *tree, locked_range_t *lr, uint64_t off)
{
	ASSERT3U(lr->lr_length, >, 1);
	ASSERT3U(off, >, lr->lr_offset);
	ASSERT3U(off, <, lr->lr_offset + lr->lr_length);
	ASSERT(lr->lr_write_wanted == B_FALSE);
	ASSERT(lr->lr_read_wanted == B_FALSE);

	/* create the rear proxy range lock */
	locked_range_t *rear = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
	rear->lr_offset = off;
	rear->lr_length = lr->lr_offset + lr->lr_length - off;
	rear->lr_count = lr->lr_count;
	rear->lr_type = RL_READER;
	rear->lr_proxy = B_TRUE;
	rear->lr_write_wanted = B_FALSE;
	rear->lr_read_wanted = B_FALSE;

	locked_range_t *front = rangelock_proxify(tree, lr);
	front->lr_length = off - lr->lr_offset;

	avl_insert_here(tree, rear, front, AVL_AFTER);
	return (front);
}
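
/*
 * For example (a sketch): splitting a proxy covering [0, 10) with
 * count 2 at offset 4 leaves a front proxy [0, 4) and a rear proxy
 * [4, 10) adjacent in the tree, each carrying count 2.
 */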

/*
 * Create and add a new proxy range lock for the supplied range.
 */
static void
rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len)
{
	ASSERT(len != 0);
	locked_range_t *lr = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
	lr->lr_offset = off;
	lr->lr_length = len;
	lr->lr_count = 1;
	lr->lr_type = RL_READER;
	lr->lr_proxy = B_TRUE;
	lr->lr_write_wanted = B_FALSE;
	lr->lr_read_wanted = B_FALSE;
	avl_add(tree, lr);
}

static void
rangelock_add_reader(avl_tree_t *tree, locked_range_t *new,
    locked_range_t *prev, avl_index_t where)
{
	locked_range_t *next;
	uint64_t off = new->lr_offset;
	uint64_t len = new->lr_length;

	/*
	 * prev arrives either:
	 * - pointing to an entry at the same offset
	 * - pointing to the entry with the closest previous offset whose
	 *   range may overlap with the new range
	 * - null, if there were no ranges starting before the new one
	 */
	if (prev != NULL) {
		if (prev->lr_offset + prev->lr_length <= off) {
			prev = NULL;
		} else if (prev->lr_offset != off) {
			/*
			 * convert to proxy if needed then
			 * split this entry and bump ref count
			 */
			prev = rangelock_split(tree, prev, off);
			prev = AVL_NEXT(tree, prev); /* move to rear range */
		}
	}
	ASSERT((prev == NULL) || (prev->lr_offset == off));

	if (prev != NULL)
		next = prev;
	else
		next = avl_nearest(tree, where, AVL_AFTER);

	if (next == NULL || off + len <= next->lr_offset) {
		/* no overlaps, use the original new locked_range_t */
		avl_insert(tree, new, where);
		return;
	}

	if (off < next->lr_offset) {
		/* Add a proxy for initial range before the overlap */
		rangelock_new_proxy(tree, off, next->lr_offset - off);
	}

	new->lr_count = 0; /* will use proxies in tree */
	/*
	 * We now search forward through the ranges, until we go past the end
	 * of the new range. For each entry we make it a proxy if it
	 * isn't already, then bump its reference count. If there are any
	 * gaps between the ranges then we create a new proxy range.
	 */
	for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) {
		if (off + len <= next->lr_offset)
			break;
		if (prev != NULL && prev->lr_offset + prev->lr_length <
		    next->lr_offset) {
			/* there's a gap */
			ASSERT3U(next->lr_offset, >,
			    prev->lr_offset + prev->lr_length);
			rangelock_new_proxy(tree,
			    prev->lr_offset + prev->lr_length,
			    next->lr_offset -
			    (prev->lr_offset + prev->lr_length));
		}
		if (off + len == next->lr_offset + next->lr_length) {
			/* exact overlap with end */
			next = rangelock_proxify(tree, next);
			next->lr_count++;
			return;
		}
		if (off + len < next->lr_offset + next->lr_length) {
			/* new range ends in the middle of this block */
			next = rangelock_split(tree, next, off + len);
			next->lr_count++;
			return;
		}
		ASSERT3U(off + len, >, next->lr_offset + next->lr_length);
		next = rangelock_proxify(tree, next);
		next->lr_count++;
	}

	/* Add the remaining end range. */
	rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length,
	    (off + len) - (prev->lr_offset + prev->lr_length));
}
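
/*
 * For example (a sketch): adding a reader [0, 20) to a tree holding a
 * single reader lock over [5, 10) yields proxies [0, 5) (count 1),
 * [5, 10) (count 2), and [10, 20) (count 1), while the new
 * locked_range_t itself stays out of the tree with lr_count == 0.
 */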

/*
 * Check if a reader lock can be grabbed, or wait and recheck until available.
 */
static void
rangelock_enter_reader(rangelock_t *rl, locked_range_t *new)
{
	avl_tree_t *tree = &rl->rl_tree;
	locked_range_t *prev, *next;
	avl_index_t where;
	uint64_t off = new->lr_offset;
	uint64_t len = new->lr_length;

	/*
	 * Look for any writer locks in the range.
	 */
retry:
	prev = avl_find(tree, new, &where);
	if (prev == NULL)
		prev = (locked_range_t *)avl_nearest(tree, where, AVL_BEFORE);

	/*
	 * Check the previous range for a writer lock overlap.
	 */
	if (prev && (off < prev->lr_offset + prev->lr_length)) {
		if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) {
			if (!prev->lr_read_wanted) {
				cv_init(&prev->lr_read_cv,
				    NULL, CV_DEFAULT, NULL);
				prev->lr_read_wanted = B_TRUE;
			}
			cv_wait(&prev->lr_read_cv, &rl->rl_lock);
			goto retry;
		}
		if (off + len < prev->lr_offset + prev->lr_length)
			goto got_lock;
	}

	/*
	 * Search through the following ranges to see if there's any
	 * writer lock overlap.
	 */
	if (prev != NULL)
		next = AVL_NEXT(tree, prev);
	else
		next = (locked_range_t *)avl_nearest(tree, where, AVL_AFTER);
	for (; next != NULL; next = AVL_NEXT(tree, next)) {
		if (off + len <= next->lr_offset)
			goto got_lock;
		if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) {
			if (!next->lr_read_wanted) {
				cv_init(&next->lr_read_cv,
				    NULL, CV_DEFAULT, NULL);
				next->lr_read_wanted = B_TRUE;
			}
			cv_wait(&next->lr_read_cv, &rl->rl_lock);
			goto retry;
		}
		if (off + len <= next->lr_offset + next->lr_length)
			goto got_lock;
	}

got_lock:
	/*
	 * Add the read lock, which may involve splitting existing
	 * locks and bumping ref counts (lr_count).
	 */
	rangelock_add_reader(tree, new, prev, where);
}

/*
 * Lock a range (offset, length) as either shared (RL_READER) or exclusive
 * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert
 * it to a RL_WRITER lock (with the offset at the end of the file). Returns
 * the range lock structure for later unlocking (or for reducing the range
 * if the entire file was locked as RL_WRITER).
 */
locked_range_t *
rangelock_enter(rangelock_t *rl, uint64_t off, uint64_t len,
    rangelock_type_t type)
{
	ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND);

	locked_range_t *new = kmem_alloc(sizeof (locked_range_t), KM_SLEEP);
	new->lr_rangelock = rl;
	new->lr_offset = off;
	if (len + off < off)	/* overflow */
		len = UINT64_MAX - off;
	new->lr_length = len;
	new->lr_count = 1; /* assume it's going to be in the tree */
	new->lr_type = type;
	new->lr_proxy = B_FALSE;
	new->lr_write_wanted = B_FALSE;
	new->lr_read_wanted = B_FALSE;

	mutex_enter(&rl->rl_lock);
	if (type == RL_READER) {
		/*
		 * First check for the usual case of no locks.
		 */
		if (avl_numnodes(&rl->rl_tree) == 0)
			avl_add(&rl->rl_tree, new);
		else
			rangelock_enter_reader(rl, new);
	} else
		rangelock_enter_writer(rl, new); /* RL_WRITER or RL_APPEND */
	mutex_exit(&rl->rl_lock);
	return (new);
}
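
/*
 * For example (a hypothetical append-mode caller, again assuming a
 * znode field named z_rangelock): after
 *
 *	lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
 *
 * returns, the callback has converted the lock to RL_WRITER with
 * lr_offset at (or, if the blocksize must grow, before) the end of
 * file, so the caller can read back where its append region begins.
 */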

/*
 * Safely free the locked_range_t.
 */
static void
rangelock_free(locked_range_t *lr)
{
	if (lr->lr_write_wanted)
		cv_destroy(&lr->lr_write_cv);

	if (lr->lr_read_wanted)
		cv_destroy(&lr->lr_read_cv);

	kmem_free(lr, sizeof (locked_range_t));
}

/*
 * Unlock a reader lock.
 */
static void
rangelock_exit_reader(rangelock_t *rl, locked_range_t *remove,
    list_t *free_list)
{
	avl_tree_t *tree = &rl->rl_tree;
	uint64_t len;

	/*
	 * The common case is when the remove entry is in the tree
	 * (lr_count == 1) meaning there have been no other reader locks
	 * overlapping with this one. Otherwise the remove entry will have
	 * been removed from the tree and replaced by proxies (one or
	 * more ranges mapping to the entire range).
	 */
	if (remove->lr_count == 1) {
		avl_remove(tree, remove);
		if (remove->lr_write_wanted)
			cv_broadcast(&remove->lr_write_cv);
		if (remove->lr_read_wanted)
			cv_broadcast(&remove->lr_read_cv);
		list_insert_tail(free_list, remove);
	} else {
		ASSERT0(remove->lr_count);
		ASSERT0(remove->lr_write_wanted);
		ASSERT0(remove->lr_read_wanted);
		/*
		 * Find start proxy representing this reader lock,
		 * then decrement ref count on all proxies
		 * that make up this range, freeing them as needed.
		 */
		locked_range_t *lr = avl_find(tree, remove, NULL);
		ASSERT3P(lr, !=, NULL);
		ASSERT3U(lr->lr_count, !=, 0);
		ASSERT3U(lr->lr_type, ==, RL_READER);
		locked_range_t *next = NULL;
		for (len = remove->lr_length; len != 0; lr = next) {
			len -= lr->lr_length;
			if (len != 0) {
				next = AVL_NEXT(tree, lr);
				ASSERT3P(next, !=, NULL);
				ASSERT3U(lr->lr_offset + lr->lr_length, ==,
				    next->lr_offset);
				ASSERT3U(next->lr_count, !=, 0);
				ASSERT3U(next->lr_type, ==, RL_READER);
			}
			lr->lr_count--;
			if (lr->lr_count == 0) {
				avl_remove(tree, lr);
				if (lr->lr_write_wanted)
					cv_broadcast(&lr->lr_write_cv);
				if (lr->lr_read_wanted)
					cv_broadcast(&lr->lr_read_cv);
				list_insert_tail(free_list, lr);
			}
		}
		kmem_free(remove, sizeof (locked_range_t));
	}
}
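
/*
 * Continuing the earlier proxy sketch: when reader A (handle [0, 10),
 * lr_count == 0) exits, the walk starts at proxy [0, 5), which drops
 * to count 0 and is freed, then moves to [5, 10), which drops from
 * count 2 to 1 and stays in the tree for reader B.
 */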

/*
 * Unlock range and destroy range lock structure.
 */
void
rangelock_exit(locked_range_t *lr)
{
	rangelock_t *rl = lr->lr_rangelock;
	list_t free_list;
	locked_range_t *free_lr;

	ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER);
	ASSERT(lr->lr_count == 1 || lr->lr_count == 0);
	ASSERT(!lr->lr_proxy);

	/*
	 * The free list is used to defer the cv_destroy() and
	 * subsequent kmem_free until after the mutex is dropped.
	 */
	list_create(&free_list, sizeof (locked_range_t),
	    offsetof(locked_range_t, lr_node));

	mutex_enter(&rl->rl_lock);
	if (lr->lr_type == RL_WRITER) {
		/* writer locks can't be shared or split */
		avl_remove(&rl->rl_tree, lr);
		if (lr->lr_write_wanted)
			cv_broadcast(&lr->lr_write_cv);
		if (lr->lr_read_wanted)
			cv_broadcast(&lr->lr_read_cv);
		list_insert_tail(&free_list, lr);
	} else {
		/*
		 * lock may be shared, let rangelock_exit_reader()
		 * release the lock and free the locked_range_t.
		 */
		rangelock_exit_reader(rl, lr, &free_list);
	}
	mutex_exit(&rl->rl_lock);

	while ((free_lr = list_remove_head(&free_list)) != NULL)
		rangelock_free(free_lr);

	list_destroy(&free_list);
}

/*
 * Reduce range locked as RL_WRITER from whole file to specified range.
 * Asserts the whole file is exclusively locked and so there's only one
 * entry in the tree.
 */
void
rangelock_reduce(locked_range_t *lr, uint64_t off, uint64_t len)
{
	rangelock_t *rl = lr->lr_rangelock;

	/* Ensure there are no other locks */
	ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1);
	ASSERT3U(lr->lr_offset, ==, 0);
	ASSERT3U(lr->lr_type, ==, RL_WRITER);
	ASSERT(!lr->lr_proxy);
	ASSERT3U(lr->lr_length, ==, UINT64_MAX);
	ASSERT3U(lr->lr_count, ==, 1);

	mutex_enter(&rl->rl_lock);
	lr->lr_offset = off;
	lr->lr_length = len;
	mutex_exit(&rl->rl_lock);
	if (lr->lr_write_wanted)
		cv_broadcast(&lr->lr_write_cv);
	if (lr->lr_read_wanted)
		cv_broadcast(&lr->lr_read_cv);
}
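
/*
 * Sketch of the grow-block pattern described at the top of this file
 * (a hypothetical caller; the blocksize-growing step itself is elided):
 *
 *	lr = rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
 *	... grow the blocksize under the exclusive whole-file lock ...
 *	rangelock_reduce(lr, write_off, write_len);
 *	... perform the write ...
 *	rangelock_exit(lr);
 */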

#if defined(_KERNEL)
EXPORT_SYMBOL(rangelock_init);
EXPORT_SYMBOL(rangelock_fini);
EXPORT_SYMBOL(rangelock_enter);
EXPORT_SYMBOL(rangelock_exit);
EXPORT_SYMBOL(rangelock_reduce);
#endif