/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

/*
 * This file contains the code to implement file range locking in
 * ZFS, although there isn't much specific to ZFS (all that comes to mind is
 * support for growing the blocksize).
 *
 * Interface
 * ---------
 * Defined in zfs_rlock.h but essentially:
 *	rl = zfs_range_lock(zp, off, len, lock_type);
 *	zfs_range_unlock(rl);
 *	zfs_range_reduce(rl, off, len);
 *
 * AVL tree
 * --------
 * An AVL tree is used to maintain the state of the existing ranges
 * that are locked for exclusive (writer) or shared (reader) use.
 * The starting range offset is used for searching and sorting the tree.
 *
 * Common case
 * -----------
 * The (hopefully) usual case is of no overlaps or contention for
 * locks. On entry to zfs_lock_range() a rl_t is allocated; the tree
 * searched that finds no overlap, and *this* rl_t is placed in the tree.
 *
 * Overlaps/Reference counting/Proxy locks
 * ---------------------------------------
 * The avl code only allows one node at a particular offset. Also it's very
 * inefficient to search through all previous entries looking for overlaps
 * (because the very 1st in the ordered list might be at offset 0 but
 * cover the whole file).
 * So this implementation uses reference counts and proxy range locks.
 * Firstly, only reader locks use reference counts and proxy locks,
 * because writer locks are exclusive.
 * When a reader lock overlaps with another then a proxy lock is created
 * for that range and replaces the original lock. If the overlap
 * is exact then the reference count of the proxy is simply incremented.
 * Otherwise, the proxy lock is split into smaller lock ranges and
 * new proxy locks created for non overlapping ranges.
 * The reference counts are adjusted accordingly.
 * Meanwhile, the original lock is kept around (this is the callers handle)
 * and its offset and length are used when releasing the lock.
 *
 * Thread coordination
 * -------------------
 * In order to make wakeups efficient and to ensure multiple continuous
 * readers on a range don't starve a writer for the same range lock,
 * two condition variables are allocated in each rl_t.
 * If a writer (or reader) can't get a range it initialises the writer
 * (or reader) cv; sets a flag saying there's a writer (or reader) waiting;
 * and waits on that cv. When a thread unlocks that range it wakes up all
 * writers then all readers before destroying the lock.
 *
 * Append mode writes
 * ------------------
 * Append mode writes need to lock a range at the end of a file.
 * The offset of the end of the file is determined under the
 * range locking mutex, and the lock type converted from RL_APPEND to
 * RL_WRITER and the range locked.
 *
 * Grow block handling
 * -------------------
 * ZFS supports multiple block sizes currently up to 128K. The smallest
 * block size is used for the file which is grown as needed. During this
 * growth all other writers and readers must be excluded.
 * So if the block size needs to be grown then the whole file is
 * exclusively locked, then later the caller will reduce the lock
 * range to just the range to be written using zfs_reduce_range.
 */
#include <sys/zfs_rlock.h>
#include <sys/sysmacros.h>

101 | /* | |
102 | * Check if a write lock can be grabbed, or wait and recheck until available. | |
103 | */ | |
104 | static void | |
d88895a0 | 105 | zfs_range_lock_writer(zfs_rlock_t *zrl, rl_t *new) |
34dc7c2f | 106 | { |
d88895a0 | 107 | avl_tree_t *tree = &zrl->zr_avl; |
34dc7c2f BB |
108 | rl_t *rl; |
109 | avl_index_t where; | |
110 | uint64_t end_size; | |
111 | uint64_t off = new->r_off; | |
112 | uint64_t len = new->r_len; | |
113 | ||
114 | for (;;) { | |
115 | /* | |
d88895a0 CC |
116 | * Range locking is also used by zvol. However, for zvol, we |
117 | * don't need to append or grow blocksize, so skip that | |
118 | * processing. | |
34dc7c2f BB |
119 | * |
120 | * Yes, this is ugly, and would be solved by not handling | |
121 | * grow or append in range lock code. If that was done then | |
122 | * we could make the range locking code generically available | |
123 | * to other non-zfs consumers. | |
124 | */ | |
d88895a0 | 125 | if (zrl->zr_size) { /* caller is ZPL */ |
34dc7c2f BB |
126 | /* |
127 | * If in append mode pick up the current end of file. | |
128 | * This is done under z_range_lock to avoid races. | |
129 | */ | |
130 | if (new->r_type == RL_APPEND) | |
d88895a0 | 131 | new->r_off = *zrl->zr_size; |
34dc7c2f BB |
132 | |
133 | /* | |
134 | * If we need to grow the block size then grab the whole | |
135 | * file range. This is also done under z_range_lock to | |
136 | * avoid races. | |
137 | */ | |
d88895a0 CC |
138 | end_size = MAX(*zrl->zr_size, new->r_off + len); |
139 | if (end_size > *zrl->zr_blksz && | |
140 | (!ISP2(*zrl->zr_blksz) || | |
141 | *zrl->zr_blksz < *zrl->zr_max_blksz)) { | |
34dc7c2f BB |
142 | new->r_off = 0; |
143 | new->r_len = UINT64_MAX; | |
144 | } | |
145 | } | |
146 | ||
147 | /* | |
148 | * First check for the usual case of no locks | |
149 | */ | |
150 | if (avl_numnodes(tree) == 0) { | |
151 | new->r_type = RL_WRITER; /* convert to writer */ | |
152 | avl_add(tree, new); | |
153 | return; | |
154 | } | |
155 | ||
156 | /* | |
157 | * Look for any locks in the range. | |
158 | */ | |
159 | rl = avl_find(tree, new, &where); | |
160 | if (rl) | |
161 | goto wait; /* already locked at same offset */ | |
162 | ||
163 | rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER); | |
164 | if (rl && (rl->r_off < new->r_off + new->r_len)) | |
165 | goto wait; | |
166 | ||
167 | rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); | |
168 | if (rl && rl->r_off + rl->r_len > new->r_off) | |
169 | goto wait; | |
170 | ||
171 | new->r_type = RL_WRITER; /* convert possible RL_APPEND */ | |
172 | avl_insert(tree, new, where); | |
173 | return; | |
174 | wait: | |
175 | if (!rl->r_write_wanted) { | |
176 | cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); | |
177 | rl->r_write_wanted = B_TRUE; | |
178 | } | |
d88895a0 | 179 | cv_wait(&rl->r_wr_cv, &zrl->zr_mutex); |
34dc7c2f BB |
180 | |
181 | /* reset to original */ | |
182 | new->r_off = off; | |
183 | new->r_len = len; | |
184 | } | |
185 | } | |
186 | ||
187 | /* | |
188 | * If this is an original (non-proxy) lock then replace it by | |
189 | * a proxy and return the proxy. | |
190 | */ | |
191 | static rl_t * | |
192 | zfs_range_proxify(avl_tree_t *tree, rl_t *rl) | |
193 | { | |
194 | rl_t *proxy; | |
195 | ||
196 | if (rl->r_proxy) | |
197 | return (rl); /* already a proxy */ | |
198 | ||
199 | ASSERT3U(rl->r_cnt, ==, 1); | |
200 | ASSERT(rl->r_write_wanted == B_FALSE); | |
201 | ASSERT(rl->r_read_wanted == B_FALSE); | |
202 | avl_remove(tree, rl); | |
203 | rl->r_cnt = 0; | |
204 | ||
205 | /* create a proxy range lock */ | |
79c76d5b | 206 | proxy = kmem_alloc(sizeof (rl_t), KM_SLEEP); |
34dc7c2f BB |
207 | proxy->r_off = rl->r_off; |
208 | proxy->r_len = rl->r_len; | |
209 | proxy->r_cnt = 1; | |
210 | proxy->r_type = RL_READER; | |
211 | proxy->r_proxy = B_TRUE; | |
212 | proxy->r_write_wanted = B_FALSE; | |
213 | proxy->r_read_wanted = B_FALSE; | |
214 | avl_add(tree, proxy); | |
215 | ||
216 | return (proxy); | |
217 | } | |
218 | ||
219 | /* | |
220 | * Split the range lock at the supplied offset | |
221 | * returning the *front* proxy. | |
222 | */ | |
223 | static rl_t * | |
224 | zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) | |
225 | { | |
226 | rl_t *front, *rear; | |
227 | ||
228 | ASSERT3U(rl->r_len, >, 1); | |
229 | ASSERT3U(off, >, rl->r_off); | |
230 | ASSERT3U(off, <, rl->r_off + rl->r_len); | |
231 | ASSERT(rl->r_write_wanted == B_FALSE); | |
232 | ASSERT(rl->r_read_wanted == B_FALSE); | |
233 | ||
234 | /* create the rear proxy range lock */ | |
79c76d5b | 235 | rear = kmem_alloc(sizeof (rl_t), KM_SLEEP); |
34dc7c2f BB |
236 | rear->r_off = off; |
237 | rear->r_len = rl->r_off + rl->r_len - off; | |
238 | rear->r_cnt = rl->r_cnt; | |
239 | rear->r_type = RL_READER; | |
240 | rear->r_proxy = B_TRUE; | |
241 | rear->r_write_wanted = B_FALSE; | |
242 | rear->r_read_wanted = B_FALSE; | |
243 | ||
244 | front = zfs_range_proxify(tree, rl); | |
245 | front->r_len = off - rl->r_off; | |
246 | ||
247 | avl_insert_here(tree, rear, front, AVL_AFTER); | |
248 | return (front); | |
249 | } | |
250 | ||
251 | /* | |
252 | * Create and add a new proxy range lock for the supplied range. | |
253 | */ | |
254 | static void | |
255 | zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) | |
256 | { | |
257 | rl_t *rl; | |
258 | ||
259 | ASSERT(len); | |
79c76d5b | 260 | rl = kmem_alloc(sizeof (rl_t), KM_SLEEP); |
34dc7c2f BB |
261 | rl->r_off = off; |
262 | rl->r_len = len; | |
263 | rl->r_cnt = 1; | |
264 | rl->r_type = RL_READER; | |
265 | rl->r_proxy = B_TRUE; | |
266 | rl->r_write_wanted = B_FALSE; | |
267 | rl->r_read_wanted = B_FALSE; | |
268 | avl_add(tree, rl); | |
269 | } | |
270 | ||
271 | static void | |
272 | zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) | |
273 | { | |
274 | rl_t *next; | |
275 | uint64_t off = new->r_off; | |
276 | uint64_t len = new->r_len; | |
277 | ||
278 | /* | |
279 | * prev arrives either: | |
280 | * - pointing to an entry at the same offset | |
281 | * - pointing to the entry with the closest previous offset whose | |
282 | * range may overlap with the new range | |
283 | * - null, if there were no ranges starting before the new one | |
284 | */ | |
285 | if (prev) { | |
286 | if (prev->r_off + prev->r_len <= off) { | |
287 | prev = NULL; | |
288 | } else if (prev->r_off != off) { | |
289 | /* | |
290 | * convert to proxy if needed then | |
291 | * split this entry and bump ref count | |
292 | */ | |
293 | prev = zfs_range_split(tree, prev, off); | |
294 | prev = AVL_NEXT(tree, prev); /* move to rear range */ | |
295 | } | |
296 | } | |
297 | ASSERT((prev == NULL) || (prev->r_off == off)); | |
298 | ||
299 | if (prev) | |
300 | next = prev; | |
301 | else | |
302 | next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); | |
303 | ||
304 | if (next == NULL || off + len <= next->r_off) { | |
305 | /* no overlaps, use the original new rl_t in the tree */ | |
306 | avl_insert(tree, new, where); | |
307 | return; | |
308 | } | |
309 | ||
310 | if (off < next->r_off) { | |
311 | /* Add a proxy for initial range before the overlap */ | |
312 | zfs_range_new_proxy(tree, off, next->r_off - off); | |
313 | } | |
314 | ||
315 | new->r_cnt = 0; /* will use proxies in tree */ | |
316 | /* | |
317 | * We now search forward through the ranges, until we go past the end | |
318 | * of the new range. For each entry we make it a proxy if it | |
319 | * isn't already, then bump its reference count. If there's any | |
320 | * gaps between the ranges then we create a new proxy range. | |
321 | */ | |
322 | for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { | |
323 | if (off + len <= next->r_off) | |
324 | break; | |
325 | if (prev && prev->r_off + prev->r_len < next->r_off) { | |
326 | /* there's a gap */ | |
327 | ASSERT3U(next->r_off, >, prev->r_off + prev->r_len); | |
328 | zfs_range_new_proxy(tree, prev->r_off + prev->r_len, | |
329 | next->r_off - (prev->r_off + prev->r_len)); | |
330 | } | |
331 | if (off + len == next->r_off + next->r_len) { | |
332 | /* exact overlap with end */ | |
333 | next = zfs_range_proxify(tree, next); | |
334 | next->r_cnt++; | |
335 | return; | |
336 | } | |
337 | if (off + len < next->r_off + next->r_len) { | |
338 | /* new range ends in the middle of this block */ | |
339 | next = zfs_range_split(tree, next, off + len); | |
340 | next->r_cnt++; | |
341 | return; | |
342 | } | |
343 | ASSERT3U(off + len, >, next->r_off + next->r_len); | |
344 | next = zfs_range_proxify(tree, next); | |
345 | next->r_cnt++; | |
346 | } | |
347 | ||
348 | /* Add the remaining end range. */ | |
349 | zfs_range_new_proxy(tree, prev->r_off + prev->r_len, | |
350 | (off + len) - (prev->r_off + prev->r_len)); | |
351 | } | |
352 | ||
353 | /* | |
354 | * Check if a reader lock can be grabbed, or wait and recheck until available. | |
355 | */ | |
356 | static void | |
d88895a0 | 357 | zfs_range_lock_reader(zfs_rlock_t *zrl, rl_t *new) |
34dc7c2f | 358 | { |
d88895a0 | 359 | avl_tree_t *tree = &zrl->zr_avl; |
34dc7c2f BB |
360 | rl_t *prev, *next; |
361 | avl_index_t where; | |
362 | uint64_t off = new->r_off; | |
363 | uint64_t len = new->r_len; | |
364 | ||
365 | /* | |
366 | * Look for any writer locks in the range. | |
367 | */ | |
368 | retry: | |
369 | prev = avl_find(tree, new, &where); | |
370 | if (prev == NULL) | |
371 | prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); | |
372 | ||
373 | /* | |
374 | * Check the previous range for a writer lock overlap. | |
375 | */ | |
376 | if (prev && (off < prev->r_off + prev->r_len)) { | |
377 | if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) { | |
378 | if (!prev->r_read_wanted) { | |
379 | cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL); | |
380 | prev->r_read_wanted = B_TRUE; | |
381 | } | |
d88895a0 | 382 | cv_wait(&prev->r_rd_cv, &zrl->zr_mutex); |
34dc7c2f BB |
383 | goto retry; |
384 | } | |
385 | if (off + len < prev->r_off + prev->r_len) | |
386 | goto got_lock; | |
387 | } | |
388 | ||
389 | /* | |
390 | * Search through the following ranges to see if there's | |
391 | * write lock any overlap. | |
392 | */ | |
393 | if (prev) | |
394 | next = AVL_NEXT(tree, prev); | |
395 | else | |
396 | next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); | |
397 | for (; next; next = AVL_NEXT(tree, next)) { | |
398 | if (off + len <= next->r_off) | |
399 | goto got_lock; | |
400 | if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) { | |
401 | if (!next->r_read_wanted) { | |
402 | cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL); | |
403 | next->r_read_wanted = B_TRUE; | |
404 | } | |
d88895a0 | 405 | cv_wait(&next->r_rd_cv, &zrl->zr_mutex); |
34dc7c2f BB |
406 | goto retry; |
407 | } | |
408 | if (off + len <= next->r_off + next->r_len) | |
409 | goto got_lock; | |
410 | } | |
411 | ||
412 | got_lock: | |
413 | /* | |
414 | * Add the read lock, which may involve splitting existing | |
415 | * locks and bumping ref counts (r_cnt). | |
416 | */ | |
417 | zfs_range_add_reader(tree, new, prev, where); | |
418 | } | |
419 | ||
420 | /* | |
421 | * Lock a range (offset, length) as either shared (RL_READER) | |
422 | * or exclusive (RL_WRITER). Returns the range lock structure | |
423 | * for later unlocking or reduce range (if entire file | |
424 | * previously locked as RL_WRITER). | |
425 | */ | |
426 | rl_t * | |
d88895a0 | 427 | zfs_range_lock(zfs_rlock_t *zrl, uint64_t off, uint64_t len, rl_type_t type) |
34dc7c2f BB |
428 | { |
429 | rl_t *new; | |
430 | ||
431 | ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); | |
432 | ||
79c76d5b | 433 | new = kmem_alloc(sizeof (rl_t), KM_SLEEP); |
d88895a0 | 434 | new->r_zrl = zrl; |
34dc7c2f | 435 | new->r_off = off; |
d164b209 BB |
436 | if (len + off < off) /* overflow */ |
437 | len = UINT64_MAX - off; | |
34dc7c2f BB |
438 | new->r_len = len; |
439 | new->r_cnt = 1; /* assume it's going to be in the tree */ | |
440 | new->r_type = type; | |
441 | new->r_proxy = B_FALSE; | |
442 | new->r_write_wanted = B_FALSE; | |
443 | new->r_read_wanted = B_FALSE; | |
444 | ||
d88895a0 | 445 | mutex_enter(&zrl->zr_mutex); |
34dc7c2f BB |
446 | if (type == RL_READER) { |
447 | /* | |
448 | * First check for the usual case of no locks | |
449 | */ | |
d88895a0 CC |
450 | if (avl_numnodes(&zrl->zr_avl) == 0) |
451 | avl_add(&zrl->zr_avl, new); | |
34dc7c2f | 452 | else |
d88895a0 CC |
453 | zfs_range_lock_reader(zrl, new); |
454 | } else /* RL_WRITER or RL_APPEND */ | |
455 | zfs_range_lock_writer(zrl, new); | |
456 | mutex_exit(&zrl->zr_mutex); | |
34dc7c2f BB |
457 | return (new); |
458 | } | |
459 | ||
8926ab7a BB |
460 | static void |
461 | zfs_range_free(void *arg) | |
462 | { | |
463 | rl_t *rl = arg; | |
464 | ||
465 | if (rl->r_write_wanted) | |
466 | cv_destroy(&rl->r_wr_cv); | |
467 | ||
468 | if (rl->r_read_wanted) | |
469 | cv_destroy(&rl->r_rd_cv); | |
470 | ||
471 | kmem_free(rl, sizeof (rl_t)); | |
472 | } | |
473 | ||
34dc7c2f BB |
474 | /* |
475 | * Unlock a reader lock | |
476 | */ | |
477 | static void | |
d88895a0 | 478 | zfs_range_unlock_reader(zfs_rlock_t *zrl, rl_t *remove, list_t *free_list) |
34dc7c2f | 479 | { |
d88895a0 | 480 | avl_tree_t *tree = &zrl->zr_avl; |
d4ed6673 | 481 | rl_t *rl, *next = NULL; |
34dc7c2f BB |
482 | uint64_t len; |
483 | ||
484 | /* | |
485 | * The common case is when the remove entry is in the tree | |
486 | * (cnt == 1) meaning there's been no other reader locks overlapping | |
487 | * with this one. Otherwise the remove entry will have been | |
488 | * removed from the tree and replaced by proxies (one or | |
489 | * more ranges mapping to the entire range). | |
490 | */ | |
491 | if (remove->r_cnt == 1) { | |
492 | avl_remove(tree, remove); | |
a298dbde | 493 | |
8926ab7a | 494 | if (remove->r_write_wanted) |
34dc7c2f | 495 | cv_broadcast(&remove->r_wr_cv); |
8926ab7a BB |
496 | |
497 | if (remove->r_read_wanted) | |
34dc7c2f | 498 | cv_broadcast(&remove->r_rd_cv); |
8926ab7a | 499 | |
450dc149 | 500 | list_insert_tail(free_list, remove); |
34dc7c2f | 501 | } else { |
c99c9001 MS |
502 | ASSERT0(remove->r_cnt); |
503 | ASSERT0(remove->r_write_wanted); | |
504 | ASSERT0(remove->r_read_wanted); | |
34dc7c2f BB |
505 | /* |
506 | * Find start proxy representing this reader lock, | |
507 | * then decrement ref count on all proxies | |
508 | * that make up this range, freeing them as needed. | |
509 | */ | |
510 | rl = avl_find(tree, remove, NULL); | |
511 | ASSERT(rl); | |
512 | ASSERT(rl->r_cnt); | |
513 | ASSERT(rl->r_type == RL_READER); | |
514 | for (len = remove->r_len; len != 0; rl = next) { | |
515 | len -= rl->r_len; | |
516 | if (len) { | |
517 | next = AVL_NEXT(tree, rl); | |
518 | ASSERT(next); | |
519 | ASSERT(rl->r_off + rl->r_len == next->r_off); | |
520 | ASSERT(next->r_cnt); | |
521 | ASSERT(next->r_type == RL_READER); | |
522 | } | |
523 | rl->r_cnt--; | |
524 | if (rl->r_cnt == 0) { | |
525 | avl_remove(tree, rl); | |
8926ab7a BB |
526 | |
527 | if (rl->r_write_wanted) | |
34dc7c2f | 528 | cv_broadcast(&rl->r_wr_cv); |
8926ab7a BB |
529 | |
530 | if (rl->r_read_wanted) | |
34dc7c2f | 531 | cv_broadcast(&rl->r_rd_cv); |
8926ab7a | 532 | |
450dc149 | 533 | list_insert_tail(free_list, rl); |
34dc7c2f BB |
534 | } |
535 | } | |
8926ab7a | 536 | |
8926ab7a | 537 | kmem_free(remove, sizeof (rl_t)); |
34dc7c2f | 538 | } |
34dc7c2f BB |
539 | } |
540 | ||
541 | /* | |
542 | * Unlock range and destroy range lock structure. | |
543 | */ | |
544 | void | |
545 | zfs_range_unlock(rl_t *rl) | |
546 | { | |
d88895a0 | 547 | zfs_rlock_t *zrl = rl->r_zrl; |
450dc149 BB |
548 | list_t free_list; |
549 | rl_t *free_rl; | |
34dc7c2f BB |
550 | |
551 | ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER); | |
552 | ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0); | |
553 | ASSERT(!rl->r_proxy); | |
d1d7e268 | 554 | list_create(&free_list, sizeof (rl_t), offsetof(rl_t, rl_node)); |
34dc7c2f | 555 | |
d88895a0 | 556 | mutex_enter(&zrl->zr_mutex); |
34dc7c2f BB |
557 | if (rl->r_type == RL_WRITER) { |
558 | /* writer locks can't be shared or split */ | |
d88895a0 | 559 | avl_remove(&zrl->zr_avl, rl); |
8926ab7a | 560 | if (rl->r_write_wanted) |
34dc7c2f | 561 | cv_broadcast(&rl->r_wr_cv); |
8926ab7a BB |
562 | |
563 | if (rl->r_read_wanted) | |
34dc7c2f | 564 | cv_broadcast(&rl->r_rd_cv); |
8926ab7a | 565 | |
450dc149 | 566 | list_insert_tail(&free_list, rl); |
34dc7c2f BB |
567 | } else { |
568 | /* | |
569 | * lock may be shared, let zfs_range_unlock_reader() | |
8926ab7a | 570 | * release the zp->z_range_lock lock and free the rl_t |
34dc7c2f | 571 | */ |
d88895a0 | 572 | zfs_range_unlock_reader(zrl, rl, &free_list); |
34dc7c2f | 573 | } |
d88895a0 | 574 | mutex_exit(&zrl->zr_mutex); |
450dc149 BB |
575 | |
576 | while ((free_rl = list_head(&free_list)) != NULL) { | |
577 | list_remove(&free_list, free_rl); | |
578 | zfs_range_free(free_rl); | |
579 | } | |
580 | ||
581 | list_destroy(&free_list); | |
34dc7c2f BB |
582 | } |
583 | ||
584 | /* | |
585 | * Reduce range locked as RL_WRITER from whole file to specified range. | |
586 | * Asserts the whole file is exclusivly locked and so there's only one | |
587 | * entry in the tree. | |
588 | */ | |
589 | void | |
590 | zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) | |
591 | { | |
d88895a0 | 592 | zfs_rlock_t *zrl = rl->r_zrl; |
34dc7c2f BB |
593 | |
594 | /* Ensure there are no other locks */ | |
d88895a0 | 595 | ASSERT(avl_numnodes(&zrl->zr_avl) == 1); |
34dc7c2f BB |
596 | ASSERT(rl->r_off == 0); |
597 | ASSERT(rl->r_type == RL_WRITER); | |
598 | ASSERT(!rl->r_proxy); | |
599 | ASSERT3U(rl->r_len, ==, UINT64_MAX); | |
600 | ASSERT3U(rl->r_cnt, ==, 1); | |
601 | ||
d88895a0 | 602 | mutex_enter(&zrl->zr_mutex); |
34dc7c2f BB |
603 | rl->r_off = off; |
604 | rl->r_len = len; | |
a298dbde | 605 | |
34dc7c2f BB |
606 | if (rl->r_write_wanted) |
607 | cv_broadcast(&rl->r_wr_cv); | |
608 | if (rl->r_read_wanted) | |
609 | cv_broadcast(&rl->r_rd_cv); | |
a298dbde | 610 | |
d88895a0 | 611 | mutex_exit(&zrl->zr_mutex); |
34dc7c2f BB |
612 | } |
613 | ||
614 | /* | |
615 | * AVL comparison function used to order range locks | |
616 | * Locks are ordered on the start offset of the range. | |
617 | */ | |
618 | int | |
619 | zfs_range_compare(const void *arg1, const void *arg2) | |
620 | { | |
ee36c709 GN |
621 | const rl_t *rl1 = (const rl_t *)arg1; |
622 | const rl_t *rl2 = (const rl_t *)arg2; | |
623 | ||
624 | return (AVL_CMP(rl1->r_off, rl2->r_off)); | |
34dc7c2f | 625 | } |
d88895a0 CC |
626 | |
627 | #ifdef _KERNEL | |
628 | EXPORT_SYMBOL(zfs_range_lock); | |
629 | EXPORT_SYMBOL(zfs_range_unlock); | |
630 | EXPORT_SYMBOL(zfs_range_reduce); | |
631 | EXPORT_SYMBOL(zfs_range_compare); | |
632 | #endif |