]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
428870ff | 22 | * Copyright 2010 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f BB |
23 | * Use is subject to license terms. |
24 | */ | |
c99c9001 MS |
25 | /* |
26 | * Copyright (c) 2012 by Delphix. All rights reserved. | |
27 | */ | |
34dc7c2f | 28 | |
34dc7c2f BB |
29 | /* |
30 | * This file contains the code to implement file range locking in | |
d3cc8b15 | 31 | * ZFS, although there isn't much specific to ZFS (all that comes to mind is |
34dc7c2f BB |
32 | * support for growing the blocksize). |
33 | * | |
34 | * Interface | |
35 | * --------- | |
36 | * Defined in zfs_rlock.h but essentially: | |
37 | * rl = zfs_range_lock(zp, off, len, lock_type); | |
38 | * zfs_range_unlock(rl); | |
39 | * zfs_range_reduce(rl, off, len); | |
40 | * | |
41 | * AVL tree | |
42 | * -------- | |
43 | * An AVL tree is used to maintain the state of the existing ranges | |
44 | * that are locked for exclusive (writer) or shared (reader) use. | |
45 | * The starting range offset is used for searching and sorting the tree. | |
46 | * | |
47 | * Common case | |
48 | * ----------- | |
49 | * The (hopefully) usual case is of no overlaps or contention for | |
50 | * locks. On entry to zfs_range_lock() an rl_t is allocated; the tree is | |
51 | * searched, no overlap is found, and *this* rl_t is placed in the tree. | |
52 | * | |
53 | * Overlaps/Reference counting/Proxy locks | |
54 | * --------------------------------------- | |
55 | * The avl code only allows one node at a particular offset. Also it's very | |
56 | * inefficient to search through all previous entries looking for overlaps | |
57 | * (because the very 1st in the ordered list might be at offset 0 but | |
58 | * cover the whole file). | |
59 | * So this implementation uses reference counts and proxy range locks. | |
60 | * Firstly, only reader locks use reference counts and proxy locks, | |
61 | * because writer locks are exclusive. | |
62 | * When a reader lock overlaps with another then a proxy lock is created | |
63 | * for that range and replaces the original lock. If the overlap | |
64 | * is exact then the reference count of the proxy is simply incremented. | |
65 | * Otherwise, the proxy lock is split into smaller lock ranges and | |
66 | * new proxy locks created for non overlapping ranges. | |
67 | * The reference counts are adjusted accordingly. | |
68 | * Meanwhile, the original lock is kept around (this is the caller's handle) | |
69 | * and its offset and length are used when releasing the lock. | |
70 | * | |
71 | * Thread coordination | |
72 | * ------------------- | |
73 | * In order to make wakeups efficient and to ensure multiple continuous | |
74 | * readers on a range don't starve a writer for the same range lock, | |
75 | * two condition variables are allocated in each rl_t. | |
76 | * If a writer (or reader) can't get a range it initialises the writer | |
77 | * (or reader) cv; sets a flag saying there's a writer (or reader) waiting; | |
78 | * and waits on that cv. When a thread unlocks that range it wakes up all | |
79 | * writers then all readers before destroying the lock. | |
80 | * | |
81 | * Append mode writes | |
82 | * ------------------ | |
83 | * Append mode writes need to lock a range at the end of a file. | |
84 | * The offset of the end of the file is determined under the | |
85 | * range locking mutex, and the lock type converted from RL_APPEND to | |
86 | * RL_WRITER and the range locked. | |
87 | * | |
88 | * Grow block handling | |
89 | * ------------------- | |
90 | * ZFS supports multiple block sizes, currently up to 128K. The smallest | |
91 | * block size is used for the file which is grown as needed. During this | |
92 | * growth all other writers and readers must be excluded. | |
93 | * So if the block size needs to be grown then the whole file is | |
94 | * exclusively locked, then later the caller will reduce the lock | |
95 | * range to just the range to be written using zfs_range_reduce(). | |
96 | */ | |
97 | ||
98 | #include <sys/zfs_rlock.h> | |
99 | ||
100 | /* | |
101 | * Check if a write lock can be grabbed, or wait and recheck until available. | |
102 | */ | |
103 | static void | |
104 | zfs_range_lock_writer(znode_t *zp, rl_t *new) | |
105 | { | |
106 | avl_tree_t *tree = &zp->z_range_avl; | |
107 | rl_t *rl; | |
108 | avl_index_t where; | |
109 | uint64_t end_size; | |
110 | uint64_t off = new->r_off; | |
111 | uint64_t len = new->r_len; | |
112 | ||
113 | for (;;) { | |
114 | /* | |
115 | * Range locking is also used by zvol and uses a | |
116 | * dummied up znode. However, for zvol, we don't need to | |
117 | * append or grow blocksize, and besides we don't have | |
3c4988c8 | 118 | * a "sa" data or zfs_sb_t - so skip that processing. |
34dc7c2f BB |
119 | * |
120 | * Yes, this is ugly, and would be solved by not handling | |
121 | * grow or append in range lock code. If that was done then | |
122 | * we could make the range locking code generically available | |
123 | * to other non-zfs consumers. | |
124 | */ | |
3c4988c8 | 125 | if (!zp->z_is_zvol) { /* caller is ZPL */ |
34dc7c2f BB |
126 | /* |
127 | * If in append mode pick up the current end of file. | |
128 | * This is done under z_range_lock to avoid races. | |
129 | */ | |
130 | if (new->r_type == RL_APPEND) | |
428870ff | 131 | new->r_off = zp->z_size; |
34dc7c2f BB |
132 | |
133 | /* | |
134 | * If we need to grow the block size then grab the whole | |
135 | * file range. This is also done under z_range_lock to | |
136 | * avoid races. | |
137 | */ | |
428870ff | 138 | end_size = MAX(zp->z_size, new->r_off + len); |
34dc7c2f | 139 | if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || |
3558fd73 | 140 | zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { |
34dc7c2f BB |
141 | new->r_off = 0; |
142 | new->r_len = UINT64_MAX; | |
143 | } | |
144 | } | |
145 | ||
146 | /* | |
147 | * First check for the usual case of no locks | |
148 | */ | |
149 | if (avl_numnodes(tree) == 0) { | |
150 | new->r_type = RL_WRITER; /* convert to writer */ | |
151 | avl_add(tree, new); | |
152 | return; | |
153 | } | |
154 | ||
155 | /* | |
156 | * Look for any locks in the range. | |
157 | */ | |
158 | rl = avl_find(tree, new, &where); | |
159 | if (rl) | |
160 | goto wait; /* already locked at same offset */ | |
161 | ||
162 | rl = (rl_t *)avl_nearest(tree, where, AVL_AFTER); | |
163 | if (rl && (rl->r_off < new->r_off + new->r_len)) | |
164 | goto wait; | |
165 | ||
166 | rl = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); | |
167 | if (rl && rl->r_off + rl->r_len > new->r_off) | |
168 | goto wait; | |
169 | ||
170 | new->r_type = RL_WRITER; /* convert possible RL_APPEND */ | |
171 | avl_insert(tree, new, where); | |
172 | return; | |
173 | wait: | |
174 | if (!rl->r_write_wanted) { | |
175 | cv_init(&rl->r_wr_cv, NULL, CV_DEFAULT, NULL); | |
176 | rl->r_write_wanted = B_TRUE; | |
177 | } | |
178 | cv_wait(&rl->r_wr_cv, &zp->z_range_lock); | |
179 | ||
180 | /* reset to original */ | |
181 | new->r_off = off; | |
182 | new->r_len = len; | |
183 | } | |
184 | } | |
185 | ||
186 | /* | |
187 | * If this is an original (non-proxy) lock then replace it by | |
188 | * a proxy and return the proxy. | |
189 | */ | |
190 | static rl_t * | |
191 | zfs_range_proxify(avl_tree_t *tree, rl_t *rl) | |
192 | { | |
193 | rl_t *proxy; | |
194 | ||
195 | if (rl->r_proxy) | |
196 | return (rl); /* already a proxy */ | |
197 | ||
198 | ASSERT3U(rl->r_cnt, ==, 1); | |
199 | ASSERT(rl->r_write_wanted == B_FALSE); | |
200 | ASSERT(rl->r_read_wanted == B_FALSE); | |
201 | avl_remove(tree, rl); | |
202 | rl->r_cnt = 0; | |
203 | ||
204 | /* create a proxy range lock */ | |
6f53a6a2 | 205 | proxy = kmem_alloc(sizeof (rl_t), KM_PUSHPAGE); |
34dc7c2f BB |
206 | proxy->r_off = rl->r_off; |
207 | proxy->r_len = rl->r_len; | |
208 | proxy->r_cnt = 1; | |
209 | proxy->r_type = RL_READER; | |
210 | proxy->r_proxy = B_TRUE; | |
211 | proxy->r_write_wanted = B_FALSE; | |
212 | proxy->r_read_wanted = B_FALSE; | |
213 | avl_add(tree, proxy); | |
214 | ||
215 | return (proxy); | |
216 | } | |
217 | ||
218 | /* | |
219 | * Split the range lock at the supplied offset | |
220 | * returning the *front* proxy. | |
221 | */ | |
222 | static rl_t * | |
223 | zfs_range_split(avl_tree_t *tree, rl_t *rl, uint64_t off) | |
224 | { | |
225 | rl_t *front, *rear; | |
226 | ||
227 | ASSERT3U(rl->r_len, >, 1); | |
228 | ASSERT3U(off, >, rl->r_off); | |
229 | ASSERT3U(off, <, rl->r_off + rl->r_len); | |
230 | ASSERT(rl->r_write_wanted == B_FALSE); | |
231 | ASSERT(rl->r_read_wanted == B_FALSE); | |
232 | ||
233 | /* create the rear proxy range lock */ | |
6f53a6a2 | 234 | rear = kmem_alloc(sizeof (rl_t), KM_PUSHPAGE); |
34dc7c2f BB |
235 | rear->r_off = off; |
236 | rear->r_len = rl->r_off + rl->r_len - off; | |
237 | rear->r_cnt = rl->r_cnt; | |
238 | rear->r_type = RL_READER; | |
239 | rear->r_proxy = B_TRUE; | |
240 | rear->r_write_wanted = B_FALSE; | |
241 | rear->r_read_wanted = B_FALSE; | |
242 | ||
243 | front = zfs_range_proxify(tree, rl); | |
244 | front->r_len = off - rl->r_off; | |
245 | ||
246 | avl_insert_here(tree, rear, front, AVL_AFTER); | |
247 | return (front); | |
248 | } | |
249 | ||
250 | /* | |
251 | * Create and add a new proxy range lock for the supplied range. | |
252 | */ | |
253 | static void | |
254 | zfs_range_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) | |
255 | { | |
256 | rl_t *rl; | |
257 | ||
258 | ASSERT(len); | |
e4d89e9c | 259 | rl = kmem_alloc(sizeof (rl_t), KM_PUSHPAGE); |
34dc7c2f BB |
260 | rl->r_off = off; |
261 | rl->r_len = len; | |
262 | rl->r_cnt = 1; | |
263 | rl->r_type = RL_READER; | |
264 | rl->r_proxy = B_TRUE; | |
265 | rl->r_write_wanted = B_FALSE; | |
266 | rl->r_read_wanted = B_FALSE; | |
267 | avl_add(tree, rl); | |
268 | } | |
269 | ||
270 | static void | |
271 | zfs_range_add_reader(avl_tree_t *tree, rl_t *new, rl_t *prev, avl_index_t where) | |
272 | { | |
273 | rl_t *next; | |
274 | uint64_t off = new->r_off; | |
275 | uint64_t len = new->r_len; | |
276 | ||
277 | /* | |
278 | * prev arrives either: | |
279 | * - pointing to an entry at the same offset | |
280 | * - pointing to the entry with the closest previous offset whose | |
281 | * range may overlap with the new range | |
282 | * - null, if there were no ranges starting before the new one | |
283 | */ | |
284 | if (prev) { | |
285 | if (prev->r_off + prev->r_len <= off) { | |
286 | prev = NULL; | |
287 | } else if (prev->r_off != off) { | |
288 | /* | |
289 | * convert to proxy if needed then | |
290 | * split this entry and bump ref count | |
291 | */ | |
292 | prev = zfs_range_split(tree, prev, off); | |
293 | prev = AVL_NEXT(tree, prev); /* move to rear range */ | |
294 | } | |
295 | } | |
296 | ASSERT((prev == NULL) || (prev->r_off == off)); | |
297 | ||
298 | if (prev) | |
299 | next = prev; | |
300 | else | |
301 | next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); | |
302 | ||
303 | if (next == NULL || off + len <= next->r_off) { | |
304 | /* no overlaps, use the original new rl_t in the tree */ | |
305 | avl_insert(tree, new, where); | |
306 | return; | |
307 | } | |
308 | ||
309 | if (off < next->r_off) { | |
310 | /* Add a proxy for initial range before the overlap */ | |
311 | zfs_range_new_proxy(tree, off, next->r_off - off); | |
312 | } | |
313 | ||
314 | new->r_cnt = 0; /* will use proxies in tree */ | |
315 | /* | |
316 | * We now search forward through the ranges, until we go past the end | |
317 | * of the new range. For each entry we make it a proxy if it | |
318 | * isn't already, then bump its reference count. If there's any | |
319 | * gaps between the ranges then we create a new proxy range. | |
320 | */ | |
321 | for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { | |
322 | if (off + len <= next->r_off) | |
323 | break; | |
324 | if (prev && prev->r_off + prev->r_len < next->r_off) { | |
325 | /* there's a gap */ | |
326 | ASSERT3U(next->r_off, >, prev->r_off + prev->r_len); | |
327 | zfs_range_new_proxy(tree, prev->r_off + prev->r_len, | |
328 | next->r_off - (prev->r_off + prev->r_len)); | |
329 | } | |
330 | if (off + len == next->r_off + next->r_len) { | |
331 | /* exact overlap with end */ | |
332 | next = zfs_range_proxify(tree, next); | |
333 | next->r_cnt++; | |
334 | return; | |
335 | } | |
336 | if (off + len < next->r_off + next->r_len) { | |
337 | /* new range ends in the middle of this block */ | |
338 | next = zfs_range_split(tree, next, off + len); | |
339 | next->r_cnt++; | |
340 | return; | |
341 | } | |
342 | ASSERT3U(off + len, >, next->r_off + next->r_len); | |
343 | next = zfs_range_proxify(tree, next); | |
344 | next->r_cnt++; | |
345 | } | |
346 | ||
347 | /* Add the remaining end range. */ | |
348 | zfs_range_new_proxy(tree, prev->r_off + prev->r_len, | |
349 | (off + len) - (prev->r_off + prev->r_len)); | |
350 | } | |
351 | ||
352 | /* | |
353 | * Check if a reader lock can be grabbed, or wait and recheck until available. | |
354 | */ | |
355 | static void | |
356 | zfs_range_lock_reader(znode_t *zp, rl_t *new) | |
357 | { | |
358 | avl_tree_t *tree = &zp->z_range_avl; | |
359 | rl_t *prev, *next; | |
360 | avl_index_t where; | |
361 | uint64_t off = new->r_off; | |
362 | uint64_t len = new->r_len; | |
363 | ||
364 | /* | |
365 | * Look for any writer locks in the range. | |
366 | */ | |
367 | retry: | |
368 | prev = avl_find(tree, new, &where); | |
369 | if (prev == NULL) | |
370 | prev = (rl_t *)avl_nearest(tree, where, AVL_BEFORE); | |
371 | ||
372 | /* | |
373 | * Check the previous range for a writer lock overlap. | |
374 | */ | |
375 | if (prev && (off < prev->r_off + prev->r_len)) { | |
376 | if ((prev->r_type == RL_WRITER) || (prev->r_write_wanted)) { | |
377 | if (!prev->r_read_wanted) { | |
378 | cv_init(&prev->r_rd_cv, NULL, CV_DEFAULT, NULL); | |
379 | prev->r_read_wanted = B_TRUE; | |
380 | } | |
381 | cv_wait(&prev->r_rd_cv, &zp->z_range_lock); | |
382 | goto retry; | |
383 | } | |
384 | if (off + len < prev->r_off + prev->r_len) | |
385 | goto got_lock; | |
386 | } | |
387 | ||
388 | /* | |
389 | * Search through the following ranges to see if there's | |
390 | * write lock any overlap. | |
391 | */ | |
392 | if (prev) | |
393 | next = AVL_NEXT(tree, prev); | |
394 | else | |
395 | next = (rl_t *)avl_nearest(tree, where, AVL_AFTER); | |
396 | for (; next; next = AVL_NEXT(tree, next)) { | |
397 | if (off + len <= next->r_off) | |
398 | goto got_lock; | |
399 | if ((next->r_type == RL_WRITER) || (next->r_write_wanted)) { | |
400 | if (!next->r_read_wanted) { | |
401 | cv_init(&next->r_rd_cv, NULL, CV_DEFAULT, NULL); | |
402 | next->r_read_wanted = B_TRUE; | |
403 | } | |
404 | cv_wait(&next->r_rd_cv, &zp->z_range_lock); | |
405 | goto retry; | |
406 | } | |
407 | if (off + len <= next->r_off + next->r_len) | |
408 | goto got_lock; | |
409 | } | |
410 | ||
411 | got_lock: | |
412 | /* | |
413 | * Add the read lock, which may involve splitting existing | |
414 | * locks and bumping ref counts (r_cnt). | |
415 | */ | |
416 | zfs_range_add_reader(tree, new, prev, where); | |
417 | } | |
418 | ||
419 | /* | |
420 | * Lock a range (offset, length) as either shared (RL_READER) | |
421 | * or exclusive (RL_WRITER). Returns the range lock structure | |
422 | * for later unlocking or reduce range (if entire file | |
423 | * previously locked as RL_WRITER). | |
424 | */ | |
425 | rl_t * | |
426 | zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) | |
427 | { | |
428 | rl_t *new; | |
429 | ||
430 | ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); | |
431 | ||
b8d06fca | 432 | new = kmem_alloc(sizeof (rl_t), KM_PUSHPAGE); |
34dc7c2f BB |
433 | new->r_zp = zp; |
434 | new->r_off = off; | |
d164b209 BB |
435 | if (len + off < off) /* overflow */ |
436 | len = UINT64_MAX - off; | |
34dc7c2f BB |
437 | new->r_len = len; |
438 | new->r_cnt = 1; /* assume it's going to be in the tree */ | |
439 | new->r_type = type; | |
440 | new->r_proxy = B_FALSE; | |
441 | new->r_write_wanted = B_FALSE; | |
442 | new->r_read_wanted = B_FALSE; | |
443 | ||
444 | mutex_enter(&zp->z_range_lock); | |
445 | if (type == RL_READER) { | |
446 | /* | |
447 | * First check for the usual case of no locks | |
448 | */ | |
449 | if (avl_numnodes(&zp->z_range_avl) == 0) | |
450 | avl_add(&zp->z_range_avl, new); | |
451 | else | |
452 | zfs_range_lock_reader(zp, new); | |
453 | } else | |
454 | zfs_range_lock_writer(zp, new); /* RL_WRITER or RL_APPEND */ | |
455 | mutex_exit(&zp->z_range_lock); | |
456 | return (new); | |
457 | } | |
458 | ||
8926ab7a BB |
459 | static void |
460 | zfs_range_free(void *arg) | |
461 | { | |
462 | rl_t *rl = arg; | |
463 | ||
464 | if (rl->r_write_wanted) | |
465 | cv_destroy(&rl->r_wr_cv); | |
466 | ||
467 | if (rl->r_read_wanted) | |
468 | cv_destroy(&rl->r_rd_cv); | |
469 | ||
470 | kmem_free(rl, sizeof (rl_t)); | |
471 | } | |
472 | ||
34dc7c2f BB |
473 | /* |
474 | * Unlock a reader lock | |
475 | */ | |
476 | static void | |
450dc149 | 477 | zfs_range_unlock_reader(znode_t *zp, rl_t *remove, list_t *free_list) |
34dc7c2f BB |
478 | { |
479 | avl_tree_t *tree = &zp->z_range_avl; | |
d4ed6673 | 480 | rl_t *rl, *next = NULL; |
34dc7c2f BB |
481 | uint64_t len; |
482 | ||
483 | /* | |
484 | * The common case is when the remove entry is in the tree | |
485 | * (cnt == 1) meaning there's been no other reader locks overlapping | |
486 | * with this one. Otherwise the remove entry will have been | |
487 | * removed from the tree and replaced by proxies (one or | |
488 | * more ranges mapping to the entire range). | |
489 | */ | |
490 | if (remove->r_cnt == 1) { | |
491 | avl_remove(tree, remove); | |
a298dbde | 492 | |
8926ab7a | 493 | if (remove->r_write_wanted) |
34dc7c2f | 494 | cv_broadcast(&remove->r_wr_cv); |
8926ab7a BB |
495 | |
496 | if (remove->r_read_wanted) | |
34dc7c2f | 497 | cv_broadcast(&remove->r_rd_cv); |
8926ab7a | 498 | |
450dc149 | 499 | list_insert_tail(free_list, remove); |
34dc7c2f | 500 | } else { |
c99c9001 MS |
501 | ASSERT0(remove->r_cnt); |
502 | ASSERT0(remove->r_write_wanted); | |
503 | ASSERT0(remove->r_read_wanted); | |
34dc7c2f BB |
504 | /* |
505 | * Find start proxy representing this reader lock, | |
506 | * then decrement ref count on all proxies | |
507 | * that make up this range, freeing them as needed. | |
508 | */ | |
509 | rl = avl_find(tree, remove, NULL); | |
510 | ASSERT(rl); | |
511 | ASSERT(rl->r_cnt); | |
512 | ASSERT(rl->r_type == RL_READER); | |
513 | for (len = remove->r_len; len != 0; rl = next) { | |
514 | len -= rl->r_len; | |
515 | if (len) { | |
516 | next = AVL_NEXT(tree, rl); | |
517 | ASSERT(next); | |
518 | ASSERT(rl->r_off + rl->r_len == next->r_off); | |
519 | ASSERT(next->r_cnt); | |
520 | ASSERT(next->r_type == RL_READER); | |
521 | } | |
522 | rl->r_cnt--; | |
523 | if (rl->r_cnt == 0) { | |
524 | avl_remove(tree, rl); | |
8926ab7a BB |
525 | |
526 | if (rl->r_write_wanted) | |
34dc7c2f | 527 | cv_broadcast(&rl->r_wr_cv); |
8926ab7a BB |
528 | |
529 | if (rl->r_read_wanted) | |
34dc7c2f | 530 | cv_broadcast(&rl->r_rd_cv); |
8926ab7a | 531 | |
450dc149 | 532 | list_insert_tail(free_list, rl); |
34dc7c2f BB |
533 | } |
534 | } | |
8926ab7a | 535 | |
8926ab7a | 536 | kmem_free(remove, sizeof (rl_t)); |
34dc7c2f | 537 | } |
34dc7c2f BB |
538 | } |
539 | ||
540 | /* | |
541 | * Unlock range and destroy range lock structure. | |
542 | */ | |
543 | void | |
544 | zfs_range_unlock(rl_t *rl) | |
545 | { | |
546 | znode_t *zp = rl->r_zp; | |
450dc149 BB |
547 | list_t free_list; |
548 | rl_t *free_rl; | |
34dc7c2f BB |
549 | |
550 | ASSERT(rl->r_type == RL_WRITER || rl->r_type == RL_READER); | |
551 | ASSERT(rl->r_cnt == 1 || rl->r_cnt == 0); | |
552 | ASSERT(!rl->r_proxy); | |
d1d7e268 | 553 | list_create(&free_list, sizeof (rl_t), offsetof(rl_t, rl_node)); |
34dc7c2f BB |
554 | |
555 | mutex_enter(&zp->z_range_lock); | |
556 | if (rl->r_type == RL_WRITER) { | |
557 | /* writer locks can't be shared or split */ | |
558 | avl_remove(&zp->z_range_avl, rl); | |
8926ab7a | 559 | if (rl->r_write_wanted) |
34dc7c2f | 560 | cv_broadcast(&rl->r_wr_cv); |
8926ab7a BB |
561 | |
562 | if (rl->r_read_wanted) | |
34dc7c2f | 563 | cv_broadcast(&rl->r_rd_cv); |
8926ab7a | 564 | |
450dc149 | 565 | list_insert_tail(&free_list, rl); |
34dc7c2f BB |
566 | } else { |
567 | /* | |
568 | * lock may be shared, let zfs_range_unlock_reader() | |
8926ab7a | 569 | * release the zp->z_range_lock lock and free the rl_t |
34dc7c2f | 570 | */ |
450dc149 | 571 | zfs_range_unlock_reader(zp, rl, &free_list); |
34dc7c2f | 572 | } |
a298dbde | 573 | mutex_exit(&zp->z_range_lock); |
450dc149 BB |
574 | |
575 | while ((free_rl = list_head(&free_list)) != NULL) { | |
576 | list_remove(&free_list, free_rl); | |
577 | zfs_range_free(free_rl); | |
578 | } | |
579 | ||
580 | list_destroy(&free_list); | |
34dc7c2f BB |
581 | } |
582 | ||
583 | /* | |
584 | * Reduce range locked as RL_WRITER from whole file to specified range. | |
585 | * Asserts the whole file is exclusivly locked and so there's only one | |
586 | * entry in the tree. | |
587 | */ | |
588 | void | |
589 | zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len) | |
590 | { | |
591 | znode_t *zp = rl->r_zp; | |
592 | ||
593 | /* Ensure there are no other locks */ | |
594 | ASSERT(avl_numnodes(&zp->z_range_avl) == 1); | |
595 | ASSERT(rl->r_off == 0); | |
596 | ASSERT(rl->r_type == RL_WRITER); | |
597 | ASSERT(!rl->r_proxy); | |
598 | ASSERT3U(rl->r_len, ==, UINT64_MAX); | |
599 | ASSERT3U(rl->r_cnt, ==, 1); | |
600 | ||
601 | mutex_enter(&zp->z_range_lock); | |
602 | rl->r_off = off; | |
603 | rl->r_len = len; | |
a298dbde | 604 | |
34dc7c2f BB |
605 | if (rl->r_write_wanted) |
606 | cv_broadcast(&rl->r_wr_cv); | |
607 | if (rl->r_read_wanted) | |
608 | cv_broadcast(&rl->r_rd_cv); | |
a298dbde BB |
609 | |
610 | mutex_exit(&zp->z_range_lock); | |
34dc7c2f BB |
611 | } |
612 | ||
613 | /* | |
614 | * AVL comparison function used to order range locks | |
615 | * Locks are ordered on the start offset of the range. | |
616 | */ | |
617 | int | |
618 | zfs_range_compare(const void *arg1, const void *arg2) | |
619 | { | |
620 | const rl_t *rl1 = arg1; | |
621 | const rl_t *rl2 = arg2; | |
622 | ||
623 | if (rl1->r_off > rl2->r_off) | |
624 | return (1); | |
625 | if (rl1->r_off < rl2->r_off) | |
626 | return (-1); | |
627 | return (0); | |
628 | } |