]>
Commit | Line | Data |
---|---|---|
ce0f95a5 | 1 | ================================= |
c742b531 | 2 | Red-black Trees (rbtree) in Linux |
ce0f95a5 MCC |
3 | ================================= |
4 | ||
5 | ||
6 | :Date: January 18, 2007 | |
7 | :Author: Rob Landley <rob@landley.net> | |
c742b531 RL |
8 | |
9 | What are red-black trees, and what are they for? | |
10 | ------------------------------------------------ | |
11 | ||
12 | Red-black trees are a type of self-balancing binary search tree, used for | |
13 | storing sortable key/value data pairs. This differs from radix trees (which | |
14 | are used to efficiently store sparse arrays and thus use long integer indexes | |
15 | to insert/access/delete nodes) and hash tables (which are not kept sorted to | |
16 | be easily traversed in order, and must be tuned for a specific size and | |
17 | hash function where rbtrees scale gracefully storing arbitrary keys). | |
18 | ||
19 | Red-black trees are similar to AVL trees, but provide faster real-time bounded | |
20 | worst case performance for insertion and deletion (at most two rotations and | |
21 | three rotations, respectively, to balance the tree), with slightly slower | |
22 | (but still O(log n)) lookup time. | |
23 | ||
24 | To quote Linux Weekly News: | |
25 | ||
26 | There are a number of red-black trees in use in the kernel. | |
17a9e7bb RD |
27 | The deadline and CFQ I/O schedulers employ rbtrees to |
28 | track requests; the packet CD/DVD driver does the same. | |
c742b531 RL |
29 | The high-resolution timer code uses an rbtree to organize outstanding |
30 | timer requests. The ext3 filesystem tracks directory entries in a | |
31 | red-black tree. Virtual memory areas (VMAs) are tracked with red-black | |
32 | trees, as are epoll file descriptors, cryptographic keys, and network | |
33 | packets in the "hierarchical token bucket" scheduler. | |
34 | ||
35 | This document covers use of the Linux rbtree implementation. For more | |
36 | information on the nature and implementation of Red Black Trees, see: | |
37 | ||
38 | Linux Weekly News article on red-black trees | |
39 | http://lwn.net/Articles/184495/ | |
40 | ||
41 | Wikipedia entry on red-black trees | |
42 | http://en.wikipedia.org/wiki/Red-black_tree | |
43 | ||
44 | Linux implementation of red-black trees | |
45 | --------------------------------------- | |
46 | ||
47 | Linux's rbtree implementation lives in the file "lib/rbtree.c". To use it, | |
48 | "#include <linux/rbtree.h>". | |
49 | ||
50 | The Linux rbtree implementation is optimized for speed, and thus has one | |
51 | less layer of indirection (and better cache locality) than more traditional | |
52 | tree implementations. Instead of using pointers to separate rb_node and data | |
53 | structures, each instance of struct rb_node is embedded in the data structure | |
54 | it organizes. And instead of using a comparison callback function pointer, | |
55 | users are expected to write their own tree search and insert functions | |
56 | which call the provided rbtree functions. Locking is also left up to the | |
57 | user of the rbtree code. | |
58 | ||
59 | Creating a new rbtree | |
60 | --------------------- | |
61 | ||
ce0f95a5 | 62 | Data nodes in an rbtree are structures containing a struct rb_node member:: |
c742b531 RL |
63 | |
64 | struct mytype { | |
65 | struct rb_node node; | |
66 | char *keystring; | |
67 | }; | |
68 | ||
69 | When dealing with a pointer to the embedded struct rb_node, the containing data | |
70 | structure may be accessed with the standard container_of() macro. In addition, | |
71 | individual members may be accessed directly via rb_entry(node, type, member). | |
72 | ||
73 | At the root of each rbtree is an rb_root structure, which is initialized to be | |
74 | empty via:: | |
75 | ||
76 | struct rb_root mytree = RB_ROOT; | |
77 | ||
78 | Searching for a value in an rbtree | |
79 | ---------------------------------- | |
80 | ||
81 | Writing a search function for your tree is fairly straightforward: start at the | |
82 | root, compare each value, and follow the left or right branch as necessary. | |
83 | ||
ce0f95a5 | 84 | Example:: |
c742b531 RL |
85 | |
86 | struct mytype *my_search(struct rb_root *root, char *string) | |
87 | { | |
88 | struct rb_node *node = root->rb_node; | |
89 | ||
90 | while (node) { | |
91 | struct mytype *data = container_of(node, struct mytype, node); | |
92 | int result; | |
93 | ||
94 | result = strcmp(string, data->keystring); | |
95 | ||
96 | if (result < 0) | |
97 | node = node->rb_left; | |
98 | else if (result > 0) | |
99 | node = node->rb_right; | |
100 | else | |
101 | return data; | |
102 | } | |
103 | return NULL; | |
104 | } | |
105 | ||
106 | Inserting data into an rbtree | |
107 | ----------------------------- | |
108 | ||
109 | Inserting data in the tree involves first searching for the place to insert the | |
110 | new node, then inserting the node and rebalancing ("recoloring") the tree. | |
111 | ||
112 | The search for insertion differs from the previous search by finding the | |
113 | location of the pointer on which to graft the new node. The new node also | |
114 | needs a link to its parent node for rebalancing purposes. | |
115 | ||
ce0f95a5 | 116 | Example:: |
c742b531 RL |
117 | |
118 | int my_insert(struct rb_root *root, struct mytype *data) | |
119 | { | |
120 | struct rb_node **new = &(root->rb_node), *parent = NULL; | |
121 | ||
122 | /* Figure out where to put new node */ | |
123 | while (*new) { | |
124 | struct mytype *this = container_of(*new, struct mytype, node); | |
125 | int result = strcmp(data->keystring, this->keystring); | |
126 | ||
127 | parent = *new; | |
128 | if (result < 0) | |
129 | new = &((*new)->rb_left); | |
130 | else if (result > 0) | |
131 | new = &((*new)->rb_right); | |
132 | else | |
133 | return FALSE; | |
134 | } | |
135 | ||
136 | /* Add new node and rebalance tree. */ | |
27af1da4 | 137 | rb_link_node(&data->node, parent, new); |
138 | rb_insert_color(&data->node, root); | |
c742b531 RL |
139 | |
140 | return TRUE; | |
141 | } | |
142 | ||
143 | Removing or replacing existing data in an rbtree | |
144 | ------------------------------------------------ | |
145 | ||
ce0f95a5 | 146 | To remove an existing node from a tree, call:: |
c742b531 RL |
147 | |
148 | void rb_erase(struct rb_node *victim, struct rb_root *tree); | |
149 | ||
ce0f95a5 | 150 | Example:: |
c742b531 | 151 | |
27af1da4 | 152 | struct mytype *data = my_search(&mytree, "walrus"); |
c742b531 RL |
153 | |
154 | if (data) { | |
27af1da4 | 155 | rb_erase(&data->node, &mytree); |
c742b531 RL |
156 | myfree(data); |
157 | } | |
158 | ||
ce0f95a5 | 159 | To replace an existing node in a tree with a new one with the same key, call:: |
c742b531 RL |
160 | |
161 | void rb_replace_node(struct rb_node *old, struct rb_node *new, | |
162 | struct rb_root *tree); | |
163 | ||
164 | Replacing a node this way does not re-sort the tree: If the new node doesn't | |
165 | have the same key as the old node, the rbtree will probably become corrupted. | |
166 | ||
167 | Iterating through the elements stored in an rbtree (in sort order) | |
168 | ------------------------------------------------------------------ | |
169 | ||
170 | Four functions are provided for iterating through an rbtree's contents in | |
171 | sorted order. These work on arbitrary trees, and should not need to be | |
ce0f95a5 | 172 | modified or wrapped (except for locking purposes):: |
c742b531 RL |
173 | |
174 | struct rb_node *rb_first(struct rb_root *tree); | |
175 | struct rb_node *rb_last(struct rb_root *tree); | |
176 | struct rb_node *rb_next(struct rb_node *node); | |
177 | struct rb_node *rb_prev(struct rb_node *node); | |
178 | ||
179 | To start iterating, call rb_first() or rb_last() with a pointer to the root | |
180 | of the tree, which will return a pointer to the node structure contained in | |
181 | the first or last element in the tree. To continue, fetch the next or previous | |
182 | node by calling rb_next() or rb_prev() on the current node. This will return | |
183 | NULL when there are no more nodes left. | |
184 | ||
185 | The iterator functions return a pointer to the embedded struct rb_node, from | |
186 | which the containing data structure may be accessed with the container_of() | |
187 | macro, and individual members may be accessed directly via | |
188 | rb_entry(node, type, member). | |
189 | ||
ce0f95a5 | 190 | Example:: |
c742b531 RL |
191 | |
192 | struct rb_node *node; | |
193 | for (node = rb_first(&mytree); node; node = rb_next(node)) | |
19034233 | 194 | printk("key=%s\n", rb_entry(node, struct mytype, node)->keystring); |
c742b531 | 195 | |
cd9e61ed DB |
196 | Cached rbtrees |
197 | -------------- | |
198 | ||
199 | Computing the leftmost (smallest) node is quite a common task for binary | |
200 | search trees, such as for traversals or users relying on a particular | |
201 | order for their own logic. To this end, users can use 'struct rb_root_cached' | |
202 | to optimize O(logN) rb_first() calls to a simple pointer fetch avoiding | |
203 | potentially expensive tree iterations. This is done at negligible runtime | |
204 | overhead for maintenance; albeit larger memory footprint. | |
205 | ||
206 | Similar to the rb_root structure, cached rbtrees are initialized to be | |
207 | empty via:: | |
208 | ||
209 | struct rb_root_cached mytree = RB_ROOT_CACHED; | |
210 | ||
211 | Cached rbtree is simply a regular rb_root with an extra pointer to cache the | |
212 | leftmost node. This allows rb_root_cached to exist wherever rb_root does, | |
213 | which permits augmented trees to be supported as well as only a few extra | |
214 | interfaces:: | |
215 | ||
216 | struct rb_node *rb_first_cached(struct rb_root_cached *tree); | |
217 | void rb_insert_color_cached(struct rb_node *, struct rb_root_cached *, bool); | |
218 | void rb_erase_cached(struct rb_node *node, struct rb_root_cached *); | |
219 | ||
220 | Both insert and erase calls have their respective counterpart of augmented | |
221 | trees:: | |
222 | ||
223 | void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *, | |
224 | bool, struct rb_augment_callbacks *); | |
225 | void rb_erase_augmented_cached(struct rb_node *, struct rb_root_cached *, | |
226 | struct rb_augment_callbacks *); | |
227 | ||
228 | ||
17d9ddc7 PV |
229 | Support for Augmented rbtrees |
230 | ----------------------------- | |
231 | ||
14b94af0 ML |
232 | Augmented rbtree is an rbtree with "some" additional data stored in |
233 | each node, where the additional data for node N must be a function of | |
234 | the contents of all nodes in the subtree rooted at N. This data can | |
235 | be used to augment some new functionality to rbtree. Augmented rbtree | |
236 | is an optional feature built on top of basic rbtree infrastructure. | |
237 | An rbtree user who wants this feature will have to call the augmentation | |
238 | functions with the user provided augmentation callback when inserting | |
239 | and erasing nodes. | |
2f175074 | 240 | |
9c079add | 241 | C files implementing augmented rbtree manipulation must include |
121e0248 | 242 | <linux/rbtree_augmented.h> instead of <linux/rbtree.h>. Note that |
9c079add ML |
243 | linux/rbtree_augmented.h exposes some rbtree implementation details |
244 | you are not expected to rely on; please stick to the documented APIs | |
245 | there and do not include <linux/rbtree_augmented.h> from header files | |
246 | either so as to minimize chances of your users accidentally relying on | |
247 | such implementation details. | |
248 | ||
14b94af0 ML |
249 | On insertion, the user must update the augmented information on the path |
250 | leading to the inserted node, then call rb_link_node() as usual and | |
251 | rb_insert_augmented() instead of the usual rb_insert_color() call. | |
252 | If rb_insert_augmented() rebalances the rbtree, it will call back into | |
253 | a user provided function to update the augmented information on the | |
254 | affected subtrees. | |
2f175074 | 255 | |
14b94af0 ML |
256 | When erasing a node, the user must call rb_erase_augmented() instead of |
257 | rb_erase(). rb_erase_augmented() calls back into user provided functions | |
258 | to update the augmented information on affected subtrees. | |
17d9ddc7 | 259 | |
14b94af0 ML |
260 | In both cases, the callbacks are provided through struct rb_augment_callbacks. |
261 | 3 callbacks must be defined: | |
262 | ||
263 | - A propagation callback, which updates the augmented value for a given | |
264 | node and its ancestors, up to a given stop point (or NULL to update | |
265 | all the way to the root). | |
266 | ||
267 | - A copy callback, which copies the augmented value for a given subtree | |
268 | to a newly assigned subtree root. | |
269 | ||
270 | - A tree rotation callback, which copies the augmented value for a given | |
271 | subtree to a newly assigned subtree root AND recomputes the augmented | |
272 | information for the former subtree root. | |
273 | ||
9c079add ML |
274 | The compiled code for rb_erase_augmented() may inline the propagation and |
275 | copy callbacks, which results in a large function, so each augmented rbtree | |
276 | user should have a single rb_erase_augmented() call site in order to limit | |
277 | compiled code size. | |
278 | ||
14b94af0 | 279 | |
ce0f95a5 MCC |
280 | Sample usage |
281 | ^^^^^^^^^^^^ | |
17d9ddc7 PV |
282 | |
283 | Interval tree is an example of augmented rb tree. Reference - | |
284 | "Introduction to Algorithms" by Cormen, Leiserson, Rivest and Stein. | |
285 | More details about interval trees: | |
286 | ||
287 | Classical rbtree has a single key and it cannot be directly used to store | |
288 | interval ranges like [lo:hi] and do a quick lookup for any overlap with a new | |
289 | lo:hi or to find whether there is an exact match for a new lo:hi. | |
290 | ||
291 | However, rbtree can be augmented to store such interval ranges in a structured | |
292 | way making it possible to do efficient lookup and exact match. | |
293 | ||
294 | This "extra information" stored in each node is the maximum hi | |
c98be0c9 | 295 | (max_hi) value among all the nodes that are its descendants. This |
17d9ddc7 PV |
296 | information can be maintained at each node just by looking at the node |
297 | and its immediate children. And this will be used in O(log n) lookup | |
298 | for lowest match (lowest start address among all possible matches) | |
ce0f95a5 | 299 | with something like:: |
17d9ddc7 | 300 | |
ce0f95a5 MCC |
301 | struct interval_tree_node * |
302 | interval_tree_first_match(struct rb_root *root, | |
303 | unsigned long start, unsigned long last) | |
304 | { | |
14b94af0 ML |
305 | struct interval_tree_node *node; |
306 | ||
307 | if (!root->rb_node) | |
308 | return NULL; | |
309 | node = rb_entry(root->rb_node, struct interval_tree_node, rb); | |
310 | ||
311 | while (true) { | |
312 | if (node->rb.rb_left) { | |
313 | struct interval_tree_node *left = | |
314 | rb_entry(node->rb.rb_left, | |
315 | struct interval_tree_node, rb); | |
316 | if (left->__subtree_last >= start) { | |
317 | /* | |
318 | * Some nodes in left subtree satisfy Cond2. | |
319 | * Iterate to find the leftmost such node N. | |
320 | * If it also satisfies Cond1, that's the match | |
321 | * we are looking for. Otherwise, there is no | |
322 | * matching interval as nodes to the right of N | |
323 | * can't satisfy Cond1 either. | |
324 | */ | |
325 | node = left; | |
326 | continue; | |
327 | } | |
17d9ddc7 | 328 | } |
14b94af0 ML |
329 | if (node->start <= last) { /* Cond1 */ |
330 | if (node->last >= start) /* Cond2 */ | |
331 | return node; /* node is leftmost match */ | |
332 | if (node->rb.rb_right) { | |
333 | node = rb_entry(node->rb.rb_right, | |
334 | struct interval_tree_node, rb); | |
335 | if (node->__subtree_last >= start) | |
336 | continue; | |
337 | } | |
338 | } | |
339 | return NULL; /* No match */ | |
340 | } | |
ce0f95a5 | 341 | } |
14b94af0 | 342 | |
ce0f95a5 | 343 | Insertion/removal are defined using the following augmented callbacks:: |
14b94af0 | 344 | |
ce0f95a5 MCC |
345 | static inline unsigned long |
346 | compute_subtree_last(struct interval_tree_node *node) | |
347 | { | |
14b94af0 ML |
348 | unsigned long max = node->last, subtree_last; |
349 | if (node->rb.rb_left) { | |
350 | subtree_last = rb_entry(node->rb.rb_left, | |
351 | struct interval_tree_node, rb)->__subtree_last; | |
352 | if (max < subtree_last) | |
353 | max = subtree_last; | |
354 | } | |
355 | if (node->rb.rb_right) { | |
356 | subtree_last = rb_entry(node->rb.rb_right, | |
357 | struct interval_tree_node, rb)->__subtree_last; | |
358 | if (max < subtree_last) | |
359 | max = subtree_last; | |
360 | } | |
361 | return max; | |
ce0f95a5 | 362 | } |
14b94af0 | 363 | |
ce0f95a5 MCC |
364 | static void augment_propagate(struct rb_node *rb, struct rb_node *stop) |
365 | { | |
14b94af0 ML |
366 | while (rb != stop) { |
367 | struct interval_tree_node *node = | |
368 | rb_entry(rb, struct interval_tree_node, rb); | |
369 | unsigned long subtree_last = compute_subtree_last(node); | |
370 | if (node->__subtree_last == subtree_last) | |
371 | break; | |
372 | node->__subtree_last = subtree_last; | |
373 | rb = rb_parent(&node->rb); | |
374 | } | |
ce0f95a5 | 375 | } |
14b94af0 | 376 | |
ce0f95a5 MCC |
377 | static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new) |
378 | { | |
14b94af0 ML |
379 | struct interval_tree_node *old = |
380 | rb_entry(rb_old, struct interval_tree_node, rb); | |
381 | struct interval_tree_node *new = | |
382 | rb_entry(rb_new, struct interval_tree_node, rb); | |
383 | ||
384 | new->__subtree_last = old->__subtree_last; | |
ce0f95a5 | 385 | } |
14b94af0 | 386 | |
ce0f95a5 MCC |
387 | static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new) |
388 | { | |
14b94af0 ML |
389 | struct interval_tree_node *old = |
390 | rb_entry(rb_old, struct interval_tree_node, rb); | |
391 | struct interval_tree_node *new = | |
392 | rb_entry(rb_new, struct interval_tree_node, rb); | |
393 | ||
394 | new->__subtree_last = old->__subtree_last; | |
395 | old->__subtree_last = compute_subtree_last(old); | |
ce0f95a5 | 396 | } |
14b94af0 | 397 | |
ce0f95a5 | 398 | static const struct rb_augment_callbacks augment_callbacks = { |
14b94af0 | 399 | augment_propagate, augment_copy, augment_rotate |
ce0f95a5 | 400 | }; |
14b94af0 | 401 | |
ce0f95a5 MCC |
402 | void interval_tree_insert(struct interval_tree_node *node, |
403 | struct rb_root *root) | |
404 | { | |
14b94af0 ML |
405 | struct rb_node **link = &root->rb_node, *rb_parent = NULL; |
406 | unsigned long start = node->start, last = node->last; | |
407 | struct interval_tree_node *parent; | |
408 | ||
409 | while (*link) { | |
410 | rb_parent = *link; | |
411 | parent = rb_entry(rb_parent, struct interval_tree_node, rb); | |
412 | if (parent->__subtree_last < last) | |
413 | parent->__subtree_last = last; | |
414 | if (start < parent->start) | |
415 | link = &parent->rb.rb_left; | |
416 | else | |
417 | link = &parent->rb.rb_right; | |
17d9ddc7 | 418 | } |
14b94af0 ML |
419 | |
420 | node->__subtree_last = last; | |
421 | rb_link_node(&node->rb, rb_parent, link); | |
422 | rb_insert_augmented(&node->rb, root, &augment_callbacks); | |
ce0f95a5 | 423 | } |
17d9ddc7 | 424 | |
ce0f95a5 MCC |
425 | void interval_tree_remove(struct interval_tree_node *node, |
426 | struct rb_root *root) | |
427 | { | |
14b94af0 | 428 | rb_erase_augmented(&node->rb, root, &augment_callbacks); |
ce0f95a5 | 429 | } |