]>
Commit | Line | Data |
---|---|---|
d19720a9 | 1 | Reference-count design for elements of lists/arrays protected by RCU. |
c0dfb290 | 2 | |
9963185c PM |
3 | |
4 | Please note that the percpu-ref feature is likely your first | |
5 | stop if you need to combine reference counts and RCU. Please see | |
6 | include/linux/percpu-refcount.h for more information. However, in | |
7 | those unusual cases where percpu-ref would consume too much memory, | |
8 | please read on. | |
9 | ||
10 | ------------------------------------------------------------------------ | |
11 | ||
d19720a9 PM |
12 | Reference counting on elements of lists which are protected by traditional |
13 | reader/writer spinlocks or semaphores is straightforward: | |
c0dfb290 | 14 | |
de1dbcee | 15 | CODE LISTING A: |
095975da NP |
16 | 1. 2. |
17 | add() search_and_reference() | |
18 | { { | |
19 | alloc_object read_lock(&list_lock); | |
20 | ... search_for_element | |
21 | atomic_set(&el->rc, 1); atomic_inc(&el->rc); | |
22 | write_lock(&list_lock); ... | |
23 | add_element read_unlock(&list_lock); | |
24 | ... ... | |
25 | write_unlock(&list_lock); } | |
c0dfb290 DS |
26 | } |
27 | ||
28 | 3. 4. | |
29 | release_referenced() delete() | |
30 | { { | |
095975da | 31 | ... write_lock(&list_lock); |
de1dbcee JFG |
32 | if(atomic_dec_and_test(&el->rc)) ... |
33 | kfree(el); | |
a4d611fd | 34 | ... remove_element |
095975da NP |
35 | } write_unlock(&list_lock); |
36 | ... | |
37 | if (atomic_dec_and_test(&el->rc)) | |
38 | kfree(el); | |
39 | ... | |
c0dfb290 DS |
40 | } |
41 | ||
d19720a9 | 42 | If this list/array is made lock-free using RCU, as in changing the |
e8aed686 LJ |
43 | write_lock() in add() and delete() to spin_lock() and changing read_lock() |
44 | in search_and_reference() to rcu_read_lock(), the atomic_inc() in | |
45 | search_and_reference() could potentially hold a reference to an element which | |
d19720a9 PM |
46 | has already been deleted from the list/array. Use atomic_inc_not_zero() |
47 | in this scenario as follows: | |
c0dfb290 | 48 | |
de1dbcee | 49 | CODE LISTING B: |
c0dfb290 DS |
50 | 1. 2. |
51 | add() search_and_reference() | |
52 | { { | |
095975da NP |
53 | alloc_object rcu_read_lock(); |
54 | ... search_for_element | |
e8aed686 LJ |
55 | atomic_set(&el->rc, 1); if (!atomic_inc_not_zero(&el->rc)) { |
56 | spin_lock(&list_lock); rcu_read_unlock(); | |
095975da NP |
57 | return FAIL; |
58 | add_element } | |
59 | ... ... | |
e8aed686 | 60 | spin_unlock(&list_lock); rcu_read_unlock(); |
c0dfb290 DS |
61 | } } |
62 | 3. 4. | |
63 | release_referenced() delete() | |
64 | { { | |
e8aed686 | 65 | ... spin_lock(&list_lock); |
d19720a9 | 66 | if (atomic_dec_and_test(&el->rc)) ... |
a4d611fd | 67 | call_rcu(&el->head, el_free); remove_element |
e8aed686 | 68 | ... spin_unlock(&list_lock); |
d19720a9 | 69 | } ... |
095975da NP |
70 | if (atomic_dec_and_test(&el->rc)) |
71 | call_rcu(&el->head, el_free); | |
72 | ... | |
c0dfb290 DS |
73 | } |
74 | ||
d19720a9 PM |
75 | Sometimes, a reference to the element needs to be obtained in the |
76 | update (write) stream. In such cases, atomic_inc_not_zero() might be | |
77 | overkill, since we hold the update-side spinlock. One might instead | |
78 | use atomic_inc() in such cases. | |
a4d611fd PM |
79 | |
80 | It is not always convenient to deal with "FAIL" in the | |
81 | search_and_reference() code path. In such cases, the | |
82 | atomic_dec_and_test() may be moved from delete() to el_free() | |
83 | as follows: | |
84 | ||
de1dbcee | 85 | CODE LISTING C: |
a4d611fd PM |
86 | 1. 2. |
87 | add() search_and_reference() | |
88 | { { | |
89 | alloc_object rcu_read_lock(); | |
90 | ... search_for_element | |
91 | atomic_set(&el->rc, 1); atomic_inc(&el->rc); | |
92 | spin_lock(&list_lock); ... | |
93 | ||
94 | add_element rcu_read_unlock(); | |
95 | ... } | |
96 | spin_unlock(&list_lock); 4. | |
97 | } delete() | |
98 | 3. { | |
99 | release_referenced() spin_lock(&list_lock); | |
100 | { ... | |
101 | ... remove_element | |
102 | if (atomic_dec_and_test(&el->rc)) spin_unlock(&list_lock); | |
103 | kfree(el); ... | |
104 | ... call_rcu(&el->head, el_free); | |
105 | } ... | |
106 | 5. } | |
107 | void el_free(struct rcu_head *rhp) | |
108 | { | |
109 | release_referenced(); | |
110 | } | |
111 | ||
112 | The key point is that the initial reference added by add() is not removed | |
113 | until after a grace period has elapsed following removal. This means that, | |
114 | by then, search_and_reference() can no longer find this element, so the value | |
115 | of el->rc cannot increase. Thus, once it reaches zero, there are no | |
116 | readers that can or ever will be able to reference the element. The | |
117 | element can therefore safely be freed. This in turn guarantees that if | |
118 | any reader finds the element, that reader may safely acquire a reference | |
119 | without checking the value of the reference counter. | |
120 | ||
de1dbcee JFG |
121 | A clear advantage of the RCU-based pattern in listing C over the one |
122 | in listing B is that any call to search_and_reference() that locates | |
123 | a given object will succeed in obtaining a reference to that object, | |
124 | even given a concurrent invocation of delete() for that same object. | |
125 | Similarly, a clear advantage of both listings B and C over listing A is | |
126 | that a call to delete() is not delayed even if there are an arbitrarily | |
127 | large number of calls to search_and_reference() searching for the same | |
128 | object that delete() was invoked on. Instead, all that is delayed is | |
129 | the eventual invocation of kfree(), which is usually not a problem on | |
130 | modern computer systems, even the small ones. | |
131 | ||
a4d611fd PM |
132 | In cases where delete() can sleep, synchronize_rcu() can be called from |
133 | delete(), so that el_free() can be subsumed into delete() as follows: | |
134 | ||
135 | 4. | |
136 | delete() | |
137 | { | |
138 | spin_lock(&list_lock); | |
139 | ... | |
140 | remove_element | |
141 | spin_unlock(&list_lock); | |
142 | ... | |
143 | synchronize_rcu(); | |
144 | if (atomic_dec_and_test(&el->rc)) | |
145 | kfree(el); | |
146 | ... | |
147 | } | |
de1dbcee JFG |
148 | |
149 | As additional examples in the kernel, the pattern in listing C is used by | |
150 | reference counting of struct pid, while the pattern in listing B is used by | |
151 | struct posix_acl. |