]>
Commit | Line | Data |
---|---|---|
37fb3e43 PD |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * This file and its contents are supplied under the terms of the | |
5 | * Common Development and Distribution License ("CDDL"), version 1.0. | |
6 | * You may only use this file in accordance with the terms of version | |
7 | * 1.0 of the CDDL. | |
8 | * | |
9 | * A full copy of the text of the CDDL should have accompanied this | |
10 | * source. A copy of the CDDL is also available via the Internet at | |
11 | * http://www.illumos.org/license/CDDL. | |
12 | * | |
13 | * CDDL HEADER END | |
14 | */ | |
15 | /* | |
ae3d8491 | 16 | * Copyright (c) 2017, 2018 by Delphix. All rights reserved. |
37fb3e43 PD |
17 | */ |
18 | ||
19 | #include <sys/zfs_context.h> | |
20 | #include <sys/aggsum.h> | |
21 | ||
22 | /* | |
23 | * Aggregate-sum counters are a form of fanned-out counter, used when atomic | |
24 | * instructions on a single field cause enough CPU cache line contention to | |
25 | * slow system performance. Due to their increased overhead and the expense | |
26 | * involved with precisely reading from them, they should only be used in cases | |
27 | * where the write rate (increment/decrement) is much higher than the read rate | |
28 | * (get value). | |
29 | * | |
30 | * Aggregate sum counters are comprised of two basic parts, the core and the | |
31 | * buckets. The core counter contains a lock for the entire counter, as well | |
32 | * as the current upper and lower bounds on the value of the counter. The | |
33 | * aggsum_bucket structure contains a per-bucket lock to protect the contents of | |
34 | * the bucket, the current amount that this bucket has changed from the global | |
35 | * counter (called the delta), and the amount of increment and decrement we have | |
36 | * "borrowed" from the core counter. | |
37 | * | |
38 | * The basic operation of an aggsum is simple. Threads that wish to modify the | |
39 | * counter will modify one bucket's counter (determined by their current CPU, to | |
40 | * help minimize lock and cache contention). If the bucket already has | |
41 | * sufficient capacity borrowed from the core structure to handle their request, | |
42 | * they simply modify the delta and return. If the bucket does not, we clear | |
43 | * the bucket's current state (to prevent the borrowed amounts from getting too | |
44 | * large), and borrow more from the core counter. Borrowing is done by adding to | |
45 | * the upper bound (or subtracting from the lower bound) of the core counter, | |
46 | * and setting the borrow value for the bucket to the amount added (or | |
47 | * subtracted). Clearing the bucket is the opposite; we add the current delta | |
48 | * to both the lower and upper bounds of the core counter, subtract the borrowed | |
49 | * incremental from the upper bound, and add the borrowed decrement from the | |
50 | * lower bound. Note that only borrowing and clearing require access to the | |
51 | * core counter; since all other operations access CPU-local resources, | |
52 | * performance can be much higher than a traditional counter. | |
53 | * | |
54 | * Threads that wish to read from the counter have a slightly more challenging | |
55 | * task. It is fast to determine the upper and lower bounds of the aggum; this | |
56 | * does not require grabbing any locks. This suffices for cases where an | |
57 | * approximation of the aggsum's value is acceptable. However, if one needs to | |
58 | * know whether some specific value is above or below the current value in the | |
59 | * aggsum, they invoke aggsum_compare(). This function operates by repeatedly | |
60 | * comparing the target value to the upper and lower bounds of the aggsum, and | |
61 | * then clearing a bucket. This proceeds until the target is outside of the | |
62 | * upper and lower bounds and we return a response, or the last bucket has been | |
63 | * cleared and we know that the target is equal to the aggsum's value. Finally, | |
64 | * the most expensive operation is determining the precise value of the aggsum. | |
65 | * To do this, we clear every bucket and then return the upper bound (which must | |
66 | * be equal to the lower bound). What makes aggsum_compare() and aggsum_value() | |
67 | * expensive is clearing buckets. This involves grabbing the global lock | |
68 | * (serializing against themselves and borrow operations), grabbing a bucket's | |
69 | * lock (preventing threads on those CPUs from modifying their delta), and | |
70 | * zeroing out the borrowed value (forcing that thread to borrow on its next | |
71 | * request, which will also be expensive). This is what makes aggsums well | |
72 | * suited for write-many read-rarely operations. | |
73 | */ | |
74 | ||
75 | /* | |
76 | * We will borrow aggsum_borrow_multiplier times the current request, so we will | |
77 | * have to get the as_lock approximately every aggsum_borrow_multiplier calls to | |
78 | * aggsum_delta(). | |
79 | */ | |
80 | static uint_t aggsum_borrow_multiplier = 10; | |
81 | ||
82 | void | |
83 | aggsum_init(aggsum_t *as, uint64_t value) | |
84 | { | |
85 | bzero(as, sizeof (*as)); | |
86 | as->as_lower_bound = as->as_upper_bound = value; | |
87 | mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL); | |
88 | as->as_numbuckets = boot_ncpus; | |
89 | as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t), | |
90 | KM_SLEEP); | |
91 | for (int i = 0; i < as->as_numbuckets; i++) { | |
92 | mutex_init(&as->as_buckets[i].asc_lock, | |
93 | NULL, MUTEX_DEFAULT, NULL); | |
94 | } | |
95 | } | |
96 | ||
97 | void | |
98 | aggsum_fini(aggsum_t *as) | |
99 | { | |
100 | for (int i = 0; i < as->as_numbuckets; i++) | |
101 | mutex_destroy(&as->as_buckets[i].asc_lock); | |
102 | kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t)); | |
103 | mutex_destroy(&as->as_lock); | |
104 | } | |
105 | ||
106 | int64_t | |
107 | aggsum_lower_bound(aggsum_t *as) | |
108 | { | |
109 | return (as->as_lower_bound); | |
110 | } | |
111 | ||
112 | int64_t | |
113 | aggsum_upper_bound(aggsum_t *as) | |
114 | { | |
115 | return (as->as_upper_bound); | |
116 | } | |
117 | ||
118 | static void | |
119 | aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb) | |
120 | { | |
121 | ASSERT(MUTEX_HELD(&as->as_lock)); | |
122 | ASSERT(MUTEX_HELD(&asb->asc_lock)); | |
123 | ||
124 | /* | |
125 | * We use atomic instructions for this because we read the upper and | |
126 | * lower bounds without the lock, so we need stores to be atomic. | |
127 | */ | |
128 | atomic_add_64((volatile uint64_t *)&as->as_lower_bound, asb->asc_delta); | |
129 | atomic_add_64((volatile uint64_t *)&as->as_upper_bound, asb->asc_delta); | |
130 | asb->asc_delta = 0; | |
131 | atomic_add_64((volatile uint64_t *)&as->as_upper_bound, | |
132 | -asb->asc_borrowed); | |
133 | atomic_add_64((volatile uint64_t *)&as->as_lower_bound, | |
134 | asb->asc_borrowed); | |
135 | asb->asc_borrowed = 0; | |
136 | } | |
137 | ||
138 | uint64_t | |
139 | aggsum_value(aggsum_t *as) | |
140 | { | |
141 | int64_t rv; | |
142 | ||
143 | mutex_enter(&as->as_lock); | |
144 | if (as->as_lower_bound == as->as_upper_bound) { | |
145 | rv = as->as_lower_bound; | |
146 | for (int i = 0; i < as->as_numbuckets; i++) { | |
147 | ASSERT0(as->as_buckets[i].asc_delta); | |
148 | ASSERT0(as->as_buckets[i].asc_borrowed); | |
149 | } | |
150 | mutex_exit(&as->as_lock); | |
151 | return (rv); | |
152 | } | |
153 | for (int i = 0; i < as->as_numbuckets; i++) { | |
154 | struct aggsum_bucket *asb = &as->as_buckets[i]; | |
155 | mutex_enter(&asb->asc_lock); | |
156 | aggsum_flush_bucket(as, asb); | |
157 | mutex_exit(&asb->asc_lock); | |
158 | } | |
159 | VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); | |
160 | rv = as->as_lower_bound; | |
161 | mutex_exit(&as->as_lock); | |
162 | ||
163 | return (rv); | |
164 | } | |
165 | ||
166 | static void | |
167 | aggsum_borrow(aggsum_t *as, int64_t delta, struct aggsum_bucket *asb) | |
168 | { | |
169 | int64_t abs_delta = (delta < 0 ? -delta : delta); | |
170 | mutex_enter(&as->as_lock); | |
171 | mutex_enter(&asb->asc_lock); | |
172 | ||
173 | aggsum_flush_bucket(as, asb); | |
174 | ||
175 | atomic_add_64((volatile uint64_t *)&as->as_upper_bound, abs_delta); | |
176 | atomic_add_64((volatile uint64_t *)&as->as_lower_bound, -abs_delta); | |
177 | asb->asc_borrowed = abs_delta; | |
178 | ||
179 | mutex_exit(&asb->asc_lock); | |
180 | mutex_exit(&as->as_lock); | |
181 | } | |
182 | ||
183 | void | |
184 | aggsum_add(aggsum_t *as, int64_t delta) | |
185 | { | |
174bcd58 BB |
186 | struct aggsum_bucket *asb; |
187 | ||
188 | kpreempt_disable(); | |
189 | asb = &as->as_buckets[CPU_SEQID % as->as_numbuckets]; | |
190 | kpreempt_enable(); | |
37fb3e43 PD |
191 | |
192 | for (;;) { | |
193 | mutex_enter(&asb->asc_lock); | |
194 | if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed && | |
195 | asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) { | |
196 | asb->asc_delta += delta; | |
197 | mutex_exit(&asb->asc_lock); | |
198 | return; | |
199 | } | |
200 | mutex_exit(&asb->asc_lock); | |
201 | aggsum_borrow(as, delta * aggsum_borrow_multiplier, asb); | |
202 | } | |
203 | } | |
204 | ||
205 | /* | |
206 | * Compare the aggsum value to target efficiently. Returns -1 if the value | |
207 | * represented by the aggsum is less than target, 1 if it's greater, and 0 if | |
208 | * they are equal. | |
209 | */ | |
210 | int | |
211 | aggsum_compare(aggsum_t *as, uint64_t target) | |
212 | { | |
213 | if (as->as_upper_bound < target) | |
214 | return (-1); | |
215 | if (as->as_lower_bound > target) | |
216 | return (1); | |
217 | mutex_enter(&as->as_lock); | |
218 | for (int i = 0; i < as->as_numbuckets; i++) { | |
219 | struct aggsum_bucket *asb = &as->as_buckets[i]; | |
220 | mutex_enter(&asb->asc_lock); | |
221 | aggsum_flush_bucket(as, asb); | |
222 | mutex_exit(&asb->asc_lock); | |
223 | if (as->as_upper_bound < target) { | |
224 | mutex_exit(&as->as_lock); | |
225 | return (-1); | |
226 | } | |
227 | if (as->as_lower_bound > target) { | |
228 | mutex_exit(&as->as_lock); | |
229 | return (1); | |
230 | } | |
231 | } | |
232 | VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); | |
233 | ASSERT3U(as->as_lower_bound, ==, target); | |
234 | mutex_exit(&as->as_lock); | |
235 | return (0); | |
236 | } |