]>
Commit | Line | Data |
---|---|---|
37fb3e43 PD |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * This file and its contents are supplied under the terms of the | |
5 | * Common Development and Distribution License ("CDDL"), version 1.0. | |
6 | * You may only use this file in accordance with the terms of version | |
7 | * 1.0 of the CDDL. | |
8 | * | |
9 | * A full copy of the text of the CDDL should have accompanied this | |
10 | * source. A copy of the CDDL is also available via the Internet at | |
11 | * http://www.illumos.org/license/CDDL. | |
12 | * | |
13 | * CDDL HEADER END | |
14 | */ | |
15 | /* | |
ae3d8491 | 16 | * Copyright (c) 2017, 2018 by Delphix. All rights reserved. |
37fb3e43 PD |
17 | */ |
18 | ||
19 | #include <sys/zfs_context.h> | |
20 | #include <sys/aggsum.h> | |
21 | ||
22 | /* | |
23 | * Aggregate-sum counters are a form of fanned-out counter, used when atomic | |
24 | * instructions on a single field cause enough CPU cache line contention to | |
25 | * slow system performance. Due to their increased overhead and the expense | |
26 | * involved with precisely reading from them, they should only be used in cases | |
27 | * where the write rate (increment/decrement) is much higher than the read rate | |
28 | * (get value). | |
29 | * | |
30 | * Aggregate sum counters are comprised of two basic parts, the core and the | |
31 | * buckets. The core counter contains a lock for the entire counter, as well | |
32 | * as the current upper and lower bounds on the value of the counter. The | |
33 | * aggsum_bucket structure contains a per-bucket lock to protect the contents of | |
34 | * the bucket, the current amount that this bucket has changed from the global | |
35 | * counter (called the delta), and the amount of increment and decrement we have | |
36 | * "borrowed" from the core counter. | |
37 | * | |
38 | * The basic operation of an aggsum is simple. Threads that wish to modify the | |
39 | * counter will modify one bucket's counter (determined by their current CPU, to | |
40 | * help minimize lock and cache contention). If the bucket already has | |
41 | * sufficient capacity borrowed from the core structure to handle their request, | |
42 | * they simply modify the delta and return. If the bucket does not, we clear | |
43 | * the bucket's current state (to prevent the borrowed amounts from getting too | |
44 | * large), and borrow more from the core counter. Borrowing is done by adding to | |
45 | * the upper bound (or subtracting from the lower bound) of the core counter, | |
46 | * and setting the borrow value for the bucket to the amount added (or | |
47 | * subtracted). Clearing the bucket is the opposite; we add the current delta | |
48 | * to both the lower and upper bounds of the core counter, subtract the borrowed | |
49 | * incremental from the upper bound, and add the borrowed decrement from the | |
50 | * lower bound. Note that only borrowing and clearing require access to the | |
51 | * core counter; since all other operations access CPU-local resources, | |
52 | * performance can be much higher than a traditional counter. | |
53 | * | |
54 | * Threads that wish to read from the counter have a slightly more challenging | |
55 | * task. It is fast to determine the upper and lower bounds of the aggum; this | |
56 | * does not require grabbing any locks. This suffices for cases where an | |
57 | * approximation of the aggsum's value is acceptable. However, if one needs to | |
58 | * know whether some specific value is above or below the current value in the | |
59 | * aggsum, they invoke aggsum_compare(). This function operates by repeatedly | |
60 | * comparing the target value to the upper and lower bounds of the aggsum, and | |
61 | * then clearing a bucket. This proceeds until the target is outside of the | |
62 | * upper and lower bounds and we return a response, or the last bucket has been | |
63 | * cleared and we know that the target is equal to the aggsum's value. Finally, | |
64 | * the most expensive operation is determining the precise value of the aggsum. | |
65 | * To do this, we clear every bucket and then return the upper bound (which must | |
66 | * be equal to the lower bound). What makes aggsum_compare() and aggsum_value() | |
67 | * expensive is clearing buckets. This involves grabbing the global lock | |
68 | * (serializing against themselves and borrow operations), grabbing a bucket's | |
69 | * lock (preventing threads on those CPUs from modifying their delta), and | |
70 | * zeroing out the borrowed value (forcing that thread to borrow on its next | |
71 | * request, which will also be expensive). This is what makes aggsums well | |
72 | * suited for write-many read-rarely operations. | |
60a4c7d2 PD |
73 | * |
74 | * Note that the aggsums do not expand if more CPUs are hot-added. In that | |
75 | * case, we will have less fanout than boot_ncpus, but we don't want to always | |
76 | * reserve the RAM necessary to create the extra slots for additional CPUs up | |
77 | * front, and dynamically adding them is a complex task. | |
37fb3e43 PD |
78 | */ |
79 | ||
80 | /* | |
ea400129 AM |
81 | * We will borrow 2^aggsum_borrow_shift times the current request, so we will |
82 | * have to get the as_lock approximately every 2^aggsum_borrow_shift calls to | |
83 | * aggsum_add(). | |
37fb3e43 | 84 | */ |
ea400129 | 85 | static uint_t aggsum_borrow_shift = 4; |
37fb3e43 PD |
86 | |
87 | void | |
88 | aggsum_init(aggsum_t *as, uint64_t value) | |
89 | { | |
861166b0 | 90 | memset(as, 0, sizeof (*as)); |
37fb3e43 PD |
91 | as->as_lower_bound = as->as_upper_bound = value; |
92 | mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL); | |
ea400129 AM |
93 | /* |
94 | * Too many buckets may hurt read performance without improving | |
95 | * write. From 12 CPUs use bucket per 2 CPUs, from 48 per 4, etc. | |
96 | */ | |
97 | as->as_bucketshift = highbit64(boot_ncpus / 6) / 2; | |
98 | as->as_numbuckets = ((boot_ncpus - 1) >> as->as_bucketshift) + 1; | |
99 | as->as_buckets = kmem_zalloc(as->as_numbuckets * | |
100 | sizeof (aggsum_bucket_t), KM_SLEEP); | |
37fb3e43 PD |
101 | for (int i = 0; i < as->as_numbuckets; i++) { |
102 | mutex_init(&as->as_buckets[i].asc_lock, | |
103 | NULL, MUTEX_DEFAULT, NULL); | |
104 | } | |
105 | } | |
106 | ||
107 | void | |
108 | aggsum_fini(aggsum_t *as) | |
109 | { | |
110 | for (int i = 0; i < as->as_numbuckets; i++) | |
111 | mutex_destroy(&as->as_buckets[i].asc_lock); | |
112 | kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t)); | |
113 | mutex_destroy(&as->as_lock); | |
114 | } | |
115 | ||
116 | int64_t | |
117 | aggsum_lower_bound(aggsum_t *as) | |
118 | { | |
ea400129 | 119 | return (atomic_load_64((volatile uint64_t *)&as->as_lower_bound)); |
37fb3e43 PD |
120 | } |
121 | ||
ea400129 | 122 | uint64_t |
37fb3e43 PD |
123 | aggsum_upper_bound(aggsum_t *as) |
124 | { | |
ea400129 | 125 | return (atomic_load_64(&as->as_upper_bound)); |
37fb3e43 PD |
126 | } |
127 | ||
128 | uint64_t | |
129 | aggsum_value(aggsum_t *as) | |
130 | { | |
ea400129 AM |
131 | int64_t lb; |
132 | uint64_t ub; | |
37fb3e43 PD |
133 | |
134 | mutex_enter(&as->as_lock); | |
ea400129 AM |
135 | lb = as->as_lower_bound; |
136 | ub = as->as_upper_bound; | |
137 | if (lb == ub) { | |
37fb3e43 PD |
138 | for (int i = 0; i < as->as_numbuckets; i++) { |
139 | ASSERT0(as->as_buckets[i].asc_delta); | |
140 | ASSERT0(as->as_buckets[i].asc_borrowed); | |
141 | } | |
142 | mutex_exit(&as->as_lock); | |
ea400129 | 143 | return (lb); |
37fb3e43 PD |
144 | } |
145 | for (int i = 0; i < as->as_numbuckets; i++) { | |
146 | struct aggsum_bucket *asb = &as->as_buckets[i]; | |
ea400129 AM |
147 | if (asb->asc_borrowed == 0) |
148 | continue; | |
37fb3e43 | 149 | mutex_enter(&asb->asc_lock); |
ea400129 AM |
150 | lb += asb->asc_delta + asb->asc_borrowed; |
151 | ub += asb->asc_delta - asb->asc_borrowed; | |
152 | asb->asc_delta = 0; | |
153 | asb->asc_borrowed = 0; | |
37fb3e43 PD |
154 | mutex_exit(&asb->asc_lock); |
155 | } | |
ea400129 AM |
156 | ASSERT3U(lb, ==, ub); |
157 | atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb); | |
158 | atomic_store_64(&as->as_upper_bound, lb); | |
37fb3e43 PD |
159 | mutex_exit(&as->as_lock); |
160 | ||
ea400129 | 161 | return (lb); |
37fb3e43 PD |
162 | } |
163 | ||
37fb3e43 PD |
164 | void |
165 | aggsum_add(aggsum_t *as, int64_t delta) | |
166 | { | |
174bcd58 | 167 | struct aggsum_bucket *asb; |
e0ce98d5 | 168 | int64_t borrow; |
174bcd58 | 169 | |
ea400129 AM |
170 | asb = &as->as_buckets[(CPU_SEQID_UNSTABLE >> as->as_bucketshift) % |
171 | as->as_numbuckets]; | |
37fb3e43 | 172 | |
e0ce98d5 AM |
173 | /* Try fast path if we already borrowed enough before. */ |
174 | mutex_enter(&asb->asc_lock); | |
175 | if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed && | |
176 | asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) { | |
177 | asb->asc_delta += delta; | |
37fb3e43 | 178 | mutex_exit(&asb->asc_lock); |
e0ce98d5 | 179 | return; |
37fb3e43 | 180 | } |
e0ce98d5 AM |
181 | mutex_exit(&asb->asc_lock); |
182 | ||
183 | /* | |
184 | * We haven't borrowed enough. Take the global lock and borrow | |
185 | * considering what is requested now and what we borrowed before. | |
186 | */ | |
ea400129 AM |
187 | borrow = (delta < 0 ? -delta : delta); |
188 | borrow <<= aggsum_borrow_shift + as->as_bucketshift; | |
e0ce98d5 | 189 | mutex_enter(&as->as_lock); |
e0ce98d5 AM |
190 | if (borrow >= asb->asc_borrowed) |
191 | borrow -= asb->asc_borrowed; | |
192 | else | |
193 | borrow = (borrow - (int64_t)asb->asc_borrowed) / 4; | |
ea400129 AM |
194 | mutex_enter(&asb->asc_lock); |
195 | delta += asb->asc_delta; | |
196 | asb->asc_delta = 0; | |
e0ce98d5 | 197 | asb->asc_borrowed += borrow; |
e0ce98d5 | 198 | mutex_exit(&asb->asc_lock); |
ea400129 AM |
199 | atomic_store_64((volatile uint64_t *)&as->as_lower_bound, |
200 | as->as_lower_bound + delta - borrow); | |
201 | atomic_store_64(&as->as_upper_bound, | |
202 | as->as_upper_bound + delta + borrow); | |
e0ce98d5 | 203 | mutex_exit(&as->as_lock); |
37fb3e43 PD |
204 | } |
205 | ||
206 | /* | |
207 | * Compare the aggsum value to target efficiently. Returns -1 if the value | |
208 | * represented by the aggsum is less than target, 1 if it's greater, and 0 if | |
209 | * they are equal. | |
210 | */ | |
211 | int | |
212 | aggsum_compare(aggsum_t *as, uint64_t target) | |
213 | { | |
ea400129 AM |
214 | int64_t lb; |
215 | uint64_t ub; | |
216 | int i; | |
217 | ||
218 | if (atomic_load_64(&as->as_upper_bound) < target) | |
37fb3e43 | 219 | return (-1); |
ea400129 AM |
220 | lb = atomic_load_64((volatile uint64_t *)&as->as_lower_bound); |
221 | if (lb > 0 && (uint64_t)lb > target) | |
37fb3e43 PD |
222 | return (1); |
223 | mutex_enter(&as->as_lock); | |
ea400129 AM |
224 | lb = as->as_lower_bound; |
225 | ub = as->as_upper_bound; | |
226 | for (i = 0; i < as->as_numbuckets; i++) { | |
37fb3e43 | 227 | struct aggsum_bucket *asb = &as->as_buckets[i]; |
ea400129 AM |
228 | if (asb->asc_borrowed == 0) |
229 | continue; | |
37fb3e43 | 230 | mutex_enter(&asb->asc_lock); |
ea400129 AM |
231 | lb += asb->asc_delta + asb->asc_borrowed; |
232 | ub += asb->asc_delta - asb->asc_borrowed; | |
233 | asb->asc_delta = 0; | |
234 | asb->asc_borrowed = 0; | |
37fb3e43 | 235 | mutex_exit(&asb->asc_lock); |
ea400129 AM |
236 | if (ub < target || (lb > 0 && (uint64_t)lb > target)) |
237 | break; | |
37fb3e43 | 238 | } |
ea400129 AM |
239 | if (i >= as->as_numbuckets) |
240 | ASSERT3U(lb, ==, ub); | |
241 | atomic_store_64((volatile uint64_t *)&as->as_lower_bound, lb); | |
242 | atomic_store_64(&as->as_upper_bound, ub); | |
37fb3e43 | 243 | mutex_exit(&as->as_lock); |
ea400129 | 244 | return (ub < target ? -1 : (uint64_t)lb > target ? 1 : 0); |
37fb3e43 | 245 | } |