]>
Commit | Line | Data |
---|---|---|
2613eab1 KK |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Historical Service Time | |
4 | * | |
5 | * Keeps a time-weighted exponential moving average of the historical | |
6 | * service time. Estimates future service time based on the historical | |
7 | * service time and the number of outstanding requests. | |
8 | * | |
9 | * Marks paths stale if they have not finished within hst * | |
10 | * num_paths. If a path is stale and unused, we will send a single | |
11 | * request to probe in case the path has improved. This situation | |
12 | * generally arises if the path is so much worse than others that it | |
13 | * will never have the best estimated service time, or if the entire | |
14 | * multipath device is unused. If a path is stale and in use, limit the | |
15 | * number of requests it can receive with the assumption that the path | |
16 | * has become degraded. | |
17 | * | |
18 | * To avoid repeatedly calculating exponents for time weighting, times | |
19 | * are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT) | |
20 | * ns, and the weighting is pre-calculated. | |
21 | * | |
22 | */ | |
23 | ||
24 | #include "dm.h" | |
25 | #include "dm-path-selector.h" | |
26 | ||
27 | #include <linux/blkdev.h> | |
28 | #include <linux/slab.h> | |
29 | #include <linux/module.h> | |
30 | ||
31 | ||
32 | #define DM_MSG_PREFIX "multipath historical-service-time" | |
33 | #define HST_MIN_IO 1 | |
34 | #define HST_VERSION "0.1.1" | |
35 | ||
36 | #define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */ | |
37 | #define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT) | |
38 | #define HST_FIXED_1 (1 << HST_FIXED_SHIFT) | |
39 | #define HST_FIXED_95 972 | |
40 | #define HST_MAX_INFLIGHT HST_FIXED_1 | |
41 | #define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */ | |
42 | #define HST_WEIGHT_COUNT 64ULL | |
43 | ||
44 | struct selector { | |
45 | struct list_head valid_paths; | |
46 | struct list_head failed_paths; | |
47 | int valid_count; | |
48 | spinlock_t lock; | |
49 | ||
50 | unsigned int weights[HST_WEIGHT_COUNT]; | |
51 | unsigned int threshold_multiplier; | |
52 | }; | |
53 | ||
54 | struct path_info { | |
55 | struct list_head list; | |
56 | struct dm_path *path; | |
57 | unsigned int repeat_count; | |
58 | ||
59 | spinlock_t lock; | |
60 | ||
61 | u64 historical_service_time; /* Fixed point */ | |
62 | ||
63 | u64 stale_after; | |
64 | u64 last_finish; | |
65 | ||
66 | u64 outstanding; | |
67 | }; | |
68 | ||
69 | /** | |
70 | * fixed_power - compute: x^n, in O(log n) time | |
71 | * | |
72 | * @x: base of the power | |
73 | * @frac_bits: fractional bits of @x | |
74 | * @n: power to raise @x to. | |
75 | * | |
76 | * By exploiting the relation between the definition of the natural power | |
77 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | |
78 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | |
79 | * (where: n_i \elem {0, 1}, the binary vector representing n), | |
80 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | |
81 | * of course trivially computable in O(log_2 n), the length of our binary | |
82 | * vector. | |
83 | * | |
84 | * (see: kernel/sched/loadavg.c) | |
85 | */ | |
86 | static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n) | |
87 | { | |
88 | unsigned long result = 1UL << frac_bits; | |
89 | ||
90 | if (n) { | |
91 | for (;;) { | |
92 | if (n & 1) { | |
93 | result *= x; | |
94 | result += 1UL << (frac_bits - 1); | |
95 | result >>= frac_bits; | |
96 | } | |
97 | n >>= 1; | |
98 | if (!n) | |
99 | break; | |
100 | x *= x; | |
101 | x += 1UL << (frac_bits - 1); | |
102 | x >>= frac_bits; | |
103 | } | |
104 | } | |
105 | ||
106 | return result; | |
107 | } | |
108 | ||
109 | /* | |
110 | * Calculate the next value of an exponential moving average | |
111 | * a_1 = a_0 * e + a * (1 - e) | |
112 | * | |
113 | * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT] | |
114 | * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT] | |
115 | * @weight: [0, HST_FIXED_1] | |
116 | * | |
117 | * Note: | |
118 | * To account for multiple periods in the same calculation, | |
119 | * a_n = a_0 * e^n + a * (1 - e^n), | |
120 | * so call fixed_ema(last, next, pow(weight, N)) | |
121 | */ | |
122 | static u64 fixed_ema(u64 last, u64 next, u64 weight) | |
123 | { | |
124 | last *= weight; | |
125 | last += next * (HST_FIXED_1 - weight); | |
126 | last += 1ULL << (HST_FIXED_SHIFT - 1); | |
127 | return last >> HST_FIXED_SHIFT; | |
128 | } | |
129 | ||
130 | static struct selector *alloc_selector(void) | |
131 | { | |
132 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | |
133 | ||
134 | if (s) { | |
135 | INIT_LIST_HEAD(&s->valid_paths); | |
136 | INIT_LIST_HEAD(&s->failed_paths); | |
137 | spin_lock_init(&s->lock); | |
138 | s->valid_count = 0; | |
139 | } | |
140 | ||
141 | return s; | |
142 | } | |
143 | ||
144 | /* | |
145 | * Get the weight for a given time span. | |
146 | */ | |
147 | static u64 hst_weight(struct path_selector *ps, u64 delta) | |
148 | { | |
149 | struct selector *s = ps->context; | |
150 | int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL, | |
151 | HST_WEIGHT_COUNT - 1); | |
152 | ||
153 | return s->weights[bucket]; | |
154 | } | |
155 | ||
156 | /* | |
157 | * Set up the weights array. | |
158 | * | |
159 | * weights[len-1] = 0 | |
160 | * weights[n] = base ^ (n + 1) | |
161 | */ | |
162 | static void hst_set_weights(struct path_selector *ps, unsigned int base) | |
163 | { | |
164 | struct selector *s = ps->context; | |
165 | int i; | |
166 | ||
167 | if (base >= HST_FIXED_1) | |
168 | return; | |
169 | ||
170 | for (i = 0; i < HST_WEIGHT_COUNT - 1; i++) | |
171 | s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1); | |
172 | s->weights[HST_WEIGHT_COUNT - 1] = 0; | |
173 | } | |
174 | ||
175 | static int hst_create(struct path_selector *ps, unsigned int argc, char **argv) | |
176 | { | |
177 | struct selector *s; | |
178 | unsigned int base_weight = HST_FIXED_95; | |
179 | unsigned int threshold_multiplier = 0; | |
180 | char dummy; | |
181 | ||
182 | /* | |
183 | * Arguments: [<base_weight> [<threshold_multiplier>]] | |
184 | * <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A | |
185 | * value of 0 will completely ignore any history. | |
186 | * If not given, default (HST_FIXED_95) is used. | |
187 | * <threshold_multiplier>: Minimum threshold multiplier for paths to | |
188 | * be considered different. That is, a path is | |
189 | * considered different iff (p1 > N * p2) where p1 | |
190 | * is the path with higher service time. A threshold | |
191 | * of 1 or 0 has no effect. Defaults to 0. | |
192 | */ | |
193 | if (argc > 2) | |
194 | return -EINVAL; | |
195 | ||
196 | if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 || | |
197 | base_weight >= HST_FIXED_1)) { | |
198 | return -EINVAL; | |
199 | } | |
200 | ||
201 | if (argc > 1 && (sscanf(argv[1], "%u%c", | |
202 | &threshold_multiplier, &dummy) != 1)) { | |
203 | return -EINVAL; | |
204 | } | |
205 | ||
206 | s = alloc_selector(); | |
207 | if (!s) | |
208 | return -ENOMEM; | |
209 | ||
210 | ps->context = s; | |
211 | ||
212 | hst_set_weights(ps, base_weight); | |
213 | s->threshold_multiplier = threshold_multiplier; | |
214 | return 0; | |
215 | } | |
216 | ||
217 | static void free_paths(struct list_head *paths) | |
218 | { | |
219 | struct path_info *pi, *next; | |
220 | ||
221 | list_for_each_entry_safe(pi, next, paths, list) { | |
222 | list_del(&pi->list); | |
223 | kfree(pi); | |
224 | } | |
225 | } | |
226 | ||
227 | static void hst_destroy(struct path_selector *ps) | |
228 | { | |
229 | struct selector *s = ps->context; | |
230 | ||
231 | free_paths(&s->valid_paths); | |
232 | free_paths(&s->failed_paths); | |
233 | kfree(s); | |
234 | ps->context = NULL; | |
235 | } | |
236 | ||
237 | static int hst_status(struct path_selector *ps, struct dm_path *path, | |
238 | status_type_t type, char *result, unsigned int maxlen) | |
239 | { | |
240 | unsigned int sz = 0; | |
241 | struct path_info *pi; | |
242 | ||
243 | if (!path) { | |
244 | struct selector *s = ps->context; | |
245 | ||
246 | DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier); | |
247 | } else { | |
248 | pi = path->pscontext; | |
249 | ||
250 | switch (type) { | |
251 | case STATUSTYPE_INFO: | |
252 | DMEMIT("%llu %llu %llu ", pi->historical_service_time, | |
253 | pi->outstanding, pi->stale_after); | |
254 | break; | |
255 | case STATUSTYPE_TABLE: | |
256 | DMEMIT("0 "); | |
257 | break; | |
8ec45662 TS |
258 | case STATUSTYPE_IMA: |
259 | *result = '\0'; | |
260 | break; | |
2613eab1 KK |
261 | } |
262 | } | |
263 | ||
264 | return sz; | |
265 | } | |
266 | ||
267 | static int hst_add_path(struct path_selector *ps, struct dm_path *path, | |
268 | int argc, char **argv, char **error) | |
269 | { | |
270 | struct selector *s = ps->context; | |
271 | struct path_info *pi; | |
272 | unsigned int repeat_count = HST_MIN_IO; | |
273 | char dummy; | |
274 | unsigned long flags; | |
275 | ||
276 | /* | |
277 | * Arguments: [<repeat_count>] | |
278 | * <repeat_count>: The number of I/Os before switching path. | |
279 | * If not given, default (HST_MIN_IO) is used. | |
280 | */ | |
281 | if (argc > 1) { | |
282 | *error = "historical-service-time ps: incorrect number of arguments"; | |
283 | return -EINVAL; | |
284 | } | |
285 | ||
286 | if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { | |
287 | *error = "historical-service-time ps: invalid repeat count"; | |
288 | return -EINVAL; | |
289 | } | |
290 | ||
291 | /* allocate the path */ | |
292 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | |
293 | if (!pi) { | |
294 | *error = "historical-service-time ps: Error allocating path context"; | |
295 | return -ENOMEM; | |
296 | } | |
297 | ||
298 | pi->path = path; | |
299 | pi->repeat_count = repeat_count; | |
300 | ||
301 | pi->historical_service_time = HST_FIXED_1; | |
302 | ||
303 | spin_lock_init(&pi->lock); | |
304 | pi->outstanding = 0; | |
305 | ||
306 | pi->stale_after = 0; | |
307 | pi->last_finish = 0; | |
308 | ||
309 | path->pscontext = pi; | |
310 | ||
311 | spin_lock_irqsave(&s->lock, flags); | |
312 | list_add_tail(&pi->list, &s->valid_paths); | |
313 | s->valid_count++; | |
314 | spin_unlock_irqrestore(&s->lock, flags); | |
315 | ||
316 | return 0; | |
317 | } | |
318 | ||
319 | static void hst_fail_path(struct path_selector *ps, struct dm_path *path) | |
320 | { | |
321 | struct selector *s = ps->context; | |
322 | struct path_info *pi = path->pscontext; | |
323 | unsigned long flags; | |
324 | ||
325 | spin_lock_irqsave(&s->lock, flags); | |
326 | list_move(&pi->list, &s->failed_paths); | |
327 | s->valid_count--; | |
328 | spin_unlock_irqrestore(&s->lock, flags); | |
329 | } | |
330 | ||
331 | static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path) | |
332 | { | |
333 | struct selector *s = ps->context; | |
334 | struct path_info *pi = path->pscontext; | |
335 | unsigned long flags; | |
336 | ||
337 | spin_lock_irqsave(&s->lock, flags); | |
338 | list_move_tail(&pi->list, &s->valid_paths); | |
339 | s->valid_count++; | |
340 | spin_unlock_irqrestore(&s->lock, flags); | |
341 | ||
342 | return 0; | |
343 | } | |
344 | ||
345 | static void hst_fill_compare(struct path_info *pi, u64 *hst, | |
346 | u64 *out, u64 *stale) | |
347 | { | |
348 | unsigned long flags; | |
349 | ||
350 | spin_lock_irqsave(&pi->lock, flags); | |
351 | *hst = pi->historical_service_time; | |
352 | *out = pi->outstanding; | |
353 | *stale = pi->stale_after; | |
354 | spin_unlock_irqrestore(&pi->lock, flags); | |
355 | } | |
356 | ||
357 | /* | |
358 | * Compare the estimated service time of 2 paths, pi1 and pi2, | |
359 | * for the incoming I/O. | |
360 | * | |
361 | * Returns: | |
362 | * < 0 : pi1 is better | |
363 | * 0 : no difference between pi1 and pi2 | |
364 | * > 0 : pi2 is better | |
365 | * | |
366 | */ | |
367 | static long long hst_compare(struct path_info *pi1, struct path_info *pi2, | |
368 | u64 time_now, struct path_selector *ps) | |
369 | { | |
370 | struct selector *s = ps->context; | |
371 | u64 hst1, hst2; | |
372 | long long out1, out2, stale1, stale2; | |
373 | int pi2_better, over_threshold; | |
374 | ||
375 | hst_fill_compare(pi1, &hst1, &out1, &stale1); | |
376 | hst_fill_compare(pi2, &hst2, &out2, &stale2); | |
377 | ||
378 | /* Check here if estimated latency for two paths are too similar. | |
379 | * If this is the case, we skip extra calculation and just compare | |
380 | * outstanding requests. In this case, any unloaded paths will | |
381 | * be preferred. | |
382 | */ | |
383 | if (hst1 > hst2) | |
384 | over_threshold = hst1 > (s->threshold_multiplier * hst2); | |
385 | else | |
386 | over_threshold = hst2 > (s->threshold_multiplier * hst1); | |
387 | ||
388 | if (!over_threshold) | |
389 | return out1 - out2; | |
390 | ||
391 | /* | |
392 | * If an unloaded path is stale, choose it. If both paths are unloaded, | |
393 | * choose path that is the most stale. | |
394 | * (If one path is loaded, choose the other) | |
395 | */ | |
396 | if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) || | |
397 | (!out1 && !out2)) | |
398 | return (!out2 * stale1) - (!out1 * stale2); | |
399 | ||
400 | /* Compare estimated service time. If outstanding is the same, we | |
401 | * don't need to multiply | |
402 | */ | |
403 | if (out1 == out2) { | |
404 | pi2_better = hst1 > hst2; | |
405 | } else { | |
406 | /* Potential overflow with out >= 1024 */ | |
407 | if (unlikely(out1 >= HST_MAX_INFLIGHT || | |
408 | out2 >= HST_MAX_INFLIGHT)) { | |
409 | /* If over 1023 in-flights, we may overflow if hst | |
410 | * is at max. (With this shift we still overflow at | |
411 | * 1048576 in-flights, which is high enough). | |
412 | */ | |
413 | hst1 >>= HST_FIXED_SHIFT; | |
414 | hst2 >>= HST_FIXED_SHIFT; | |
415 | } | |
416 | pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2; | |
417 | } | |
418 | ||
419 | /* In the case that the 'winner' is stale, limit to equal usage. */ | |
420 | if (pi2_better) { | |
421 | if (stale2 < time_now) | |
422 | return out1 - out2; | |
423 | return 1; | |
424 | } | |
425 | if (stale1 < time_now) | |
426 | return out1 - out2; | |
427 | return -1; | |
428 | } | |
429 | ||
430 | static struct dm_path *hst_select_path(struct path_selector *ps, | |
431 | size_t nr_bytes) | |
432 | { | |
433 | struct selector *s = ps->context; | |
434 | struct path_info *pi = NULL, *best = NULL; | |
275dfec5 | 435 | u64 time_now = ktime_get_ns(); |
2613eab1 KK |
436 | struct dm_path *ret = NULL; |
437 | unsigned long flags; | |
438 | ||
439 | spin_lock_irqsave(&s->lock, flags); | |
440 | if (list_empty(&s->valid_paths)) | |
441 | goto out; | |
442 | ||
443 | list_for_each_entry(pi, &s->valid_paths, list) { | |
444 | if (!best || (hst_compare(pi, best, time_now, ps) < 0)) | |
445 | best = pi; | |
446 | } | |
447 | ||
448 | if (!best) | |
449 | goto out; | |
450 | ||
451 | /* Move last used path to end (least preferred in case of ties) */ | |
452 | list_move_tail(&best->list, &s->valid_paths); | |
453 | ||
454 | ret = best->path; | |
455 | ||
456 | out: | |
457 | spin_unlock_irqrestore(&s->lock, flags); | |
458 | return ret; | |
459 | } | |
460 | ||
461 | static int hst_start_io(struct path_selector *ps, struct dm_path *path, | |
462 | size_t nr_bytes) | |
463 | { | |
464 | struct path_info *pi = path->pscontext; | |
465 | unsigned long flags; | |
466 | ||
467 | spin_lock_irqsave(&pi->lock, flags); | |
468 | pi->outstanding++; | |
469 | spin_unlock_irqrestore(&pi->lock, flags); | |
470 | ||
471 | return 0; | |
472 | } | |
473 | ||
474 | static u64 path_service_time(struct path_info *pi, u64 start_time) | |
475 | { | |
275dfec5 | 476 | u64 now = ktime_get_ns(); |
2613eab1 KK |
477 | |
478 | /* if a previous disk request has finished after this IO was | |
479 | * sent to the hardware, pretend the submission happened | |
480 | * serially. | |
481 | */ | |
482 | if (time_after64(pi->last_finish, start_time)) | |
483 | start_time = pi->last_finish; | |
484 | ||
275dfec5 KK |
485 | pi->last_finish = now; |
486 | if (time_before64(now, start_time)) | |
2613eab1 KK |
487 | return 0; |
488 | ||
275dfec5 | 489 | return now - start_time; |
2613eab1 KK |
490 | } |
491 | ||
492 | static int hst_end_io(struct path_selector *ps, struct dm_path *path, | |
493 | size_t nr_bytes, u64 start_time) | |
494 | { | |
495 | struct path_info *pi = path->pscontext; | |
496 | struct selector *s = ps->context; | |
497 | unsigned long flags; | |
498 | u64 st; | |
499 | ||
500 | spin_lock_irqsave(&pi->lock, flags); | |
501 | ||
502 | st = path_service_time(pi, start_time); | |
503 | pi->outstanding--; | |
504 | pi->historical_service_time = | |
505 | fixed_ema(pi->historical_service_time, | |
506 | min(st * HST_FIXED_1, HST_FIXED_MAX), | |
507 | hst_weight(ps, st)); | |
508 | ||
509 | /* | |
510 | * On request end, mark path as fresh. If a path hasn't | |
511 | * finished any requests within the fresh period, the estimated | |
512 | * service time is considered too optimistic and we limit the | |
513 | * maximum requests on that path. | |
514 | */ | |
515 | pi->stale_after = pi->last_finish + | |
516 | (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT)); | |
517 | ||
518 | spin_unlock_irqrestore(&pi->lock, flags); | |
519 | ||
520 | return 0; | |
521 | } | |
522 | ||
523 | static struct path_selector_type hst_ps = { | |
524 | .name = "historical-service-time", | |
525 | .module = THIS_MODULE, | |
526 | .table_args = 1, | |
527 | .info_args = 3, | |
528 | .create = hst_create, | |
529 | .destroy = hst_destroy, | |
530 | .status = hst_status, | |
531 | .add_path = hst_add_path, | |
532 | .fail_path = hst_fail_path, | |
533 | .reinstate_path = hst_reinstate_path, | |
534 | .select_path = hst_select_path, | |
535 | .start_io = hst_start_io, | |
536 | .end_io = hst_end_io, | |
537 | }; | |
538 | ||
539 | static int __init dm_hst_init(void) | |
540 | { | |
541 | int r = dm_register_path_selector(&hst_ps); | |
542 | ||
543 | if (r < 0) | |
544 | DMERR("register failed %d", r); | |
545 | ||
546 | DMINFO("version " HST_VERSION " loaded"); | |
547 | ||
548 | return r; | |
549 | } | |
550 | ||
551 | static void __exit dm_hst_exit(void) | |
552 | { | |
553 | int r = dm_unregister_path_selector(&hst_ps); | |
554 | ||
555 | if (r < 0) | |
556 | DMERR("unregister failed %d", r); | |
557 | } | |
558 | ||
559 | module_init(dm_hst_init); | |
560 | module_exit(dm_hst_exit); | |
561 | ||
562 | MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector"); | |
563 | MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>"); | |
564 | MODULE_LICENSE("GPL"); |