]>
Commit | Line | Data |
---|---|---|
2613eab1 KK |
1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* | |
3 | * Historical Service Time | |
4 | * | |
5 | * Keeps a time-weighted exponential moving average of the historical | |
6 | * service time. Estimates future service time based on the historical | |
7 | * service time and the number of outstanding requests. | |
8 | * | |
9 | * Marks paths stale if they have not finished within hst * | |
10 | * num_paths. If a path is stale and unused, we will send a single | |
11 | * request to probe in case the path has improved. This situation | |
12 | * generally arises if the path is so much worse than others that it | |
13 | * will never have the best estimated service time, or if the entire | |
14 | * multipath device is unused. If a path is stale and in use, limit the | |
15 | * number of requests it can receive with the assumption that the path | |
16 | * has become degraded. | |
17 | * | |
18 | * To avoid repeatedly calculating exponents for time weighting, times | |
19 | * are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT) | |
20 | * ns, and the weighting is pre-calculated. | |
21 | * | |
22 | */ | |
23 | ||
24 | #include "dm.h" | |
25 | #include "dm-path-selector.h" | |
26 | ||
27 | #include <linux/blkdev.h> | |
28 | #include <linux/slab.h> | |
29 | #include <linux/module.h> | |
30 | ||
31 | ||
32 | #define DM_MSG_PREFIX "multipath historical-service-time" | |
33 | #define HST_MIN_IO 1 | |
34 | #define HST_VERSION "0.1.1" | |
35 | ||
36 | #define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */ | |
37 | #define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT) | |
38 | #define HST_FIXED_1 (1 << HST_FIXED_SHIFT) | |
39 | #define HST_FIXED_95 972 | |
40 | #define HST_MAX_INFLIGHT HST_FIXED_1 | |
41 | #define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */ | |
42 | #define HST_WEIGHT_COUNT 64ULL | |
43 | ||
44 | struct selector { | |
45 | struct list_head valid_paths; | |
46 | struct list_head failed_paths; | |
47 | int valid_count; | |
48 | spinlock_t lock; | |
49 | ||
50 | unsigned int weights[HST_WEIGHT_COUNT]; | |
51 | unsigned int threshold_multiplier; | |
52 | }; | |
53 | ||
54 | struct path_info { | |
55 | struct list_head list; | |
56 | struct dm_path *path; | |
57 | unsigned int repeat_count; | |
58 | ||
59 | spinlock_t lock; | |
60 | ||
61 | u64 historical_service_time; /* Fixed point */ | |
62 | ||
63 | u64 stale_after; | |
64 | u64 last_finish; | |
65 | ||
66 | u64 outstanding; | |
67 | }; | |
68 | ||
69 | /** | |
70 | * fixed_power - compute: x^n, in O(log n) time | |
71 | * | |
72 | * @x: base of the power | |
73 | * @frac_bits: fractional bits of @x | |
74 | * @n: power to raise @x to. | |
75 | * | |
76 | * By exploiting the relation between the definition of the natural power | |
77 | * function: x^n := x*x*...*x (x multiplied by itself for n times), and | |
78 | * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i, | |
79 | * (where: n_i \elem {0, 1}, the binary vector representing n), | |
80 | * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is | |
81 | * of course trivially computable in O(log_2 n), the length of our binary | |
82 | * vector. | |
83 | * | |
84 | * (see: kernel/sched/loadavg.c) | |
85 | */ | |
86 | static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n) | |
87 | { | |
88 | unsigned long result = 1UL << frac_bits; | |
89 | ||
90 | if (n) { | |
91 | for (;;) { | |
92 | if (n & 1) { | |
93 | result *= x; | |
94 | result += 1UL << (frac_bits - 1); | |
95 | result >>= frac_bits; | |
96 | } | |
97 | n >>= 1; | |
98 | if (!n) | |
99 | break; | |
100 | x *= x; | |
101 | x += 1UL << (frac_bits - 1); | |
102 | x >>= frac_bits; | |
103 | } | |
104 | } | |
105 | ||
106 | return result; | |
107 | } | |
108 | ||
109 | /* | |
110 | * Calculate the next value of an exponential moving average | |
111 | * a_1 = a_0 * e + a * (1 - e) | |
112 | * | |
113 | * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT] | |
114 | * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT] | |
115 | * @weight: [0, HST_FIXED_1] | |
116 | * | |
117 | * Note: | |
118 | * To account for multiple periods in the same calculation, | |
119 | * a_n = a_0 * e^n + a * (1 - e^n), | |
120 | * so call fixed_ema(last, next, pow(weight, N)) | |
121 | */ | |
122 | static u64 fixed_ema(u64 last, u64 next, u64 weight) | |
123 | { | |
124 | last *= weight; | |
125 | last += next * (HST_FIXED_1 - weight); | |
126 | last += 1ULL << (HST_FIXED_SHIFT - 1); | |
127 | return last >> HST_FIXED_SHIFT; | |
128 | } | |
129 | ||
130 | static struct selector *alloc_selector(void) | |
131 | { | |
132 | struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL); | |
133 | ||
134 | if (s) { | |
135 | INIT_LIST_HEAD(&s->valid_paths); | |
136 | INIT_LIST_HEAD(&s->failed_paths); | |
137 | spin_lock_init(&s->lock); | |
138 | s->valid_count = 0; | |
139 | } | |
140 | ||
141 | return s; | |
142 | } | |
143 | ||
144 | /* | |
145 | * Get the weight for a given time span. | |
146 | */ | |
147 | static u64 hst_weight(struct path_selector *ps, u64 delta) | |
148 | { | |
149 | struct selector *s = ps->context; | |
150 | int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL, | |
151 | HST_WEIGHT_COUNT - 1); | |
152 | ||
153 | return s->weights[bucket]; | |
154 | } | |
155 | ||
156 | /* | |
157 | * Set up the weights array. | |
158 | * | |
159 | * weights[len-1] = 0 | |
160 | * weights[n] = base ^ (n + 1) | |
161 | */ | |
162 | static void hst_set_weights(struct path_selector *ps, unsigned int base) | |
163 | { | |
164 | struct selector *s = ps->context; | |
165 | int i; | |
166 | ||
167 | if (base >= HST_FIXED_1) | |
168 | return; | |
169 | ||
170 | for (i = 0; i < HST_WEIGHT_COUNT - 1; i++) | |
171 | s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1); | |
172 | s->weights[HST_WEIGHT_COUNT - 1] = 0; | |
173 | } | |
174 | ||
175 | static int hst_create(struct path_selector *ps, unsigned int argc, char **argv) | |
176 | { | |
177 | struct selector *s; | |
178 | unsigned int base_weight = HST_FIXED_95; | |
179 | unsigned int threshold_multiplier = 0; | |
180 | char dummy; | |
181 | ||
182 | /* | |
183 | * Arguments: [<base_weight> [<threshold_multiplier>]] | |
184 | * <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A | |
185 | * value of 0 will completely ignore any history. | |
186 | * If not given, default (HST_FIXED_95) is used. | |
187 | * <threshold_multiplier>: Minimum threshold multiplier for paths to | |
188 | * be considered different. That is, a path is | |
189 | * considered different iff (p1 > N * p2) where p1 | |
190 | * is the path with higher service time. A threshold | |
191 | * of 1 or 0 has no effect. Defaults to 0. | |
192 | */ | |
193 | if (argc > 2) | |
194 | return -EINVAL; | |
195 | ||
196 | if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 || | |
197 | base_weight >= HST_FIXED_1)) { | |
198 | return -EINVAL; | |
199 | } | |
200 | ||
201 | if (argc > 1 && (sscanf(argv[1], "%u%c", | |
202 | &threshold_multiplier, &dummy) != 1)) { | |
203 | return -EINVAL; | |
204 | } | |
205 | ||
206 | s = alloc_selector(); | |
207 | if (!s) | |
208 | return -ENOMEM; | |
209 | ||
210 | ps->context = s; | |
211 | ||
212 | hst_set_weights(ps, base_weight); | |
213 | s->threshold_multiplier = threshold_multiplier; | |
214 | return 0; | |
215 | } | |
216 | ||
217 | static void free_paths(struct list_head *paths) | |
218 | { | |
219 | struct path_info *pi, *next; | |
220 | ||
221 | list_for_each_entry_safe(pi, next, paths, list) { | |
222 | list_del(&pi->list); | |
223 | kfree(pi); | |
224 | } | |
225 | } | |
226 | ||
227 | static void hst_destroy(struct path_selector *ps) | |
228 | { | |
229 | struct selector *s = ps->context; | |
230 | ||
231 | free_paths(&s->valid_paths); | |
232 | free_paths(&s->failed_paths); | |
233 | kfree(s); | |
234 | ps->context = NULL; | |
235 | } | |
236 | ||
237 | static int hst_status(struct path_selector *ps, struct dm_path *path, | |
238 | status_type_t type, char *result, unsigned int maxlen) | |
239 | { | |
240 | unsigned int sz = 0; | |
241 | struct path_info *pi; | |
242 | ||
243 | if (!path) { | |
244 | struct selector *s = ps->context; | |
245 | ||
246 | DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier); | |
247 | } else { | |
248 | pi = path->pscontext; | |
249 | ||
250 | switch (type) { | |
251 | case STATUSTYPE_INFO: | |
252 | DMEMIT("%llu %llu %llu ", pi->historical_service_time, | |
253 | pi->outstanding, pi->stale_after); | |
254 | break; | |
255 | case STATUSTYPE_TABLE: | |
256 | DMEMIT("0 "); | |
257 | break; | |
258 | } | |
259 | } | |
260 | ||
261 | return sz; | |
262 | } | |
263 | ||
264 | static int hst_add_path(struct path_selector *ps, struct dm_path *path, | |
265 | int argc, char **argv, char **error) | |
266 | { | |
267 | struct selector *s = ps->context; | |
268 | struct path_info *pi; | |
269 | unsigned int repeat_count = HST_MIN_IO; | |
270 | char dummy; | |
271 | unsigned long flags; | |
272 | ||
273 | /* | |
274 | * Arguments: [<repeat_count>] | |
275 | * <repeat_count>: The number of I/Os before switching path. | |
276 | * If not given, default (HST_MIN_IO) is used. | |
277 | */ | |
278 | if (argc > 1) { | |
279 | *error = "historical-service-time ps: incorrect number of arguments"; | |
280 | return -EINVAL; | |
281 | } | |
282 | ||
283 | if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { | |
284 | *error = "historical-service-time ps: invalid repeat count"; | |
285 | return -EINVAL; | |
286 | } | |
287 | ||
288 | /* allocate the path */ | |
289 | pi = kmalloc(sizeof(*pi), GFP_KERNEL); | |
290 | if (!pi) { | |
291 | *error = "historical-service-time ps: Error allocating path context"; | |
292 | return -ENOMEM; | |
293 | } | |
294 | ||
295 | pi->path = path; | |
296 | pi->repeat_count = repeat_count; | |
297 | ||
298 | pi->historical_service_time = HST_FIXED_1; | |
299 | ||
300 | spin_lock_init(&pi->lock); | |
301 | pi->outstanding = 0; | |
302 | ||
303 | pi->stale_after = 0; | |
304 | pi->last_finish = 0; | |
305 | ||
306 | path->pscontext = pi; | |
307 | ||
308 | spin_lock_irqsave(&s->lock, flags); | |
309 | list_add_tail(&pi->list, &s->valid_paths); | |
310 | s->valid_count++; | |
311 | spin_unlock_irqrestore(&s->lock, flags); | |
312 | ||
313 | return 0; | |
314 | } | |
315 | ||
316 | static void hst_fail_path(struct path_selector *ps, struct dm_path *path) | |
317 | { | |
318 | struct selector *s = ps->context; | |
319 | struct path_info *pi = path->pscontext; | |
320 | unsigned long flags; | |
321 | ||
322 | spin_lock_irqsave(&s->lock, flags); | |
323 | list_move(&pi->list, &s->failed_paths); | |
324 | s->valid_count--; | |
325 | spin_unlock_irqrestore(&s->lock, flags); | |
326 | } | |
327 | ||
328 | static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path) | |
329 | { | |
330 | struct selector *s = ps->context; | |
331 | struct path_info *pi = path->pscontext; | |
332 | unsigned long flags; | |
333 | ||
334 | spin_lock_irqsave(&s->lock, flags); | |
335 | list_move_tail(&pi->list, &s->valid_paths); | |
336 | s->valid_count++; | |
337 | spin_unlock_irqrestore(&s->lock, flags); | |
338 | ||
339 | return 0; | |
340 | } | |
341 | ||
342 | static void hst_fill_compare(struct path_info *pi, u64 *hst, | |
343 | u64 *out, u64 *stale) | |
344 | { | |
345 | unsigned long flags; | |
346 | ||
347 | spin_lock_irqsave(&pi->lock, flags); | |
348 | *hst = pi->historical_service_time; | |
349 | *out = pi->outstanding; | |
350 | *stale = pi->stale_after; | |
351 | spin_unlock_irqrestore(&pi->lock, flags); | |
352 | } | |
353 | ||
354 | /* | |
355 | * Compare the estimated service time of 2 paths, pi1 and pi2, | |
356 | * for the incoming I/O. | |
357 | * | |
358 | * Returns: | |
359 | * < 0 : pi1 is better | |
360 | * 0 : no difference between pi1 and pi2 | |
361 | * > 0 : pi2 is better | |
362 | * | |
363 | */ | |
364 | static long long hst_compare(struct path_info *pi1, struct path_info *pi2, | |
365 | u64 time_now, struct path_selector *ps) | |
366 | { | |
367 | struct selector *s = ps->context; | |
368 | u64 hst1, hst2; | |
369 | long long out1, out2, stale1, stale2; | |
370 | int pi2_better, over_threshold; | |
371 | ||
372 | hst_fill_compare(pi1, &hst1, &out1, &stale1); | |
373 | hst_fill_compare(pi2, &hst2, &out2, &stale2); | |
374 | ||
375 | /* Check here if estimated latency for two paths are too similar. | |
376 | * If this is the case, we skip extra calculation and just compare | |
377 | * outstanding requests. In this case, any unloaded paths will | |
378 | * be preferred. | |
379 | */ | |
380 | if (hst1 > hst2) | |
381 | over_threshold = hst1 > (s->threshold_multiplier * hst2); | |
382 | else | |
383 | over_threshold = hst2 > (s->threshold_multiplier * hst1); | |
384 | ||
385 | if (!over_threshold) | |
386 | return out1 - out2; | |
387 | ||
388 | /* | |
389 | * If an unloaded path is stale, choose it. If both paths are unloaded, | |
390 | * choose path that is the most stale. | |
391 | * (If one path is loaded, choose the other) | |
392 | */ | |
393 | if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) || | |
394 | (!out1 && !out2)) | |
395 | return (!out2 * stale1) - (!out1 * stale2); | |
396 | ||
397 | /* Compare estimated service time. If outstanding is the same, we | |
398 | * don't need to multiply | |
399 | */ | |
400 | if (out1 == out2) { | |
401 | pi2_better = hst1 > hst2; | |
402 | } else { | |
403 | /* Potential overflow with out >= 1024 */ | |
404 | if (unlikely(out1 >= HST_MAX_INFLIGHT || | |
405 | out2 >= HST_MAX_INFLIGHT)) { | |
406 | /* If over 1023 in-flights, we may overflow if hst | |
407 | * is at max. (With this shift we still overflow at | |
408 | * 1048576 in-flights, which is high enough). | |
409 | */ | |
410 | hst1 >>= HST_FIXED_SHIFT; | |
411 | hst2 >>= HST_FIXED_SHIFT; | |
412 | } | |
413 | pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2; | |
414 | } | |
415 | ||
416 | /* In the case that the 'winner' is stale, limit to equal usage. */ | |
417 | if (pi2_better) { | |
418 | if (stale2 < time_now) | |
419 | return out1 - out2; | |
420 | return 1; | |
421 | } | |
422 | if (stale1 < time_now) | |
423 | return out1 - out2; | |
424 | return -1; | |
425 | } | |
426 | ||
427 | static struct dm_path *hst_select_path(struct path_selector *ps, | |
428 | size_t nr_bytes) | |
429 | { | |
430 | struct selector *s = ps->context; | |
431 | struct path_info *pi = NULL, *best = NULL; | |
432 | u64 time_now = sched_clock(); | |
433 | struct dm_path *ret = NULL; | |
434 | unsigned long flags; | |
435 | ||
436 | spin_lock_irqsave(&s->lock, flags); | |
437 | if (list_empty(&s->valid_paths)) | |
438 | goto out; | |
439 | ||
440 | list_for_each_entry(pi, &s->valid_paths, list) { | |
441 | if (!best || (hst_compare(pi, best, time_now, ps) < 0)) | |
442 | best = pi; | |
443 | } | |
444 | ||
445 | if (!best) | |
446 | goto out; | |
447 | ||
448 | /* Move last used path to end (least preferred in case of ties) */ | |
449 | list_move_tail(&best->list, &s->valid_paths); | |
450 | ||
451 | ret = best->path; | |
452 | ||
453 | out: | |
454 | spin_unlock_irqrestore(&s->lock, flags); | |
455 | return ret; | |
456 | } | |
457 | ||
458 | static int hst_start_io(struct path_selector *ps, struct dm_path *path, | |
459 | size_t nr_bytes) | |
460 | { | |
461 | struct path_info *pi = path->pscontext; | |
462 | unsigned long flags; | |
463 | ||
464 | spin_lock_irqsave(&pi->lock, flags); | |
465 | pi->outstanding++; | |
466 | spin_unlock_irqrestore(&pi->lock, flags); | |
467 | ||
468 | return 0; | |
469 | } | |
470 | ||
471 | static u64 path_service_time(struct path_info *pi, u64 start_time) | |
472 | { | |
473 | u64 sched_now = ktime_get_ns(); | |
474 | ||
475 | /* if a previous disk request has finished after this IO was | |
476 | * sent to the hardware, pretend the submission happened | |
477 | * serially. | |
478 | */ | |
479 | if (time_after64(pi->last_finish, start_time)) | |
480 | start_time = pi->last_finish; | |
481 | ||
482 | pi->last_finish = sched_now; | |
483 | if (time_before64(sched_now, start_time)) | |
484 | return 0; | |
485 | ||
486 | return sched_now - start_time; | |
487 | } | |
488 | ||
489 | static int hst_end_io(struct path_selector *ps, struct dm_path *path, | |
490 | size_t nr_bytes, u64 start_time) | |
491 | { | |
492 | struct path_info *pi = path->pscontext; | |
493 | struct selector *s = ps->context; | |
494 | unsigned long flags; | |
495 | u64 st; | |
496 | ||
497 | spin_lock_irqsave(&pi->lock, flags); | |
498 | ||
499 | st = path_service_time(pi, start_time); | |
500 | pi->outstanding--; | |
501 | pi->historical_service_time = | |
502 | fixed_ema(pi->historical_service_time, | |
503 | min(st * HST_FIXED_1, HST_FIXED_MAX), | |
504 | hst_weight(ps, st)); | |
505 | ||
506 | /* | |
507 | * On request end, mark path as fresh. If a path hasn't | |
508 | * finished any requests within the fresh period, the estimated | |
509 | * service time is considered too optimistic and we limit the | |
510 | * maximum requests on that path. | |
511 | */ | |
512 | pi->stale_after = pi->last_finish + | |
513 | (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT)); | |
514 | ||
515 | spin_unlock_irqrestore(&pi->lock, flags); | |
516 | ||
517 | return 0; | |
518 | } | |
519 | ||
520 | static struct path_selector_type hst_ps = { | |
521 | .name = "historical-service-time", | |
522 | .module = THIS_MODULE, | |
523 | .table_args = 1, | |
524 | .info_args = 3, | |
525 | .create = hst_create, | |
526 | .destroy = hst_destroy, | |
527 | .status = hst_status, | |
528 | .add_path = hst_add_path, | |
529 | .fail_path = hst_fail_path, | |
530 | .reinstate_path = hst_reinstate_path, | |
531 | .select_path = hst_select_path, | |
532 | .start_io = hst_start_io, | |
533 | .end_io = hst_end_io, | |
534 | }; | |
535 | ||
536 | static int __init dm_hst_init(void) | |
537 | { | |
538 | int r = dm_register_path_selector(&hst_ps); | |
539 | ||
540 | if (r < 0) | |
541 | DMERR("register failed %d", r); | |
542 | ||
543 | DMINFO("version " HST_VERSION " loaded"); | |
544 | ||
545 | return r; | |
546 | } | |
547 | ||
548 | static void __exit dm_hst_exit(void) | |
549 | { | |
550 | int r = dm_unregister_path_selector(&hst_ps); | |
551 | ||
552 | if (r < 0) | |
553 | DMERR("unregister failed %d", r); | |
554 | } | |
555 | ||
556 | module_init(dm_hst_init); | |
557 | module_exit(dm_hst_exit); | |
558 | ||
559 | MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector"); | |
560 | MODULE_AUTHOR("Khazhismel Kumykov <khazhy@google.com>"); | |
561 | MODULE_LICENSE("GPL"); |