// SPDX-License-Identifier: GPL-2.0
/*
 * Historical Service Time
 *
 * Keeps a time-weighted exponential moving average of the historical
 * service time. Estimates future service time based on the historical
 * service time and the number of outstanding requests.
 *
 * Marks paths stale if they have not finished within hst *
 * num_paths. If a path is stale and unused, we will send a single
 * request to probe in case the path has improved. This situation
 * generally arises if the path is so much worse than others that it
 * will never have the best estimated service time, or if the entire
 * multipath device is unused. If a path is stale and in use, limit the
 * number of requests it can receive with the assumption that the path
 * has become degraded.
 *
 * To avoid repeatedly calculating exponents for time weighting, times
 * are split into HST_WEIGHT_COUNT buckets each (1 << HST_BUCKET_SHIFT)
 * ns, and the weighting is pre-calculated.
 *
 */
#include "dm.h"
#include "dm-path-selector.h"

#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/module.h>

#define DM_MSG_PREFIX	"multipath historical-service-time"
#define HST_MIN_IO 1
#define HST_VERSION "0.1.1"

#define HST_FIXED_SHIFT 10  /* 10 bits of decimal precision */
#define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
#define HST_FIXED_1 (1 << HST_FIXED_SHIFT)  /* 1.0 in fixed point */
#define HST_FIXED_95 972                    /* ~0.95 in fixed point */
#define HST_MAX_INFLIGHT HST_FIXED_1

#define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
#define HST_WEIGHT_COUNT 64ULL
/* Per-device selector state: the path lists plus precomputed EMA weights. */
struct selector {
	struct list_head valid_paths;	/* paths eligible for selection */
	struct list_head failed_paths;	/* paths currently failed over */
	int valid_count;		/* number of entries on valid_paths */
	spinlock_t lock;		/* protects both lists and valid_count */

	/* weights[i]: EMA weight for a sample i buckets old (fixed point) */
	unsigned int weights[HST_WEIGHT_COUNT];
	/* paths whose hst are within this multiplier compare as "similar" */
	unsigned int threshold_multiplier;
};
/* Per-path state; the fields after 'lock' are guarded by it. */
struct path_info {
	struct list_head list;		/* link on valid_paths/failed_paths */
	struct dm_path *path;
	unsigned int repeat_count;	/* I/Os before re-running selection */

	spinlock_t lock;		/* protects the fields below */

	u64 historical_service_time; /* Fixed point */

	u64 stale_after;	/* ns timestamp after which path is stale */
	u64 last_finish;	/* ns timestamp of the last completion */

	u64 outstanding;	/* requests currently in flight */
};
- /**
- * fixed_power - compute: x^n, in O(log n) time
- *
- * @x: base of the power
- * @frac_bits: fractional bits of @x
- * @n: power to raise @x to.
- *
- * By exploiting the relation between the definition of the natural power
- * function: x^n := x*x*...*x (x multiplied by itself for n times), and
- * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
- * (where: n_i \elem {0, 1}, the binary vector representing n),
- * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
- * of course trivially computable in O(log_2 n), the length of our binary
- * vector.
- *
- * (see: kernel/sched/loadavg.c)
- */
- static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
- {
- unsigned long result = 1UL << frac_bits;
- if (n) {
- for (;;) {
- if (n & 1) {
- result *= x;
- result += 1UL << (frac_bits - 1);
- result >>= frac_bits;
- }
- n >>= 1;
- if (!n)
- break;
- x *= x;
- x += 1UL << (frac_bits - 1);
- x >>= frac_bits;
- }
- }
- return result;
- }
- /*
- * Calculate the next value of an exponential moving average
- * a_1 = a_0 * e + a * (1 - e)
- *
- * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
- * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
- * @weight: [0, HST_FIXED_1]
- *
- * Note:
- * To account for multiple periods in the same calculation,
- * a_n = a_0 * e^n + a * (1 - e^n),
- * so call fixed_ema(last, next, pow(weight, N))
- */
- static u64 fixed_ema(u64 last, u64 next, u64 weight)
- {
- last *= weight;
- last += next * (HST_FIXED_1 - weight);
- last += 1ULL << (HST_FIXED_SHIFT - 1);
- return last >> HST_FIXED_SHIFT;
- }
- static struct selector *alloc_selector(void)
- {
- struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
- if (s) {
- INIT_LIST_HEAD(&s->valid_paths);
- INIT_LIST_HEAD(&s->failed_paths);
- spin_lock_init(&s->lock);
- s->valid_count = 0;
- }
- return s;
- }
- /*
- * Get the weight for a given time span.
- */
- static u64 hst_weight(struct path_selector *ps, u64 delta)
- {
- struct selector *s = ps->context;
- int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
- HST_WEIGHT_COUNT - 1);
- return s->weights[bucket];
- }
- /*
- * Set up the weights array.
- *
- * weights[len-1] = 0
- * weights[n] = base ^ (n + 1)
- */
- static void hst_set_weights(struct path_selector *ps, unsigned int base)
- {
- struct selector *s = ps->context;
- int i;
- if (base >= HST_FIXED_1)
- return;
- for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
- s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
- s->weights[HST_WEIGHT_COUNT - 1] = 0;
- }
/*
 * Constructor: parse optional table arguments, allocate the selector
 * and precompute the weight table. Returns 0 or a negative errno.
 */
static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
{
	struct selector *s;
	unsigned int base_weight = HST_FIXED_95;
	unsigned int threshold_multiplier = 0;
	char dummy;

	/*
	 * Arguments: [<base_weight> [<threshold_multiplier>]]
	 *	<base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
	 *			value of 0 will completely ignore any history.
	 *			If not given, default (HST_FIXED_95) is used.
	 *	<threshold_multiplier>: Minimum threshold multiplier for paths to
	 *			be considered different. That is, a path is
	 *			considered different iff (p1 > N * p2) where p1
	 *			is the path with higher service time. A threshold
	 *			of 1 or 0 has no effect. Defaults to 0.
	 */
	if (argc > 2)
		return -EINVAL;

	/* "%u%c" must scan exactly one item: a trailing char means garbage. */
	if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
		     base_weight >= HST_FIXED_1)) {
		return -EINVAL;
	}

	if (argc > 1 && (sscanf(argv[1], "%u%c",
				&threshold_multiplier, &dummy) != 1)) {
		return -EINVAL;
	}

	s = alloc_selector();
	if (!s)
		return -ENOMEM;

	ps->context = s;

	hst_set_weights(ps, base_weight);
	s->threshold_multiplier = threshold_multiplier;
	return 0;
}
/* Unlink and free every path_info on @paths (selector teardown helper). */
static void free_paths(struct list_head *paths)
{
	struct path_info *pi, *next;

	/* _safe variant: entries are deleted while iterating. */
	list_for_each_entry_safe(pi, next, paths, list) {
		list_del(&pi->list);
		kfree(pi);
	}
}
- static void hst_destroy(struct path_selector *ps)
- {
- struct selector *s = ps->context;
- free_paths(&s->valid_paths);
- free_paths(&s->failed_paths);
- kfree(s);
- ps->context = NULL;
- }
/*
 * Emit selector-wide (path == NULL) or per-path status into @result.
 * Returns the number of bytes written; DMEMIT advances 'sz' implicitly.
 */
static int hst_status(struct path_selector *ps, struct dm_path *path,
		      status_type_t type, char *result, unsigned int maxlen)
{
	unsigned int sz = 0;	/* output cursor used by the DMEMIT macro */
	struct path_info *pi;

	if (!path) {
		struct selector *s = ps->context;

		/* "2" announces the two selector arguments that follow. */
		DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
	} else {
		pi = path->pscontext;

		switch (type) {
		case STATUSTYPE_INFO:
			DMEMIT("%llu %llu %llu ", pi->historical_service_time,
			       pi->outstanding, pi->stale_after);
			break;
		case STATUSTYPE_TABLE:
			/* No per-path table arguments are stored. */
			DMEMIT("0 ");
			break;
		case STATUSTYPE_IMA:
			/* Nothing to report for IMA measurement. */
			*result = '\0';
			break;
		}
	}

	return sz;
}
/*
 * Parse optional per-path arguments, allocate per-path state and add
 * the path to the valid list. Returns 0 or a negative errno, with
 * *error describing the failure.
 */
static int hst_add_path(struct path_selector *ps, struct dm_path *path,
			int argc, char **argv, char **error)
{
	struct selector *s = ps->context;
	struct path_info *pi;
	unsigned int repeat_count = HST_MIN_IO;
	char dummy;
	unsigned long flags;

	/*
	 * Arguments: [<repeat_count>]
	 *	<repeat_count>: The number of I/Os before switching path.
	 *			If not given, default (HST_MIN_IO) is used.
	 */
	if (argc > 1) {
		*error = "historical-service-time ps: incorrect number of arguments";
		return -EINVAL;
	}

	if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
		*error = "historical-service-time ps: invalid repeat count";
		return -EINVAL;
	}

	/* allocate the path */
	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
	if (!pi) {
		*error = "historical-service-time ps: Error allocating path context";
		return -ENOMEM;
	}

	pi->path = path;
	pi->repeat_count = repeat_count;

	/* Start from a neutral 1.0 estimate until real samples arrive. */
	pi->historical_service_time = HST_FIXED_1;

	spin_lock_init(&pi->lock);
	pi->outstanding = 0;

	pi->stale_after = 0;
	pi->last_finish = 0;

	path->pscontext = pi;

	spin_lock_irqsave(&s->lock, flags);
	list_add_tail(&pi->list, &s->valid_paths);
	s->valid_count++;
	spin_unlock_irqrestore(&s->lock, flags);

	return 0;
}
- static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
- {
- struct selector *s = ps->context;
- struct path_info *pi = path->pscontext;
- unsigned long flags;
- spin_lock_irqsave(&s->lock, flags);
- list_move(&pi->list, &s->failed_paths);
- s->valid_count--;
- spin_unlock_irqrestore(&s->lock, flags);
- }
- static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
- {
- struct selector *s = ps->context;
- struct path_info *pi = path->pscontext;
- unsigned long flags;
- spin_lock_irqsave(&s->lock, flags);
- list_move_tail(&pi->list, &s->valid_paths);
- s->valid_count++;
- spin_unlock_irqrestore(&s->lock, flags);
- return 0;
- }
/*
 * Snapshot one path's comparison inputs under its lock so hst_compare()
 * sees values that belong to the same instant.
 */
static void hst_fill_compare(struct path_info *pi, u64 *hst,
			     u64 *out, u64 *stale)
{
	unsigned long flags;

	spin_lock_irqsave(&pi->lock, flags);
	*hst = pi->historical_service_time;
	*out = pi->outstanding;
	*stale = pi->stale_after;
	spin_unlock_irqrestore(&pi->lock, flags);
}
- /*
- * Compare the estimated service time of 2 paths, pi1 and pi2,
- * for the incoming I/O.
- *
- * Returns:
- * < 0 : pi1 is better
- * 0 : no difference between pi1 and pi2
- * > 0 : pi2 is better
- *
- */
- static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
- u64 time_now, struct path_selector *ps)
- {
- struct selector *s = ps->context;
- u64 hst1, hst2;
- long long out1, out2, stale1, stale2;
- int pi2_better, over_threshold;
- hst_fill_compare(pi1, &hst1, &out1, &stale1);
- hst_fill_compare(pi2, &hst2, &out2, &stale2);
- /* Check here if estimated latency for two paths are too similar.
- * If this is the case, we skip extra calculation and just compare
- * outstanding requests. In this case, any unloaded paths will
- * be preferred.
- */
- if (hst1 > hst2)
- over_threshold = hst1 > (s->threshold_multiplier * hst2);
- else
- over_threshold = hst2 > (s->threshold_multiplier * hst1);
- if (!over_threshold)
- return out1 - out2;
- /*
- * If an unloaded path is stale, choose it. If both paths are unloaded,
- * choose path that is the most stale.
- * (If one path is loaded, choose the other)
- */
- if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
- (!out1 && !out2))
- return (!out2 * stale1) - (!out1 * stale2);
- /* Compare estimated service time. If outstanding is the same, we
- * don't need to multiply
- */
- if (out1 == out2) {
- pi2_better = hst1 > hst2;
- } else {
- /* Potential overflow with out >= 1024 */
- if (unlikely(out1 >= HST_MAX_INFLIGHT ||
- out2 >= HST_MAX_INFLIGHT)) {
- /* If over 1023 in-flights, we may overflow if hst
- * is at max. (With this shift we still overflow at
- * 1048576 in-flights, which is high enough).
- */
- hst1 >>= HST_FIXED_SHIFT;
- hst2 >>= HST_FIXED_SHIFT;
- }
- pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
- }
- /* In the case that the 'winner' is stale, limit to equal usage. */
- if (pi2_better) {
- if (stale2 < time_now)
- return out1 - out2;
- return 1;
- }
- if (stale1 < time_now)
- return out1 - out2;
- return -1;
- }
/*
 * Pick the best valid path for the next I/O by pairwise comparison.
 * Returns NULL when no valid path exists.
 */
static struct dm_path *hst_select_path(struct path_selector *ps,
				       size_t nr_bytes)
{
	struct selector *s = ps->context;
	struct path_info *pi = NULL, *best = NULL;
	u64 time_now = ktime_get_ns();	/* one timestamp for every compare */
	struct dm_path *ret = NULL;
	unsigned long flags;

	spin_lock_irqsave(&s->lock, flags);
	if (list_empty(&s->valid_paths))
		goto out;

	list_for_each_entry(pi, &s->valid_paths, list) {
		if (!best || (hst_compare(pi, best, time_now, ps) < 0))
			best = pi;
	}

	if (!best)
		goto out;

	/* Move last used path to end (least preferred in case of ties) */
	list_move_tail(&best->list, &s->valid_paths);

	ret = best->path;
out:
	spin_unlock_irqrestore(&s->lock, flags);
	return ret;
}
- static int hst_start_io(struct path_selector *ps, struct dm_path *path,
- size_t nr_bytes)
- {
- struct path_info *pi = path->pscontext;
- unsigned long flags;
- spin_lock_irqsave(&pi->lock, flags);
- pi->outstanding++;
- spin_unlock_irqrestore(&pi->lock, flags);
- return 0;
- }
- static u64 path_service_time(struct path_info *pi, u64 start_time)
- {
- u64 now = ktime_get_ns();
- /* if a previous disk request has finished after this IO was
- * sent to the hardware, pretend the submission happened
- * serially.
- */
- if (time_after64(pi->last_finish, start_time))
- start_time = pi->last_finish;
- pi->last_finish = now;
- if (time_before64(now, start_time))
- return 0;
- return now - start_time;
- }
/*
 * Completion hook: fold the measured service time into the EMA and
 * refresh this path's staleness deadline.
 */
static int hst_end_io(struct path_selector *ps, struct dm_path *path,
		      size_t nr_bytes, u64 start_time)
{
	struct path_info *pi = path->pscontext;
	struct selector *s = ps->context;
	unsigned long flags;
	u64 st;

	spin_lock_irqsave(&pi->lock, flags);

	st = path_service_time(pi, start_time);
	pi->outstanding--;
	/* Clamp the fixed-point sample so the EMA input cannot overflow. */
	pi->historical_service_time =
		fixed_ema(pi->historical_service_time,
			  min(st * HST_FIXED_1, HST_FIXED_MAX),
			  hst_weight(ps, st));

	/*
	 * On request end, mark path as fresh. If a path hasn't
	 * finished any requests within the fresh period, the estimated
	 * service time is considered too optimistic and we limit the
	 * maximum requests on that path.
	 */
	pi->stale_after = pi->last_finish +
		(s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));

	spin_unlock_irqrestore(&pi->lock, flags);

	return 0;
}
/* Registration table: callbacks for the historical-service-time policy. */
static struct path_selector_type hst_ps = {
	.name		= "historical-service-time",
	.module		= THIS_MODULE,
	.features	= DM_PS_USE_HR_TIMER,
	.table_args	= 1,	/* optional <base_weight> (see hst_create) */
	.info_args	= 3,	/* hst, outstanding, stale_after */
	.create		= hst_create,
	.destroy	= hst_destroy,
	.status		= hst_status,
	.add_path	= hst_add_path,
	.fail_path	= hst_fail_path,
	.reinstate_path	= hst_reinstate_path,
	.select_path	= hst_select_path,
	.start_io	= hst_start_io,
	.end_io		= hst_end_io,
};
- static int __init dm_hst_init(void)
- {
- int r = dm_register_path_selector(&hst_ps);
- if (r < 0)
- DMERR("register failed %d", r);
- DMINFO("version " HST_VERSION " loaded");
- return r;
- }
- static void __exit dm_hst_exit(void)
- {
- int r = dm_unregister_path_selector(&hst_ps);
- if (r < 0)
- DMERR("unregister failed %d", r);
- }
/* Module registration and metadata. */
module_init(dm_hst_init);
module_exit(dm_hst_exit);

MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
MODULE_AUTHOR("Khazhismel Kumykov <[email protected]>");
MODULE_LICENSE("GPL");