dm-ps-historical-service-time.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Historical Service Time
  4. *
  5. * Keeps a time-weighted exponential moving average of the historical
  6. * service time. Estimates future service time based on the historical
  7. * service time and the number of outstanding requests.
  8. *
  9. * Marks paths stale if they have not finished within hst *
  10. * num_paths. If a path is stale and unused, we will send a single
  11. * request to probe in case the path has improved. This situation
  12. * generally arises if the path is so much worse than others that it
  13. * will never have the best estimated service time, or if the entire
  14. * multipath device is unused. If a path is stale and in use, limit the
  15. * number of requests it can receive with the assumption that the path
  16. * has become degraded.
  17. *
  18. * To avoid repeatedly calculating exponents for time weighting, times
  19. * are split into HST_WEIGHT_COUNT buckets each (1 >> HST_BUCKET_SHIFT)
  20. * ns, and the weighting is pre-calculated.
  21. *
  22. */
  23. #include "dm.h"
  24. #include "dm-path-selector.h"
  25. #include <linux/blkdev.h>
  26. #include <linux/slab.h>
  27. #include <linux/module.h>
  28. #define DM_MSG_PREFIX "multipath historical-service-time"
  29. #define HST_MIN_IO 1
  30. #define HST_VERSION "0.1.1"
  31. #define HST_FIXED_SHIFT 10 /* 10 bits of decimal precision */
  32. #define HST_FIXED_MAX (ULLONG_MAX >> HST_FIXED_SHIFT)
  33. #define HST_FIXED_1 (1 << HST_FIXED_SHIFT)
  34. #define HST_FIXED_95 972
  35. #define HST_MAX_INFLIGHT HST_FIXED_1
  36. #define HST_BUCKET_SHIFT 24 /* Buckets are ~ 16ms */
  37. #define HST_WEIGHT_COUNT 64ULL
  38. struct selector {
  39. struct list_head valid_paths;
  40. struct list_head failed_paths;
  41. int valid_count;
  42. spinlock_t lock;
  43. unsigned int weights[HST_WEIGHT_COUNT];
  44. unsigned int threshold_multiplier;
  45. };
  46. struct path_info {
  47. struct list_head list;
  48. struct dm_path *path;
  49. unsigned int repeat_count;
  50. spinlock_t lock;
  51. u64 historical_service_time; /* Fixed point */
  52. u64 stale_after;
  53. u64 last_finish;
  54. u64 outstanding;
  55. };
  56. /**
  57. * fixed_power - compute: x^n, in O(log n) time
  58. *
  59. * @x: base of the power
  60. * @frac_bits: fractional bits of @x
  61. * @n: power to raise @x to.
  62. *
  63. * By exploiting the relation between the definition of the natural power
  64. * function: x^n := x*x*...*x (x multiplied by itself for n times), and
  65. * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
  66. * (where: n_i \elem {0, 1}, the binary vector representing n),
  67. * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
  68. * of course trivially computable in O(log_2 n), the length of our binary
  69. * vector.
  70. *
  71. * (see: kernel/sched/loadavg.c)
  72. */
  73. static u64 fixed_power(u64 x, unsigned int frac_bits, unsigned int n)
  74. {
  75. unsigned long result = 1UL << frac_bits;
  76. if (n) {
  77. for (;;) {
  78. if (n & 1) {
  79. result *= x;
  80. result += 1UL << (frac_bits - 1);
  81. result >>= frac_bits;
  82. }
  83. n >>= 1;
  84. if (!n)
  85. break;
  86. x *= x;
  87. x += 1UL << (frac_bits - 1);
  88. x >>= frac_bits;
  89. }
  90. }
  91. return result;
  92. }
  93. /*
  94. * Calculate the next value of an exponential moving average
  95. * a_1 = a_0 * e + a * (1 - e)
  96. *
  97. * @last: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
  98. * @next: [0, ULLONG_MAX >> HST_FIXED_SHIFT]
  99. * @weight: [0, HST_FIXED_1]
  100. *
  101. * Note:
  102. * To account for multiple periods in the same calculation,
  103. * a_n = a_0 * e^n + a * (1 - e^n),
  104. * so call fixed_ema(last, next, pow(weight, N))
  105. */
  106. static u64 fixed_ema(u64 last, u64 next, u64 weight)
  107. {
  108. last *= weight;
  109. last += next * (HST_FIXED_1 - weight);
  110. last += 1ULL << (HST_FIXED_SHIFT - 1);
  111. return last >> HST_FIXED_SHIFT;
  112. }
  113. static struct selector *alloc_selector(void)
  114. {
  115. struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
  116. if (s) {
  117. INIT_LIST_HEAD(&s->valid_paths);
  118. INIT_LIST_HEAD(&s->failed_paths);
  119. spin_lock_init(&s->lock);
  120. s->valid_count = 0;
  121. }
  122. return s;
  123. }
  124. /*
  125. * Get the weight for a given time span.
  126. */
  127. static u64 hst_weight(struct path_selector *ps, u64 delta)
  128. {
  129. struct selector *s = ps->context;
  130. int bucket = clamp(delta >> HST_BUCKET_SHIFT, 0ULL,
  131. HST_WEIGHT_COUNT - 1);
  132. return s->weights[bucket];
  133. }
  134. /*
  135. * Set up the weights array.
  136. *
  137. * weights[len-1] = 0
  138. * weights[n] = base ^ (n + 1)
  139. */
  140. static void hst_set_weights(struct path_selector *ps, unsigned int base)
  141. {
  142. struct selector *s = ps->context;
  143. int i;
  144. if (base >= HST_FIXED_1)
  145. return;
  146. for (i = 0; i < HST_WEIGHT_COUNT - 1; i++)
  147. s->weights[i] = fixed_power(base, HST_FIXED_SHIFT, i + 1);
  148. s->weights[HST_WEIGHT_COUNT - 1] = 0;
  149. }
  150. static int hst_create(struct path_selector *ps, unsigned int argc, char **argv)
  151. {
  152. struct selector *s;
  153. unsigned int base_weight = HST_FIXED_95;
  154. unsigned int threshold_multiplier = 0;
  155. char dummy;
  156. /*
  157. * Arguments: [<base_weight> [<threshold_multiplier>]]
  158. * <base_weight>: Base weight for ema [0, 1024) 10-bit fixed point. A
  159. * value of 0 will completely ignore any history.
  160. * If not given, default (HST_FIXED_95) is used.
  161. * <threshold_multiplier>: Minimum threshold multiplier for paths to
  162. * be considered different. That is, a path is
  163. * considered different iff (p1 > N * p2) where p1
  164. * is the path with higher service time. A threshold
  165. * of 1 or 0 has no effect. Defaults to 0.
  166. */
  167. if (argc > 2)
  168. return -EINVAL;
  169. if (argc && (sscanf(argv[0], "%u%c", &base_weight, &dummy) != 1 ||
  170. base_weight >= HST_FIXED_1)) {
  171. return -EINVAL;
  172. }
  173. if (argc > 1 && (sscanf(argv[1], "%u%c",
  174. &threshold_multiplier, &dummy) != 1)) {
  175. return -EINVAL;
  176. }
  177. s = alloc_selector();
  178. if (!s)
  179. return -ENOMEM;
  180. ps->context = s;
  181. hst_set_weights(ps, base_weight);
  182. s->threshold_multiplier = threshold_multiplier;
  183. return 0;
  184. }
  185. static void free_paths(struct list_head *paths)
  186. {
  187. struct path_info *pi, *next;
  188. list_for_each_entry_safe(pi, next, paths, list) {
  189. list_del(&pi->list);
  190. kfree(pi);
  191. }
  192. }
  193. static void hst_destroy(struct path_selector *ps)
  194. {
  195. struct selector *s = ps->context;
  196. free_paths(&s->valid_paths);
  197. free_paths(&s->failed_paths);
  198. kfree(s);
  199. ps->context = NULL;
  200. }
  201. static int hst_status(struct path_selector *ps, struct dm_path *path,
  202. status_type_t type, char *result, unsigned int maxlen)
  203. {
  204. unsigned int sz = 0;
  205. struct path_info *pi;
  206. if (!path) {
  207. struct selector *s = ps->context;
  208. DMEMIT("2 %u %u ", s->weights[0], s->threshold_multiplier);
  209. } else {
  210. pi = path->pscontext;
  211. switch (type) {
  212. case STATUSTYPE_INFO:
  213. DMEMIT("%llu %llu %llu ", pi->historical_service_time,
  214. pi->outstanding, pi->stale_after);
  215. break;
  216. case STATUSTYPE_TABLE:
  217. DMEMIT("0 ");
  218. break;
  219. case STATUSTYPE_IMA:
  220. *result = '\0';
  221. break;
  222. }
  223. }
  224. return sz;
  225. }
  226. static int hst_add_path(struct path_selector *ps, struct dm_path *path,
  227. int argc, char **argv, char **error)
  228. {
  229. struct selector *s = ps->context;
  230. struct path_info *pi;
  231. unsigned int repeat_count = HST_MIN_IO;
  232. char dummy;
  233. unsigned long flags;
  234. /*
  235. * Arguments: [<repeat_count>]
  236. * <repeat_count>: The number of I/Os before switching path.
  237. * If not given, default (HST_MIN_IO) is used.
  238. */
  239. if (argc > 1) {
  240. *error = "historical-service-time ps: incorrect number of arguments";
  241. return -EINVAL;
  242. }
  243. if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
  244. *error = "historical-service-time ps: invalid repeat count";
  245. return -EINVAL;
  246. }
  247. /* allocate the path */
  248. pi = kmalloc(sizeof(*pi), GFP_KERNEL);
  249. if (!pi) {
  250. *error = "historical-service-time ps: Error allocating path context";
  251. return -ENOMEM;
  252. }
  253. pi->path = path;
  254. pi->repeat_count = repeat_count;
  255. pi->historical_service_time = HST_FIXED_1;
  256. spin_lock_init(&pi->lock);
  257. pi->outstanding = 0;
  258. pi->stale_after = 0;
  259. pi->last_finish = 0;
  260. path->pscontext = pi;
  261. spin_lock_irqsave(&s->lock, flags);
  262. list_add_tail(&pi->list, &s->valid_paths);
  263. s->valid_count++;
  264. spin_unlock_irqrestore(&s->lock, flags);
  265. return 0;
  266. }
  267. static void hst_fail_path(struct path_selector *ps, struct dm_path *path)
  268. {
  269. struct selector *s = ps->context;
  270. struct path_info *pi = path->pscontext;
  271. unsigned long flags;
  272. spin_lock_irqsave(&s->lock, flags);
  273. list_move(&pi->list, &s->failed_paths);
  274. s->valid_count--;
  275. spin_unlock_irqrestore(&s->lock, flags);
  276. }
  277. static int hst_reinstate_path(struct path_selector *ps, struct dm_path *path)
  278. {
  279. struct selector *s = ps->context;
  280. struct path_info *pi = path->pscontext;
  281. unsigned long flags;
  282. spin_lock_irqsave(&s->lock, flags);
  283. list_move_tail(&pi->list, &s->valid_paths);
  284. s->valid_count++;
  285. spin_unlock_irqrestore(&s->lock, flags);
  286. return 0;
  287. }
  288. static void hst_fill_compare(struct path_info *pi, u64 *hst,
  289. u64 *out, u64 *stale)
  290. {
  291. unsigned long flags;
  292. spin_lock_irqsave(&pi->lock, flags);
  293. *hst = pi->historical_service_time;
  294. *out = pi->outstanding;
  295. *stale = pi->stale_after;
  296. spin_unlock_irqrestore(&pi->lock, flags);
  297. }
  298. /*
  299. * Compare the estimated service time of 2 paths, pi1 and pi2,
  300. * for the incoming I/O.
  301. *
  302. * Returns:
  303. * < 0 : pi1 is better
  304. * 0 : no difference between pi1 and pi2
  305. * > 0 : pi2 is better
  306. *
  307. */
  308. static long long hst_compare(struct path_info *pi1, struct path_info *pi2,
  309. u64 time_now, struct path_selector *ps)
  310. {
  311. struct selector *s = ps->context;
  312. u64 hst1, hst2;
  313. long long out1, out2, stale1, stale2;
  314. int pi2_better, over_threshold;
  315. hst_fill_compare(pi1, &hst1, &out1, &stale1);
  316. hst_fill_compare(pi2, &hst2, &out2, &stale2);
  317. /* Check here if estimated latency for two paths are too similar.
  318. * If this is the case, we skip extra calculation and just compare
  319. * outstanding requests. In this case, any unloaded paths will
  320. * be preferred.
  321. */
  322. if (hst1 > hst2)
  323. over_threshold = hst1 > (s->threshold_multiplier * hst2);
  324. else
  325. over_threshold = hst2 > (s->threshold_multiplier * hst1);
  326. if (!over_threshold)
  327. return out1 - out2;
  328. /*
  329. * If an unloaded path is stale, choose it. If both paths are unloaded,
  330. * choose path that is the most stale.
  331. * (If one path is loaded, choose the other)
  332. */
  333. if ((!out1 && stale1 < time_now) || (!out2 && stale2 < time_now) ||
  334. (!out1 && !out2))
  335. return (!out2 * stale1) - (!out1 * stale2);
  336. /* Compare estimated service time. If outstanding is the same, we
  337. * don't need to multiply
  338. */
  339. if (out1 == out2) {
  340. pi2_better = hst1 > hst2;
  341. } else {
  342. /* Potential overflow with out >= 1024 */
  343. if (unlikely(out1 >= HST_MAX_INFLIGHT ||
  344. out2 >= HST_MAX_INFLIGHT)) {
  345. /* If over 1023 in-flights, we may overflow if hst
  346. * is at max. (With this shift we still overflow at
  347. * 1048576 in-flights, which is high enough).
  348. */
  349. hst1 >>= HST_FIXED_SHIFT;
  350. hst2 >>= HST_FIXED_SHIFT;
  351. }
  352. pi2_better = (1 + out1) * hst1 > (1 + out2) * hst2;
  353. }
  354. /* In the case that the 'winner' is stale, limit to equal usage. */
  355. if (pi2_better) {
  356. if (stale2 < time_now)
  357. return out1 - out2;
  358. return 1;
  359. }
  360. if (stale1 < time_now)
  361. return out1 - out2;
  362. return -1;
  363. }
  364. static struct dm_path *hst_select_path(struct path_selector *ps,
  365. size_t nr_bytes)
  366. {
  367. struct selector *s = ps->context;
  368. struct path_info *pi = NULL, *best = NULL;
  369. u64 time_now = ktime_get_ns();
  370. struct dm_path *ret = NULL;
  371. unsigned long flags;
  372. spin_lock_irqsave(&s->lock, flags);
  373. if (list_empty(&s->valid_paths))
  374. goto out;
  375. list_for_each_entry(pi, &s->valid_paths, list) {
  376. if (!best || (hst_compare(pi, best, time_now, ps) < 0))
  377. best = pi;
  378. }
  379. if (!best)
  380. goto out;
  381. /* Move last used path to end (least preferred in case of ties) */
  382. list_move_tail(&best->list, &s->valid_paths);
  383. ret = best->path;
  384. out:
  385. spin_unlock_irqrestore(&s->lock, flags);
  386. return ret;
  387. }
  388. static int hst_start_io(struct path_selector *ps, struct dm_path *path,
  389. size_t nr_bytes)
  390. {
  391. struct path_info *pi = path->pscontext;
  392. unsigned long flags;
  393. spin_lock_irqsave(&pi->lock, flags);
  394. pi->outstanding++;
  395. spin_unlock_irqrestore(&pi->lock, flags);
  396. return 0;
  397. }
  398. static u64 path_service_time(struct path_info *pi, u64 start_time)
  399. {
  400. u64 now = ktime_get_ns();
  401. /* if a previous disk request has finished after this IO was
  402. * sent to the hardware, pretend the submission happened
  403. * serially.
  404. */
  405. if (time_after64(pi->last_finish, start_time))
  406. start_time = pi->last_finish;
  407. pi->last_finish = now;
  408. if (time_before64(now, start_time))
  409. return 0;
  410. return now - start_time;
  411. }
  412. static int hst_end_io(struct path_selector *ps, struct dm_path *path,
  413. size_t nr_bytes, u64 start_time)
  414. {
  415. struct path_info *pi = path->pscontext;
  416. struct selector *s = ps->context;
  417. unsigned long flags;
  418. u64 st;
  419. spin_lock_irqsave(&pi->lock, flags);
  420. st = path_service_time(pi, start_time);
  421. pi->outstanding--;
  422. pi->historical_service_time =
  423. fixed_ema(pi->historical_service_time,
  424. min(st * HST_FIXED_1, HST_FIXED_MAX),
  425. hst_weight(ps, st));
  426. /*
  427. * On request end, mark path as fresh. If a path hasn't
  428. * finished any requests within the fresh period, the estimated
  429. * service time is considered too optimistic and we limit the
  430. * maximum requests on that path.
  431. */
  432. pi->stale_after = pi->last_finish +
  433. (s->valid_count * (pi->historical_service_time >> HST_FIXED_SHIFT));
  434. spin_unlock_irqrestore(&pi->lock, flags);
  435. return 0;
  436. }
  437. static struct path_selector_type hst_ps = {
  438. .name = "historical-service-time",
  439. .module = THIS_MODULE,
  440. .features = DM_PS_USE_HR_TIMER,
  441. .table_args = 1,
  442. .info_args = 3,
  443. .create = hst_create,
  444. .destroy = hst_destroy,
  445. .status = hst_status,
  446. .add_path = hst_add_path,
  447. .fail_path = hst_fail_path,
  448. .reinstate_path = hst_reinstate_path,
  449. .select_path = hst_select_path,
  450. .start_io = hst_start_io,
  451. .end_io = hst_end_io,
  452. };
  453. static int __init dm_hst_init(void)
  454. {
  455. int r = dm_register_path_selector(&hst_ps);
  456. if (r < 0)
  457. DMERR("register failed %d", r);
  458. DMINFO("version " HST_VERSION " loaded");
  459. return r;
  460. }
  461. static void __exit dm_hst_exit(void)
  462. {
  463. int r = dm_unregister_path_selector(&hst_ps);
  464. if (r < 0)
  465. DMERR("unregister failed %d", r);
  466. }
  467. module_init(dm_hst_init);
  468. module_exit(dm_hst_exit);
  469. MODULE_DESCRIPTION(DM_NAME " measured service time oriented path selector");
  470. MODULE_AUTHOR("Khazhismel Kumykov <[email protected]>");
  471. MODULE_LICENSE("GPL");