kyber-iosched.c

// SPDX-License-Identifier: GPL-2.0
/*
 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
 * scalable techniques.
 *
 * Copyright (C) 2017 Facebook
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/module.h>
#include <linux/sbitmap.h>

#include <trace/events/block.h>

#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kyber.h>

/*
 * Scheduling domains: the device is divided into multiple domains based on the
 * request type.
 */
enum {
	KYBER_READ,
	KYBER_WRITE,
	KYBER_DISCARD,
	KYBER_OTHER,
	KYBER_NUM_DOMAINS,
};

static const char *kyber_domain_names[] = {
	[KYBER_READ] = "READ",
	[KYBER_WRITE] = "WRITE",
	[KYBER_DISCARD] = "DISCARD",
	[KYBER_OTHER] = "OTHER",
};

enum {
	/*
	 * In order to prevent starvation of synchronous requests by a flood of
	 * asynchronous requests, we reserve 25% of requests for synchronous
	 * operations.
	 */
	KYBER_ASYNC_PERCENT = 75,
};

/*
 * Maximum device-wide depth for each scheduling domain.
 *
 * Even for fast devices with lots of tags like NVMe, you can saturate the
 * device with only a fraction of the maximum possible queue depth. So, we cap
 * these to a reasonable value.
 */
static const unsigned int kyber_depth[] = {
	[KYBER_READ] = 256,
	[KYBER_WRITE] = 128,
	[KYBER_DISCARD] = 64,
	[KYBER_OTHER] = 16,
};

/*
 * Default latency targets for each scheduling domain.
 */
static const u64 kyber_latency_targets[] = {
	[KYBER_READ] = 2ULL * NSEC_PER_MSEC,
	[KYBER_WRITE] = 10ULL * NSEC_PER_MSEC,
	[KYBER_DISCARD] = 5ULL * NSEC_PER_SEC,
};

/*
 * Batch size (number of requests we'll dispatch in a row) for each scheduling
 * domain.
 */
static const unsigned int kyber_batch_size[] = {
	[KYBER_READ] = 16,
	[KYBER_WRITE] = 8,
	[KYBER_DISCARD] = 1,
	[KYBER_OTHER] = 1,
};
/*
 * Request latencies are recorded in a histogram with buckets defined relative
 * to the target latency:
 *
 * <= 1/4 * target latency
 * <= 1/2 * target latency
 * <= 3/4 * target latency
 * <= target latency
 * <= 1 1/4 * target latency
 * <= 1 1/2 * target latency
 * <= 1 3/4 * target latency
 * > 1 3/4 * target latency
 */
enum {
	/*
	 * The width of the latency histogram buckets is
	 * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency.
	 */
	KYBER_LATENCY_SHIFT = 2,
	/*
	 * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency,
	 * thus, "good".
	 */
	KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT,
	/* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */
	KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT,
};
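
/*
 * Illustrative example (added note, not part of the upstream logic): with the
 * default 2 ms read target and KYBER_LATENCY_SHIFT = 2, the bucket width is
 * 500 us, so the eight buckets cover <= 0.5, <= 1, <= 1.5, <= 2 ms ("good")
 * and <= 2.5, <= 3, <= 3.5, > 3.5 ms ("bad").
 */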

/*
 * We measure both the total latency and the I/O latency (i.e., latency after
 * submitting to the device).
 */
enum {
	KYBER_TOTAL_LATENCY,
	KYBER_IO_LATENCY,
};

static const char *kyber_latency_type_names[] = {
	[KYBER_TOTAL_LATENCY] = "total",
	[KYBER_IO_LATENCY] = "I/O",
};

/*
 * Per-cpu latency histograms: total latency and I/O latency for each scheduling
 * domain except for KYBER_OTHER.
 */
struct kyber_cpu_latency {
	atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];
};

/*
 * There is the same mapping between ctx & hctx and kcq & khd; we use
 * request->mq_ctx->index_hw to index the kcq in khd.
 */
struct kyber_ctx_queue {
	/*
	 * Used to ensure that operations on rq_list and kcq_map are atomic.
	 * Also protects the requests on rq_list during merging.
	 */
	spinlock_t lock;
	struct list_head rq_list[KYBER_NUM_DOMAINS];
} ____cacheline_aligned_in_smp;

struct kyber_queue_data {
	struct request_queue *q;
	dev_t dev;

	/*
	 * Each scheduling domain has a limited number of in-flight requests
	 * device-wide, limited by these tokens.
	 */
	struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];

	/*
	 * Async request percentage, converted to per-word depth for
	 * sbitmap_get_shallow().
	 */
	unsigned int async_depth;

	struct kyber_cpu_latency __percpu *cpu_latency;

	/* Timer for stats aggregation and adjusting domain tokens. */
	struct timer_list timer;

	unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS];

	unsigned long latency_timeout[KYBER_OTHER];

	int domain_p99[KYBER_OTHER];

	/* Target latencies in nanoseconds. */
	u64 latency_targets[KYBER_OTHER];
};

struct kyber_hctx_data {
	spinlock_t lock;
	struct list_head rqs[KYBER_NUM_DOMAINS];
	unsigned int cur_domain;
	unsigned int batching;
	struct kyber_ctx_queue *kcqs;
	struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
	struct sbq_wait domain_wait[KYBER_NUM_DOMAINS];
	struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
	atomic_t wait_index[KYBER_NUM_DOMAINS];
};
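
/*
 * Summary (added for clarity): kyber_queue_data holds the device-wide state
 * (domain tokens, latency targets and aggregated histograms), while
 * kyber_hctx_data holds the per-hardware-queue dispatch state (pending
 * requests, the domain currently being batched, and the token wait queues).
 */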

static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
			     void *key);

static unsigned int kyber_sched_domain(blk_opf_t opf)
{
	switch (opf & REQ_OP_MASK) {
	case REQ_OP_READ:
		return KYBER_READ;
	case REQ_OP_WRITE:
		return KYBER_WRITE;
	case REQ_OP_DISCARD:
		return KYBER_DISCARD;
	default:
		return KYBER_OTHER;
	}
}

static void flush_latency_buckets(struct kyber_queue_data *kqd,
				  struct kyber_cpu_latency *cpu_latency,
				  unsigned int sched_domain, unsigned int type)
{
	unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
	atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type];
	unsigned int bucket;

	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
		buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0);
}

/*
 * Calculate the histogram bucket with the given percentile rank, or -1 if there
 * aren't enough samples yet.
 */
static int calculate_percentile(struct kyber_queue_data *kqd,
				unsigned int sched_domain, unsigned int type,
				unsigned int percentile)
{
	unsigned int *buckets = kqd->latency_buckets[sched_domain][type];
	unsigned int bucket, samples = 0, percentile_samples;

	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++)
		samples += buckets[bucket];

	if (!samples)
		return -1;

	/*
	 * We do the calculation once we have 500 samples or one second passes
	 * since the first sample was recorded, whichever comes first.
	 */
	if (!kqd->latency_timeout[sched_domain])
		kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL);
	if (samples < 500 &&
	    time_is_after_jiffies(kqd->latency_timeout[sched_domain])) {
		return -1;
	}
	kqd->latency_timeout[sched_domain] = 0;

	percentile_samples = DIV_ROUND_UP(samples * percentile, 100);
	for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) {
		if (buckets[bucket] >= percentile_samples)
			break;
		percentile_samples -= buckets[bucket];
	}
	memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));

	trace_kyber_latency(kqd->dev, kyber_domain_names[sched_domain],
			    kyber_latency_type_names[type], percentile,
			    bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples);

	return bucket;
}
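
/*
 * Worked example (added, illustrative): for the 90th percentile over 600
 * samples, percentile_samples = DIV_ROUND_UP(600 * 90, 100) = 540. With
 * bucket counts {300, 200, 80, 20, 0, ...}, the loop subtracts 300 and then
 * 200, leaving 40, and stops at bucket 2 (80 >= 40), so the p90 falls in
 * bucket 2.
 */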

static void kyber_resize_domain(struct kyber_queue_data *kqd,
				unsigned int sched_domain, unsigned int depth)
{
	depth = clamp(depth, 1U, kyber_depth[sched_domain]);
	if (depth != kqd->domain_tokens[sched_domain].sb.depth) {
		sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
		trace_kyber_adjust(kqd->dev, kyber_domain_names[sched_domain],
				   depth);
	}
}

static void kyber_timer_fn(struct timer_list *t)
{
	struct kyber_queue_data *kqd = from_timer(kqd, t, timer);
	unsigned int sched_domain;
	int cpu;
	bool bad = false;

	/* Sum all of the per-cpu latency histograms. */
	for_each_online_cpu(cpu) {
		struct kyber_cpu_latency *cpu_latency;

		cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu);
		for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
			flush_latency_buckets(kqd, cpu_latency, sched_domain,
					      KYBER_TOTAL_LATENCY);
			flush_latency_buckets(kqd, cpu_latency, sched_domain,
					      KYBER_IO_LATENCY);
		}
	}

	/*
	 * Check if any domains have a high I/O latency, which might indicate
	 * congestion in the device. Note that we use the p90; we don't want to
	 * be too sensitive to outliers here.
	 */
	for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
		int p90;

		p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY,
					   90);
		if (p90 >= KYBER_GOOD_BUCKETS)
			bad = true;
	}

	/*
	 * Adjust the scheduling domain depths. If we determined that there was
	 * congestion, we throttle all domains with good latencies. Either way,
	 * we ease up on throttling domains with bad latencies.
	 */
	for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) {
		unsigned int orig_depth, depth;
		int p99;

		p99 = calculate_percentile(kqd, sched_domain,
					   KYBER_TOTAL_LATENCY, 99);
		/*
		 * This is kind of subtle: different domains will not
		 * necessarily have enough samples to calculate the latency
		 * percentiles during the same window, so we have to remember
		 * the p99 for the next time we observe congestion; once we do,
		 * we don't want to throttle again until we get more data, so we
		 * reset it to -1.
		 */
		if (bad) {
			if (p99 < 0)
				p99 = kqd->domain_p99[sched_domain];
			kqd->domain_p99[sched_domain] = -1;
		} else if (p99 >= 0) {
			kqd->domain_p99[sched_domain] = p99;
		}
		if (p99 < 0)
			continue;

		/*
		 * If this domain has bad latency, throttle less. Otherwise,
		 * throttle more iff we determined that there is congestion.
		 *
		 * The new depth is scaled linearly with the p99 latency vs the
		 * latency target. E.g., if the p99 is 3/4 of the target, then
		 * we throttle down to 3/4 of the current depth, and if the p99
		 * is 2x the target, then we double the depth.
		 */
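		/*
		 * Worked example (added, using the defaults above): with
		 * KYBER_LATENCY_SHIFT = 2, a p99 in bucket 2 (at most 3/4 of
		 * the target) gives depth = orig_depth * 3 / 4, while a p99
		 * in the last bucket (7, more than 1 3/4 of the target) gives
		 * depth = orig_depth * 2, which kyber_resize_domain() then
		 * clamps to kyber_depth[sched_domain].
		 */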
		if (bad || p99 >= KYBER_GOOD_BUCKETS) {
			orig_depth = kqd->domain_tokens[sched_domain].sb.depth;
			depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT;
			kyber_resize_domain(kqd, sched_domain, depth);
		}
	}
}

static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
	struct kyber_queue_data *kqd;
	int ret = -ENOMEM;
	int i;

	kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
	if (!kqd)
		goto err;

	kqd->q = q;
	kqd->dev = disk_devt(q->disk);

	kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
					    GFP_KERNEL | __GFP_ZERO);
	if (!kqd->cpu_latency)
		goto err_kqd;

	timer_setup(&kqd->timer, kyber_timer_fn, 0);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		WARN_ON(!kyber_depth[i]);
		WARN_ON(!kyber_batch_size[i]);
		ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
					      kyber_depth[i], -1, false,
					      GFP_KERNEL, q->node);
		if (ret) {
			while (--i >= 0)
				sbitmap_queue_free(&kqd->domain_tokens[i]);
			goto err_buckets;
		}
	}

	for (i = 0; i < KYBER_OTHER; i++) {
		kqd->domain_p99[i] = -1;
		kqd->latency_targets[i] = kyber_latency_targets[i];
	}

	return kqd;

err_buckets:
	free_percpu(kqd->cpu_latency);
err_kqd:
	kfree(kqd);
err:
	return ERR_PTR(ret);
}

static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct kyber_queue_data *kqd;
	struct elevator_queue *eq;

	eq = elevator_alloc(q, e);
	if (!eq)
		return -ENOMEM;

	kqd = kyber_queue_data_alloc(q);
	if (IS_ERR(kqd)) {
		kobject_put(&eq->kobj);
		return PTR_ERR(kqd);
	}

	blk_stat_enable_accounting(q);

	blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);

	eq->elevator_data = kqd;
	q->elevator = eq;

	return 0;
}

static void kyber_exit_sched(struct elevator_queue *e)
{
	struct kyber_queue_data *kqd = e->elevator_data;
	int i;

	del_timer_sync(&kqd->timer);
	blk_stat_disable_accounting(kqd->q);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_queue_free(&kqd->domain_tokens[i]);
	free_percpu(kqd->cpu_latency);
	kfree(kqd);
}

static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
{
	unsigned int i;

	spin_lock_init(&kcq->lock);
	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		INIT_LIST_HEAD(&kcq->rq_list[i]);
}

static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct blk_mq_tags *tags = hctx->sched_tags;
	unsigned int shift = tags->bitmap_tags.sb.shift;

	kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;

	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
}
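
/*
 * Example (added, illustrative): if the scheduler tag sbitmap uses a shift of
 * 6 (64 bits per word), async_depth = 64 * 75 / 100 = 48, so asynchronous
 * requests may consume at most 48 of the 64 tags in each sbitmap word,
 * leaving the remainder for synchronous I/O.
 */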

static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_hctx_data *khd;
	int i;

	khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
	if (!khd)
		return -ENOMEM;

	khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
				       sizeof(struct kyber_ctx_queue),
				       GFP_KERNEL, hctx->numa_node);
	if (!khd->kcqs)
		goto err_khd;

	for (i = 0; i < hctx->nr_ctx; i++)
		kyber_ctx_queue_init(&khd->kcqs[i]);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
				      ilog2(8), GFP_KERNEL, hctx->numa_node,
				      false, false)) {
			while (--i >= 0)
				sbitmap_free(&khd->kcq_map[i]);
			goto err_kcqs;
		}
	}

	spin_lock_init(&khd->lock);

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		INIT_LIST_HEAD(&khd->rqs[i]);
		khd->domain_wait[i].sbq = NULL;
		init_waitqueue_func_entry(&khd->domain_wait[i].wait,
					  kyber_domain_wake);
		khd->domain_wait[i].wait.private = hctx;
		INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry);
		atomic_set(&khd->wait_index[i], 0);
	}

	khd->cur_domain = 0;
	khd->batching = 0;

	hctx->sched_data = khd;
	kyber_depth_updated(hctx);

	return 0;

err_kcqs:
	kfree(khd->kcqs);
err_khd:
	kfree(khd);
	return -ENOMEM;
}

static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++)
		sbitmap_free(&khd->kcq_map[i]);
	kfree(khd->kcqs);
	kfree(hctx->sched_data);
}

static int rq_get_domain_token(struct request *rq)
{
	return (long)rq->elv.priv[0];
}

static void rq_set_domain_token(struct request *rq, int token)
{
	rq->elv.priv[0] = (void *)(long)token;
}

static void rq_clear_domain_token(struct kyber_queue_data *kqd,
				  struct request *rq)
{
	unsigned int sched_domain;
	int nr;

	nr = rq_get_domain_token(rq);
	if (nr != -1) {
		sched_domain = kyber_sched_domain(rq->cmd_flags);
		sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
				    rq->mq_ctx->cpu);
	}
}
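
/*
 * Note (added for clarity): the domain token is stashed in rq->elv.priv[0].
 * kyber_prepare_request() initializes it to -1, a real token is filled in
 * when the request is dispatched, and kyber_finish_request() (also used for
 * requeues) releases it via rq_clear_domain_token().
 */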

static void kyber_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
	/*
	 * We use the scheduler tags as per-hardware queue queueing tokens.
	 * Async requests can be limited at this stage.
	 */
	if (!op_is_sync(opf)) {
		struct kyber_queue_data *kqd = data->q->elevator->elevator_data;

		data->shallow_depth = kqd->async_depth;
	}
}

static bool kyber_bio_merge(struct request_queue *q, struct bio *bio,
			    unsigned int nr_segs)
{
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
	unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
	struct list_head *rq_list = &kcq->rq_list[sched_domain];
	bool merged;

	spin_lock(&kcq->lock);
	merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
	spin_unlock(&kcq->lock);

	return merged;
}

static void kyber_prepare_request(struct request *rq)
{
	rq_set_domain_token(rq, -1);
}

static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
				  struct list_head *rq_list, bool at_head)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct request *rq, *next;

	list_for_each_entry_safe(rq, next, rq_list, queuelist) {
		unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
		struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]];
		struct list_head *head = &kcq->rq_list[sched_domain];

		spin_lock(&kcq->lock);
		trace_block_rq_insert(rq);
		if (at_head)
			list_move(&rq->queuelist, head);
		else
			list_move_tail(&rq->queuelist, head);
		sbitmap_set_bit(&khd->kcq_map[sched_domain],
				rq->mq_ctx->index_hw[hctx->type]);
		spin_unlock(&kcq->lock);
	}
}

static void kyber_finish_request(struct request *rq)
{
	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;

	rq_clear_domain_token(kqd, rq);
}

static void add_latency_sample(struct kyber_cpu_latency *cpu_latency,
			       unsigned int sched_domain, unsigned int type,
			       u64 target, u64 latency)
{
	unsigned int bucket;
	u64 divisor;

	if (latency > 0) {
		divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1);
		bucket = min_t(unsigned int, div64_u64(latency - 1, divisor),
			       KYBER_LATENCY_BUCKETS - 1);
	} else {
		bucket = 0;
	}

	atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]);
}

static void kyber_completed_request(struct request *rq, u64 now)
{
	struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;
	struct kyber_cpu_latency *cpu_latency;
	unsigned int sched_domain;
	u64 target;

	sched_domain = kyber_sched_domain(rq->cmd_flags);
	if (sched_domain == KYBER_OTHER)
		return;

	cpu_latency = get_cpu_ptr(kqd->cpu_latency);
	target = kqd->latency_targets[sched_domain];
	add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY,
			   target, now - rq->start_time_ns);
	add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target,
			   now - rq->io_start_time_ns);
	put_cpu_ptr(kqd->cpu_latency);

	timer_reduce(&kqd->timer, jiffies + HZ / 10);
}

struct flush_kcq_data {
	struct kyber_hctx_data *khd;
	unsigned int sched_domain;
	struct list_head *list;
};

static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
{
	struct flush_kcq_data *flush_data = data;
	struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];

	spin_lock(&kcq->lock);
	list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
			      flush_data->list);
	sbitmap_clear_bit(sb, bitnr);
	spin_unlock(&kcq->lock);

	return true;
}

static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
				  unsigned int sched_domain,
				  struct list_head *list)
{
	struct flush_kcq_data data = {
		.khd = khd,
		.sched_domain = sched_domain,
		.list = list,
	};

	sbitmap_for_each_set(&khd->kcq_map[sched_domain],
			     flush_busy_kcq, &data);
}

static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags,
			     void *key)
{
	struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private);
	struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait);

	sbitmap_del_wait_queue(wait);
	blk_mq_run_hw_queue(hctx, true);
	return 1;
}

static int kyber_get_domain_token(struct kyber_queue_data *kqd,
				  struct kyber_hctx_data *khd,
				  struct blk_mq_hw_ctx *hctx)
{
	unsigned int sched_domain = khd->cur_domain;
	struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
	struct sbq_wait *wait = &khd->domain_wait[sched_domain];
	struct sbq_wait_state *ws;
	int nr;

	nr = __sbitmap_queue_get(domain_tokens);

	/*
	 * If we failed to get a domain token, make sure the hardware queue is
	 * run when one becomes available. Note that this is serialized on
	 * khd->lock, but we still need to be careful about the waker.
	 */
	if (nr < 0 && list_empty_careful(&wait->wait.entry)) {
		ws = sbq_wait_ptr(domain_tokens,
				  &khd->wait_index[sched_domain]);
		khd->domain_ws[sched_domain] = ws;
		sbitmap_add_wait_queue(domain_tokens, ws, wait);

		/*
		 * Try again in case a token was freed before we got on the wait
		 * queue.
		 */
		nr = __sbitmap_queue_get(domain_tokens);
	}

	/*
	 * If we got a token while we were on the wait queue, remove ourselves
	 * from the wait queue to ensure that all wake ups make forward
	 * progress. It's possible that the waker already deleted the entry
	 * between the !list_empty_careful() check and us grabbing the lock, but
	 * list_del_init() is okay with that.
	 */
	if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) {
		ws = khd->domain_ws[sched_domain];
		spin_lock_irq(&ws->wait.lock);
		sbitmap_del_wait_queue(wait);
		spin_unlock_irq(&ws->wait.lock);
	}

	return nr;
}

static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
			  struct kyber_hctx_data *khd,
			  struct blk_mq_hw_ctx *hctx)
{
	struct list_head *rqs;
	struct request *rq;
	int nr;

	rqs = &khd->rqs[khd->cur_domain];

	/*
	 * If we already have a flushed request, then we just need to get a
	 * token for it. Otherwise, if there are pending requests in the kcqs,
	 * flush the kcqs, but only if we can get a token. If not, we should
	 * leave the requests in the kcqs so that they can be merged. Note that
	 * khd->lock serializes the flushes, so if we observed any bit set in
	 * the kcq_map, we will always get a request.
	 */
	rq = list_first_entry_or_null(rqs, struct request, queuelist);
	if (rq) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		} else {
			trace_kyber_throttled(kqd->dev,
					      kyber_domain_names[khd->cur_domain]);
		}
	} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
		nr = kyber_get_domain_token(kqd, khd, hctx);
		if (nr >= 0) {
			kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
			rq = list_first_entry(rqs, struct request, queuelist);
			khd->batching++;
			rq_set_domain_token(rq, nr);
			list_del_init(&rq->queuelist);
			return rq;
		} else {
			trace_kyber_throttled(kqd->dev,
					      kyber_domain_names[khd->cur_domain]);
		}
	}

	/* There were either no pending requests or no tokens. */
	return NULL;
}

static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
	struct kyber_hctx_data *khd = hctx->sched_data;
	struct request *rq;
	int i;

	spin_lock(&khd->lock);

	/*
	 * First, if we are still entitled to batch, try to dispatch a request
	 * from the batch.
	 */
	if (khd->batching < kyber_batch_size[khd->cur_domain]) {
		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
		if (rq)
			goto out;
	}

	/*
	 * Either,
	 * 1. We were no longer entitled to a batch.
	 * 2. The domain we were batching didn't have any requests.
	 * 3. The domain we were batching was out of tokens.
	 *
	 * Start another batch. Note that this wraps back around to the original
	 * domain if no other domains have requests or tokens.
	 */
	khd->batching = 0;
	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
			khd->cur_domain = 0;
		else
			khd->cur_domain++;

		rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
		if (rq)
			goto out;
	}

	rq = NULL;
out:
	spin_unlock(&khd->lock);
	return rq;
}
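
/*
 * Example (added for clarity): with the default kyber_batch_size table, up to
 * 16 reads are dispatched in a row before the round-robin loop above moves on
 * to WRITE (8), DISCARD (1), and OTHER (1), wrapping back to the original
 * domain if no other domain has requests or tokens.
 */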

static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct kyber_hctx_data *khd = hctx->sched_data;
	int i;

	for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
		if (!list_empty_careful(&khd->rqs[i]) ||
		    sbitmap_any_bit_set(&khd->kcq_map[i]))
			return true;
	}

	return false;
}

#define KYBER_LAT_SHOW_STORE(domain, name) \
static ssize_t kyber_##name##_lat_show(struct elevator_queue *e, \
				       char *page) \
{ \
	struct kyber_queue_data *kqd = e->elevator_data; \
 \
	return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \
} \
 \
static ssize_t kyber_##name##_lat_store(struct elevator_queue *e, \
					const char *page, size_t count) \
{ \
	struct kyber_queue_data *kqd = e->elevator_data; \
	unsigned long long nsec; \
	int ret; \
 \
	ret = kstrtoull(page, 10, &nsec); \
	if (ret) \
		return ret; \
 \
	kqd->latency_targets[domain] = nsec; \
 \
	return count; \
}
KYBER_LAT_SHOW_STORE(KYBER_READ, read);
KYBER_LAT_SHOW_STORE(KYBER_WRITE, write);
#undef KYBER_LAT_SHOW_STORE

#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs[] = {
	KYBER_LAT_ATTR(read),
	KYBER_LAT_ATTR(write),
	__ATTR_NULL
};
#undef KYBER_LAT_ATTR
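
/*
 * Usage note (added, not in the original file): these attributes surface as
 * read_lat_nsec and write_lat_nsec under /sys/block/<dev>/queue/iosched/ once
 * kyber is the active scheduler; writing a value in nanoseconds adjusts the
 * corresponding latency target at runtime.
 */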

#ifdef CONFIG_BLK_DEBUG_FS
#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name) \
static int kyber_##name##_tokens_show(void *data, struct seq_file *m) \
{ \
	struct request_queue *q = data; \
	struct kyber_queue_data *kqd = q->elevator->elevator_data; \
 \
	sbitmap_queue_show(&kqd->domain_tokens[domain], m); \
	return 0; \
} \
 \
static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos) \
	__acquires(&khd->lock) \
{ \
	struct blk_mq_hw_ctx *hctx = m->private; \
	struct kyber_hctx_data *khd = hctx->sched_data; \
 \
	spin_lock(&khd->lock); \
	return seq_list_start(&khd->rqs[domain], *pos); \
} \
 \
static void *kyber_##name##_rqs_next(struct seq_file *m, void *v, \
				     loff_t *pos) \
{ \
	struct blk_mq_hw_ctx *hctx = m->private; \
	struct kyber_hctx_data *khd = hctx->sched_data; \
 \
	return seq_list_next(v, &khd->rqs[domain], pos); \
} \
 \
static void kyber_##name##_rqs_stop(struct seq_file *m, void *v) \
	__releases(&khd->lock) \
{ \
	struct blk_mq_hw_ctx *hctx = m->private; \
	struct kyber_hctx_data *khd = hctx->sched_data; \
 \
	spin_unlock(&khd->lock); \
} \
 \
static const struct seq_operations kyber_##name##_rqs_seq_ops = { \
	.start = kyber_##name##_rqs_start, \
	.next = kyber_##name##_rqs_next, \
	.stop = kyber_##name##_rqs_stop, \
	.show = blk_mq_debugfs_rq_show, \
}; \
 \
static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
{ \
	struct blk_mq_hw_ctx *hctx = data; \
	struct kyber_hctx_data *khd = hctx->sched_data; \
	wait_queue_entry_t *wait = &khd->domain_wait[domain].wait; \
 \
	seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \
	return 0; \
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
#undef KYBER_DEBUGFS_DOMAIN_ATTRS

static int kyber_async_depth_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct kyber_queue_data *kqd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", kqd->async_depth);
	return 0;
}

static int kyber_cur_domain_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct kyber_hctx_data *khd = hctx->sched_data;

	seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]);
	return 0;
}

static int kyber_batching_show(void *data, struct seq_file *m)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct kyber_hctx_data *khd = hctx->sched_data;

	seq_printf(m, "%u\n", khd->batching);
	return 0;
}

#define KYBER_QUEUE_DOMAIN_ATTRS(name) \
	{#name "_tokens", 0400, kyber_##name##_tokens_show}
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
	KYBER_QUEUE_DOMAIN_ATTRS(read),
	KYBER_QUEUE_DOMAIN_ATTRS(write),
	KYBER_QUEUE_DOMAIN_ATTRS(discard),
	KYBER_QUEUE_DOMAIN_ATTRS(other),
	{"async_depth", 0400, kyber_async_depth_show},
	{},
};
#undef KYBER_QUEUE_DOMAIN_ATTRS

#define KYBER_HCTX_DOMAIN_ATTRS(name) \
	{#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops}, \
	{#name "_waiting", 0400, kyber_##name##_waiting_show}
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
	KYBER_HCTX_DOMAIN_ATTRS(read),
	KYBER_HCTX_DOMAIN_ATTRS(write),
	KYBER_HCTX_DOMAIN_ATTRS(discard),
	KYBER_HCTX_DOMAIN_ATTRS(other),
	{"cur_domain", 0400, kyber_cur_domain_show},
	{"batching", 0400, kyber_batching_show},
	{},
};
#undef KYBER_HCTX_DOMAIN_ATTRS
#endif

static struct elevator_type kyber_sched = {
	.ops = {
		.init_sched = kyber_init_sched,
		.exit_sched = kyber_exit_sched,
		.init_hctx = kyber_init_hctx,
		.exit_hctx = kyber_exit_hctx,
		.limit_depth = kyber_limit_depth,
		.bio_merge = kyber_bio_merge,
		.prepare_request = kyber_prepare_request,
		.insert_requests = kyber_insert_requests,
		.finish_request = kyber_finish_request,
		.requeue_request = kyber_finish_request,
		.completed_request = kyber_completed_request,
		.dispatch_request = kyber_dispatch_request,
		.has_work = kyber_has_work,
		.depth_updated = kyber_depth_updated,
	},
#ifdef CONFIG_BLK_DEBUG_FS
	.queue_debugfs_attrs = kyber_queue_debugfs_attrs,
	.hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
#endif
	.elevator_attrs = kyber_sched_attrs,
	.elevator_name = "kyber",
	.elevator_owner = THIS_MODULE,
};

static int __init kyber_init(void)
{
	return elv_register(&kyber_sched);
}

static void __exit kyber_exit(void)
{
	elv_unregister(&kyber_sched);
}

module_init(kyber_init);
module_exit(kyber_exit);

MODULE_AUTHOR("Omar Sandoval");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kyber I/O scheduler");