blk-throttle.c

  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Interface for controlling IO bandwidth on a request queue
  4. *
  5. * Copyright (C) 2010 Vivek Goyal <[email protected]>
  6. */
  7. #include <linux/module.h>
  8. #include <linux/slab.h>
  9. #include <linux/blkdev.h>
  10. #include <linux/bio.h>
  11. #include <linux/blktrace_api.h>
  12. #include "blk.h"
  13. #include "blk-cgroup-rwstat.h"
  14. #include "blk-stat.h"
  15. #include "blk-throttle.h"
  16. /* Max dispatch from a group in 1 round */
  17. #define THROTL_GRP_QUANTUM 8
  18. /* Total max dispatch from all groups in one round */
  19. #define THROTL_QUANTUM 32
  20. /* Throttling is performed over a slice and after that slice is renewed */
  21. #define DFL_THROTL_SLICE_HD (HZ / 10)
  22. #define DFL_THROTL_SLICE_SSD (HZ / 50)
  23. #define MAX_THROTL_SLICE (HZ)
  24. #define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
  25. #define MIN_THROTL_BPS (320 * 1024)
  26. #define MIN_THROTL_IOPS (10)
  27. #define DFL_LATENCY_TARGET (-1L)
  28. #define DFL_IDLE_THRESHOLD (0)
  29. #define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */
  30. #define LATENCY_FILTERED_SSD (0)
  31. /*
  32. * For HD, very small latency comes from sequential IO. Such IO is of no help
  33. * in determining whether the IO is impacted by others, hence we ignore it.
  34. */
  35. #define LATENCY_FILTERED_HD (1000L) /* 1ms */
  36. /* A workqueue to queue throttle related work */
  37. static struct workqueue_struct *kthrotld_workqueue;
  38. #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
  39. /* We measure latency for request size from <= 4k to >= 1M */
  40. #define LATENCY_BUCKET_SIZE 9
  41. struct latency_bucket {
  42. unsigned long total_latency; /* ns / 1024 */
  43. int samples;
  44. };
  45. struct avg_latency_bucket {
  46. unsigned long latency; /* ns / 1024 */
  47. bool valid;
  48. };
  49. struct throtl_data
  50. {
  51. /* service tree for active throtl groups */
  52. struct throtl_service_queue service_queue;
  53. struct request_queue *queue;
  54. /* Total Number of queued bios on READ and WRITE lists */
  55. unsigned int nr_queued[2];
  56. unsigned int throtl_slice;
  57. /* Work for dispatching throttled bios */
  58. struct work_struct dispatch_work;
  59. unsigned int limit_index;
  60. bool limit_valid[LIMIT_CNT];
  61. unsigned long low_upgrade_time;
  62. unsigned long low_downgrade_time;
  63. unsigned int scale;
  64. struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
  65. struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
  66. struct latency_bucket __percpu *latency_buckets[2];
  67. unsigned long last_calculate_time;
  68. unsigned long filtered_latency;
  69. bool track_bio_latency;
  70. };
  71. static void throtl_pending_timer_fn(struct timer_list *t);
  72. static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
  73. {
  74. return pd_to_blkg(&tg->pd);
  75. }
  76. /**
  77. * sq_to_tg - return the throtl_grp the specified service queue belongs to
  78. * @sq: the throtl_service_queue of interest
  79. *
  80. * Return the throtl_grp @sq belongs to. If @sq is the top-level one
  81. * embedded in throtl_data, %NULL is returned.
  82. */
  83. static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
  84. {
  85. if (sq && sq->parent_sq)
  86. return container_of(sq, struct throtl_grp, service_queue);
  87. else
  88. return NULL;
  89. }
  90. /**
  91. * sq_to_td - return throtl_data the specified service queue belongs to
  92. * @sq: the throtl_service_queue of interest
  93. *
  94. * A service_queue can be embedded in either a throtl_grp or throtl_data.
  95. * Determine the associated throtl_data accordingly and return it.
  96. */
  97. static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
  98. {
  99. struct throtl_grp *tg = sq_to_tg(sq);
  100. if (tg)
  101. return tg->td;
  102. else
  103. return container_of(sq, struct throtl_data, service_queue);
  104. }
  105. /*
  106. * A cgroup's limit in LIMIT_MAX is scaled if a low limit is set. This scaling
  107. * makes IO dispatch smoother.
  108. * Scale up: linearly scale up according to the time elapsed since the upgrade.
  109. * For every throtl_slice, the limit scales up by 1/2 of the .low limit until
  110. * the limit hits the .max limit.
  111. * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
  112. */
  113. static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
  114. {
  115. /* arbitrary value to avoid too big scale */
  116. if (td->scale < 4096 && time_after_eq(jiffies,
  117. td->low_upgrade_time + td->scale * td->throtl_slice))
  118. td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;
  119. return low + (low >> 1) * td->scale;
  120. }
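/*
 * Editor's sketch (user space, not kernel code) of the scale-up arithmetic in
 * throtl_adjusted_limit() above.  The throtl_slice value and the example
 * numbers are illustrative assumptions only.  For every throtl_slice elapsed
 * since the upgrade, the effective limit grows by half of the .low limit; the
 * caller then clamps the result to the .max limit.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t adjusted_limit(uint64_t low, unsigned long elapsed_jiffies,
			       unsigned long throtl_slice)
{
	unsigned long scale = elapsed_jiffies / throtl_slice;

	if (scale > 4096)			/* same arbitrary cap as above */
		scale = 4096;
	return low + (low >> 1) * scale;
}

int main(void)
{
	/* .low = 10 MB/s, three slices after the upgrade -> 25 MB/s */
	printf("%llu\n", (unsigned long long)
	       adjusted_limit(10ULL * 1024 * 1024, 300, 100));
	return 0;
}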
  121. static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
  122. {
  123. struct blkcg_gq *blkg = tg_to_blkg(tg);
  124. struct throtl_data *td;
  125. uint64_t ret;
  126. if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
  127. return U64_MAX;
  128. td = tg->td;
  129. ret = tg->bps[rw][td->limit_index];
  130. if (ret == 0 && td->limit_index == LIMIT_LOW) {
  131. /* intermediate node or iops isn't 0 */
  132. if (!list_empty(&blkg->blkcg->css.children) ||
  133. tg->iops[rw][td->limit_index])
  134. return U64_MAX;
  135. else
  136. return MIN_THROTL_BPS;
  137. }
  138. if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
  139. tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
  140. uint64_t adjusted;
  141. adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
  142. ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
  143. }
  144. return ret;
  145. }
  146. static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
  147. {
  148. struct blkcg_gq *blkg = tg_to_blkg(tg);
  149. struct throtl_data *td;
  150. unsigned int ret;
  151. if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
  152. return UINT_MAX;
  153. td = tg->td;
  154. ret = tg->iops[rw][td->limit_index];
  155. if (ret == 0 && tg->td->limit_index == LIMIT_LOW) {
  156. /* intermediate node or bps isn't 0 */
  157. if (!list_empty(&blkg->blkcg->css.children) ||
  158. tg->bps[rw][td->limit_index])
  159. return UINT_MAX;
  160. else
  161. return MIN_THROTL_IOPS;
  162. }
  163. if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
  164. tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
  165. uint64_t adjusted;
  166. adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
  167. if (adjusted > UINT_MAX)
  168. adjusted = UINT_MAX;
  169. ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
  170. }
  171. return ret;
  172. }
  173. #define request_bucket_index(sectors) \
  174. clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)
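/*
 * Editor's sketch (user space): the bucket mapping performed by
 * request_bucket_index() above, with order_base_2() approximated by a loop.
 * 8 sectors (4k) lands in bucket 0 and 2048 sectors (1M) in bucket 8,
 * matching the "<= 4k to >= 1M" range noted above; values are illustrative.
 */
#include <stdio.h>

static int bucket_index(unsigned int sectors)
{
	int order = 0;

	while ((1u << order) < sectors)	/* order_base_2(): ceil(log2(n)) */
		order++;
	order -= 3;
	if (order < 0)
		order = 0;
	if (order > 8)			/* LATENCY_BUCKET_SIZE - 1 */
		order = 8;
	return order;
}

int main(void)
{
	printf("%d %d %d\n", bucket_index(8), bucket_index(256),
	       bucket_index(2048));	/* 0 5 8 */
	return 0;
}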
  175. /**
  176. * throtl_log - log debug message via blktrace
  177. * @sq: the service_queue being reported
  178. * @fmt: printf format string
  179. * @args: printf args
  180. *
  181. * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
  182. * throtl_grp; otherwise, just "throtl".
  183. */
  184. #define throtl_log(sq, fmt, args...) do { \
  185. struct throtl_grp *__tg = sq_to_tg((sq)); \
  186. struct throtl_data *__td = sq_to_td((sq)); \
  187. \
  188. (void)__td; \
  189. if (likely(!blk_trace_note_message_enabled(__td->queue))) \
  190. break; \
  191. if ((__tg)) { \
  192. blk_add_cgroup_trace_msg(__td->queue, \
  193. &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\
  194. } else { \
  195. blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
  196. } \
  197. } while (0)
  198. static inline unsigned int throtl_bio_data_size(struct bio *bio)
  199. {
  200. /* assume it's one sector */
  201. if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
  202. return 512;
  203. return bio->bi_iter.bi_size;
  204. }
  205. static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
  206. {
  207. INIT_LIST_HEAD(&qn->node);
  208. bio_list_init(&qn->bios);
  209. qn->tg = tg;
  210. }
  211. /**
  212. * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
  213. * @bio: bio being added
  214. * @qn: qnode to add bio to
  215. * @queued: the service_queue->queued[] list @qn belongs to
  216. *
  217. * Add @bio to @qn and put @qn on @queued if it's not already on.
  218. * @qn->tg's reference count is bumped when @qn is activated. See the
  219. * comment on top of throtl_qnode definition for details.
  220. */
  221. static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
  222. struct list_head *queued)
  223. {
  224. bio_list_add(&qn->bios, bio);
  225. if (list_empty(&qn->node)) {
  226. list_add_tail(&qn->node, queued);
  227. blkg_get(tg_to_blkg(qn->tg));
  228. }
  229. }
  230. /**
  231. * throtl_peek_queued - peek the first bio on a qnode list
  232. * @queued: the qnode list to peek
  233. */
  234. static struct bio *throtl_peek_queued(struct list_head *queued)
  235. {
  236. struct throtl_qnode *qn;
  237. struct bio *bio;
  238. if (list_empty(queued))
  239. return NULL;
  240. qn = list_first_entry(queued, struct throtl_qnode, node);
  241. bio = bio_list_peek(&qn->bios);
  242. WARN_ON_ONCE(!bio);
  243. return bio;
  244. }
  245. /**
  246. * throtl_pop_queued - pop the first bio from a qnode list
  247. * @queued: the qnode list to pop a bio from
  248. * @tg_to_put: optional out argument for throtl_grp to put
  249. *
  250. * Pop the first bio from the qnode list @queued. After popping, the first
  251. * qnode is removed from @queued if empty or moved to the end of @queued so
  252. * that the popping order is round-robin.
  253. *
  254. * When the first qnode is removed, its associated throtl_grp should be put
  255. * too. If @tg_to_put is NULL, this function automatically puts it;
  256. * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
  257. * responsible for putting it.
  258. */
  259. static struct bio *throtl_pop_queued(struct list_head *queued,
  260. struct throtl_grp **tg_to_put)
  261. {
  262. struct throtl_qnode *qn;
  263. struct bio *bio;
  264. if (list_empty(queued))
  265. return NULL;
  266. qn = list_first_entry(queued, struct throtl_qnode, node);
  267. bio = bio_list_pop(&qn->bios);
  268. WARN_ON_ONCE(!bio);
  269. if (bio_list_empty(&qn->bios)) {
  270. list_del_init(&qn->node);
  271. if (tg_to_put)
  272. *tg_to_put = qn->tg;
  273. else
  274. blkg_put(tg_to_blkg(qn->tg));
  275. } else {
  276. list_move_tail(&qn->node, queued);
  277. }
  278. return bio;
  279. }
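/*
 * Editor's sketch (user space): the round-robin behaviour documented above.
 * Each "qnode" here is just a name plus a count of queued bios; the head
 * queue is popped, then rotated to the tail if it still has bios or dropped
 * if it became empty.  Plain arrays stand in for the kernel's
 * list_head/bio_list; names and counts are made up for illustration.
 */
#include <stdio.h>
#include <string.h>

struct qnode {
	char name[8];
	int nr_bios;
};

/* pop one bio; returns 0 when every qnode is empty */
static int pop_one(struct qnode *q, int *nr)
{
	struct qnode head;

	if (!*nr)
		return 0;
	head = q[0];
	printf("bio from %s\n", head.name);
	head.nr_bios--;
	memmove(&q[0], &q[1], (*nr - 1) * sizeof(*q));
	if (head.nr_bios)
		q[*nr - 1] = head;	/* rotate to tail: round-robin */
	else
		(*nr)--;		/* empty qnode leaves the list */
	return 1;
}

int main(void)
{
	struct qnode q[] = { { "cg-A", 2 }, { "cg-B", 1 }, { "cg-C", 2 } };
	int nr = 3;

	while (pop_one(q, &nr))
		;			/* pops in the order A B C A C */
	return 0;
}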
  280. /* init a service_queue, assumes the caller zeroed it */
  281. static void throtl_service_queue_init(struct throtl_service_queue *sq)
  282. {
  283. INIT_LIST_HEAD(&sq->queued[READ]);
  284. INIT_LIST_HEAD(&sq->queued[WRITE]);
  285. sq->pending_tree = RB_ROOT_CACHED;
  286. timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
  287. }
  288. static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
  289. struct request_queue *q,
  290. struct blkcg *blkcg)
  291. {
  292. struct throtl_grp *tg;
  293. int rw;
  294. tg = kzalloc_node(sizeof(*tg), gfp, q->node);
  295. if (!tg)
  296. return NULL;
  297. if (blkg_rwstat_init(&tg->stat_bytes, gfp))
  298. goto err_free_tg;
  299. if (blkg_rwstat_init(&tg->stat_ios, gfp))
  300. goto err_exit_stat_bytes;
  301. throtl_service_queue_init(&tg->service_queue);
  302. for (rw = READ; rw <= WRITE; rw++) {
  303. throtl_qnode_init(&tg->qnode_on_self[rw], tg);
  304. throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
  305. }
  306. RB_CLEAR_NODE(&tg->rb_node);
  307. tg->bps[READ][LIMIT_MAX] = U64_MAX;
  308. tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
  309. tg->iops[READ][LIMIT_MAX] = UINT_MAX;
  310. tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
  311. tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
  312. tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
  313. tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
  314. tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
  315. /* LIMIT_LOW will have default value 0 */
  316. tg->latency_target = DFL_LATENCY_TARGET;
  317. tg->latency_target_conf = DFL_LATENCY_TARGET;
  318. tg->idletime_threshold = DFL_IDLE_THRESHOLD;
  319. tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
  320. return &tg->pd;
  321. err_exit_stat_bytes:
  322. blkg_rwstat_exit(&tg->stat_bytes);
  323. err_free_tg:
  324. kfree(tg);
  325. return NULL;
  326. }
  327. static void throtl_pd_init(struct blkg_policy_data *pd)
  328. {
  329. struct throtl_grp *tg = pd_to_tg(pd);
  330. struct blkcg_gq *blkg = tg_to_blkg(tg);
  331. struct throtl_data *td = blkg->q->td;
  332. struct throtl_service_queue *sq = &tg->service_queue;
  333. /*
  334. * If on the default hierarchy, we switch to properly hierarchical
  335. * behavior where limits on a given throtl_grp are applied to the
  336. * whole subtree rather than just the group itself. e.g. if a 16M
  337. * read_bps limit is set on the root group, the whole system can't
  338. * exceed 16M for the device.
  339. *
  340. * If not on the default hierarchy, the broken flat hierarchy
  341. * behavior is retained where all throtl_grps are treated as if
  342. * they're all separate root groups right below throtl_data.
  343. * Limits of a group don't interact with limits of other groups
  344. * regardless of the position of the group in the hierarchy.
  345. */
  346. sq->parent_sq = &td->service_queue;
  347. if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
  348. sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
  349. tg->td = td;
  350. }
  351. /*
  352. * Set has_rules[] if @tg or any of its parents have limits configured.
  353. * This doesn't require walking up to the top of the hierarchy as the
  354. * parent's has_rules[] is guaranteed to be correct.
  355. */
  356. static void tg_update_has_rules(struct throtl_grp *tg)
  357. {
  358. struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
  359. struct throtl_data *td = tg->td;
  360. int rw;
  361. for (rw = READ; rw <= WRITE; rw++) {
  362. tg->has_rules_iops[rw] =
  363. (parent_tg && parent_tg->has_rules_iops[rw]) ||
  364. (td->limit_valid[td->limit_index] &&
  365. tg_iops_limit(tg, rw) != UINT_MAX);
  366. tg->has_rules_bps[rw] =
  367. (parent_tg && parent_tg->has_rules_bps[rw]) ||
  368. (td->limit_valid[td->limit_index] &&
  369. (tg_bps_limit(tg, rw) != U64_MAX));
  370. }
  371. }
  372. static void throtl_pd_online(struct blkg_policy_data *pd)
  373. {
  374. struct throtl_grp *tg = pd_to_tg(pd);
  375. /*
  376. * We don't want new groups to escape the limits of their ancestors.
  377. * Update has_rules[] after a new group is brought online.
  378. */
  379. tg_update_has_rules(tg);
  380. }
  381. #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
  382. static void blk_throtl_update_limit_valid(struct throtl_data *td)
  383. {
  384. struct cgroup_subsys_state *pos_css;
  385. struct blkcg_gq *blkg;
  386. bool low_valid = false;
  387. rcu_read_lock();
  388. blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
  389. struct throtl_grp *tg = blkg_to_tg(blkg);
  390. if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
  391. tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) {
  392. low_valid = true;
  393. break;
  394. }
  395. }
  396. rcu_read_unlock();
  397. td->limit_valid[LIMIT_LOW] = low_valid;
  398. }
  399. #else
  400. static inline void blk_throtl_update_limit_valid(struct throtl_data *td)
  401. {
  402. }
  403. #endif
  404. static void throtl_upgrade_state(struct throtl_data *td);
  405. static void throtl_pd_offline(struct blkg_policy_data *pd)
  406. {
  407. struct throtl_grp *tg = pd_to_tg(pd);
  408. tg->bps[READ][LIMIT_LOW] = 0;
  409. tg->bps[WRITE][LIMIT_LOW] = 0;
  410. tg->iops[READ][LIMIT_LOW] = 0;
  411. tg->iops[WRITE][LIMIT_LOW] = 0;
  412. blk_throtl_update_limit_valid(tg->td);
  413. if (!tg->td->limit_valid[tg->td->limit_index])
  414. throtl_upgrade_state(tg->td);
  415. }
  416. static void throtl_pd_free(struct blkg_policy_data *pd)
  417. {
  418. struct throtl_grp *tg = pd_to_tg(pd);
  419. del_timer_sync(&tg->service_queue.pending_timer);
  420. blkg_rwstat_exit(&tg->stat_bytes);
  421. blkg_rwstat_exit(&tg->stat_ios);
  422. kfree(tg);
  423. }
  424. static struct throtl_grp *
  425. throtl_rb_first(struct throtl_service_queue *parent_sq)
  426. {
  427. struct rb_node *n;
  428. n = rb_first_cached(&parent_sq->pending_tree);
  429. WARN_ON_ONCE(!n);
  430. if (!n)
  431. return NULL;
  432. return rb_entry_tg(n);
  433. }
  434. static void throtl_rb_erase(struct rb_node *n,
  435. struct throtl_service_queue *parent_sq)
  436. {
  437. rb_erase_cached(n, &parent_sq->pending_tree);
  438. RB_CLEAR_NODE(n);
  439. }
  440. static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
  441. {
  442. struct throtl_grp *tg;
  443. tg = throtl_rb_first(parent_sq);
  444. if (!tg)
  445. return;
  446. parent_sq->first_pending_disptime = tg->disptime;
  447. }
  448. static void tg_service_queue_add(struct throtl_grp *tg)
  449. {
  450. struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
  451. struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node;
  452. struct rb_node *parent = NULL;
  453. struct throtl_grp *__tg;
  454. unsigned long key = tg->disptime;
  455. bool leftmost = true;
  456. while (*node != NULL) {
  457. parent = *node;
  458. __tg = rb_entry_tg(parent);
  459. if (time_before(key, __tg->disptime))
  460. node = &parent->rb_left;
  461. else {
  462. node = &parent->rb_right;
  463. leftmost = false;
  464. }
  465. }
  466. rb_link_node(&tg->rb_node, parent, node);
  467. rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree,
  468. leftmost);
  469. }
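/*
 * Editor's sketch (user space): the pending_tree above orders children by
 * disptime using the wrap-safe time_before() comparison and caches the
 * leftmost (earliest) entry.  A plain array stands in for the rbtree here;
 * the disptime values are illustrative only.
 */
#include <stdio.h>

#define time_before(a, b)	((long)((a) - (b)) < 0)

int main(void)
{
	/* jiffies values, including one just before the counter wraps */
	unsigned long disptime[] = { 1300, (unsigned long)-5, 1100 };
	int i, first = 0;

	for (i = 1; i < 3; i++)
		if (time_before(disptime[i], disptime[first]))
			first = i;
	/* (unsigned long)-5 sorts before 1100: it lies just before the wrap */
	printf("first_pending_disptime=%lu\n", disptime[first]);
	return 0;
}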
  470. static void throtl_enqueue_tg(struct throtl_grp *tg)
  471. {
  472. if (!(tg->flags & THROTL_TG_PENDING)) {
  473. tg_service_queue_add(tg);
  474. tg->flags |= THROTL_TG_PENDING;
  475. tg->service_queue.parent_sq->nr_pending++;
  476. }
  477. }
  478. static void throtl_dequeue_tg(struct throtl_grp *tg)
  479. {
  480. if (tg->flags & THROTL_TG_PENDING) {
  481. struct throtl_service_queue *parent_sq =
  482. tg->service_queue.parent_sq;
  483. throtl_rb_erase(&tg->rb_node, parent_sq);
  484. --parent_sq->nr_pending;
  485. tg->flags &= ~THROTL_TG_PENDING;
  486. }
  487. }
  488. /* Call with queue lock held */
  489. static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
  490. unsigned long expires)
  491. {
  492. unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice;
  493. /*
  494. * Since we are adjusting the throttle limit dynamically, the sleep
  495. * time calculated according to previous limit might be invalid. It's
  496. * possible the cgroup sleep time is very long and no other cgroups
  497. * have IO running so notify the limit changes. Make sure the cgroup
  498. * doesn't sleep too long to avoid the missed notification.
  499. */
  500. if (time_after(expires, max_expire))
  501. expires = max_expire;
  502. mod_timer(&sq->pending_timer, expires);
  503. throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
  504. expires - jiffies, jiffies);
  505. }
  506. /**
  507. * throtl_schedule_next_dispatch - schedule the next dispatch cycle
  508. * @sq: the service_queue to schedule dispatch for
  509. * @force: force scheduling
  510. *
  511. * Arm @sq->pending_timer so that the next dispatch cycle starts on the
  512. * dispatch time of the first pending child. Returns %true if either timer
  513. * is armed or there's no pending child left. %false if the current
  514. * dispatch window is still open and the caller should continue
  515. * dispatching.
  516. *
  517. * If @force is %true, the dispatch timer is always scheduled and this
  518. * function is guaranteed to return %true. This is to be used when the
  519. * caller can't dispatch itself and needs to invoke pending_timer
  520. * unconditionally. Note that forced scheduling is likely to induce short
  521. * delay before dispatch starts even if @sq->first_pending_disptime is not
  522. * in the future and thus shouldn't be used in hot paths.
  523. */
  524. static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
  525. bool force)
  526. {
  527. /* any pending children left? */
  528. if (!sq->nr_pending)
  529. return true;
  530. update_min_dispatch_time(sq);
  531. /* is the next dispatch time in the future? */
  532. if (force || time_after(sq->first_pending_disptime, jiffies)) {
  533. throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
  534. return true;
  535. }
  536. /* tell the caller to continue dispatching */
  537. return false;
  538. }
  539. static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
  540. bool rw, unsigned long start)
  541. {
  542. tg->bytes_disp[rw] = 0;
  543. tg->io_disp[rw] = 0;
  544. tg->carryover_bytes[rw] = 0;
  545. tg->carryover_ios[rw] = 0;
  546. /*
  547. * Previous slice has expired. We must have trimmed it after last
  548. * bio dispatch. That means since start of last slice, we never used
  549. * that bandwidth. Do try to make use of that bandwidth while giving
  550. * credit.
  551. */
  552. if (time_after_eq(start, tg->slice_start[rw]))
  553. tg->slice_start[rw] = start;
  554. tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
  555. throtl_log(&tg->service_queue,
  556. "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
  557. rw == READ ? 'R' : 'W', tg->slice_start[rw],
  558. tg->slice_end[rw], jiffies);
  559. }
  560. static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw,
  561. bool clear_carryover)
  562. {
  563. tg->bytes_disp[rw] = 0;
  564. tg->io_disp[rw] = 0;
  565. tg->slice_start[rw] = jiffies;
  566. tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
  567. if (clear_carryover) {
  568. tg->carryover_bytes[rw] = 0;
  569. tg->carryover_ios[rw] = 0;
  570. }
  571. throtl_log(&tg->service_queue,
  572. "[%c] new slice start=%lu end=%lu jiffies=%lu",
  573. rw == READ ? 'R' : 'W', tg->slice_start[rw],
  574. tg->slice_end[rw], jiffies);
  575. }
  576. static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
  577. unsigned long jiffy_end)
  578. {
  579. tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
  580. }
  581. static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
  582. unsigned long jiffy_end)
  583. {
  584. throtl_set_slice_end(tg, rw, jiffy_end);
  585. throtl_log(&tg->service_queue,
  586. "[%c] extend slice start=%lu end=%lu jiffies=%lu",
  587. rw == READ ? 'R' : 'W', tg->slice_start[rw],
  588. tg->slice_end[rw], jiffies);
  589. }
  590. /* Determine if previously allocated or extended slice is complete or not */
  591. static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
  592. {
  593. if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
  594. return false;
  595. return true;
  596. }
  597. static unsigned int calculate_io_allowed(u32 iops_limit,
  598. unsigned long jiffy_elapsed)
  599. {
  600. unsigned int io_allowed;
  601. u64 tmp;
  602. /*
  603. * jiffy_elapsed should not be a big value: since the minimum iops can be
  604. * 1, at most jiffy_elapsed should be equivalent to 1 second, as we
  605. * will allow a dispatch after 1 second, and after that the slice should
  606. * have been trimmed.
  607. */
  608. tmp = (u64)iops_limit * jiffy_elapsed;
  609. do_div(tmp, HZ);
  610. if (tmp > UINT_MAX)
  611. io_allowed = UINT_MAX;
  612. else
  613. io_allowed = tmp;
  614. return io_allowed;
  615. }
  616. static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed)
  617. {
  618. /*
  619. * Can result be wider than 64 bits?
  620. * We check against 62, not 64, due to ilog2 truncation.
  621. */
  622. if (ilog2(bps_limit) + ilog2(jiffy_elapsed) - ilog2(HZ) > 62)
  623. return U64_MAX;
  624. return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ);
  625. }
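/*
 * Editor's sketch (user space): the two budget helpers above, assuming
 * HZ=1000 (an illustrative value).  io_allowed and bytes_allowed simply scale
 * the configured limit by the elapsed fraction of a second; the 62-bit check
 * in calculate_bytes_allowed() only guards the multiplication against
 * overflow (the kernel uses 128-bit math via mul_u64_u64_div_u64, plain
 * 64-bit math is used here).
 */
#include <stdint.h>
#include <stdio.h>

#define HZ 1000

static unsigned int io_allowed(uint32_t iops_limit, unsigned long jiffy_elapsed)
{
	uint64_t tmp = (uint64_t)iops_limit * jiffy_elapsed / HZ;

	return tmp > UINT32_MAX ? UINT32_MAX : (unsigned int)tmp;
}

static uint64_t bytes_allowed(uint64_t bps_limit, unsigned long jiffy_elapsed)
{
	return bps_limit * jiffy_elapsed / HZ;
}

int main(void)
{
	/* 100 iops and 1 MB/s over 250 jiffies (a quarter second) */
	printf("%u ios, %llu bytes\n", io_allowed(100, 250),
	       (unsigned long long)bytes_allowed(1024 * 1024, 250));
	return 0;
}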
  626. /* Trim the used slices and adjust slice start accordingly */
  627. static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
  628. {
  629. unsigned long time_elapsed;
  630. long long bytes_trim;
  631. int io_trim;
  632. BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
  633. /*
  634. * If bps are unlimited (-1), then the time slice doesn't get
  635. * renewed. Don't try to trim the slice if the slice is used up. A new
  636. * slice will start when appropriate.
  637. */
  638. if (throtl_slice_used(tg, rw))
  639. return;
  640. /*
  641. * A bio has been dispatched. Also adjust slice_end. It might happen
  642. * that initially the cgroup limit was very low, resulting in a high
  643. * slice_end, but later the limit was bumped up and the bio was dispatched
  644. * sooner; then we need to reduce slice_end. A high bogus slice_end
  645. * is bad because it does not allow a new slice to start.
  646. */
  647. throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
  648. time_elapsed = rounddown(jiffies - tg->slice_start[rw],
  649. tg->td->throtl_slice);
  650. if (!time_elapsed)
  651. return;
  652. bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw),
  653. time_elapsed) +
  654. tg->carryover_bytes[rw];
  655. io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) +
  656. tg->carryover_ios[rw];
  657. if (bytes_trim <= 0 && io_trim <= 0)
  658. return;
  659. tg->carryover_bytes[rw] = 0;
  660. if ((long long)tg->bytes_disp[rw] >= bytes_trim)
  661. tg->bytes_disp[rw] -= bytes_trim;
  662. else
  663. tg->bytes_disp[rw] = 0;
  664. tg->carryover_ios[rw] = 0;
  665. if ((int)tg->io_disp[rw] >= io_trim)
  666. tg->io_disp[rw] -= io_trim;
  667. else
  668. tg->io_disp[rw] = 0;
  669. tg->slice_start[rw] += time_elapsed;
  670. throtl_log(&tg->service_queue,
  671. "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu",
  672. rw == READ ? 'R' : 'W', time_elapsed / tg->td->throtl_slice,
  673. bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw],
  674. jiffies);
  675. }
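/*
 * Editor's sketch (user space): the trim arithmetic of throtl_trim_slice()
 * above, with HZ=1000 and throtl_slice=100 jiffies as illustrative
 * assumptions.  The budget "earned" during the elapsed whole slices is
 * subtracted from bytes_disp and slice_start moves forward by the same
 * amount, so only the unfinished tail of the slice keeps counting against
 * the group.
 */
#include <stdint.h>
#include <stdio.h>

#define HZ 1000

int main(void)
{
	unsigned long throtl_slice = 100, slice_start = 1000, now = 1350;
	uint64_t bps_limit = 1024 * 1024, bytes_disp = 400 * 1024;
	/* round down to whole slices, as done above */
	unsigned long elapsed = (now - slice_start) / throtl_slice * throtl_slice;
	long long bytes_trim = (long long)(bps_limit * elapsed / HZ);

	if (bytes_trim > 0) {
		bytes_disp = bytes_disp >= (uint64_t)bytes_trim ?
			     bytes_disp - bytes_trim : 0;
		slice_start += elapsed;
	}
	printf("trimmed to start=%lu bytes_disp=%llu\n",
	       slice_start, (unsigned long long)bytes_disp);
	return 0;
}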
  676. static void __tg_update_carryover(struct throtl_grp *tg, bool rw)
  677. {
  678. unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
  679. u64 bps_limit = tg_bps_limit(tg, rw);
  680. u32 iops_limit = tg_iops_limit(tg, rw);
  681. /*
  682. * If config is updated while bios are still throttled, calculate and
  683. * accumulate how many bytes/ios have waited across the changes.
  684. * carryover_bytes/ios will then be used to calculate the new wait time
  685. * under the new configuration.
  686. */
  687. if (bps_limit != U64_MAX)
  688. tg->carryover_bytes[rw] +=
  689. calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
  690. tg->bytes_disp[rw];
  691. if (iops_limit != UINT_MAX)
  692. tg->carryover_ios[rw] +=
  693. calculate_io_allowed(iops_limit, jiffy_elapsed) -
  694. tg->io_disp[rw];
  695. }
  696. static void tg_update_carryover(struct throtl_grp *tg)
  697. {
  698. if (tg->service_queue.nr_queued[READ])
  699. __tg_update_carryover(tg, READ);
  700. if (tg->service_queue.nr_queued[WRITE])
  701. __tg_update_carryover(tg, WRITE);
  702. /* see comments in struct throtl_grp for meaning of these fields. */
  703. throtl_log(&tg->service_queue, "%s: %llu %llu %u %u\n", __func__,
  704. tg->carryover_bytes[READ], tg->carryover_bytes[WRITE],
  705. tg->carryover_ios[READ], tg->carryover_ios[WRITE]);
  706. }
  707. static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
  708. u32 iops_limit, unsigned long *wait)
  709. {
  710. bool rw = bio_data_dir(bio);
  711. unsigned int io_allowed;
  712. unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
  713. if (iops_limit == UINT_MAX) {
  714. if (wait)
  715. *wait = 0;
  716. return true;
  717. }
  718. jiffy_elapsed = jiffies - tg->slice_start[rw];
  719. /* Round up to the next throttle slice, wait time must be nonzero */
  720. jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
  721. io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) +
  722. tg->carryover_ios[rw];
  723. if (tg->io_disp[rw] + 1 <= io_allowed) {
  724. if (wait)
  725. *wait = 0;
  726. return true;
  727. }
  728. /* Calc approx time to dispatch */
  729. jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;
  730. if (wait)
  731. *wait = jiffy_wait;
  732. return false;
  733. }
  734. static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
  735. u64 bps_limit, unsigned long *wait)
  736. {
  737. bool rw = bio_data_dir(bio);
  738. u64 bytes_allowed, extra_bytes;
  739. unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
  740. unsigned int bio_size = throtl_bio_data_size(bio);
  741. /* no need to throttle if this bio's bytes have been accounted */
  742. if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
  743. if (wait)
  744. *wait = 0;
  745. return true;
  746. }
  747. jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
  748. /* Slice has just started. Consider one slice interval */
  749. if (!jiffy_elapsed)
  750. jiffy_elapsed_rnd = tg->td->throtl_slice;
  751. jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
  752. bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
  753. tg->carryover_bytes[rw];
  754. if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
  755. if (wait)
  756. *wait = 0;
  757. return true;
  758. }
  759. /* Calc approx time to dispatch */
  760. extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
  761. jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit);
  762. if (!jiffy_wait)
  763. jiffy_wait = 1;
  764. /*
  765. * This wait time is without taking into consideration the rounding
  766. * up we did. Add that time also.
  767. */
  768. jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
  769. if (wait)
  770. *wait = jiffy_wait;
  771. return false;
  772. }
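/*
 * Editor's sketch (user space): the wait-time arithmetic of
 * tg_within_bps_limit() above, with HZ=1000 and a 100-jiffy slice as
 * illustrative assumptions.  When the bio would exceed bytes_allowed, the
 * extra bytes are converted back into jiffies at the configured rate, plus
 * the slice round-up that was added to the elapsed time.
 */
#include <stdint.h>
#include <stdio.h>

#define HZ 1000

static unsigned long bps_wait(uint64_t bps_limit, uint64_t bytes_disp,
			      unsigned int bio_size, unsigned long elapsed,
			      unsigned long slice)
{
	unsigned long elapsed_rnd = elapsed ? elapsed : slice;
	uint64_t allowed, extra;
	unsigned long wait;

	elapsed_rnd = (elapsed_rnd + slice - 1) / slice * slice; /* roundup */
	allowed = bps_limit * elapsed_rnd / HZ;
	if (bytes_disp + bio_size <= allowed)
		return 0;
	extra = bytes_disp + bio_size - allowed;
	wait = extra * HZ / bps_limit;
	if (!wait)
		wait = 1;
	return wait + (elapsed_rnd - elapsed);
}

int main(void)
{
	/* 1 MB/s limit, 512 KB already sent, 256 KB bio, 300 jiffies in */
	printf("wait %lu jiffies\n",
	       bps_wait(1024 * 1024, 512 * 1024, 256 * 1024, 300, 100));
	return 0;
}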
  773. /*
  774. * Returns whether one can dispatch a bio or not. Also returns approx number
  775. * of jiffies to wait before this bio is within the IO rate and can be dispatched
  776. */
  777. static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
  778. unsigned long *wait)
  779. {
  780. bool rw = bio_data_dir(bio);
  781. unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
  782. u64 bps_limit = tg_bps_limit(tg, rw);
  783. u32 iops_limit = tg_iops_limit(tg, rw);
  784. /*
  785. * Currently the whole state machine of the group depends on the first bio
  786. * queued in the group's bio list. So one should not be calling
  787. * this function with a different bio if there are other bios
  788. * queued.
  789. */
  790. BUG_ON(tg->service_queue.nr_queued[rw] &&
  791. bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
  792. /* If tg->bps = -1, then BW is unlimited */
  793. if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) ||
  794. tg->flags & THROTL_TG_CANCELING) {
  795. if (wait)
  796. *wait = 0;
  797. return true;
  798. }
  799. /*
  800. * If the previous slice expired, start a new one; otherwise renew/extend
  801. * the existing slice to make sure it is at least throtl_slice interval
  802. * long since now. A new slice is started only for an empty throttle group.
  803. * If there is a queued bio, that means there should be an active
  804. * slice and it should be extended instead.
  805. */
  806. if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
  807. throtl_start_new_slice(tg, rw, true);
  808. else {
  809. if (time_before(tg->slice_end[rw],
  810. jiffies + tg->td->throtl_slice))
  811. throtl_extend_slice(tg, rw,
  812. jiffies + tg->td->throtl_slice);
  813. }
  814. if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) &&
  815. tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) {
  816. if (wait)
  817. *wait = 0;
  818. return true;
  819. }
  820. max_wait = max(bps_wait, iops_wait);
  821. if (wait)
  822. *wait = max_wait;
  823. if (time_before(tg->slice_end[rw], jiffies + max_wait))
  824. throtl_extend_slice(tg, rw, jiffies + max_wait);
  825. return false;
  826. }
  827. static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
  828. {
  829. bool rw = bio_data_dir(bio);
  830. unsigned int bio_size = throtl_bio_data_size(bio);
  831. /* Charge the bio to the group */
  832. if (!bio_flagged(bio, BIO_BPS_THROTTLED)) {
  833. tg->bytes_disp[rw] += bio_size;
  834. tg->last_bytes_disp[rw] += bio_size;
  835. }
  836. tg->io_disp[rw]++;
  837. tg->last_io_disp[rw]++;
  838. }
  839. /**
  840. * throtl_add_bio_tg - add a bio to the specified throtl_grp
  841. * @bio: bio to add
  842. * @qn: qnode to use
  843. * @tg: the target throtl_grp
  844. *
  845. * Add @bio to @tg's service_queue using @qn. If @qn is not specified,
  846. * tg->qnode_on_self[] is used.
  847. */
  848. static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
  849. struct throtl_grp *tg)
  850. {
  851. struct throtl_service_queue *sq = &tg->service_queue;
  852. bool rw = bio_data_dir(bio);
  853. if (!qn)
  854. qn = &tg->qnode_on_self[rw];
  855. /*
  856. * If @tg doesn't currently have any bios queued in the same
  857. * direction, queueing @bio can change when @tg should be
  858. * dispatched. Mark that @tg was empty. This is automatically
  859. * cleared on the next tg_update_disptime().
  860. */
  861. if (!sq->nr_queued[rw])
  862. tg->flags |= THROTL_TG_WAS_EMPTY;
  863. throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
  864. sq->nr_queued[rw]++;
  865. throtl_enqueue_tg(tg);
  866. }
  867. static void tg_update_disptime(struct throtl_grp *tg)
  868. {
  869. struct throtl_service_queue *sq = &tg->service_queue;
  870. unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
  871. struct bio *bio;
  872. bio = throtl_peek_queued(&sq->queued[READ]);
  873. if (bio)
  874. tg_may_dispatch(tg, bio, &read_wait);
  875. bio = throtl_peek_queued(&sq->queued[WRITE]);
  876. if (bio)
  877. tg_may_dispatch(tg, bio, &write_wait);
  878. min_wait = min(read_wait, write_wait);
  879. disptime = jiffies + min_wait;
  880. /* Update dispatch time */
  881. throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
  882. tg->disptime = disptime;
  883. tg_service_queue_add(tg);
  884. /* see throtl_add_bio_tg() */
  885. tg->flags &= ~THROTL_TG_WAS_EMPTY;
  886. }
  887. static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
  888. struct throtl_grp *parent_tg, bool rw)
  889. {
  890. if (throtl_slice_used(parent_tg, rw)) {
  891. throtl_start_new_slice_with_credit(parent_tg, rw,
  892. child_tg->slice_start[rw]);
  893. }
  894. }
  895. static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
  896. {
  897. struct throtl_service_queue *sq = &tg->service_queue;
  898. struct throtl_service_queue *parent_sq = sq->parent_sq;
  899. struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
  900. struct throtl_grp *tg_to_put = NULL;
  901. struct bio *bio;
  902. /*
  903. * @bio is being transferred from @tg to @parent_sq. Popping a bio
  904. * from @tg may put its reference and @parent_sq might end up
  905. * getting released prematurely. Remember the tg to put and put it
  906. * after @bio is transferred to @parent_sq.
  907. */
  908. bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
  909. sq->nr_queued[rw]--;
  910. throtl_charge_bio(tg, bio);
  911. /*
  912. * If our parent is another tg, we just need to transfer @bio to
  913. * the parent using throtl_add_bio_tg(). If our parent is
  914. * @td->service_queue, @bio is ready to be issued. Put it on its
  915. * bio_lists[] and decrease total number queued. The caller is
  916. * responsible for issuing these bios.
  917. */
  918. if (parent_tg) {
  919. throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
  920. start_parent_slice_with_credit(tg, parent_tg, rw);
  921. } else {
  922. bio_set_flag(bio, BIO_BPS_THROTTLED);
  923. throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
  924. &parent_sq->queued[rw]);
  925. BUG_ON(tg->td->nr_queued[rw] <= 0);
  926. tg->td->nr_queued[rw]--;
  927. }
  928. throtl_trim_slice(tg, rw);
  929. if (tg_to_put)
  930. blkg_put(tg_to_blkg(tg_to_put));
  931. }
  932. static int throtl_dispatch_tg(struct throtl_grp *tg)
  933. {
  934. struct throtl_service_queue *sq = &tg->service_queue;
  935. unsigned int nr_reads = 0, nr_writes = 0;
  936. unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4;
  937. unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads;
  938. struct bio *bio;
  939. /* Try to dispatch 75% READS and 25% WRITES */
  940. while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
  941. tg_may_dispatch(tg, bio, NULL)) {
  942. tg_dispatch_one_bio(tg, bio_data_dir(bio));
  943. nr_reads++;
  944. if (nr_reads >= max_nr_reads)
  945. break;
  946. }
  947. while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
  948. tg_may_dispatch(tg, bio, NULL)) {
  949. tg_dispatch_one_bio(tg, bio_data_dir(bio));
  950. nr_writes++;
  951. if (nr_writes >= max_nr_writes)
  952. break;
  953. }
  954. return nr_reads + nr_writes;
  955. }
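/*
 * Editor's sketch (user space): the per-round budget used above.  With
 * THROTL_GRP_QUANTUM = 8, integer arithmetic gives 6 reads and 2 writes per
 * group per round; throtl_select_dispatch() below then stops once
 * THROTL_QUANTUM (32) bios have been moved in one round.  Values are taken
 * from the #defines at the top of this file.
 */
#include <stdio.h>

int main(void)
{
	unsigned int grp_quantum = 8, quantum = 32;
	unsigned int max_nr_reads = grp_quantum * 3 / 4;		/* 6 */
	unsigned int max_nr_writes = grp_quantum - max_nr_reads;	/* 2 */

	printf("%u reads + %u writes per group, %u bios per round\n",
	       max_nr_reads, max_nr_writes, quantum);
	return 0;
}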
  956. static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
  957. {
  958. unsigned int nr_disp = 0;
  959. while (1) {
  960. struct throtl_grp *tg;
  961. struct throtl_service_queue *sq;
  962. if (!parent_sq->nr_pending)
  963. break;
  964. tg = throtl_rb_first(parent_sq);
  965. if (!tg)
  966. break;
  967. if (time_before(jiffies, tg->disptime))
  968. break;
  969. nr_disp += throtl_dispatch_tg(tg);
  970. sq = &tg->service_queue;
  971. if (sq->nr_queued[READ] || sq->nr_queued[WRITE])
  972. tg_update_disptime(tg);
  973. else
  974. throtl_dequeue_tg(tg);
  975. if (nr_disp >= THROTL_QUANTUM)
  976. break;
  977. }
  978. return nr_disp;
  979. }
  980. static bool throtl_can_upgrade(struct throtl_data *td,
  981. struct throtl_grp *this_tg);
  982. /**
  983. * throtl_pending_timer_fn - timer function for service_queue->pending_timer
  984. * @t: the pending_timer member of the throtl_service_queue being serviced
  985. *
  986. * This timer is armed when a child throtl_grp with active bios becomes
  987. * pending and queued on the service_queue's pending_tree and expires when
  988. * the first child throtl_grp should be dispatched. This function
  989. * dispatches bios from the children throtl_grps to the parent
  990. * service_queue.
  991. *
  992. * If the parent's parent is another throtl_grp, dispatching is propagated
  993. * by either arming its pending_timer or repeating dispatch directly. If
  994. * the top-level service_tree is reached, throtl_data->dispatch_work is
  995. * kicked so that the ready bios are issued.
  996. */
  997. static void throtl_pending_timer_fn(struct timer_list *t)
  998. {
  999. struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
  1000. struct throtl_grp *tg = sq_to_tg(sq);
  1001. struct throtl_data *td = sq_to_td(sq);
  1002. struct throtl_service_queue *parent_sq;
  1003. struct request_queue *q;
  1004. bool dispatched;
  1005. int ret;
  1006. /* throtl_data may be gone, so figure out request queue by blkg */
  1007. if (tg)
  1008. q = tg->pd.blkg->q;
  1009. else
  1010. q = td->queue;
  1011. spin_lock_irq(&q->queue_lock);
  1012. if (!q->root_blkg)
  1013. goto out_unlock;
  1014. if (throtl_can_upgrade(td, NULL))
  1015. throtl_upgrade_state(td);
  1016. again:
  1017. parent_sq = sq->parent_sq;
  1018. dispatched = false;
  1019. while (true) {
  1020. throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
  1021. sq->nr_queued[READ] + sq->nr_queued[WRITE],
  1022. sq->nr_queued[READ], sq->nr_queued[WRITE]);
  1023. ret = throtl_select_dispatch(sq);
  1024. if (ret) {
  1025. throtl_log(sq, "bios disp=%u", ret);
  1026. dispatched = true;
  1027. }
  1028. if (throtl_schedule_next_dispatch(sq, false))
  1029. break;
  1030. /* this dispatch window is still open, relax and repeat */
  1031. spin_unlock_irq(&q->queue_lock);
  1032. cpu_relax();
  1033. spin_lock_irq(&q->queue_lock);
  1034. }
  1035. if (!dispatched)
  1036. goto out_unlock;
  1037. if (parent_sq) {
  1038. /* @parent_sq is another throtl_grp, propagate dispatch */
  1039. if (tg->flags & THROTL_TG_WAS_EMPTY) {
  1040. tg_update_disptime(tg);
  1041. if (!throtl_schedule_next_dispatch(parent_sq, false)) {
  1042. /* window is already open, repeat dispatching */
  1043. sq = parent_sq;
  1044. tg = sq_to_tg(sq);
  1045. goto again;
  1046. }
  1047. }
  1048. } else {
  1049. /* reached the top-level, queue issuing */
  1050. queue_work(kthrotld_workqueue, &td->dispatch_work);
  1051. }
  1052. out_unlock:
  1053. spin_unlock_irq(&q->queue_lock);
  1054. }
  1055. /**
  1056. * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
  1057. * @work: work item being executed
  1058. *
  1059. * This function is queued for execution when bios reach the bio_lists[]
  1060. * of throtl_data->service_queue. Those bios are ready and issued by this
  1061. * function.
  1062. */
  1063. static void blk_throtl_dispatch_work_fn(struct work_struct *work)
  1064. {
  1065. struct throtl_data *td = container_of(work, struct throtl_data,
  1066. dispatch_work);
  1067. struct throtl_service_queue *td_sq = &td->service_queue;
  1068. struct request_queue *q = td->queue;
  1069. struct bio_list bio_list_on_stack;
  1070. struct bio *bio;
  1071. struct blk_plug plug;
  1072. int rw;
  1073. bio_list_init(&bio_list_on_stack);
  1074. spin_lock_irq(&q->queue_lock);
  1075. for (rw = READ; rw <= WRITE; rw++)
  1076. while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
  1077. bio_list_add(&bio_list_on_stack, bio);
  1078. spin_unlock_irq(&q->queue_lock);
  1079. if (!bio_list_empty(&bio_list_on_stack)) {
  1080. blk_start_plug(&plug);
  1081. while ((bio = bio_list_pop(&bio_list_on_stack)))
  1082. submit_bio_noacct_nocheck(bio);
  1083. blk_finish_plug(&plug);
  1084. }
  1085. }
  1086. static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
  1087. int off)
  1088. {
  1089. struct throtl_grp *tg = pd_to_tg(pd);
  1090. u64 v = *(u64 *)((void *)tg + off);
  1091. if (v == U64_MAX)
  1092. return 0;
  1093. return __blkg_prfill_u64(sf, pd, v);
  1094. }
  1095. static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
  1096. int off)
  1097. {
  1098. struct throtl_grp *tg = pd_to_tg(pd);
  1099. unsigned int v = *(unsigned int *)((void *)tg + off);
  1100. if (v == UINT_MAX)
  1101. return 0;
  1102. return __blkg_prfill_u64(sf, pd, v);
  1103. }
  1104. static int tg_print_conf_u64(struct seq_file *sf, void *v)
  1105. {
  1106. blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
  1107. &blkcg_policy_throtl, seq_cft(sf)->private, false);
  1108. return 0;
  1109. }
  1110. static int tg_print_conf_uint(struct seq_file *sf, void *v)
  1111. {
  1112. blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
  1113. &blkcg_policy_throtl, seq_cft(sf)->private, false);
  1114. return 0;
  1115. }
  1116. static void tg_conf_updated(struct throtl_grp *tg, bool global)
  1117. {
  1118. struct throtl_service_queue *sq = &tg->service_queue;
  1119. struct cgroup_subsys_state *pos_css;
  1120. struct blkcg_gq *blkg;
  1121. throtl_log(&tg->service_queue,
  1122. "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
  1123. tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
  1124. tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));
  1125. /*
  1126. * Update has_rules[] flags for the updated tg's subtree. A tg is
  1127. * considered to have rules if either the tg itself or any of its
  1128. * ancestors has rules. This identifies groups without any
  1129. * restrictions in the whole hierarchy and allows them to bypass
  1130. * blk-throttle.
  1131. */
  1132. blkg_for_each_descendant_pre(blkg, pos_css,
  1133. global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) {
  1134. struct throtl_grp *this_tg = blkg_to_tg(blkg);
  1135. struct throtl_grp *parent_tg;
  1136. tg_update_has_rules(this_tg);
  1137. /* ignore root/second level */
  1138. if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent ||
  1139. !blkg->parent->parent)
  1140. continue;
  1141. parent_tg = blkg_to_tg(blkg->parent);
  1142. /*
  1143. * make sure all children have a lower idle time threshold and
  1144. * a higher latency target
  1145. */
  1146. this_tg->idletime_threshold = min(this_tg->idletime_threshold,
  1147. parent_tg->idletime_threshold);
  1148. this_tg->latency_target = max(this_tg->latency_target,
  1149. parent_tg->latency_target);
  1150. }
  1151. /*
  1152. * We're already holding queue_lock and know @tg is valid. Let's
  1153. * apply the new config directly.
  1154. *
  1155. * Restart the slices for both READ and WRITE. It might happen
  1156. * that a group's limits are dropped suddenly and we don't want to
  1157. * account recently dispatched IO at the new low rate.
  1158. */
  1159. throtl_start_new_slice(tg, READ, false);
  1160. throtl_start_new_slice(tg, WRITE, false);
  1161. if (tg->flags & THROTL_TG_PENDING) {
  1162. tg_update_disptime(tg);
  1163. throtl_schedule_next_dispatch(sq->parent_sq, true);
  1164. }
  1165. }
  1166. static ssize_t tg_set_conf(struct kernfs_open_file *of,
  1167. char *buf, size_t nbytes, loff_t off, bool is_u64)
  1168. {
  1169. struct blkcg *blkcg = css_to_blkcg(of_css(of));
  1170. struct blkg_conf_ctx ctx;
  1171. struct throtl_grp *tg;
  1172. int ret;
  1173. u64 v;
  1174. ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
  1175. if (ret)
  1176. return ret;
  1177. ret = -EINVAL;
  1178. if (sscanf(ctx.body, "%llu", &v) != 1)
  1179. goto out_finish;
  1180. if (!v)
  1181. v = U64_MAX;
  1182. tg = blkg_to_tg(ctx.blkg);
  1183. tg_update_carryover(tg);
  1184. if (is_u64)
  1185. *(u64 *)((void *)tg + of_cft(of)->private) = v;
  1186. else
  1187. *(unsigned int *)((void *)tg + of_cft(of)->private) = v;
  1188. tg_conf_updated(tg, false);
  1189. ret = 0;
  1190. out_finish:
  1191. blkg_conf_finish(&ctx);
  1192. return ret ?: nbytes;
  1193. }
  1194. static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
  1195. char *buf, size_t nbytes, loff_t off)
  1196. {
  1197. return tg_set_conf(of, buf, nbytes, off, true);
  1198. }
  1199. static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
  1200. char *buf, size_t nbytes, loff_t off)
  1201. {
  1202. return tg_set_conf(of, buf, nbytes, off, false);
  1203. }
  1204. static int tg_print_rwstat(struct seq_file *sf, void *v)
  1205. {
  1206. blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
  1207. blkg_prfill_rwstat, &blkcg_policy_throtl,
  1208. seq_cft(sf)->private, true);
  1209. return 0;
  1210. }
  1211. static u64 tg_prfill_rwstat_recursive(struct seq_file *sf,
  1212. struct blkg_policy_data *pd, int off)
  1213. {
  1214. struct blkg_rwstat_sample sum;
  1215. blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off,
  1216. &sum);
  1217. return __blkg_prfill_rwstat(sf, pd, &sum);
  1218. }
  1219. static int tg_print_rwstat_recursive(struct seq_file *sf, void *v)
  1220. {
  1221. blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
  1222. tg_prfill_rwstat_recursive, &blkcg_policy_throtl,
  1223. seq_cft(sf)->private, true);
  1224. return 0;
  1225. }
static struct cftype throtl_legacy_files[] = {
	{
		.name = "throttle.read_bps_device",
		.private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
		.seq_show = tg_print_conf_u64,
		.write = tg_set_conf_u64,
	},
	{
		.name = "throttle.write_bps_device",
		.private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
		.seq_show = tg_print_conf_u64,
		.write = tg_set_conf_u64,
	},
	{
		.name = "throttle.read_iops_device",
		.private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
		.seq_show = tg_print_conf_uint,
		.write = tg_set_conf_uint,
	},
	{
		.name = "throttle.write_iops_device",
		.private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
		.seq_show = tg_print_conf_uint,
		.write = tg_set_conf_uint,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = offsetof(struct throtl_grp, stat_bytes),
		.seq_show = tg_print_rwstat,
	},
	{
		.name = "throttle.io_service_bytes_recursive",
		.private = offsetof(struct throtl_grp, stat_bytes),
		.seq_show = tg_print_rwstat_recursive,
	},
	{
		.name = "throttle.io_serviced",
		.private = offsetof(struct throtl_grp, stat_ios),
		.seq_show = tg_print_rwstat,
	},
	{
		.name = "throttle.io_serviced_recursive",
		.private = offsetof(struct throtl_grp, stat_ios),
		.seq_show = tg_print_rwstat_recursive,
	},
	{ }	/* terminate */
};
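
/*
 * Print one line of io.low/io.max output for @pd's device.  Groups whose
 * settings are all still at their defaults are skipped entirely, and
 * individual fields that are unlimited are printed as "max".
 */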
static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
			   int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	const char *dname = blkg_dev_name(pd->blkg);
	char bufs[4][21] = { "max", "max", "max", "max" };
	u64 bps_dft;
	unsigned int iops_dft;
	char idle_time[26] = "";
	char latency_time[26] = "";

	if (!dname)
		return 0;

	if (off == LIMIT_LOW) {
		bps_dft = 0;
		iops_dft = 0;
	} else {
		bps_dft = U64_MAX;
		iops_dft = UINT_MAX;
	}

	if (tg->bps_conf[READ][off] == bps_dft &&
	    tg->bps_conf[WRITE][off] == bps_dft &&
	    tg->iops_conf[READ][off] == iops_dft &&
	    tg->iops_conf[WRITE][off] == iops_dft &&
	    (off != LIMIT_LOW ||
	     (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD &&
	      tg->latency_target_conf == DFL_LATENCY_TARGET)))
		return 0;

	if (tg->bps_conf[READ][off] != U64_MAX)
		snprintf(bufs[0], sizeof(bufs[0]), "%llu",
			tg->bps_conf[READ][off]);
	if (tg->bps_conf[WRITE][off] != U64_MAX)
		snprintf(bufs[1], sizeof(bufs[1]), "%llu",
			tg->bps_conf[WRITE][off]);
	if (tg->iops_conf[READ][off] != UINT_MAX)
		snprintf(bufs[2], sizeof(bufs[2]), "%u",
			tg->iops_conf[READ][off]);
	if (tg->iops_conf[WRITE][off] != UINT_MAX)
		snprintf(bufs[3], sizeof(bufs[3]), "%u",
			tg->iops_conf[WRITE][off]);

	if (off == LIMIT_LOW) {
		if (tg->idletime_threshold_conf == ULONG_MAX)
			strcpy(idle_time, " idle=max");
		else
			snprintf(idle_time, sizeof(idle_time), " idle=%lu",
				tg->idletime_threshold_conf);

		if (tg->latency_target_conf == ULONG_MAX)
			strcpy(latency_time, " latency=max");
		else
			snprintf(latency_time, sizeof(latency_time),
				" latency=%lu", tg->latency_target_conf);
	}

	seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
		   dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
		   latency_time);
	return 0;
}

static int tg_print_limit(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
	return 0;
}
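
/*
 * Writer for io.low (LIMIT_LOW) and io.max (LIMIT_MAX).  After
 * blkg_conf_prep() has consumed the leading "MAJ:MIN", the body is parsed
 * as a sequence of key=value tokens, for example (device number is
 * illustrative):
 *
 *	echo "8:16 rbps=2097152 wiops=max" > io.max
 *
 * Unmentioned keys keep their current values; "max" selects the maximum,
 * i.e. unlimited for io.max.  For io.low, "idle" and "latency" tune the
 * idle-time threshold and latency target used by the upgrade/downgrade
 * heuristics below.
 */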
static ssize_t tg_set_limit(struct kernfs_open_file *of,
			    char *buf, size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	struct throtl_grp *tg;
	u64 v[4];
	unsigned long idle_time;
	unsigned long latency_time;
	int ret;
	int index = of_cft(of)->private;

	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
	if (ret)
		return ret;

	tg = blkg_to_tg(ctx.blkg);
	tg_update_carryover(tg);

	v[0] = tg->bps_conf[READ][index];
	v[1] = tg->bps_conf[WRITE][index];
	v[2] = tg->iops_conf[READ][index];
	v[3] = tg->iops_conf[WRITE][index];

	idle_time = tg->idletime_threshold_conf;
	latency_time = tg->latency_target_conf;
	while (true) {
		char tok[27];	/* wiops=18446744073709551616 */
		char *p;
		u64 val = U64_MAX;
		int len;

		if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
			break;
		if (tok[0] == '\0')
			break;
		ctx.body += len;

		ret = -EINVAL;
		p = tok;
		strsep(&p, "=");
		if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
			goto out_finish;

		ret = -ERANGE;
		if (!val)
			goto out_finish;

		ret = -EINVAL;
		if (!strcmp(tok, "rbps") && val > 1)
			v[0] = val;
		else if (!strcmp(tok, "wbps") && val > 1)
			v[1] = val;
		else if (!strcmp(tok, "riops") && val > 1)
			v[2] = min_t(u64, val, UINT_MAX);
		else if (!strcmp(tok, "wiops") && val > 1)
			v[3] = min_t(u64, val, UINT_MAX);
		else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
			idle_time = val;
		else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
			latency_time = val;
		else
			goto out_finish;
	}

	tg->bps_conf[READ][index] = v[0];
	tg->bps_conf[WRITE][index] = v[1];
	tg->iops_conf[READ][index] = v[2];
	tg->iops_conf[WRITE][index] = v[3];

	if (index == LIMIT_MAX) {
		tg->bps[READ][index] = v[0];
		tg->bps[WRITE][index] = v[1];
		tg->iops[READ][index] = v[2];
		tg->iops[WRITE][index] = v[3];
	}
	tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
		tg->bps_conf[READ][LIMIT_MAX]);
	tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
		tg->bps_conf[WRITE][LIMIT_MAX]);
	tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
		tg->iops_conf[READ][LIMIT_MAX]);
	tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
		tg->iops_conf[WRITE][LIMIT_MAX]);
	tg->idletime_threshold_conf = idle_time;
	tg->latency_target_conf = latency_time;

	/* force user to configure all settings for low limit */
	if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||
	      tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) ||
	    tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD ||
	    tg->latency_target_conf == DFL_LATENCY_TARGET) {
		tg->bps[READ][LIMIT_LOW] = 0;
		tg->bps[WRITE][LIMIT_LOW] = 0;
		tg->iops[READ][LIMIT_LOW] = 0;
		tg->iops[WRITE][LIMIT_LOW] = 0;
		tg->idletime_threshold = DFL_IDLE_THRESHOLD;
		tg->latency_target = DFL_LATENCY_TARGET;
	} else if (index == LIMIT_LOW) {
		tg->idletime_threshold = tg->idletime_threshold_conf;
		tg->latency_target = tg->latency_target_conf;
	}

	blk_throtl_update_limit_valid(tg->td);
	if (tg->td->limit_valid[LIMIT_LOW]) {
		if (index == LIMIT_LOW)
			tg->td->limit_index = LIMIT_LOW;
	} else
		tg->td->limit_index = LIMIT_MAX;
	tg_conf_updated(tg, index == LIMIT_LOW &&
		tg->td->limit_valid[LIMIT_LOW]);
	ret = 0;
out_finish:
	blkg_conf_finish(&ctx);
	return ret ?: nbytes;
}
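
/* cgroup v2 interface; with the io controller these appear as io.low and io.max */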
static struct cftype throtl_files[] = {
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
	{
		.name = "low",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = tg_print_limit,
		.write = tg_set_limit,
		.private = LIMIT_LOW,
	},
#endif
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = tg_print_limit,
		.write = tg_set_limit,
		.private = LIMIT_MAX,
	},
	{ }	/* terminate */
};

static void throtl_shutdown_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_work_sync(&td->dispatch_work);
}

struct blkcg_policy blkcg_policy_throtl = {
	.dfl_cftypes		= throtl_files,
	.legacy_cftypes		= throtl_legacy_files,

	.pd_alloc_fn		= throtl_pd_alloc,
	.pd_init_fn		= throtl_pd_init,
	.pd_online_fn		= throtl_pd_online,
	.pd_offline_fn		= throtl_pd_offline,
	.pd_free_fn		= throtl_pd_free,
};
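
/*
 * Called when the disk is going away: mark every throtl group on @disk as
 * canceling so that already-throttled bios are dispatched promptly instead
 * of waiting out their configured limits.
 */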
void blk_throtl_cancel_bios(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;

	spin_lock_irq(&q->queue_lock);
	/*
	 * queue_lock is held, so the rcu lock is technically not needed here.
	 * However, it is still taken to emphasize that the following path
	 * needs RCU protection and to prevent a lockdep warning.
	 */
	rcu_read_lock();
	blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
		struct throtl_grp *tg = blkg_to_tg(blkg);
		struct throtl_service_queue *sq = &tg->service_queue;

		/*
		 * Set the flag to make sure throtl_pending_timer_fn() won't
		 * stop until all throttled bios are dispatched.
		 */
		blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;

		/*
		 * Update disptime after setting the above flag to make sure
		 * throtl_select_dispatch() won't exit without dispatching.
		 */
		tg_update_disptime(tg);

		throtl_schedule_pending_timer(sq, jiffies + 1);
	}
	rcu_read_unlock();
	spin_unlock_irq(&q->queue_lock);
}

#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
{
	unsigned long rtime = jiffies, wtime = jiffies;

	if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
		rtime = tg->last_low_overflow_time[READ];
	if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
		wtime = tg->last_low_overflow_time[WRITE];
	return min(rtime, wtime);
}

/* tg should not be an intermediate node */
static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
{
	struct throtl_service_queue *parent_sq;
	struct throtl_grp *parent = tg;
	unsigned long ret = __tg_last_low_overflow_time(tg);

	while (true) {
		parent_sq = parent->service_queue.parent_sq;
		parent = sq_to_tg(parent_sq);
		if (!parent)
			break;

		/*
		 * The parent doesn't have a low limit, so it always reaches
		 * the low limit.  Its overflow time is useless for children.
		 */
		if (!parent->bps[READ][LIMIT_LOW] &&
		    !parent->iops[READ][LIMIT_LOW] &&
		    !parent->bps[WRITE][LIMIT_LOW] &&
		    !parent->iops[WRITE][LIMIT_LOW])
			continue;
		if (time_after(__tg_last_low_overflow_time(parent), ret))
			ret = __tg_last_low_overflow_time(parent);
	}
	return ret;
}

static bool throtl_tg_is_idle(struct throtl_grp *tg)
{
	/*
	 * cgroup is idle if:
	 * - a single idle period is too long, longer than a fixed value (in
	 *   case the user configures too big a threshold) or 4 times the
	 *   idletime threshold
	 * - average think time is more than the threshold
	 * - IO latency is largely below the target
	 */
	unsigned long time;
	bool ret;

	time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
	ret = tg->latency_target == DFL_LATENCY_TARGET ||
	      tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
	      (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
	      tg->avg_idletime > tg->idletime_threshold ||
	      (tg->latency_target && tg->bio_cnt &&
		tg->bad_bio_cnt * 5 < tg->bio_cnt);
	throtl_log(&tg->service_queue,
		"avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d",
		tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt,
		tg->bio_cnt, ret, tg->td->scale);
	return ret;
}

static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	bool read_limit, write_limit;

	/*
	 * If the cgroup reaches its low limit (a low limit of 0 is always
	 * considered reached), it's OK to upgrade to the next limit.
	 */
	read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
	write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
	if (!read_limit && !write_limit)
		return true;
	if (read_limit && sq->nr_queued[READ] &&
	    (!write_limit || sq->nr_queued[WRITE]))
		return true;
	if (write_limit && sq->nr_queued[WRITE] &&
	    (!read_limit || sq->nr_queued[READ]))
		return true;

	if (time_after_eq(jiffies,
		tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
	    throtl_tg_is_idle(tg))
		return true;
	return false;
}

static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
{
	while (true) {
		if (throtl_tg_can_upgrade(tg))
			return true;
		tg = sq_to_tg(tg->service_queue.parent_sq);
		if (!tg || !tg_to_blkg(tg)->parent)
			return false;
	}
	return false;
}
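
/*
 * Upgrade the whole device from LIMIT_LOW to LIMIT_MAX only when, apart
 * from @this_tg, every leaf group in the hierarchy can upgrade (it has
 * reached its low limit, gone idle, or has no low limit at all) and at
 * least one throtl_slice has passed since the last downgrade.
 */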
static bool throtl_can_upgrade(struct throtl_data *td,
	struct throtl_grp *this_tg)
{
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;

	if (td->limit_index != LIMIT_LOW)
		return false;

	if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
		return false;

	rcu_read_lock();
	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
		struct throtl_grp *tg = blkg_to_tg(blkg);

		if (tg == this_tg)
			continue;
		if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
			continue;
		if (!throtl_hierarchy_can_upgrade(tg)) {
			rcu_read_unlock();
			return false;
		}
	}
	rcu_read_unlock();
	return true;
}

static void throtl_upgrade_check(struct throtl_grp *tg)
{
	unsigned long now = jiffies;

	if (tg->td->limit_index != LIMIT_LOW)
		return;

	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
		return;

	tg->last_check_time = now;

	if (!time_after_eq(now,
	     __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
		return;

	if (throtl_can_upgrade(tg->td, NULL))
		throtl_upgrade_state(tg->td);
}
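
/*
 * Switch the device to LIMIT_MAX: reset the downgrade scale, give every
 * group an already-expired dispatch time and kick dispatching so queued
 * bios are re-evaluated against the max limits right away.
 */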
static void throtl_upgrade_state(struct throtl_data *td)
{
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;

	throtl_log(&td->service_queue, "upgrade to max");
	td->limit_index = LIMIT_MAX;
	td->low_upgrade_time = jiffies;
	td->scale = 0;
	rcu_read_lock();
	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
		struct throtl_grp *tg = blkg_to_tg(blkg);
		struct throtl_service_queue *sq = &tg->service_queue;

		tg->disptime = jiffies - 1;
		throtl_select_dispatch(sq);
		throtl_schedule_next_dispatch(sq, true);
	}
	rcu_read_unlock();
	throtl_select_dispatch(&td->service_queue);
	throtl_schedule_next_dispatch(&td->service_queue, true);
	queue_work(kthrotld_workqueue, &td->dispatch_work);
}
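
/*
 * Back off towards LIMIT_LOW.  Each call halves td->scale; while the scale
 * is still non-zero only the time base used for limit scaling is pulled
 * back, and the device drops back to LIMIT_LOW only once the scale reaches
 * zero.
 */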
static void throtl_downgrade_state(struct throtl_data *td)
{
	td->scale /= 2;

	throtl_log(&td->service_queue, "downgrade, scale %d", td->scale);
	if (td->scale) {
		td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
		return;
	}

	td->limit_index = LIMIT_LOW;
	td->low_downgrade_time = jiffies;
}

static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
{
	struct throtl_data *td = tg->td;
	unsigned long now = jiffies;

	/*
	 * If the cgroup is below its low limit, consider downgrading so
	 * that other cgroups get throttled again.
	 */
	if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
	    time_after_eq(now, tg_last_low_overflow_time(tg) +
					td->throtl_slice) &&
	    (!throtl_tg_is_idle(tg) ||
	     !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
		return true;
	return false;
}

static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
{
	while (true) {
		if (!throtl_tg_can_downgrade(tg))
			return false;
		tg = sq_to_tg(tg->service_queue.parent_sq);
		if (!tg || !tg_to_blkg(tg)->parent)
			break;
	}
	return true;
}
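
/*
 * Sample this group's throughput since the last check.  If it has been
 * running below its low limit for at least one throtl_slice and the whole
 * hierarchy agrees, downgrade the device so every group is clamped to its
 * low limit again.
 */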
static void throtl_downgrade_check(struct throtl_grp *tg)
{
	uint64_t bps;
	unsigned int iops;
	unsigned long elapsed_time;
	unsigned long now = jiffies;

	if (tg->td->limit_index != LIMIT_MAX ||
	    !tg->td->limit_valid[LIMIT_LOW])
		return;
	if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
		return;
	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
		return;

	elapsed_time = now - tg->last_check_time;
	tg->last_check_time = now;

	if (time_before(now, tg_last_low_overflow_time(tg) +
			tg->td->throtl_slice))
		return;

	if (tg->bps[READ][LIMIT_LOW]) {
		bps = tg->last_bytes_disp[READ] * HZ;
		do_div(bps, elapsed_time);
		if (bps >= tg->bps[READ][LIMIT_LOW])
			tg->last_low_overflow_time[READ] = now;
	}

	if (tg->bps[WRITE][LIMIT_LOW]) {
		bps = tg->last_bytes_disp[WRITE] * HZ;
		do_div(bps, elapsed_time);
		if (bps >= tg->bps[WRITE][LIMIT_LOW])
			tg->last_low_overflow_time[WRITE] = now;
	}

	if (tg->iops[READ][LIMIT_LOW]) {
		iops = tg->last_io_disp[READ] * HZ / elapsed_time;
		if (iops >= tg->iops[READ][LIMIT_LOW])
			tg->last_low_overflow_time[READ] = now;
	}

	if (tg->iops[WRITE][LIMIT_LOW]) {
		iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
		if (iops >= tg->iops[WRITE][LIMIT_LOW])
			tg->last_low_overflow_time[WRITE] = now;
	}

	/*
	 * If the cgroup is below its low limit, consider downgrading so
	 * that other cgroups get throttled again.
	 */
	if (throtl_hierarchy_can_downgrade(tg))
		throtl_downgrade_state(tg->td);

	tg->last_bytes_disp[READ] = 0;
	tg->last_bytes_disp[WRITE] = 0;
	tg->last_io_disp[READ] = 0;
	tg->last_io_disp[WRITE] = 0;
}
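
/*
 * Track how long the cgroup stays idle between the completion of one IO
 * and the arrival of the next, as an exponentially weighted moving average
 * in usecs: avg_idletime = (7 * avg_idletime + latest_idle) / 8.
 */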
static void blk_throtl_update_idletime(struct throtl_grp *tg)
{
	unsigned long now;
	unsigned long last_finish_time = tg->last_finish_time;

	if (last_finish_time == 0)
		return;

	now = ktime_get_ns() >> 10;
	if (now <= last_finish_time ||
	    last_finish_time == tg->checked_last_finish_time)
		return;

	tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
	tg->checked_last_finish_time = last_finish_time;
}
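
/*
 * Once a second, fold the per-cpu latency samples into per-bucket averages.
 * Buckets with at least 32 new samples update their running average as
 * (7 * old + new) / 8; averages are also kept non-decreasing across
 * request-size buckets so a larger IO never gets a smaller baseline.
 */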
static void throtl_update_latency_buckets(struct throtl_data *td)
{
	struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
	int i, cpu, rw;
	unsigned long last_latency[2] = { 0 };
	unsigned long latency[2];

	if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW])
		return;
	if (time_before(jiffies, td->last_calculate_time + HZ))
		return;
	td->last_calculate_time = jiffies;

	memset(avg_latency, 0, sizeof(avg_latency));
	for (rw = READ; rw <= WRITE; rw++) {
		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
			struct latency_bucket *tmp = &td->tmp_buckets[rw][i];

			for_each_possible_cpu(cpu) {
				struct latency_bucket *bucket;

				/* this isn't race free, but ok in practice */
				bucket = per_cpu_ptr(td->latency_buckets[rw],
					cpu);
				tmp->total_latency += bucket[i].total_latency;
				tmp->samples += bucket[i].samples;
				bucket[i].total_latency = 0;
				bucket[i].samples = 0;
			}

			if (tmp->samples >= 32) {
				int samples = tmp->samples;

				latency[rw] = tmp->total_latency;

				tmp->total_latency = 0;
				tmp->samples = 0;
				latency[rw] /= samples;
				if (latency[rw] == 0)
					continue;
				avg_latency[rw][i].latency = latency[rw];
			}
		}
	}

	for (rw = READ; rw <= WRITE; rw++) {
		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
			if (!avg_latency[rw][i].latency) {
				if (td->avg_buckets[rw][i].latency < last_latency[rw])
					td->avg_buckets[rw][i].latency =
						last_latency[rw];
				continue;
			}

			if (!td->avg_buckets[rw][i].valid)
				latency[rw] = avg_latency[rw][i].latency;
			else
				latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
					avg_latency[rw][i].latency) >> 3;

			td->avg_buckets[rw][i].latency = max(latency[rw],
				last_latency[rw]);
			td->avg_buckets[rw][i].valid = true;
			last_latency[rw] = td->avg_buckets[rw][i].latency;
		}
	}

	for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
		throtl_log(&td->service_queue,
			"Latency bucket %d: read latency=%ld, read valid=%d, "
			"write latency=%ld, write valid=%d", i,
			td->avg_buckets[READ][i].latency,
			td->avg_buckets[READ][i].valid,
			td->avg_buckets[WRITE][i].latency,
			td->avg_buckets[WRITE][i].valid);
}
#else
static inline void throtl_update_latency_buckets(struct throtl_data *td)
{
}

static void blk_throtl_update_idletime(struct throtl_grp *tg)
{
}

static void throtl_downgrade_check(struct throtl_grp *tg)
{
}

static void throtl_upgrade_check(struct throtl_grp *tg)
{
}

static bool throtl_can_upgrade(struct throtl_data *td,
	struct throtl_grp *this_tg)
{
	return false;
}

static void throtl_upgrade_state(struct throtl_data *td)
{
}
#endif
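
/*
 * Main throttling entry point, called for each bio submitted to a throttled
 * disk.  Walks from @bio's cgroup towards the root, charging the bio against
 * each level that still has budget; returns true (bio queued) as soon as one
 * level is over its limit, false if the bio may be dispatched immediately.
 */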
bool __blk_throtl_bio(struct bio *bio)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
	struct blkcg_gq *blkg = bio->bi_blkg;
	struct throtl_qnode *qn = NULL;
	struct throtl_grp *tg = blkg_to_tg(blkg);
	struct throtl_service_queue *sq;
	bool rw = bio_data_dir(bio);
	bool throttled = false;
	struct throtl_data *td = tg->td;

	rcu_read_lock();

	if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
		blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
				bio->bi_iter.bi_size);
		blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
	}

	spin_lock_irq(&q->queue_lock);

	throtl_update_latency_buckets(td);

	blk_throtl_update_idletime(tg);

	sq = &tg->service_queue;

again:
	while (true) {
		if (tg->last_low_overflow_time[rw] == 0)
			tg->last_low_overflow_time[rw] = jiffies;
		throtl_downgrade_check(tg);
		throtl_upgrade_check(tg);
		/* throtl is FIFO - if bios are already queued, should queue */
		if (sq->nr_queued[rw])
			break;

		/* if above limits, break to queue */
		if (!tg_may_dispatch(tg, bio, NULL)) {
			tg->last_low_overflow_time[rw] = jiffies;
			if (throtl_can_upgrade(td, tg)) {
				throtl_upgrade_state(td);
				goto again;
			}
			break;
		}

		/* within limits, let's charge and dispatch directly */
		throtl_charge_bio(tg, bio);

		/*
		 * We need to trim the slice even when bios are not being
		 * queued.  Otherwise it might happen that a bio is not queued
		 * for a long time, the slice keeps extending and trim is not
		 * called for a long time.  If limits are then reduced
		 * suddenly, all the IO dispatched so far is accounted at the
		 * new low rate and newly queued IO gets a really long
		 * dispatch time.
		 *
		 * So keep on trimming the slice even if the bio is not queued.
		 */
		throtl_trim_slice(tg, rw);

		/*
		 * @bio passed through this layer without being throttled.
		 * Climb up the ladder.  If we're already at the top, it
		 * can be executed directly.
		 */
		qn = &tg->qnode_on_parent[rw];
		sq = sq->parent_sq;
		tg = sq_to_tg(sq);
		if (!tg) {
			bio_set_flag(bio, BIO_BPS_THROTTLED);
			goto out_unlock;
		}
	}

	/* out-of-limit, queue to @tg */
	throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
		   rw == READ ? 'R' : 'W',
		   tg->bytes_disp[rw], bio->bi_iter.bi_size,
		   tg_bps_limit(tg, rw),
		   tg->io_disp[rw], tg_iops_limit(tg, rw),
		   sq->nr_queued[READ], sq->nr_queued[WRITE]);

	tg->last_low_overflow_time[rw] = jiffies;

	td->nr_queued[rw]++;
	throtl_add_bio_tg(bio, qn, tg);
	throttled = true;

	/*
	 * Update @tg's dispatch time and force schedule dispatch if @tg
	 * was empty before @bio.  The forced scheduling isn't likely to
	 * cause undue delay as @bio is likely to be dispatched directly if
	 * its @tg's disptime is not in the future.
	 */
	if (tg->flags & THROTL_TG_WAS_EMPTY) {
		tg_update_disptime(tg);
		throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
	}

out_unlock:
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
	if (throttled || !td->track_bio_latency)
		bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
#endif
	spin_unlock_irq(&q->queue_lock);

	rcu_read_unlock();
	return throttled;
}

#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_track_latency(struct throtl_data *td, sector_t size,
				 enum req_op op, unsigned long time)
{
	const bool rw = op_is_write(op);
	struct latency_bucket *latency;
	int index;

	if (!td || td->limit_index != LIMIT_LOW ||
	    !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
	    !blk_queue_nonrot(td->queue))
		return;

	index = request_bucket_index(size);

	latency = get_cpu_ptr(td->latency_buckets[rw]);
	latency[index].total_latency += time;
	latency[index].samples++;
	put_cpu_ptr(td->latency_buckets[rw]);
}

void blk_throtl_stat_add(struct request *rq, u64 time_ns)
{
	struct request_queue *q = rq->q;
	struct throtl_data *td = q->td;

	throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
			     time_ns >> 10);
}
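
/*
 * Completion hook: record the finish time used for idle tracking, feed the
 * per-bucket latency stats for bio based drivers, and count bios whose
 * latency exceeded the bucket average plus the group's latency target so
 * throtl_tg_is_idle() can judge whether the target is being met.
 */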
void blk_throtl_bio_endio(struct bio *bio)
{
	struct blkcg_gq *blkg;
	struct throtl_grp *tg;
	u64 finish_time_ns;
	unsigned long finish_time;
	unsigned long start_time;
	unsigned long lat;
	int rw = bio_data_dir(bio);

	blkg = bio->bi_blkg;
	if (!blkg)
		return;
	tg = blkg_to_tg(blkg);
	if (!tg->td->limit_valid[LIMIT_LOW])
		return;

	finish_time_ns = ktime_get_ns();
	tg->last_finish_time = finish_time_ns >> 10;

	start_time = bio_issue_time(&bio->bi_issue) >> 10;
	finish_time = __bio_issue_time(finish_time_ns) >> 10;
	if (!start_time || finish_time <= start_time)
		return;

	lat = finish_time - start_time;
	/* this is only for bio based driver */
	if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY))
		throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue),
				     bio_op(bio), lat);

	if (tg->latency_target && lat >= tg->td->filtered_latency) {
		int bucket;
		unsigned int threshold;

		bucket = request_bucket_index(bio_issue_size(&bio->bi_issue));
		threshold = tg->td->avg_buckets[rw][bucket].latency +
			tg->latency_target;
		if (lat > threshold)
			tg->bad_bio_cnt++;
		/*
		 * Not race free, so the count could be wrong, which at worst
		 * means cgroups get throttled when they need not be.
		 */
		tg->bio_cnt++;
	}

	if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
		tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
		tg->bio_cnt /= 2;
		tg->bad_bio_cnt /= 2;
	}
}
#endif
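
/*
 * Allocate and initialize the per-queue throttle data, including the
 * per-cpu latency buckets, and activate the throttle policy on @disk's
 * request queue.
 */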
int blk_throtl_init(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct throtl_data *td;
	int ret;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;
	td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
		LATENCY_BUCKET_SIZE, __alignof__(u64));
	if (!td->latency_buckets[READ]) {
		kfree(td);
		return -ENOMEM;
	}
	td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
		LATENCY_BUCKET_SIZE, __alignof__(u64));
	if (!td->latency_buckets[WRITE]) {
		free_percpu(td->latency_buckets[READ]);
		kfree(td);
		return -ENOMEM;
	}

	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
	throtl_service_queue_init(&td->service_queue);

	q->td = td;
	td->queue = q;

	td->limit_valid[LIMIT_MAX] = true;
	td->limit_index = LIMIT_MAX;
	td->low_upgrade_time = jiffies;
	td->low_downgrade_time = jiffies;

	/* activate policy */
	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
	if (ret) {
		free_percpu(td->latency_buckets[READ]);
		free_percpu(td->latency_buckets[WRITE]);
		kfree(td);
	}
	return ret;
}

void blk_throtl_exit(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;

	BUG_ON(!q->td);
	del_timer_sync(&q->td->service_queue.pending_timer);
	throtl_shutdown_wq(q);
	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
	free_percpu(q->td->latency_buckets[READ]);
	free_percpu(q->td->latency_buckets[WRITE]);
	kfree(q->td);
}
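
/*
 * Pick per-device tunables once the disk type is known: non-rotational and
 * rotational disks get different default throttle slices and latency
 * filters, and rotational disks also get a non-zero baseline latency for
 * every bucket.  Latency is tracked via the blk-stat infrastructure for
 * request based (blk-mq) devices and per bio otherwise.
 */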
void blk_throtl_register(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct throtl_data *td;
	int i;

	td = q->td;
	BUG_ON(!td);

	if (blk_queue_nonrot(q)) {
		td->throtl_slice = DFL_THROTL_SLICE_SSD;
		td->filtered_latency = LATENCY_FILTERED_SSD;
	} else {
		td->throtl_slice = DFL_THROTL_SLICE_HD;
		td->filtered_latency = LATENCY_FILTERED_HD;
		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
			td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
			td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
		}
	}
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
	/* if no low limit, use previous default */
	td->throtl_slice = DFL_THROTL_SLICE_HD;
#endif

	td->track_bio_latency = !queue_is_mq(q);
	if (!td->track_bio_latency)
		blk_stat_enable_accounting(q);
}

#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
{
	if (!q->td)
		return -EINVAL;
	return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
}

ssize_t blk_throtl_sample_time_store(struct request_queue *q,
	const char *page, size_t count)
{
	unsigned long v;
	unsigned long t;

	if (!q->td)
		return -EINVAL;
	if (kstrtoul(page, 10, &v))
		return -EINVAL;
	t = msecs_to_jiffies(v);
	if (t == 0 || t > MAX_THROTL_SLICE)
		return -EINVAL;
	q->td->throtl_slice = t;
	return count;
}
#endif

static int __init throtl_init(void)
{
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (!kthrotld_workqueue)
		panic("Failed to create kthrotld\n");

	return blkcg_policy_register(&blkcg_policy_throtl);
}

module_init(throtl_init);