blk-iocost.c 97 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761277127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550155115521553155415551556155715581559156015611562156315641565156615671568156915701571157215731574157515761577157815791580158115821583158415851586158715881589159015911592159315941595159615971598159916001601160216031604160516061607160816091610161116121613161416151616161716181619162016211622162316241625162616271628162916301631163216331634163516361637163816391640164116421643164416451646164716481649165016511652165316541655165616571658165916601661166216631664166516661667166816691670167116721673167416751676167716781679168016811682168316841685168616871688168916901691169216931694169516961697169816991700170117021703170417051706170717081709171017111712171317141715171617171718171917201721172217231724172517261727172817291730173117321733173417351736173717381739174017411742174317441745174617471748174917501751175217531754175517561757175817591760176117621763176417651766176717681769177017711772177317741775177617771778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762777277827792780278127822783278427852786278727882789279027912792279327942795279627972798279928002801280228032804280528062807280828092810281128122813281428152816281728182819282028212822282328242825282628272828282928302831283228332834283528362837283828392840284128422843284428452846284728482849285028512852285328542855285628572858285928602861286228632864286528662867286828692870287128722873287428752876287728782879288028812882288328842885288628872888288928902891289228932894289528962897289828992900290129022903290429052906290729082909291029112912291329142915291629172918291929202921292229232924292529262927292829292930293129322933293429352936293729382939294029412942294329442945294629472948294929502951295229532954295529562957295829592960296129622963296429652966296729682969297029712972297329742975297629772978297929802981298229832984298529862987298829892990299129922993299429952996299729982999300030013002300330043005300630073008300930103011301230133014301530163017301830193020302130223023302430253026302730283029303030313032303330343035303630373038303930403041304230433044304530463047304830493050305130523053305430553056305730583059306030613062306330643065306630673068306930703071307230733074307530763077307830793080308130823083308430853086308730883089309030913092309330943095309630973098309931003101310231033104310531063107310831093110311131123113311431153116311731183119312031213122312331243125312631273128312931303131313231333134313531363137313831393140314131423143314431453146314731483149315031513152315331543155315631573158315931603161316231633164316531663167316831693170317131723173317431753176317731783179318031813182318331843185318631873188318931903191319231933194319531963197319831993200320132023203320432053206320732083209321032113212321332143215321632173218321932203221322232233224322532263227322832293230323132323233323432353236323732383239324032413242324332443245324632473248324932503251325232533254325532563257325832593260326132623263326432653266326732683269327032713272327332743275327632773278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470
  1. /* SPDX-License-Identifier: GPL-2.0
  2. *
  3. * IO cost model based controller.
  4. *
  5. * Copyright (C) 2019 Tejun Heo <[email protected]>
  6. * Copyright (C) 2019 Andy Newell <[email protected]>
  7. * Copyright (C) 2019 Facebook
  8. *
  9. * One challenge of controlling IO resources is the lack of trivially
  10. * observable cost metric. This is distinguished from CPU and memory where
  11. * wallclock time and the number of bytes can serve as accurate enough
  12. * approximations.
  13. *
  14. * Bandwidth and iops are the most commonly used metrics for IO devices but
  15. * depending on the type and specifics of the device, different IO patterns
  16. * easily lead to multiple orders of magnitude variations rendering them
  17. * useless for the purpose of IO capacity distribution. While on-device
  18. * time, with a lot of clutches, could serve as a useful approximation for
  19. * non-queued rotational devices, this is no longer viable with modern
  20. * devices, even the rotational ones.
  21. *
  22. * While there is no cost metric we can trivially observe, it isn't a
  23. * complete mystery. For example, on a rotational device, seek cost
  24. * dominates while a contiguous transfer contributes a smaller amount
  25. * proportional to the size. If we can characterize at least the relative
  26. * costs of these different types of IOs, it should be possible to
  27. * implement a reasonable work-conserving proportional IO resource
  28. * distribution.
  29. *
  30. * 1. IO Cost Model
  31. *
  32. * IO cost model estimates the cost of an IO given its basic parameters and
  33. * history (e.g. the end sector of the last IO). The cost is measured in
  34. * device time. If a given IO is estimated to cost 10ms, the device should
  35. * be able to process ~100 of those IOs in a second.
  36. *
  37. * Currently, there's only one builtin cost model - linear. Each IO is
  38. * classified as sequential or random and given a base cost accordingly.
  39. * On top of that, a size cost proportional to the length of the IO is
  40. * added. While simple, this model captures the operational
  41. * characteristics of a wide varienty of devices well enough. Default
  42. * parameters for several different classes of devices are provided and the
  43. * parameters can be configured from userspace via
  44. * /sys/fs/cgroup/io.cost.model.
  45. *
  46. * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
  47. * device-specific coefficients.
  48. *
  49. * 2. Control Strategy
  50. *
  51. * The device virtual time (vtime) is used as the primary control metric.
  52. * The control strategy is composed of the following three parts.
  53. *
  54. * 2-1. Vtime Distribution
  55. *
  56. * When a cgroup becomes active in terms of IOs, its hierarchical share is
  57. * calculated. Please consider the following hierarchy where the numbers
  58. * inside parentheses denote the configured weights.
  59. *
  60. * root
  61. * / \
  62. * A (w:100) B (w:300)
  63. * / \
  64. * A0 (w:100) A1 (w:100)
  65. *
  66. * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
  67. * of equal weight, each gets 50% share. If then B starts issuing IOs, B
  68. * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
  69. * 12.5% each. The distribution mechanism only cares about these flattened
  70. * shares. They're called hweights (hierarchical weights) and always add
  71. * upto 1 (WEIGHT_ONE).
  72. *
  73. * A given cgroup's vtime runs slower in inverse proportion to its hweight.
  74. * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
  75. * against the device vtime - an IO which takes 10ms on the underlying
  76. * device is considered to take 80ms on A0.
  77. *
  78. * This constitutes the basis of IO capacity distribution. Each cgroup's
  79. * vtime is running at a rate determined by its hweight. A cgroup tracks
  80. * the vtime consumed by past IOs and can issue a new IO if doing so
  81. * wouldn't outrun the current device vtime. Otherwise, the IO is
  82. * suspended until the vtime has progressed enough to cover it.
  83. *
  84. * 2-2. Vrate Adjustment
  85. *
  86. * It's unrealistic to expect the cost model to be perfect. There are too
  87. * many devices and even on the same device the overall performance
  88. * fluctuates depending on numerous factors such as IO mixture and device
  89. * internal garbage collection. The controller needs to adapt dynamically.
  90. *
  91. * This is achieved by adjusting the overall IO rate according to how busy
  92. * the device is. If the device becomes overloaded, we're sending down too
  93. * many IOs and should generally slow down. If there are waiting issuers
  94. * but the device isn't saturated, we're issuing too few and should
  95. * generally speed up.
  96. *
  97. * To slow down, we lower the vrate - the rate at which the device vtime
  98. * passes compared to the wall clock. For example, if the vtime is running
  99. * at the vrate of 75%, all cgroups added up would only be able to issue
  100. * 750ms worth of IOs per second, and vice-versa for speeding up.
  101. *
  102. * Device business is determined using two criteria - rq wait and
  103. * completion latencies.
  104. *
  105. * When a device gets saturated, the on-device and then the request queues
  106. * fill up and a bio which is ready to be issued has to wait for a request
  107. * to become available. When this delay becomes noticeable, it's a clear
  108. * indication that the device is saturated and we lower the vrate. This
  109. * saturation signal is fairly conservative as it only triggers when both
  110. * hardware and software queues are filled up, and is used as the default
  111. * busy signal.
  112. *
  113. * As devices can have deep queues and be unfair in how the queued commands
  114. * are executed, soley depending on rq wait may not result in satisfactory
  115. * control quality. For a better control quality, completion latency QoS
  116. * parameters can be configured so that the device is considered saturated
  117. * if N'th percentile completion latency rises above the set point.
  118. *
  119. * The completion latency requirements are a function of both the
  120. * underlying device characteristics and the desired IO latency quality of
  121. * service. There is an inherent trade-off - the tighter the latency QoS,
  122. * the higher the bandwidth lossage. Latency QoS is disabled by default
  123. * and can be set through /sys/fs/cgroup/io.cost.qos.
  124. *
  125. * 2-3. Work Conservation
  126. *
  127. * Imagine two cgroups A and B with equal weights. A is issuing a small IO
  128. * periodically while B is sending out enough parallel IOs to saturate the
  129. * device on its own. Let's say A's usage amounts to 100ms worth of IO
  130. * cost per second, i.e., 10% of the device capacity. The naive
  131. * distribution of half and half would lead to 60% utilization of the
  132. * device, a significant reduction in the total amount of work done
  133. * compared to free-for-all competition. This is too high a cost to pay
  134. * for IO control.
  135. *
  136. * To conserve the total amount of work done, we keep track of how much
  137. * each active cgroup is actually using and yield part of its weight if
  138. * there are other cgroups which can make use of it. In the above case,
  139. * A's weight will be lowered so that it hovers above the actual usage and
  140. * B would be able to use the rest.
  141. *
  142. * As we don't want to penalize a cgroup for donating its weight, the
  143. * surplus weight adjustment factors in a margin and has an immediate
  144. * snapback mechanism in case the cgroup needs more IO vtime for itself.
  145. *
  146. * Note that adjusting down surplus weights has the same effects as
  147. * accelerating vtime for other cgroups and work conservation can also be
  148. * implemented by adjusting vrate dynamically. However, squaring who can
  149. * donate and should take back how much requires hweight propagations
  150. * anyway making it easier to implement and understand as a separate
  151. * mechanism.
  152. *
  153. * 3. Monitoring
  154. *
  155. * Instead of debugfs or other clumsy monitoring mechanisms, this
  156. * controller uses a drgn based monitoring script -
  157. * tools/cgroup/iocost_monitor.py. For details on drgn, please see
  158. * https://github.com/osandov/drgn. The output looks like the following.
  159. *
  160. * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
  161. * active weight hweight% inflt% dbt delay usages%
  162. * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
  163. * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
  164. *
  165. * - per : Timer period
  166. * - cur_per : Internal wall and device vtime clock
  167. * - vrate : Device virtual time rate against wall clock
  168. * - weight : Surplus-adjusted and configured weights
  169. * - hweight : Surplus-adjusted and configured hierarchical weights
  170. * - inflt : The percentage of in-flight IO cost at the end of last period
  171. * - del_ms : Deferred issuer delay induction level and duration
  172. * - usages : Usage history
  173. */
  174. #include <linux/kernel.h>
  175. #include <linux/module.h>
  176. #include <linux/timer.h>
  177. #include <linux/time64.h>
  178. #include <linux/parser.h>
  179. #include <linux/sched/signal.h>
  180. #include <asm/local.h>
  181. #include <asm/local64.h>
  182. #include "blk-rq-qos.h"
  183. #include "blk-stat.h"
  184. #include "blk-wbt.h"
  185. #include "blk-cgroup.h"
  186. #ifdef CONFIG_TRACEPOINTS
  187. /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
  188. #define TRACE_IOCG_PATH_LEN 1024
  189. static DEFINE_SPINLOCK(trace_iocg_path_lock);
  190. static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
  191. #define TRACE_IOCG_PATH(type, iocg, ...) \
  192. do { \
  193. unsigned long flags; \
  194. if (trace_iocost_##type##_enabled()) { \
  195. spin_lock_irqsave(&trace_iocg_path_lock, flags); \
  196. cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
  197. trace_iocg_path, TRACE_IOCG_PATH_LEN); \
  198. trace_iocost_##type(iocg, trace_iocg_path, \
  199. ##__VA_ARGS__); \
  200. spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
  201. } \
  202. } while (0)
  203. #else /* CONFIG_TRACE_POINTS */
  204. #define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
  205. #endif /* CONFIG_TRACE_POINTS */
  206. enum {
  207. MILLION = 1000000,
  208. /* timer period is calculated from latency requirements, bound it */
  209. MIN_PERIOD = USEC_PER_MSEC,
  210. MAX_PERIOD = USEC_PER_SEC,
  211. /*
  212. * iocg->vtime is targeted at 50% behind the device vtime, which
  213. * serves as its IO credit buffer. Surplus weight adjustment is
  214. * immediately canceled if the vtime margin runs below 10%.
  215. */
  216. MARGIN_MIN_PCT = 10,
  217. MARGIN_LOW_PCT = 20,
  218. MARGIN_TARGET_PCT = 50,
  219. INUSE_ADJ_STEP_PCT = 25,
  220. /* Have some play in timer operations */
  221. TIMER_SLACK_PCT = 1,
  222. /* 1/64k is granular enough and can easily be handled w/ u32 */
  223. WEIGHT_ONE = 1 << 16,
  224. };
  225. enum {
  226. /*
  227. * As vtime is used to calculate the cost of each IO, it needs to
  228. * be fairly high precision. For example, it should be able to
  229. * represent the cost of a single page worth of discard with
  230. * suffificient accuracy. At the same time, it should be able to
  231. * represent reasonably long enough durations to be useful and
  232. * convenient during operation.
  233. *
  234. * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
  235. * granularity and days of wrap-around time even at extreme vrates.
  236. */
  237. VTIME_PER_SEC_SHIFT = 37,
  238. VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
  239. VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
  240. VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC,
  241. /* bound vrate adjustments within two orders of magnitude */
  242. VRATE_MIN_PPM = 10000, /* 1% */
  243. VRATE_MAX_PPM = 100000000, /* 10000% */
  244. VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
  245. VRATE_CLAMP_ADJ_PCT = 4,
  246. /* switch iff the conditions are met for longer than this */
  247. AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
  248. };
  249. enum {
  250. /* if IOs end up waiting for requests, issue less */
  251. RQ_WAIT_BUSY_PCT = 5,
  252. /* unbusy hysterisis */
  253. UNBUSY_THR_PCT = 75,
  254. /*
  255. * The effect of delay is indirect and non-linear and a huge amount of
  256. * future debt can accumulate abruptly while unthrottled. Linearly scale
  257. * up delay as debt is going up and then let it decay exponentially.
  258. * This gives us quick ramp ups while delay is accumulating and long
  259. * tails which can help reducing the frequency of debt explosions on
  260. * unthrottle. The parameters are experimentally determined.
  261. *
  262. * The delay mechanism provides adequate protection and behavior in many
  263. * cases. However, this is far from ideal and falls shorts on both
  264. * fronts. The debtors are often throttled too harshly costing a
  265. * significant level of fairness and possibly total work while the
  266. * protection against their impacts on the system can be choppy and
  267. * unreliable.
  268. *
  269. * The shortcoming primarily stems from the fact that, unlike for page
  270. * cache, the kernel doesn't have well-defined back-pressure propagation
  271. * mechanism and policies for anonymous memory. Fully addressing this
  272. * issue will likely require substantial improvements in the area.
  273. */
  274. MIN_DELAY_THR_PCT = 500,
  275. MAX_DELAY_THR_PCT = 25000,
  276. MIN_DELAY = 250,
  277. MAX_DELAY = 250 * USEC_PER_MSEC,
  278. /* halve debts if avg usage over 100ms is under 50% */
  279. DFGV_USAGE_PCT = 50,
  280. DFGV_PERIOD = 100 * USEC_PER_MSEC,
  281. /* don't let cmds which take a very long time pin lagging for too long */
  282. MAX_LAGGING_PERIODS = 10,
  283. /*
  284. * Count IO size in 4k pages. The 12bit shift helps keeping
  285. * size-proportional components of cost calculation in closer
  286. * numbers of digits to per-IO cost components.
  287. */
  288. IOC_PAGE_SHIFT = 12,
  289. IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
  290. IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
  291. /* if apart further than 16M, consider randio for linear model */
  292. LCOEF_RANDIO_PAGES = 4096,
  293. };
  294. enum ioc_running {
  295. IOC_IDLE,
  296. IOC_RUNNING,
  297. IOC_STOP,
  298. };
  299. /* io.cost.qos controls including per-dev enable of the whole controller */
  300. enum {
  301. QOS_ENABLE,
  302. QOS_CTRL,
  303. NR_QOS_CTRL_PARAMS,
  304. };
  305. /* io.cost.qos params */
  306. enum {
  307. QOS_RPPM,
  308. QOS_RLAT,
  309. QOS_WPPM,
  310. QOS_WLAT,
  311. QOS_MIN,
  312. QOS_MAX,
  313. NR_QOS_PARAMS,
  314. };
  315. /* io.cost.model controls */
  316. enum {
  317. COST_CTRL,
  318. COST_MODEL,
  319. NR_COST_CTRL_PARAMS,
  320. };
  321. /* builtin linear cost model coefficients */
  322. enum {
  323. I_LCOEF_RBPS,
  324. I_LCOEF_RSEQIOPS,
  325. I_LCOEF_RRANDIOPS,
  326. I_LCOEF_WBPS,
  327. I_LCOEF_WSEQIOPS,
  328. I_LCOEF_WRANDIOPS,
  329. NR_I_LCOEFS,
  330. };
  331. enum {
  332. LCOEF_RPAGE,
  333. LCOEF_RSEQIO,
  334. LCOEF_RRANDIO,
  335. LCOEF_WPAGE,
  336. LCOEF_WSEQIO,
  337. LCOEF_WRANDIO,
  338. NR_LCOEFS,
  339. };
  340. enum {
  341. AUTOP_INVALID,
  342. AUTOP_HDD,
  343. AUTOP_SSD_QD1,
  344. AUTOP_SSD_DFL,
  345. AUTOP_SSD_FAST,
  346. };
  347. struct ioc_params {
  348. u32 qos[NR_QOS_PARAMS];
  349. u64 i_lcoefs[NR_I_LCOEFS];
  350. u64 lcoefs[NR_LCOEFS];
  351. u32 too_fast_vrate_pct;
  352. u32 too_slow_vrate_pct;
  353. };
  354. struct ioc_margins {
  355. s64 min;
  356. s64 low;
  357. s64 target;
  358. };
  359. struct ioc_missed {
  360. local_t nr_met;
  361. local_t nr_missed;
  362. u32 last_met;
  363. u32 last_missed;
  364. };
  365. struct ioc_pcpu_stat {
  366. struct ioc_missed missed[2];
  367. local64_t rq_wait_ns;
  368. u64 last_rq_wait_ns;
  369. };
  370. /* per device */
  371. struct ioc {
  372. struct rq_qos rqos;
  373. bool enabled;
  374. struct ioc_params params;
  375. struct ioc_margins margins;
  376. u32 period_us;
  377. u32 timer_slack_ns;
  378. u64 vrate_min;
  379. u64 vrate_max;
  380. spinlock_t lock;
  381. struct timer_list timer;
  382. struct list_head active_iocgs; /* active cgroups */
  383. struct ioc_pcpu_stat __percpu *pcpu_stat;
  384. enum ioc_running running;
  385. atomic64_t vtime_rate;
  386. u64 vtime_base_rate;
  387. s64 vtime_err;
  388. seqcount_spinlock_t period_seqcount;
  389. u64 period_at; /* wallclock starttime */
  390. u64 period_at_vtime; /* vtime starttime */
  391. atomic64_t cur_period; /* inc'd each period */
  392. int busy_level; /* saturation history */
  393. bool weights_updated;
  394. atomic_t hweight_gen; /* for lazy hweights */
  395. /* debt forgivness */
  396. u64 dfgv_period_at;
  397. u64 dfgv_period_rem;
  398. u64 dfgv_usage_us_sum;
  399. u64 autop_too_fast_at;
  400. u64 autop_too_slow_at;
  401. int autop_idx;
  402. bool user_qos_params:1;
  403. bool user_cost_model:1;
  404. };
  405. struct iocg_pcpu_stat {
  406. local64_t abs_vusage;
  407. };
  408. struct iocg_stat {
  409. u64 usage_us;
  410. u64 wait_us;
  411. u64 indebt_us;
  412. u64 indelay_us;
  413. };
  414. /* per device-cgroup pair */
  415. struct ioc_gq {
  416. struct blkg_policy_data pd;
  417. struct ioc *ioc;
  418. /*
  419. * A iocg can get its weight from two sources - an explicit
  420. * per-device-cgroup configuration or the default weight of the
  421. * cgroup. `cfg_weight` is the explicit per-device-cgroup
  422. * configuration. `weight` is the effective considering both
  423. * sources.
  424. *
  425. * When an idle cgroup becomes active its `active` goes from 0 to
  426. * `weight`. `inuse` is the surplus adjusted active weight.
  427. * `active` and `inuse` are used to calculate `hweight_active` and
  428. * `hweight_inuse`.
  429. *
  430. * `last_inuse` remembers `inuse` while an iocg is idle to persist
  431. * surplus adjustments.
  432. *
  433. * `inuse` may be adjusted dynamically during period. `saved_*` are used
  434. * to determine and track adjustments.
  435. */
  436. u32 cfg_weight;
  437. u32 weight;
  438. u32 active;
  439. u32 inuse;
  440. u32 last_inuse;
  441. s64 saved_margin;
  442. sector_t cursor; /* to detect randio */
  443. /*
  444. * `vtime` is this iocg's vtime cursor which progresses as IOs are
  445. * issued. If lagging behind device vtime, the delta represents
  446. * the currently available IO budget. If running ahead, the
  447. * overage.
  448. *
  449. * `vtime_done` is the same but progressed on completion rather
  450. * than issue. The delta behind `vtime` represents the cost of
  451. * currently in-flight IOs.
  452. */
  453. atomic64_t vtime;
  454. atomic64_t done_vtime;
  455. u64 abs_vdebt;
  456. /* current delay in effect and when it started */
  457. u64 delay;
  458. u64 delay_at;
  459. /*
  460. * The period this iocg was last active in. Used for deactivation
  461. * and invalidating `vtime`.
  462. */
  463. atomic64_t active_period;
  464. struct list_head active_list;
  465. /* see __propagate_weights() and current_hweight() for details */
  466. u64 child_active_sum;
  467. u64 child_inuse_sum;
  468. u64 child_adjusted_sum;
  469. int hweight_gen;
  470. u32 hweight_active;
  471. u32 hweight_inuse;
  472. u32 hweight_donating;
  473. u32 hweight_after_donation;
  474. struct list_head walk_list;
  475. struct list_head surplus_list;
  476. struct wait_queue_head waitq;
  477. struct hrtimer waitq_timer;
  478. /* timestamp at the latest activation */
  479. u64 activated_at;
  480. /* statistics */
  481. struct iocg_pcpu_stat __percpu *pcpu_stat;
  482. struct iocg_stat stat;
  483. struct iocg_stat last_stat;
  484. u64 last_stat_abs_vusage;
  485. u64 usage_delta_us;
  486. u64 wait_since;
  487. u64 indebt_since;
  488. u64 indelay_since;
  489. /* this iocg's depth in the hierarchy and ancestors including self */
  490. int level;
  491. struct ioc_gq *ancestors[];
  492. };
  493. /* per cgroup */
  494. struct ioc_cgrp {
  495. struct blkcg_policy_data cpd;
  496. unsigned int dfl_weight;
  497. };
  498. struct ioc_now {
  499. u64 now_ns;
  500. u64 now;
  501. u64 vnow;
  502. u64 vrate;
  503. };
  504. struct iocg_wait {
  505. struct wait_queue_entry wait;
  506. struct bio *bio;
  507. u64 abs_cost;
  508. bool committed;
  509. };
  510. struct iocg_wake_ctx {
  511. struct ioc_gq *iocg;
  512. u32 hw_inuse;
  513. s64 vbudget;
  514. };
  515. static const struct ioc_params autop[] = {
  516. [AUTOP_HDD] = {
  517. .qos = {
  518. [QOS_RLAT] = 250000, /* 250ms */
  519. [QOS_WLAT] = 250000,
  520. [QOS_MIN] = VRATE_MIN_PPM,
  521. [QOS_MAX] = VRATE_MAX_PPM,
  522. },
  523. .i_lcoefs = {
  524. [I_LCOEF_RBPS] = 174019176,
  525. [I_LCOEF_RSEQIOPS] = 41708,
  526. [I_LCOEF_RRANDIOPS] = 370,
  527. [I_LCOEF_WBPS] = 178075866,
  528. [I_LCOEF_WSEQIOPS] = 42705,
  529. [I_LCOEF_WRANDIOPS] = 378,
  530. },
  531. },
  532. [AUTOP_SSD_QD1] = {
  533. .qos = {
  534. [QOS_RLAT] = 25000, /* 25ms */
  535. [QOS_WLAT] = 25000,
  536. [QOS_MIN] = VRATE_MIN_PPM,
  537. [QOS_MAX] = VRATE_MAX_PPM,
  538. },
  539. .i_lcoefs = {
  540. [I_LCOEF_RBPS] = 245855193,
  541. [I_LCOEF_RSEQIOPS] = 61575,
  542. [I_LCOEF_RRANDIOPS] = 6946,
  543. [I_LCOEF_WBPS] = 141365009,
  544. [I_LCOEF_WSEQIOPS] = 33716,
  545. [I_LCOEF_WRANDIOPS] = 26796,
  546. },
  547. },
  548. [AUTOP_SSD_DFL] = {
  549. .qos = {
  550. [QOS_RLAT] = 25000, /* 25ms */
  551. [QOS_WLAT] = 25000,
  552. [QOS_MIN] = VRATE_MIN_PPM,
  553. [QOS_MAX] = VRATE_MAX_PPM,
  554. },
  555. .i_lcoefs = {
  556. [I_LCOEF_RBPS] = 488636629,
  557. [I_LCOEF_RSEQIOPS] = 8932,
  558. [I_LCOEF_RRANDIOPS] = 8518,
  559. [I_LCOEF_WBPS] = 427891549,
  560. [I_LCOEF_WSEQIOPS] = 28755,
  561. [I_LCOEF_WRANDIOPS] = 21940,
  562. },
  563. .too_fast_vrate_pct = 500,
  564. },
  565. [AUTOP_SSD_FAST] = {
  566. .qos = {
  567. [QOS_RLAT] = 5000, /* 5ms */
  568. [QOS_WLAT] = 5000,
  569. [QOS_MIN] = VRATE_MIN_PPM,
  570. [QOS_MAX] = VRATE_MAX_PPM,
  571. },
  572. .i_lcoefs = {
  573. [I_LCOEF_RBPS] = 3102524156LLU,
  574. [I_LCOEF_RSEQIOPS] = 724816,
  575. [I_LCOEF_RRANDIOPS] = 778122,
  576. [I_LCOEF_WBPS] = 1742780862LLU,
  577. [I_LCOEF_WSEQIOPS] = 425702,
  578. [I_LCOEF_WRANDIOPS] = 443193,
  579. },
  580. .too_slow_vrate_pct = 10,
  581. },
  582. };
  583. /*
  584. * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
  585. * vtime credit shortage and down on device saturation.
  586. */
  587. static u32 vrate_adj_pct[] =
  588. { 0, 0, 0, 0,
  589. 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
  590. 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  591. 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
  592. static struct blkcg_policy blkcg_policy_iocost;
  593. /* accessors and helpers */
  594. static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
  595. {
  596. return container_of(rqos, struct ioc, rqos);
  597. }
  598. static struct ioc *q_to_ioc(struct request_queue *q)
  599. {
  600. return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
  601. }
  602. static const char __maybe_unused *ioc_name(struct ioc *ioc)
  603. {
  604. struct gendisk *disk = ioc->rqos.q->disk;
  605. if (!disk)
  606. return "<unknown>";
  607. return disk->disk_name;
  608. }
  609. static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
  610. {
  611. return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
  612. }
  613. static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
  614. {
  615. return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
  616. }
  617. static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
  618. {
  619. return pd_to_blkg(&iocg->pd);
  620. }
  621. static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
  622. {
  623. return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
  624. struct ioc_cgrp, cpd);
  625. }
  626. /*
  627. * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
  628. * weight, the more expensive each IO. Must round up.
  629. */
  630. static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
  631. {
  632. return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
  633. }
  634. /*
  635. * The inverse of abs_cost_to_cost(). Must round up.
  636. */
  637. static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
  638. {
  639. return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
  640. }
  641. static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
  642. u64 abs_cost, u64 cost)
  643. {
  644. struct iocg_pcpu_stat *gcs;
  645. bio->bi_iocost_cost = cost;
  646. atomic64_add(cost, &iocg->vtime);
  647. gcs = get_cpu_ptr(iocg->pcpu_stat);
  648. local64_add(abs_cost, &gcs->abs_vusage);
  649. put_cpu_ptr(gcs);
  650. }
  651. static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
  652. {
  653. if (lock_ioc) {
  654. spin_lock_irqsave(&iocg->ioc->lock, *flags);
  655. spin_lock(&iocg->waitq.lock);
  656. } else {
  657. spin_lock_irqsave(&iocg->waitq.lock, *flags);
  658. }
  659. }
  660. static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
  661. {
  662. if (unlock_ioc) {
  663. spin_unlock(&iocg->waitq.lock);
  664. spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
  665. } else {
  666. spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
  667. }
  668. }
  669. #define CREATE_TRACE_POINTS
  670. #include <trace/events/iocost.h>
  671. static void ioc_refresh_margins(struct ioc *ioc)
  672. {
  673. struct ioc_margins *margins = &ioc->margins;
  674. u32 period_us = ioc->period_us;
  675. u64 vrate = ioc->vtime_base_rate;
  676. margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
  677. margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
  678. margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
  679. }
  680. /* latency Qos params changed, update period_us and all the dependent params */
  681. static void ioc_refresh_period_us(struct ioc *ioc)
  682. {
  683. u32 ppm, lat, multi, period_us;
  684. lockdep_assert_held(&ioc->lock);
  685. /* pick the higher latency target */
  686. if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
  687. ppm = ioc->params.qos[QOS_RPPM];
  688. lat = ioc->params.qos[QOS_RLAT];
  689. } else {
  690. ppm = ioc->params.qos[QOS_WPPM];
  691. lat = ioc->params.qos[QOS_WLAT];
  692. }
  693. /*
  694. * We want the period to be long enough to contain a healthy number
  695. * of IOs while short enough for granular control. Define it as a
  696. * multiple of the latency target. Ideally, the multiplier should
  697. * be scaled according to the percentile so that it would nominally
  698. * contain a certain number of requests. Let's be simpler and
  699. * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
  700. */
  701. if (ppm)
  702. multi = max_t(u32, (MILLION - ppm) / 50000, 2);
  703. else
  704. multi = 2;
  705. period_us = multi * lat;
  706. period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
  707. /* calculate dependent params */
  708. ioc->period_us = period_us;
  709. ioc->timer_slack_ns = div64_u64(
  710. (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
  711. 100);
  712. ioc_refresh_margins(ioc);
  713. }
  714. static int ioc_autop_idx(struct ioc *ioc)
  715. {
  716. int idx = ioc->autop_idx;
  717. const struct ioc_params *p = &autop[idx];
  718. u32 vrate_pct;
  719. u64 now_ns;
  720. /* rotational? */
  721. if (!blk_queue_nonrot(ioc->rqos.q))
  722. return AUTOP_HDD;
  723. /* handle SATA SSDs w/ broken NCQ */
  724. if (blk_queue_depth(ioc->rqos.q) == 1)
  725. return AUTOP_SSD_QD1;
  726. /* use one of the normal ssd sets */
  727. if (idx < AUTOP_SSD_DFL)
  728. return AUTOP_SSD_DFL;
  729. /* if user is overriding anything, maintain what was there */
  730. if (ioc->user_qos_params || ioc->user_cost_model)
  731. return idx;
  732. /* step up/down based on the vrate */
  733. vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
  734. now_ns = ktime_get_ns();
  735. if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
  736. if (!ioc->autop_too_fast_at)
  737. ioc->autop_too_fast_at = now_ns;
  738. if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
  739. return idx + 1;
  740. } else {
  741. ioc->autop_too_fast_at = 0;
  742. }
  743. if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
  744. if (!ioc->autop_too_slow_at)
  745. ioc->autop_too_slow_at = now_ns;
  746. if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
  747. return idx - 1;
  748. } else {
  749. ioc->autop_too_slow_at = 0;
  750. }
  751. return idx;
  752. }
  753. /*
  754. * Take the followings as input
  755. *
  756. * @bps maximum sequential throughput
  757. * @seqiops maximum sequential 4k iops
  758. * @randiops maximum random 4k iops
  759. *
  760. * and calculate the linear model cost coefficients.
  761. *
  762. * *@page per-page cost 1s / (@bps / 4096)
  763. * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
  764. * @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0)
  765. */
  766. static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
  767. u64 *page, u64 *seqio, u64 *randio)
  768. {
  769. u64 v;
  770. *page = *seqio = *randio = 0;
  771. if (bps) {
  772. u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE);
  773. if (bps_pages)
  774. *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages);
  775. else
  776. *page = 1;
  777. }
  778. if (seqiops) {
  779. v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
  780. if (v > *page)
  781. *seqio = v - *page;
  782. }
  783. if (randiops) {
  784. v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
  785. if (v > *page)
  786. *randio = v - *page;
  787. }
  788. }
  789. static void ioc_refresh_lcoefs(struct ioc *ioc)
  790. {
  791. u64 *u = ioc->params.i_lcoefs;
  792. u64 *c = ioc->params.lcoefs;
  793. calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
  794. &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
  795. calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
  796. &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
  797. }
  798. static bool ioc_refresh_params(struct ioc *ioc, bool force)
  799. {
  800. const struct ioc_params *p;
  801. int idx;
  802. lockdep_assert_held(&ioc->lock);
  803. idx = ioc_autop_idx(ioc);
  804. p = &autop[idx];
  805. if (idx == ioc->autop_idx && !force)
  806. return false;
  807. if (idx != ioc->autop_idx)
  808. atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
  809. ioc->autop_idx = idx;
  810. ioc->autop_too_fast_at = 0;
  811. ioc->autop_too_slow_at = 0;
  812. if (!ioc->user_qos_params)
  813. memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
  814. if (!ioc->user_cost_model)
  815. memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
  816. ioc_refresh_period_us(ioc);
  817. ioc_refresh_lcoefs(ioc);
  818. ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
  819. VTIME_PER_USEC, MILLION);
  820. ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
  821. VTIME_PER_USEC, MILLION);
  822. return true;
  823. }
  824. /*
  825. * When an iocg accumulates too much vtime or gets deactivated, we throw away
  826. * some vtime, which lowers the overall device utilization. As the exact amount
  827. * which is being thrown away is known, we can compensate by accelerating the
  828. * vrate accordingly so that the extra vtime generated in the current period
  829. * matches what got lost.
  830. */
  831. static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
  832. {
  833. s64 pleft = ioc->period_at + ioc->period_us - now->now;
  834. s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
  835. s64 vcomp, vcomp_min, vcomp_max;
  836. lockdep_assert_held(&ioc->lock);
  837. /* we need some time left in this period */
  838. if (pleft <= 0)
  839. goto done;
  840. /*
  841. * Calculate how much vrate should be adjusted to offset the error.
  842. * Limit the amount of adjustment and deduct the adjusted amount from
  843. * the error.
  844. */
  845. vcomp = -div64_s64(ioc->vtime_err, pleft);
  846. vcomp_min = -(ioc->vtime_base_rate >> 1);
  847. vcomp_max = ioc->vtime_base_rate;
  848. vcomp = clamp(vcomp, vcomp_min, vcomp_max);
  849. ioc->vtime_err += vcomp * pleft;
  850. atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
  851. done:
  852. /* bound how much error can accumulate */
  853. ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
  854. }
  855. static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
  856. int nr_lagging, int nr_shortages,
  857. int prev_busy_level, u32 *missed_ppm)
  858. {
  859. u64 vrate = ioc->vtime_base_rate;
  860. u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
  861. if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
  862. if (ioc->busy_level != prev_busy_level || nr_lagging)
  863. trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
  864. missed_ppm, rq_wait_pct,
  865. nr_lagging, nr_shortages);
  866. return;
  867. }
  868. /*
  869. * If vrate is out of bounds, apply clamp gradually as the
  870. * bounds can change abruptly. Otherwise, apply busy_level
  871. * based adjustment.
  872. */
  873. if (vrate < vrate_min) {
  874. vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
  875. vrate = min(vrate, vrate_min);
  876. } else if (vrate > vrate_max) {
  877. vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
  878. vrate = max(vrate, vrate_max);
  879. } else {
  880. int idx = min_t(int, abs(ioc->busy_level),
  881. ARRAY_SIZE(vrate_adj_pct) - 1);
  882. u32 adj_pct = vrate_adj_pct[idx];
  883. if (ioc->busy_level > 0)
  884. adj_pct = 100 - adj_pct;
  885. else
  886. adj_pct = 100 + adj_pct;
  887. vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
  888. vrate_min, vrate_max);
  889. }
  890. trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
  891. nr_lagging, nr_shortages);
  892. ioc->vtime_base_rate = vrate;
  893. ioc_refresh_margins(ioc);
  894. }
  895. /* take a snapshot of the current [v]time and vrate */
  896. static void ioc_now(struct ioc *ioc, struct ioc_now *now)
  897. {
  898. unsigned seq;
  899. now->now_ns = ktime_get();
  900. now->now = ktime_to_us(now->now_ns);
  901. now->vrate = atomic64_read(&ioc->vtime_rate);
  902. /*
  903. * The current vtime is
  904. *
  905. * vtime at period start + (wallclock time since the start) * vrate
  906. *
  907. * As a consistent snapshot of `period_at_vtime` and `period_at` is
  908. * needed, they're seqcount protected.
  909. */
  910. do {
  911. seq = read_seqcount_begin(&ioc->period_seqcount);
  912. now->vnow = ioc->period_at_vtime +
  913. (now->now - ioc->period_at) * now->vrate;
  914. } while (read_seqcount_retry(&ioc->period_seqcount, seq));
  915. }
  916. static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
  917. {
  918. WARN_ON_ONCE(ioc->running != IOC_RUNNING);
  919. write_seqcount_begin(&ioc->period_seqcount);
  920. ioc->period_at = now->now;
  921. ioc->period_at_vtime = now->vnow;
  922. write_seqcount_end(&ioc->period_seqcount);
  923. ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
  924. add_timer(&ioc->timer);
  925. }
  926. /*
  927. * Update @iocg's `active` and `inuse` to @active and @inuse, update level
  928. * weight sums and propagate upwards accordingly. If @save, the current margin
  929. * is saved to be used as reference for later inuse in-period adjustments.
  930. */
  931. static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
  932. bool save, struct ioc_now *now)
  933. {
  934. struct ioc *ioc = iocg->ioc;
  935. int lvl;
  936. lockdep_assert_held(&ioc->lock);
  937. /*
  938. * For an active leaf node, its inuse shouldn't be zero or exceed
  939. * @active. An active internal node's inuse is solely determined by the
  940. * inuse to active ratio of its children regardless of @inuse.
  941. */
  942. if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
  943. inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
  944. iocg->child_active_sum);
  945. } else {
  946. inuse = clamp_t(u32, inuse, 1, active);
  947. }
  948. iocg->last_inuse = iocg->inuse;
  949. if (save)
  950. iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
  951. if (active == iocg->active && inuse == iocg->inuse)
  952. return;
  953. for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
  954. struct ioc_gq *parent = iocg->ancestors[lvl];
  955. struct ioc_gq *child = iocg->ancestors[lvl + 1];
  956. u32 parent_active = 0, parent_inuse = 0;
  957. /* update the level sums */
  958. parent->child_active_sum += (s32)(active - child->active);
  959. parent->child_inuse_sum += (s32)(inuse - child->inuse);
  960. /* apply the updates */
  961. child->active = active;
  962. child->inuse = inuse;
  963. /*
  964. * The delta between inuse and active sums indicates that
  965. * much of weight is being given away. Parent's inuse
  966. * and active should reflect the ratio.
  967. */
  968. if (parent->child_active_sum) {
  969. parent_active = parent->weight;
  970. parent_inuse = DIV64_U64_ROUND_UP(
  971. parent_active * parent->child_inuse_sum,
  972. parent->child_active_sum);
  973. }
  974. /* do we need to keep walking up? */
  975. if (parent_active == parent->active &&
  976. parent_inuse == parent->inuse)
  977. break;
  978. active = parent_active;
  979. inuse = parent_inuse;
  980. }
  981. ioc->weights_updated = true;
  982. }
  983. static void commit_weights(struct ioc *ioc)
  984. {
  985. lockdep_assert_held(&ioc->lock);
  986. if (ioc->weights_updated) {
  987. /* paired with rmb in current_hweight(), see there */
  988. smp_wmb();
  989. atomic_inc(&ioc->hweight_gen);
  990. ioc->weights_updated = false;
  991. }
  992. }
  993. static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
  994. bool save, struct ioc_now *now)
  995. {
  996. __propagate_weights(iocg, active, inuse, save, now);
  997. commit_weights(iocg->ioc);
  998. }
  999. static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
  1000. {
  1001. struct ioc *ioc = iocg->ioc;
  1002. int lvl;
  1003. u32 hwa, hwi;
  1004. int ioc_gen;
  1005. /* hot path - if uptodate, use cached */
  1006. ioc_gen = atomic_read(&ioc->hweight_gen);
  1007. if (ioc_gen == iocg->hweight_gen)
  1008. goto out;
  1009. /*
  1010. * Paired with wmb in commit_weights(). If we saw the updated
  1011. * hweight_gen, all the weight updates from __propagate_weights() are
  1012. * visible too.
  1013. *
  1014. * We can race with weight updates during calculation and get it
  1015. * wrong. However, hweight_gen would have changed and a future
  1016. * reader will recalculate and we're guaranteed to discard the
  1017. * wrong result soon.
  1018. */
  1019. smp_rmb();
  1020. hwa = hwi = WEIGHT_ONE;
  1021. for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
  1022. struct ioc_gq *parent = iocg->ancestors[lvl];
  1023. struct ioc_gq *child = iocg->ancestors[lvl + 1];
  1024. u64 active_sum = READ_ONCE(parent->child_active_sum);
  1025. u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
  1026. u32 active = READ_ONCE(child->active);
  1027. u32 inuse = READ_ONCE(child->inuse);
  1028. /* we can race with deactivations and either may read as zero */
  1029. if (!active_sum || !inuse_sum)
  1030. continue;
  1031. active_sum = max_t(u64, active, active_sum);
  1032. hwa = div64_u64((u64)hwa * active, active_sum);
  1033. inuse_sum = max_t(u64, inuse, inuse_sum);
  1034. hwi = div64_u64((u64)hwi * inuse, inuse_sum);
  1035. }
  1036. iocg->hweight_active = max_t(u32, hwa, 1);
  1037. iocg->hweight_inuse = max_t(u32, hwi, 1);
  1038. iocg->hweight_gen = ioc_gen;
  1039. out:
  1040. if (hw_activep)
  1041. *hw_activep = iocg->hweight_active;
  1042. if (hw_inusep)
  1043. *hw_inusep = iocg->hweight_inuse;
  1044. }
  1045. /*
  1046. * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
  1047. * other weights stay unchanged.
  1048. */
  1049. static u32 current_hweight_max(struct ioc_gq *iocg)
  1050. {
  1051. u32 hwm = WEIGHT_ONE;
  1052. u32 inuse = iocg->active;
  1053. u64 child_inuse_sum;
  1054. int lvl;
  1055. lockdep_assert_held(&iocg->ioc->lock);
  1056. for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
  1057. struct ioc_gq *parent = iocg->ancestors[lvl];
  1058. struct ioc_gq *child = iocg->ancestors[lvl + 1];
  1059. child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
  1060. hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
  1061. inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
  1062. parent->child_active_sum);
  1063. }
  1064. return max_t(u32, hwm, 1);
  1065. }
  1066. static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
  1067. {
  1068. struct ioc *ioc = iocg->ioc;
  1069. struct blkcg_gq *blkg = iocg_to_blkg(iocg);
  1070. struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
  1071. u32 weight;
  1072. lockdep_assert_held(&ioc->lock);
  1073. weight = iocg->cfg_weight ?: iocc->dfl_weight;
  1074. if (weight != iocg->weight && iocg->active)
  1075. propagate_weights(iocg, weight, iocg->inuse, true, now);
  1076. iocg->weight = weight;
  1077. }
  1078. static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
  1079. {
  1080. struct ioc *ioc = iocg->ioc;
  1081. u64 last_period, cur_period;
  1082. u64 vtime, vtarget;
  1083. int i;
  1084. /*
  1085. * If seem to be already active, just update the stamp to tell the
  1086. * timer that we're still active. We don't mind occassional races.
  1087. */
  1088. if (!list_empty(&iocg->active_list)) {
  1089. ioc_now(ioc, now);
  1090. cur_period = atomic64_read(&ioc->cur_period);
  1091. if (atomic64_read(&iocg->active_period) != cur_period)
  1092. atomic64_set(&iocg->active_period, cur_period);
  1093. return true;
  1094. }
  1095. /* racy check on internal node IOs, treat as root level IOs */
  1096. if (iocg->child_active_sum)
  1097. return false;
  1098. spin_lock_irq(&ioc->lock);
  1099. ioc_now(ioc, now);
  1100. /* update period */
  1101. cur_period = atomic64_read(&ioc->cur_period);
  1102. last_period = atomic64_read(&iocg->active_period);
  1103. atomic64_set(&iocg->active_period, cur_period);
  1104. /* already activated or breaking leaf-only constraint? */
  1105. if (!list_empty(&iocg->active_list))
  1106. goto succeed_unlock;
  1107. for (i = iocg->level - 1; i > 0; i--)
  1108. if (!list_empty(&iocg->ancestors[i]->active_list))
  1109. goto fail_unlock;
  1110. if (iocg->child_active_sum)
  1111. goto fail_unlock;
  1112. /*
  1113. * Always start with the target budget. On deactivation, we throw away
  1114. * anything above it.
  1115. */
  1116. vtarget = now->vnow - ioc->margins.target;
  1117. vtime = atomic64_read(&iocg->vtime);
  1118. atomic64_add(vtarget - vtime, &iocg->vtime);
  1119. atomic64_add(vtarget - vtime, &iocg->done_vtime);
  1120. vtime = vtarget;
  1121. /*
  1122. * Activate, propagate weight and start period timer if not
  1123. * running. Reset hweight_gen to avoid accidental match from
  1124. * wrapping.
  1125. */
  1126. iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
  1127. list_add(&iocg->active_list, &ioc->active_iocgs);
  1128. propagate_weights(iocg, iocg->weight,
  1129. iocg->last_inuse ?: iocg->weight, true, now);
  1130. TRACE_IOCG_PATH(iocg_activate, iocg, now,
  1131. last_period, cur_period, vtime);
  1132. iocg->activated_at = now->now;
  1133. if (ioc->running == IOC_IDLE) {
  1134. ioc->running = IOC_RUNNING;
  1135. ioc->dfgv_period_at = now->now;
  1136. ioc->dfgv_period_rem = 0;
  1137. ioc_start_period(ioc, now);
  1138. }
  1139. succeed_unlock:
  1140. spin_unlock_irq(&ioc->lock);
  1141. return true;
  1142. fail_unlock:
  1143. spin_unlock_irq(&ioc->lock);
  1144. return false;
  1145. }
  1146. static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
  1147. {
  1148. struct ioc *ioc = iocg->ioc;
  1149. struct blkcg_gq *blkg = iocg_to_blkg(iocg);
  1150. u64 tdelta, delay, new_delay;
  1151. s64 vover, vover_pct;
  1152. u32 hwa;
  1153. lockdep_assert_held(&iocg->waitq.lock);
  1154. /* calculate the current delay in effect - 1/2 every second */
  1155. tdelta = now->now - iocg->delay_at;
  1156. if (iocg->delay)
  1157. delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
  1158. else
  1159. delay = 0;
  1160. /* calculate the new delay from the debt amount */
  1161. current_hweight(iocg, &hwa, NULL);
  1162. vover = atomic64_read(&iocg->vtime) +
  1163. abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
  1164. vover_pct = div64_s64(100 * vover,
  1165. ioc->period_us * ioc->vtime_base_rate);
  1166. if (vover_pct <= MIN_DELAY_THR_PCT)
  1167. new_delay = 0;
  1168. else if (vover_pct >= MAX_DELAY_THR_PCT)
  1169. new_delay = MAX_DELAY;
  1170. else
  1171. new_delay = MIN_DELAY +
  1172. div_u64((MAX_DELAY - MIN_DELAY) *
  1173. (vover_pct - MIN_DELAY_THR_PCT),
  1174. MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
  1175. /* pick the higher one and apply */
  1176. if (new_delay > delay) {
  1177. iocg->delay = new_delay;
  1178. iocg->delay_at = now->now;
  1179. delay = new_delay;
  1180. }
  1181. if (delay >= MIN_DELAY) {
  1182. if (!iocg->indelay_since)
  1183. iocg->indelay_since = now->now;
  1184. blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
  1185. return true;
  1186. } else {
  1187. if (iocg->indelay_since) {
  1188. iocg->stat.indelay_us += now->now - iocg->indelay_since;
  1189. iocg->indelay_since = 0;
  1190. }
  1191. iocg->delay = 0;
  1192. blkcg_clear_delay(blkg);
  1193. return false;
  1194. }
  1195. }
  1196. static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
  1197. struct ioc_now *now)
  1198. {
  1199. struct iocg_pcpu_stat *gcs;
  1200. lockdep_assert_held(&iocg->ioc->lock);
  1201. lockdep_assert_held(&iocg->waitq.lock);
  1202. WARN_ON_ONCE(list_empty(&iocg->active_list));
  1203. /*
  1204. * Once in debt, debt handling owns inuse. @iocg stays at the minimum
  1205. * inuse donating all of it share to others until its debt is paid off.
  1206. */
  1207. if (!iocg->abs_vdebt && abs_cost) {
  1208. iocg->indebt_since = now->now;
  1209. propagate_weights(iocg, iocg->active, 0, false, now);
  1210. }
  1211. iocg->abs_vdebt += abs_cost;
  1212. gcs = get_cpu_ptr(iocg->pcpu_stat);
  1213. local64_add(abs_cost, &gcs->abs_vusage);
  1214. put_cpu_ptr(gcs);
  1215. }
  1216. static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
  1217. struct ioc_now *now)
  1218. {
  1219. lockdep_assert_held(&iocg->ioc->lock);
  1220. lockdep_assert_held(&iocg->waitq.lock);
  1221. /* make sure that nobody messed with @iocg */
  1222. WARN_ON_ONCE(list_empty(&iocg->active_list));
  1223. WARN_ON_ONCE(iocg->inuse > 1);
  1224. iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
  1225. /* if debt is paid in full, restore inuse */
  1226. if (!iocg->abs_vdebt) {
  1227. iocg->stat.indebt_us += now->now - iocg->indebt_since;
  1228. iocg->indebt_since = 0;
  1229. propagate_weights(iocg, iocg->active, iocg->last_inuse,
  1230. false, now);
  1231. }
  1232. }
  1233. static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
  1234. int flags, void *key)
  1235. {
  1236. struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
  1237. struct iocg_wake_ctx *ctx = key;
  1238. u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
  1239. ctx->vbudget -= cost;
  1240. if (ctx->vbudget < 0)
  1241. return -1;
  1242. iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
  1243. wait->committed = true;
  1244. /*
  1245. * autoremove_wake_function() removes the wait entry only when it
  1246. * actually changed the task state. We want the wait always removed.
  1247. * Remove explicitly and use default_wake_function(). Note that the
  1248. * order of operations is important as finish_wait() tests whether
  1249. * @wq_entry is removed without grabbing the lock.
  1250. */
  1251. default_wake_function(wq_entry, mode, flags, key);
  1252. list_del_init_careful(&wq_entry->entry);
  1253. return 0;
  1254. }
  1255. /*
  1256. * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
  1257. * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
  1258. * addition to iocg->waitq.lock.
  1259. */
  1260. static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
  1261. struct ioc_now *now)
  1262. {
  1263. struct ioc *ioc = iocg->ioc;
  1264. struct iocg_wake_ctx ctx = { .iocg = iocg };
  1265. u64 vshortage, expires, oexpires;
  1266. s64 vbudget;
  1267. u32 hwa;
  1268. lockdep_assert_held(&iocg->waitq.lock);
  1269. current_hweight(iocg, &hwa, NULL);
  1270. vbudget = now->vnow - atomic64_read(&iocg->vtime);
  1271. /* pay off debt */
  1272. if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
  1273. u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
  1274. u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
  1275. u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
  1276. lockdep_assert_held(&ioc->lock);
  1277. atomic64_add(vpay, &iocg->vtime);
  1278. atomic64_add(vpay, &iocg->done_vtime);
  1279. iocg_pay_debt(iocg, abs_vpay, now);
  1280. vbudget -= vpay;
  1281. }
  1282. if (iocg->abs_vdebt || iocg->delay)
  1283. iocg_kick_delay(iocg, now);
  1284. /*
  1285. * Debt can still be outstanding if we haven't paid all yet or the
  1286. * caller raced and called without @pay_debt. Shouldn't wake up waiters
  1287. * under debt. Make sure @vbudget reflects the outstanding amount and is
  1288. * not positive.
  1289. */
  1290. if (iocg->abs_vdebt) {
  1291. s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
  1292. vbudget = min_t(s64, 0, vbudget - vdebt);
  1293. }
  1294. /*
  1295. * Wake up the ones which are due and see how much vtime we'll need for
  1296. * the next one. As paying off debt restores hw_inuse, it must be read
  1297. * after the above debt payment.
  1298. */
  1299. ctx.vbudget = vbudget;
  1300. current_hweight(iocg, NULL, &ctx.hw_inuse);
  1301. __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
  1302. if (!waitqueue_active(&iocg->waitq)) {
  1303. if (iocg->wait_since) {
  1304. iocg->stat.wait_us += now->now - iocg->wait_since;
  1305. iocg->wait_since = 0;
  1306. }
  1307. return;
  1308. }
  1309. if (!iocg->wait_since)
  1310. iocg->wait_since = now->now;
  1311. if (WARN_ON_ONCE(ctx.vbudget >= 0))
  1312. return;
  1313. /* determine next wakeup, add a timer margin to guarantee chunking */
  1314. vshortage = -ctx.vbudget;
  1315. expires = now->now_ns +
  1316. DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
  1317. NSEC_PER_USEC;
  1318. expires += ioc->timer_slack_ns;
  1319. /* if already active and close enough, don't bother */
  1320. oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
  1321. if (hrtimer_is_queued(&iocg->waitq_timer) &&
  1322. abs(oexpires - expires) <= ioc->timer_slack_ns)
  1323. return;
  1324. hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
  1325. ioc->timer_slack_ns, HRTIMER_MODE_ABS);
  1326. }
  1327. static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
  1328. {
  1329. struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
  1330. bool pay_debt = READ_ONCE(iocg->abs_vdebt);
  1331. struct ioc_now now;
  1332. unsigned long flags;
  1333. ioc_now(iocg->ioc, &now);
  1334. iocg_lock(iocg, pay_debt, &flags);
  1335. iocg_kick_waitq(iocg, pay_debt, &now);
  1336. iocg_unlock(iocg, pay_debt, &flags);
  1337. return HRTIMER_NORESTART;
  1338. }
  1339. static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
  1340. {
  1341. u32 nr_met[2] = { };
  1342. u32 nr_missed[2] = { };
  1343. u64 rq_wait_ns = 0;
  1344. int cpu, rw;
  1345. for_each_online_cpu(cpu) {
  1346. struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
  1347. u64 this_rq_wait_ns;
  1348. for (rw = READ; rw <= WRITE; rw++) {
  1349. u32 this_met = local_read(&stat->missed[rw].nr_met);
  1350. u32 this_missed = local_read(&stat->missed[rw].nr_missed);
  1351. nr_met[rw] += this_met - stat->missed[rw].last_met;
  1352. nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
  1353. stat->missed[rw].last_met = this_met;
  1354. stat->missed[rw].last_missed = this_missed;
  1355. }
  1356. this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
  1357. rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
  1358. stat->last_rq_wait_ns = this_rq_wait_ns;
  1359. }
  1360. for (rw = READ; rw <= WRITE; rw++) {
  1361. if (nr_met[rw] + nr_missed[rw])
  1362. missed_ppm_ar[rw] =
  1363. DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
  1364. nr_met[rw] + nr_missed[rw]);
  1365. else
  1366. missed_ppm_ar[rw] = 0;
  1367. }
  1368. *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
  1369. ioc->period_us * NSEC_PER_USEC);
  1370. }
  1371. /* was iocg idle this period? */
  1372. static bool iocg_is_idle(struct ioc_gq *iocg)
  1373. {
  1374. struct ioc *ioc = iocg->ioc;
  1375. /* did something get issued this period? */
  1376. if (atomic64_read(&iocg->active_period) ==
  1377. atomic64_read(&ioc->cur_period))
  1378. return false;
  1379. /* is something in flight? */
  1380. if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
  1381. return false;
  1382. return true;
  1383. }
  1384. /*
  1385. * Call this function on the target leaf @iocg's to build pre-order traversal
  1386. * list of all the ancestors in @inner_walk. The inner nodes are linked through
  1387. * ->walk_list and the caller is responsible for dissolving the list after use.
  1388. */
  1389. static void iocg_build_inner_walk(struct ioc_gq *iocg,
  1390. struct list_head *inner_walk)
  1391. {
  1392. int lvl;
  1393. WARN_ON_ONCE(!list_empty(&iocg->walk_list));
  1394. /* find the first ancestor which hasn't been visited yet */
  1395. for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
  1396. if (!list_empty(&iocg->ancestors[lvl]->walk_list))
  1397. break;
  1398. }
  1399. /* walk down and visit the inner nodes to get pre-order traversal */
  1400. while (++lvl <= iocg->level - 1) {
  1401. struct ioc_gq *inner = iocg->ancestors[lvl];
  1402. /* record traversal order */
  1403. list_add_tail(&inner->walk_list, inner_walk);
  1404. }
  1405. }
  1406. /* propagate the deltas to the parent */
  1407. static void iocg_flush_stat_upward(struct ioc_gq *iocg)
  1408. {
  1409. if (iocg->level > 0) {
  1410. struct iocg_stat *parent_stat =
  1411. &iocg->ancestors[iocg->level - 1]->stat;
  1412. parent_stat->usage_us +=
  1413. iocg->stat.usage_us - iocg->last_stat.usage_us;
  1414. parent_stat->wait_us +=
  1415. iocg->stat.wait_us - iocg->last_stat.wait_us;
  1416. parent_stat->indebt_us +=
  1417. iocg->stat.indebt_us - iocg->last_stat.indebt_us;
  1418. parent_stat->indelay_us +=
  1419. iocg->stat.indelay_us - iocg->last_stat.indelay_us;
  1420. }
  1421. iocg->last_stat = iocg->stat;
  1422. }
  1423. /* collect per-cpu counters and propagate the deltas to the parent */
  1424. static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now)
  1425. {
  1426. struct ioc *ioc = iocg->ioc;
  1427. u64 abs_vusage = 0;
  1428. u64 vusage_delta;
  1429. int cpu;
  1430. lockdep_assert_held(&iocg->ioc->lock);
  1431. /* collect per-cpu counters */
  1432. for_each_possible_cpu(cpu) {
  1433. abs_vusage += local64_read(
  1434. per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
  1435. }
  1436. vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
  1437. iocg->last_stat_abs_vusage = abs_vusage;
  1438. iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
  1439. iocg->stat.usage_us += iocg->usage_delta_us;
  1440. iocg_flush_stat_upward(iocg);
  1441. }
  1442. /* get stat counters ready for reading on all active iocgs */
  1443. static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
  1444. {
  1445. LIST_HEAD(inner_walk);
  1446. struct ioc_gq *iocg, *tiocg;
  1447. /* flush leaves and build inner node walk list */
  1448. list_for_each_entry(iocg, target_iocgs, active_list) {
  1449. iocg_flush_stat_leaf(iocg, now);
  1450. iocg_build_inner_walk(iocg, &inner_walk);
  1451. }
  1452. /* keep flushing upwards by walking the inner list backwards */
  1453. list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
  1454. iocg_flush_stat_upward(iocg);
  1455. list_del_init(&iocg->walk_list);
  1456. }
  1457. }
  1458. /*
  1459. * Determine what @iocg's hweight_inuse should be after donating unused
  1460. * capacity. @hwm is the upper bound and used to signal no donation. This
  1461. * function also throws away @iocg's excess budget.
  1462. */
  1463. static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
  1464. u32 usage, struct ioc_now *now)
  1465. {
  1466. struct ioc *ioc = iocg->ioc;
  1467. u64 vtime = atomic64_read(&iocg->vtime);
  1468. s64 excess, delta, target, new_hwi;
  1469. /* debt handling owns inuse for debtors */
  1470. if (iocg->abs_vdebt)
  1471. return 1;
  1472. /* see whether minimum margin requirement is met */
  1473. if (waitqueue_active(&iocg->waitq) ||
  1474. time_after64(vtime, now->vnow - ioc->margins.min))
  1475. return hwm;
  1476. /* throw away excess above target */
  1477. excess = now->vnow - vtime - ioc->margins.target;
  1478. if (excess > 0) {
  1479. atomic64_add(excess, &iocg->vtime);
  1480. atomic64_add(excess, &iocg->done_vtime);
  1481. vtime += excess;
  1482. ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
  1483. }
  1484. /*
  1485. * Let's say the distance between iocg's and device's vtimes as a
  1486. * fraction of period duration is delta. Assuming that the iocg will
  1487. * consume the usage determined above, we want to determine new_hwi so
  1488. * that delta equals MARGIN_TARGET at the end of the next period.
  1489. *
  1490. * We need to execute usage worth of IOs while spending the sum of the
  1491. * new budget (1 - MARGIN_TARGET) and the leftover from the last period
  1492. * (delta):
  1493. *
  1494. * usage = (1 - MARGIN_TARGET + delta) * new_hwi
  1495. *
  1496. * Therefore, the new_hwi is:
  1497. *
  1498. * new_hwi = usage / (1 - MARGIN_TARGET + delta)
  1499. */
  1500. delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
  1501. now->vnow - ioc->period_at_vtime);
  1502. target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
  1503. new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
  1504. return clamp_t(s64, new_hwi, 1, hwm);
  1505. }
  1506. /*
  1507. * For work-conservation, an iocg which isn't using all of its share should
  1508. * donate the leftover to other iocgs. There are two ways to achieve this - 1.
  1509. * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
  1510. *
  1511. * #1 is mathematically simpler but has the drawback of requiring synchronous
  1512. * global hweight_inuse updates when idle iocg's get activated or inuse weights
  1513. * change due to donation snapbacks as it has the possibility of grossly
  1514. * overshooting what's allowed by the model and vrate.
  1515. *
  1516. * #2 is inherently safe with local operations. The donating iocg can easily
  1517. * snap back to higher weights when needed without worrying about impacts on
  1518. * other nodes as the impacts will be inherently correct. This also makes idle
  1519. * iocg activations safe. The only effect activations have is decreasing
  1520. * hweight_inuse of others, the right solution to which is for those iocgs to
  1521. * snap back to higher weights.
  1522. *
  1523. * So, we go with #2. The challenge is calculating how each donating iocg's
  1524. * inuse should be adjusted to achieve the target donation amounts. This is done
  1525. * using Andy's method described in the following pdf.
  1526. *
  1527. * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
  1528. *
  1529. * Given the weights and target after-donation hweight_inuse values, Andy's
  1530. * method determines how the proportional distribution should look like at each
  1531. * sibling level to maintain the relative relationship between all non-donating
  1532. * pairs. To roughly summarize, it divides the tree into donating and
  1533. * non-donating parts, calculates global donation rate which is used to
  1534. * determine the target hweight_inuse for each node, and then derives per-level
  1535. * proportions.
  1536. *
  1537. * The following pdf shows that global distribution calculated this way can be
  1538. * achieved by scaling inuse weights of donating leaves and propagating the
  1539. * adjustments upwards proportionally.
  1540. *
  1541. * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
  1542. *
  1543. * Combining the above two, we can determine how each leaf iocg's inuse should
  1544. * be adjusted to achieve the target donation.
  1545. *
  1546. * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
  1547. *
  1548. * The inline comments use symbols from the last pdf.
  1549. *
  1550. * b is the sum of the absolute budgets in the subtree. 1 for the root node.
  1551. * f is the sum of the absolute budgets of non-donating nodes in the subtree.
  1552. * t is the sum of the absolute budgets of donating nodes in the subtree.
  1553. * w is the weight of the node. w = w_f + w_t
  1554. * w_f is the non-donating portion of w. w_f = w * f / b
  1555. * w_b is the donating portion of w. w_t = w * t / b
  1556. * s is the sum of all sibling weights. s = Sum(w) for siblings
  1557. * s_f and s_t are the non-donating and donating portions of s.
  1558. *
  1559. * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
  1560. * w_pt is the donating portion of the parent's weight and w'_pt the same value
  1561. * after adjustments. Subscript r denotes the root node's values.
  1562. */
  1563. static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
  1564. {
  1565. LIST_HEAD(over_hwa);
  1566. LIST_HEAD(inner_walk);
  1567. struct ioc_gq *iocg, *tiocg, *root_iocg;
  1568. u32 after_sum, over_sum, over_target, gamma;
  1569. /*
  1570. * It's pretty unlikely but possible for the total sum of
  1571. * hweight_after_donation's to be higher than WEIGHT_ONE, which will
  1572. * confuse the following calculations. If such condition is detected,
  1573. * scale down everyone over its full share equally to keep the sum below
  1574. * WEIGHT_ONE.
  1575. */
  1576. after_sum = 0;
  1577. over_sum = 0;
  1578. list_for_each_entry(iocg, surpluses, surplus_list) {
  1579. u32 hwa;
  1580. current_hweight(iocg, &hwa, NULL);
  1581. after_sum += iocg->hweight_after_donation;
  1582. if (iocg->hweight_after_donation > hwa) {
  1583. over_sum += iocg->hweight_after_donation;
  1584. list_add(&iocg->walk_list, &over_hwa);
  1585. }
  1586. }
  1587. if (after_sum >= WEIGHT_ONE) {
  1588. /*
  1589. * The delta should be deducted from the over_sum, calculate
  1590. * target over_sum value.
  1591. */
  1592. u32 over_delta = after_sum - (WEIGHT_ONE - 1);
  1593. WARN_ON_ONCE(over_sum <= over_delta);
  1594. over_target = over_sum - over_delta;
  1595. } else {
  1596. over_target = 0;
  1597. }
  1598. list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
  1599. if (over_target)
  1600. iocg->hweight_after_donation =
  1601. div_u64((u64)iocg->hweight_after_donation *
  1602. over_target, over_sum);
  1603. list_del_init(&iocg->walk_list);
  1604. }
  1605. /*
  1606. * Build pre-order inner node walk list and prepare for donation
  1607. * adjustment calculations.
  1608. */
  1609. list_for_each_entry(iocg, surpluses, surplus_list) {
  1610. iocg_build_inner_walk(iocg, &inner_walk);
  1611. }
  1612. root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
  1613. WARN_ON_ONCE(root_iocg->level > 0);
  1614. list_for_each_entry(iocg, &inner_walk, walk_list) {
  1615. iocg->child_adjusted_sum = 0;
  1616. iocg->hweight_donating = 0;
  1617. iocg->hweight_after_donation = 0;
  1618. }
  1619. /*
  1620. * Propagate the donating budget (b_t) and after donation budget (b'_t)
  1621. * up the hierarchy.
  1622. */
  1623. list_for_each_entry(iocg, surpluses, surplus_list) {
  1624. struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
  1625. parent->hweight_donating += iocg->hweight_donating;
  1626. parent->hweight_after_donation += iocg->hweight_after_donation;
  1627. }
  1628. list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
  1629. if (iocg->level > 0) {
  1630. struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
  1631. parent->hweight_donating += iocg->hweight_donating;
  1632. parent->hweight_after_donation += iocg->hweight_after_donation;
  1633. }
  1634. }
  1635. /*
  1636. * Calculate inner hwa's (b) and make sure the donation values are
  1637. * within the accepted ranges as we're doing low res calculations with
  1638. * roundups.
  1639. */
  1640. list_for_each_entry(iocg, &inner_walk, walk_list) {
  1641. if (iocg->level) {
  1642. struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
  1643. iocg->hweight_active = DIV64_U64_ROUND_UP(
  1644. (u64)parent->hweight_active * iocg->active,
  1645. parent->child_active_sum);
  1646. }
  1647. iocg->hweight_donating = min(iocg->hweight_donating,
  1648. iocg->hweight_active);
  1649. iocg->hweight_after_donation = min(iocg->hweight_after_donation,
  1650. iocg->hweight_donating - 1);
  1651. if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
  1652. iocg->hweight_donating <= 1 ||
  1653. iocg->hweight_after_donation == 0)) {
  1654. pr_warn("iocg: invalid donation weights in ");
  1655. pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
  1656. pr_cont(": active=%u donating=%u after=%u\n",
  1657. iocg->hweight_active, iocg->hweight_donating,
  1658. iocg->hweight_after_donation);
  1659. }
  1660. }
  1661. /*
  1662. * Calculate the global donation rate (gamma) - the rate to adjust
  1663. * non-donating budgets by.
  1664. *
  1665. * No need to use 64bit multiplication here as the first operand is
  1666. * guaranteed to be smaller than WEIGHT_ONE (1<<16).
  1667. *
  1668. * We know that there are beneficiary nodes and the sum of the donating
  1669. * hweights can't be whole; however, due to the round-ups during hweight
  1670. * calculations, root_iocg->hweight_donating might still end up equal to
  1671. * or greater than whole. Limit the range when calculating the divider.
  1672. *
  1673. * gamma = (1 - t_r') / (1 - t_r)
  1674. */
  1675. gamma = DIV_ROUND_UP(
  1676. (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
  1677. WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
  1678. /*
  1679. * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
  1680. * nodes.
  1681. */
  1682. list_for_each_entry(iocg, &inner_walk, walk_list) {
  1683. struct ioc_gq *parent;
  1684. u32 inuse, wpt, wptp;
  1685. u64 st, sf;
  1686. if (iocg->level == 0) {
  1687. /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
  1688. iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
  1689. iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
  1690. WEIGHT_ONE - iocg->hweight_after_donation);
  1691. continue;
  1692. }
  1693. parent = iocg->ancestors[iocg->level - 1];
  1694. /* b' = gamma * b_f + b_t' */
  1695. iocg->hweight_inuse = DIV64_U64_ROUND_UP(
  1696. (u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
  1697. WEIGHT_ONE) + iocg->hweight_after_donation;
  1698. /* w' = s' * b' / b'_p */
  1699. inuse = DIV64_U64_ROUND_UP(
  1700. (u64)parent->child_adjusted_sum * iocg->hweight_inuse,
  1701. parent->hweight_inuse);
  1702. /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
  1703. st = DIV64_U64_ROUND_UP(
  1704. iocg->child_active_sum * iocg->hweight_donating,
  1705. iocg->hweight_active);
  1706. sf = iocg->child_active_sum - st;
  1707. wpt = DIV64_U64_ROUND_UP(
  1708. (u64)iocg->active * iocg->hweight_donating,
  1709. iocg->hweight_active);
  1710. wptp = DIV64_U64_ROUND_UP(
  1711. (u64)inuse * iocg->hweight_after_donation,
  1712. iocg->hweight_inuse);
  1713. iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
  1714. }
  1715. /*
  1716. * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
  1717. * we can finally determine leaf adjustments.
  1718. */
  1719. list_for_each_entry(iocg, surpluses, surplus_list) {
  1720. struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
  1721. u32 inuse;
  1722. /*
  1723. * In-debt iocgs participated in the donation calculation with
  1724. * the minimum target hweight_inuse. Configuring inuse
  1725. * accordingly would work fine but debt handling expects
  1726. * @iocg->inuse stay at the minimum and we don't wanna
  1727. * interfere.
  1728. */
  1729. if (iocg->abs_vdebt) {
  1730. WARN_ON_ONCE(iocg->inuse > 1);
  1731. continue;
  1732. }
  1733. /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
  1734. inuse = DIV64_U64_ROUND_UP(
  1735. parent->child_adjusted_sum * iocg->hweight_after_donation,
  1736. parent->hweight_inuse);
  1737. TRACE_IOCG_PATH(inuse_transfer, iocg, now,
  1738. iocg->inuse, inuse,
  1739. iocg->hweight_inuse,
  1740. iocg->hweight_after_donation);
  1741. __propagate_weights(iocg, iocg->active, inuse, true, now);
  1742. }
  1743. /* walk list should be dissolved after use */
  1744. list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
  1745. list_del_init(&iocg->walk_list);
  1746. }
  1747. /*
  1748. * A low weight iocg can amass a large amount of debt, for example, when
  1749. * anonymous memory gets reclaimed aggressively. If the system has a lot of
  1750. * memory paired with a slow IO device, the debt can span multiple seconds or
  1751. * more. If there are no other subsequent IO issuers, the in-debt iocg may end
  1752. * up blocked paying its debt while the IO device is idle.
  1753. *
  1754. * The following protects against such cases. If the device has been
  1755. * sufficiently idle for a while, the debts are halved and delays are
  1756. * recalculated.
  1757. */
  1758. static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
  1759. struct ioc_now *now)
  1760. {
  1761. struct ioc_gq *iocg;
  1762. u64 dur, usage_pct, nr_cycles;
  1763. /* if no debtor, reset the cycle */
  1764. if (!nr_debtors) {
  1765. ioc->dfgv_period_at = now->now;
  1766. ioc->dfgv_period_rem = 0;
  1767. ioc->dfgv_usage_us_sum = 0;
  1768. return;
  1769. }
  1770. /*
  1771. * Debtors can pass through a lot of writes choking the device and we
  1772. * don't want to be forgiving debts while the device is struggling from
  1773. * write bursts. If we're missing latency targets, consider the device
  1774. * fully utilized.
  1775. */
  1776. if (ioc->busy_level > 0)
  1777. usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
  1778. ioc->dfgv_usage_us_sum += usage_us_sum;
  1779. if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
  1780. return;
  1781. /*
  1782. * At least DFGV_PERIOD has passed since the last period. Calculate the
  1783. * average usage and reset the period counters.
  1784. */
  1785. dur = now->now - ioc->dfgv_period_at;
  1786. usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
  1787. ioc->dfgv_period_at = now->now;
  1788. ioc->dfgv_usage_us_sum = 0;
  1789. /* if was too busy, reset everything */
  1790. if (usage_pct > DFGV_USAGE_PCT) {
  1791. ioc->dfgv_period_rem = 0;
  1792. return;
  1793. }
  1794. /*
  1795. * Usage is lower than threshold. Let's forgive some debts. Debt
  1796. * forgiveness runs off of the usual ioc timer but its period usually
  1797. * doesn't match ioc's. Compensate the difference by performing the
  1798. * reduction as many times as would fit in the duration since the last
  1799. * run and carrying over the left-over duration in @ioc->dfgv_period_rem
  1800. * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive
  1801. * reductions is doubled.
  1802. */
  1803. nr_cycles = dur + ioc->dfgv_period_rem;
  1804. ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
  1805. list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
  1806. u64 __maybe_unused old_debt, __maybe_unused old_delay;
  1807. if (!iocg->abs_vdebt && !iocg->delay)
  1808. continue;
  1809. spin_lock(&iocg->waitq.lock);
  1810. old_debt = iocg->abs_vdebt;
  1811. old_delay = iocg->delay;
  1812. if (iocg->abs_vdebt)
  1813. iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
  1814. if (iocg->delay)
  1815. iocg->delay = iocg->delay >> nr_cycles ?: 1;
  1816. iocg_kick_waitq(iocg, true, now);
  1817. TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
  1818. old_debt, iocg->abs_vdebt,
  1819. old_delay, iocg->delay);
  1820. spin_unlock(&iocg->waitq.lock);
  1821. }
  1822. }
  1823. /*
  1824. * Check the active iocgs' state to avoid oversleeping and deactive
  1825. * idle iocgs.
  1826. *
  1827. * Since waiters determine the sleep durations based on the vrate
  1828. * they saw at the time of sleep, if vrate has increased, some
  1829. * waiters could be sleeping for too long. Wake up tardy waiters
  1830. * which should have woken up in the last period and expire idle
  1831. * iocgs.
  1832. */
  1833. static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
  1834. {
  1835. int nr_debtors = 0;
  1836. struct ioc_gq *iocg, *tiocg;
  1837. list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
  1838. if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
  1839. !iocg->delay && !iocg_is_idle(iocg))
  1840. continue;
  1841. spin_lock(&iocg->waitq.lock);
  1842. /* flush wait and indebt stat deltas */
  1843. if (iocg->wait_since) {
  1844. iocg->stat.wait_us += now->now - iocg->wait_since;
  1845. iocg->wait_since = now->now;
  1846. }
  1847. if (iocg->indebt_since) {
  1848. iocg->stat.indebt_us +=
  1849. now->now - iocg->indebt_since;
  1850. iocg->indebt_since = now->now;
  1851. }
  1852. if (iocg->indelay_since) {
  1853. iocg->stat.indelay_us +=
  1854. now->now - iocg->indelay_since;
  1855. iocg->indelay_since = now->now;
  1856. }
  1857. if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
  1858. iocg->delay) {
  1859. /* might be oversleeping vtime / hweight changes, kick */
  1860. iocg_kick_waitq(iocg, true, now);
  1861. if (iocg->abs_vdebt || iocg->delay)
  1862. nr_debtors++;
  1863. } else if (iocg_is_idle(iocg)) {
  1864. /* no waiter and idle, deactivate */
  1865. u64 vtime = atomic64_read(&iocg->vtime);
  1866. s64 excess;
  1867. /*
  1868. * @iocg has been inactive for a full duration and will
  1869. * have a high budget. Account anything above target as
  1870. * error and throw away. On reactivation, it'll start
  1871. * with the target budget.
  1872. */
  1873. excess = now->vnow - vtime - ioc->margins.target;
  1874. if (excess > 0) {
  1875. u32 old_hwi;
  1876. current_hweight(iocg, NULL, &old_hwi);
  1877. ioc->vtime_err -= div64_u64(excess * old_hwi,
  1878. WEIGHT_ONE);
  1879. }
  1880. TRACE_IOCG_PATH(iocg_idle, iocg, now,
  1881. atomic64_read(&iocg->active_period),
  1882. atomic64_read(&ioc->cur_period), vtime);
  1883. __propagate_weights(iocg, 0, 0, false, now);
  1884. list_del_init(&iocg->active_list);
  1885. }
  1886. spin_unlock(&iocg->waitq.lock);
  1887. }
  1888. commit_weights(ioc);
  1889. return nr_debtors;
  1890. }
  1891. static void ioc_timer_fn(struct timer_list *timer)
  1892. {
  1893. struct ioc *ioc = container_of(timer, struct ioc, timer);
  1894. struct ioc_gq *iocg, *tiocg;
  1895. struct ioc_now now;
  1896. LIST_HEAD(surpluses);
  1897. int nr_debtors, nr_shortages = 0, nr_lagging = 0;
  1898. u64 usage_us_sum = 0;
  1899. u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
  1900. u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
  1901. u32 missed_ppm[2], rq_wait_pct;
  1902. u64 period_vtime;
  1903. int prev_busy_level;
  1904. /* how were the latencies during the period? */
  1905. ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
  1906. /* take care of active iocgs */
  1907. spin_lock_irq(&ioc->lock);
  1908. ioc_now(ioc, &now);
  1909. period_vtime = now.vnow - ioc->period_at_vtime;
  1910. if (WARN_ON_ONCE(!period_vtime)) {
  1911. spin_unlock_irq(&ioc->lock);
  1912. return;
  1913. }
  1914. nr_debtors = ioc_check_iocgs(ioc, &now);
  1915. /*
  1916. * Wait and indebt stat are flushed above and the donation calculation
  1917. * below needs updated usage stat. Let's bring stat up-to-date.
  1918. */
  1919. iocg_flush_stat(&ioc->active_iocgs, &now);
  1920. /* calc usage and see whether some weights need to be moved around */
  1921. list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
  1922. u64 vdone, vtime, usage_us;
  1923. u32 hw_active, hw_inuse;
  1924. /*
  1925. * Collect unused and wind vtime closer to vnow to prevent
  1926. * iocgs from accumulating a large amount of budget.
  1927. */
  1928. vdone = atomic64_read(&iocg->done_vtime);
  1929. vtime = atomic64_read(&iocg->vtime);
  1930. current_hweight(iocg, &hw_active, &hw_inuse);
  1931. /*
  1932. * Latency QoS detection doesn't account for IOs which are
  1933. * in-flight for longer than a period. Detect them by
  1934. * comparing vdone against period start. If lagging behind
  1935. * IOs from past periods, don't increase vrate.
  1936. */
  1937. if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
  1938. !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
  1939. time_after64(vtime, vdone) &&
  1940. time_after64(vtime, now.vnow -
  1941. MAX_LAGGING_PERIODS * period_vtime) &&
  1942. time_before64(vdone, now.vnow - period_vtime))
  1943. nr_lagging++;
  1944. /*
  1945. * Determine absolute usage factoring in in-flight IOs to avoid
  1946. * high-latency completions appearing as idle.
  1947. */
  1948. usage_us = iocg->usage_delta_us;
  1949. usage_us_sum += usage_us;
  1950. /* see whether there's surplus vtime */
  1951. WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
  1952. if (hw_inuse < hw_active ||
  1953. (!waitqueue_active(&iocg->waitq) &&
  1954. time_before64(vtime, now.vnow - ioc->margins.low))) {
  1955. u32 hwa, old_hwi, hwm, new_hwi, usage;
  1956. u64 usage_dur;
  1957. if (vdone != vtime) {
  1958. u64 inflight_us = DIV64_U64_ROUND_UP(
  1959. cost_to_abs_cost(vtime - vdone, hw_inuse),
  1960. ioc->vtime_base_rate);
  1961. usage_us = max(usage_us, inflight_us);
  1962. }
  1963. /* convert to hweight based usage ratio */
  1964. if (time_after64(iocg->activated_at, ioc->period_at))
  1965. usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
  1966. else
  1967. usage_dur = max_t(u64, now.now - ioc->period_at, 1);
  1968. usage = clamp_t(u32,
  1969. DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
  1970. usage_dur),
  1971. 1, WEIGHT_ONE);
  1972. /*
  1973. * Already donating or accumulated enough to start.
  1974. * Determine the donation amount.
  1975. */
  1976. current_hweight(iocg, &hwa, &old_hwi);
  1977. hwm = current_hweight_max(iocg);
  1978. new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
  1979. usage, &now);
  1980. /*
  1981. * Donation calculation assumes hweight_after_donation
  1982. * to be positive, a condition that a donor w/ hwa < 2
  1983. * can't meet. Don't bother with donation if hwa is
  1984. * below 2. It's not gonna make a meaningful difference
  1985. * anyway.
  1986. */
  1987. if (new_hwi < hwm && hwa >= 2) {
  1988. iocg->hweight_donating = hwa;
  1989. iocg->hweight_after_donation = new_hwi;
  1990. list_add(&iocg->surplus_list, &surpluses);
  1991. } else if (!iocg->abs_vdebt) {
  1992. /*
  1993. * @iocg doesn't have enough to donate. Reset
  1994. * its inuse to active.
  1995. *
  1996. * Don't reset debtors as their inuse's are
  1997. * owned by debt handling. This shouldn't affect
  1998. * donation calculuation in any meaningful way
  1999. * as @iocg doesn't have a meaningful amount of
  2000. * share anyway.
  2001. */
  2002. TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
  2003. iocg->inuse, iocg->active,
  2004. iocg->hweight_inuse, new_hwi);
  2005. __propagate_weights(iocg, iocg->active,
  2006. iocg->active, true, &now);
  2007. nr_shortages++;
  2008. }
  2009. } else {
  2010. /* genuinely short on vtime */
  2011. nr_shortages++;
  2012. }
  2013. }
  2014. if (!list_empty(&surpluses) && nr_shortages)
  2015. transfer_surpluses(&surpluses, &now);
  2016. commit_weights(ioc);
  2017. /* surplus list should be dissolved after use */
  2018. list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
  2019. list_del_init(&iocg->surplus_list);
  2020. /*
  2021. * If q is getting clogged or we're missing too much, we're issuing
  2022. * too much IO and should lower vtime rate. If we're not missing
  2023. * and experiencing shortages but not surpluses, we're too stingy
  2024. * and should increase vtime rate.
  2025. */
  2026. prev_busy_level = ioc->busy_level;
  2027. if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
  2028. missed_ppm[READ] > ppm_rthr ||
  2029. missed_ppm[WRITE] > ppm_wthr) {
  2030. /* clearly missing QoS targets, slow down vrate */
  2031. ioc->busy_level = max(ioc->busy_level, 0);
  2032. ioc->busy_level++;
  2033. } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
  2034. missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
  2035. missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
  2036. /* QoS targets are being met with >25% margin */
  2037. if (nr_shortages) {
  2038. /*
  2039. * We're throttling while the device has spare
  2040. * capacity. If vrate was being slowed down, stop.
  2041. */
  2042. ioc->busy_level = min(ioc->busy_level, 0);
  2043. /*
  2044. * If there are IOs spanning multiple periods, wait
  2045. * them out before pushing the device harder.
  2046. */
  2047. if (!nr_lagging)
  2048. ioc->busy_level--;
  2049. } else {
  2050. /*
  2051. * Nobody is being throttled and the users aren't
  2052. * issuing enough IOs to saturate the device. We
  2053. * simply don't know how close the device is to
  2054. * saturation. Coast.
  2055. */
  2056. ioc->busy_level = 0;
  2057. }
  2058. } else {
  2059. /* inside the hysterisis margin, we're good */
  2060. ioc->busy_level = 0;
  2061. }
  2062. ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
  2063. ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
  2064. prev_busy_level, missed_ppm);
  2065. ioc_refresh_params(ioc, false);
  2066. ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
  2067. /*
  2068. * This period is done. Move onto the next one. If nothing's
  2069. * going on with the device, stop the timer.
  2070. */
  2071. atomic64_inc(&ioc->cur_period);
  2072. if (ioc->running != IOC_STOP) {
  2073. if (!list_empty(&ioc->active_iocgs)) {
  2074. ioc_start_period(ioc, &now);
  2075. } else {
  2076. ioc->busy_level = 0;
  2077. ioc->vtime_err = 0;
  2078. ioc->running = IOC_IDLE;
  2079. }
  2080. ioc_refresh_vrate(ioc, &now);
  2081. }
  2082. spin_unlock_irq(&ioc->lock);
  2083. }
  2084. static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
  2085. u64 abs_cost, struct ioc_now *now)
  2086. {
  2087. struct ioc *ioc = iocg->ioc;
  2088. struct ioc_margins *margins = &ioc->margins;
  2089. u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
  2090. u32 hwi, adj_step;
  2091. s64 margin;
  2092. u64 cost, new_inuse;
  2093. unsigned long flags;
  2094. current_hweight(iocg, NULL, &hwi);
  2095. old_hwi = hwi;
  2096. cost = abs_cost_to_cost(abs_cost, hwi);
  2097. margin = now->vnow - vtime - cost;
  2098. /* debt handling owns inuse for debtors */
  2099. if (iocg->abs_vdebt)
  2100. return cost;
  2101. /*
  2102. * We only increase inuse during period and do so if the margin has
  2103. * deteriorated since the previous adjustment.
  2104. */
  2105. if (margin >= iocg->saved_margin || margin >= margins->low ||
  2106. iocg->inuse == iocg->active)
  2107. return cost;
  2108. spin_lock_irqsave(&ioc->lock, flags);
  2109. /* we own inuse only when @iocg is in the normal active state */
  2110. if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
  2111. spin_unlock_irqrestore(&ioc->lock, flags);
  2112. return cost;
  2113. }
  2114. /*
  2115. * Bump up inuse till @abs_cost fits in the existing budget.
  2116. * adj_step must be determined after acquiring ioc->lock - we might
  2117. * have raced and lost to another thread for activation and could
  2118. * be reading 0 iocg->active before ioc->lock which will lead to
  2119. * infinite loop.
  2120. */
  2121. new_inuse = iocg->inuse;
  2122. adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
  2123. do {
  2124. new_inuse = new_inuse + adj_step;
  2125. propagate_weights(iocg, iocg->active, new_inuse, true, now);
  2126. current_hweight(iocg, NULL, &hwi);
  2127. cost = abs_cost_to_cost(abs_cost, hwi);
  2128. } while (time_after64(vtime + cost, now->vnow) &&
  2129. iocg->inuse != iocg->active);
  2130. spin_unlock_irqrestore(&ioc->lock, flags);
  2131. TRACE_IOCG_PATH(inuse_adjust, iocg, now,
  2132. old_inuse, iocg->inuse, old_hwi, hwi);
  2133. return cost;
  2134. }
  2135. static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
  2136. bool is_merge, u64 *costp)
  2137. {
  2138. struct ioc *ioc = iocg->ioc;
  2139. u64 coef_seqio, coef_randio, coef_page;
  2140. u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
  2141. u64 seek_pages = 0;
  2142. u64 cost = 0;
  2143. switch (bio_op(bio)) {
  2144. case REQ_OP_READ:
  2145. coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
  2146. coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
  2147. coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
  2148. break;
  2149. case REQ_OP_WRITE:
  2150. coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
  2151. coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
  2152. coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
  2153. break;
  2154. default:
  2155. goto out;
  2156. }
  2157. if (iocg->cursor) {
  2158. seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
  2159. seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
  2160. }
  2161. if (!is_merge) {
  2162. if (seek_pages > LCOEF_RANDIO_PAGES) {
  2163. cost += coef_randio;
  2164. } else {
  2165. cost += coef_seqio;
  2166. }
  2167. }
  2168. cost += pages * coef_page;
  2169. out:
  2170. *costp = cost;
  2171. }
  2172. static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
  2173. {
  2174. u64 cost;
  2175. calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
  2176. return cost;
  2177. }
  2178. static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
  2179. u64 *costp)
  2180. {
  2181. unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
  2182. switch (req_op(rq)) {
  2183. case REQ_OP_READ:
  2184. *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
  2185. break;
  2186. case REQ_OP_WRITE:
  2187. *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
  2188. break;
  2189. default:
  2190. *costp = 0;
  2191. }
  2192. }
  2193. static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
  2194. {
  2195. u64 cost;
  2196. calc_size_vtime_cost_builtin(rq, ioc, &cost);
  2197. return cost;
  2198. }
  2199. static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
  2200. {
  2201. struct blkcg_gq *blkg = bio->bi_blkg;
  2202. struct ioc *ioc = rqos_to_ioc(rqos);
  2203. struct ioc_gq *iocg = blkg_to_iocg(blkg);
  2204. struct ioc_now now;
  2205. struct iocg_wait wait;
  2206. u64 abs_cost, cost, vtime;
  2207. bool use_debt, ioc_locked;
  2208. unsigned long flags;
  2209. /* bypass IOs if disabled, still initializing, or for root cgroup */
  2210. if (!ioc->enabled || !iocg || !iocg->level)
  2211. return;
  2212. /* calculate the absolute vtime cost */
  2213. abs_cost = calc_vtime_cost(bio, iocg, false);
  2214. if (!abs_cost)
  2215. return;
  2216. if (!iocg_activate(iocg, &now))
  2217. return;
  2218. iocg->cursor = bio_end_sector(bio);
  2219. vtime = atomic64_read(&iocg->vtime);
  2220. cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
  2221. /*
  2222. * If no one's waiting and within budget, issue right away. The
  2223. * tests are racy but the races aren't systemic - we only miss once
  2224. * in a while which is fine.
  2225. */
  2226. if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
  2227. time_before_eq64(vtime + cost, now.vnow)) {
  2228. iocg_commit_bio(iocg, bio, abs_cost, cost);
  2229. return;
  2230. }
  2231. /*
  2232. * We're over budget. This can be handled in two ways. IOs which may
  2233. * cause priority inversions are punted to @ioc->aux_iocg and charged as
  2234. * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
  2235. * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
  2236. * whether debt handling is needed and acquire locks accordingly.
  2237. */
  2238. use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
  2239. ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
  2240. retry_lock:
  2241. iocg_lock(iocg, ioc_locked, &flags);
  2242. /*
  2243. * @iocg must stay activated for debt and waitq handling. Deactivation
  2244. * is synchronized against both ioc->lock and waitq.lock and we won't
  2245. * get deactivated as long as we're waiting or has debt, so we're good
  2246. * if we're activated here. In the unlikely cases that we aren't, just
  2247. * issue the IO.
  2248. */
  2249. if (unlikely(list_empty(&iocg->active_list))) {
  2250. iocg_unlock(iocg, ioc_locked, &flags);
  2251. iocg_commit_bio(iocg, bio, abs_cost, cost);
  2252. return;
  2253. }
  2254. /*
  2255. * We're over budget. If @bio has to be issued regardless, remember
  2256. * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
  2257. * off the debt before waking more IOs.
  2258. *
  2259. * This way, the debt is continuously paid off each period with the
  2260. * actual budget available to the cgroup. If we just wound vtime, we
  2261. * would incorrectly use the current hw_inuse for the entire amount
  2262. * which, for example, can lead to the cgroup staying blocked for a
  2263. * long time even with substantially raised hw_inuse.
  2264. *
  2265. * An iocg with vdebt should stay online so that the timer can keep
  2266. * deducting its vdebt and [de]activate use_delay mechanism
  2267. * accordingly. We don't want to race against the timer trying to
  2268. * clear them and leave @iocg inactive w/ dangling use_delay heavily
  2269. * penalizing the cgroup and its descendants.
  2270. */
  2271. if (use_debt) {
  2272. iocg_incur_debt(iocg, abs_cost, &now);
  2273. if (iocg_kick_delay(iocg, &now))
  2274. blkcg_schedule_throttle(rqos->q->disk,
  2275. (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
  2276. iocg_unlock(iocg, ioc_locked, &flags);
  2277. return;
  2278. }
  2279. /* guarantee that iocgs w/ waiters have maximum inuse */
  2280. if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
  2281. if (!ioc_locked) {
  2282. iocg_unlock(iocg, false, &flags);
  2283. ioc_locked = true;
  2284. goto retry_lock;
  2285. }
  2286. propagate_weights(iocg, iocg->active, iocg->active, true,
  2287. &now);
  2288. }
  2289. /*
  2290. * Append self to the waitq and schedule the wakeup timer if we're
  2291. * the first waiter. The timer duration is calculated based on the
  2292. * current vrate. vtime and hweight changes can make it too short
  2293. * or too long. Each wait entry records the absolute cost it's
  2294. * waiting for to allow re-evaluation using a custom wait entry.
  2295. *
  2296. * If too short, the timer simply reschedules itself. If too long,
  2297. * the period timer will notice and trigger wakeups.
  2298. *
  2299. * All waiters are on iocg->waitq and the wait states are
  2300. * synchronized using waitq.lock.
  2301. */
  2302. init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
  2303. wait.wait.private = current;
  2304. wait.bio = bio;
  2305. wait.abs_cost = abs_cost;
  2306. wait.committed = false; /* will be set true by waker */
  2307. __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
  2308. iocg_kick_waitq(iocg, ioc_locked, &now);
  2309. iocg_unlock(iocg, ioc_locked, &flags);
  2310. while (true) {
  2311. set_current_state(TASK_UNINTERRUPTIBLE);
  2312. if (wait.committed)
  2313. break;
  2314. io_schedule();
  2315. }
  2316. /* waker already committed us, proceed */
  2317. finish_wait(&iocg->waitq, &wait.wait);
  2318. }
  2319. static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
  2320. struct bio *bio)
  2321. {
  2322. struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
  2323. struct ioc *ioc = rqos_to_ioc(rqos);
  2324. sector_t bio_end = bio_end_sector(bio);
  2325. struct ioc_now now;
  2326. u64 vtime, abs_cost, cost;
  2327. unsigned long flags;
  2328. /* bypass if disabled, still initializing, or for root cgroup */
  2329. if (!ioc->enabled || !iocg || !iocg->level)
  2330. return;
  2331. abs_cost = calc_vtime_cost(bio, iocg, true);
  2332. if (!abs_cost)
  2333. return;
  2334. ioc_now(ioc, &now);
  2335. vtime = atomic64_read(&iocg->vtime);
  2336. cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
  2337. /* update cursor if backmerging into the request at the cursor */
  2338. if (blk_rq_pos(rq) < bio_end &&
  2339. blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
  2340. iocg->cursor = bio_end;
  2341. /*
  2342. * Charge if there's enough vtime budget and the existing request has
  2343. * cost assigned.
  2344. */
  2345. if (rq->bio && rq->bio->bi_iocost_cost &&
  2346. time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
  2347. iocg_commit_bio(iocg, bio, abs_cost, cost);
  2348. return;
  2349. }
  2350. /*
  2351. * Otherwise, account it as debt if @iocg is online, which it should
  2352. * be for the vast majority of cases. See debt handling in
  2353. * ioc_rqos_throttle() for details.
  2354. */
  2355. spin_lock_irqsave(&ioc->lock, flags);
  2356. spin_lock(&iocg->waitq.lock);
  2357. if (likely(!list_empty(&iocg->active_list))) {
  2358. iocg_incur_debt(iocg, abs_cost, &now);
  2359. if (iocg_kick_delay(iocg, &now))
  2360. blkcg_schedule_throttle(rqos->q->disk,
  2361. (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
  2362. } else {
  2363. iocg_commit_bio(iocg, bio, abs_cost, cost);
  2364. }
  2365. spin_unlock(&iocg->waitq.lock);
  2366. spin_unlock_irqrestore(&ioc->lock, flags);
  2367. }
  2368. static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
  2369. {
  2370. struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
  2371. if (iocg && bio->bi_iocost_cost)
  2372. atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
  2373. }
  2374. static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
  2375. {
  2376. struct ioc *ioc = rqos_to_ioc(rqos);
  2377. struct ioc_pcpu_stat *ccs;
  2378. u64 on_q_ns, rq_wait_ns, size_nsec;
  2379. int pidx, rw;
  2380. if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
  2381. return;
  2382. switch (req_op(rq)) {
  2383. case REQ_OP_READ:
  2384. pidx = QOS_RLAT;
  2385. rw = READ;
  2386. break;
  2387. case REQ_OP_WRITE:
  2388. pidx = QOS_WLAT;
  2389. rw = WRITE;
  2390. break;
  2391. default:
  2392. return;
  2393. }
  2394. on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
  2395. rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
  2396. size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
  2397. ccs = get_cpu_ptr(ioc->pcpu_stat);
  2398. if (on_q_ns <= size_nsec ||
  2399. on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
  2400. local_inc(&ccs->missed[rw].nr_met);
  2401. else
  2402. local_inc(&ccs->missed[rw].nr_missed);
  2403. local64_add(rq_wait_ns, &ccs->rq_wait_ns);
  2404. put_cpu_ptr(ccs);
  2405. }
  2406. static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
  2407. {
  2408. struct ioc *ioc = rqos_to_ioc(rqos);
  2409. spin_lock_irq(&ioc->lock);
  2410. ioc_refresh_params(ioc, false);
  2411. spin_unlock_irq(&ioc->lock);
  2412. }
  2413. static void ioc_rqos_exit(struct rq_qos *rqos)
  2414. {
  2415. struct ioc *ioc = rqos_to_ioc(rqos);
  2416. blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
  2417. spin_lock_irq(&ioc->lock);
  2418. ioc->running = IOC_STOP;
  2419. spin_unlock_irq(&ioc->lock);
  2420. del_timer_sync(&ioc->timer);
  2421. free_percpu(ioc->pcpu_stat);
  2422. kfree(ioc);
  2423. }
  2424. static struct rq_qos_ops ioc_rqos_ops = {
  2425. .throttle = ioc_rqos_throttle,
  2426. .merge = ioc_rqos_merge,
  2427. .done_bio = ioc_rqos_done_bio,
  2428. .done = ioc_rqos_done,
  2429. .queue_depth_changed = ioc_rqos_queue_depth_changed,
  2430. .exit = ioc_rqos_exit,
  2431. };
  2432. static int blk_iocost_init(struct gendisk *disk)
  2433. {
  2434. struct request_queue *q = disk->queue;
  2435. struct ioc *ioc;
  2436. struct rq_qos *rqos;
  2437. int i, cpu, ret;
  2438. ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
  2439. if (!ioc)
  2440. return -ENOMEM;
  2441. ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
  2442. if (!ioc->pcpu_stat) {
  2443. kfree(ioc);
  2444. return -ENOMEM;
  2445. }
  2446. for_each_possible_cpu(cpu) {
  2447. struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
  2448. for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
  2449. local_set(&ccs->missed[i].nr_met, 0);
  2450. local_set(&ccs->missed[i].nr_missed, 0);
  2451. }
  2452. local64_set(&ccs->rq_wait_ns, 0);
  2453. }
  2454. rqos = &ioc->rqos;
  2455. rqos->id = RQ_QOS_COST;
  2456. rqos->ops = &ioc_rqos_ops;
  2457. rqos->q = q;
  2458. spin_lock_init(&ioc->lock);
  2459. timer_setup(&ioc->timer, ioc_timer_fn, 0);
  2460. INIT_LIST_HEAD(&ioc->active_iocgs);
  2461. ioc->running = IOC_IDLE;
  2462. ioc->vtime_base_rate = VTIME_PER_USEC;
  2463. atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
  2464. seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
  2465. ioc->period_at = ktime_to_us(ktime_get());
  2466. atomic64_set(&ioc->cur_period, 0);
  2467. atomic_set(&ioc->hweight_gen, 0);
  2468. spin_lock_irq(&ioc->lock);
  2469. ioc->autop_idx = AUTOP_INVALID;
  2470. ioc_refresh_params(ioc, true);
  2471. spin_unlock_irq(&ioc->lock);
  2472. /*
  2473. * rqos must be added before activation to allow iocg_pd_init() to
  2474. * lookup the ioc from q. This means that the rqos methods may get
  2475. * called before policy activation completion, can't assume that the
  2476. * target bio has an iocg associated and need to test for NULL iocg.
  2477. */
  2478. ret = rq_qos_add(q, rqos);
  2479. if (ret)
  2480. goto err_free_ioc;
  2481. ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
  2482. if (ret)
  2483. goto err_del_qos;
  2484. return 0;
  2485. err_del_qos:
  2486. rq_qos_del(q, rqos);
  2487. err_free_ioc:
  2488. free_percpu(ioc->pcpu_stat);
  2489. kfree(ioc);
  2490. return ret;
  2491. }
  2492. static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
  2493. {
  2494. struct ioc_cgrp *iocc;
  2495. iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
  2496. if (!iocc)
  2497. return NULL;
  2498. iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
  2499. return &iocc->cpd;
  2500. }
  2501. static void ioc_cpd_free(struct blkcg_policy_data *cpd)
  2502. {
  2503. kfree(container_of(cpd, struct ioc_cgrp, cpd));
  2504. }
  2505. static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
  2506. struct blkcg *blkcg)
  2507. {
  2508. int levels = blkcg->css.cgroup->level + 1;
  2509. struct ioc_gq *iocg;
  2510. iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
  2511. if (!iocg)
  2512. return NULL;
  2513. iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
  2514. if (!iocg->pcpu_stat) {
  2515. kfree(iocg);
  2516. return NULL;
  2517. }
  2518. return &iocg->pd;
  2519. }
  2520. static void ioc_pd_init(struct blkg_policy_data *pd)
  2521. {
  2522. struct ioc_gq *iocg = pd_to_iocg(pd);
  2523. struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
  2524. struct ioc *ioc = q_to_ioc(blkg->q);
  2525. struct ioc_now now;
  2526. struct blkcg_gq *tblkg;
  2527. unsigned long flags;
  2528. ioc_now(ioc, &now);
  2529. iocg->ioc = ioc;
  2530. atomic64_set(&iocg->vtime, now.vnow);
  2531. atomic64_set(&iocg->done_vtime, now.vnow);
  2532. atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
  2533. INIT_LIST_HEAD(&iocg->active_list);
  2534. INIT_LIST_HEAD(&iocg->walk_list);
  2535. INIT_LIST_HEAD(&iocg->surplus_list);
  2536. iocg->hweight_active = WEIGHT_ONE;
  2537. iocg->hweight_inuse = WEIGHT_ONE;
  2538. init_waitqueue_head(&iocg->waitq);
  2539. hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
  2540. iocg->waitq_timer.function = iocg_waitq_timer_fn;
  2541. iocg->level = blkg->blkcg->css.cgroup->level;
  2542. for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
  2543. struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
  2544. iocg->ancestors[tiocg->level] = tiocg;
  2545. }
  2546. spin_lock_irqsave(&ioc->lock, flags);
  2547. weight_updated(iocg, &now);
  2548. spin_unlock_irqrestore(&ioc->lock, flags);
  2549. }
  2550. static void ioc_pd_free(struct blkg_policy_data *pd)
  2551. {
  2552. struct ioc_gq *iocg = pd_to_iocg(pd);
  2553. struct ioc *ioc = iocg->ioc;
  2554. unsigned long flags;
  2555. if (ioc) {
  2556. spin_lock_irqsave(&ioc->lock, flags);
  2557. if (!list_empty(&iocg->active_list)) {
  2558. struct ioc_now now;
  2559. ioc_now(ioc, &now);
  2560. propagate_weights(iocg, 0, 0, false, &now);
  2561. list_del_init(&iocg->active_list);
  2562. }
  2563. WARN_ON_ONCE(!list_empty(&iocg->walk_list));
  2564. WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
  2565. spin_unlock_irqrestore(&ioc->lock, flags);
  2566. hrtimer_cancel(&iocg->waitq_timer);
  2567. }
  2568. free_percpu(iocg->pcpu_stat);
  2569. kfree(iocg);
  2570. }
  2571. static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
  2572. {
  2573. struct ioc_gq *iocg = pd_to_iocg(pd);
  2574. struct ioc *ioc = iocg->ioc;
  2575. if (!ioc->enabled)
  2576. return;
  2577. if (iocg->level == 0) {
  2578. unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
  2579. ioc->vtime_base_rate * 10000,
  2580. VTIME_PER_USEC);
  2581. seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
  2582. }
  2583. seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
  2584. if (blkcg_debug_stats)
  2585. seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
  2586. iocg->last_stat.wait_us,
  2587. iocg->last_stat.indebt_us,
  2588. iocg->last_stat.indelay_us);
  2589. }
  2590. static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
  2591. int off)
  2592. {
  2593. const char *dname = blkg_dev_name(pd->blkg);
  2594. struct ioc_gq *iocg = pd_to_iocg(pd);
  2595. if (dname && iocg->cfg_weight)
  2596. seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
  2597. return 0;
  2598. }
  2599. static int ioc_weight_show(struct seq_file *sf, void *v)
  2600. {
  2601. struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
  2602. struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
  2603. seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
  2604. blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
  2605. &blkcg_policy_iocost, seq_cft(sf)->private, false);
  2606. return 0;
  2607. }
  2608. static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
  2609. size_t nbytes, loff_t off)
  2610. {
  2611. struct blkcg *blkcg = css_to_blkcg(of_css(of));
  2612. struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
  2613. struct blkg_conf_ctx ctx;
  2614. struct ioc_now now;
  2615. struct ioc_gq *iocg;
  2616. u32 v;
  2617. int ret;
  2618. if (!strchr(buf, ':')) {
  2619. struct blkcg_gq *blkg;
  2620. if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
  2621. return -EINVAL;
  2622. if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
  2623. return -EINVAL;
  2624. spin_lock_irq(&blkcg->lock);
  2625. iocc->dfl_weight = v * WEIGHT_ONE;
  2626. hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
  2627. struct ioc_gq *iocg = blkg_to_iocg(blkg);
  2628. if (iocg) {
  2629. spin_lock(&iocg->ioc->lock);
  2630. ioc_now(iocg->ioc, &now);
  2631. weight_updated(iocg, &now);
  2632. spin_unlock(&iocg->ioc->lock);
  2633. }
  2634. }
  2635. spin_unlock_irq(&blkcg->lock);
  2636. return nbytes;
  2637. }
  2638. ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
  2639. if (ret)
  2640. return ret;
  2641. iocg = blkg_to_iocg(ctx.blkg);
  2642. if (!strncmp(ctx.body, "default", 7)) {
  2643. v = 0;
  2644. } else {
  2645. if (!sscanf(ctx.body, "%u", &v))
  2646. goto einval;
  2647. if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
  2648. goto einval;
  2649. }
  2650. spin_lock(&iocg->ioc->lock);
  2651. iocg->cfg_weight = v * WEIGHT_ONE;
  2652. ioc_now(iocg->ioc, &now);
  2653. weight_updated(iocg, &now);
  2654. spin_unlock(&iocg->ioc->lock);
  2655. blkg_conf_finish(&ctx);
  2656. return nbytes;
  2657. einval:
  2658. blkg_conf_finish(&ctx);
  2659. return -EINVAL;
  2660. }
  2661. static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
  2662. int off)
  2663. {
  2664. const char *dname = blkg_dev_name(pd->blkg);
  2665. struct ioc *ioc = pd_to_iocg(pd)->ioc;
  2666. if (!dname)
  2667. return 0;
  2668. seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
  2669. dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
  2670. ioc->params.qos[QOS_RPPM] / 10000,
  2671. ioc->params.qos[QOS_RPPM] % 10000 / 100,
  2672. ioc->params.qos[QOS_RLAT],
  2673. ioc->params.qos[QOS_WPPM] / 10000,
  2674. ioc->params.qos[QOS_WPPM] % 10000 / 100,
  2675. ioc->params.qos[QOS_WLAT],
  2676. ioc->params.qos[QOS_MIN] / 10000,
  2677. ioc->params.qos[QOS_MIN] % 10000 / 100,
  2678. ioc->params.qos[QOS_MAX] / 10000,
  2679. ioc->params.qos[QOS_MAX] % 10000 / 100);
  2680. return 0;
  2681. }
  2682. static int ioc_qos_show(struct seq_file *sf, void *v)
  2683. {
  2684. struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
  2685. blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
  2686. &blkcg_policy_iocost, seq_cft(sf)->private, false);
  2687. return 0;
  2688. }
  2689. static const match_table_t qos_ctrl_tokens = {
  2690. { QOS_ENABLE, "enable=%u" },
  2691. { QOS_CTRL, "ctrl=%s" },
  2692. { NR_QOS_CTRL_PARAMS, NULL },
  2693. };
  2694. static const match_table_t qos_tokens = {
  2695. { QOS_RPPM, "rpct=%s" },
  2696. { QOS_RLAT, "rlat=%u" },
  2697. { QOS_WPPM, "wpct=%s" },
  2698. { QOS_WLAT, "wlat=%u" },
  2699. { QOS_MIN, "min=%s" },
  2700. { QOS_MAX, "max=%s" },
  2701. { NR_QOS_PARAMS, NULL },
  2702. };
  2703. static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
  2704. size_t nbytes, loff_t off)
  2705. {
  2706. struct block_device *bdev;
  2707. struct gendisk *disk;
  2708. struct ioc *ioc;
  2709. u32 qos[NR_QOS_PARAMS];
  2710. bool enable, user;
  2711. char *p;
  2712. int ret;
  2713. bdev = blkcg_conf_open_bdev(&input);
  2714. if (IS_ERR(bdev))
  2715. return PTR_ERR(bdev);
  2716. disk = bdev->bd_disk;
  2717. ioc = q_to_ioc(disk->queue);
  2718. if (!ioc) {
  2719. ret = blk_iocost_init(disk);
  2720. if (ret)
  2721. goto err;
  2722. ioc = q_to_ioc(disk->queue);
  2723. }
  2724. spin_lock_irq(&ioc->lock);
  2725. memcpy(qos, ioc->params.qos, sizeof(qos));
  2726. enable = ioc->enabled;
  2727. user = ioc->user_qos_params;
  2728. spin_unlock_irq(&ioc->lock);
  2729. while ((p = strsep(&input, " \t\n"))) {
  2730. substring_t args[MAX_OPT_ARGS];
  2731. char buf[32];
  2732. int tok;
  2733. s64 v;
  2734. if (!*p)
  2735. continue;
  2736. switch (match_token(p, qos_ctrl_tokens, args)) {
  2737. case QOS_ENABLE:
  2738. match_u64(&args[0], &v);
  2739. enable = v;
  2740. continue;
  2741. case QOS_CTRL:
  2742. match_strlcpy(buf, &args[0], sizeof(buf));
  2743. if (!strcmp(buf, "auto"))
  2744. user = false;
  2745. else if (!strcmp(buf, "user"))
  2746. user = true;
  2747. else
  2748. goto einval;
  2749. continue;
  2750. }
  2751. tok = match_token(p, qos_tokens, args);
  2752. switch (tok) {
  2753. case QOS_RPPM:
  2754. case QOS_WPPM:
  2755. if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
  2756. sizeof(buf))
  2757. goto einval;
  2758. if (cgroup_parse_float(buf, 2, &v))
  2759. goto einval;
  2760. if (v < 0 || v > 10000)
  2761. goto einval;
  2762. qos[tok] = v * 100;
  2763. break;
  2764. case QOS_RLAT:
  2765. case QOS_WLAT:
  2766. if (match_u64(&args[0], &v))
  2767. goto einval;
  2768. qos[tok] = v;
  2769. break;
  2770. case QOS_MIN:
  2771. case QOS_MAX:
  2772. if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
  2773. sizeof(buf))
  2774. goto einval;
  2775. if (cgroup_parse_float(buf, 2, &v))
  2776. goto einval;
  2777. if (v < 0)
  2778. goto einval;
  2779. qos[tok] = clamp_t(s64, v * 100,
  2780. VRATE_MIN_PPM, VRATE_MAX_PPM);
  2781. break;
  2782. default:
  2783. goto einval;
  2784. }
  2785. user = true;
  2786. }
  2787. if (qos[QOS_MIN] > qos[QOS_MAX])
  2788. goto einval;
  2789. spin_lock_irq(&ioc->lock);
  2790. if (enable) {
  2791. blk_stat_enable_accounting(disk->queue);
  2792. blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
  2793. ioc->enabled = true;
  2794. } else {
  2795. blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
  2796. ioc->enabled = false;
  2797. }
  2798. if (user) {
  2799. memcpy(ioc->params.qos, qos, sizeof(qos));
  2800. ioc->user_qos_params = true;
  2801. } else {
  2802. ioc->user_qos_params = false;
  2803. }
  2804. ioc_refresh_params(ioc, true);
  2805. spin_unlock_irq(&ioc->lock);
  2806. blkdev_put_no_open(bdev);
  2807. return nbytes;
  2808. einval:
  2809. ret = -EINVAL;
  2810. err:
  2811. blkdev_put_no_open(bdev);
  2812. return ret;
  2813. }
  2814. static u64 ioc_cost_model_prfill(struct seq_file *sf,
  2815. struct blkg_policy_data *pd, int off)
  2816. {
  2817. const char *dname = blkg_dev_name(pd->blkg);
  2818. struct ioc *ioc = pd_to_iocg(pd)->ioc;
  2819. u64 *u = ioc->params.i_lcoefs;
  2820. if (!dname)
  2821. return 0;
  2822. seq_printf(sf, "%s ctrl=%s model=linear "
  2823. "rbps=%llu rseqiops=%llu rrandiops=%llu "
  2824. "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
  2825. dname, ioc->user_cost_model ? "user" : "auto",
  2826. u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
  2827. u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
  2828. return 0;
  2829. }
  2830. static int ioc_cost_model_show(struct seq_file *sf, void *v)
  2831. {
  2832. struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
  2833. blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
  2834. &blkcg_policy_iocost, seq_cft(sf)->private, false);
  2835. return 0;
  2836. }
  2837. static const match_table_t cost_ctrl_tokens = {
  2838. { COST_CTRL, "ctrl=%s" },
  2839. { COST_MODEL, "model=%s" },
  2840. { NR_COST_CTRL_PARAMS, NULL },
  2841. };
  2842. static const match_table_t i_lcoef_tokens = {
  2843. { I_LCOEF_RBPS, "rbps=%u" },
  2844. { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
  2845. { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
  2846. { I_LCOEF_WBPS, "wbps=%u" },
  2847. { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
  2848. { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
  2849. { NR_I_LCOEFS, NULL },
  2850. };
  2851. static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
  2852. size_t nbytes, loff_t off)
  2853. {
  2854. struct block_device *bdev;
  2855. struct ioc *ioc;
  2856. u64 u[NR_I_LCOEFS];
  2857. bool user;
  2858. char *p;
  2859. int ret;
  2860. bdev = blkcg_conf_open_bdev(&input);
  2861. if (IS_ERR(bdev))
  2862. return PTR_ERR(bdev);
  2863. ioc = q_to_ioc(bdev_get_queue(bdev));
  2864. if (!ioc) {
  2865. ret = blk_iocost_init(bdev->bd_disk);
  2866. if (ret)
  2867. goto err;
  2868. ioc = q_to_ioc(bdev_get_queue(bdev));
  2869. }
  2870. spin_lock_irq(&ioc->lock);
  2871. memcpy(u, ioc->params.i_lcoefs, sizeof(u));
  2872. user = ioc->user_cost_model;
  2873. spin_unlock_irq(&ioc->lock);
  2874. while ((p = strsep(&input, " \t\n"))) {
  2875. substring_t args[MAX_OPT_ARGS];
  2876. char buf[32];
  2877. int tok;
  2878. u64 v;
  2879. if (!*p)
  2880. continue;
  2881. switch (match_token(p, cost_ctrl_tokens, args)) {
  2882. case COST_CTRL:
  2883. match_strlcpy(buf, &args[0], sizeof(buf));
  2884. if (!strcmp(buf, "auto"))
  2885. user = false;
  2886. else if (!strcmp(buf, "user"))
  2887. user = true;
  2888. else
  2889. goto einval;
  2890. continue;
  2891. case COST_MODEL:
  2892. match_strlcpy(buf, &args[0], sizeof(buf));
  2893. if (strcmp(buf, "linear"))
  2894. goto einval;
  2895. continue;
  2896. }
  2897. tok = match_token(p, i_lcoef_tokens, args);
  2898. if (tok == NR_I_LCOEFS)
  2899. goto einval;
  2900. if (match_u64(&args[0], &v))
  2901. goto einval;
  2902. u[tok] = v;
  2903. user = true;
  2904. }
  2905. spin_lock_irq(&ioc->lock);
  2906. if (user) {
  2907. memcpy(ioc->params.i_lcoefs, u, sizeof(u));
  2908. ioc->user_cost_model = true;
  2909. } else {
  2910. ioc->user_cost_model = false;
  2911. }
  2912. ioc_refresh_params(ioc, true);
  2913. spin_unlock_irq(&ioc->lock);
  2914. blkdev_put_no_open(bdev);
  2915. return nbytes;
  2916. einval:
  2917. ret = -EINVAL;
  2918. err:
  2919. blkdev_put_no_open(bdev);
  2920. return ret;
  2921. }
  2922. static struct cftype ioc_files[] = {
  2923. {
  2924. .name = "weight",
  2925. .flags = CFTYPE_NOT_ON_ROOT,
  2926. .seq_show = ioc_weight_show,
  2927. .write = ioc_weight_write,
  2928. },
  2929. {
  2930. .name = "cost.qos",
  2931. .flags = CFTYPE_ONLY_ON_ROOT,
  2932. .seq_show = ioc_qos_show,
  2933. .write = ioc_qos_write,
  2934. },
  2935. {
  2936. .name = "cost.model",
  2937. .flags = CFTYPE_ONLY_ON_ROOT,
  2938. .seq_show = ioc_cost_model_show,
  2939. .write = ioc_cost_model_write,
  2940. },
  2941. {}
  2942. };
  2943. static struct blkcg_policy blkcg_policy_iocost = {
  2944. .dfl_cftypes = ioc_files,
  2945. .cpd_alloc_fn = ioc_cpd_alloc,
  2946. .cpd_free_fn = ioc_cpd_free,
  2947. .pd_alloc_fn = ioc_pd_alloc,
  2948. .pd_init_fn = ioc_pd_init,
  2949. .pd_free_fn = ioc_pd_free,
  2950. .pd_stat_fn = ioc_pd_stat,
  2951. };
  2952. static int __init ioc_init(void)
  2953. {
  2954. return blkcg_policy_register(&blkcg_policy_iocost);
  2955. }
  2956. static void __exit ioc_exit(void)
  2957. {
  2958. blkcg_policy_unregister(&blkcg_policy_iocost);
  2959. }
  2960. module_init(ioc_init);
  2961. module_exit(ioc_exit);