timer.c

  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Kernel internal timers
  4. *
  5. * Copyright (C) 1991, 1992 Linus Torvalds
  6. *
  7. * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better.
  8. *
  9. * 1997-09-10 Updated NTP code according to technical memorandum Jan '96
  10. * "A Kernel Model for Precision Timekeeping" by Dave Mills
  11. * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to
  12. * serialize accesses to xtime/lost_ticks).
  13. * Copyright (C) 1998 Andrea Arcangeli
  14. * 1999-03-10 Improved NTP compatibility by Ulrich Windl
  15. * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love
  16. * 2000-10-05 Implemented scalable SMP per-CPU timer handling.
  17. * Copyright (C) 2000, 2001, 2002 Ingo Molnar
  18. * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar
  19. */
  20. #include <linux/kernel_stat.h>
  21. #include <linux/export.h>
  22. #include <linux/interrupt.h>
  23. #include <linux/percpu.h>
  24. #include <linux/init.h>
  25. #include <linux/mm.h>
  26. #include <linux/swap.h>
  27. #include <linux/pid_namespace.h>
  28. #include <linux/notifier.h>
  29. #include <linux/thread_info.h>
  30. #include <linux/time.h>
  31. #include <linux/jiffies.h>
  32. #include <linux/posix-timers.h>
  33. #include <linux/cpu.h>
  34. #include <linux/syscalls.h>
  35. #include <linux/delay.h>
  36. #include <linux/tick.h>
  37. #include <linux/kallsyms.h>
  38. #include <linux/irq_work.h>
  39. #include <linux/sched/signal.h>
  40. #include <linux/sched/sysctl.h>
  41. #include <linux/sched/nohz.h>
  42. #include <linux/sched/debug.h>
  43. #include <linux/slab.h>
  44. #include <linux/compat.h>
  45. #include <linux/random.h>
  46. #include <linux/sysctl.h>
  47. #include <linux/uaccess.h>
  48. #include <asm/unistd.h>
  49. #include <asm/div64.h>
  50. #include <asm/timex.h>
  51. #include <asm/io.h>
  52. #include "tick-internal.h"
  53. #define CREATE_TRACE_POINTS
  54. #include <trace/events/timer.h>
  55. #undef CREATE_TRACE_POINTS
  56. #include <trace/hooks/timer.h>
  57. EXPORT_TRACEPOINT_SYMBOL_GPL(hrtimer_expire_entry);
  58. EXPORT_TRACEPOINT_SYMBOL_GPL(hrtimer_expire_exit);
  59. __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
  60. EXPORT_SYMBOL(jiffies_64);
  61. /*
  62. * The timer wheel has LVL_DEPTH array levels. Each level provides an array of
  63. * LVL_SIZE buckets. Each level is driven by its own clock and therefore each
  64. * level has a different granularity.
  65. *
  66. * The level granularity is: LVL_CLK_DIV ^ lvl
  67. * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level)
  68. *
  69. * The array level of a newly armed timer depends on the relative expiry
  70. * time. The farther away the expiry time is, the higher the array level
  71. * and therefore the coarser the granularity becomes.
  72. *
  73. * Contrary to the original timer wheel implementation, which aims for 'exact'
  74. * expiry of the timers, this implementation removes the need for recascading
  75. * the timers into the lower array levels. The previous 'classic' timer wheel
  76. * implementation of the kernel already violated the 'exact' expiry by adding
  77. * slack to the expiry time to provide batched expiration. The granularity
  78. * levels provide implicit batching.
  79. *
  80. * This is an optimization of the original timer wheel implementation for the
  81. * majority of the timer wheel use cases: timeouts. The vast majority of
  82. * timeout timers (networking, disk I/O ...) are canceled before expiry. If
  83. * the timeout expires it indicates that normal operation is disturbed, so it
  84. * does not matter much whether the timeout comes with a slight delay.
  85. *
  86. * The only exception to this are networking timers with a small expiry
  87. * time. They rely on the granularity. Those fit into the first wheel level,
  88. * which has HZ granularity.
  89. *
  90. * We don't have cascading anymore. Timers with an expiry time above the
  91. * capacity of the last wheel level are force expired at the maximum timeout
  92. * value of the last wheel level. From data sampling we know that the maximum
  93. * value observed is 5 days (network connection tracking), so this should not
  94. * be an issue.
  95. *
  96. * The currently chosen array constants are a good compromise between
  97. * array size and granularity.
  98. *
  99. * This results in the following granularity and range levels:
  100. *
  101. * HZ 1000 steps
  102. * Level Offset Granularity Range
  103. * 0 0 1 ms 0 ms - 63 ms
  104. * 1 64 8 ms 64 ms - 511 ms
  105. * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s)
  106. * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s)
  107. * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m)
  108. * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m)
  109. * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h)
  110. * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d)
  111. * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d)
  112. *
  113. * HZ 300
  114. * Level Offset Granularity Range
  115. * 0 0 3 ms 0 ms - 210 ms
  116. * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s)
  117. * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s)
  118. * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m)
  119. * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m)
  120. * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h)
  121. * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h)
  122. * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d)
  123. * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d)
  124. *
  125. * HZ 250
  126. * Level Offset Granularity Range
  127. * 0 0 4 ms 0 ms - 255 ms
  128. * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s)
  129. * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s)
  130. * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m)
  131. * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m)
  132. * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h)
  133. * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h)
  134. * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d)
  135. * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d)
  136. *
  137. * HZ 100
  138. * Level Offset Granularity Range
  139. * 0 0 10 ms 0 ms - 630 ms
  140. * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s)
  141. * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s)
  142. * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m)
  143. * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m)
  144. * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h)
  145. * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d)
  146. * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d)
  147. */
  148. /* Clock divisor for the next level */
  149. #define LVL_CLK_SHIFT 3
  150. #define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT)
  151. #define LVL_CLK_MASK (LVL_CLK_DIV - 1)
  152. #define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT)
  153. #define LVL_GRAN(n) (1UL << LVL_SHIFT(n))
  154. /*
  155. * The time start value for each level to select the bucket at enqueue
  156. * time. We start from the last possible delta of the previous level
  157. * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()).
  158. */
  159. #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT))
  160. /* Size of each clock level */
  161. #define LVL_BITS 6
  162. #define LVL_SIZE (1UL << LVL_BITS)
  163. #define LVL_MASK (LVL_SIZE - 1)
  164. #define LVL_OFFS(n) ((n) * LVL_SIZE)
  165. /* Level depth */
  166. #if HZ > 100
  167. # define LVL_DEPTH 9
  168. # else
  169. # define LVL_DEPTH 8
  170. #endif
  171. /* The cutoff (max. capacity of the wheel) */
  172. #define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH))
  173. #define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1))
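/*
 * Illustration only (not from the original source): how the granularity
 * column of the tables above falls out of the macros. With LVL_CLK_SHIFT == 3
 * and LVL_BITS == 6, LVL_GRAN(n) is 1 << (3 * n) jiffies, i.e. at HZ=1000
 * the levels have 1, 8, 64, 512, ... jiffies granularity. A minimal sketch
 * that would dump the per-level constants:
 *
 *	unsigned int lvl;
 *
 *	for (lvl = 0; lvl < LVL_DEPTH; lvl++)
 *		pr_info("level %u: granularity %lu jiffies, enqueue threshold %lu\n",
 *			lvl, LVL_GRAN(lvl), lvl ? LVL_START(lvl) : 0UL);
 */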
  174. /*
  175. * The resulting wheel size. If NOHZ is configured we allocate two
  176. * wheels so we have a separate storage for the deferrable timers.
  177. */
  178. #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH)
  179. #ifdef CONFIG_NO_HZ_COMMON
  180. # define NR_BASES 2
  181. # define BASE_STD 0
  182. # define BASE_DEF 1
  183. #else
  184. # define NR_BASES 1
  185. # define BASE_STD 0
  186. # define BASE_DEF 0
  187. #endif
  188. struct timer_base {
  189. raw_spinlock_t lock;
  190. struct timer_list *running_timer;
  191. #ifdef CONFIG_PREEMPT_RT
  192. spinlock_t expiry_lock;
  193. atomic_t timer_waiters;
  194. #endif
  195. unsigned long clk;
  196. unsigned long next_expiry;
  197. unsigned int cpu;
  198. bool next_expiry_recalc;
  199. bool is_idle;
  200. bool timers_pending;
  201. DECLARE_BITMAP(pending_map, WHEEL_SIZE);
  202. struct hlist_head vectors[WHEEL_SIZE];
  203. } ____cacheline_aligned;
  204. static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
  205. #ifdef CONFIG_NO_HZ_COMMON
  206. static DEFINE_STATIC_KEY_FALSE(timers_nohz_active);
  207. static DEFINE_MUTEX(timer_keys_mutex);
  208. static void timer_update_keys(struct work_struct *work);
  209. static DECLARE_WORK(timer_update_work, timer_update_keys);
  210. #ifdef CONFIG_SMP
  211. static unsigned int sysctl_timer_migration = 1;
  212. DEFINE_STATIC_KEY_FALSE(timers_migration_enabled);
  213. static void timers_update_migration(void)
  214. {
  215. if (sysctl_timer_migration && tick_nohz_active)
  216. static_branch_enable(&timers_migration_enabled);
  217. else
  218. static_branch_disable(&timers_migration_enabled);
  219. }
  220. #ifdef CONFIG_SYSCTL
  221. static int timer_migration_handler(struct ctl_table *table, int write,
  222. void *buffer, size_t *lenp, loff_t *ppos)
  223. {
  224. int ret;
  225. mutex_lock(&timer_keys_mutex);
  226. ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
  227. if (!ret && write)
  228. timers_update_migration();
  229. mutex_unlock(&timer_keys_mutex);
  230. return ret;
  231. }
  232. static struct ctl_table timer_sysctl[] = {
  233. {
  234. .procname = "timer_migration",
  235. .data = &sysctl_timer_migration,
  236. .maxlen = sizeof(unsigned int),
  237. .mode = 0644,
  238. .proc_handler = timer_migration_handler,
  239. .extra1 = SYSCTL_ZERO,
  240. .extra2 = SYSCTL_ONE,
  241. },
  242. {}
  243. };
  244. static int __init timer_sysctl_init(void)
  245. {
  246. register_sysctl("kernel", timer_sysctl);
  247. return 0;
  248. }
  249. device_initcall(timer_sysctl_init);
  250. #endif /* CONFIG_SYSCTL */
  251. #else /* CONFIG_SMP */
  252. static inline void timers_update_migration(void) { }
  253. #endif /* !CONFIG_SMP */
  254. static void timer_update_keys(struct work_struct *work)
  255. {
  256. mutex_lock(&timer_keys_mutex);
  257. timers_update_migration();
  258. static_branch_enable(&timers_nohz_active);
  259. mutex_unlock(&timer_keys_mutex);
  260. }
  261. void timers_update_nohz(void)
  262. {
  263. schedule_work(&timer_update_work);
  264. }
  265. static inline bool is_timers_nohz_active(void)
  266. {
  267. return static_branch_unlikely(&timers_nohz_active);
  268. }
  269. #else
  270. static inline bool is_timers_nohz_active(void) { return false; }
  271. #endif /* NO_HZ_COMMON */
  272. static unsigned long round_jiffies_common(unsigned long j, int cpu,
  273. bool force_up)
  274. {
  275. int rem;
  276. unsigned long original = j;
  277. /*
  278. * We don't want all cpus firing their timers at once hitting the
  279. * same lock or cachelines, so we skew each extra cpu with an extra
  280. * 3 jiffies. This 3 jiffies came originally from the mm/ code which
  281. * already did this.
  282. * The skew is done by adding 3*cpunr, then round, then subtract this
  283. * extra offset again.
  284. */
  285. j += cpu * 3;
  286. rem = j % HZ;
  287. /*
  288. * If the target jiffie is just after a whole second (which can happen
  289. * due to delays of the timer irq, long irq off times etc etc) then
  290. * we should round down to the whole second, not up. Use 1/4th second
  291. * as cutoff for this rounding as an extreme upper bound for this.
  292. * But never round down if @force_up is set.
  293. */
  294. if (rem < HZ/4 && !force_up) /* round down */
  295. j = j - rem;
  296. else /* round up */
  297. j = j - rem + HZ;
  298. /* now that we have rounded, subtract the extra skew again */
  299. j -= cpu * 3;
  300. /*
  301. * Make sure j is still in the future. Otherwise return the
  302. * unmodified value.
  303. */
  304. return time_is_after_jiffies(j) ? j : original;
  305. }
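/*
 * Worked example (illustration only, HZ=1000, force_up == false, cpu == 0):
 * j = 5230 gives rem = 230 < HZ/4, so round_jiffies_common() rounds down to
 * 5000; j = 5400 gives rem = 400 >= HZ/4, so it rounds up to 6000. With
 * cpu == 1 the same arithmetic is applied to j + 3 and the skew of 3 is
 * subtracted again afterwards, so different CPUs land on slightly different
 * values rather than all firing on the same jiffy. The result is only used
 * if it is still in the future; otherwise the original value is returned.
 */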
  306. /**
  307. * __round_jiffies - function to round jiffies to a full second
  308. * @j: the time in (absolute) jiffies that should be rounded
  309. * @cpu: the processor number on which the timeout will happen
  310. *
  311. * __round_jiffies() rounds an absolute time in the future (in jiffies)
  312. * up or down to (approximately) full seconds. This is useful for timers
  313. * for which the exact time they fire does not matter too much, as long as
  314. * they fire approximately every X seconds.
  315. *
  316. * By rounding these timers to whole seconds, all such timers will fire
  317. * at the same time, rather than at various times spread out. The goal
  318. * of this is to have the CPU wake up less, which saves power.
  319. *
  320. * The exact rounding is skewed for each processor to avoid all
  321. * processors firing at the exact same time, which could lead
  322. * to lock contention or spurious cache line bouncing.
  323. *
  324. * The return value is the rounded version of the @j parameter.
  325. */
  326. unsigned long __round_jiffies(unsigned long j, int cpu)
  327. {
  328. return round_jiffies_common(j, cpu, false);
  329. }
  330. EXPORT_SYMBOL_GPL(__round_jiffies);
  331. /**
  332. * __round_jiffies_relative - function to round jiffies to a full second
  333. * @j: the time in (relative) jiffies that should be rounded
  334. * @cpu: the processor number on which the timeout will happen
  335. *
  336. * __round_jiffies_relative() rounds a time delta in the future (in jiffies)
  337. * up or down to (approximately) full seconds. This is useful for timers
  338. * for which the exact time they fire does not matter too much, as long as
  339. * they fire approximately every X seconds.
  340. *
  341. * By rounding these timers to whole seconds, all such timers will fire
  342. * at the same time, rather than at various times spread out. The goal
  343. * of this is to have the CPU wake up less, which saves power.
  344. *
  345. * The exact rounding is skewed for each processor to avoid all
  346. * processors firing at the exact same time, which could lead
  347. * to lock contention or spurious cache line bouncing.
  348. *
  349. * The return value is the rounded version of the @j parameter.
  350. */
  351. unsigned long __round_jiffies_relative(unsigned long j, int cpu)
  352. {
  353. unsigned long j0 = jiffies;
  354. /* Use j0 because jiffies might change while we run */
  355. return round_jiffies_common(j + j0, cpu, false) - j0;
  356. }
  357. EXPORT_SYMBOL_GPL(__round_jiffies_relative);
  358. /**
  359. * round_jiffies - function to round jiffies to a full second
  360. * @j: the time in (absolute) jiffies that should be rounded
  361. *
  362. * round_jiffies() rounds an absolute time in the future (in jiffies)
  363. * up or down to (approximately) full seconds. This is useful for timers
  364. * for which the exact time they fire does not matter too much, as long as
  365. * they fire approximately every X seconds.
  366. *
  367. * By rounding these timers to whole seconds, all such timers will fire
  368. * at the same time, rather than at various times spread out. The goal
  369. * of this is to have the CPU wake up less, which saves power.
  370. *
  371. * The return value is the rounded version of the @j parameter.
  372. */
  373. unsigned long round_jiffies(unsigned long j)
  374. {
  375. return round_jiffies_common(j, raw_smp_processor_id(), false);
  376. }
  377. EXPORT_SYMBOL_GPL(round_jiffies);
  378. /**
  379. * round_jiffies_relative - function to round jiffies to a full second
  380. * @j: the time in (relative) jiffies that should be rounded
  381. *
  382. * round_jiffies_relative() rounds a time delta in the future (in jiffies)
  383. * up or down to (approximately) full seconds. This is useful for timers
  384. * for which the exact time they fire does not matter too much, as long as
  385. * they fire approximately every X seconds.
  386. *
  387. * By rounding these timers to whole seconds, all such timers will fire
  388. * at the same time, rather than at various times spread out. The goal
  389. * of this is to have the CPU wake up less, which saves power.
  390. *
  391. * The return value is the rounded version of the @j parameter.
  392. */
  393. unsigned long round_jiffies_relative(unsigned long j)
  394. {
  395. return __round_jiffies_relative(j, raw_smp_processor_id());
  396. }
  397. EXPORT_SYMBOL_GPL(round_jiffies_relative);
  398. /**
  399. * __round_jiffies_up - function to round jiffies up to a full second
  400. * @j: the time in (absolute) jiffies that should be rounded
  401. * @cpu: the processor number on which the timeout will happen
  402. *
  403. * This is the same as __round_jiffies() except that it will never
  404. * round down. This is useful for timeouts for which the exact time
  405. * of firing does not matter too much, as long as they don't fire too
  406. * early.
  407. */
  408. unsigned long __round_jiffies_up(unsigned long j, int cpu)
  409. {
  410. return round_jiffies_common(j, cpu, true);
  411. }
  412. EXPORT_SYMBOL_GPL(__round_jiffies_up);
  413. /**
  414. * __round_jiffies_up_relative - function to round jiffies up to a full second
  415. * @j: the time in (relative) jiffies that should be rounded
  416. * @cpu: the processor number on which the timeout will happen
  417. *
  418. * This is the same as __round_jiffies_relative() except that it will never
  419. * round down. This is useful for timeouts for which the exact time
  420. * of firing does not matter too much, as long as they don't fire too
  421. * early.
  422. */
  423. unsigned long __round_jiffies_up_relative(unsigned long j, int cpu)
  424. {
  425. unsigned long j0 = jiffies;
  426. /* Use j0 because jiffies might change while we run */
  427. return round_jiffies_common(j + j0, cpu, true) - j0;
  428. }
  429. EXPORT_SYMBOL_GPL(__round_jiffies_up_relative);
  430. /**
  431. * round_jiffies_up - function to round jiffies up to a full second
  432. * @j: the time in (absolute) jiffies that should be rounded
  433. *
  434. * This is the same as round_jiffies() except that it will never
  435. * round down. This is useful for timeouts for which the exact time
  436. * of firing does not matter too much, as long as they don't fire too
  437. * early.
  438. */
  439. unsigned long round_jiffies_up(unsigned long j)
  440. {
  441. return round_jiffies_common(j, raw_smp_processor_id(), true);
  442. }
  443. EXPORT_SYMBOL_GPL(round_jiffies_up);
  444. /**
  445. * round_jiffies_up_relative - function to round jiffies up to a full second
  446. * @j: the time in (relative) jiffies that should be rounded
  447. *
  448. * This is the same as round_jiffies_relative() except that it will never
  449. * round down. This is useful for timeouts for which the exact time
  450. * of firing does not matter too much, as long as they don't fire too
  451. * early.
  452. */
  453. unsigned long round_jiffies_up_relative(unsigned long j)
  454. {
  455. return __round_jiffies_up_relative(j, raw_smp_processor_id());
  456. }
  457. EXPORT_SYMBOL_GPL(round_jiffies_up_relative);
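/*
 * Typical use of the rounding helpers above (illustration only, not part of
 * this file): a periodic housekeeping timer that only needs roughly
 * one-second resolution can batch its wakeup with other such timers:
 *
 *	mod_timer(&my_timer, round_jiffies(jiffies + 10 * HZ));
 *
 * or, when the timer must not fire early:
 *
 *	mod_timer(&my_timer, round_jiffies_up(jiffies + 10 * HZ));
 */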
  458. static inline unsigned int timer_get_idx(struct timer_list *timer)
  459. {
  460. return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT;
  461. }
  462. static inline void timer_set_idx(struct timer_list *timer, unsigned int idx)
  463. {
  464. timer->flags = (timer->flags & ~TIMER_ARRAYMASK) |
  465. idx << TIMER_ARRAYSHIFT;
  466. }
  467. /*
  468. * Helper function to calculate the array index for a given expiry
  469. * time.
  470. */
  471. static inline unsigned calc_index(unsigned long expires, unsigned lvl,
  472. unsigned long *bucket_expiry)
  473. {
  474. /*
  475. * The timer wheel has to guarantee that a timer does not fire
  476. * early. Early expiry can happen due to:
  477. * - Timer is armed at the edge of a tick
  478. * - Truncation of the expiry time in the outer wheel levels
  479. *
  480. * Round up with level granularity to prevent this.
  481. */
  482. trace_android_vh_timer_calc_index(lvl, &expires);
  483. expires = (expires >> LVL_SHIFT(lvl)) + 1;
  484. *bucket_expiry = expires << LVL_SHIFT(lvl);
  485. return LVL_OFFS(lvl) + (expires & LVL_MASK);
  486. }
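/*
 * Worked example for calc_index() (illustration only, vendor hook ignored):
 * expires = 1000, lvl = 1 (LVL_SHIFT(1) == 3) gives (1000 >> 3) + 1 == 126,
 * so *bucket_expiry == 126 << 3 == 1008 and the returned index is
 * LVL_OFFS(1) + (126 & LVL_MASK) == 64 + 62 == 126. The bucket expiry is
 * never before the requested expiry, which is what guarantees that the
 * timer cannot fire early.
 */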
  487. static int calc_wheel_index(unsigned long expires, unsigned long clk,
  488. unsigned long *bucket_expiry)
  489. {
  490. unsigned long delta = expires - clk;
  491. unsigned int idx;
  492. if (delta < LVL_START(1)) {
  493. idx = calc_index(expires, 0, bucket_expiry);
  494. } else if (delta < LVL_START(2)) {
  495. idx = calc_index(expires, 1, bucket_expiry);
  496. } else if (delta < LVL_START(3)) {
  497. idx = calc_index(expires, 2, bucket_expiry);
  498. } else if (delta < LVL_START(4)) {
  499. idx = calc_index(expires, 3, bucket_expiry);
  500. } else if (delta < LVL_START(5)) {
  501. idx = calc_index(expires, 4, bucket_expiry);
  502. } else if (delta < LVL_START(6)) {
  503. idx = calc_index(expires, 5, bucket_expiry);
  504. } else if (delta < LVL_START(7)) {
  505. idx = calc_index(expires, 6, bucket_expiry);
  506. } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) {
  507. idx = calc_index(expires, 7, bucket_expiry);
  508. } else if ((long) delta < 0) {
  509. idx = clk & LVL_MASK;
  510. *bucket_expiry = clk;
  511. } else {
  512. /*
  513. * Force expire obscene large timeouts to expire at the
  514. * capacity limit of the wheel.
  515. */
  516. if (delta >= WHEEL_TIMEOUT_CUTOFF)
  517. expires = clk + WHEEL_TIMEOUT_MAX;
  518. idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry);
  519. }
  520. return idx;
  521. }
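/*
 * Illustration only: how a delta maps to a wheel level. With LVL_START(1) == 63
 * and LVL_START(2) == 504, a timer armed 100 jiffies ahead (delta == 100) does
 * not fit level 0, but satisfies delta < LVL_START(2), so it is queued on
 * level 1 with 8 jiffies granularity. A delta beyond WHEEL_TIMEOUT_CUTOFF is
 * clamped to clk + WHEEL_TIMEOUT_MAX and expires from the last level, and an
 * already expired timer ((long)delta < 0) is put into the level 0 bucket for
 * the current clock so it expires on the next servicing of the wheel.
 */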
  522. static void
  523. trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer)
  524. {
  525. if (!is_timers_nohz_active())
  526. return;
  527. /*
  528. * TODO: This wants some optimizing similar to the code below, but we
  529. * will do that when we switch from push to pull for deferrable timers.
  530. */
  531. if (timer->flags & TIMER_DEFERRABLE) {
  532. if (tick_nohz_full_cpu(base->cpu))
  533. wake_up_nohz_cpu(base->cpu);
  534. return;
  535. }
  536. /*
  537. * We might have to IPI the remote CPU if the base is idle and the
  538. * timer is not deferrable. If the other CPU is on the way to idle
  539. * then it can't set base->is_idle as we hold the base lock:
  540. */
  541. if (base->is_idle)
  542. wake_up_nohz_cpu(base->cpu);
  543. }
  544. /*
  545. * Enqueue the timer into the hash bucket, mark it pending in
  546. * the bitmap, store the index in the timer flags then wake up
  547. * the target CPU if needed.
  548. */
  549. static void enqueue_timer(struct timer_base *base, struct timer_list *timer,
  550. unsigned int idx, unsigned long bucket_expiry)
  551. {
  552. hlist_add_head(&timer->entry, base->vectors + idx);
  553. __set_bit(idx, base->pending_map);
  554. timer_set_idx(timer, idx);
  555. trace_timer_start(timer, timer->expires, timer->flags);
  556. /*
  557. * Check whether this is the new first expiring timer. The
  558. * effective expiry time of the timer is required here
  559. * (bucket_expiry) instead of timer->expires.
  560. */
  561. if (time_before(bucket_expiry, base->next_expiry)) {
  562. /*
  563. * Set the next expiry time and kick the CPU so it
  564. * can reevaluate the wheel:
  565. */
  566. base->next_expiry = bucket_expiry;
  567. base->timers_pending = true;
  568. base->next_expiry_recalc = false;
  569. trigger_dyntick_cpu(base, timer);
  570. }
  571. }
  572. static void internal_add_timer(struct timer_base *base, struct timer_list *timer)
  573. {
  574. unsigned long bucket_expiry;
  575. unsigned int idx;
  576. idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry);
  577. enqueue_timer(base, timer, idx, bucket_expiry);
  578. }
  579. #ifdef CONFIG_DEBUG_OBJECTS_TIMERS
  580. static const struct debug_obj_descr timer_debug_descr;
  581. struct timer_hint {
  582. void (*function)(struct timer_list *t);
  583. long offset;
  584. };
  585. #define TIMER_HINT(fn, container, timr, hintfn) \
  586. { \
  587. .function = fn, \
  588. .offset = offsetof(container, hintfn) - \
  589. offsetof(container, timr) \
  590. }
  591. static const struct timer_hint timer_hints[] = {
  592. TIMER_HINT(delayed_work_timer_fn,
  593. struct delayed_work, timer, work.func),
  594. TIMER_HINT(kthread_delayed_work_timer_fn,
  595. struct kthread_delayed_work, timer, work.func),
  596. };
  597. static void *timer_debug_hint(void *addr)
  598. {
  599. struct timer_list *timer = addr;
  600. int i;
  601. for (i = 0; i < ARRAY_SIZE(timer_hints); i++) {
  602. if (timer_hints[i].function == timer->function) {
  603. void (**fn)(void) = addr + timer_hints[i].offset;
  604. return *fn;
  605. }
  606. }
  607. return timer->function;
  608. }
  609. static bool timer_is_static_object(void *addr)
  610. {
  611. struct timer_list *timer = addr;
  612. return (timer->entry.pprev == NULL &&
  613. timer->entry.next == TIMER_ENTRY_STATIC);
  614. }
  615. /*
  616. * fixup_init is called when:
  617. * - an active object is initialized
  618. */
  619. static bool timer_fixup_init(void *addr, enum debug_obj_state state)
  620. {
  621. struct timer_list *timer = addr;
  622. switch (state) {
  623. case ODEBUG_STATE_ACTIVE:
  624. del_timer_sync(timer);
  625. debug_object_init(timer, &timer_debug_descr);
  626. return true;
  627. default:
  628. return false;
  629. }
  630. }
  631. /* Stub timer callback for improperly used timers. */
  632. static void stub_timer(struct timer_list *unused)
  633. {
  634. WARN_ON(1);
  635. }
  636. /*
  637. * fixup_activate is called when:
  638. * - an active object is activated
  639. * - an unknown non-static object is activated
  640. */
  641. static bool timer_fixup_activate(void *addr, enum debug_obj_state state)
  642. {
  643. struct timer_list *timer = addr;
  644. switch (state) {
  645. case ODEBUG_STATE_NOTAVAILABLE:
  646. timer_setup(timer, stub_timer, 0);
  647. return true;
  648. case ODEBUG_STATE_ACTIVE:
  649. WARN_ON(1);
  650. fallthrough;
  651. default:
  652. return false;
  653. }
  654. }
  655. /*
  656. * fixup_free is called when:
  657. * - an active object is freed
  658. */
  659. static bool timer_fixup_free(void *addr, enum debug_obj_state state)
  660. {
  661. struct timer_list *timer = addr;
  662. switch (state) {
  663. case ODEBUG_STATE_ACTIVE:
  664. del_timer_sync(timer);
  665. debug_object_free(timer, &timer_debug_descr);
  666. return true;
  667. default:
  668. return false;
  669. }
  670. }
  671. /*
  672. * fixup_assert_init is called when:
  673. * - an untracked/uninit-ed object is found
  674. */
  675. static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state)
  676. {
  677. struct timer_list *timer = addr;
  678. switch (state) {
  679. case ODEBUG_STATE_NOTAVAILABLE:
  680. timer_setup(timer, stub_timer, 0);
  681. return true;
  682. default:
  683. return false;
  684. }
  685. }
  686. static const struct debug_obj_descr timer_debug_descr = {
  687. .name = "timer_list",
  688. .debug_hint = timer_debug_hint,
  689. .is_static_object = timer_is_static_object,
  690. .fixup_init = timer_fixup_init,
  691. .fixup_activate = timer_fixup_activate,
  692. .fixup_free = timer_fixup_free,
  693. .fixup_assert_init = timer_fixup_assert_init,
  694. };
  695. static inline void debug_timer_init(struct timer_list *timer)
  696. {
  697. debug_object_init(timer, &timer_debug_descr);
  698. }
  699. static inline void debug_timer_activate(struct timer_list *timer)
  700. {
  701. debug_object_activate(timer, &timer_debug_descr);
  702. }
  703. static inline void debug_timer_deactivate(struct timer_list *timer)
  704. {
  705. debug_object_deactivate(timer, &timer_debug_descr);
  706. }
  707. static inline void debug_timer_assert_init(struct timer_list *timer)
  708. {
  709. debug_object_assert_init(timer, &timer_debug_descr);
  710. }
  711. static void do_init_timer(struct timer_list *timer,
  712. void (*func)(struct timer_list *),
  713. unsigned int flags,
  714. const char *name, struct lock_class_key *key);
  715. void init_timer_on_stack_key(struct timer_list *timer,
  716. void (*func)(struct timer_list *),
  717. unsigned int flags,
  718. const char *name, struct lock_class_key *key)
  719. {
  720. debug_object_init_on_stack(timer, &timer_debug_descr);
  721. do_init_timer(timer, func, flags, name, key);
  722. }
  723. EXPORT_SYMBOL_GPL(init_timer_on_stack_key);
  724. void destroy_timer_on_stack(struct timer_list *timer)
  725. {
  726. debug_object_free(timer, &timer_debug_descr);
  727. }
  728. EXPORT_SYMBOL_GPL(destroy_timer_on_stack);
  729. #else
  730. static inline void debug_timer_init(struct timer_list *timer) { }
  731. static inline void debug_timer_activate(struct timer_list *timer) { }
  732. static inline void debug_timer_deactivate(struct timer_list *timer) { }
  733. static inline void debug_timer_assert_init(struct timer_list *timer) { }
  734. #endif
  735. static inline void debug_init(struct timer_list *timer)
  736. {
  737. debug_timer_init(timer);
  738. trace_timer_init(timer);
  739. }
  740. static inline void debug_deactivate(struct timer_list *timer)
  741. {
  742. debug_timer_deactivate(timer);
  743. trace_timer_cancel(timer);
  744. }
  745. static inline void debug_assert_init(struct timer_list *timer)
  746. {
  747. debug_timer_assert_init(timer);
  748. }
  749. static void do_init_timer(struct timer_list *timer,
  750. void (*func)(struct timer_list *),
  751. unsigned int flags,
  752. const char *name, struct lock_class_key *key)
  753. {
  754. timer->entry.pprev = NULL;
  755. timer->function = func;
  756. if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS))
  757. flags &= TIMER_INIT_FLAGS;
  758. timer->flags = flags | raw_smp_processor_id();
  759. lockdep_init_map(&timer->lockdep_map, name, key, 0);
  760. }
  761. /**
  762. * init_timer_key - initialize a timer
  763. * @timer: the timer to be initialized
  764. * @func: timer callback function
  765. * @flags: timer flags
  766. * @name: name of the timer
  767. * @key: lockdep class key of the fake lock used for tracking timer
  768. * sync lock dependencies
  769. *
  770. * init_timer_key() must be done to a timer prior to calling *any* of the
  771. * other timer functions.
  772. */
  773. void init_timer_key(struct timer_list *timer,
  774. void (*func)(struct timer_list *), unsigned int flags,
  775. const char *name, struct lock_class_key *key)
  776. {
  777. debug_init(timer);
  778. do_init_timer(timer, func, flags, name, key);
  779. }
  780. EXPORT_SYMBOL(init_timer_key);
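/*
 * Illustration only, not part of this file: the usual client pattern on top
 * of init_timer_key(). timer_setup() in <linux/timer.h> expands to
 * init_timer_key() with a per-callsite lockdep key, and from_timer() maps
 * the timer_list back to its containing object in the callback. The struct
 * and function names below are made up for the example:
 *
 *	struct my_dev {
 *		struct timer_list poll_timer;
 *	};
 *
 *	static void my_dev_poll(struct timer_list *t)
 *	{
 *		struct my_dev *dev = from_timer(dev, t, poll_timer);
 *
 *		// do the periodic work, then re-arm
 *		mod_timer(&dev->poll_timer, jiffies + msecs_to_jiffies(100));
 *	}
 *
 *	timer_setup(&dev->poll_timer, my_dev_poll, 0);
 *	mod_timer(&dev->poll_timer, jiffies + msecs_to_jiffies(100));
 */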
  781. static inline void detach_timer(struct timer_list *timer, bool clear_pending)
  782. {
  783. struct hlist_node *entry = &timer->entry;
  784. debug_deactivate(timer);
  785. __hlist_del(entry);
  786. if (clear_pending)
  787. entry->pprev = NULL;
  788. entry->next = LIST_POISON2;
  789. }
  790. static int detach_if_pending(struct timer_list *timer, struct timer_base *base,
  791. bool clear_pending)
  792. {
  793. unsigned idx = timer_get_idx(timer);
  794. if (!timer_pending(timer))
  795. return 0;
  796. if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) {
  797. __clear_bit(idx, base->pending_map);
  798. base->next_expiry_recalc = true;
  799. }
  800. detach_timer(timer, clear_pending);
  801. return 1;
  802. }
  803. static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu)
  804. {
  805. struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu);
  806. /*
  807. * If the timer is deferrable and NO_HZ_COMMON is set then we need
  808. * to use the deferrable base.
  809. */
  810. if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
  811. base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu);
  812. return base;
  813. }
  814. static inline struct timer_base *get_timer_this_cpu_base(u32 tflags)
  815. {
  816. struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
  817. /*
  818. * If the timer is deferrable and NO_HZ_COMMON is set then we need
  819. * to use the deferrable base.
  820. */
  821. if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE))
  822. base = this_cpu_ptr(&timer_bases[BASE_DEF]);
  823. return base;
  824. }
  825. static inline struct timer_base *get_timer_base(u32 tflags)
  826. {
  827. return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK);
  828. }
  829. static inline struct timer_base *
  830. get_target_base(struct timer_base *base, unsigned tflags)
  831. {
  832. #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
  833. if (static_branch_likely(&timers_migration_enabled) &&
  834. !(tflags & TIMER_PINNED))
  835. return get_timer_cpu_base(tflags, get_nohz_timer_target());
  836. #endif
  837. return get_timer_this_cpu_base(tflags);
  838. }
  839. static inline void forward_timer_base(struct timer_base *base)
  840. {
  841. unsigned long jnow = READ_ONCE(jiffies);
  842. /*
  843. * No need to forward if we are close enough below jiffies.
  844. * Also while executing timers, base->clk is 1 offset ahead
  845. * of jiffies to avoid endless requeuing to current jiffies.
  846. */
  847. if ((long)(jnow - base->clk) < 1)
  848. return;
  849. /*
  850. * If the next expiry value is > jiffies, then we fast forward to
  851. * jiffies otherwise we forward to the next expiry value.
  852. */
  853. if (time_after(base->next_expiry, jnow)) {
  854. base->clk = jnow;
  855. } else {
  856. if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk)))
  857. return;
  858. base->clk = base->next_expiry;
  859. }
  860. }
  861. /*
  862. * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means
  863. * that all timers which are tied to this base are locked, and the base itself
  864. * is locked too.
  865. *
  866. * So __run_timers/migrate_timers can safely modify all timers which could
  867. * be found in the base->vectors array.
  868. *
  869. * When a timer is migrating then the TIMER_MIGRATING flag is set and we need
  870. * to wait until the migration is done.
  871. */
  872. static struct timer_base *lock_timer_base(struct timer_list *timer,
  873. unsigned long *flags)
  874. __acquires(timer->base->lock)
  875. {
  876. for (;;) {
  877. struct timer_base *base;
  878. u32 tf;
  879. /*
  880. * We need to use READ_ONCE() here, otherwise the compiler
  881. * might re-read @tf between the check for TIMER_MIGRATING
  882. * and spin_lock().
  883. */
  884. tf = READ_ONCE(timer->flags);
  885. if (!(tf & TIMER_MIGRATING)) {
  886. base = get_timer_base(tf);
  887. raw_spin_lock_irqsave(&base->lock, *flags);
  888. if (timer->flags == tf)
  889. return base;
  890. raw_spin_unlock_irqrestore(&base->lock, *flags);
  891. }
  892. cpu_relax();
  893. }
  894. }
  895. #define MOD_TIMER_PENDING_ONLY 0x01
  896. #define MOD_TIMER_REDUCE 0x02
  897. #define MOD_TIMER_NOTPENDING 0x04
  898. static inline int
  899. __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options)
  900. {
  901. unsigned long clk = 0, flags, bucket_expiry;
  902. struct timer_base *base, *new_base;
  903. unsigned int idx = UINT_MAX;
  904. int ret = 0;
  905. BUG_ON(!timer->function);
  906. /*
  907. * This is a common optimization triggered by the networking code - if
  908. * the timer is re-modified to have the same timeout or ends up in the
  909. * same array bucket then just return:
  910. */
  911. if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) {
  912. /*
  913. * The downside of this optimization is that it can result in
  914. * larger granularity than you would get from adding a new
  915. * timer with this expiry.
  916. */
  917. long diff = timer->expires - expires;
  918. if (!diff)
  919. return 1;
  920. if (options & MOD_TIMER_REDUCE && diff <= 0)
  921. return 1;
  922. /*
  923. * We lock timer base and calculate the bucket index right
  924. * here. If the timer ends up in the same bucket, then we
  925. * just update the expiry time and avoid the whole
  926. * dequeue/enqueue dance.
  927. */
  928. base = lock_timer_base(timer, &flags);
  929. forward_timer_base(base);
  930. if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) &&
  931. time_before_eq(timer->expires, expires)) {
  932. ret = 1;
  933. goto out_unlock;
  934. }
  935. clk = base->clk;
  936. idx = calc_wheel_index(expires, clk, &bucket_expiry);
  937. /*
  938. * Retrieve and compare the array index of the pending
  939. * timer. If it matches set the expiry to the new value so a
  940. * subsequent call will exit in the expires check above.
  941. */
  942. if (idx == timer_get_idx(timer)) {
  943. if (!(options & MOD_TIMER_REDUCE))
  944. timer->expires = expires;
  945. else if (time_after(timer->expires, expires))
  946. timer->expires = expires;
  947. ret = 1;
  948. goto out_unlock;
  949. }
  950. } else {
  951. base = lock_timer_base(timer, &flags);
  952. forward_timer_base(base);
  953. }
  954. ret = detach_if_pending(timer, base, false);
  955. if (!ret && (options & MOD_TIMER_PENDING_ONLY))
  956. goto out_unlock;
  957. new_base = get_target_base(base, timer->flags);
  958. if (base != new_base) {
  959. /*
  960. * We are trying to schedule the timer on the new base.
  961. * However we can't change timer's base while it is running,
  962. * otherwise del_timer_sync() can't detect that the timer's
  963. * handler has not yet finished. This also guarantees that the
  964. * timer is serialized wrt itself.
  965. */
  966. if (likely(base->running_timer != timer)) {
  967. /* See the comment in lock_timer_base() */
  968. timer->flags |= TIMER_MIGRATING;
  969. raw_spin_unlock(&base->lock);
  970. base = new_base;
  971. raw_spin_lock(&base->lock);
  972. WRITE_ONCE(timer->flags,
  973. (timer->flags & ~TIMER_BASEMASK) | base->cpu);
  974. forward_timer_base(base);
  975. }
  976. }
  977. debug_timer_activate(timer);
  978. timer->expires = expires;
  979. /*
  980. * If 'idx' was calculated above and the base time did not advance
  981. * between calculating 'idx' and possibly switching the base, only
  982. * enqueue_timer() is required. Otherwise we need to (re)calculate
  983. * the wheel index via internal_add_timer().
  984. */
  985. if (idx != UINT_MAX && clk == base->clk)
  986. enqueue_timer(base, timer, idx, bucket_expiry);
  987. else
  988. internal_add_timer(base, timer);
  989. out_unlock:
  990. raw_spin_unlock_irqrestore(&base->lock, flags);
  991. return ret;
  992. }
  993. /**
  994. * mod_timer_pending - modify a pending timer's timeout
  995. * @timer: the pending timer to be modified
  996. * @expires: new timeout in jiffies
  997. *
  998. * mod_timer_pending() is the same for pending timers as mod_timer(),
  999. * but will not re-activate and modify already deleted timers.
  1000. *
  1001. * It is useful for unserialized use of timers.
  1002. */
  1003. int mod_timer_pending(struct timer_list *timer, unsigned long expires)
  1004. {
  1005. return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY);
  1006. }
  1007. EXPORT_SYMBOL(mod_timer_pending);
  1008. /**
  1009. * mod_timer - modify a timer's timeout
  1010. * @timer: the timer to be modified
  1011. * @expires: new timeout in jiffies
  1012. *
  1013. * mod_timer() is a more efficient way to update the expire field of an
  1014. * active timer (if the timer is inactive it will be activated)
  1015. *
  1016. * mod_timer(timer, expires) is equivalent to:
  1017. *
  1018. * del_timer(timer); timer->expires = expires; add_timer(timer);
  1019. *
  1020. * Note that if there are multiple unserialized concurrent users of the
  1021. * same timer, then mod_timer() is the only safe way to modify the timeout,
  1022. * since add_timer() cannot modify an already running timer.
  1023. *
  1024. * The function returns whether it has modified a pending timer or not.
  1025. * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an
  1026. * active timer returns 1.)
  1027. */
  1028. int mod_timer(struct timer_list *timer, unsigned long expires)
  1029. {
  1030. return __mod_timer(timer, expires, 0);
  1031. }
  1032. EXPORT_SYMBOL(mod_timer);
  1033. /**
  1034. * timer_reduce - Modify a timer's timeout if it would reduce the timeout
  1035. * @timer: The timer to be modified
  1036. * @expires: New timeout in jiffies
  1037. *
  1038. * timer_reduce() is very similar to mod_timer(), except that it will only
  1039. * modify a running timer if that would reduce the expiration time (it will
  1040. * start a timer that isn't running).
  1041. */
  1042. int timer_reduce(struct timer_list *timer, unsigned long expires)
  1043. {
  1044. return __mod_timer(timer, expires, MOD_TIMER_REDUCE);
  1045. }
  1046. EXPORT_SYMBOL(timer_reduce);
  1047. /**
  1048. * add_timer - start a timer
  1049. * @timer: the timer to be added
  1050. *
  1051. * The kernel will do a ->function(@timer) callback from the
  1052. * timer interrupt at the ->expires point in the future. The
  1053. * current time is 'jiffies'.
  1054. *
  1055. * The timer's ->expires, ->function fields must be set prior to calling this
  1056. * function.
  1057. *
  1058. * Timers with an ->expires field in the past will be executed in the next
  1059. * timer tick.
  1060. */
  1061. void add_timer(struct timer_list *timer)
  1062. {
  1063. BUG_ON(timer_pending(timer));
  1064. __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING);
  1065. }
  1066. EXPORT_SYMBOL(add_timer);
  1067. /**
  1068. * add_timer_on - start a timer on a particular CPU
  1069. * @timer: the timer to be added
  1070. * @cpu: the CPU to start it on
  1071. *
  1072. * This is not very scalable on SMP. Double adds are not possible.
  1073. */
  1074. void add_timer_on(struct timer_list *timer, int cpu)
  1075. {
  1076. struct timer_base *new_base, *base;
  1077. unsigned long flags;
  1078. BUG_ON(timer_pending(timer) || !timer->function);
  1079. new_base = get_timer_cpu_base(timer->flags, cpu);
  1080. /*
  1081. * If @timer was on a different CPU, it should be migrated with the
  1082. * old base locked to prevent other operations proceeding with the
  1083. * wrong base locked. See lock_timer_base().
  1084. */
  1085. base = lock_timer_base(timer, &flags);
  1086. if (base != new_base) {
  1087. timer->flags |= TIMER_MIGRATING;
  1088. raw_spin_unlock(&base->lock);
  1089. base = new_base;
  1090. raw_spin_lock(&base->lock);
  1091. WRITE_ONCE(timer->flags,
  1092. (timer->flags & ~TIMER_BASEMASK) | cpu);
  1093. }
  1094. forward_timer_base(base);
  1095. debug_timer_activate(timer);
  1096. internal_add_timer(base, timer);
  1097. raw_spin_unlock_irqrestore(&base->lock, flags);
  1098. }
  1099. EXPORT_SYMBOL_GPL(add_timer_on);
  1100. /**
  1101. * del_timer - deactivate a timer.
  1102. * @timer: the timer to be deactivated
  1103. *
  1104. * del_timer() deactivates a timer - this works on both active and inactive
  1105. * timers.
  1106. *
  1107. * The function returns whether it has deactivated a pending timer or not.
  1108. * (ie. del_timer() of an inactive timer returns 0, del_timer() of an
  1109. * active timer returns 1.)
  1110. */
  1111. int del_timer(struct timer_list *timer)
  1112. {
  1113. struct timer_base *base;
  1114. unsigned long flags;
  1115. int ret = 0;
  1116. debug_assert_init(timer);
  1117. if (timer_pending(timer)) {
  1118. base = lock_timer_base(timer, &flags);
  1119. ret = detach_if_pending(timer, base, true);
  1120. raw_spin_unlock_irqrestore(&base->lock, flags);
  1121. }
  1122. return ret;
  1123. }
  1124. EXPORT_SYMBOL(del_timer);
  1125. /**
  1126. * try_to_del_timer_sync - Try to deactivate a timer
  1127. * @timer: timer to delete
  1128. *
  1129. * This function tries to deactivate a timer. Upon successful (ret >= 0)
  1130. * exit the timer is not queued and the handler is not running on any CPU.
  1131. */
  1132. int try_to_del_timer_sync(struct timer_list *timer)
  1133. {
  1134. struct timer_base *base;
  1135. unsigned long flags;
  1136. int ret = -1;
  1137. debug_assert_init(timer);
  1138. base = lock_timer_base(timer, &flags);
  1139. if (base->running_timer != timer)
  1140. ret = detach_if_pending(timer, base, true);
  1141. raw_spin_unlock_irqrestore(&base->lock, flags);
  1142. return ret;
  1143. }
  1144. EXPORT_SYMBOL(try_to_del_timer_sync);
  1145. #ifdef CONFIG_PREEMPT_RT
  1146. static __init void timer_base_init_expiry_lock(struct timer_base *base)
  1147. {
  1148. spin_lock_init(&base->expiry_lock);
  1149. }
  1150. static inline void timer_base_lock_expiry(struct timer_base *base)
  1151. {
  1152. spin_lock(&base->expiry_lock);
  1153. }
  1154. static inline void timer_base_unlock_expiry(struct timer_base *base)
  1155. {
  1156. spin_unlock(&base->expiry_lock);
  1157. }
  1158. /*
  1159. * The counterpart to del_timer_wait_running().
  1160. *
  1161. * If there is a waiter for base->expiry_lock, then it was waiting for the
  1162. * timer callback to finish. Drop expiry_lock and reacquire it. That allows
  1163. * the waiter to acquire the lock and make progress.
  1164. */
  1165. static void timer_sync_wait_running(struct timer_base *base)
  1166. {
  1167. if (atomic_read(&base->timer_waiters)) {
  1168. raw_spin_unlock_irq(&base->lock);
  1169. spin_unlock(&base->expiry_lock);
  1170. spin_lock(&base->expiry_lock);
  1171. raw_spin_lock_irq(&base->lock);
  1172. }
  1173. }
  1174. /*
  1175. * This function is called on PREEMPT_RT kernels when the fast path
  1176. * deletion of a timer failed because the timer callback function was
  1177. * running.
  1178. *
  1179. * This prevents priority inversion, if the softirq thread on a remote CPU
  1180. * got preempted, and it prevents a live lock when the task which tries to
  1181. * delete a timer preempted the softirq thread running the timer callback
  1182. * function.
  1183. */
  1184. static void del_timer_wait_running(struct timer_list *timer)
  1185. {
  1186. u32 tf;
  1187. tf = READ_ONCE(timer->flags);
  1188. if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) {
  1189. struct timer_base *base = get_timer_base(tf);
  1190. /*
  1191. * Mark the base as contended and grab the expiry lock,
  1192. * which is held by the softirq across the timer
  1193. * callback. Drop the lock immediately so the softirq can
  1194. * expire the next timer. In theory the timer could already
  1195. * be running again, but that's more than unlikely and just
  1196. * causes another wait loop.
  1197. */
  1198. atomic_inc(&base->timer_waiters);
  1199. spin_lock_bh(&base->expiry_lock);
  1200. atomic_dec(&base->timer_waiters);
  1201. spin_unlock_bh(&base->expiry_lock);
  1202. }
  1203. }
  1204. #else
  1205. static inline void timer_base_init_expiry_lock(struct timer_base *base) { }
  1206. static inline void timer_base_lock_expiry(struct timer_base *base) { }
  1207. static inline void timer_base_unlock_expiry(struct timer_base *base) { }
  1208. static inline void timer_sync_wait_running(struct timer_base *base) { }
  1209. static inline void del_timer_wait_running(struct timer_list *timer) { }
  1210. #endif

#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/**
 * del_timer_sync - deactivate a timer and wait for the handler to finish.
 * @timer: the timer to be deactivated
 *
 * This function only differs from del_timer() on SMP: besides deactivating
 * the timer it also makes sure the handler has finished executing on other
 * CPUs.
 *
 * Synchronization rules: Callers must prevent restarting of the timer,
 * otherwise this function is meaningless. It must not be called from
 * interrupt contexts unless the timer is an irqsafe one. The caller must
 * not hold locks which would prevent completion of the timer's
 * handler. The timer's handler must not call add_timer_on(). Upon exit the
 * timer is not queued and the handler is not running on any CPU.
 *
 * Note: For !irqsafe timers, you must not hold locks that are held in
 * interrupt context while calling this function. Even if the lock has
 * nothing to do with the timer in question. Here's why::
 *
 *    CPU0                             CPU1
 *    ----                             ----
 *                                     <SOFTIRQ>
 *                                        call_timer_fn();
 *                                        base->running_timer = mytimer;
 *    spin_lock_irq(somelock);
 *                                     <IRQ>
 *                                        spin_lock(somelock);
 *    del_timer_sync(mytimer);
 *    while (base->running_timer == mytimer);
 *
 * Now del_timer_sync() will never return and never release somelock.
 * The interrupt on the other CPU is waiting to grab somelock but
 * it has interrupted the softirq that CPU0 is waiting to finish.
 *
 * The function returns whether it has deactivated a pending timer or not.
 */
int del_timer_sync(struct timer_list *timer)
{
        int ret;

#ifdef CONFIG_LOCKDEP
        unsigned long flags;

        /*
         * If lockdep gives a backtrace here, please reference
         * the synchronization rules above.
         */
        local_irq_save(flags);
        lock_map_acquire(&timer->lockdep_map);
        lock_map_release(&timer->lockdep_map);
        local_irq_restore(flags);
#endif
        /*
         * don't use it in hardirq context, because it
         * could lead to deadlock.
         */
        WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE));

        /*
         * Must be able to sleep on PREEMPT_RT because of the slowpath in
         * del_timer_wait_running().
         */
        if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE))
                lockdep_assert_preemption_enabled();

        do {
                ret = try_to_del_timer_sync(timer);

                if (unlikely(ret < 0)) {
                        del_timer_wait_running(timer);
                        cpu_relax();
                }
        } while (ret < 0);

        return ret;
}
EXPORT_SYMBOL(del_timer_sync);
#endif
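
/*
 * A minimal usage sketch of the synchronization rules above, assuming a
 * hypothetical driver whose timer callback re-arms @poll_timer until
 * @stopped is set. The struct and function below are illustrative only.
 */
struct example_poll_dev {
        struct timer_list poll_timer;
        bool stopped;
};

static void __maybe_unused example_poll_stop(struct example_poll_dev *dev)
{
        /* First make sure the callback cannot re-arm the timer ... */
        WRITE_ONCE(dev->stopped, true);
        /*
         * ... then deactivate it and wait for a concurrently running
         * callback to finish. On return the timer is neither queued nor
         * running on any CPU.
         */
        del_timer_sync(&dev->poll_timer);
}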

static void call_timer_fn(struct timer_list *timer,
                          void (*fn)(struct timer_list *),
                          unsigned long baseclk)
{
        int count = preempt_count();

#ifdef CONFIG_LOCKDEP
        /*
         * It is permissible to free the timer from inside the
         * function that is called from it, this we need to take into
         * account for lockdep too. To avoid bogus "held lock freed"
         * warnings as well as problems when looking into
         * timer->lockdep_map, make a copy and use that here.
         */
        struct lockdep_map lockdep_map;

        lockdep_copy_map(&lockdep_map, &timer->lockdep_map);
#endif
        /*
         * Couple the lock chain with the lock chain at
         * del_timer_sync() by acquiring the lock_map around the fn()
         * call here and in del_timer_sync().
         */
        lock_map_acquire(&lockdep_map);

        trace_timer_expire_entry(timer, baseclk);
        fn(timer);
        trace_timer_expire_exit(timer);

        lock_map_release(&lockdep_map);

        if (count != preempt_count()) {
                WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n",
                          fn, count, preempt_count());
                /*
                 * Restore the preempt count. That gives us a decent
                 * chance to survive and extract information. If the
                 * callback kept a lock held, bad luck, but not worse
                 * than the BUG() we had.
                 */
                preempt_count_set(count);
        }
}

static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
        /*
         * This value is required only for tracing. base->clk was
         * incremented directly before expire_timers was called. But expiry
         * is related to the old base->clk value.
         */
        unsigned long baseclk = base->clk - 1;

        while (!hlist_empty(head)) {
                struct timer_list *timer;
                void (*fn)(struct timer_list *);

                timer = hlist_entry(head->first, struct timer_list, entry);

                base->running_timer = timer;
                detach_timer(timer, true);

                fn = timer->function;

                if (timer->flags & TIMER_IRQSAFE) {
                        raw_spin_unlock(&base->lock);
                        call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock(&base->lock);
                        base->running_timer = NULL;
                } else {
                        raw_spin_unlock_irq(&base->lock);
                        call_timer_fn(timer, fn, baseclk);
                        raw_spin_lock_irq(&base->lock);
                        base->running_timer = NULL;
                        timer_sync_wait_running(base);
                }
        }
}

static int collect_expired_timers(struct timer_base *base,
                                  struct hlist_head *heads)
{
        unsigned long clk = base->clk = base->next_expiry;
        struct hlist_head *vec;
        int i, levels = 0;
        unsigned int idx;

        for (i = 0; i < LVL_DEPTH; i++) {
                idx = (clk & LVL_MASK) + i * LVL_SIZE;

                if (__test_and_clear_bit(idx, base->pending_map)) {
                        vec = base->vectors + idx;
                        hlist_move_list(vec, heads++);
                        levels++;
                }
                /* Is it time to look at the next level? */
                if (clk & LVL_CLK_MASK)
                        break;

                /* Shift clock for the next level granularity */
                clk >>= LVL_CLK_SHIFT;
        }
        return levels;
}

/*
 * Find the next pending bucket of a level. Search from level start (@offset)
 * + @clk upwards and if nothing there, search from start of the level
 * (@offset) up to @offset + clk.
 */
static int next_pending_bucket(struct timer_base *base, unsigned offset,
                               unsigned clk)
{
        unsigned pos, start = offset + clk;
        unsigned end = offset + LVL_SIZE;

        pos = find_next_bit(base->pending_map, end, start);
        if (pos < end)
                return pos - start;

        pos = find_next_bit(base->pending_map, start, offset);
        return pos < start ? pos + LVL_SIZE - start : -1;
}
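
/*
 * A worked example of the wrapped search above, assuming the default wheel
 * geometry (LVL_SIZE == 64) and illustrative values: with @clk == 60 and the
 * only pending bucket of this level at local index 2, the first
 * find_next_bit() pass over [offset + 60, offset + 64) finds nothing, the
 * second pass over [offset, offset + 60) returns local index 2, and the
 * function reports 2 + 64 - 60 = 6, i.e. the bucket expires six
 * level-granularity units after @clk, wrapping around the level once.
 */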

/*
 * Search the first expiring timer in the various clock levels. Caller must
 * hold base->lock.
 */
static unsigned long __next_timer_interrupt(struct timer_base *base)
{
        unsigned long clk, next, adj;
        unsigned lvl, offset = 0;

        next = base->clk + NEXT_TIMER_MAX_DELTA;
        clk = base->clk;
        for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) {
                int pos = next_pending_bucket(base, offset, clk & LVL_MASK);
                unsigned long lvl_clk = clk & LVL_CLK_MASK;

                if (pos >= 0) {
                        unsigned long tmp = clk + (unsigned long) pos;

                        tmp <<= LVL_SHIFT(lvl);
                        if (time_before(tmp, next))
                                next = tmp;

                        /*
                         * If the next expiration happens before we reach
                         * the next level, no need to check further.
                         */
                        if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK))
                                break;
                }
                /*
                 * Clock for the next level. If the current level clock lower
                 * bits are zero, we look at the next level as is. If not we
                 * need to advance it by one because that's going to be the
                 * next expiring bucket in that level. base->clk is the next
                 * expiring jiffie. So in case of:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    0    0
                 *
                 * we have to look at all levels @index 0. With
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    0    2
                 *
                 * LVL0 has the next expiring bucket @index 2. The upper
                 * levels have the next expiring bucket @index 1.
                 *
                 * In case that the propagation wraps the next level the same
                 * rules apply:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0
                 *  0    0    0    0    F    2
                 *
                 * So after looking at LVL0 we get:
                 *
                 * LVL5 LVL4 LVL3 LVL2 LVL1
                 *  0    0    0    1    0
                 *
                 * So no propagation from LVL1 to LVL2 because that happened
                 * with the add already, but then we need to propagate further
                 * from LVL2 to LVL3.
                 *
                 * So the simple check whether the lower bits of the current
                 * level are 0 or not is sufficient for all cases.
                 */
                adj = lvl_clk ? 1 : 0;
                clk >>= LVL_CLK_SHIFT;
                clk += adj;
        }

        base->next_expiry_recalc = false;
        base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA);

        return next;
}

#ifdef CONFIG_NO_HZ_COMMON
/*
 * Check if the next hrtimer event is before the next timer wheel
 * event:
 */
static u64 cmp_next_hrtimer_event(u64 basem, u64 expires)
{
        u64 nextevt = hrtimer_get_next_event();

        /*
         * If high resolution timers are enabled
         * hrtimer_get_next_event() returns KTIME_MAX.
         */
        if (expires <= nextevt)
                return expires;

        /*
         * If the next timer is already expired, return the tick base
         * time so the tick is fired immediately.
         */
        if (nextevt <= basem)
                return basem;

        /*
         * Round up to the next jiffie. High resolution timers are
         * off, so the hrtimers are expired in the tick and we need to
         * make sure that this tick really expires the timer to avoid
         * a ping pong of the nohz stop code.
         *
         * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3
         */
        return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC;
}
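
/*
 * A worked example of the rounding above, assuming HZ == 1000 so that
 * TICK_NSEC == 1,000,000 ns: with the next hrtimer event at
 * nextevt == 2,500,000 ns, DIV_ROUND_UP_ULL() yields 3 and the function
 * returns 3,000,000 ns, i.e. the wakeup is pushed out to the next full
 * tick so that the tick which fires actually expires the hrtimer.
 */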

/**
 * get_next_timer_interrupt - return the time (clock mono) of the next timer
 * @basej: base time jiffies
 * @basem: base time clock monotonic
 *
 * Returns the tick aligned clock monotonic time of the next pending
 * timer or KTIME_MAX if no timer is pending.
 */
u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
        u64 expires = KTIME_MAX;
        unsigned long nextevt;

        /*
         * Pretend that there is no timer pending if the cpu is offline.
         * Possible pending timers will be migrated later to an active cpu.
         */
        if (cpu_is_offline(smp_processor_id()))
                return expires;

        raw_spin_lock(&base->lock);
        if (base->next_expiry_recalc)
                base->next_expiry = __next_timer_interrupt(base);
        nextevt = base->next_expiry;

        /*
         * We have a fresh next event. Check whether we can forward the
         * base. We can only do that when @basej is past base->clk
         * otherwise we might rewind base->clk.
         */
        if (time_after(basej, base->clk)) {
                if (time_after(nextevt, basej))
                        base->clk = basej;
                else if (time_after(nextevt, base->clk))
                        base->clk = nextevt;
        }

        if (time_before_eq(nextevt, basej)) {
                expires = basem;
                base->is_idle = false;
        } else {
                if (base->timers_pending)
                        expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
                /*
                 * If we expect to sleep more than a tick, mark the base idle.
                 * Also the tick is stopped so any added timer must forward
                 * the base clk itself to keep granularity small. This idle
                 * logic is only maintained for the BASE_STD base, deferrable
                 * timers may still see large granularity skew (by design).
                 */
                if ((expires - basem) > TICK_NSEC)
                        base->is_idle = true;
        }
        raw_spin_unlock(&base->lock);

        return cmp_next_hrtimer_event(basem, expires);
}

/**
 * timer_clear_idle - Clear the idle state of the timer base
 *
 * Called with interrupts disabled
 */
void timer_clear_idle(void)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);

        /*
         * We do this unlocked. The worst outcome is a remote enqueue sending
         * a pointless IPI, but taking the lock would just make the window for
         * sending the IPI a few instructions smaller for the cost of taking
         * the lock in the exit from idle path.
         */
        base->is_idle = false;
}
#endif

/**
 * __run_timers - run all expired timers (if any) on this CPU.
 * @base: the timer vector to be processed.
 */
static inline void __run_timers(struct timer_base *base)
{
        struct hlist_head heads[LVL_DEPTH];
        int levels;

        if (time_before(jiffies, base->next_expiry))
                return;

        timer_base_lock_expiry(base);
        raw_spin_lock_irq(&base->lock);

        while (time_after_eq(jiffies, base->clk) &&
               time_after_eq(jiffies, base->next_expiry)) {
                levels = collect_expired_timers(base, heads);
                /*
                 * The two possible reasons for not finding any expired
                 * timer at this clk are that all matching timers have been
                 * dequeued or no timer has been queued since
                 * base::next_expiry was set to base::clk +
                 * NEXT_TIMER_MAX_DELTA.
                 */
                WARN_ON_ONCE(!levels && !base->next_expiry_recalc
                             && base->timers_pending);
                base->clk++;
                base->next_expiry = __next_timer_interrupt(base);

                while (levels--)
                        expire_timers(base, heads + levels);
        }
        raw_spin_unlock_irq(&base->lock);
        timer_base_unlock_expiry(base);
}

/*
 * This function runs timers and the timer-tq in bottom half context.
 */
static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);

        __run_timers(base);
        if (IS_ENABLED(CONFIG_NO_HZ_COMMON))
                __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
}

/*
 * Called by the local, per-CPU timer interrupt on SMP.
 */
static void run_local_timers(void)
{
        struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);

        hrtimer_run_queues();
        /* Raise the softirq only if required. */
        if (time_before(jiffies, base->next_expiry)) {
                if (!IS_ENABLED(CONFIG_NO_HZ_COMMON))
                        return;
                /* CPU is awake, so check the deferrable base. */
                base++;
                if (time_before(jiffies, base->next_expiry))
                        return;
        }
        raise_softirq(TIMER_SOFTIRQ);
}

/*
 * Called from the timer interrupt handler to charge one tick to the current
 * process. user_tick is 1 if the tick is user time, 0 for system.
 */
void update_process_times(int user_tick)
{
        struct task_struct *p = current;

        /* Note: this timer irq context must be accounted for as well. */
        account_process_tick(p, user_tick);
        run_local_timers();
        rcu_sched_clock_irq(user_tick);
#ifdef CONFIG_IRQ_WORK
        if (in_irq())
                irq_work_tick();
#endif
        scheduler_tick();
        if (IS_ENABLED(CONFIG_POSIX_TIMERS))
                run_posix_cpu_timers();
}

/*
 * Since schedule_timeout()'s timer is defined on the stack, it must store
 * the target task on the stack as well.
 */
struct process_timer {
        struct timer_list timer;
        struct task_struct *task;
};

static void process_timeout(struct timer_list *t)
{
        struct process_timer *timeout = from_timer(timeout, t, timer);

        wake_up_process(timeout->task);
}

/**
 * schedule_timeout - sleep until timeout
 * @timeout: timeout value in jiffies
 *
 * Make the current task sleep until @timeout jiffies have elapsed.
 * The function behavior depends on the current task state
 * (see also set_current_state() description):
 *
 * %TASK_RUNNING - the scheduler is called, but the task does not sleep
 * at all. That happens because sched_submit_work() does nothing for
 * tasks in %TASK_RUNNING state.
 *
 * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to
 * pass before the routine returns unless the current task is explicitly
 * woken up (e.g. by wake_up_process()).
 *
 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
 * delivered to the current task or the current task is explicitly woken
 * up.
 *
 * The current task state is guaranteed to be %TASK_RUNNING when this
 * routine returns.
 *
 * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule
 * the CPU away without a bound on the timeout. In this case the return
 * value will be %MAX_SCHEDULE_TIMEOUT.
 *
 * Returns 0 when the timer has expired, otherwise the remaining time in
 * jiffies will be returned. In all cases the return value is guaranteed
 * to be non-negative.
 */
signed long __sched schedule_timeout(signed long timeout)
{
        struct process_timer timer;
        unsigned long expire;

        switch (timeout)
        {
        case MAX_SCHEDULE_TIMEOUT:
                /*
                 * These two special cases are useful to be comfortable
                 * in the caller. Nothing more. We could take
                 * MAX_SCHEDULE_TIMEOUT from one of the negative values
                 * but I'd like to return a valid offset (>=0) to allow
                 * the caller to do everything it wants with the retval.
                 */
                schedule();
                goto out;
        default:
                /*
                 * Another bit of PARANOID. Note that the retval will be
                 * 0 since no piece of kernel is supposed to do a check
                 * for a negative retval of schedule_timeout() (since it
                 * should never happen anyway). You just have the printk()
                 * that will tell you if something has gone wrong and where.
                 */
                if (timeout < 0) {
                        printk(KERN_ERR "schedule_timeout: wrong timeout "
                                "value %lx\n", timeout);
                        dump_stack();
                        __set_current_state(TASK_RUNNING);
                        goto out;
                }
        }

        expire = timeout + jiffies;

        timer.task = current;
        timer_setup_on_stack(&timer.timer, process_timeout, 0);
        __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING);
        schedule();
        del_singleshot_timer_sync(&timer.timer);

        /* Remove the timer from the object tracker */
        destroy_timer_on_stack(&timer.timer);

        timeout = expire - jiffies;

out:
        return timeout < 0 ? 0 : timeout;
}
EXPORT_SYMBOL(schedule_timeout);
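
/*
 * A minimal usage sketch, assuming a hypothetical caller that waits for
 * *@flag to become true for at most @timeout jiffies. With
 * %TASK_INTERRUPTIBLE a signal_pending() check would be needed as well;
 * the uninterruptible variant keeps the example short. The function below
 * is illustrative only.
 */
static signed long __maybe_unused example_wait_on_flag(bool *flag,
                                                       signed long timeout)
{
        set_current_state(TASK_UNINTERRUPTIBLE);
        while (!READ_ONCE(*flag) && timeout) {
                /* Returns the remaining jiffies (0 on expiry). */
                timeout = schedule_timeout(timeout);
                set_current_state(TASK_UNINTERRUPTIBLE);
        }
        __set_current_state(TASK_RUNNING);
        return timeout;
}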

/*
 * We can use __set_current_state() here because schedule_timeout() calls
 * schedule() unconditionally.
 */
signed long __sched schedule_timeout_interruptible(signed long timeout)
{
        __set_current_state(TASK_INTERRUPTIBLE);
        return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_interruptible);

signed long __sched schedule_timeout_killable(signed long timeout)
{
        __set_current_state(TASK_KILLABLE);
        return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_killable);

signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
        __set_current_state(TASK_UNINTERRUPTIBLE);
        return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_uninterruptible);

/*
 * Like schedule_timeout_uninterruptible(), except this task will not
 * contribute to load average.
 */
signed long __sched schedule_timeout_idle(signed long timeout)
{
        __set_current_state(TASK_IDLE);
        return schedule_timeout(timeout);
}
EXPORT_SYMBOL(schedule_timeout_idle);

#ifdef CONFIG_HOTPLUG_CPU
static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head)
{
        struct timer_list *timer;
        int cpu = new_base->cpu;

        while (!hlist_empty(head)) {
                timer = hlist_entry(head->first, struct timer_list, entry);
                detach_timer(timer, false);
                timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
                internal_add_timer(new_base, timer);
        }
}

int timers_prepare_cpu(unsigned int cpu)
{
        struct timer_base *base;
        int b;

        for (b = 0; b < NR_BASES; b++) {
                base = per_cpu_ptr(&timer_bases[b], cpu);
                base->clk = jiffies;
                base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
                base->next_expiry_recalc = false;
                base->timers_pending = false;
                base->is_idle = false;
        }
        return 0;
}

int timers_dead_cpu(unsigned int cpu)
{
        struct timer_base *old_base;
        struct timer_base *new_base;
        int b, i;

        BUG_ON(cpu_online(cpu));

        for (b = 0; b < NR_BASES; b++) {
                old_base = per_cpu_ptr(&timer_bases[b], cpu);
                new_base = get_cpu_ptr(&timer_bases[b]);
                /*
                 * The caller is globally serialized and nobody else
                 * takes two locks at once, so deadlock is not possible.
                 */
                raw_spin_lock_irq(&new_base->lock);
                raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);

                /*
                 * The current CPU's base clock might be stale. Update it
                 * before moving the timers over.
                 */
                forward_timer_base(new_base);

                BUG_ON(old_base->running_timer);

                for (i = 0; i < WHEEL_SIZE; i++)
                        migrate_timer_list(new_base, old_base->vectors + i);

                raw_spin_unlock(&old_base->lock);
                raw_spin_unlock_irq(&new_base->lock);
                put_cpu_ptr(&timer_bases);
        }
        return 0;
}
#endif /* CONFIG_HOTPLUG_CPU */

static void __init init_timer_cpu(int cpu)
{
        struct timer_base *base;
        int i;

        for (i = 0; i < NR_BASES; i++) {
                base = per_cpu_ptr(&timer_bases[i], cpu);
                base->cpu = cpu;
                raw_spin_lock_init(&base->lock);
                base->clk = jiffies;
                base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA;
                timer_base_init_expiry_lock(base);
        }
}

static void __init init_timer_cpus(void)
{
        int cpu;

        for_each_possible_cpu(cpu)
                init_timer_cpu(cpu);
}

void __init init_timers(void)
{
        init_timer_cpus();
        posix_cputimers_init_work();
        open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
}

/**
 * msleep - sleep safely even with waitqueue interruptions
 * @msecs: Time in milliseconds to sleep for
 */
void msleep(unsigned int msecs)
{
        unsigned long timeout = msecs_to_jiffies(msecs) + 1;

        while (timeout)
                timeout = schedule_timeout_uninterruptible(timeout);
}
EXPORT_SYMBOL(msleep);

/**
 * msleep_interruptible - sleep waiting for signals
 * @msecs: Time in milliseconds to sleep for
 */
unsigned long msleep_interruptible(unsigned int msecs)
{
        unsigned long timeout = msecs_to_jiffies(msecs) + 1;

        while (timeout && !signal_pending(current))
                timeout = schedule_timeout_interruptible(timeout);
        return jiffies_to_msecs(timeout);
}
EXPORT_SYMBOL(msleep_interruptible);
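
/*
 * A minimal usage sketch, assuming a hypothetical retry loop that backs off
 * for 50ms per attempt and gives up early when a signal arrives (in which
 * case msleep_interruptible() returns the time not yet slept, in ms). The
 * function below is illustrative only.
 */
static bool __maybe_unused example_retry_backoff(void)
{
        /* Returns true when the full delay elapsed, false when interrupted. */
        return msleep_interruptible(50) == 0;
}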

/**
 * usleep_range_state - Sleep for an approximate time in a given state
 * @min: Minimum time in usecs to sleep
 * @max: Maximum time in usecs to sleep
 * @state: State in which the current task will be while sleeping
 *
 * In non-atomic context where the exact wakeup time is flexible, use
 * usleep_range_state() instead of udelay(). The sleep improves responsiveness
 * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces
 * power usage by allowing hrtimers to take advantage of an already-
 * scheduled interrupt instead of scheduling a new one just for this sleep.
 */
void __sched usleep_range_state(unsigned long min, unsigned long max,
                                unsigned int state)
{
        ktime_t exp = ktime_add_us(ktime_get(), min);
        u64 delta = (u64)(max - min) * NSEC_PER_USEC;

        for (;;) {
                __set_current_state(state);
                /* Do not return before the requested sleep time has elapsed */
                if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS))
                        break;
        }
}
EXPORT_SYMBOL(usleep_range_state);
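
/*
 * A minimal usage sketch, assuming a hypothetical poll loop that can
 * tolerate anywhere between 100 and 200 microseconds of delay per
 * iteration; the slack lets the hrtimer subsystem coalesce the wakeup with
 * an already scheduled interrupt. The function below is illustrative only.
 */
static void __maybe_unused example_poll_delay(void)
{
        /* Sleep uninterruptibly for roughly 100-200us. */
        usleep_range_state(100, 200, TASK_UNINTERRUPTIBLE);
}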