tree_plugin.h

  1. /* SPDX-License-Identifier: GPL-2.0+ */
  2. /*
  3. * Read-Copy Update mechanism for mutual exclusion (tree-based version)
  4. * Internal non-public definitions that provide either classic
  5. * or preemptible semantics.
  6. *
  7. * Copyright Red Hat, 2009
  8. * Copyright IBM Corporation, 2009
  9. *
  10. * Author: Ingo Molnar <[email protected]>
  11. * Paul E. McKenney <[email protected]>
  12. */
  13. #include "../locking/rtmutex_common.h"
  14. static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
  15. {
  16. /*
  17. * In order to read the offloaded state of an rdp in a safe
  18. * and stable way and prevent its value from being changed
  19. * under us, we must either hold the barrier mutex, the cpu
  20. * hotplug lock (read or write) or the nocb lock. Local
  21. * non-preemptible reads are also safe. NOCB kthreads and
  22. * timers have their own means of synchronization against the
  23. * offloaded state updaters.
  24. */
  25. RCU_LOCKDEP_WARN(
  26. !(lockdep_is_held(&rcu_state.barrier_mutex) ||
  27. (IS_ENABLED(CONFIG_HOTPLUG_CPU) && lockdep_is_cpus_held()) ||
  28. rcu_lockdep_is_held_nocb(rdp) ||
  29. (rdp == this_cpu_ptr(&rcu_data) &&
  30. !(IS_ENABLED(CONFIG_PREEMPT_COUNT) && preemptible())) ||
  31. rcu_current_is_nocb_kthread(rdp)),
  32. "Unsafe read of RCU_NOCB offloaded state"
  33. );
  34. return rcu_segcblist_is_offloaded(&rdp->cblist);
  35. }
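/*
 * Illustrative only: a remote updater wanting a stable answer for some
 * other CPU's rdp could bracket the check with the barrier mutex (one of
 * the options accepted by the lockdep assertion above), for example:
 *
 *	mutex_lock(&rcu_state.barrier_mutex);
 *	offloaded = rcu_rdp_is_offloaded(per_cpu_ptr(&rcu_data, cpu));
 *	...
 *	mutex_unlock(&rcu_state.barrier_mutex);
 *
 * whereas a CPU may sample its own rdp with just preemption disabled.
 */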
  36. /*
  37. * Check the RCU kernel configuration parameters and print informative
  38. * messages about anything out of the ordinary.
  39. */
  40. static void __init rcu_bootup_announce_oddness(void)
  41. {
  42. if (IS_ENABLED(CONFIG_RCU_TRACE))
  43. pr_info("\tRCU event tracing is enabled.\n");
  44. if ((IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 64) ||
  45. (!IS_ENABLED(CONFIG_64BIT) && RCU_FANOUT != 32))
  46. pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d.\n",
  47. RCU_FANOUT);
  48. if (rcu_fanout_exact)
  49. pr_info("\tHierarchical RCU autobalancing is disabled.\n");
  50. if (IS_ENABLED(CONFIG_PROVE_RCU))
  51. pr_info("\tRCU lockdep checking is enabled.\n");
  52. if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
  53. pr_info("\tRCU strict (and thus non-scalable) grace periods are enabled.\n");
  54. if (RCU_NUM_LVLS >= 4)
  55. pr_info("\tFour(or more)-level hierarchy is enabled.\n");
  56. if (RCU_FANOUT_LEAF != 16)
  57. pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
  58. RCU_FANOUT_LEAF);
  59. if (rcu_fanout_leaf != RCU_FANOUT_LEAF)
  60. pr_info("\tBoot-time adjustment of leaf fanout to %d.\n",
  61. rcu_fanout_leaf);
  62. if (nr_cpu_ids != NR_CPUS)
  63. pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%u.\n", NR_CPUS, nr_cpu_ids);
  64. #ifdef CONFIG_RCU_BOOST
  65. pr_info("\tRCU priority boosting: priority %d delay %d ms.\n",
  66. kthread_prio, CONFIG_RCU_BOOST_DELAY);
  67. #endif
  68. if (blimit != DEFAULT_RCU_BLIMIT)
  69. pr_info("\tBoot-time adjustment of callback invocation limit to %ld.\n", blimit);
  70. if (qhimark != DEFAULT_RCU_QHIMARK)
  71. pr_info("\tBoot-time adjustment of callback high-water mark to %ld.\n", qhimark);
  72. if (qlowmark != DEFAULT_RCU_QLOMARK)
  73. pr_info("\tBoot-time adjustment of callback low-water mark to %ld.\n", qlowmark);
  74. if (qovld != DEFAULT_RCU_QOVLD)
  75. pr_info("\tBoot-time adjustment of callback overload level to %ld.\n", qovld);
  76. if (jiffies_till_first_fqs != ULONG_MAX)
  77. pr_info("\tBoot-time adjustment of first FQS scan delay to %ld jiffies.\n", jiffies_till_first_fqs);
  78. if (jiffies_till_next_fqs != ULONG_MAX)
  79. pr_info("\tBoot-time adjustment of subsequent FQS scan delay to %ld jiffies.\n", jiffies_till_next_fqs);
  80. if (jiffies_till_sched_qs != ULONG_MAX)
  81. pr_info("\tBoot-time adjustment of scheduler-enlistment delay to %ld jiffies.\n", jiffies_till_sched_qs);
  82. if (rcu_kick_kthreads)
  83. pr_info("\tKick kthreads if too-long grace period.\n");
  84. if (IS_ENABLED(CONFIG_DEBUG_OBJECTS_RCU_HEAD))
  85. pr_info("\tRCU callback double-/use-after-free debug is enabled.\n");
  86. if (gp_preinit_delay)
  87. pr_info("\tRCU debug GP pre-init slowdown %d jiffies.\n", gp_preinit_delay);
  88. if (gp_init_delay)
  89. pr_info("\tRCU debug GP init slowdown %d jiffies.\n", gp_init_delay);
  90. if (gp_cleanup_delay)
  91. pr_info("\tRCU debug GP cleanup slowdown %d jiffies.\n", gp_cleanup_delay);
  92. if (!use_softirq)
  93. pr_info("\tRCU_SOFTIRQ processing moved to rcuc kthreads.\n");
  94. if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG))
  95. pr_info("\tRCU debug extended QS entry/exit.\n");
  96. rcupdate_announce_bootup_oddness();
  97. }
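/*
 * For example, most of the values checked above are rcutree module
 * parameters, so booting with something like
 * "rcutree.blimit=100 rcutree.qhimark=5000 rcutree.kthread_prio=2"
 * (illustrative values) results in the corresponding pr_info() lines
 * appearing in the boot log.
 */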
  98. #ifdef CONFIG_PREEMPT_RCU
  99. static void rcu_report_exp_rnp(struct rcu_node *rnp, bool wake);
  100. static void rcu_read_unlock_special(struct task_struct *t);
  101. /*
  102. * Tell them what RCU they are running.
  103. */
  104. static void __init rcu_bootup_announce(void)
  105. {
  106. pr_info("Preemptible hierarchical RCU implementation.\n");
  107. rcu_bootup_announce_oddness();
  108. }
  109. /* Flags for rcu_preempt_ctxt_queue() decision table. */
  110. #define RCU_GP_TASKS 0x8
  111. #define RCU_EXP_TASKS 0x4
  112. #define RCU_GP_BLKD 0x2
  113. #define RCU_EXP_BLKD 0x1
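/*
 * Worked example: a task preempted on a CPU whose bit is still set in
 * rnp->qsmask (so the CPU owes the current normal GP a quiescent state),
 * with no expedited GP in flight and no tasks yet queued, produces
 * blkd_state == RCU_GP_BLKD (0x2).  The switch statement below therefore
 * queues the task at the tail of ->blkd_tasks, and ->gp_tasks is then
 * set to reference the newly queued entry.
 */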
  114. /*
  115. * Queues a task preempted within an RCU-preempt read-side critical
  116. * section into the appropriate location within the ->blkd_tasks list,
  117. * depending on the states of any ongoing normal and expedited grace
  118. * periods. The ->gp_tasks pointer indicates which element the normal
  119. * grace period is waiting on (NULL if none), and the ->exp_tasks pointer
  120. * indicates which element the expedited grace period is waiting on (again,
  121. * NULL if none). If a grace period is waiting on a given element in the
  122. * ->blkd_tasks list, it also waits on all subsequent elements. Thus,
  123. * adding a task to the tail of the list blocks any grace period that is
  124. * already waiting on one of the elements. In contrast, adding a task
  125. * to the head of the list won't block any grace period that is already
  126. * waiting on one of the elements.
  127. *
  128. * This queuing is imprecise, and can sometimes make an ongoing grace
  129. * period wait for a task that is not strictly speaking blocking it.
  130. * Given the choice, we needlessly block a normal grace period rather than
  131. * blocking an expedited grace period.
  132. *
  133. * Note that an endless sequence of expedited grace periods still cannot
  134. * indefinitely postpone a normal grace period. Eventually, all of the
  135. * fixed number of preempted tasks blocking the normal grace period that are
  136. * not also blocking the expedited grace period will resume and complete
  137. * their RCU read-side critical sections. At that point, the ->gp_tasks
  138. * pointer will equal the ->exp_tasks pointer, at which point the end of
  139. * the corresponding expedited grace period will also be the end of the
  140. * normal grace period.
  141. */
  142. static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
  143. __releases(rnp->lock) /* But leaves interrupts disabled. */
  144. {
  145. int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) +
  146. (rnp->exp_tasks ? RCU_EXP_TASKS : 0) +
  147. (rnp->qsmask & rdp->grpmask ? RCU_GP_BLKD : 0) +
  148. (rnp->expmask & rdp->grpmask ? RCU_EXP_BLKD : 0);
  149. struct task_struct *t = current;
  150. raw_lockdep_assert_held_rcu_node(rnp);
  151. WARN_ON_ONCE(rdp->mynode != rnp);
  152. WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
  153. /* RCU better not be waiting on newly onlined CPUs! */
  154. WARN_ON_ONCE(rnp->qsmaskinitnext & ~rnp->qsmaskinit & rnp->qsmask &
  155. rdp->grpmask);
  156. /*
  157. * Decide where to queue the newly blocked task. In theory,
  158. * this could be an if-statement. In practice, when I tried
  159. * that, it was quite messy.
  160. */
  161. switch (blkd_state) {
  162. case 0:
  163. case RCU_EXP_TASKS:
  164. case RCU_EXP_TASKS + RCU_GP_BLKD:
  165. case RCU_GP_TASKS:
  166. case RCU_GP_TASKS + RCU_EXP_TASKS:
  167. /*
  168. * Blocking neither GP, or first task blocking the normal
  169. * GP but not blocking the already-waiting expedited GP.
  170. * Queue at the head of the list to avoid unnecessarily
  171. * blocking the already-waiting GPs.
  172. */
  173. list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
  174. break;
  175. case RCU_EXP_BLKD:
  176. case RCU_GP_BLKD:
  177. case RCU_GP_BLKD + RCU_EXP_BLKD:
  178. case RCU_GP_TASKS + RCU_EXP_BLKD:
  179. case RCU_GP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
  180. case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
  181. /*
  182. * First task arriving that blocks either GP, or first task
  183. * arriving that blocks the expedited GP (with the normal
  184. * GP already waiting), or a task arriving that blocks
  185. * both GPs with both GPs already waiting. Queue at the
  186. * tail of the list to avoid any GP waiting on any of the
  187. * already queued tasks that are not blocking it.
  188. */
  189. list_add_tail(&t->rcu_node_entry, &rnp->blkd_tasks);
  190. break;
  191. case RCU_EXP_TASKS + RCU_EXP_BLKD:
  192. case RCU_EXP_TASKS + RCU_GP_BLKD + RCU_EXP_BLKD:
  193. case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_EXP_BLKD:
  194. /*
  195. * Second or subsequent task blocking the expedited GP.
  196. * The task either does not block the normal GP, or is the
  197. * first task blocking the normal GP. Queue just after
  198. * the first task blocking the expedited GP.
  199. */
  200. list_add(&t->rcu_node_entry, rnp->exp_tasks);
  201. break;
  202. case RCU_GP_TASKS + RCU_GP_BLKD:
  203. case RCU_GP_TASKS + RCU_EXP_TASKS + RCU_GP_BLKD:
  204. /*
  205. * Second or subsequent task blocking the normal GP.
  206. * The task does not block the expedited GP. Queue just
  207. * after the first task blocking the normal GP.
  208. */
  209. list_add(&t->rcu_node_entry, rnp->gp_tasks);
  210. break;
  211. default:
  212. /* Yet another exercise in excessive paranoia. */
  213. WARN_ON_ONCE(1);
  214. break;
  215. }
  216. /*
  217. * We have now queued the task. If it was the first one to
  218. * block either grace period, update the ->gp_tasks and/or
  219. * ->exp_tasks pointers, respectively, to reference the newly
  220. * blocked tasks.
  221. */
  222. if (!rnp->gp_tasks && (blkd_state & RCU_GP_BLKD)) {
  223. WRITE_ONCE(rnp->gp_tasks, &t->rcu_node_entry);
  224. WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq);
  225. }
  226. if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
  227. WRITE_ONCE(rnp->exp_tasks, &t->rcu_node_entry);
  228. WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
  229. !(rnp->qsmask & rdp->grpmask));
  230. WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
  231. !(rnp->expmask & rdp->grpmask));
  232. raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
  233. /*
  234. * Report the quiescent state for the expedited GP. This expedited
  235. * GP should not be able to end until we report, so there should be
  236. * no need to check for a subsequent expedited GP. (Though we are
  237. * still in a quiescent state in any case.)
  238. *
  239. * Interrupts are disabled, so ->cpu_no_qs.b.exp cannot change.
  240. */
  241. if (blkd_state & RCU_EXP_BLKD && rdp->cpu_no_qs.b.exp)
  242. rcu_report_exp_rdp(rdp);
  243. else
  244. WARN_ON_ONCE(rdp->cpu_no_qs.b.exp);
  245. }
  246. /*
  247. * Record a preemptible-RCU quiescent state for the specified CPU.
  248. * Note that this does not necessarily mean that the task currently running
  249. * on the CPU is in a quiescent state: Instead, it means that the current
  250. * grace period need not wait on any RCU read-side critical section that
  251. * starts later on this CPU. It also means that if the current task is
  252. * in an RCU read-side critical section, it has already added itself to
  253. * some leaf rcu_node structure's ->blkd_tasks list. In addition to the
  254. * current task, there might be any number of other tasks blocked while
  255. * in an RCU read-side critical section.
  256. *
  257. * Unlike non-preemptible-RCU, quiescent state reports for expedited
  258. * grace periods are handled separately via deferred quiescent states
  259. * and context switch events.
  260. *
  261. * Callers to this function must disable preemption.
  262. */
  263. static void rcu_qs(void)
  264. {
  265. RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!\n");
  266. if (__this_cpu_read(rcu_data.cpu_no_qs.b.norm)) {
  267. trace_rcu_grace_period(TPS("rcu_preempt"),
  268. __this_cpu_read(rcu_data.gp_seq),
  269. TPS("cpuqs"));
  270. __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
  271. barrier(); /* Coordinate with rcu_flavor_sched_clock_irq(). */
  272. WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, false);
  273. }
  274. }
  275. /*
  276. * We have entered the scheduler, and the current task might soon be
  277. * context-switched away from. If this task is in an RCU read-side
  278. * critical section, we will no longer be able to rely on the CPU to
  279. * record that fact, so we enqueue the task on the blkd_tasks list.
  280. * The task will dequeue itself when it exits the outermost enclosing
  281. * RCU read-side critical section. Therefore, the current grace period
  282. * cannot be permitted to complete until the blkd_tasks list entries
  283. * predating the current grace period drain, in other words, until
  284. * rnp->gp_tasks becomes NULL.
  285. *
  286. * Caller must disable interrupts.
  287. */
  288. void rcu_note_context_switch(bool preempt)
  289. {
  290. struct task_struct *t = current;
  291. struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
  292. struct rcu_node *rnp;
  293. trace_rcu_utilization(TPS("Start context switch"));
  294. lockdep_assert_irqs_disabled();
  295. WARN_ONCE(!preempt && rcu_preempt_depth() > 0, "Voluntary context switch within RCU read-side critical section!");
  296. if (rcu_preempt_depth() > 0 &&
  297. !t->rcu_read_unlock_special.b.blocked) {
  298. /* Possibly blocking in an RCU read-side critical section. */
  299. rnp = rdp->mynode;
  300. raw_spin_lock_rcu_node(rnp);
  301. t->rcu_read_unlock_special.b.blocked = true;
  302. t->rcu_blocked_node = rnp;
  303. /*
  304. * Verify the CPU's sanity, trace the preemption, and
  305. * then queue the task as required based on the states
  306. * of any ongoing and expedited grace periods.
  307. */
  308. WARN_ON_ONCE(!rcu_rdp_cpu_online(rdp));
  309. WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
  310. trace_rcu_preempt_task(rcu_state.name,
  311. t->pid,
  312. (rnp->qsmask & rdp->grpmask)
  313. ? rnp->gp_seq
  314. : rcu_seq_snap(&rnp->gp_seq));
  315. rcu_preempt_ctxt_queue(rnp, rdp);
  316. } else {
  317. rcu_preempt_deferred_qs(t);
  318. }
  319. /*
  320. * Either we were not in an RCU read-side critical section to
  321. * begin with, or we have now recorded that critical section
  322. * globally. Either way, we can now note a quiescent state
  323. * for this CPU. Again, if we were in an RCU read-side critical
  324. * section, and if that critical section was blocking the current
  325. * grace period, then the fact that the task has been enqueued
  326. * means that we continue to block the current grace period.
  327. */
  328. rcu_qs();
  329. if (rdp->cpu_no_qs.b.exp)
  330. rcu_report_exp_rdp(rdp);
  331. rcu_tasks_qs(current, preempt);
  332. trace_rcu_utilization(TPS("End context switch"));
  333. }
  334. EXPORT_SYMBOL_GPL(rcu_note_context_switch);
  335. /*
  336. * Check for preempted RCU readers blocking the current grace period
  337. * for the specified rcu_node structure. If the caller needs a reliable
  338. * answer, it must hold the rcu_node's ->lock.
  339. */
  340. static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
  341. {
  342. return READ_ONCE(rnp->gp_tasks) != NULL;
  343. }
  344. /* limit value for ->rcu_read_lock_nesting. */
  345. #define RCU_NEST_PMAX (INT_MAX / 2)
  346. static void rcu_preempt_read_enter(void)
  347. {
  348. WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1);
  349. }
  350. static int rcu_preempt_read_exit(void)
  351. {
  352. int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1;
  353. WRITE_ONCE(current->rcu_read_lock_nesting, ret);
  354. return ret;
  355. }
  356. static void rcu_preempt_depth_set(int val)
  357. {
  358. WRITE_ONCE(current->rcu_read_lock_nesting, val);
  359. }
  360. /*
  361. * Preemptible RCU implementation for rcu_read_lock().
  362. * Just increment ->rcu_read_lock_nesting, shared state will be updated
  363. * if we block.
  364. */
  365. void __rcu_read_lock(void)
  366. {
  367. rcu_preempt_read_enter();
  368. if (IS_ENABLED(CONFIG_PROVE_LOCKING))
  369. WARN_ON_ONCE(rcu_preempt_depth() > RCU_NEST_PMAX);
  370. if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) && rcu_state.gp_kthread)
  371. WRITE_ONCE(current->rcu_read_unlock_special.b.need_qs, true);
  372. barrier(); /* critical section after entry code. */
  373. }
  374. EXPORT_SYMBOL_GPL(__rcu_read_lock);
  375. /*
  376. * Preemptible RCU implementation for rcu_read_unlock().
  377. * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
  378. * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
  379. * invoke rcu_read_unlock_special() to clean up after a context switch
  380. * in an RCU read-side critical section and other special cases.
  381. */
  382. void __rcu_read_unlock(void)
  383. {
  384. struct task_struct *t = current;
  385. barrier(); // critical section before exit code.
  386. if (rcu_preempt_read_exit() == 0) {
  387. barrier(); // critical-section exit before .s check.
  388. if (unlikely(READ_ONCE(t->rcu_read_unlock_special.s)))
  389. rcu_read_unlock_special(t);
  390. }
  391. if (IS_ENABLED(CONFIG_PROVE_LOCKING)) {
  392. int rrln = rcu_preempt_depth();
  393. WARN_ON_ONCE(rrln < 0 || rrln > RCU_NEST_PMAX);
  394. }
  395. }
  396. EXPORT_SYMBOL_GPL(__rcu_read_unlock);
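/*
 * For reference, the common reader pattern reaches the two functions
 * above via the rcu_read_lock()/rcu_read_unlock() wrappers in
 * CONFIG_PREEMPT_RCU kernels (illustrative sketch, do_something_with()
 * is made up):
 *
 *	rcu_read_lock();		// -> __rcu_read_lock()
 *	p = rcu_dereference(gp);
 *	if (p)
 *		do_something_with(p);
 *	rcu_read_unlock();		// -> __rcu_read_unlock()
 *
 * Only the outermost rcu_read_unlock() can take the
 * rcu_read_unlock_special() slow path.
 */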
  397. /*
  398. * Advance a ->blkd_tasks-list pointer to the next entry, instead
  399. * returning NULL if at the end of the list.
  400. */
  401. static struct list_head *rcu_next_node_entry(struct task_struct *t,
  402. struct rcu_node *rnp)
  403. {
  404. struct list_head *np;
  405. np = t->rcu_node_entry.next;
  406. if (np == &rnp->blkd_tasks)
  407. np = NULL;
  408. return np;
  409. }
  410. /*
  411. * Return true if the specified rcu_node structure has tasks that were
  412. * preempted within an RCU read-side critical section.
  413. */
  414. static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
  415. {
  416. return !list_empty(&rnp->blkd_tasks);
  417. }
  418. /*
  419. * Report deferred quiescent states. The deferral time can
  420. * be quite short, for example, in the case of the call from
  421. * rcu_read_unlock_special().
  422. */
  423. static notrace void
  424. rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
  425. {
  426. bool empty_exp;
  427. bool empty_norm;
  428. bool empty_exp_now;
  429. struct list_head *np;
  430. bool drop_boost_mutex = false;
  431. struct rcu_data *rdp;
  432. struct rcu_node *rnp;
  433. union rcu_special special;
  434. /*
  435. * If RCU core is waiting for this CPU to exit its critical section,
  436. * report the fact that it has exited. Because irqs are disabled,
  437. * t->rcu_read_unlock_special cannot change.
  438. */
  439. special = t->rcu_read_unlock_special;
  440. rdp = this_cpu_ptr(&rcu_data);
  441. if (!special.s && !rdp->cpu_no_qs.b.exp) {
  442. local_irq_restore(flags);
  443. return;
  444. }
  445. t->rcu_read_unlock_special.s = 0;
  446. if (special.b.need_qs) {
  447. if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) {
  448. rdp->cpu_no_qs.b.norm = false;
  449. rcu_report_qs_rdp(rdp);
  450. udelay(rcu_unlock_delay);
  451. } else {
  452. rcu_qs();
  453. }
  454. }
  455. /*
  456. * Respond to a request by an expedited grace period for a
  457. * quiescent state from this CPU. Note that requests from
  458. * tasks are handled when removing the task from the
  459. * blocked-tasks list below.
  460. */
  461. if (rdp->cpu_no_qs.b.exp)
  462. rcu_report_exp_rdp(rdp);
  463. /* Clean up if blocked during RCU read-side critical section. */
  464. if (special.b.blocked) {
  465. /*
  466. * Remove this task from the list it blocked on. The task
  467. * now remains queued on the rcu_node corresponding to the
  468. * CPU it first blocked on, so there is no longer any need
  469. * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia.
  470. */
  471. rnp = t->rcu_blocked_node;
  472. raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
  473. WARN_ON_ONCE(rnp != t->rcu_blocked_node);
  474. WARN_ON_ONCE(!rcu_is_leaf_node(rnp));
  475. empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
  476. WARN_ON_ONCE(rnp->completedqs == rnp->gp_seq &&
  477. (!empty_norm || rnp->qsmask));
  478. empty_exp = sync_rcu_exp_done(rnp);
  479. smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
  480. np = rcu_next_node_entry(t, rnp);
  481. list_del_init(&t->rcu_node_entry);
  482. t->rcu_blocked_node = NULL;
  483. trace_rcu_unlock_preempted_task(TPS("rcu_preempt"),
  484. rnp->gp_seq, t->pid);
  485. if (&t->rcu_node_entry == rnp->gp_tasks)
  486. WRITE_ONCE(rnp->gp_tasks, np);
  487. if (&t->rcu_node_entry == rnp->exp_tasks)
  488. WRITE_ONCE(rnp->exp_tasks, np);
  489. if (IS_ENABLED(CONFIG_RCU_BOOST)) {
  490. /* Snapshot ->boost_mtx ownership w/rnp->lock held. */
  491. drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx.rtmutex) == t;
  492. if (&t->rcu_node_entry == rnp->boost_tasks)
  493. WRITE_ONCE(rnp->boost_tasks, np);
  494. }
  495. /*
  496. * If this was the last task on the current list, and if
  497. * we aren't waiting on any CPUs, report the quiescent state.
  498. * Note that rcu_report_unblock_qs_rnp() releases rnp->lock,
  499. * so we must take a snapshot of the expedited state.
  500. */
  501. empty_exp_now = sync_rcu_exp_done(rnp);
  502. if (!empty_norm && !rcu_preempt_blocked_readers_cgp(rnp)) {
  503. trace_rcu_quiescent_state_report(TPS("preempt_rcu"),
  504. rnp->gp_seq,
  505. 0, rnp->qsmask,
  506. rnp->level,
  507. rnp->grplo,
  508. rnp->grphi,
  509. !!rnp->gp_tasks);
  510. rcu_report_unblock_qs_rnp(rnp, flags);
  511. } else {
  512. raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
  513. }
  514. /*
  515. * If this was the last task on the expedited lists,
  516. * then we need to report up the rcu_node hierarchy.
  517. */
  518. if (!empty_exp && empty_exp_now)
  519. rcu_report_exp_rnp(rnp, true);
  520. /* Unboost if we were boosted. */
  521. if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
  522. rt_mutex_futex_unlock(&rnp->boost_mtx.rtmutex);
  523. } else {
  524. local_irq_restore(flags);
  525. }
  526. }
  527. /*
  528. * Is a deferred quiescent-state pending, and are we also not in
  529. * an RCU read-side critical section? It is the caller's responsibility
  530. * to ensure it is otherwise safe to report any deferred quiescent
  531. * states. The reason for this is that it is safe to report a
  532. * quiescent state during context switch even though preemption
  533. * is disabled. This function cannot be expected to understand these
  534. * nuances, so the caller must handle them.
  535. */
  536. static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
  537. {
  538. return (__this_cpu_read(rcu_data.cpu_no_qs.b.exp) ||
  539. READ_ONCE(t->rcu_read_unlock_special.s)) &&
  540. rcu_preempt_depth() == 0;
  541. }
  542. /*
  543. * Report a deferred quiescent state if needed and safe to do so.
  544. * As with rcu_preempt_need_deferred_qs(), "safe" involves only
  545. * not being in an RCU read-side critical section. The caller must
  546. * evaluate safety in terms of interrupt, softirq, and preemption
  547. * disabling.
  548. */
  549. notrace void rcu_preempt_deferred_qs(struct task_struct *t)
  550. {
  551. unsigned long flags;
  552. if (!rcu_preempt_need_deferred_qs(t))
  553. return;
  554. local_irq_save(flags);
  555. rcu_preempt_deferred_qs_irqrestore(t, flags);
  556. }
  557. /*
  558. * Minimal handler to give the scheduler a chance to re-evaluate.
  559. */
  560. static void rcu_preempt_deferred_qs_handler(struct irq_work *iwp)
  561. {
  562. struct rcu_data *rdp;
  563. rdp = container_of(iwp, struct rcu_data, defer_qs_iw);
  564. rdp->defer_qs_iw_pending = false;
  565. }
  566. /*
  567. * Handle special cases during rcu_read_unlock(), such as needing to
  568. * notify RCU core processing or task having blocked during the RCU
  569. * read-side critical section.
  570. */
  571. static void rcu_read_unlock_special(struct task_struct *t)
  572. {
  573. unsigned long flags;
  574. bool irqs_were_disabled;
  575. bool preempt_bh_were_disabled =
  576. !!(preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK));
  577. /* NMI handlers cannot block and cannot safely manipulate state. */
  578. if (in_nmi())
  579. return;
  580. local_irq_save(flags);
  581. irqs_were_disabled = irqs_disabled_flags(flags);
  582. if (preempt_bh_were_disabled || irqs_were_disabled) {
  583. bool expboost; // Expedited GP in flight or possible boosting.
  584. struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
  585. struct rcu_node *rnp = rdp->mynode;
  586. expboost = (t->rcu_blocked_node && READ_ONCE(t->rcu_blocked_node->exp_tasks)) ||
  587. (rdp->grpmask & READ_ONCE(rnp->expmask)) ||
  588. (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
  589. ((rdp->grpmask & READ_ONCE(rnp->qsmask)) || t->rcu_blocked_node)) ||
  590. (IS_ENABLED(CONFIG_RCU_BOOST) && irqs_were_disabled &&
  591. t->rcu_blocked_node);
  592. // Need to defer quiescent state until everything is enabled.
  593. if (use_softirq && (in_hardirq() || (expboost && !irqs_were_disabled))) {
  594. // Using softirq, safe to awaken, and either the
  595. // wakeup is free or there is either an expedited
  596. // GP in flight or a potential need to deboost.
  597. raise_softirq_irqoff(RCU_SOFTIRQ);
  598. } else {
  599. // Enabling BH or preempt does reschedule, so...
  600. // Also if no expediting and no possible deboosting,
  601. // slow is OK. Plus nohz_full CPUs eventually get
  602. // tick enabled.
  603. set_tsk_need_resched(current);
  604. set_preempt_need_resched();
  605. if (IS_ENABLED(CONFIG_IRQ_WORK) && irqs_were_disabled &&
  606. expboost && !rdp->defer_qs_iw_pending && cpu_online(rdp->cpu)) {
  607. // Get scheduler to re-evaluate and call hooks.
  608. // If !IRQ_WORK, FQS scan will eventually IPI.
  609. if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD) &&
  610. IS_ENABLED(CONFIG_PREEMPT_RT))
  611. rdp->defer_qs_iw = IRQ_WORK_INIT_HARD(
  612. rcu_preempt_deferred_qs_handler);
  613. else
  614. init_irq_work(&rdp->defer_qs_iw,
  615. rcu_preempt_deferred_qs_handler);
  616. rdp->defer_qs_iw_pending = true;
  617. irq_work_queue_on(&rdp->defer_qs_iw, rdp->cpu);
  618. }
  619. }
  620. local_irq_restore(flags);
  621. return;
  622. }
  623. rcu_preempt_deferred_qs_irqrestore(t, flags);
  624. }
  625. /*
  626. * Check that the list of blocked tasks for the newly completed grace
  627. * period is in fact empty. It is a serious bug to complete a grace
  628. * period that still has RCU readers blocked! This function must be
  629. * invoked -before- updating this rnp's ->gp_seq.
  630. *
  631. * Also, if there are blocked tasks on the list, they automatically
  632. * block the newly created grace period, so set up ->gp_tasks accordingly.
  633. */
  634. static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
  635. {
  636. struct task_struct *t;
  637. RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
  638. raw_lockdep_assert_held_rcu_node(rnp);
  639. if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)))
  640. dump_blkd_tasks(rnp, 10);
  641. if (rcu_preempt_has_tasks(rnp) &&
  642. (rnp->qsmaskinit || rnp->wait_blkd_tasks)) {
  643. WRITE_ONCE(rnp->gp_tasks, rnp->blkd_tasks.next);
  644. t = container_of(rnp->gp_tasks, struct task_struct,
  645. rcu_node_entry);
  646. trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
  647. rnp->gp_seq, t->pid);
  648. }
  649. WARN_ON_ONCE(rnp->qsmask);
  650. }
  651. /*
  652. * Check for a quiescent state from the current CPU, including voluntary
  653. * context switches for Tasks RCU. When a task blocks, the task is
  654. * recorded in the corresponding CPU's rcu_node structure, which is checked
  655. * elsewhere, hence this function need only check for quiescent states
  656. * related to the current CPU, not to those related to tasks.
  657. */
  658. static void rcu_flavor_sched_clock_irq(int user)
  659. {
  660. struct task_struct *t = current;
  661. lockdep_assert_irqs_disabled();
  662. if (rcu_preempt_depth() > 0 ||
  663. (preempt_count() & (PREEMPT_MASK | SOFTIRQ_MASK))) {
  664. /* No QS, force context switch if deferred. */
  665. if (rcu_preempt_need_deferred_qs(t)) {
  666. set_tsk_need_resched(t);
  667. set_preempt_need_resched();
  668. }
  669. } else if (rcu_preempt_need_deferred_qs(t)) {
  670. rcu_preempt_deferred_qs(t); /* Report deferred QS. */
  671. return;
  672. } else if (!WARN_ON_ONCE(rcu_preempt_depth())) {
  673. rcu_qs(); /* Report immediate QS. */
  674. return;
  675. }
  676. /* If GP is oldish, ask for help from rcu_read_unlock_special(). */
  677. if (rcu_preempt_depth() > 0 &&
  678. __this_cpu_read(rcu_data.core_needs_qs) &&
  679. __this_cpu_read(rcu_data.cpu_no_qs.b.norm) &&
  680. !t->rcu_read_unlock_special.b.need_qs &&
  681. time_after(jiffies, rcu_state.gp_start + HZ))
  682. t->rcu_read_unlock_special.b.need_qs = true;
  683. }
  684. /*
  685. * Check for a task exiting while in a preemptible-RCU read-side
  686. * critical section, clean up if so. No need to issue warnings, as
  687. * debug_check_no_locks_held() already does this if lockdep is enabled.
  688. * Besides, if this function does anything other than just immediately
  689. * return, there was a bug of some sort. Spewing warnings from this
  690. * function is like as not to simply obscure important prior warnings.
  691. */
  692. void exit_rcu(void)
  693. {
  694. struct task_struct *t = current;
  695. if (unlikely(!list_empty(&current->rcu_node_entry))) {
  696. rcu_preempt_depth_set(1);
  697. barrier();
  698. WRITE_ONCE(t->rcu_read_unlock_special.b.blocked, true);
  699. } else if (unlikely(rcu_preempt_depth())) {
  700. rcu_preempt_depth_set(1);
  701. } else {
  702. return;
  703. }
  704. __rcu_read_unlock();
  705. rcu_preempt_deferred_qs(current);
  706. }
  707. /*
  708. * Dump the blocked-tasks state, but limit the list dump to the
  709. * specified number of elements.
  710. */
  711. static void
  712. dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
  713. {
  714. int cpu;
  715. int i;
  716. struct list_head *lhp;
  717. struct rcu_data *rdp;
  718. struct rcu_node *rnp1;
  719. raw_lockdep_assert_held_rcu_node(rnp);
  720. pr_info("%s: grp: %d-%d level: %d ->gp_seq %ld ->completedqs %ld\n",
  721. __func__, rnp->grplo, rnp->grphi, rnp->level,
  722. (long)READ_ONCE(rnp->gp_seq), (long)rnp->completedqs);
  723. for (rnp1 = rnp; rnp1; rnp1 = rnp1->parent)
  724. pr_info("%s: %d:%d ->qsmask %#lx ->qsmaskinit %#lx ->qsmaskinitnext %#lx\n",
  725. __func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext);
  726. pr_info("%s: ->gp_tasks %p ->boost_tasks %p ->exp_tasks %p\n",
  727. __func__, READ_ONCE(rnp->gp_tasks), data_race(rnp->boost_tasks),
  728. READ_ONCE(rnp->exp_tasks));
  729. pr_info("%s: ->blkd_tasks", __func__);
  730. i = 0;
  731. list_for_each(lhp, &rnp->blkd_tasks) {
  732. pr_cont(" %p", lhp);
  733. if (++i >= ncheck)
  734. break;
  735. }
  736. pr_cont("\n");
  737. for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) {
  738. rdp = per_cpu_ptr(&rcu_data, cpu);
  739. pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
  740. cpu, ".o"[rcu_rdp_cpu_online(rdp)],
  741. (long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
  742. (long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
  743. }
  744. }
  745. #else /* #ifdef CONFIG_PREEMPT_RCU */
  746. /*
  747. * If strict grace periods are enabled, and if the calling
  748. * __rcu_read_unlock() marks the beginning of a quiescent state, immediately
  749. * report that quiescent state and, if requested, spin for a bit.
  750. */
  751. void rcu_read_unlock_strict(void)
  752. {
  753. struct rcu_data *rdp;
  754. if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread)
  755. return;
  756. rdp = this_cpu_ptr(&rcu_data);
  757. rdp->cpu_no_qs.b.norm = false;
  758. rcu_report_qs_rdp(rdp);
  759. udelay(rcu_unlock_delay);
  760. }
  761. EXPORT_SYMBOL_GPL(rcu_read_unlock_strict);
  762. /*
  763. * Tell them what RCU they are running.
  764. */
  765. static void __init rcu_bootup_announce(void)
  766. {
  767. pr_info("Hierarchical RCU implementation.\n");
  768. rcu_bootup_announce_oddness();
  769. }
  770. /*
  771. * Note a quiescent state for PREEMPTION=n. Because we do not need to know
  772. * how many quiescent states passed, just if there was at least one since
  773. * the start of the grace period, this just sets a flag. The caller must
  774. * have disabled preemption.
  775. */
  776. static void rcu_qs(void)
  777. {
  778. RCU_LOCKDEP_WARN(preemptible(), "rcu_qs() invoked with preemption enabled!!!");
  779. if (!__this_cpu_read(rcu_data.cpu_no_qs.s))
  780. return;
  781. trace_rcu_grace_period(TPS("rcu_sched"),
  782. __this_cpu_read(rcu_data.gp_seq), TPS("cpuqs"));
  783. __this_cpu_write(rcu_data.cpu_no_qs.b.norm, false);
  784. if (__this_cpu_read(rcu_data.cpu_no_qs.b.exp))
  785. rcu_report_exp_rdp(this_cpu_ptr(&rcu_data));
  786. }
  787. /*
  788. * Register an urgently needed quiescent state. If there is an
  789. * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
  790. * dyntick-idle quiescent state visible to other CPUs, which will in
  791. * some cases serve for expedited as well as normal grace periods.
  792. * Either way, register a lightweight quiescent state.
  793. */
  794. void rcu_all_qs(void)
  795. {
  796. unsigned long flags;
  797. if (!raw_cpu_read(rcu_data.rcu_urgent_qs))
  798. return;
  799. preempt_disable(); // For CONFIG_PREEMPT_COUNT=y kernels
  800. /* Load rcu_urgent_qs before other flags. */
  801. if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs))) {
  802. preempt_enable();
  803. return;
  804. }
  805. this_cpu_write(rcu_data.rcu_urgent_qs, false);
  806. if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs))) {
  807. local_irq_save(flags);
  808. rcu_momentary_dyntick_idle();
  809. local_irq_restore(flags);
  810. }
  811. rcu_qs();
  812. preempt_enable();
  813. }
  814. EXPORT_SYMBOL_GPL(rcu_all_qs);
  815. /*
  816. * Note a PREEMPTION=n context switch. The caller must have disabled interrupts.
  817. */
  818. void rcu_note_context_switch(bool preempt)
  819. {
  820. trace_rcu_utilization(TPS("Start context switch"));
  821. rcu_qs();
  822. /* Load rcu_urgent_qs before other flags. */
  823. if (!smp_load_acquire(this_cpu_ptr(&rcu_data.rcu_urgent_qs)))
  824. goto out;
  825. this_cpu_write(rcu_data.rcu_urgent_qs, false);
  826. if (unlikely(raw_cpu_read(rcu_data.rcu_need_heavy_qs)))
  827. rcu_momentary_dyntick_idle();
  828. out:
  829. rcu_tasks_qs(current, preempt);
  830. trace_rcu_utilization(TPS("End context switch"));
  831. }
  832. EXPORT_SYMBOL_GPL(rcu_note_context_switch);
  833. /*
  834. * Because preemptible RCU does not exist, there are never any preempted
  835. * RCU readers.
  836. */
  837. static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
  838. {
  839. return 0;
  840. }
  841. /*
  842. * Because there is no preemptible RCU, there can be no readers blocked.
  843. */
  844. static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
  845. {
  846. return false;
  847. }
  848. /*
  849. * Because there is no preemptible RCU, there can be no deferred quiescent
  850. * states.
  851. */
  852. static notrace bool rcu_preempt_need_deferred_qs(struct task_struct *t)
  853. {
  854. return false;
  855. }
  856. // Except that we do need to respond to a request by an expedited
  857. // grace period for a quiescent state from this CPU. Note that in
  858. // non-preemptible kernels, there can be no context switches within RCU
  859. // read-side critical sections, which in turn means that the leaf rcu_node
  860. // structure's blocked-tasks list is always empty. There is therefore no
  861. // need to actually check it. Instead, a quiescent state from this CPU
  862. // suffices, and this function is only called from such a quiescent state.
  863. notrace void rcu_preempt_deferred_qs(struct task_struct *t)
  864. {
  865. struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
  866. if (READ_ONCE(rdp->cpu_no_qs.b.exp))
  867. rcu_report_exp_rdp(rdp);
  868. }
  869. /*
  870. * Because there is no preemptible RCU, there can be no readers blocked,
  871. * so there is no need to check for blocked tasks. So check only for
  872. * bogus qsmask values.
  873. */
  874. static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
  875. {
  876. WARN_ON_ONCE(rnp->qsmask);
  877. }
  878. /*
  879. * Check to see if this CPU is in a non-context-switch quiescent state,
  880. * namely user mode and idle loop.
  881. */
  882. static void rcu_flavor_sched_clock_irq(int user)
  883. {
  884. if (user || rcu_is_cpu_rrupt_from_idle()) {
  885. /*
  886. * Get here if this CPU took its interrupt from user
  887. * mode or from the idle loop, and if this is not a
  888. * nested interrupt. In this case, the CPU is in
  889. * a quiescent state, so note it.
  890. *
  891. * No memory barrier is required here because rcu_qs()
  892. * references only CPU-local variables that other CPUs
  893. * neither access nor modify, at least not while the
  894. * corresponding CPU is online.
  895. */
  896. rcu_qs();
  897. }
  898. }
  899. /*
  900. * Because preemptible RCU does not exist, tasks cannot possibly exit
  901. * while in preemptible RCU read-side critical sections.
  902. */
  903. void exit_rcu(void)
  904. {
  905. }
  906. /*
  907. * Dump the guaranteed-empty blocked-tasks state. Trust but verify.
  908. */
  909. static void
  910. dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
  911. {
  912. WARN_ON_ONCE(!list_empty(&rnp->blkd_tasks));
  913. }
  914. #endif /* #else #ifdef CONFIG_PREEMPT_RCU */
  915. /*
  916. * If boosting, set rcuc kthreads to realtime priority.
  917. */
  918. static void rcu_cpu_kthread_setup(unsigned int cpu)
  919. {
  920. struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
  921. #ifdef CONFIG_RCU_BOOST
  922. struct sched_param sp;
  923. sp.sched_priority = kthread_prio;
  924. sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
  925. #endif /* #ifdef CONFIG_RCU_BOOST */
  926. WRITE_ONCE(rdp->rcuc_activity, jiffies);
  927. }
  928. static bool rcu_is_callbacks_nocb_kthread(struct rcu_data *rdp)
  929. {
  930. #ifdef CONFIG_RCU_NOCB_CPU
  931. return rdp->nocb_cb_kthread == current;
  932. #else
  933. return false;
  934. #endif
  935. }
  936. /*
  937. * Is the current CPU running the RCU-callbacks kthread?
  938. * Caller must have preemption disabled.
  939. */
  940. static bool rcu_is_callbacks_kthread(struct rcu_data *rdp)
  941. {
  942. return rdp->rcu_cpu_kthread_task == current ||
  943. rcu_is_callbacks_nocb_kthread(rdp);
  944. }
  945. #ifdef CONFIG_RCU_BOOST
  946. /*
  947. * Carry out RCU priority boosting on the task indicated by ->exp_tasks
  948. * or ->boost_tasks, advancing the pointer to the next task in the
  949. * ->blkd_tasks list.
  950. *
  951. * Note that irqs must be enabled: boosting the task can block.
  952. * Returns 1 if there are more tasks needing to be boosted.
  953. */
  954. static int rcu_boost(struct rcu_node *rnp)
  955. {
  956. unsigned long flags;
  957. struct task_struct *t;
  958. struct list_head *tb;
  959. if (READ_ONCE(rnp->exp_tasks) == NULL &&
  960. READ_ONCE(rnp->boost_tasks) == NULL)
  961. return 0; /* Nothing left to boost. */
  962. raw_spin_lock_irqsave_rcu_node(rnp, flags);
  963. /*
  964. * Recheck under the lock: all tasks in need of boosting
  965. * might exit their RCU read-side critical sections on their own.
  966. */
  967. if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
  968. raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
  969. return 0;
  970. }
  971. /*
  972. * Preferentially boost tasks blocking expedited grace periods.
  973. * This cannot starve the normal grace periods because a second
  974. * expedited grace period must boost all blocked tasks, including
  975. * those blocking the pre-existing normal grace period.
  976. */
  977. if (rnp->exp_tasks != NULL)
  978. tb = rnp->exp_tasks;
  979. else
  980. tb = rnp->boost_tasks;
  981. /*
  982. * We boost task t by manufacturing an rt_mutex that appears to
  983. * be held by task t. We leave a pointer to that rt_mutex where
  984. * task t can find it, and task t will release the mutex when it
  985. * exits its outermost RCU read-side critical section. Then
  986. * simply acquiring this artificial rt_mutex will boost task
  987. * t's priority. (Thanks to tglx for suggesting this approach!)
  988. *
  989. * Note that task t must acquire rnp->lock to remove itself from
  990. * the ->blkd_tasks list, which it will do from exit() if from
  991. * nowhere else. We therefore are guaranteed that task t will
  992. * stay around at least until we drop rnp->lock. Note that
  993. * rnp->lock also resolves races between our priority boosting
  994. * and task t's exiting its outermost RCU read-side critical
  995. * section.
  996. */
  997. t = container_of(tb, struct task_struct, rcu_node_entry);
  998. rt_mutex_init_proxy_locked(&rnp->boost_mtx.rtmutex, t);
  999. raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
  1000. /* Lock only for side effect: boosts task t's priority. */
  1001. rt_mutex_lock(&rnp->boost_mtx);
  1002. rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
  1003. rnp->n_boosts++;
  1004. return READ_ONCE(rnp->exp_tasks) != NULL ||
  1005. READ_ONCE(rnp->boost_tasks) != NULL;
  1006. }
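/*
 * Boosting timeline, by way of example: the boost kthread (SCHED_FIFO at
 * kthread_prio, see rcu_spawn_one_boost_kthread()) blocks in the
 * rt_mutex_lock() above, and priority inheritance raises the preempted
 * reader t to kthread_prio.  When t reaches its outermost
 * rcu_read_unlock(), rcu_preempt_deferred_qs_irqrestore() has it drop
 * ->boost_mtx via rt_mutex_futex_unlock(), deboosting t and allowing the
 * rt_mutex_lock() above to return.
 */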
  1007. /*
  1008. * Priority-boosting kthread, one per leaf rcu_node.
  1009. */
  1010. static int rcu_boost_kthread(void *arg)
  1011. {
  1012. struct rcu_node *rnp = (struct rcu_node *)arg;
  1013. int spincnt = 0;
  1014. int more2boost;
  1015. trace_rcu_utilization(TPS("Start boost kthread@init"));
  1016. for (;;) {
  1017. WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_WAITING);
  1018. trace_rcu_utilization(TPS("End boost kthread@rcu_wait"));
  1019. rcu_wait(READ_ONCE(rnp->boost_tasks) ||
  1020. READ_ONCE(rnp->exp_tasks));
  1021. trace_rcu_utilization(TPS("Start boost kthread@rcu_wait"));
  1022. WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_RUNNING);
  1023. more2boost = rcu_boost(rnp);
  1024. if (more2boost)
  1025. spincnt++;
  1026. else
  1027. spincnt = 0;
  1028. if (spincnt > 10) {
  1029. WRITE_ONCE(rnp->boost_kthread_status, RCU_KTHREAD_YIELDING);
  1030. trace_rcu_utilization(TPS("End boost kthread@rcu_yield"));
  1031. schedule_timeout_idle(2);
  1032. trace_rcu_utilization(TPS("Start boost kthread@rcu_yield"));
  1033. spincnt = 0;
  1034. }
  1035. }
  1036. /* NOTREACHED */
  1037. trace_rcu_utilization(TPS("End boost kthread@notreached"));
  1038. return 0;
  1039. }
  1040. /*
  1041. * Check to see if it is time to start boosting RCU readers that are
  1042. * blocking the current grace period, and, if so, tell the per-rcu_node
  1043. * kthread to start boosting them. If there is an expedited grace
  1044. * period in progress, it is always time to boost.
  1045. *
  1046. * The caller must hold rnp->lock, which this function releases.
  1047. * The ->boost_kthread_task is immortal, so we don't need to worry
  1048. * about it going away.
  1049. */
  1050. static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
  1051. __releases(rnp->lock)
  1052. {
  1053. raw_lockdep_assert_held_rcu_node(rnp);
  1054. if (!rnp->boost_kthread_task ||
  1055. (!rcu_preempt_blocked_readers_cgp(rnp) && !rnp->exp_tasks)) {
  1056. raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
  1057. return;
  1058. }
  1059. if (rnp->exp_tasks != NULL ||
  1060. (rnp->gp_tasks != NULL &&
  1061. rnp->boost_tasks == NULL &&
  1062. rnp->qsmask == 0 &&
  1063. (!time_after(rnp->boost_time, jiffies) || rcu_state.cbovld ||
  1064. IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)))) {
  1065. if (rnp->exp_tasks == NULL)
  1066. WRITE_ONCE(rnp->boost_tasks, rnp->gp_tasks);
  1067. raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
  1068. rcu_wake_cond(rnp->boost_kthread_task,
  1069. READ_ONCE(rnp->boost_kthread_status));
  1070. } else {
  1071. raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
  1072. }
  1073. }
  1074. #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
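/*
 * For example, with the Kconfig default CONFIG_RCU_BOOST_DELAY=500 and
 * HZ=1000, this works out to DIV_ROUND_UP(500 * 1000, 1000) = 500
 * jiffies, so boosting is not initiated until the grace period is about
 * half a second old (subject to the other conditions checked in
 * rcu_initiate_boost()).
 */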
  1075. /*
  1076. * Do priority-boost accounting for the start of a new grace period.
  1077. */
  1078. static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
  1079. {
  1080. rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
  1081. }
  1082. /*
  1083. * Create an RCU-boost kthread for the specified node if one does not
  1084. * already exist. We only create this kthread for preemptible RCU.
  1085. */
  1086. static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
  1087. {
  1088. unsigned long flags;
  1089. int rnp_index = rnp - rcu_get_root();
  1090. struct sched_param sp;
  1091. struct task_struct *t;
  1092. mutex_lock(&rnp->boost_kthread_mutex);
  1093. if (rnp->boost_kthread_task || !rcu_scheduler_fully_active)
  1094. goto out;
  1095. t = kthread_create(rcu_boost_kthread, (void *)rnp,
  1096. "rcub/%d", rnp_index);
  1097. if (WARN_ON_ONCE(IS_ERR(t)))
  1098. goto out;
  1099. raw_spin_lock_irqsave_rcu_node(rnp, flags);
  1100. rnp->boost_kthread_task = t;
  1101. raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
  1102. sp.sched_priority = kthread_prio;
  1103. sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
  1104. wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
  1105. out:
  1106. mutex_unlock(&rnp->boost_kthread_mutex);
  1107. }
  1108. /*
  1109. * Set the per-rcu_node kthread's affinity to cover all CPUs that are
  1110. * served by the rcu_node in question. The CPU hotplug lock is still
  1111. * held, so the value of rnp->qsmaskinit will be stable.
  1112. *
  1113. * We don't include outgoingcpu in the affinity set; use -1 if there is
  1114. * no outgoing CPU. If there are no CPUs left in the affinity set,
  1115. * this function allows the kthread to execute on any CPU.
  1116. */
  1117. static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
  1118. {
  1119. struct task_struct *t = rnp->boost_kthread_task;
  1120. unsigned long mask = rcu_rnp_online_cpus(rnp);
  1121. cpumask_var_t cm;
  1122. int cpu;
  1123. if (!t)
  1124. return;
  1125. if (!zalloc_cpumask_var(&cm, GFP_KERNEL))
  1126. return;
  1127. mutex_lock(&rnp->boost_kthread_mutex);
  1128. for_each_leaf_node_possible_cpu(rnp, cpu)
  1129. if ((mask & leaf_node_cpu_bit(rnp, cpu)) &&
  1130. cpu != outgoingcpu)
  1131. cpumask_set_cpu(cpu, cm);
  1132. cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU));
  1133. if (cpumask_empty(cm)) {
  1134. cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU));
  1135. if (outgoingcpu >= 0)
  1136. cpumask_clear_cpu(outgoingcpu, cm);
  1137. }
  1138. set_cpus_allowed_ptr(t, cm);
  1139. mutex_unlock(&rnp->boost_kthread_mutex);
  1140. free_cpumask_var(cm);
  1141. }
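/*
 * For example, on a 16-CPU system booted with nohz_full=2-7, the
 * HK_TYPE_RCU housekeeping mask is CPUs 0-1,8-15, so the boost kthread
 * for a leaf rcu_node covering CPUs 0-15 ends up affined to 0-1,8-15
 * (minus any outgoing CPU).  Only if that intersection were empty would
 * the fallback to the full housekeeping mask be used.
 */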
  1142. #else /* #ifdef CONFIG_RCU_BOOST */
  1143. static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
  1144. __releases(rnp->lock)
  1145. {
  1146. raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
  1147. }
  1148. static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
  1149. {
  1150. }
  1151. static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp)
  1152. {
  1153. }
  1154. static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
  1155. {
  1156. }
  1157. #endif /* #else #ifdef CONFIG_RCU_BOOST */
  1158. /*
  1159. * Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
  1160. * grace-period kthread will do force_quiescent_state() processing?
  1161. * The idea is to avoid waking up RCU core processing on such a
  1162. * CPU unless the grace period has extended for too long.
  1163. *
  1164. * This code relies on the fact that all NO_HZ_FULL CPUs are also
  1165. * RCU_NOCB_CPU CPUs.
  1166. */
  1167. static bool rcu_nohz_full_cpu(void)
  1168. {
  1169. #ifdef CONFIG_NO_HZ_FULL
  1170. if (tick_nohz_full_cpu(smp_processor_id()) &&
  1171. (!rcu_gp_in_progress() ||
  1172. time_before(jiffies, READ_ONCE(rcu_state.gp_start) + HZ)))
  1173. return true;
  1174. #endif /* #ifdef CONFIG_NO_HZ_FULL */
  1175. return false;
  1176. }
  1177. /*
  1178. * Bind the RCU grace-period kthreads to the housekeeping CPU.
  1179. */
  1180. static void rcu_bind_gp_kthread(void)
  1181. {
  1182. if (!tick_nohz_full_enabled())
  1183. return;
  1184. housekeeping_affine(current, HK_TYPE_RCU);
  1185. }