mmu_notifier.c

  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * linux/mm/mmu_notifier.c
  4. *
  5. * Copyright (C) 2008 Qumranet, Inc.
  6. * Copyright (C) 2008 SGI
  7. * Christoph Lameter <[email protected]>
  8. */
  9. #include <linux/rculist.h>
  10. #include <linux/mmu_notifier.h>
  11. #include <linux/export.h>
  12. #include <linux/mm.h>
  13. #include <linux/err.h>
  14. #include <linux/interval_tree.h>
  15. #include <linux/srcu.h>
  16. #include <linux/rcupdate.h>
  17. #include <linux/sched.h>
  18. #include <linux/sched/mm.h>
  19. #include <linux/slab.h>
  20. /* global SRCU for all MMs */
  21. DEFINE_STATIC_SRCU(srcu);
  22. #ifdef CONFIG_LOCKDEP
  23. struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
  24. .name = "mmu_notifier_invalidate_range_start"
  25. };
  26. #endif
  27. /*
  28. * The mmu_notifier_subscriptions structure is allocated and installed in
  29. * mm->notifier_subscriptions inside the mm_take_all_locks() protected
  30. * critical section and it's released only when mm_count reaches zero
  31. * in mmdrop().
  32. */
  33. struct mmu_notifier_subscriptions {
  34. /* all mmu notifiers registered in this mm are queued in this list */
  35. struct hlist_head list;
  36. bool has_itree;
  37. /* to serialize the list modifications and hlist_unhashed */
  38. spinlock_t lock;
  39. unsigned long invalidate_seq;
  40. unsigned long active_invalidate_ranges;
  41. struct rb_root_cached itree;
  42. wait_queue_head_t wq;
  43. struct hlist_head deferred_list;
  44. };
  45. /*
  46. * This is a collision-retry read-side/write-side 'lock', a lot like a
  47. * seqcount, however this allows multiple write-sides to hold it at
  48. * once. Conceptually the write side is protecting the values of the PTEs in
49. * this mm, such that PTEs cannot be read into SPTEs (shadow PTEs) while any
  50. * writer exists.
  51. *
  52. * Note that the core mm creates nested invalidate_range_start()/end() regions
  53. * within the same thread, and runs invalidate_range_start()/end() in parallel
  54. * on multiple CPUs. This is designed to not reduce concurrency or block
  55. * progress on the mm side.
  56. *
  57. * As a secondary function, holding the full write side also serves to prevent
58. * writers for the itree; this is an optimization to avoid extra locking
  59. * during invalidate_range_start/end notifiers.
  60. *
  61. * The write side has two states, fully excluded:
  62. * - mm->active_invalidate_ranges != 0
  63. * - subscriptions->invalidate_seq & 1 == True (odd)
  64. * - some range on the mm_struct is being invalidated
  65. * - the itree is not allowed to change
  66. *
  67. * And partially excluded:
  68. * - mm->active_invalidate_ranges != 0
  69. * - subscriptions->invalidate_seq & 1 == False (even)
  70. * - some range on the mm_struct is being invalidated
  71. * - the itree is allowed to change
  72. *
  73. * Operations on notifier_subscriptions->invalidate_seq (under spinlock):
  74. * seq |= 1 # Begin writing
  75. * seq++ # Release the writing state
  76. * seq & 1 # True if a writer exists
  77. *
78. * The latter state avoids some expensive work on inv_end in the common case of
  79. * no mmu_interval_notifier monitoring the VA.
  80. */
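/*
 * Illustrative timeline (not part of the original file) of how invalidate_seq
 * and active_invalidate_ranges evolve when two invalidations overlap a
 * monitored range; the numeric values are examples only:
 *
 *   idle                        invalidate_seq = 2 (even), active = 0
 *   inv A start (hits itree)    active = 1, seq |= 1  -> seq = 3 (odd)
 *   inv B start (hits itree)    active = 2, seq stays 3
 *   inv A end                   active = 1, nothing else happens
 *   inv B end                   active = 0, seq++    -> seq = 4 (even),
 *                               deferred list processed, waiters woken
 */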
  81. static bool
  82. mn_itree_is_invalidating(struct mmu_notifier_subscriptions *subscriptions)
  83. {
  84. lockdep_assert_held(&subscriptions->lock);
  85. return subscriptions->invalidate_seq & 1;
  86. }
  87. static struct mmu_interval_notifier *
  88. mn_itree_inv_start_range(struct mmu_notifier_subscriptions *subscriptions,
  89. const struct mmu_notifier_range *range,
  90. unsigned long *seq)
  91. {
  92. struct interval_tree_node *node;
  93. struct mmu_interval_notifier *res = NULL;
  94. spin_lock(&subscriptions->lock);
  95. subscriptions->active_invalidate_ranges++;
  96. node = interval_tree_iter_first(&subscriptions->itree, range->start,
  97. range->end - 1);
  98. if (node) {
  99. subscriptions->invalidate_seq |= 1;
  100. res = container_of(node, struct mmu_interval_notifier,
  101. interval_tree);
  102. }
  103. *seq = subscriptions->invalidate_seq;
  104. spin_unlock(&subscriptions->lock);
  105. return res;
  106. }
  107. static struct mmu_interval_notifier *
  108. mn_itree_inv_next(struct mmu_interval_notifier *interval_sub,
  109. const struct mmu_notifier_range *range)
  110. {
  111. struct interval_tree_node *node;
  112. node = interval_tree_iter_next(&interval_sub->interval_tree,
  113. range->start, range->end - 1);
  114. if (!node)
  115. return NULL;
  116. return container_of(node, struct mmu_interval_notifier, interval_tree);
  117. }
  118. static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
  119. {
  120. struct mmu_interval_notifier *interval_sub;
  121. struct hlist_node *next;
  122. spin_lock(&subscriptions->lock);
  123. if (--subscriptions->active_invalidate_ranges ||
  124. !mn_itree_is_invalidating(subscriptions)) {
  125. spin_unlock(&subscriptions->lock);
  126. return;
  127. }
  128. /* Make invalidate_seq even */
  129. subscriptions->invalidate_seq++;
  130. /*
  131. * The inv_end incorporates a deferred mechanism like rtnl_unlock().
  132. * Adds and removes are queued until the final inv_end happens then
  133. * they are progressed. This arrangement for tree updates is used to
  134. * avoid using a blocking lock during invalidate_range_start.
  135. */
  136. hlist_for_each_entry_safe(interval_sub, next,
  137. &subscriptions->deferred_list,
  138. deferred_item) {
  139. if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb))
  140. interval_tree_insert(&interval_sub->interval_tree,
  141. &subscriptions->itree);
  142. else
  143. interval_tree_remove(&interval_sub->interval_tree,
  144. &subscriptions->itree);
  145. hlist_del(&interval_sub->deferred_item);
  146. }
  147. spin_unlock(&subscriptions->lock);
  148. wake_up_all(&subscriptions->wq);
  149. }
  150. /**
  151. * mmu_interval_read_begin - Begin a read side critical section against a VA
  152. * range
  153. * @interval_sub: The interval subscription
  154. *
155. * mmu_interval_read_begin()/mmu_interval_read_retry() implement a
  156. * collision-retry scheme similar to seqcount for the VA range under
  157. * subscription. If the mm invokes invalidation during the critical section
  158. * then mmu_interval_read_retry() will return true.
  159. *
  160. * This is useful to obtain shadow PTEs where teardown or setup of the SPTEs
  161. * require a blocking context. The critical region formed by this can sleep,
  162. * and the required 'user_lock' can also be a sleeping lock.
  163. *
  164. * The caller is required to provide a 'user_lock' to serialize both teardown
  165. * and setup.
  166. *
  167. * The return value should be passed to mmu_interval_read_retry().
  168. */
  169. unsigned long
  170. mmu_interval_read_begin(struct mmu_interval_notifier *interval_sub)
  171. {
  172. struct mmu_notifier_subscriptions *subscriptions =
  173. interval_sub->mm->notifier_subscriptions;
  174. unsigned long seq;
  175. bool is_invalidating;
  176. /*
  177. * If the subscription has a different seq value under the user_lock
  178. * than we started with then it has collided.
  179. *
  180. * If the subscription currently has the same seq value as the
  181. * subscriptions seq, then it is currently between
  182. * invalidate_start/end and is colliding.
  183. *
  184. * The locking looks broadly like this:
185. *    mn_tree_invalidate_start():          mmu_interval_read_begin():
186. *                                          spin_lock
187. *                                           seq = READ_ONCE(interval_sub->invalidate_seq);
188. *                                           seq == subs->invalidate_seq
189. *                                          spin_unlock
190. *    spin_lock
191. *     seq = ++subscriptions->invalidate_seq
192. *    spin_unlock
193. *     op->invalidate_range():
194. *       user_lock
195. *        mmu_interval_set_seq()
196. *         interval_sub->invalidate_seq = seq
197. *       user_unlock
198. *
199. *                          [Required: mmu_interval_read_retry() == true]
200. *
201. *   mn_itree_inv_end():
202. *    spin_lock
203. *     seq = ++subscriptions->invalidate_seq
204. *    spin_unlock
205. *
206. *                                           user_lock
207. *                                            mmu_interval_read_retry():
208. *                                             interval_sub->invalidate_seq != seq
209. *                                           user_unlock
  210. *
  211. * Barriers are not needed here as any races here are closed by an
  212. * eventual mmu_interval_read_retry(), which provides a barrier via the
  213. * user_lock.
  214. */
  215. spin_lock(&subscriptions->lock);
  216. /* Pairs with the WRITE_ONCE in mmu_interval_set_seq() */
  217. seq = READ_ONCE(interval_sub->invalidate_seq);
  218. is_invalidating = seq == subscriptions->invalidate_seq;
  219. spin_unlock(&subscriptions->lock);
  220. /*
  221. * interval_sub->invalidate_seq must always be set to an odd value via
  222. * mmu_interval_set_seq() using the provided cur_seq from
  223. * mn_itree_inv_start_range(). This ensures that if seq does wrap we
  224. * will always clear the below sleep in some reasonable time as
  225. * subscriptions->invalidate_seq is even in the idle state.
  226. */
  227. lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
  228. lock_map_release(&__mmu_notifier_invalidate_range_start_map);
  229. if (is_invalidating)
  230. wait_event(subscriptions->wq,
  231. READ_ONCE(subscriptions->invalidate_seq) != seq);
  232. /*
  233. * Notice that mmu_interval_read_retry() can already be true at this
234. * point; avoiding loops here allows the caller to provide a global
  235. * time bound.
  236. */
  237. return seq;
  238. }
  239. EXPORT_SYMBOL_GPL(mmu_interval_read_begin);
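/*
 * Illustrative sketch (not part of this file) of the retry loop a driver
 * might build around mmu_interval_read_begin()/mmu_interval_read_retry().
 * The dev->* and driver_* names are hypothetical; the only requirement is
 * that the same 'driver_lock' is taken in the driver's ->invalidate()
 * callback before it calls mmu_interval_set_seq().
 *
 *	again:
 *		seq = mmu_interval_read_begin(&dev->interval_sub);
 *		// fault in / compute the new SPTEs, may sleep
 *		driver_build_sptes(dev);
 *
 *		mutex_lock(&dev->driver_lock);
 *		if (mmu_interval_read_retry(&dev->interval_sub, seq)) {
 *			mutex_unlock(&dev->driver_lock);
 *			goto again;
 *		}
 *		driver_install_sptes(dev);
 *		mutex_unlock(&dev->driver_lock);
 */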
  240. static void mn_itree_release(struct mmu_notifier_subscriptions *subscriptions,
  241. struct mm_struct *mm)
  242. {
  243. struct mmu_notifier_range range = {
  244. .flags = MMU_NOTIFIER_RANGE_BLOCKABLE,
  245. .event = MMU_NOTIFY_RELEASE,
  246. .mm = mm,
  247. .start = 0,
  248. .end = ULONG_MAX,
  249. };
  250. struct mmu_interval_notifier *interval_sub;
  251. unsigned long cur_seq;
  252. bool ret;
  253. for (interval_sub =
  254. mn_itree_inv_start_range(subscriptions, &range, &cur_seq);
  255. interval_sub;
  256. interval_sub = mn_itree_inv_next(interval_sub, &range)) {
  257. ret = interval_sub->ops->invalidate(interval_sub, &range,
  258. cur_seq);
  259. WARN_ON(!ret);
  260. }
  261. mn_itree_inv_end(subscriptions);
  262. }
  263. /*
  264. * This function can't run concurrently against mmu_notifier_register
  265. * because mm->mm_users > 0 during mmu_notifier_register and exit_mmap
  266. * runs with mm_users == 0. Other tasks may still invoke mmu notifiers
  267. * in parallel despite there being no task using this mm any more,
  268. * through the vmas outside of the exit_mmap context, such as with
  269. * vmtruncate. This serializes against mmu_notifier_unregister with
  270. * the notifier_subscriptions->lock in addition to SRCU and it serializes
  271. * against the other mmu notifiers with SRCU. struct mmu_notifier_subscriptions
  272. * can't go away from under us as exit_mmap holds an mm_count pin
  273. * itself.
  274. */
  275. static void mn_hlist_release(struct mmu_notifier_subscriptions *subscriptions,
  276. struct mm_struct *mm)
  277. {
  278. struct mmu_notifier *subscription;
  279. int id;
  280. /*
  281. * SRCU here will block mmu_notifier_unregister until
  282. * ->release returns.
  283. */
  284. id = srcu_read_lock(&srcu);
  285. hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
  286. srcu_read_lock_held(&srcu))
  287. /*
  288. * If ->release runs before mmu_notifier_unregister it must be
  289. * handled, as it's the only way for the driver to flush all
  290. * existing sptes and stop the driver from establishing any more
  291. * sptes before all the pages in the mm are freed.
  292. */
  293. if (subscription->ops->release)
  294. subscription->ops->release(subscription, mm);
  295. spin_lock(&subscriptions->lock);
  296. while (unlikely(!hlist_empty(&subscriptions->list))) {
  297. subscription = hlist_entry(subscriptions->list.first,
  298. struct mmu_notifier, hlist);
  299. /*
  300. * We arrived before mmu_notifier_unregister so
  301. * mmu_notifier_unregister will do nothing other than to wait
  302. * for ->release to finish and for mmu_notifier_unregister to
  303. * return.
  304. */
  305. hlist_del_init_rcu(&subscription->hlist);
  306. }
  307. spin_unlock(&subscriptions->lock);
  308. srcu_read_unlock(&srcu, id);
  309. /*
  310. * synchronize_srcu here prevents mmu_notifier_release from returning to
  311. * exit_mmap (which would proceed with freeing all pages in the mm)
  312. * until the ->release method returns, if it was invoked by
  313. * mmu_notifier_unregister.
  314. *
  315. * The notifier_subscriptions can't go away from under us because
  316. * one mm_count is held by exit_mmap.
  317. */
  318. synchronize_srcu(&srcu);
  319. }
  320. void __mmu_notifier_release(struct mm_struct *mm)
  321. {
  322. struct mmu_notifier_subscriptions *subscriptions =
  323. mm->notifier_subscriptions;
  324. if (subscriptions->has_itree)
  325. mn_itree_release(subscriptions, mm);
  326. if (!hlist_empty(&subscriptions->list))
  327. mn_hlist_release(subscriptions, mm);
  328. }
  329. /*
  330. * If no young bitflag is supported by the hardware, ->clear_flush_young can
331. * unmap the address and return 1 or 0 depending on whether the mapping previously
  332. * existed or not.
  333. */
  334. int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
  335. unsigned long start,
  336. unsigned long end)
  337. {
  338. struct mmu_notifier *subscription;
  339. int young = 0, id;
  340. id = srcu_read_lock(&srcu);
  341. hlist_for_each_entry_rcu(subscription,
  342. &mm->notifier_subscriptions->list, hlist,
  343. srcu_read_lock_held(&srcu)) {
  344. if (subscription->ops->clear_flush_young)
  345. young |= subscription->ops->clear_flush_young(
  346. subscription, mm, start, end);
  347. }
  348. srcu_read_unlock(&srcu, id);
  349. return young;
  350. }
  351. int __mmu_notifier_clear_young(struct mm_struct *mm,
  352. unsigned long start,
  353. unsigned long end)
  354. {
  355. struct mmu_notifier *subscription;
  356. int young = 0, id;
  357. id = srcu_read_lock(&srcu);
  358. hlist_for_each_entry_rcu(subscription,
  359. &mm->notifier_subscriptions->list, hlist,
  360. srcu_read_lock_held(&srcu)) {
  361. if (subscription->ops->clear_young)
  362. young |= subscription->ops->clear_young(subscription,
  363. mm, start, end);
  364. }
  365. srcu_read_unlock(&srcu, id);
  366. return young;
  367. }
  368. int __mmu_notifier_test_young(struct mm_struct *mm,
  369. unsigned long address)
  370. {
  371. struct mmu_notifier *subscription;
  372. int young = 0, id;
  373. id = srcu_read_lock(&srcu);
  374. hlist_for_each_entry_rcu(subscription,
  375. &mm->notifier_subscriptions->list, hlist,
  376. srcu_read_lock_held(&srcu)) {
  377. if (subscription->ops->test_young) {
  378. young = subscription->ops->test_young(subscription, mm,
  379. address);
  380. if (young)
  381. break;
  382. }
  383. }
  384. srcu_read_unlock(&srcu, id);
  385. return young;
  386. }
  387. void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address,
  388. pte_t pte)
  389. {
  390. struct mmu_notifier *subscription;
  391. int id;
  392. id = srcu_read_lock(&srcu);
  393. hlist_for_each_entry_rcu(subscription,
  394. &mm->notifier_subscriptions->list, hlist,
  395. srcu_read_lock_held(&srcu)) {
  396. if (subscription->ops->change_pte)
  397. subscription->ops->change_pte(subscription, mm, address,
  398. pte);
  399. }
  400. srcu_read_unlock(&srcu, id);
  401. }
  402. static int mn_itree_invalidate(struct mmu_notifier_subscriptions *subscriptions,
  403. const struct mmu_notifier_range *range)
  404. {
  405. struct mmu_interval_notifier *interval_sub;
  406. unsigned long cur_seq;
  407. for (interval_sub =
  408. mn_itree_inv_start_range(subscriptions, range, &cur_seq);
  409. interval_sub;
  410. interval_sub = mn_itree_inv_next(interval_sub, range)) {
  411. bool ret;
  412. ret = interval_sub->ops->invalidate(interval_sub, range,
  413. cur_seq);
  414. if (!ret) {
  415. if (WARN_ON(mmu_notifier_range_blockable(range)))
  416. continue;
  417. goto out_would_block;
  418. }
  419. }
  420. return 0;
  421. out_would_block:
  422. /*
  423. * On -EAGAIN the non-blocking caller is not allowed to call
  424. * invalidate_range_end()
  425. */
  426. mn_itree_inv_end(subscriptions);
  427. return -EAGAIN;
  428. }
  429. static int mn_hlist_invalidate_range_start(
  430. struct mmu_notifier_subscriptions *subscriptions,
  431. struct mmu_notifier_range *range)
  432. {
  433. struct mmu_notifier *subscription;
  434. int ret = 0;
  435. int id;
  436. id = srcu_read_lock(&srcu);
  437. hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
  438. srcu_read_lock_held(&srcu)) {
  439. const struct mmu_notifier_ops *ops = subscription->ops;
  440. if (ops->invalidate_range_start) {
  441. int _ret;
  442. if (!mmu_notifier_range_blockable(range))
  443. non_block_start();
  444. _ret = ops->invalidate_range_start(subscription, range);
  445. if (!mmu_notifier_range_blockable(range))
  446. non_block_end();
  447. if (_ret) {
  448. pr_info("%pS callback failed with %d in %sblockable context.\n",
  449. ops->invalidate_range_start, _ret,
  450. !mmu_notifier_range_blockable(range) ?
  451. "non-" :
  452. "");
  453. WARN_ON(mmu_notifier_range_blockable(range) ||
  454. _ret != -EAGAIN);
  455. /*
  456. * We call all the notifiers on any EAGAIN,
  457. * there is no way for a notifier to know if
  458. * its start method failed, thus a start that
  459. * does EAGAIN can't also do end.
  460. */
  461. WARN_ON(ops->invalidate_range_end);
  462. ret = _ret;
  463. }
  464. }
  465. }
  466. if (ret) {
  467. /*
  468. * Must be non-blocking to get here. If there are multiple
  469. * notifiers and one or more failed start, any that succeeded
  470. * start are expecting their end to be called. Do so now.
  471. */
  472. hlist_for_each_entry_rcu(subscription, &subscriptions->list,
  473. hlist, srcu_read_lock_held(&srcu)) {
  474. if (!subscription->ops->invalidate_range_end)
  475. continue;
  476. subscription->ops->invalidate_range_end(subscription,
  477. range);
  478. }
  479. }
  480. srcu_read_unlock(&srcu, id);
  481. return ret;
  482. }
  483. int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
  484. {
  485. struct mmu_notifier_subscriptions *subscriptions =
  486. range->mm->notifier_subscriptions;
  487. int ret;
  488. if (subscriptions->has_itree) {
  489. ret = mn_itree_invalidate(subscriptions, range);
  490. if (ret)
  491. return ret;
  492. }
  493. if (!hlist_empty(&subscriptions->list))
  494. return mn_hlist_invalidate_range_start(subscriptions, range);
  495. return 0;
  496. }
  497. static void
  498. mn_hlist_invalidate_end(struct mmu_notifier_subscriptions *subscriptions,
  499. struct mmu_notifier_range *range, bool only_end)
  500. {
  501. struct mmu_notifier *subscription;
  502. int id;
  503. id = srcu_read_lock(&srcu);
  504. hlist_for_each_entry_rcu(subscription, &subscriptions->list, hlist,
  505. srcu_read_lock_held(&srcu)) {
  506. /*
507. * Call invalidate_range() here too, so that a subsystem does not have
508. * to register an invalidate_range_end() callback when it has already
509. * registered invalidate_range(). Usually a subsystem registers either
510. * invalidate_range_start()/end() or invalidate_range(), so this adds
511. * no overhead beyond the pointer check.
512. *
513. * The call to invalidate_range() is skipped when we know it is safe,
514. * i.e. when the call site used
515. * mmu_notifier_invalidate_range_only_end(), which is only done when a
516. * call to invalidate_range() has already happened under the page
517. * table lock.
  518. */
  519. if (!only_end && subscription->ops->invalidate_range)
  520. subscription->ops->invalidate_range(subscription,
  521. range->mm,
  522. range->start,
  523. range->end);
  524. if (subscription->ops->invalidate_range_end) {
  525. if (!mmu_notifier_range_blockable(range))
  526. non_block_start();
  527. subscription->ops->invalidate_range_end(subscription,
  528. range);
  529. if (!mmu_notifier_range_blockable(range))
  530. non_block_end();
  531. }
  532. }
  533. srcu_read_unlock(&srcu, id);
  534. }
  535. void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
  536. bool only_end)
  537. {
  538. struct mmu_notifier_subscriptions *subscriptions =
  539. range->mm->notifier_subscriptions;
  540. lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
  541. if (subscriptions->has_itree)
  542. mn_itree_inv_end(subscriptions);
  543. if (!hlist_empty(&subscriptions->list))
  544. mn_hlist_invalidate_end(subscriptions, range, only_end);
  545. lock_map_release(&__mmu_notifier_invalidate_range_start_map);
  546. }
  547. void __mmu_notifier_invalidate_range(struct mm_struct *mm,
  548. unsigned long start, unsigned long end)
  549. {
  550. struct mmu_notifier *subscription;
  551. int id;
  552. id = srcu_read_lock(&srcu);
  553. hlist_for_each_entry_rcu(subscription,
  554. &mm->notifier_subscriptions->list, hlist,
  555. srcu_read_lock_held(&srcu)) {
  556. if (subscription->ops->invalidate_range)
  557. subscription->ops->invalidate_range(subscription, mm,
  558. start, end);
  559. }
  560. srcu_read_unlock(&srcu, id);
  561. }
  562. /*
  563. * Same as mmu_notifier_register but here the caller must hold the mmap_lock in
564. * write mode. A NULL subscription signals that the notifier is being registered
565. * for itree mode.
  566. */
  567. int __mmu_notifier_register(struct mmu_notifier *subscription,
  568. struct mm_struct *mm)
  569. {
  570. struct mmu_notifier_subscriptions *subscriptions = NULL;
  571. int ret;
  572. mmap_assert_write_locked(mm);
  573. BUG_ON(atomic_read(&mm->mm_users) <= 0);
  574. if (!mm->notifier_subscriptions) {
  575. /*
  576. * kmalloc cannot be called under mm_take_all_locks(), but we
  577. * know that mm->notifier_subscriptions can't change while we
  578. * hold the write side of the mmap_lock.
  579. */
  580. subscriptions = kzalloc(
  581. sizeof(struct mmu_notifier_subscriptions), GFP_KERNEL);
  582. if (!subscriptions)
  583. return -ENOMEM;
  584. INIT_HLIST_HEAD(&subscriptions->list);
  585. spin_lock_init(&subscriptions->lock);
  586. subscriptions->invalidate_seq = 2;
  587. subscriptions->itree = RB_ROOT_CACHED;
  588. init_waitqueue_head(&subscriptions->wq);
  589. INIT_HLIST_HEAD(&subscriptions->deferred_list);
  590. }
  591. ret = mm_take_all_locks(mm);
  592. if (unlikely(ret))
  593. goto out_clean;
  594. /*
  595. * Serialize the update against mmu_notifier_unregister. A
  596. * side note: mmu_notifier_release can't run concurrently with
  597. * us because we hold the mm_users pin (either implicitly as
  598. * current->mm or explicitly with get_task_mm() or similar).
  599. * We can't race against any other mmu notifier method either
  600. * thanks to mm_take_all_locks().
  601. *
  602. * release semantics on the initialization of the
  603. * mmu_notifier_subscriptions's contents are provided for unlocked
  604. * readers. acquire can only be used while holding the mmgrab or
  605. * mmget, and is safe because once created the
  606. * mmu_notifier_subscriptions is not freed until the mm is destroyed.
  607. * As above, users holding the mmap_lock or one of the
  608. * mm_take_all_locks() do not need to use acquire semantics.
  609. */
  610. if (subscriptions)
  611. smp_store_release(&mm->notifier_subscriptions, subscriptions);
  612. if (subscription) {
  613. /* Pairs with the mmdrop in mmu_notifier_unregister_* */
  614. mmgrab(mm);
  615. subscription->mm = mm;
  616. subscription->users = 1;
  617. spin_lock(&mm->notifier_subscriptions->lock);
  618. hlist_add_head_rcu(&subscription->hlist,
  619. &mm->notifier_subscriptions->list);
  620. spin_unlock(&mm->notifier_subscriptions->lock);
  621. } else
  622. mm->notifier_subscriptions->has_itree = true;
  623. mm_drop_all_locks(mm);
  624. BUG_ON(atomic_read(&mm->mm_users) <= 0);
  625. return 0;
  626. out_clean:
  627. kfree(subscriptions);
  628. return ret;
  629. }
  630. EXPORT_SYMBOL_GPL(__mmu_notifier_register);
  631. /**
  632. * mmu_notifier_register - Register a notifier on a mm
  633. * @subscription: The notifier to attach
  634. * @mm: The mm to attach the notifier to
  635. *
  636. * Must not hold mmap_lock nor any other VM related lock when calling
  637. * this registration function. Must also ensure mm_users can't go down
  638. * to zero while this runs to avoid races with mmu_notifier_release,
  639. * so mm has to be current->mm or the mm should be pinned safely such
  640. * as with get_task_mm(). If the mm is not current->mm, the mm_users
  641. * pin should be released by calling mmput after mmu_notifier_register
  642. * returns.
  643. *
  644. * mmu_notifier_unregister() or mmu_notifier_put() must be always called to
  645. * unregister the notifier.
  646. *
647. * While the caller has a mmu_notifier get, the subscription->mm pointer will
648. * remain valid, and can be converted to an active mm pointer via mmget_not_zero().
  649. */
  650. int mmu_notifier_register(struct mmu_notifier *subscription,
  651. struct mm_struct *mm)
  652. {
  653. int ret;
  654. mmap_write_lock(mm);
  655. ret = __mmu_notifier_register(subscription, mm);
  656. mmap_write_unlock(mm);
  657. return ret;
  658. }
  659. EXPORT_SYMBOL_GPL(mmu_notifier_register);
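/*
 * Minimal registration sketch (illustrative only, not from this file). The
 * my_* names are hypothetical; the callback signatures follow
 * struct mmu_notifier_ops as used above.
 *
 *	struct my_ctx {
 *		struct mmu_notifier mn;
 *	};
 *
 *	static int my_inv_start(struct mmu_notifier *mn,
 *				const struct mmu_notifier_range *range)
 *	{
 *		struct my_ctx *ctx = container_of(mn, struct my_ctx, mn);
 *
 *		if (!mmu_notifier_range_blockable(range))
 *			return -EAGAIN;	// or handle without sleeping
 *		// tear down secondary TLB entries for [range->start, range->end)
 *		return 0;
 *	}
 *
 *	static const struct mmu_notifier_ops my_ops = {
 *		.invalidate_range_start = my_inv_start,
 *	};
 *
 *	// called with mm_users held, e.g. on current->mm:
 *	ctx->mn.ops = &my_ops;
 *	ret = mmu_notifier_register(&ctx->mn, current->mm);
 *	...
 *	mmu_notifier_unregister(&ctx->mn, current->mm);
 */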
  660. static struct mmu_notifier *
  661. find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
  662. {
  663. struct mmu_notifier *subscription;
  664. spin_lock(&mm->notifier_subscriptions->lock);
  665. hlist_for_each_entry_rcu(subscription,
  666. &mm->notifier_subscriptions->list, hlist,
  667. lockdep_is_held(&mm->notifier_subscriptions->lock)) {
  668. if (subscription->ops != ops)
  669. continue;
  670. if (likely(subscription->users != UINT_MAX))
  671. subscription->users++;
  672. else
  673. subscription = ERR_PTR(-EOVERFLOW);
  674. spin_unlock(&mm->notifier_subscriptions->lock);
  675. return subscription;
  676. }
  677. spin_unlock(&mm->notifier_subscriptions->lock);
  678. return NULL;
  679. }
  680. /**
  681. * mmu_notifier_get_locked - Return the single struct mmu_notifier for
  682. * the mm & ops
683. * @ops: The operations struct being subscribed with
684. * @mm: The mm to attach notifiers to
  685. *
  686. * This function either allocates a new mmu_notifier via
  687. * ops->alloc_notifier(), or returns an already existing notifier on the
  688. * list. The value of the ops pointer is used to determine when two notifiers
  689. * are the same.
  690. *
  691. * Each call to mmu_notifier_get() must be paired with a call to
  692. * mmu_notifier_put(). The caller must hold the write side of mm->mmap_lock.
  693. *
694. * While the caller has a mmu_notifier get, the mm pointer will remain valid,
  695. * and can be converted to an active mm pointer via mmget_not_zero().
  696. */
  697. struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
  698. struct mm_struct *mm)
  699. {
  700. struct mmu_notifier *subscription;
  701. int ret;
  702. mmap_assert_write_locked(mm);
  703. if (mm->notifier_subscriptions) {
  704. subscription = find_get_mmu_notifier(mm, ops);
  705. if (subscription)
  706. return subscription;
  707. }
  708. subscription = ops->alloc_notifier(mm);
  709. if (IS_ERR(subscription))
  710. return subscription;
  711. subscription->ops = ops;
  712. ret = __mmu_notifier_register(subscription, mm);
  713. if (ret)
  714. goto out_free;
  715. return subscription;
  716. out_free:
  717. subscription->ops->free_notifier(subscription);
  718. return ERR_PTR(ret);
  719. }
  720. EXPORT_SYMBOL_GPL(mmu_notifier_get_locked);
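/*
 * Illustrative sketch (hypothetical my_* names) of the get/put flow built on
 * mmu_notifier_get_locked(). ops->alloc_notifier() returns the embedded
 * struct mmu_notifier and ops->free_notifier() frees the container; the ops
 * pointer is what makes repeated gets return the same notifier.
 *
 *	struct my_dev {
 *		struct mmu_notifier mn;
 *	};
 *
 *	static struct mmu_notifier *my_alloc_notifier(struct mm_struct *mm)
 *	{
 *		struct my_dev *dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 *
 *		if (!dev)
 *			return ERR_PTR(-ENOMEM);
 *		return &dev->mn;
 *	}
 *
 *	static void my_free_notifier(struct mmu_notifier *mn)
 *	{
 *		kfree(container_of(mn, struct my_dev, mn));
 *	}
 *
 *	static const struct mmu_notifier_ops my_ops = {
 *		.alloc_notifier = my_alloc_notifier,
 *		.free_notifier = my_free_notifier,
 *	};
 *
 *	// caller holds mmap_write_lock(mm) and an mm_users reference:
 *	mn = mmu_notifier_get_locked(&my_ops, mm);
 *	if (IS_ERR(mn))
 *		return PTR_ERR(mn);
 *	...
 *	mmu_notifier_put(mn);	// paired with the get above
 */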
  721. /* this is called after the last mmu_notifier_unregister() returned */
  722. void __mmu_notifier_subscriptions_destroy(struct mm_struct *mm)
  723. {
  724. BUG_ON(!hlist_empty(&mm->notifier_subscriptions->list));
  725. kfree(mm->notifier_subscriptions);
  726. mm->notifier_subscriptions = LIST_POISON1; /* debug */
  727. }
  728. /*
  729. * This releases the mm_count pin automatically and frees the mm
  730. * structure if it was the last user of it. It serializes against
  731. * running mmu notifiers with SRCU and against mmu_notifier_unregister
  732. * with the unregister lock + SRCU. All sptes must be dropped before
  733. * calling mmu_notifier_unregister. ->release or any other notifier
  734. * method may be invoked concurrently with mmu_notifier_unregister,
  735. * and only after mmu_notifier_unregister returned we're guaranteed
  736. * that ->release or any other method can't run anymore.
  737. */
  738. void mmu_notifier_unregister(struct mmu_notifier *subscription,
  739. struct mm_struct *mm)
  740. {
  741. BUG_ON(atomic_read(&mm->mm_count) <= 0);
  742. if (!hlist_unhashed(&subscription->hlist)) {
  743. /*
  744. * SRCU here will force exit_mmap to wait for ->release to
  745. * finish before freeing the pages.
  746. */
  747. int id;
  748. id = srcu_read_lock(&srcu);
  749. /*
  750. * exit_mmap will block in mmu_notifier_release to guarantee
  751. * that ->release is called before freeing the pages.
  752. */
  753. if (subscription->ops->release)
  754. subscription->ops->release(subscription, mm);
  755. srcu_read_unlock(&srcu, id);
  756. spin_lock(&mm->notifier_subscriptions->lock);
  757. /*
  758. * Can not use list_del_rcu() since __mmu_notifier_release
  759. * can delete it before we hold the lock.
  760. */
  761. hlist_del_init_rcu(&subscription->hlist);
  762. spin_unlock(&mm->notifier_subscriptions->lock);
  763. }
  764. /*
  765. * Wait for any running method to finish, of course including
  766. * ->release if it was run by mmu_notifier_release instead of us.
  767. */
  768. synchronize_srcu(&srcu);
  769. BUG_ON(atomic_read(&mm->mm_count) <= 0);
  770. mmdrop(mm);
  771. }
  772. EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
  773. static void mmu_notifier_free_rcu(struct rcu_head *rcu)
  774. {
  775. struct mmu_notifier *subscription =
  776. container_of(rcu, struct mmu_notifier, rcu);
  777. struct mm_struct *mm = subscription->mm;
  778. subscription->ops->free_notifier(subscription);
  779. /* Pairs with the get in __mmu_notifier_register() */
  780. mmdrop(mm);
  781. }
  782. /**
  783. * mmu_notifier_put - Release the reference on the notifier
  784. * @subscription: The notifier to act on
  785. *
786. * This function must be paired with each mmu_notifier_get(); it releases the
787. * reference obtained by the get. If this is the last reference then the process
788. * to free the notifier will be run asynchronously.
  789. *
  790. * Unlike mmu_notifier_unregister() the get/put flow only calls ops->release
  791. * when the mm_struct is destroyed. Instead free_notifier is always called to
  792. * release any resources held by the user.
  793. *
  794. * As ops->release is not guaranteed to be called, the user must ensure that
  795. * all sptes are dropped, and no new sptes can be established before
  796. * mmu_notifier_put() is called.
  797. *
  798. * This function can be called from the ops->release callback, however the
  799. * caller must still ensure it is called pairwise with mmu_notifier_get().
  800. *
  801. * Modules calling this function must call mmu_notifier_synchronize() in
  802. * their __exit functions to ensure the async work is completed.
  803. */
  804. void mmu_notifier_put(struct mmu_notifier *subscription)
  805. {
  806. struct mm_struct *mm = subscription->mm;
  807. spin_lock(&mm->notifier_subscriptions->lock);
  808. if (WARN_ON(!subscription->users) || --subscription->users)
  809. goto out_unlock;
  810. hlist_del_init_rcu(&subscription->hlist);
  811. spin_unlock(&mm->notifier_subscriptions->lock);
  812. call_srcu(&srcu, &subscription->rcu, mmu_notifier_free_rcu);
  813. return;
  814. out_unlock:
  815. spin_unlock(&mm->notifier_subscriptions->lock);
  816. }
  817. EXPORT_SYMBOL_GPL(mmu_notifier_put);
  818. static int __mmu_interval_notifier_insert(
  819. struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
  820. struct mmu_notifier_subscriptions *subscriptions, unsigned long start,
  821. unsigned long length, const struct mmu_interval_notifier_ops *ops)
  822. {
  823. interval_sub->mm = mm;
  824. interval_sub->ops = ops;
  825. RB_CLEAR_NODE(&interval_sub->interval_tree.rb);
  826. interval_sub->interval_tree.start = start;
  827. /*
  828. * Note that the representation of the intervals in the interval tree
  829. * considers the ending point as contained in the interval.
  830. */
  831. if (length == 0 ||
  832. check_add_overflow(start, length - 1,
  833. &interval_sub->interval_tree.last))
  834. return -EOVERFLOW;
  835. /* Must call with a mmget() held */
  836. if (WARN_ON(atomic_read(&mm->mm_users) <= 0))
  837. return -EINVAL;
  838. /* pairs with mmdrop in mmu_interval_notifier_remove() */
  839. mmgrab(mm);
  840. /*
  841. * If some invalidate_range_start/end region is going on in parallel
  842. * we don't know what VA ranges are affected, so we must assume this
  843. * new range is included.
  844. *
  845. * If the itree is invalidating then we are not allowed to change
  846. * it. Retrying until invalidation is done is tricky due to the
  847. * possibility for live lock, instead defer the add to
  848. * mn_itree_inv_end() so this algorithm is deterministic.
  849. *
  850. * In all cases the value for the interval_sub->invalidate_seq should be
  851. * odd, see mmu_interval_read_begin()
  852. */
  853. spin_lock(&subscriptions->lock);
  854. if (subscriptions->active_invalidate_ranges) {
  855. if (mn_itree_is_invalidating(subscriptions))
  856. hlist_add_head(&interval_sub->deferred_item,
  857. &subscriptions->deferred_list);
  858. else {
  859. subscriptions->invalidate_seq |= 1;
  860. interval_tree_insert(&interval_sub->interval_tree,
  861. &subscriptions->itree);
  862. }
  863. interval_sub->invalidate_seq = subscriptions->invalidate_seq;
  864. } else {
  865. WARN_ON(mn_itree_is_invalidating(subscriptions));
  866. /*
  867. * The starting seq for a subscription not under invalidation
  868. * should be odd, not equal to the current invalidate_seq and
  869. * invalidate_seq should not 'wrap' to the new seq any time
  870. * soon.
  871. */
  872. interval_sub->invalidate_seq =
  873. subscriptions->invalidate_seq - 1;
  874. interval_tree_insert(&interval_sub->interval_tree,
  875. &subscriptions->itree);
  876. }
  877. spin_unlock(&subscriptions->lock);
  878. return 0;
  879. }
  880. /**
  881. * mmu_interval_notifier_insert - Insert an interval notifier
  882. * @interval_sub: Interval subscription to register
  883. * @start: Starting virtual address to monitor
  884. * @length: Length of the range to monitor
  885. * @mm: mm_struct to attach to
  886. * @ops: Interval notifier operations to be called on matching events
  887. *
  888. * This function subscribes the interval notifier for notifications from the
  889. * mm. Upon return the ops related to mmu_interval_notifier will be called
  890. * whenever an event that intersects with the given range occurs.
  891. *
  892. * Upon return the range_notifier may not be present in the interval tree yet.
  893. * The caller must use the normal interval notifier read flow via
  894. * mmu_interval_read_begin() to establish SPTEs for this range.
  895. */
  896. int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
  897. struct mm_struct *mm, unsigned long start,
  898. unsigned long length,
  899. const struct mmu_interval_notifier_ops *ops)
  900. {
  901. struct mmu_notifier_subscriptions *subscriptions;
  902. int ret;
  903. might_lock(&mm->mmap_lock);
  904. subscriptions = smp_load_acquire(&mm->notifier_subscriptions);
  905. if (!subscriptions || !subscriptions->has_itree) {
  906. ret = mmu_notifier_register(NULL, mm);
  907. if (ret)
  908. return ret;
  909. subscriptions = mm->notifier_subscriptions;
  910. }
  911. return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions,
  912. start, length, ops);
  913. }
  914. EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert);
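/*
 * Illustrative sketch (hypothetical my_* names) of an interval subscription.
 * ->invalidate() is called from mn_itree_invalidate() above: it must update
 * the subscription's seq with mmu_interval_set_seq() under the driver lock,
 * and may only return false for non-blockable ranges.
 *
 *	struct my_range {
 *		struct mmu_interval_notifier sub;
 *		struct mutex lock;
 *	};
 *
 *	static bool my_invalidate(struct mmu_interval_notifier *sub,
 *				  const struct mmu_notifier_range *range,
 *				  unsigned long cur_seq)
 *	{
 *		struct my_range *r = container_of(sub, struct my_range, sub);
 *
 *		if (mmu_notifier_range_blockable(range))
 *			mutex_lock(&r->lock);
 *		else if (!mutex_trylock(&r->lock))
 *			return false;
 *		mmu_interval_set_seq(sub, cur_seq);
 *		// tear down the SPTEs covering the overlap with *range
 *		mutex_unlock(&r->lock);
 *		return true;
 *	}
 *
 *	static const struct mmu_interval_notifier_ops my_interval_ops = {
 *		.invalidate = my_invalidate,
 *	};
 *
 *	// called with mm_users held, e.g. on current->mm:
 *	ret = mmu_interval_notifier_insert(&r->sub, current->mm,
 *					   start, length, &my_interval_ops);
 *	...
 *	mmu_interval_notifier_remove(&r->sub);
 */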
  915. int mmu_interval_notifier_insert_locked(
  916. struct mmu_interval_notifier *interval_sub, struct mm_struct *mm,
  917. unsigned long start, unsigned long length,
  918. const struct mmu_interval_notifier_ops *ops)
  919. {
  920. struct mmu_notifier_subscriptions *subscriptions =
  921. mm->notifier_subscriptions;
  922. int ret;
  923. mmap_assert_write_locked(mm);
  924. if (!subscriptions || !subscriptions->has_itree) {
  925. ret = __mmu_notifier_register(NULL, mm);
  926. if (ret)
  927. return ret;
  928. subscriptions = mm->notifier_subscriptions;
  929. }
  930. return __mmu_interval_notifier_insert(interval_sub, mm, subscriptions,
  931. start, length, ops);
  932. }
  933. EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
  934. static bool
  935. mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
  936. unsigned long seq)
  937. {
  938. bool ret;
  939. spin_lock(&subscriptions->lock);
  940. ret = subscriptions->invalidate_seq != seq;
  941. spin_unlock(&subscriptions->lock);
  942. return ret;
  943. }
  944. /**
945. * mmu_interval_notifier_remove - Remove an interval notifier
  946. * @interval_sub: Interval subscription to unregister
  947. *
  948. * This function must be paired with mmu_interval_notifier_insert(). It cannot
  949. * be called from any ops callback.
  950. *
  951. * Once this returns ops callbacks are no longer running on other CPUs and
  952. * will not be called in future.
  953. */
  954. void mmu_interval_notifier_remove(struct mmu_interval_notifier *interval_sub)
  955. {
  956. struct mm_struct *mm = interval_sub->mm;
  957. struct mmu_notifier_subscriptions *subscriptions =
  958. mm->notifier_subscriptions;
  959. unsigned long seq = 0;
  960. might_sleep();
  961. spin_lock(&subscriptions->lock);
  962. if (mn_itree_is_invalidating(subscriptions)) {
  963. /*
964. * remove is being called after insert placed this subscription on the
  965. * deferred list, but before the deferred list was processed.
  966. */
  967. if (RB_EMPTY_NODE(&interval_sub->interval_tree.rb)) {
  968. hlist_del(&interval_sub->deferred_item);
  969. } else {
  970. hlist_add_head(&interval_sub->deferred_item,
  971. &subscriptions->deferred_list);
  972. seq = subscriptions->invalidate_seq;
  973. }
  974. } else {
  975. WARN_ON(RB_EMPTY_NODE(&interval_sub->interval_tree.rb));
  976. interval_tree_remove(&interval_sub->interval_tree,
  977. &subscriptions->itree);
  978. }
  979. spin_unlock(&subscriptions->lock);
  980. /*
981. * The possible sleep waiting for the invalidation to progress requires that
982. * the caller not hold any locks that the invalidation callbacks take.
  983. */
  984. lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
  985. lock_map_release(&__mmu_notifier_invalidate_range_start_map);
  986. if (seq)
  987. wait_event(subscriptions->wq,
  988. mmu_interval_seq_released(subscriptions, seq));
  989. /* pairs with mmgrab in mmu_interval_notifier_insert() */
  990. mmdrop(mm);
  991. }
  992. EXPORT_SYMBOL_GPL(mmu_interval_notifier_remove);
  993. /**
  994. * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed
  995. *
996. * This function ensures that all outstanding async SRCU work from
  997. * mmu_notifier_put() is completed. After it returns any mmu_notifier_ops
  998. * associated with an unused mmu_notifier will no longer be called.
  999. *
1000. * Before calling this, the caller must ensure that all of its mmu_notifiers have been
  1001. * fully released via mmu_notifier_put().
  1002. *
  1003. * Modules using the mmu_notifier_put() API should call this in their __exit
  1004. * function to avoid module unloading races.
  1005. */
  1006. void mmu_notifier_synchronize(void)
  1007. {
  1008. synchronize_srcu(&srcu);
  1009. }
  1010. EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
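/*
 * Illustrative module-exit sketch (hypothetical my_* names): a module using
 * the mmu_notifier_get()/mmu_notifier_put() flow must drain the async SRCU
 * frees before its ops and free_notifier code can be unloaded.
 *
 *	static void __exit my_exit(void)
 *	{
 *		// all notifiers were already released with mmu_notifier_put()
 *		mmu_notifier_synchronize();
 *	}
 *	module_exit(my_exit);
 */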
  1011. bool
  1012. mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
  1013. {
  1014. if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
  1015. return false;
1016. /* Return true if the vma still has the read flag set. */
  1017. return range->vma->vm_flags & VM_READ;
  1018. }
  1019. EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);