pi.c 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233
  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. #include <linux/slab.h>
  3. #include <linux/sched/task.h>
  4. #include "futex.h"
  5. #include "../locking/rtmutex_common.h"
  6. /*
  7. * PI code:
  8. */
  9. int refill_pi_state_cache(void)
  10. {
  11. struct futex_pi_state *pi_state;
  12. if (likely(current->pi_state_cache))
  13. return 0;
  14. pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
  15. if (!pi_state)
  16. return -ENOMEM;
  17. INIT_LIST_HEAD(&pi_state->list);
  18. /* pi_mutex gets initialized later */
  19. pi_state->owner = NULL;
  20. refcount_set(&pi_state->refcount, 1);
  21. pi_state->key = FUTEX_KEY_INIT;
  22. current->pi_state_cache = pi_state;
  23. return 0;
  24. }
  25. static struct futex_pi_state *alloc_pi_state(void)
  26. {
  27. struct futex_pi_state *pi_state = current->pi_state_cache;
  28. WARN_ON(!pi_state);
  29. current->pi_state_cache = NULL;
  30. return pi_state;
  31. }
  32. static void pi_state_update_owner(struct futex_pi_state *pi_state,
  33. struct task_struct *new_owner)
  34. {
  35. struct task_struct *old_owner = pi_state->owner;
  36. lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
  37. if (old_owner) {
  38. raw_spin_lock(&old_owner->pi_lock);
  39. WARN_ON(list_empty(&pi_state->list));
  40. list_del_init(&pi_state->list);
  41. raw_spin_unlock(&old_owner->pi_lock);
  42. }
  43. if (new_owner) {
  44. raw_spin_lock(&new_owner->pi_lock);
  45. WARN_ON(!list_empty(&pi_state->list));
  46. list_add(&pi_state->list, &new_owner->pi_state_list);
  47. pi_state->owner = new_owner;
  48. raw_spin_unlock(&new_owner->pi_lock);
  49. }
  50. }
  51. void get_pi_state(struct futex_pi_state *pi_state)
  52. {
  53. WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
  54. }
  55. /*
  56. * Drops a reference to the pi_state object and frees or caches it
  57. * when the last reference is gone.
  58. */
  59. void put_pi_state(struct futex_pi_state *pi_state)
  60. {
  61. if (!pi_state)
  62. return;
  63. if (!refcount_dec_and_test(&pi_state->refcount))
  64. return;
  65. /*
  66. * If pi_state->owner is NULL, the owner is most probably dying
  67. * and has cleaned up the pi_state already
  68. */
  69. if (pi_state->owner) {
  70. unsigned long flags;
  71. raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
  72. pi_state_update_owner(pi_state, NULL);
  73. rt_mutex_proxy_unlock(&pi_state->pi_mutex);
  74. raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
  75. }
  76. if (current->pi_state_cache) {
  77. kfree(pi_state);
  78. } else {
  79. /*
  80. * pi_state->list is already empty.
  81. * clear pi_state->owner.
  82. * refcount is at 0 - put it back to 1.
  83. */
  84. pi_state->owner = NULL;
  85. refcount_set(&pi_state->refcount, 1);
  86. current->pi_state_cache = pi_state;
  87. }
  88. }
/*
 * We need to check the following states:
 *
 *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
 *
 * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
 * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
 *
 * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
 *
 * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
 * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
 *
 * [6]  Found  | Found    | task      | 0         | 1      | Valid
 *
 * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
 *
 * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
 * [9]  Found  | Found    | task      | 0         | 0      | Invalid
 * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
 *
 * [1]	Indicates that the kernel can acquire the futex atomically. We
 *	came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
 *
 * [2]	Valid, if TID does not belong to a kernel thread. If no matching
 *	thread is found then it indicates that the owner TID has died.
 *
 * [3]	Invalid. The waiter is queued on a non PI futex
 *
 * [4]	Valid state after exit_robust_list(), which sets the user space
 *	value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
 *
 * [5]	The user space value got manipulated between exit_robust_list()
 *	and exit_pi_state_list()
 *
 * [6]	Valid state after exit_pi_state_list() which sets the new owner in
 *	the pi_state but cannot access the user space value.
 *
 * [7]	pi_state->owner can only be NULL when the OWNER_DIED bit is set.
 *
 * [8]	Owner and user space value match
 *
 * [9]	There is no transient state which sets the user space TID to 0
 *	except exit_robust_list(), but this is indicated by the
 *	FUTEX_OWNER_DIED bit. See [4]
 *
 * [10] There is no transient state which leaves owner and user space
 *	TID out of sync. Except one error case where the kernel is denied
 *	write access to the user address, see fixup_pi_state_owner().
 *
 *
 * Serialization and lifetime rules:
 *
 * hb->lock:
 *
 *	hb -> futex_q, relation
 *	futex_q -> pi_state, relation
 *
 *	(cannot be raw because hb can contain arbitrary amount
 *	 of futex_q's)
 *
 * pi_mutex->wait_lock:
 *
 *	{uval, pi_state}
 *
 *	(and pi_mutex 'obviously')
 *
 * p->pi_lock:
 *
 *	p->pi_state_list -> pi_state->list, relation
 *	pi_mutex->owner -> pi_state->owner, relation
 *
 * pi_state->refcount:
 *
 *	pi_state lifetime
 *
 *
 * Lock order:
 *
 *   hb->lock
 *     pi_mutex->wait_lock
 *       p->pi_lock
 *
 */
  173. /*
  174. * Validate that the existing waiter has a pi_state and sanity check
  175. * the pi_state against the user space value. If correct, attach to
  176. * it.
  177. */
  178. static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
  179. struct futex_pi_state *pi_state,
  180. struct futex_pi_state **ps)
  181. {
  182. pid_t pid = uval & FUTEX_TID_MASK;
  183. u32 uval2;
  184. int ret;
  185. /*
  186. * Userspace might have messed up non-PI and PI futexes [3]
  187. */
  188. if (unlikely(!pi_state))
  189. return -EINVAL;
  190. /*
  191. * We get here with hb->lock held, and having found a
  192. * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
  193. * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
  194. * which in turn means that futex_lock_pi() still has a reference on
  195. * our pi_state.
  196. *
  197. * The waiter holding a reference on @pi_state also protects against
  198. * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
  199. * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
  200. * free pi_state before we can take a reference ourselves.
  201. */
  202. WARN_ON(!refcount_read(&pi_state->refcount));
  203. /*
  204. * Now that we have a pi_state, we can acquire wait_lock
  205. * and do the state validation.
  206. */
  207. raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
  208. /*
  209. * Since {uval, pi_state} is serialized by wait_lock, and our current
  210. * uval was read without holding it, it can have changed. Verify it
  211. * still is what we expect it to be, otherwise retry the entire
  212. * operation.
  213. */
  214. if (futex_get_value_locked(&uval2, uaddr))
  215. goto out_efault;
  216. if (uval != uval2)
  217. goto out_eagain;
  218. /*
  219. * Handle the owner died case:
  220. */
  221. if (uval & FUTEX_OWNER_DIED) {
  222. /*
  223. * exit_pi_state_list sets owner to NULL and wakes the
  224. * topmost waiter. The task which acquires the
  225. * pi_state->rt_mutex will fixup owner.
  226. */
  227. if (!pi_state->owner) {
  228. /*
  229. * No pi state owner, but the user space TID
  230. * is not 0. Inconsistent state. [5]
  231. */
  232. if (pid)
  233. goto out_einval;
  234. /*
  235. * Take a ref on the state and return success. [4]
  236. */
  237. goto out_attach;
  238. }
  239. /*
  240. * If TID is 0, then either the dying owner has not
  241. * yet executed exit_pi_state_list() or some waiter
  242. * acquired the rtmutex in the pi state, but did not
  243. * yet fixup the TID in user space.
  244. *
  245. * Take a ref on the state and return success. [6]
  246. */
  247. if (!pid)
  248. goto out_attach;
  249. } else {
  250. /*
  251. * If the owner died bit is not set, then the pi_state
  252. * must have an owner. [7]
  253. */
  254. if (!pi_state->owner)
  255. goto out_einval;
  256. }
  257. /*
  258. * Bail out if user space manipulated the futex value. If pi
  259. * state exists then the owner TID must be the same as the
  260. * user space TID. [9/10]
  261. */
  262. if (pid != task_pid_vnr(pi_state->owner))
  263. goto out_einval;
  264. out_attach:
  265. get_pi_state(pi_state);
  266. raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
  267. *ps = pi_state;
  268. return 0;
  269. out_einval:
  270. ret = -EINVAL;
  271. goto out_error;
  272. out_eagain:
  273. ret = -EAGAIN;
  274. goto out_error;
  275. out_efault:
  276. ret = -EFAULT;
  277. goto out_error;
  278. out_error:
  279. raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
  280. return ret;
  281. }
  282. static int handle_exit_race(u32 __user *uaddr, u32 uval,
  283. struct task_struct *tsk)
  284. {
  285. u32 uval2;
  286. /*
  287. * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
  288. * caller that the alleged owner is busy.
  289. */
  290. if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
  291. return -EBUSY;
  292. /*
  293. * Reread the user space value to handle the following situation:
  294. *
  295. * CPU0 CPU1
  296. *
  297. * sys_exit() sys_futex()
  298. * do_exit() futex_lock_pi()
  299. * futex_lock_pi_atomic()
  300. * exit_signals(tsk) No waiters:
  301. * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
  302. * mm_release(tsk) Set waiter bit
  303. * exit_robust_list(tsk) { *uaddr = 0x80000PID;
  304. * Set owner died attach_to_pi_owner() {
  305. * *uaddr = 0xC0000000; tsk = get_task(PID);
  306. * } if (!tsk->flags & PF_EXITING) {
  307. * ... attach();
  308. * tsk->futex_state = } else {
  309. * FUTEX_STATE_DEAD; if (tsk->futex_state !=
  310. * FUTEX_STATE_DEAD)
  311. * return -EAGAIN;
  312. * return -ESRCH; <--- FAIL
  313. * }
  314. *
  315. * Returning ESRCH unconditionally is wrong here because the
  316. * user space value has been changed by the exiting task.
  317. *
  318. * The same logic applies to the case where the exiting task is
  319. * already gone.
  320. */
  321. if (futex_get_value_locked(&uval2, uaddr))
  322. return -EFAULT;
  323. /* If the user space value has changed, try again. */
  324. if (uval2 != uval)
  325. return -EAGAIN;
  326. /*
  327. * The exiting task did not have a robust list, the robust list was
  328. * corrupted or the user space value in *uaddr is simply bogus.
  329. * Give up and tell user space.
  330. */
  331. return -ESRCH;
  332. }
  333. static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
  334. struct futex_pi_state **ps)
  335. {
  336. /*
  337. * No existing pi state. First waiter. [2]
  338. *
  339. * This creates pi_state, we have hb->lock held, this means nothing can
  340. * observe this state, wait_lock is irrelevant.
  341. */
  342. struct futex_pi_state *pi_state = alloc_pi_state();
  343. /*
  344. * Initialize the pi_mutex in locked state and make @p
  345. * the owner of it:
  346. */
  347. rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
  348. /* Store the key for possible exit cleanups: */
  349. pi_state->key = *key;
  350. WARN_ON(!list_empty(&pi_state->list));
  351. list_add(&pi_state->list, &p->pi_state_list);
  352. /*
  353. * Assignment without holding pi_state->pi_mutex.wait_lock is safe
  354. * because there is no concurrency as the object is not published yet.
  355. */
  356. pi_state->owner = p;
  357. *ps = pi_state;
  358. }
  359. /*
  360. * Lookup the task for the TID provided from user space and attach to
  361. * it after doing proper sanity checks.
  362. */
  363. static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
  364. struct futex_pi_state **ps,
  365. struct task_struct **exiting)
  366. {
  367. pid_t pid = uval & FUTEX_TID_MASK;
  368. struct task_struct *p;
  369. /*
  370. * We are the first waiter - try to look up the real owner and attach
  371. * the new pi_state to it, but bail out when TID = 0 [1]
  372. *
  373. * The !pid check is paranoid. None of the call sites should end up
  374. * with pid == 0, but better safe than sorry. Let the caller retry
  375. */
  376. if (!pid)
  377. return -EAGAIN;
  378. p = find_get_task_by_vpid(pid);
  379. if (!p)
  380. return handle_exit_race(uaddr, uval, NULL);
  381. if (unlikely(p->flags & PF_KTHREAD)) {
  382. put_task_struct(p);
  383. return -EPERM;
  384. }
  385. /*
  386. * We need to look at the task state to figure out, whether the
  387. * task is exiting. To protect against the change of the task state
  388. * in futex_exit_release(), we do this protected by p->pi_lock:
  389. */
  390. raw_spin_lock_irq(&p->pi_lock);
  391. if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
  392. /*
  393. * The task is on the way out. When the futex state is
  394. * FUTEX_STATE_DEAD, we know that the task has finished
  395. * the cleanup:
  396. */
  397. int ret = handle_exit_race(uaddr, uval, p);
  398. raw_spin_unlock_irq(&p->pi_lock);
  399. /*
  400. * If the owner task is between FUTEX_STATE_EXITING and
  401. * FUTEX_STATE_DEAD then store the task pointer and keep
  402. * the reference on the task struct. The calling code will
  403. * drop all locks, wait for the task to reach
  404. * FUTEX_STATE_DEAD and then drop the refcount. This is
  405. * required to prevent a live lock when the current task
  406. * preempted the exiting task between the two states.
  407. */
  408. if (ret == -EBUSY)
  409. *exiting = p;
  410. else
  411. put_task_struct(p);
  412. return ret;
  413. }
  414. __attach_to_pi_owner(p, key, ps);
  415. raw_spin_unlock_irq(&p->pi_lock);
  416. put_task_struct(p);
  417. return 0;
  418. }
  419. static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
  420. {
  421. int err;
  422. u32 curval;
  423. if (unlikely(should_fail_futex(true)))
  424. return -EFAULT;
  425. err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
  426. if (unlikely(err))
  427. return err;
  428. /* If user space value changed, let the caller retry */
  429. return curval != uval ? -EAGAIN : 0;
  430. }
  431. /**
  432. * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
  433. * @uaddr: the pi futex user address
  434. * @hb: the pi futex hash bucket
  435. * @key: the futex key associated with uaddr and hb
  436. * @ps: the pi_state pointer where we store the result of the
  437. * lookup
  438. * @task: the task to perform the atomic lock work for. This will
  439. * be "current" except in the case of requeue pi.
  440. * @exiting: Pointer to store the task pointer of the owner task
  441. * which is in the middle of exiting
  442. * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
  443. *
  444. * Return:
  445. * - 0 - ready to wait;
  446. * - 1 - acquired the lock;
  447. * - <0 - error
  448. *
  449. * The hb->lock must be held by the caller.
  450. *
  451. * @exiting is only set when the return value is -EBUSY. If so, this holds
  452. * a refcount on the exiting task on return and the caller needs to drop it
  453. * after waiting for the exit to complete.
  454. */
  455. int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
  456. union futex_key *key,
  457. struct futex_pi_state **ps,
  458. struct task_struct *task,
  459. struct task_struct **exiting,
  460. int set_waiters)
  461. {
  462. u32 uval, newval, vpid = task_pid_vnr(task);
  463. struct futex_q *top_waiter;
  464. int ret;
  465. /*
  466. * Read the user space value first so we can validate a few
  467. * things before proceeding further.
  468. */
  469. if (futex_get_value_locked(&uval, uaddr))
  470. return -EFAULT;
  471. if (unlikely(should_fail_futex(true)))
  472. return -EFAULT;
  473. /*
  474. * Detect deadlocks.
  475. */
  476. if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
  477. return -EDEADLK;
  478. if ((unlikely(should_fail_futex(true))))
  479. return -EDEADLK;
  480. /*
  481. * Lookup existing state first. If it exists, try to attach to
  482. * its pi_state.
  483. */
  484. top_waiter = futex_top_waiter(hb, key);
  485. if (top_waiter)
  486. return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
  487. /*
  488. * No waiter and user TID is 0. We are here because the
  489. * waiters or the owner died bit is set or called from
  490. * requeue_cmp_pi or for whatever reason something took the
  491. * syscall.
  492. */
  493. if (!(uval & FUTEX_TID_MASK)) {
  494. /*
  495. * We take over the futex. No other waiters and the user space
  496. * TID is 0. We preserve the owner died bit.
  497. */
  498. newval = uval & FUTEX_OWNER_DIED;
  499. newval |= vpid;
  500. /* The futex requeue_pi code can enforce the waiters bit */
  501. if (set_waiters)
  502. newval |= FUTEX_WAITERS;
  503. ret = lock_pi_update_atomic(uaddr, uval, newval);
  504. if (ret)
  505. return ret;
  506. /*
  507. * If the waiter bit was requested the caller also needs PI
  508. * state attached to the new owner of the user space futex.
  509. *
  510. * @task is guaranteed to be alive and it cannot be exiting
  511. * because it is either sleeping or waiting in
  512. * futex_requeue_pi_wakeup_sync().
  513. *
  514. * No need to do the full attach_to_pi_owner() exercise
  515. * because @task is known and valid.
  516. */
  517. if (set_waiters) {
  518. raw_spin_lock_irq(&task->pi_lock);
  519. __attach_to_pi_owner(task, key, ps);
  520. raw_spin_unlock_irq(&task->pi_lock);
  521. }
  522. return 1;
  523. }
  524. /*
  525. * First waiter. Set the waiters bit before attaching ourself to
  526. * the owner. If owner tries to unlock, it will be forced into
  527. * the kernel and blocked on hb->lock.
  528. */
  529. newval = uval | FUTEX_WAITERS;
  530. ret = lock_pi_update_atomic(uaddr, uval, newval);
  531. if (ret)
  532. return ret;
  533. /*
  534. * If the update of the user space value succeeded, we try to
  535. * attach to the owner. If that fails, no harm done, we only
  536. * set the FUTEX_WAITERS bit in the user space variable.
  537. */
  538. return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
  539. }
  540. /*
  541. * Caller must hold a reference on @pi_state.
  542. */
  543. static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
  544. {
  545. struct rt_mutex_waiter *top_waiter;
  546. struct task_struct *new_owner;
  547. bool postunlock = false;
  548. DEFINE_RT_WAKE_Q(wqh);
  549. u32 curval, newval;
  550. int ret = 0;
  551. top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
  552. if (WARN_ON_ONCE(!top_waiter)) {
  553. /*
  554. * As per the comment in futex_unlock_pi() this should not happen.
  555. *
  556. * When this happens, give up our locks and try again, giving
  557. * the futex_lock_pi() instance time to complete, either by
  558. * waiting on the rtmutex or removing itself from the futex
  559. * queue.
  560. */
  561. ret = -EAGAIN;
  562. goto out_unlock;
  563. }
  564. new_owner = top_waiter->task;
  565. /*
  566. * We pass it to the next owner. The WAITERS bit is always kept
  567. * enabled while there is PI state around. We cleanup the owner
  568. * died bit, because we are the owner.
  569. */
  570. newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
  571. if (unlikely(should_fail_futex(true))) {
  572. ret = -EFAULT;
  573. goto out_unlock;
  574. }
  575. ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
  576. if (!ret && (curval != uval)) {
  577. /*
  578. * If a unconditional UNLOCK_PI operation (user space did not
  579. * try the TID->0 transition) raced with a waiter setting the
  580. * FUTEX_WAITERS flag between get_user() and locking the hash
  581. * bucket lock, retry the operation.
  582. */
  583. if ((FUTEX_TID_MASK & curval) == uval)
  584. ret = -EAGAIN;
  585. else
  586. ret = -EINVAL;
  587. }
  588. if (!ret) {
  589. /*
  590. * This is a point of no return; once we modified the uval
  591. * there is no going back and subsequent operations must
  592. * not fail.
  593. */
  594. pi_state_update_owner(pi_state, new_owner);
  595. postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
  596. }
  597. out_unlock:
  598. raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
  599. if (postunlock)
  600. rt_mutex_postunlock(&wqh);
  601. return ret;
  602. }
  603. static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
  604. struct task_struct *argowner)
  605. {
  606. struct futex_pi_state *pi_state = q->pi_state;
  607. struct task_struct *oldowner, *newowner;
  608. u32 uval, curval, newval, newtid;
  609. int err = 0;
  610. oldowner = pi_state->owner;
  611. /*
  612. * We are here because either:
  613. *
  614. * - we stole the lock and pi_state->owner needs updating to reflect
  615. * that (@argowner == current),
  616. *
  617. * or:
  618. *
  619. * - someone stole our lock and we need to fix things to point to the
  620. * new owner (@argowner == NULL).
  621. *
  622. * Either way, we have to replace the TID in the user space variable.
  623. * This must be atomic as we have to preserve the owner died bit here.
  624. *
  625. * Note: We write the user space value _before_ changing the pi_state
  626. * because we can fault here. Imagine swapped out pages or a fork
  627. * that marked all the anonymous memory readonly for cow.
  628. *
  629. * Modifying pi_state _before_ the user space value would leave the
  630. * pi_state in an inconsistent state when we fault here, because we
  631. * need to drop the locks to handle the fault. This might be observed
  632. * in the PID checks when attaching to PI state .
  633. */
  634. retry:
  635. if (!argowner) {
  636. if (oldowner != current) {
  637. /*
  638. * We raced against a concurrent self; things are
  639. * already fixed up. Nothing to do.
  640. */
  641. return 0;
  642. }
  643. if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
  644. /* We got the lock. pi_state is correct. Tell caller. */
  645. return 1;
  646. }
  647. /*
  648. * The trylock just failed, so either there is an owner or
  649. * there is a higher priority waiter than this one.
  650. */
  651. newowner = rt_mutex_owner(&pi_state->pi_mutex);
  652. /*
  653. * If the higher priority waiter has not yet taken over the
  654. * rtmutex then newowner is NULL. We can't return here with
  655. * that state because it's inconsistent vs. the user space
  656. * state. So drop the locks and try again. It's a valid
  657. * situation and not any different from the other retry
  658. * conditions.
  659. */
  660. if (unlikely(!newowner)) {
  661. err = -EAGAIN;
  662. goto handle_err;
  663. }
  664. } else {
  665. WARN_ON_ONCE(argowner != current);
  666. if (oldowner == current) {
  667. /*
  668. * We raced against a concurrent self; things are
  669. * already fixed up. Nothing to do.
  670. */
  671. return 1;
  672. }
  673. newowner = argowner;
  674. }
  675. newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
  676. /* Owner died? */
  677. if (!pi_state->owner)
  678. newtid |= FUTEX_OWNER_DIED;
  679. err = futex_get_value_locked(&uval, uaddr);
  680. if (err)
  681. goto handle_err;
  682. for (;;) {
  683. newval = (uval & FUTEX_OWNER_DIED) | newtid;
  684. err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
  685. if (err)
  686. goto handle_err;
  687. if (curval == uval)
  688. break;
  689. uval = curval;
  690. }
  691. /*
  692. * We fixed up user space. Now we need to fix the pi_state
  693. * itself.
  694. */
  695. pi_state_update_owner(pi_state, newowner);
  696. return argowner == current;
  697. /*
  698. * In order to reschedule or handle a page fault, we need to drop the
  699. * locks here. In the case of a fault, this gives the other task
  700. * (either the highest priority waiter itself or the task which stole
  701. * the rtmutex) the chance to try the fixup of the pi_state. So once we
  702. * are back from handling the fault we need to check the pi_state after
  703. * reacquiring the locks and before trying to do another fixup. When
  704. * the fixup has been done already we simply return.
  705. *
  706. * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
  707. * drop hb->lock since the caller owns the hb -> futex_q relation.
  708. * Dropping the pi_mutex->wait_lock requires the state revalidate.
  709. */
  710. handle_err:
  711. raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
  712. spin_unlock(q->lock_ptr);
  713. switch (err) {
  714. case -EFAULT:
  715. err = fault_in_user_writeable(uaddr);
  716. break;
  717. case -EAGAIN:
  718. cond_resched();
  719. err = 0;
  720. break;
  721. default:
  722. WARN_ON_ONCE(1);
  723. break;
  724. }
  725. spin_lock(q->lock_ptr);
  726. raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
  727. /*
  728. * Check if someone else fixed it for us:
  729. */
  730. if (pi_state->owner != oldowner)
  731. return argowner == current;
  732. /* Retry if err was -EAGAIN or the fault in succeeded */
  733. if (!err)
  734. goto retry;
  735. /*
  736. * fault_in_user_writeable() failed so user state is immutable. At
  737. * best we can make the kernel state consistent but user state will
  738. * be most likely hosed and any subsequent unlock operation will be
  739. * rejected due to PI futex rule [10].
  740. *
  741. * Ensure that the rtmutex owner is also the pi_state owner despite
  742. * the user space value claiming something different. There is no
  743. * point in unlocking the rtmutex if current is the owner as it
  744. * would need to wait until the next waiter has taken the rtmutex
  745. * to guarantee consistent state. Keep it simple. Userspace asked
  746. * for this wreckaged state.
  747. *
  748. * The rtmutex has an owner - either current or some other
  749. * task. See the EAGAIN loop above.
  750. */
  751. pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
  752. return err;
  753. }
  754. static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
  755. struct task_struct *argowner)
  756. {
  757. struct futex_pi_state *pi_state = q->pi_state;
  758. int ret;
  759. lockdep_assert_held(q->lock_ptr);
  760. raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
  761. ret = __fixup_pi_state_owner(uaddr, q, argowner);
  762. raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
  763. return ret;
  764. }
  765. /**
  766. * fixup_pi_owner() - Post lock pi_state and corner case management
  767. * @uaddr: user address of the futex
  768. * @q: futex_q (contains pi_state and access to the rt_mutex)
  769. * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
  770. *
  771. * After attempting to lock an rt_mutex, this function is called to cleanup
  772. * the pi_state owner as well as handle race conditions that may allow us to
  773. * acquire the lock. Must be called with the hb lock held.
  774. *
  775. * Return:
  776. * - 1 - success, lock taken;
  777. * - 0 - success, lock not taken;
  778. * - <0 - on error (-EFAULT)
  779. */
  780. int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
  781. {
  782. if (locked) {
  783. /*
  784. * Got the lock. We might not be the anticipated owner if we
  785. * did a lock-steal - fix up the PI-state in that case:
  786. *
  787. * Speculative pi_state->owner read (we don't hold wait_lock);
  788. * since we own the lock pi_state->owner == current is the
  789. * stable state, anything else needs more attention.
  790. */
  791. if (q->pi_state->owner != current)
  792. return fixup_pi_state_owner(uaddr, q, current);
  793. return 1;
  794. }
  795. /*
  796. * If we didn't get the lock; check if anybody stole it from us. In
  797. * that case, we need to fix up the uval to point to them instead of
  798. * us, otherwise bad things happen. [10]
  799. *
  800. * Another speculative read; pi_state->owner == current is unstable
  801. * but needs our attention.
  802. */
  803. if (q->pi_state->owner == current)
  804. return fixup_pi_state_owner(uaddr, q, NULL);
  805. /*
  806. * Paranoia check. If we did not take the lock, then we should not be
  807. * the owner of the rt_mutex. Warn and establish consistent state.
  808. */
  809. if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
  810. return fixup_pi_state_owner(uaddr, q, current);
  811. return 0;
  812. }
  813. /*
  814. * Userspace tried a 0 -> TID atomic transition of the futex value
  815. * and failed. The kernel side here does the whole locking operation:
  816. * if there are waiters then it will block as a consequence of relying
  817. * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
  818. * a 0 value of the futex too.).
  819. *
  820. * Also serves as futex trylock_pi()'ing, and due semantics.
  821. */
  822. int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
  823. {
  824. struct hrtimer_sleeper timeout, *to;
  825. struct task_struct *exiting = NULL;
  826. struct rt_mutex_waiter rt_waiter;
  827. struct futex_hash_bucket *hb;
  828. struct futex_q q = futex_q_init;
  829. int res, ret;
  830. if (!IS_ENABLED(CONFIG_FUTEX_PI))
  831. return -ENOSYS;
  832. if (refill_pi_state_cache())
  833. return -ENOMEM;
  834. to = futex_setup_timer(time, &timeout, flags, 0);
  835. retry:
  836. ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
  837. if (unlikely(ret != 0))
  838. goto out;
  839. retry_private:
  840. hb = futex_q_lock(&q);
  841. ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
  842. &exiting, 0);
  843. if (unlikely(ret)) {
  844. /*
  845. * Atomic work succeeded and we got the lock,
  846. * or failed. Either way, we do _not_ block.
  847. */
  848. switch (ret) {
  849. case 1:
  850. /* We got the lock. */
  851. ret = 0;
  852. goto out_unlock_put_key;
  853. case -EFAULT:
  854. goto uaddr_faulted;
  855. case -EBUSY:
  856. case -EAGAIN:
  857. /*
  858. * Two reasons for this:
  859. * - EBUSY: Task is exiting and we just wait for the
  860. * exit to complete.
  861. * - EAGAIN: The user space value changed.
  862. */
  863. futex_q_unlock(hb);
  864. /*
  865. * Handle the case where the owner is in the middle of
  866. * exiting. Wait for the exit to complete otherwise
  867. * this task might loop forever, aka. live lock.
  868. */
  869. wait_for_owner_exiting(ret, exiting);
  870. cond_resched();
  871. goto retry;
  872. default:
  873. goto out_unlock_put_key;
  874. }
  875. }
  876. WARN_ON(!q.pi_state);
  877. /*
  878. * Only actually queue now that the atomic ops are done:
  879. */
  880. __futex_queue(&q, hb);
  881. if (trylock) {
  882. ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
  883. /* Fixup the trylock return value: */
  884. ret = ret ? 0 : -EWOULDBLOCK;
  885. goto no_block;
  886. }
  887. rt_mutex_init_waiter(&rt_waiter);
  888. /*
  889. * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
  890. * hold it while doing rt_mutex_start_proxy(), because then it will
  891. * include hb->lock in the blocking chain, even through we'll not in
  892. * fact hold it while blocking. This will lead it to report -EDEADLK
  893. * and BUG when futex_unlock_pi() interleaves with this.
  894. *
  895. * Therefore acquire wait_lock while holding hb->lock, but drop the
  896. * latter before calling __rt_mutex_start_proxy_lock(). This
  897. * interleaves with futex_unlock_pi() -- which does a similar lock
  898. * handoff -- such that the latter can observe the futex_q::pi_state
  899. * before __rt_mutex_start_proxy_lock() is done.
  900. */
  901. raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
  902. spin_unlock(q.lock_ptr);
  903. /*
  904. * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
  905. * such that futex_unlock_pi() is guaranteed to observe the waiter when
  906. * it sees the futex_q::pi_state.
  907. */
  908. ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
  909. raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
  910. if (ret) {
  911. if (ret == 1)
  912. ret = 0;
  913. goto cleanup;
  914. }
  915. if (unlikely(to))
  916. hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
  917. ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
  918. cleanup:
  919. spin_lock(q.lock_ptr);
  920. /*
  921. * If we failed to acquire the lock (deadlock/signal/timeout), we must
  922. * first acquire the hb->lock before removing the lock from the
  923. * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
  924. * lists consistent.
  925. *
  926. * In particular; it is important that futex_unlock_pi() can not
  927. * observe this inconsistency.
  928. */
  929. if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
  930. ret = 0;
  931. no_block:
  932. /*
  933. * Fixup the pi_state owner and possibly acquire the lock if we
  934. * haven't already.
  935. */
  936. res = fixup_pi_owner(uaddr, &q, !ret);
  937. /*
  938. * If fixup_pi_owner() returned an error, propagate that. If it acquired
  939. * the lock, clear our -ETIMEDOUT or -EINTR.
  940. */
  941. if (res)
  942. ret = (res < 0) ? res : 0;
  943. futex_unqueue_pi(&q);
  944. spin_unlock(q.lock_ptr);
  945. goto out;
  946. out_unlock_put_key:
  947. futex_q_unlock(hb);
  948. out:
  949. if (to) {
  950. hrtimer_cancel(&to->timer);
  951. destroy_hrtimer_on_stack(&to->timer);
  952. }
  953. return ret != -EINTR ? ret : -ERESTARTNOINTR;
  954. uaddr_faulted:
  955. futex_q_unlock(hb);
  956. ret = fault_in_user_writeable(uaddr);
  957. if (ret)
  958. goto out;
  959. if (!(flags & FLAGS_SHARED))
  960. goto retry_private;
  961. goto retry;
  962. }
  963. /*
  964. * Userspace attempted a TID -> 0 atomic transition, and failed.
  965. * This is the in-kernel slowpath: we look up the PI state (if any),
  966. * and do the rt-mutex unlock.
  967. */
  968. int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
  969. {
  970. u32 curval, uval, vpid = task_pid_vnr(current);
  971. union futex_key key = FUTEX_KEY_INIT;
  972. struct futex_hash_bucket *hb;
  973. struct futex_q *top_waiter;
  974. int ret;
  975. if (!IS_ENABLED(CONFIG_FUTEX_PI))
  976. return -ENOSYS;
  977. retry:
  978. if (get_user(uval, uaddr))
  979. return -EFAULT;
  980. /*
  981. * We release only a lock we actually own:
  982. */
  983. if ((uval & FUTEX_TID_MASK) != vpid)
  984. return -EPERM;
  985. ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
  986. if (ret)
  987. return ret;
  988. hb = futex_hash(&key);
  989. spin_lock(&hb->lock);
  990. /*
  991. * Check waiters first. We do not trust user space values at
  992. * all and we at least want to know if user space fiddled
  993. * with the futex value instead of blindly unlocking.
  994. */
  995. top_waiter = futex_top_waiter(hb, &key);
  996. if (top_waiter) {
  997. struct futex_pi_state *pi_state = top_waiter->pi_state;
  998. ret = -EINVAL;
  999. if (!pi_state)
  1000. goto out_unlock;
  1001. /*
  1002. * If current does not own the pi_state then the futex is
  1003. * inconsistent and user space fiddled with the futex value.
  1004. */
  1005. if (pi_state->owner != current)
  1006. goto out_unlock;
  1007. get_pi_state(pi_state);
  1008. /*
  1009. * By taking wait_lock while still holding hb->lock, we ensure
  1010. * there is no point where we hold neither; and therefore
  1011. * wake_futex_p() must observe a state consistent with what we
  1012. * observed.
  1013. *
  1014. * In particular; this forces __rt_mutex_start_proxy() to
  1015. * complete such that we're guaranteed to observe the
  1016. * rt_waiter. Also see the WARN in wake_futex_pi().
  1017. */
  1018. raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
  1019. spin_unlock(&hb->lock);
  1020. /* drops pi_state->pi_mutex.wait_lock */
  1021. ret = wake_futex_pi(uaddr, uval, pi_state);
  1022. put_pi_state(pi_state);
  1023. /*
  1024. * Success, we're done! No tricky corner cases.
  1025. */
  1026. if (!ret)
  1027. return ret;
  1028. /*
  1029. * The atomic access to the futex value generated a
  1030. * pagefault, so retry the user-access and the wakeup:
  1031. */
  1032. if (ret == -EFAULT)
  1033. goto pi_faulted;
  1034. /*
  1035. * A unconditional UNLOCK_PI op raced against a waiter
  1036. * setting the FUTEX_WAITERS bit. Try again.
  1037. */
  1038. if (ret == -EAGAIN)
  1039. goto pi_retry;
  1040. /*
  1041. * wake_futex_pi has detected invalid state. Tell user
  1042. * space.
  1043. */
  1044. return ret;
  1045. }
  1046. /*
  1047. * We have no kernel internal state, i.e. no waiters in the
  1048. * kernel. Waiters which are about to queue themselves are stuck
  1049. * on hb->lock. So we can safely ignore them. We do neither
  1050. * preserve the WAITERS bit not the OWNER_DIED one. We are the
  1051. * owner.
  1052. */
  1053. if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
  1054. spin_unlock(&hb->lock);
  1055. switch (ret) {
  1056. case -EFAULT:
  1057. goto pi_faulted;
  1058. case -EAGAIN:
  1059. goto pi_retry;
  1060. default:
  1061. WARN_ON_ONCE(1);
  1062. return ret;
  1063. }
  1064. }
  1065. /*
  1066. * If uval has changed, let user space handle it.
  1067. */
  1068. ret = (curval == uval) ? 0 : -EAGAIN;
  1069. out_unlock:
  1070. spin_unlock(&hb->lock);
  1071. return ret;
  1072. pi_retry:
  1073. cond_resched();
  1074. goto retry;
  1075. pi_faulted:
  1076. ret = fault_in_user_writeable(uaddr);
  1077. if (!ret)
  1078. goto retry;
  1079. return ret;
  1080. }