// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Fast Userspace Mutexes (which I call "Futexes!").
 * (C) Rusty Russell, IBM 2002
 *
 * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
 * (C) Copyright 2003 Red Hat Inc, All Rights Reserved
 *
 * Removed page pinning, fix privately mapped COW pages and other cleanups
 * (C) Copyright 2003, 2004 Jamie Lokier
 *
 * Robust futex support started by Ingo Molnar
 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
 *
 * PI-futex support started by Ingo Molnar and Thomas Gleixner
 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <[email protected]>
 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <[email protected]>
 *
 * PRIVATE futexes by Eric Dumazet
 * Copyright (C) 2007 Eric Dumazet <[email protected]>
 *
 * Requeue-PI support by Darren Hart <[email protected]>
 * Copyright (C) IBM Corporation, 2009
 * Thanks to Thomas Gleixner for conceptual design and careful reviews.
 *
 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
 * enough at me, Linus for the original (flawed) idea, Matthew
 * Kirkwood for proof-of-concept implementation.
 *
 * "The futexes are also cursed."
 * "But they come in a choice of three flavours!"
 */
#include <linux/compat.h>
#include <linux/jhash.h>
#include <linux/pagemap.h>
#include <linux/memblock.h>
#include <linux/fault-inject.h>
#include <linux/slab.h>

#include "futex.h"
#include "../locking/rtmutex_common.h"

#include <trace/hooks/futex.h>

/*
 * The base of the bucket array and its size are always used together
 * (after initialization only in futex_hash()), so ensure that they
 * reside in the same cacheline.
 */
static struct {
	struct futex_hash_bucket *queues;
	unsigned long hashsize;
} __futex_data __read_mostly __aligned(2*sizeof(long));

#define futex_queues	(__futex_data.queues)
#define futex_hashsize	(__futex_data.hashsize)

/*
 * Fault injections for futexes.
 */
#ifdef CONFIG_FAIL_FUTEX

static struct {
	struct fault_attr attr;
	bool ignore_private;
} fail_futex = {
	.attr = FAULT_ATTR_INITIALIZER,
	.ignore_private = false,
};

static int __init setup_fail_futex(char *str)
{
	return setup_fault_attr(&fail_futex.attr, str);
}
__setup("fail_futex=", setup_fail_futex);

bool should_fail_futex(bool fshared)
{
	if (fail_futex.ignore_private && !fshared)
		return false;

	return should_fail(&fail_futex.attr, 1);
}

#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS

static int __init fail_futex_debugfs(void)
{
	umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
	struct dentry *dir;

	dir = fault_create_debugfs_attr("fail_futex", NULL,
					&fail_futex.attr);
	if (IS_ERR(dir))
		return PTR_ERR(dir);

	debugfs_create_bool("ignore-private", mode, dir,
			    &fail_futex.ignore_private);
	return 0;
}

late_initcall(fail_futex_debugfs);

#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */

#endif /* CONFIG_FAIL_FUTEX */

/**
 * futex_hash - Return the hash bucket in the global hash
 * @key:	Pointer to the futex key for which the hash is calculated
 *
 * We hash on the keys returned from get_futex_key (see below) and return the
 * corresponding hash bucket in the global hash.
 */
struct futex_hash_bucket *futex_hash(union futex_key *key)
{
	u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4,
			  key->both.offset);

	return &futex_queues[hash & (futex_hashsize - 1)];
}
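/*
 * Conceptually, since futex_hashsize is always a power of two (see
 * futex_init() below), the masking above is just a cheap modulo. A sketch
 * with illustrative names (index, key_words, n_words are not real code):
 *
 *	index = jhash2(key_words, n_words, key->both.offset) % futex_hashsize;
 *	hb    = &futex_queues[index];
 *
 * where n_words covers every member of the key up to (but not including)
 * both.offset, which is folded in as the hash seed instead.
 */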
/**
 * futex_setup_timer - set up the sleeping hrtimer.
 * @time:	ptr to the given timeout value
 * @timeout:	the hrtimer_sleeper structure to be set up
 * @flags:	futex flags
 * @range_ns:	optional range in ns
 *
 * Return: Initialized hrtimer_sleeper structure or NULL if no timeout
 *	   value given
 */
struct hrtimer_sleeper *
futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout,
		  int flags, u64 range_ns)
{
	if (!time)
		return NULL;

	hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ?
				      CLOCK_REALTIME : CLOCK_MONOTONIC,
				      HRTIMER_MODE_ABS);
	/*
	 * If range_ns is 0, calling hrtimer_set_expires_range_ns() is
	 * effectively the same as calling hrtimer_set_expires().
	 */
	hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns);

	return timeout;
}
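/*
 * Typical caller pattern, sketched for orientation only (the real wait
 * paths live in futex_wait() and friends; "to" and "abs_time" are
 * illustrative names):
 *
 *	struct hrtimer_sleeper timeout, *to;
 *
 *	to = futex_setup_timer(abs_time, &timeout, flags,
 *			       current->timer_slack_ns);
 *	if (to)
 *		hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
 *	... block until woken or the timer fires ...
 *	if (to) {
 *		hrtimer_cancel(&to->timer);
 *		destroy_hrtimer_on_stack(&to->timer);
 *	}
 */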
/*
 * Generate a machine wide unique identifier for this inode.
 *
 * This relies on u64 not wrapping in the life-time of the machine; which with
 * 1ns resolution means almost 585 years.
 *
 * This further relies on the fact that a well formed program will not unmap
 * the file while it has a (shared) futex waiting on it. This mapping will have
 * a file reference which pins the mount and inode.
 *
 * If for some reason an inode gets evicted and read back in again, it will get
 * a new sequence number and will _NOT_ match, even though it is the exact same
 * file.
 *
 * It is important that futex_match() will never have a false-positive, esp.
 * for PI futexes that can mess up the state. The above argues that
 * false-negatives are only possible for malformed programs.
 */
static u64 get_inode_sequence_number(struct inode *inode)
{
	static atomic64_t i_seq;
	u64 old;

	/* Does the inode already have a sequence number? */
	old = atomic64_read(&inode->i_sequence);
	if (likely(old))
		return old;

	for (;;) {
		u64 new = atomic64_add_return(1, &i_seq);
		if (WARN_ON_ONCE(!new))
			continue;

		old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new);
		if (old)
			return old;
		return new;
	}
}
/**
 * get_futex_key() - Get parameters which are the keys for a futex
 * @uaddr:	virtual address of the futex
 * @fshared:	false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED
 * @key:	address where result is stored.
 * @rw:		mapping needs to be read/write (values: FUTEX_READ,
 *		FUTEX_WRITE)
 *
 * Return: a negative error code or 0
 *
 * The key words are stored in @key on success.
 *
 * For shared mappings (when @fshared), the key is:
 *
 *   ( inode->i_sequence, page->index, offset_within_page )
 *
 * [ also see get_inode_sequence_number() ]
 *
 * For private mappings (or when !@fshared), the key is:
 *
 *   ( current->mm, address, 0 )
 *
 * This allows (cross process, where applicable) identification of the futex
 * without keeping the page pinned for the duration of the FUTEX_WAIT.
 *
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key,
		  enum futex_access rw)
{
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
	struct page *page, *tail;
	struct address_space *mapping;
	int err, ro = 0;

	/*
	 * The futex address must be "naturally" aligned.
	 */
	key->both.offset = address % PAGE_SIZE;
	if (unlikely((address % sizeof(u32)) != 0))
		return -EINVAL;
	address -= key->both.offset;

	if (unlikely(!access_ok(uaddr, sizeof(u32))))
		return -EFAULT;

	if (unlikely(should_fail_futex(fshared)))
		return -EFAULT;
	/*
	 * PROCESS_PRIVATE futexes are fast.
	 * As the mm cannot disappear under us and the 'key' only needs the
	 * virtual address, we don't even have to find the underlying vma.
	 * Note: We do have to check that 'uaddr' is a valid user address,
	 * but access_ok() should be faster than find_vma().
	 */
	if (!fshared) {
		/*
		 * On no-MMU, shared futexes are treated as private, therefore
		 * we must not include the current process in the key. Since
		 * there is only one address space, the address is a unique key
		 * on its own.
		 */
		if (IS_ENABLED(CONFIG_MMU))
			key->private.mm = mm;
		else
			key->private.mm = NULL;

		key->private.address = address;
		return 0;
	}
again:
	/* Ignore any VERIFY_READ mapping (futex common case) */
	if (unlikely(should_fail_futex(true)))
		return -EFAULT;

	err = get_user_pages_fast(address, 1, FOLL_WRITE, &page);
	/*
	 * If write access is not required (eg. FUTEX_WAIT), try
	 * and get read-only access.
	 */
	if (err == -EFAULT && rw == FUTEX_READ) {
		err = get_user_pages_fast(address, 1, 0, &page);
		ro = 1;
	}
	if (err < 0)
		return err;
	else
		err = 0;

	/*
	 * The treatment of mapping from this point on is critical. The page
	 * lock protects many things but in this context the page lock
	 * stabilizes mapping, prevents inode freeing in the shared
	 * file-backed region case and guards against movement to swap cache.
	 *
	 * Strictly speaking the page lock is not needed in all cases being
	 * considered here and page lock forces unnecessary serialization.
	 * From this point on, mapping will be re-verified if necessary and
	 * page lock will be acquired only if it is unavoidable.
	 *
	 * Mapping checks require the head page for any compound page so the
	 * head page and mapping are looked up now. For anonymous pages, it
	 * does not matter if the page splits in the future as the key is
	 * based on the address. For filesystem-backed pages, the tail is
	 * required as the index of the page determines the key. For
	 * base pages, there is no tail page and tail == page.
	 */
	tail = page;
	page = compound_head(page);
	mapping = READ_ONCE(page->mapping);

	/*
	 * If page->mapping is NULL, then it cannot be a PageAnon
	 * page; but it might be the ZERO_PAGE or in the gate area or
	 * in a special mapping (all cases which we are happy to fail);
	 * or it may have been a good file page when get_user_pages_fast
	 * found it, but truncated or holepunched or subjected to
	 * invalidate_complete_page2 before we got the page lock (also
	 * cases which we are happy to fail). And we hold a reference,
	 * so refcount care in invalidate_inode_page's remove_mapping
	 * prevents drop_caches from setting mapping to NULL beneath us.
	 *
	 * The case we do have to guard against is when memory pressure made
	 * shmem_writepage move it from filecache to swapcache beneath us:
	 * an unlikely race, but we do need to retry for page->mapping.
	 */
	if (unlikely(!mapping)) {
		int shmem_swizzled;

		/*
		 * Page lock is required to identify which special case above
		 * applies. If this is really a shmem page then the page lock
		 * will prevent unexpected transitions.
		 */
		lock_page(page);
		shmem_swizzled = PageSwapCache(page) || page->mapping;
		unlock_page(page);
		put_page(page);

		if (shmem_swizzled)
			goto again;

		return -EFAULT;
	}
	/*
	 * Private mappings are handled in a simple way.
	 *
	 * If the futex key is stored on an anonymous page, then the associated
	 * object is the mm which is implicitly pinned by the calling process.
	 *
	 * NOTE: When userspace waits on a MAP_SHARED mapping, even if
	 * it's a read-only handle, it's expected that futexes attach to
	 * the object not the particular process.
	 */
	if (PageAnon(page)) {
		/*
		 * A RO anonymous page will never change and thus doesn't make
		 * sense for futex operations.
		 */
		if (unlikely(should_fail_futex(true)) || ro) {
			err = -EFAULT;
			goto out;
		}

		key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
		key->private.mm = mm;
		key->private.address = address;

	} else {
		struct inode *inode;

		/*
		 * The associated futex object in this case is the inode and
		 * the page->mapping must be traversed. Ordinarily this should
		 * be stabilised under page lock but it's not strictly
		 * necessary in this case as we just want to pin the inode, not
		 * update the radix tree or anything like that.
		 *
		 * The RCU read lock is taken as the inode is finally freed
		 * under RCU. If the mapping still matches expectations then the
		 * mapping->host can be safely accessed as being a valid inode.
		 */
		rcu_read_lock();

		if (READ_ONCE(page->mapping) != mapping) {
			rcu_read_unlock();
			put_page(page);

			goto again;
		}

		inode = READ_ONCE(mapping->host);
		if (!inode) {
			rcu_read_unlock();
			put_page(page);

			goto again;
		}

		key->both.offset |= FUT_OFF_INODE; /* inode-based key */
		key->shared.i_seq = get_inode_sequence_number(inode);
		key->shared.pgoff = page_to_pgoff(tail);
		rcu_read_unlock();
	}

out:
	put_page(page);
	return err;
}
/**
 * fault_in_user_writeable() - Fault in user address and verify RW access
 * @uaddr:	pointer to faulting user space address
 *
 * Slow path to fixup the fault we just took in the atomic write
 * access to @uaddr.
 *
 * We have no generic implementation of a non-destructive write to the
 * user address. We know that we faulted in the atomic pagefault
 * disabled section so we can as well avoid the #PF overhead by
 * calling get_user_pages() right away.
 */
int fault_in_user_writeable(u32 __user *uaddr)
{
	struct mm_struct *mm = current->mm;
	int ret;

	mmap_read_lock(mm);
	ret = fixup_user_fault(mm, (unsigned long)uaddr,
			       FAULT_FLAG_WRITE, NULL);
	mmap_read_unlock(mm);

	return ret < 0 ? ret : 0;
}

/**
 * futex_top_waiter() - Return the highest priority waiter on a futex
 * @hb:		the hash bucket the futex_q's reside in
 * @key:	the futex key (to distinguish it from other futex futex_q's)
 *
 * Must be called with the hb lock held.
 */
struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key)
{
	struct futex_q *this;

	plist_for_each_entry(this, &hb->chain, list) {
		if (futex_match(&this->key, key))
			return this;
	}
	return NULL;
}

int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval)
{
	int ret;

	pagefault_disable();
	ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
	pagefault_enable();

	return ret;
}

int futex_get_value_locked(u32 *dest, u32 __user *from)
{
	int ret;

	pagefault_disable();
	ret = __get_user(*dest, from);
	pagefault_enable();

	return ret ? -EFAULT : 0;
}
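/*
 * The *_locked helpers above run with pagefaults disabled, so they can be
 * called while holding a hash bucket lock; on a fault they return -EFAULT
 * instead of sleeping. A sketch of the usual retry dance in callers
 * (illustrative only; see e.g. futex_wait_setup() for the real thing):
 *
 *	ret = futex_get_value_locked(&uval, uaddr);
 *	if (ret) {
 *		futex_q_unlock(hb);		// drop the bucket lock
 *		ret = get_user(uval, uaddr);	// fault the page in, may sleep
 *		if (ret)
 *			return ret;
 *		goto retry;			// try again with the lock held
 *	}
 */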
/**
 * wait_for_owner_exiting - Block until the owner has exited
 * @ret:	owner's current futex lock status
 * @exiting:	Pointer to the exiting task
 *
 * Caller must hold a refcount on @exiting.
 */
void wait_for_owner_exiting(int ret, struct task_struct *exiting)
{
	if (ret != -EBUSY) {
		WARN_ON_ONCE(exiting);
		return;
	}

	if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
		return;

	mutex_lock(&exiting->futex_exit_mutex);
	/*
	 * No point in doing state checking here. If the waiter got here
	 * while the task was in exec()->exec_futex_release() then it can
	 * have any FUTEX_STATE_* value when the waiter has acquired the
	 * mutex. OK, if running, EXITING or DEAD if it reached exit()
	 * already. Highly unlikely and not a problem. Just one more round
	 * through the futex maze.
	 */
	mutex_unlock(&exiting->futex_exit_mutex);

	put_task_struct(exiting);
}
/**
 * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be NULL and must be held by the caller.
 */
void __futex_unqueue(struct futex_q *q)
{
	struct futex_hash_bucket *hb;

	if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list)))
		return;
	lockdep_assert_held(q->lock_ptr);

	hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
	plist_del(&q->list, &hb->chain);
	futex_hb_waiters_dec(hb);
}

/* The key must be already stored in q->key. */
struct futex_hash_bucket *futex_q_lock(struct futex_q *q)
	__acquires(&hb->lock)
{
	struct futex_hash_bucket *hb;

	hb = futex_hash(&q->key);

	/*
	 * Increment the counter before taking the lock so that
	 * a potential waker won't miss a to-be-slept task that is
	 * waiting for the spinlock. This is safe as all futex_q_lock()
	 * users end up calling futex_queue(). Similarly, for housekeeping,
	 * decrement the counter at futex_q_unlock() when some error has
	 * occurred and we don't end up adding the task to the list.
	 */
	futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */

	q->lock_ptr = &hb->lock;

	spin_lock(&hb->lock);
	return hb;
}

void futex_q_unlock(struct futex_hash_bucket *hb)
	__releases(&hb->lock)
{
	spin_unlock(&hb->lock);
	futex_hb_waiters_dec(hb);
}

void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb)
{
	int prio;
	bool already_on_hb = false;

	/*
	 * The priority used to register this element is
	 * - either the real thread-priority for the real-time threads
	 * (i.e. threads with a priority lower than MAX_RT_PRIO)
	 * - or MAX_RT_PRIO for non-RT threads.
	 * Thus, all RT-threads are woken first in priority order, and
	 * the others are woken last, in FIFO order.
	 */
	prio = min(current->normal_prio, MAX_RT_PRIO);

	plist_node_init(&q->list, prio);
	trace_android_vh_alter_futex_plist_add(&q->list, &hb->chain, &already_on_hb);
	if (!already_on_hb)
		plist_add(&q->list, &hb->chain);
	q->task = current;
}
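/*
 * How the pieces above fit together on the wait side, as a simplified
 * sketch only (the real sequence, including value re-checks, timeouts and
 * signal handling, lives in futex_wait() and its helpers):
 *
 *	hb = futex_q_lock(&q);			// count waiter, lock bucket
 *	ret = futex_get_value_locked(&uval, uaddr);
 *	if (ret || uval != expected)
 *		futex_q_unlock(hb);		// bail out, nothing queued
 *	else
 *		futex_queue(&q, hb);		// enqueue and drop the lock
 *	...
 *	schedule();				// until woken or timed out
 *	futex_unqueue(&q);			// if nobody unqueued us first
 */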
/**
 * futex_unqueue() - Remove the futex_q from its futex_hash_bucket
 * @q:	The futex_q to unqueue
 *
 * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must
 * be paired with exactly one earlier call to futex_queue().
 *
 * Return:
 *  - 1 - if the futex_q was still queued (and we removed it);
 *  - 0 - if the futex_q was already removed by the waking thread
 */
int futex_unqueue(struct futex_q *q)
{
	spinlock_t *lock_ptr;
	int ret = 0;

	/* In the common case we don't take the spinlock, which is nice. */
retry:
	/*
	 * q->lock_ptr can change between this read and the following spin_lock.
	 * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
	 * optimizing lock_ptr out of the logic below.
	 */
	lock_ptr = READ_ONCE(q->lock_ptr);
	if (lock_ptr != NULL) {
		spin_lock(lock_ptr);
		/*
		 * q->lock_ptr can change between reading it and
		 * spin_lock(), causing us to take the wrong lock. This
		 * corrects the race condition.
		 *
		 * Reasoning goes like this: if we have the wrong lock,
		 * q->lock_ptr must have changed (maybe several times)
		 * between reading it and the spin_lock(). It can
		 * change again after the spin_lock() but only if it was
		 * already changed before the spin_lock(). It cannot,
		 * however, change back to the original value. Therefore
		 * we can detect whether we acquired the correct lock.
		 */
		if (unlikely(lock_ptr != q->lock_ptr)) {
			spin_unlock(lock_ptr);
			goto retry;
		}
		__futex_unqueue(q);

		BUG_ON(q->pi_state);

		spin_unlock(lock_ptr);
		ret = 1;
	}

	return ret;
}

/*
 * PI futexes can not be requeued and must remove themselves from the
 * hash bucket. The hash bucket lock (i.e. lock_ptr) is held.
 */
void futex_unqueue_pi(struct futex_q *q)
{
	__futex_unqueue(q);

	BUG_ON(!q->pi_state);
	put_pi_state(q->pi_state);
	q->pi_state = NULL;
}
/* Constants for the pending_op argument of handle_futex_death */
#define HANDLE_DEATH_PENDING	true
#define HANDLE_DEATH_LIST	false

/*
 * Process a futex-list entry, check whether it's owned by the
 * dying task, and do notification if so:
 */
static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr,
			      bool pi, bool pending_op)
{
	u32 uval, nval, mval;
	pid_t owner;
	int err;

	/* Futex address must be 32bit aligned */
	if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
		return -1;

retry:
	if (get_user(uval, uaddr))
		return -1;

	/*
	 * Special case for regular (non PI) futexes. The unlock path in
	 * user space has two race scenarios:
	 *
	 * 1. The unlock path releases the user space futex value and
	 *    before it can execute the futex() syscall to wake up
	 *    waiters it is killed.
	 *
	 * 2. A woken up waiter is killed before it can acquire the
	 *    futex in user space.
	 *
	 * In the second case, the wake up notification could be generated
	 * by the unlock path in user space after setting the futex value
	 * to zero or by the kernel after setting the OWNER_DIED bit below.
	 *
	 * In both cases the TID validation below prevents a wakeup of
	 * potential waiters which can cause these waiters to block
	 * forever.
	 *
	 * In both cases the following conditions are met:
	 *
	 *	1) task->robust_list->list_op_pending != NULL
	 *	   @pending_op == true
	 *	2) The owner part of user space futex value == 0
	 *	3) Regular futex: @pi == false
	 *
	 * If these conditions are met, it is safe to attempt waking up a
	 * potential waiter without touching the user space futex value and
	 * trying to set the OWNER_DIED bit. If the futex value is zero,
	 * the rest of the user space mutex state is consistent, so a woken
	 * waiter will just take over the uncontended futex. Setting the
	 * OWNER_DIED bit would create inconsistent state and malfunction
	 * of the user space owner died handling. Otherwise, the OWNER_DIED
	 * bit is already set, and the woken waiter is expected to deal with
	 * this.
	 */
	owner = uval & FUTEX_TID_MASK;

	if (pending_op && !pi && !owner) {
		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
		return 0;
	}

	if (owner != task_pid_vnr(curr))
		return 0;

	/*
	 * Ok, this dying thread is truly holding a futex
	 * of interest. Set the OWNER_DIED bit atomically
	 * via cmpxchg, and if the value had FUTEX_WAITERS
	 * set, wake up a waiter (if any). (We have to do a
	 * futex_wake() even if OWNER_DIED is already set -
	 * to handle the rare but possible case of recursive
	 * thread-death.) The rest of the cleanup is done in
	 * userspace.
	 */
	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;

	/*
	 * We are not holding a lock here, but we want to have
	 * the pagefault_disable/enable() protection because
	 * we want to handle the fault gracefully. If the
	 * access fails we try to fault in the futex with R/W
	 * verification via get_user_pages. get_user() above
	 * does not guarantee R/W access. If that fails we
	 * give up and leave the futex locked.
	 */
	if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) {
		switch (err) {
		case -EFAULT:
			if (fault_in_user_writeable(uaddr))
				return -1;
			goto retry;

		case -EAGAIN:
			cond_resched();
			goto retry;

		default:
			WARN_ON_ONCE(1);
			return err;
		}
	}

	if (nval != uval)
		goto retry;

	/*
	 * Wake robust non-PI futexes here. The wakeup of
	 * PI futexes happens in exit_pi_state():
	 */
	if (!pi && (uval & FUTEX_WAITERS))
		futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);

	return 0;
}
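/*
 * Worked example of the value manipulation above (bit values taken from
 * the uapi <linux/futex.h>; the TID is made up for illustration):
 *
 *	uval = 0x80000457	TID 0x457 owns the lock, FUTEX_WAITERS set
 *	mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED
 *	     = 0x80000000 | 0x40000000
 *	     = 0xc0000000	owner field cleared, WAITERS + OWNER_DIED set
 *
 * A waiter woken by the futex_wake() above sees OWNER_DIED, takes the
 * lock over and (in the pthread robust mutex case) has EOWNERDEAD
 * reported to it by user space.
 */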
/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int fetch_robust_entry(struct robust_list __user **entry,
				     struct robust_list __user * __user *head,
				     unsigned int *pi)
{
	unsigned long uentry;

	if (get_user(uentry, (unsigned long __user *)head))
		return -EFAULT;

	*entry = (void __user *)(uentry & ~1UL);
	*pi = uentry & 1;

	return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void exit_robust_list(struct task_struct *curr)
{
	struct robust_list_head __user *head = curr->robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	unsigned long futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (fetch_robust_entry(&entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			if (handle_futex_death((void __user *)entry + futex_offset,
					       curr, pi, HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}

	if (pending) {
		handle_futex_death((void __user *)pending + futex_offset,
				   curr, pip, HANDLE_DEATH_PENDING);
	}
}
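/*
 * For reference, the user space layout this walker relies on, as declared
 * in the uapi <linux/futex.h> (the embedding inside a pthread mutex is a
 * C library detail and only sketched here):
 *
 *	struct robust_list      { struct robust_list *next; };
 *	struct robust_list_head {
 *		struct robust_list list;
 *		long futex_offset;
 *		struct robust_list *list_op_pending;
 *	};
 *
 * Each list entry lives inside a user space mutex object, and the futex
 * word of that mutex sits at a fixed distance from it, which is why
 *
 *	uaddr = (void __user *)entry + futex_offset;
 *
 * recovers the futex for every entry using a single offset fetched once
 * from the list head.
 */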
#ifdef CONFIG_COMPAT
static void __user *futex_uaddr(struct robust_list __user *entry,
				compat_long_t futex_offset)
{
	compat_uptr_t base = ptr_to_compat(entry);
	void __user *uaddr = compat_ptr(base + futex_offset);

	return uaddr;
}

/*
 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
 */
static inline int
compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
			  compat_uptr_t __user *head, unsigned int *pi)
{
	if (get_user(*uentry, head))
		return -EFAULT;

	*entry = compat_ptr((*uentry) & ~1);
	*pi = (unsigned int)(*uentry) & 1;

	return 0;
}

/*
 * Walk curr->robust_list (very carefully, it's a userspace list!)
 * and mark any locks found there dead, and notify any waiters.
 *
 * We silently return on any sign of list-walking problem.
 */
static void compat_exit_robust_list(struct task_struct *curr)
{
	struct compat_robust_list_head __user *head = curr->compat_robust_list;
	struct robust_list __user *entry, *next_entry, *pending;
	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
	unsigned int next_pi;
	compat_uptr_t uentry, next_uentry, upending;
	compat_long_t futex_offset;
	int rc;

	/*
	 * Fetch the list head (which was registered earlier, via
	 * sys_set_robust_list()):
	 */
	if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
		return;
	/*
	 * Fetch the relative futex offset:
	 */
	if (get_user(futex_offset, &head->futex_offset))
		return;
	/*
	 * Fetch any possibly pending lock-add first, and handle it
	 * if it exists:
	 */
	if (compat_fetch_robust_entry(&upending, &pending,
				      &head->list_op_pending, &pip))
		return;

	next_entry = NULL;	/* avoid warning with gcc */
	while (entry != (struct robust_list __user *) &head->list) {
		/*
		 * Fetch the next entry in the list before calling
		 * handle_futex_death:
		 */
		rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
			(compat_uptr_t __user *)&entry->next, &next_pi);
		/*
		 * A pending lock might already be on the list, so
		 * don't process it twice:
		 */
		if (entry != pending) {
			void __user *uaddr = futex_uaddr(entry, futex_offset);

			if (handle_futex_death(uaddr, curr, pi,
					       HANDLE_DEATH_LIST))
				return;
		}
		if (rc)
			return;
		uentry = next_uentry;
		entry = next_entry;
		pi = next_pi;
		/*
		 * Avoid excessively long or circular lists:
		 */
		if (!--limit)
			break;

		cond_resched();
	}
	if (pending) {
		void __user *uaddr = futex_uaddr(pending, futex_offset);

		handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING);
	}
}
#endif
#ifdef CONFIG_FUTEX_PI

/*
 * This task is holding PI mutexes at exit time => bad.
 * Kernel cleans up PI-state, but userspace is likely hosed.
 * (Robust-futex cleanup is separate and might save the day for userspace.)
 */
static void exit_pi_state_list(struct task_struct *curr)
{
	struct list_head *next, *head = &curr->pi_state_list;
	struct futex_pi_state *pi_state;
	struct futex_hash_bucket *hb;
	union futex_key key = FUTEX_KEY_INIT;

	/*
	 * We are a ZOMBIE and nobody can enqueue itself on
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		hb = futex_hash(&key);

		/*
		 * We can race against put_pi_state() removing itself from the
		 * list (a waiter going away). put_pi_state() will first
		 * decrement the reference count and then modify the list, so
		 * it's possible to see the list entry but fail this reference
		 * acquire.
		 *
		 * In that case, drop the locks to let put_pi_state() make
		 * progress and retry the loop.
		 */
		if (!refcount_inc_not_zero(&pi_state->refcount)) {
			raw_spin_unlock_irq(&curr->pi_lock);
			cpu_relax();
			raw_spin_lock_irq(&curr->pi_lock);
			continue;
		}
		raw_spin_unlock_irq(&curr->pi_lock);

		spin_lock(&hb->lock);
		raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
		raw_spin_lock(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
		 */
		if (head->next != next) {
			/* retain curr->pi_lock for the loop invariant */
			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
			spin_unlock(&hb->lock);
			put_pi_state(pi_state);
			continue;
		}

		WARN_ON(pi_state->owner != curr);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;

		raw_spin_unlock(&curr->pi_lock);
		raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
		spin_unlock(&hb->lock);

		rt_mutex_futex_unlock(&pi_state->pi_mutex);
		put_pi_state(pi_state);

		raw_spin_lock_irq(&curr->pi_lock);
	}
	raw_spin_unlock_irq(&curr->pi_lock);
}
#else
static inline void exit_pi_state_list(struct task_struct *curr) { }
#endif
static void futex_cleanup(struct task_struct *tsk)
{
	if (unlikely(tsk->robust_list)) {
		exit_robust_list(tsk);
		tsk->robust_list = NULL;
	}

#ifdef CONFIG_COMPAT
	if (unlikely(tsk->compat_robust_list)) {
		compat_exit_robust_list(tsk);
		tsk->compat_robust_list = NULL;
	}
#endif

	if (unlikely(!list_empty(&tsk->pi_state_list)))
		exit_pi_state_list(tsk);
}

/**
 * futex_exit_recursive - Set the task's futex state to FUTEX_STATE_DEAD
 * @tsk:	task to set the state on
 *
 * Set the futex exit state of the task locklessly. The futex waiter code
 * observes that state when a task is exiting and loops until the task has
 * actually finished the futex cleanup. The worst case for this is that the
 * waiter runs through the wait loop until the state becomes visible.
 *
 * This is called from the recursive fault handling path in make_task_dead().
 *
 * This is best effort. Either the futex exit code has run already or
 * not. If the OWNER_DIED bit has been set on the futex then the waiter can
 * take it over. If not, the problem is pushed back to user space. If the
 * futex exit code did not run yet, then an already queued waiter might
 * block forever, but there is nothing which can be done about that.
 */
void futex_exit_recursive(struct task_struct *tsk)
{
	/* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
	if (tsk->futex_state == FUTEX_STATE_EXITING)
		mutex_unlock(&tsk->futex_exit_mutex);
	tsk->futex_state = FUTEX_STATE_DEAD;
}
static void futex_cleanup_begin(struct task_struct *tsk)
{
	/*
	 * Prevent various race issues against a concurrent incoming waiter
	 * including live locks by forcing the waiter to block on
	 * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
	 * attach_to_pi_owner().
	 */
	mutex_lock(&tsk->futex_exit_mutex);

	/*
	 * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
	 *
	 * This ensures that all subsequent checks of tsk->futex_state in
	 * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
	 * tsk->pi_lock held.
	 *
	 * It guarantees also that a pi_state which was queued right before
	 * the state change under tsk->pi_lock by a concurrent waiter must
	 * be observed in exit_pi_state_list().
	 */
	raw_spin_lock_irq(&tsk->pi_lock);
	tsk->futex_state = FUTEX_STATE_EXITING;
	raw_spin_unlock_irq(&tsk->pi_lock);
}

static void futex_cleanup_end(struct task_struct *tsk, int state)
{
	/*
	 * Lockless store. The only side effect is that an observer might
	 * take another loop until it becomes visible.
	 */
	tsk->futex_state = state;
	/*
	 * Drop the exit protection. This unblocks waiters which observed
	 * FUTEX_STATE_EXITING to reevaluate the state.
	 */
	mutex_unlock(&tsk->futex_exit_mutex);
}
void futex_exec_release(struct task_struct *tsk)
{
	/*
	 * The state handling is done for consistency, but in the case of
	 * exec() there is no way to prevent further damage as the PID stays
	 * the same. But for the unlikely and arguably buggy case that a
	 * futex is held on exec(), this provides at least as much state
	 * consistency protection as is possible.
	 */
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	/*
	 * Reset the state to FUTEX_STATE_OK. The task is alive and about to
	 * exec a new binary.
	 */
	futex_cleanup_end(tsk, FUTEX_STATE_OK);
}

void futex_exit_release(struct task_struct *tsk)
{
	futex_cleanup_begin(tsk);
	futex_cleanup(tsk);
	futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
}
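/*
 * How the exit state machine above meets the waiter side, as a rough
 * sketch only (the waiter half lives in the PI attach path, see
 * attach_to_pi_owner() and wait_for_owner_exiting() above):
 *
 *	exiting task				waiter
 *	------------				------
 *	futex_cleanup_begin()
 *	  mutex_lock(futex_exit_mutex)
 *	  futex_state = EXITING			sees EXITING -> gets -EBUSY
 *	futex_cleanup()				wait_for_owner_exiting():
 *	  exit_robust_list()			  blocks on futex_exit_mutex
 *	  exit_pi_state_list()
 *	futex_cleanup_end(DEAD or OK)
 *	  mutex_unlock(futex_exit_mutex)	wakes up, retries the futex op
 */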
static int __init futex_init(void)
{
	unsigned int futex_shift;
	unsigned long i;

#if CONFIG_BASE_SMALL
	futex_hashsize = 16;
#else
	futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus());
#endif

	futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues),
					       futex_hashsize, 0,
					       futex_hashsize < 256 ? HASH_SMALL : 0,
					       &futex_shift, NULL,
					       futex_hashsize, futex_hashsize);
	futex_hashsize = 1UL << futex_shift;

	for (i = 0; i < futex_hashsize; i++) {
		atomic_set(&futex_queues[i].waiters, 0);
		plist_head_init(&futex_queues[i].chain);
		spin_lock_init(&futex_queues[i].lock);
	}

	return 0;
}
core_initcall(futex_init);