123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203120412051206120712081209121012111212121312141215121612171218121912201221122212231224122512261227122812291230123112321233 |
- // SPDX-License-Identifier: GPL-2.0-or-later
- #include <linux/slab.h>
- #include <linux/sched/task.h>
- #include "futex.h"
- #include "../locking/rtmutex_common.h"
- /*
- * PI code:
- */
- int refill_pi_state_cache(void)
- {
- struct futex_pi_state *pi_state;
- if (likely(current->pi_state_cache))
- return 0;
- pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
- if (!pi_state)
- return -ENOMEM;
- INIT_LIST_HEAD(&pi_state->list);
- /* pi_mutex gets initialized later */
- pi_state->owner = NULL;
- refcount_set(&pi_state->refcount, 1);
- pi_state->key = FUTEX_KEY_INIT;
- current->pi_state_cache = pi_state;
- return 0;
- }
- static struct futex_pi_state *alloc_pi_state(void)
- {
- struct futex_pi_state *pi_state = current->pi_state_cache;
- WARN_ON(!pi_state);
- current->pi_state_cache = NULL;
- return pi_state;
- }
- static void pi_state_update_owner(struct futex_pi_state *pi_state,
- struct task_struct *new_owner)
- {
- struct task_struct *old_owner = pi_state->owner;
- lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
- if (old_owner) {
- raw_spin_lock(&old_owner->pi_lock);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- raw_spin_unlock(&old_owner->pi_lock);
- }
- if (new_owner) {
- raw_spin_lock(&new_owner->pi_lock);
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &new_owner->pi_state_list);
- pi_state->owner = new_owner;
- raw_spin_unlock(&new_owner->pi_lock);
- }
- }
- void get_pi_state(struct futex_pi_state *pi_state)
- {
- WARN_ON_ONCE(!refcount_inc_not_zero(&pi_state->refcount));
- }
- /*
- * Drops a reference to the pi_state object and frees or caches it
- * when the last reference is gone.
- */
- void put_pi_state(struct futex_pi_state *pi_state)
- {
- if (!pi_state)
- return;
- if (!refcount_dec_and_test(&pi_state->refcount))
- return;
- /*
- * If pi_state->owner is NULL, the owner is most probably dying
- * and has cleaned up the pi_state already
- */
- if (pi_state->owner) {
- unsigned long flags;
- raw_spin_lock_irqsave(&pi_state->pi_mutex.wait_lock, flags);
- pi_state_update_owner(pi_state, NULL);
- rt_mutex_proxy_unlock(&pi_state->pi_mutex);
- raw_spin_unlock_irqrestore(&pi_state->pi_mutex.wait_lock, flags);
- }
- if (current->pi_state_cache) {
- kfree(pi_state);
- } else {
- /*
- * pi_state->list is already empty.
- * clear pi_state->owner.
- * refcount is at 0 - put it back to 1.
- */
- pi_state->owner = NULL;
- refcount_set(&pi_state->refcount, 1);
- current->pi_state_cache = pi_state;
- }
- }
- /*
- * We need to check the following states:
- *
- * Waiter | pi_state | pi->owner | uTID | uODIED | ?
- *
- * [1] NULL | --- | --- | 0 | 0/1 | Valid
- * [2] NULL | --- | --- | >0 | 0/1 | Valid
- *
- * [3] Found | NULL | -- | Any | 0/1 | Invalid
- *
- * [4] Found | Found | NULL | 0 | 1 | Valid
- * [5] Found | Found | NULL | >0 | 1 | Invalid
- *
- * [6] Found | Found | task | 0 | 1 | Valid
- *
- * [7] Found | Found | NULL | Any | 0 | Invalid
- *
- * [8] Found | Found | task | ==taskTID | 0/1 | Valid
- * [9] Found | Found | task | 0 | 0 | Invalid
- * [10] Found | Found | task | !=taskTID | 0/1 | Invalid
- *
- * [1] Indicates that the kernel can acquire the futex atomically. We
- * came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
- *
- * [2] Valid, if TID does not belong to a kernel thread. If no matching
- * thread is found then it indicates that the owner TID has died.
- *
- * [3] Invalid. The waiter is queued on a non PI futex
- *
- * [4] Valid state after exit_robust_list(), which sets the user space
- * value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
- *
- * [5] The user space value got manipulated between exit_robust_list()
- * and exit_pi_state_list()
- *
- * [6] Valid state after exit_pi_state_list() which sets the new owner in
- * the pi_state but cannot access the user space value.
- *
- * [7] pi_state->owner can only be NULL when the OWNER_DIED bit is set.
- *
- * [8] Owner and user space value match
- *
- * [9] There is no transient state which sets the user space TID to 0
- * except exit_robust_list(), but this is indicated by the
- * FUTEX_OWNER_DIED bit. See [4]
- *
- * [10] There is no transient state which leaves owner and user space
- * TID out of sync. Except one error case where the kernel is denied
- * write access to the user address, see fixup_pi_state_owner().
- *
- *
- * Serialization and lifetime rules:
- *
- * hb->lock:
- *
- * hb -> futex_q, relation
- * futex_q -> pi_state, relation
- *
- * (cannot be raw because hb can contain arbitrary amount
- * of futex_q's)
- *
- * pi_mutex->wait_lock:
- *
- * {uval, pi_state}
- *
- * (and pi_mutex 'obviously')
- *
- * p->pi_lock:
- *
- * p->pi_state_list -> pi_state->list, relation
- * pi_mutex->owner -> pi_state->owner, relation
- *
- * pi_state->refcount:
- *
- * pi_state lifetime
- *
- *
- * Lock order:
- *
- * hb->lock
- * pi_mutex->wait_lock
- * p->pi_lock
- *
- */
- /*
- * Validate that the existing waiter has a pi_state and sanity check
- * the pi_state against the user space value. If correct, attach to
- * it.
- */
- static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
- struct futex_pi_state *pi_state,
- struct futex_pi_state **ps)
- {
- pid_t pid = uval & FUTEX_TID_MASK;
- u32 uval2;
- int ret;
- /*
- * Userspace might have messed up non-PI and PI futexes [3]
- */
- if (unlikely(!pi_state))
- return -EINVAL;
- /*
- * We get here with hb->lock held, and having found a
- * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
- * has dropped the hb->lock in between futex_queue() and futex_unqueue_pi(),
- * which in turn means that futex_lock_pi() still has a reference on
- * our pi_state.
- *
- * The waiter holding a reference on @pi_state also protects against
- * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
- * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
- * free pi_state before we can take a reference ourselves.
- */
- WARN_ON(!refcount_read(&pi_state->refcount));
- /*
- * Now that we have a pi_state, we can acquire wait_lock
- * and do the state validation.
- */
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- /*
- * Since {uval, pi_state} is serialized by wait_lock, and our current
- * uval was read without holding it, it can have changed. Verify it
- * still is what we expect it to be, otherwise retry the entire
- * operation.
- */
- if (futex_get_value_locked(&uval2, uaddr))
- goto out_efault;
- if (uval != uval2)
- goto out_eagain;
- /*
- * Handle the owner died case:
- */
- if (uval & FUTEX_OWNER_DIED) {
- /*
- * exit_pi_state_list sets owner to NULL and wakes the
- * topmost waiter. The task which acquires the
- * pi_state->rt_mutex will fixup owner.
- */
- if (!pi_state->owner) {
- /*
- * No pi state owner, but the user space TID
- * is not 0. Inconsistent state. [5]
- */
- if (pid)
- goto out_einval;
- /*
- * Take a ref on the state and return success. [4]
- */
- goto out_attach;
- }
- /*
- * If TID is 0, then either the dying owner has not
- * yet executed exit_pi_state_list() or some waiter
- * acquired the rtmutex in the pi state, but did not
- * yet fixup the TID in user space.
- *
- * Take a ref on the state and return success. [6]
- */
- if (!pid)
- goto out_attach;
- } else {
- /*
- * If the owner died bit is not set, then the pi_state
- * must have an owner. [7]
- */
- if (!pi_state->owner)
- goto out_einval;
- }
- /*
- * Bail out if user space manipulated the futex value. If pi
- * state exists then the owner TID must be the same as the
- * user space TID. [9/10]
- */
- if (pid != task_pid_vnr(pi_state->owner))
- goto out_einval;
- out_attach:
- get_pi_state(pi_state);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- *ps = pi_state;
- return 0;
- out_einval:
- ret = -EINVAL;
- goto out_error;
- out_eagain:
- ret = -EAGAIN;
- goto out_error;
- out_efault:
- ret = -EFAULT;
- goto out_error;
- out_error:
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
- static int handle_exit_race(u32 __user *uaddr, u32 uval,
- struct task_struct *tsk)
- {
- u32 uval2;
- /*
- * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
- * caller that the alleged owner is busy.
- */
- if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
- return -EBUSY;
- /*
- * Reread the user space value to handle the following situation:
- *
- * CPU0 CPU1
- *
- * sys_exit() sys_futex()
- * do_exit() futex_lock_pi()
- * futex_lock_pi_atomic()
- * exit_signals(tsk) No waiters:
- * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
- * mm_release(tsk) Set waiter bit
- * exit_robust_list(tsk) { *uaddr = 0x80000PID;
- * Set owner died attach_to_pi_owner() {
- * *uaddr = 0xC0000000; tsk = get_task(PID);
- * } if (!tsk->flags & PF_EXITING) {
- * ... attach();
- * tsk->futex_state = } else {
- * FUTEX_STATE_DEAD; if (tsk->futex_state !=
- * FUTEX_STATE_DEAD)
- * return -EAGAIN;
- * return -ESRCH; <--- FAIL
- * }
- *
- * Returning ESRCH unconditionally is wrong here because the
- * user space value has been changed by the exiting task.
- *
- * The same logic applies to the case where the exiting task is
- * already gone.
- */
- if (futex_get_value_locked(&uval2, uaddr))
- return -EFAULT;
- /* If the user space value has changed, try again. */
- if (uval2 != uval)
- return -EAGAIN;
- /*
- * The exiting task did not have a robust list, the robust list was
- * corrupted or the user space value in *uaddr is simply bogus.
- * Give up and tell user space.
- */
- return -ESRCH;
- }
- static void __attach_to_pi_owner(struct task_struct *p, union futex_key *key,
- struct futex_pi_state **ps)
- {
- /*
- * No existing pi state. First waiter. [2]
- *
- * This creates pi_state, we have hb->lock held, this means nothing can
- * observe this state, wait_lock is irrelevant.
- */
- struct futex_pi_state *pi_state = alloc_pi_state();
- /*
- * Initialize the pi_mutex in locked state and make @p
- * the owner of it:
- */
- rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
- /* Store the key for possible exit cleanups: */
- pi_state->key = *key;
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &p->pi_state_list);
- /*
- * Assignment without holding pi_state->pi_mutex.wait_lock is safe
- * because there is no concurrency as the object is not published yet.
- */
- pi_state->owner = p;
- *ps = pi_state;
- }
- /*
- * Lookup the task for the TID provided from user space and attach to
- * it after doing proper sanity checks.
- */
- static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
- struct futex_pi_state **ps,
- struct task_struct **exiting)
- {
- pid_t pid = uval & FUTEX_TID_MASK;
- struct task_struct *p;
- /*
- * We are the first waiter - try to look up the real owner and attach
- * the new pi_state to it, but bail out when TID = 0 [1]
- *
- * The !pid check is paranoid. None of the call sites should end up
- * with pid == 0, but better safe than sorry. Let the caller retry
- */
- if (!pid)
- return -EAGAIN;
- p = find_get_task_by_vpid(pid);
- if (!p)
- return handle_exit_race(uaddr, uval, NULL);
- if (unlikely(p->flags & PF_KTHREAD)) {
- put_task_struct(p);
- return -EPERM;
- }
- /*
- * We need to look at the task state to figure out, whether the
- * task is exiting. To protect against the change of the task state
- * in futex_exit_release(), we do this protected by p->pi_lock:
- */
- raw_spin_lock_irq(&p->pi_lock);
- if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
- /*
- * The task is on the way out. When the futex state is
- * FUTEX_STATE_DEAD, we know that the task has finished
- * the cleanup:
- */
- int ret = handle_exit_race(uaddr, uval, p);
- raw_spin_unlock_irq(&p->pi_lock);
- /*
- * If the owner task is between FUTEX_STATE_EXITING and
- * FUTEX_STATE_DEAD then store the task pointer and keep
- * the reference on the task struct. The calling code will
- * drop all locks, wait for the task to reach
- * FUTEX_STATE_DEAD and then drop the refcount. This is
- * required to prevent a live lock when the current task
- * preempted the exiting task between the two states.
- */
- if (ret == -EBUSY)
- *exiting = p;
- else
- put_task_struct(p);
- return ret;
- }
- __attach_to_pi_owner(p, key, ps);
- raw_spin_unlock_irq(&p->pi_lock);
- put_task_struct(p);
- return 0;
- }
- static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
- {
- int err;
- u32 curval;
- if (unlikely(should_fail_futex(true)))
- return -EFAULT;
- err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
- if (unlikely(err))
- return err;
- /* If user space value changed, let the caller retry */
- return curval != uval ? -EAGAIN : 0;
- }
- /**
- * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
- * @uaddr: the pi futex user address
- * @hb: the pi futex hash bucket
- * @key: the futex key associated with uaddr and hb
- * @ps: the pi_state pointer where we store the result of the
- * lookup
- * @task: the task to perform the atomic lock work for. This will
- * be "current" except in the case of requeue pi.
- * @exiting: Pointer to store the task pointer of the owner task
- * which is in the middle of exiting
- * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
- *
- * Return:
- * - 0 - ready to wait;
- * - 1 - acquired the lock;
- * - <0 - error
- *
- * The hb->lock must be held by the caller.
- *
- * @exiting is only set when the return value is -EBUSY. If so, this holds
- * a refcount on the exiting task on return and the caller needs to drop it
- * after waiting for the exit to complete.
- */
- int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
- union futex_key *key,
- struct futex_pi_state **ps,
- struct task_struct *task,
- struct task_struct **exiting,
- int set_waiters)
- {
- u32 uval, newval, vpid = task_pid_vnr(task);
- struct futex_q *top_waiter;
- int ret;
- /*
- * Read the user space value first so we can validate a few
- * things before proceeding further.
- */
- if (futex_get_value_locked(&uval, uaddr))
- return -EFAULT;
- if (unlikely(should_fail_futex(true)))
- return -EFAULT;
- /*
- * Detect deadlocks.
- */
- if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
- return -EDEADLK;
- if ((unlikely(should_fail_futex(true))))
- return -EDEADLK;
- /*
- * Lookup existing state first. If it exists, try to attach to
- * its pi_state.
- */
- top_waiter = futex_top_waiter(hb, key);
- if (top_waiter)
- return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
- /*
- * No waiter and user TID is 0. We are here because the
- * waiters or the owner died bit is set or called from
- * requeue_cmp_pi or for whatever reason something took the
- * syscall.
- */
- if (!(uval & FUTEX_TID_MASK)) {
- /*
- * We take over the futex. No other waiters and the user space
- * TID is 0. We preserve the owner died bit.
- */
- newval = uval & FUTEX_OWNER_DIED;
- newval |= vpid;
- /* The futex requeue_pi code can enforce the waiters bit */
- if (set_waiters)
- newval |= FUTEX_WAITERS;
- ret = lock_pi_update_atomic(uaddr, uval, newval);
- if (ret)
- return ret;
- /*
- * If the waiter bit was requested the caller also needs PI
- * state attached to the new owner of the user space futex.
- *
- * @task is guaranteed to be alive and it cannot be exiting
- * because it is either sleeping or waiting in
- * futex_requeue_pi_wakeup_sync().
- *
- * No need to do the full attach_to_pi_owner() exercise
- * because @task is known and valid.
- */
- if (set_waiters) {
- raw_spin_lock_irq(&task->pi_lock);
- __attach_to_pi_owner(task, key, ps);
- raw_spin_unlock_irq(&task->pi_lock);
- }
- return 1;
- }
- /*
- * First waiter. Set the waiters bit before attaching ourself to
- * the owner. If owner tries to unlock, it will be forced into
- * the kernel and blocked on hb->lock.
- */
- newval = uval | FUTEX_WAITERS;
- ret = lock_pi_update_atomic(uaddr, uval, newval);
- if (ret)
- return ret;
- /*
- * If the update of the user space value succeeded, we try to
- * attach to the owner. If that fails, no harm done, we only
- * set the FUTEX_WAITERS bit in the user space variable.
- */
- return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
- }
- /*
- * Caller must hold a reference on @pi_state.
- */
- static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
- {
- struct rt_mutex_waiter *top_waiter;
- struct task_struct *new_owner;
- bool postunlock = false;
- DEFINE_RT_WAKE_Q(wqh);
- u32 curval, newval;
- int ret = 0;
- top_waiter = rt_mutex_top_waiter(&pi_state->pi_mutex);
- if (WARN_ON_ONCE(!top_waiter)) {
- /*
- * As per the comment in futex_unlock_pi() this should not happen.
- *
- * When this happens, give up our locks and try again, giving
- * the futex_lock_pi() instance time to complete, either by
- * waiting on the rtmutex or removing itself from the futex
- * queue.
- */
- ret = -EAGAIN;
- goto out_unlock;
- }
- new_owner = top_waiter->task;
- /*
- * We pass it to the next owner. The WAITERS bit is always kept
- * enabled while there is PI state around. We cleanup the owner
- * died bit, because we are the owner.
- */
- newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
- if (unlikely(should_fail_futex(true))) {
- ret = -EFAULT;
- goto out_unlock;
- }
- ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
- if (!ret && (curval != uval)) {
- /*
- * If a unconditional UNLOCK_PI operation (user space did not
- * try the TID->0 transition) raced with a waiter setting the
- * FUTEX_WAITERS flag between get_user() and locking the hash
- * bucket lock, retry the operation.
- */
- if ((FUTEX_TID_MASK & curval) == uval)
- ret = -EAGAIN;
- else
- ret = -EINVAL;
- }
- if (!ret) {
- /*
- * This is a point of no return; once we modified the uval
- * there is no going back and subsequent operations must
- * not fail.
- */
- pi_state_update_owner(pi_state, new_owner);
- postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wqh);
- }
- out_unlock:
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- if (postunlock)
- rt_mutex_postunlock(&wqh);
- return ret;
- }
- static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *argowner)
- {
- struct futex_pi_state *pi_state = q->pi_state;
- struct task_struct *oldowner, *newowner;
- u32 uval, curval, newval, newtid;
- int err = 0;
- oldowner = pi_state->owner;
- /*
- * We are here because either:
- *
- * - we stole the lock and pi_state->owner needs updating to reflect
- * that (@argowner == current),
- *
- * or:
- *
- * - someone stole our lock and we need to fix things to point to the
- * new owner (@argowner == NULL).
- *
- * Either way, we have to replace the TID in the user space variable.
- * This must be atomic as we have to preserve the owner died bit here.
- *
- * Note: We write the user space value _before_ changing the pi_state
- * because we can fault here. Imagine swapped out pages or a fork
- * that marked all the anonymous memory readonly for cow.
- *
- * Modifying pi_state _before_ the user space value would leave the
- * pi_state in an inconsistent state when we fault here, because we
- * need to drop the locks to handle the fault. This might be observed
- * in the PID checks when attaching to PI state .
- */
- retry:
- if (!argowner) {
- if (oldowner != current) {
- /*
- * We raced against a concurrent self; things are
- * already fixed up. Nothing to do.
- */
- return 0;
- }
- if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
- /* We got the lock. pi_state is correct. Tell caller. */
- return 1;
- }
- /*
- * The trylock just failed, so either there is an owner or
- * there is a higher priority waiter than this one.
- */
- newowner = rt_mutex_owner(&pi_state->pi_mutex);
- /*
- * If the higher priority waiter has not yet taken over the
- * rtmutex then newowner is NULL. We can't return here with
- * that state because it's inconsistent vs. the user space
- * state. So drop the locks and try again. It's a valid
- * situation and not any different from the other retry
- * conditions.
- */
- if (unlikely(!newowner)) {
- err = -EAGAIN;
- goto handle_err;
- }
- } else {
- WARN_ON_ONCE(argowner != current);
- if (oldowner == current) {
- /*
- * We raced against a concurrent self; things are
- * already fixed up. Nothing to do.
- */
- return 1;
- }
- newowner = argowner;
- }
- newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
- /* Owner died? */
- if (!pi_state->owner)
- newtid |= FUTEX_OWNER_DIED;
- err = futex_get_value_locked(&uval, uaddr);
- if (err)
- goto handle_err;
- for (;;) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
- err = futex_cmpxchg_value_locked(&curval, uaddr, uval, newval);
- if (err)
- goto handle_err;
- if (curval == uval)
- break;
- uval = curval;
- }
- /*
- * We fixed up user space. Now we need to fix the pi_state
- * itself.
- */
- pi_state_update_owner(pi_state, newowner);
- return argowner == current;
- /*
- * In order to reschedule or handle a page fault, we need to drop the
- * locks here. In the case of a fault, this gives the other task
- * (either the highest priority waiter itself or the task which stole
- * the rtmutex) the chance to try the fixup of the pi_state. So once we
- * are back from handling the fault we need to check the pi_state after
- * reacquiring the locks and before trying to do another fixup. When
- * the fixup has been done already we simply return.
- *
- * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
- * drop hb->lock since the caller owns the hb -> futex_q relation.
- * Dropping the pi_mutex->wait_lock requires the state revalidate.
- */
- handle_err:
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(q->lock_ptr);
- switch (err) {
- case -EFAULT:
- err = fault_in_user_writeable(uaddr);
- break;
- case -EAGAIN:
- cond_resched();
- err = 0;
- break;
- default:
- WARN_ON_ONCE(1);
- break;
- }
- spin_lock(q->lock_ptr);
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- /*
- * Check if someone else fixed it for us:
- */
- if (pi_state->owner != oldowner)
- return argowner == current;
- /* Retry if err was -EAGAIN or the fault in succeeded */
- if (!err)
- goto retry;
- /*
- * fault_in_user_writeable() failed so user state is immutable. At
- * best we can make the kernel state consistent but user state will
- * be most likely hosed and any subsequent unlock operation will be
- * rejected due to PI futex rule [10].
- *
- * Ensure that the rtmutex owner is also the pi_state owner despite
- * the user space value claiming something different. There is no
- * point in unlocking the rtmutex if current is the owner as it
- * would need to wait until the next waiter has taken the rtmutex
- * to guarantee consistent state. Keep it simple. Userspace asked
- * for this wreckaged state.
- *
- * The rtmutex has an owner - either current or some other
- * task. See the EAGAIN loop above.
- */
- pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
- return err;
- }
- static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *argowner)
- {
- struct futex_pi_state *pi_state = q->pi_state;
- int ret;
- lockdep_assert_held(q->lock_ptr);
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- ret = __fixup_pi_state_owner(uaddr, q, argowner);
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
- /**
- * fixup_pi_owner() - Post lock pi_state and corner case management
- * @uaddr: user address of the futex
- * @q: futex_q (contains pi_state and access to the rt_mutex)
- * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0)
- *
- * After attempting to lock an rt_mutex, this function is called to cleanup
- * the pi_state owner as well as handle race conditions that may allow us to
- * acquire the lock. Must be called with the hb lock held.
- *
- * Return:
- * - 1 - success, lock taken;
- * - 0 - success, lock not taken;
- * - <0 - on error (-EFAULT)
- */
- int fixup_pi_owner(u32 __user *uaddr, struct futex_q *q, int locked)
- {
- if (locked) {
- /*
- * Got the lock. We might not be the anticipated owner if we
- * did a lock-steal - fix up the PI-state in that case:
- *
- * Speculative pi_state->owner read (we don't hold wait_lock);
- * since we own the lock pi_state->owner == current is the
- * stable state, anything else needs more attention.
- */
- if (q->pi_state->owner != current)
- return fixup_pi_state_owner(uaddr, q, current);
- return 1;
- }
- /*
- * If we didn't get the lock; check if anybody stole it from us. In
- * that case, we need to fix up the uval to point to them instead of
- * us, otherwise bad things happen. [10]
- *
- * Another speculative read; pi_state->owner == current is unstable
- * but needs our attention.
- */
- if (q->pi_state->owner == current)
- return fixup_pi_state_owner(uaddr, q, NULL);
- /*
- * Paranoia check. If we did not take the lock, then we should not be
- * the owner of the rt_mutex. Warn and establish consistent state.
- */
- if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
- return fixup_pi_state_owner(uaddr, q, current);
- return 0;
- }
- /*
- * Userspace tried a 0 -> TID atomic transition of the futex value
- * and failed. The kernel side here does the whole locking operation:
- * if there are waiters then it will block as a consequence of relying
- * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
- * a 0 value of the futex too.).
- *
- * Also serves as futex trylock_pi()'ing, and due semantics.
- */
- int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock)
- {
- struct hrtimer_sleeper timeout, *to;
- struct task_struct *exiting = NULL;
- struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
- struct futex_q q = futex_q_init;
- int res, ret;
- if (!IS_ENABLED(CONFIG_FUTEX_PI))
- return -ENOSYS;
- if (refill_pi_state_cache())
- return -ENOMEM;
- to = futex_setup_timer(time, &timeout, flags, 0);
- retry:
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, FUTEX_WRITE);
- if (unlikely(ret != 0))
- goto out;
- retry_private:
- hb = futex_q_lock(&q);
- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
- &exiting, 0);
- if (unlikely(ret)) {
- /*
- * Atomic work succeeded and we got the lock,
- * or failed. Either way, we do _not_ block.
- */
- switch (ret) {
- case 1:
- /* We got the lock. */
- ret = 0;
- goto out_unlock_put_key;
- case -EFAULT:
- goto uaddr_faulted;
- case -EBUSY:
- case -EAGAIN:
- /*
- * Two reasons for this:
- * - EBUSY: Task is exiting and we just wait for the
- * exit to complete.
- * - EAGAIN: The user space value changed.
- */
- futex_q_unlock(hb);
- /*
- * Handle the case where the owner is in the middle of
- * exiting. Wait for the exit to complete otherwise
- * this task might loop forever, aka. live lock.
- */
- wait_for_owner_exiting(ret, exiting);
- cond_resched();
- goto retry;
- default:
- goto out_unlock_put_key;
- }
- }
- WARN_ON(!q.pi_state);
- /*
- * Only actually queue now that the atomic ops are done:
- */
- __futex_queue(&q, hb);
- if (trylock) {
- ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
- /* Fixup the trylock return value: */
- ret = ret ? 0 : -EWOULDBLOCK;
- goto no_block;
- }
- rt_mutex_init_waiter(&rt_waiter);
- /*
- * On PREEMPT_RT, when hb->lock becomes an rt_mutex, we must not
- * hold it while doing rt_mutex_start_proxy(), because then it will
- * include hb->lock in the blocking chain, even through we'll not in
- * fact hold it while blocking. This will lead it to report -EDEADLK
- * and BUG when futex_unlock_pi() interleaves with this.
- *
- * Therefore acquire wait_lock while holding hb->lock, but drop the
- * latter before calling __rt_mutex_start_proxy_lock(). This
- * interleaves with futex_unlock_pi() -- which does a similar lock
- * handoff -- such that the latter can observe the futex_q::pi_state
- * before __rt_mutex_start_proxy_lock() is done.
- */
- raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
- spin_unlock(q.lock_ptr);
- /*
- * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
- * such that futex_unlock_pi() is guaranteed to observe the waiter when
- * it sees the futex_q::pi_state.
- */
- ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
- raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
- if (ret) {
- if (ret == 1)
- ret = 0;
- goto cleanup;
- }
- if (unlikely(to))
- hrtimer_sleeper_start_expires(to, HRTIMER_MODE_ABS);
- ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
- cleanup:
- spin_lock(q.lock_ptr);
- /*
- * If we failed to acquire the lock (deadlock/signal/timeout), we must
- * first acquire the hb->lock before removing the lock from the
- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
- * lists consistent.
- *
- * In particular; it is important that futex_unlock_pi() can not
- * observe this inconsistency.
- */
- if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
- ret = 0;
- no_block:
- /*
- * Fixup the pi_state owner and possibly acquire the lock if we
- * haven't already.
- */
- res = fixup_pi_owner(uaddr, &q, !ret);
- /*
- * If fixup_pi_owner() returned an error, propagate that. If it acquired
- * the lock, clear our -ETIMEDOUT or -EINTR.
- */
- if (res)
- ret = (res < 0) ? res : 0;
- futex_unqueue_pi(&q);
- spin_unlock(q.lock_ptr);
- goto out;
- out_unlock_put_key:
- futex_q_unlock(hb);
- out:
- if (to) {
- hrtimer_cancel(&to->timer);
- destroy_hrtimer_on_stack(&to->timer);
- }
- return ret != -EINTR ? ret : -ERESTARTNOINTR;
- uaddr_faulted:
- futex_q_unlock(hb);
- ret = fault_in_user_writeable(uaddr);
- if (ret)
- goto out;
- if (!(flags & FLAGS_SHARED))
- goto retry_private;
- goto retry;
- }
- /*
- * Userspace attempted a TID -> 0 atomic transition, and failed.
- * This is the in-kernel slowpath: we look up the PI state (if any),
- * and do the rt-mutex unlock.
- */
- int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
- {
- u32 curval, uval, vpid = task_pid_vnr(current);
- union futex_key key = FUTEX_KEY_INIT;
- struct futex_hash_bucket *hb;
- struct futex_q *top_waiter;
- int ret;
- if (!IS_ENABLED(CONFIG_FUTEX_PI))
- return -ENOSYS;
- retry:
- if (get_user(uval, uaddr))
- return -EFAULT;
- /*
- * We release only a lock we actually own:
- */
- if ((uval & FUTEX_TID_MASK) != vpid)
- return -EPERM;
- ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, FUTEX_WRITE);
- if (ret)
- return ret;
- hb = futex_hash(&key);
- spin_lock(&hb->lock);
- /*
- * Check waiters first. We do not trust user space values at
- * all and we at least want to know if user space fiddled
- * with the futex value instead of blindly unlocking.
- */
- top_waiter = futex_top_waiter(hb, &key);
- if (top_waiter) {
- struct futex_pi_state *pi_state = top_waiter->pi_state;
- ret = -EINVAL;
- if (!pi_state)
- goto out_unlock;
- /*
- * If current does not own the pi_state then the futex is
- * inconsistent and user space fiddled with the futex value.
- */
- if (pi_state->owner != current)
- goto out_unlock;
- get_pi_state(pi_state);
- /*
- * By taking wait_lock while still holding hb->lock, we ensure
- * there is no point where we hold neither; and therefore
- * wake_futex_p() must observe a state consistent with what we
- * observed.
- *
- * In particular; this forces __rt_mutex_start_proxy() to
- * complete such that we're guaranteed to observe the
- * rt_waiter. Also see the WARN in wake_futex_pi().
- */
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
- spin_unlock(&hb->lock);
- /* drops pi_state->pi_mutex.wait_lock */
- ret = wake_futex_pi(uaddr, uval, pi_state);
- put_pi_state(pi_state);
- /*
- * Success, we're done! No tricky corner cases.
- */
- if (!ret)
- return ret;
- /*
- * The atomic access to the futex value generated a
- * pagefault, so retry the user-access and the wakeup:
- */
- if (ret == -EFAULT)
- goto pi_faulted;
- /*
- * A unconditional UNLOCK_PI op raced against a waiter
- * setting the FUTEX_WAITERS bit. Try again.
- */
- if (ret == -EAGAIN)
- goto pi_retry;
- /*
- * wake_futex_pi has detected invalid state. Tell user
- * space.
- */
- return ret;
- }
- /*
- * We have no kernel internal state, i.e. no waiters in the
- * kernel. Waiters which are about to queue themselves are stuck
- * on hb->lock. So we can safely ignore them. We do neither
- * preserve the WAITERS bit not the OWNER_DIED one. We are the
- * owner.
- */
- if ((ret = futex_cmpxchg_value_locked(&curval, uaddr, uval, 0))) {
- spin_unlock(&hb->lock);
- switch (ret) {
- case -EFAULT:
- goto pi_faulted;
- case -EAGAIN:
- goto pi_retry;
- default:
- WARN_ON_ONCE(1);
- return ret;
- }
- }
- /*
- * If uval has changed, let user space handle it.
- */
- ret = (curval == uval) ? 0 : -EAGAIN;
- out_unlock:
- spin_unlock(&hb->lock);
- return ret;
- pi_retry:
- cond_resched();
- goto retry;
- pi_faulted:
- ret = fault_in_user_writeable(uaddr);
- if (!ret)
- goto retry;
- return ret;
- }
|