Merge branch 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull locking updates from Ingo Molnar: "The main changes in this cycle were: - rwsem micro-optimizations (Davidlohr Bueso) - Improve the implementation and optimize the performance of percpu-rwsems. (Peter Zijlstra) - Convert all lglock users to better facilities such as percpu-rwsems or percpu-spinlocks and remove lglocks. (Peter Zijlstra) - Remove the ticket (spin)lock implementation. (Peter Zijlstra) - Korean translation of memory-barriers.txt and related fixes to the English document. (SeongJae Park) - misc fixes and cleanups" * 'locking-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (24 commits) x86/cmpxchg, locking/atomics: Remove superfluous definitions x86, locking/spinlocks: Remove ticket (spin)lock implementation locking/lglock: Remove lglock implementation stop_machine: Remove stop_cpus_lock and lg_double_lock/unlock() fs/locks: Use percpu_down_read_preempt_disable() locking/percpu-rwsem: Add down_read_preempt_disable() fs/locks: Replace lg_local with a per-cpu spinlock fs/locks: Replace lg_global with a percpu-rwsem locking/percpu-rwsem: Add DEFINE_STATIC_PERCPU_RWSEM and percpu_rwsem_assert_held() locking/pv-qspinlock: Use cmpxchg_release() in __pv_queued_spin_unlock() locking/rwsem, x86: Drop a bogus cc clobber futex: Add some more function commentary locking/hung_task: Show all locks locking/rwsem: Scan the wait_list for readers only once locking/rwsem: Remove a few useless comments locking/rwsem: Return void in __rwsem_mark_wake() locking, rcu, cgroup: Avoid synchronize_sched() in __cgroup_procs_write() locking/Documentation: Add Korean translation locking/Documentation: Fix a typo of example result locking/Documentation: Fix wrong section reference ...
This commit is contained in:
@@ -5627,6 +5627,12 @@ int __init cgroup_init(void)
|
||||
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
|
||||
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
|
||||
|
||||
/*
|
||||
* The latency of the synchronize_sched() is too high for cgroups,
|
||||
* avoid it at the cost of forcing all readers into the slow path.
|
||||
*/
|
||||
rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
|
||||
|
||||
get_user_ns(init_cgroup_ns.user_ns);
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
|
@@ -381,8 +381,12 @@ static inline int hb_waiters_pending(struct futex_hash_bucket *hb)
|
||||
#endif
|
||||
}
|
||||
|
||||
/*
|
||||
* We hash on the keys returned from get_futex_key (see below).
|
||||
/**
|
||||
* hash_futex - Return the hash bucket in the global hash
|
||||
* @key: Pointer to the futex key for which the hash is calculated
|
||||
*
|
||||
* We hash on the keys returned from get_futex_key (see below) and return the
|
||||
* corresponding hash bucket in the global hash.
|
||||
*/
|
||||
static struct futex_hash_bucket *hash_futex(union futex_key *key)
|
||||
{
|
||||
@@ -392,7 +396,12 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
|
||||
return &futex_queues[hash & (futex_hashsize - 1)];
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
/**
|
||||
* match_futex - Check whether two futex keys are equal
|
||||
* @key1: Pointer to key1
|
||||
* @key2: Pointer to key2
|
||||
*
|
||||
* Return 1 if two futex_keys are equal, 0 otherwise.
|
||||
*/
|
||||
static inline int match_futex(union futex_key *key1, union futex_key *key2)
|
||||
|
@@ -117,7 +117,7 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
|
||||
pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
|
||||
" disables this message.\n");
|
||||
sched_show_task(t);
|
||||
debug_show_held_locks(t);
|
||||
debug_show_all_locks();
|
||||
|
||||
touch_nmi_watchdog();
|
||||
|
||||
|
@@ -18,7 +18,6 @@ obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
|
||||
endif
|
||||
obj-$(CONFIG_SMP) += spinlock.o
|
||||
obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o
|
||||
obj-$(CONFIG_SMP) += lglock.o
|
||||
obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
|
||||
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
|
||||
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
|
||||
|
@@ -1,111 +0,0 @@
|
||||
/* See include/linux/lglock.h for description */
|
||||
#include <linux/module.h>
|
||||
#include <linux/lglock.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/string.h>
|
||||
|
||||
/*
|
||||
* Note there is no uninit, so lglocks cannot be defined in
|
||||
* modules (but it's fine to use them from there)
|
||||
* Could be added though, just undo lg_lock_init
|
||||
*/
|
||||
|
||||
void lg_lock_init(struct lglock *lg, char *name)
|
||||
{
|
||||
LOCKDEP_INIT_MAP(&lg->lock_dep_map, name, &lg->lock_key, 0);
|
||||
}
|
||||
EXPORT_SYMBOL(lg_lock_init);
|
||||
|
||||
void lg_local_lock(struct lglock *lg)
|
||||
{
|
||||
arch_spinlock_t *lock;
|
||||
|
||||
preempt_disable();
|
||||
lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
|
||||
lock = this_cpu_ptr(lg->lock);
|
||||
arch_spin_lock(lock);
|
||||
}
|
||||
EXPORT_SYMBOL(lg_local_lock);
|
||||
|
||||
void lg_local_unlock(struct lglock *lg)
|
||||
{
|
||||
arch_spinlock_t *lock;
|
||||
|
||||
lock_release(&lg->lock_dep_map, 1, _RET_IP_);
|
||||
lock = this_cpu_ptr(lg->lock);
|
||||
arch_spin_unlock(lock);
|
||||
preempt_enable();
|
||||
}
|
||||
EXPORT_SYMBOL(lg_local_unlock);
|
||||
|
||||
void lg_local_lock_cpu(struct lglock *lg, int cpu)
|
||||
{
|
||||
arch_spinlock_t *lock;
|
||||
|
||||
preempt_disable();
|
||||
lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
|
||||
lock = per_cpu_ptr(lg->lock, cpu);
|
||||
arch_spin_lock(lock);
|
||||
}
|
||||
EXPORT_SYMBOL(lg_local_lock_cpu);
|
||||
|
||||
void lg_local_unlock_cpu(struct lglock *lg, int cpu)
|
||||
{
|
||||
arch_spinlock_t *lock;
|
||||
|
||||
lock_release(&lg->lock_dep_map, 1, _RET_IP_);
|
||||
lock = per_cpu_ptr(lg->lock, cpu);
|
||||
arch_spin_unlock(lock);
|
||||
preempt_enable();
|
||||
}
|
||||
EXPORT_SYMBOL(lg_local_unlock_cpu);
|
||||
|
||||
void lg_double_lock(struct lglock *lg, int cpu1, int cpu2)
|
||||
{
|
||||
BUG_ON(cpu1 == cpu2);
|
||||
|
||||
/* lock in cpu order, just like lg_global_lock */
|
||||
if (cpu2 < cpu1)
|
||||
swap(cpu1, cpu2);
|
||||
|
||||
preempt_disable();
|
||||
lock_acquire_shared(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
|
||||
arch_spin_lock(per_cpu_ptr(lg->lock, cpu1));
|
||||
arch_spin_lock(per_cpu_ptr(lg->lock, cpu2));
|
||||
}
|
||||
|
||||
void lg_double_unlock(struct lglock *lg, int cpu1, int cpu2)
|
||||
{
|
||||
lock_release(&lg->lock_dep_map, 1, _RET_IP_);
|
||||
arch_spin_unlock(per_cpu_ptr(lg->lock, cpu1));
|
||||
arch_spin_unlock(per_cpu_ptr(lg->lock, cpu2));
|
||||
preempt_enable();
|
||||
}
|
||||
|
||||
void lg_global_lock(struct lglock *lg)
|
||||
{
|
||||
int i;
|
||||
|
||||
preempt_disable();
|
||||
lock_acquire_exclusive(&lg->lock_dep_map, 0, 0, NULL, _RET_IP_);
|
||||
for_each_possible_cpu(i) {
|
||||
arch_spinlock_t *lock;
|
||||
lock = per_cpu_ptr(lg->lock, i);
|
||||
arch_spin_lock(lock);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(lg_global_lock);
|
||||
|
||||
void lg_global_unlock(struct lglock *lg)
|
||||
{
|
||||
int i;
|
||||
|
||||
lock_release(&lg->lock_dep_map, 1, _RET_IP_);
|
||||
for_each_possible_cpu(i) {
|
||||
arch_spinlock_t *lock;
|
||||
lock = per_cpu_ptr(lg->lock, i);
|
||||
arch_spin_unlock(lock);
|
||||
}
|
||||
preempt_enable();
|
||||
}
|
||||
EXPORT_SYMBOL(lg_global_unlock);
|
@@ -8,152 +8,186 @@
|
||||
#include <linux/sched.h>
|
||||
#include <linux/errno.h>
|
||||
|
||||
int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
|
||||
int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
|
||||
const char *name, struct lock_class_key *rwsem_key)
|
||||
{
|
||||
brw->fast_read_ctr = alloc_percpu(int);
|
||||
if (unlikely(!brw->fast_read_ctr))
|
||||
sem->read_count = alloc_percpu(int);
|
||||
if (unlikely(!sem->read_count))
|
||||
return -ENOMEM;
|
||||
|
||||
/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
|
||||
__init_rwsem(&brw->rw_sem, name, rwsem_key);
|
||||
rcu_sync_init(&brw->rss, RCU_SCHED_SYNC);
|
||||
atomic_set(&brw->slow_read_ctr, 0);
|
||||
init_waitqueue_head(&brw->write_waitq);
|
||||
rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
|
||||
__init_rwsem(&sem->rw_sem, name, rwsem_key);
|
||||
init_waitqueue_head(&sem->writer);
|
||||
sem->readers_block = 0;
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
|
||||
|
||||
void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
|
||||
void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
|
||||
{
|
||||
/*
|
||||
* XXX: temporary kludge. The error path in alloc_super()
|
||||
* assumes that percpu_free_rwsem() is safe after kzalloc().
|
||||
*/
|
||||
if (!brw->fast_read_ctr)
|
||||
if (!sem->read_count)
|
||||
return;
|
||||
|
||||
rcu_sync_dtor(&brw->rss);
|
||||
free_percpu(brw->fast_read_ctr);
|
||||
brw->fast_read_ctr = NULL; /* catch use after free bugs */
|
||||
rcu_sync_dtor(&sem->rss);
|
||||
free_percpu(sem->read_count);
|
||||
sem->read_count = NULL; /* catch use after free bugs */
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(percpu_free_rwsem);
|
||||
|
||||
/*
|
||||
* This is the fast-path for down_read/up_read. If it succeeds we rely
|
||||
* on the barriers provided by rcu_sync_enter/exit; see the comments in
|
||||
* percpu_down_write() and percpu_up_write().
|
||||
*
|
||||
* If this helper fails the callers rely on the normal rw_semaphore and
|
||||
* atomic_dec_and_test(), so in this case we have the necessary barriers.
|
||||
*/
|
||||
static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
|
||||
{
|
||||
bool success;
|
||||
|
||||
preempt_disable();
|
||||
success = rcu_sync_is_idle(&brw->rss);
|
||||
if (likely(success))
|
||||
__this_cpu_add(*brw->fast_read_ctr, val);
|
||||
preempt_enable();
|
||||
|
||||
return success;
|
||||
}
|
||||
|
||||
/*
|
||||
* Like the normal down_read() this is not recursive, the writer can
|
||||
* come after the first percpu_down_read() and create the deadlock.
|
||||
*
|
||||
* Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
|
||||
* percpu_up_read() does rwsem_release(). This pairs with the usage
|
||||
* of ->rw_sem in percpu_down/up_write().
|
||||
*/
|
||||
void percpu_down_read(struct percpu_rw_semaphore *brw)
|
||||
{
|
||||
might_sleep();
|
||||
rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
|
||||
|
||||
if (likely(update_fast_ctr(brw, +1)))
|
||||
return;
|
||||
|
||||
/* Avoid rwsem_acquire_read() and rwsem_release() */
|
||||
__down_read(&brw->rw_sem);
|
||||
atomic_inc(&brw->slow_read_ctr);
|
||||
__up_read(&brw->rw_sem);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(percpu_down_read);
|
||||
|
||||
int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
|
||||
{
|
||||
if (unlikely(!update_fast_ctr(brw, +1))) {
|
||||
if (!__down_read_trylock(&brw->rw_sem))
|
||||
return 0;
|
||||
atomic_inc(&brw->slow_read_ctr);
|
||||
__up_read(&brw->rw_sem);
|
||||
}
|
||||
|
||||
rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
|
||||
return 1;
|
||||
}
|
||||
|
||||
void percpu_up_read(struct percpu_rw_semaphore *brw)
|
||||
{
|
||||
rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
|
||||
|
||||
if (likely(update_fast_ctr(brw, -1)))
|
||||
return;
|
||||
|
||||
/* false-positive is possible but harmless */
|
||||
if (atomic_dec_and_test(&brw->slow_read_ctr))
|
||||
wake_up_all(&brw->write_waitq);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(percpu_up_read);
|
||||
|
||||
static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
|
||||
{
|
||||
unsigned int sum = 0;
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
sum += per_cpu(*brw->fast_read_ctr, cpu);
|
||||
per_cpu(*brw->fast_read_ctr, cpu) = 0;
|
||||
}
|
||||
|
||||
return sum;
|
||||
}
|
||||
|
||||
void percpu_down_write(struct percpu_rw_semaphore *brw)
|
||||
int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
|
||||
{
|
||||
/*
|
||||
* Make rcu_sync_is_idle() == F and thus disable the fast-path in
|
||||
* percpu_down_read() and percpu_up_read(), and wait for gp pass.
|
||||
* Due to having preemption disabled the decrement happens on
|
||||
* the same CPU as the increment, avoiding the
|
||||
* increment-on-one-CPU-and-decrement-on-another problem.
|
||||
*
|
||||
* The latter synchronises us with the preceding readers which used
|
||||
* the fast-path, so we cannot miss the result of __this_cpu_add()
|
||||
* or anything else inside their critical sections.
|
||||
* If the reader misses the writer's assignment of readers_block, then
|
||||
* the writer is guaranteed to see the reader's increment.
|
||||
*
|
||||
* Conversely, any readers that increment their sem->read_count after
|
||||
* the writer looks are guaranteed to see the readers_block value,
|
||||
* which in turn means that they are guaranteed to immediately
|
||||
* decrement their sem->read_count, so that it doesn't matter that the
|
||||
* writer missed them.
|
||||
*/
|
||||
rcu_sync_enter(&brw->rss);
|
||||
|
||||
/* exclude other writers, and block the new readers completely */
|
||||
down_write(&brw->rw_sem);
|
||||
smp_mb(); /* A matches D */
|
||||
|
||||
/* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
|
||||
atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
|
||||
/*
|
||||
* If !readers_block the critical section starts here, matched by the
|
||||
* release in percpu_up_write().
|
||||
*/
|
||||
if (likely(!smp_load_acquire(&sem->readers_block)))
|
||||
return 1;
|
||||
|
||||
/* wait for all readers to complete their percpu_up_read() */
|
||||
wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
|
||||
/*
|
||||
* Per the above comment; we still have preemption disabled and
|
||||
* will thus decrement on the same CPU as we incremented.
|
||||
*/
|
||||
__percpu_up_read(sem);
|
||||
|
||||
if (try)
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* We either call schedule() in the wait, or we'll fall through
|
||||
* and reschedule on the preempt_enable() in percpu_down_read().
|
||||
*/
|
||||
preempt_enable_no_resched();
|
||||
|
||||
/*
|
||||
* Avoid lockdep for the down/up_read() we already have them.
|
||||
*/
|
||||
__down_read(&sem->rw_sem);
|
||||
this_cpu_inc(*sem->read_count);
|
||||
__up_read(&sem->rw_sem);
|
||||
|
||||
preempt_disable();
|
||||
return 1;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__percpu_down_read);
|
||||
|
||||
void __percpu_up_read(struct percpu_rw_semaphore *sem)
|
||||
{
|
||||
smp_mb(); /* B matches C */
|
||||
/*
|
||||
* In other words, if they see our decrement (presumably to aggregate
|
||||
* zero, as that is the only time it matters) they will also see our
|
||||
* critical section.
|
||||
*/
|
||||
__this_cpu_dec(*sem->read_count);
|
||||
|
||||
/* Prod writer to recheck readers_active */
|
||||
wake_up(&sem->writer);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__percpu_up_read);
|
||||
|
||||
#define per_cpu_sum(var) \
|
||||
({ \
|
||||
typeof(var) __sum = 0; \
|
||||
int cpu; \
|
||||
compiletime_assert_atomic_type(__sum); \
|
||||
for_each_possible_cpu(cpu) \
|
||||
__sum += per_cpu(var, cpu); \
|
||||
__sum; \
|
||||
})
|
||||
|
||||
/*
|
||||
* Return true if the modular sum of the sem->read_count per-CPU variable is
|
||||
* zero. If this sum is zero, then it is stable due to the fact that if any
|
||||
* newly arriving readers increment a given counter, they will immediately
|
||||
* decrement that same counter.
|
||||
*/
|
||||
static bool readers_active_check(struct percpu_rw_semaphore *sem)
|
||||
{
|
||||
if (per_cpu_sum(*sem->read_count) != 0)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* If we observed the decrement; ensure we see the entire critical
|
||||
* section.
|
||||
*/
|
||||
|
||||
smp_mb(); /* C matches B */
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void percpu_down_write(struct percpu_rw_semaphore *sem)
|
||||
{
|
||||
/* Notify readers to take the slow path. */
|
||||
rcu_sync_enter(&sem->rss);
|
||||
|
||||
down_write(&sem->rw_sem);
|
||||
|
||||
/*
|
||||
* Notify new readers to block; up until now, and thus throughout the
|
||||
* longish rcu_sync_enter() above, new readers could still come in.
|
||||
*/
|
||||
WRITE_ONCE(sem->readers_block, 1);
|
||||
|
||||
smp_mb(); /* D matches A */
|
||||
|
||||
/*
|
||||
* If they don't see our write of readers_block, then we are
|
||||
* guaranteed to see their sem->read_count increment, and therefore
|
||||
* will wait for them.
|
||||
*/
|
||||
|
||||
/* Wait for all now active readers to complete. */
|
||||
wait_event(sem->writer, readers_active_check(sem));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(percpu_down_write);
|
||||
|
||||
void percpu_up_write(struct percpu_rw_semaphore *brw)
|
||||
void percpu_up_write(struct percpu_rw_semaphore *sem)
|
||||
{
|
||||
/* release the lock, but the readers can't use the fast-path */
|
||||
up_write(&brw->rw_sem);
|
||||
/*
|
||||
* Enable the fast-path in percpu_down_read() and percpu_up_read()
|
||||
* but only after another gp pass; this adds the necessary barrier
|
||||
* to ensure the reader can't miss the changes done by us.
|
||||
* Signal the writer is done, no fast path yet.
|
||||
*
|
||||
* One reason that we cannot just immediately flip to readers_fast is
|
||||
* that new readers might fail to see the results of this writer's
|
||||
* critical section.
|
||||
*
|
||||
* Therefore we force it through the slow path which guarantees an
|
||||
* acquire and thereby guarantees the critical section's consistency.
|
||||
*/
|
||||
rcu_sync_exit(&brw->rss);
|
||||
smp_store_release(&sem->readers_block, 0);
|
||||
|
||||
/*
|
||||
* Release the write lock, this will allow readers back in the game.
|
||||
*/
|
||||
up_write(&sem->rw_sem);
|
||||
|
||||
/*
|
||||
* Once this completes (at least one RCU-sched grace period hence) the
|
||||
* reader fast path will be available again. Safe to use outside the
|
||||
* exclusive write lock because it's counting.
|
||||
*/
|
||||
rcu_sync_exit(&sem->rss);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(percpu_up_write);
|
||||
|
@@ -70,11 +70,14 @@ struct pv_node {
|
||||
static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
|
||||
{
|
||||
struct __qspinlock *l = (void *)lock;
|
||||
int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
|
||||
(cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
|
||||
|
||||
qstat_inc(qstat_pv_lock_stealing, ret);
|
||||
return ret;
|
||||
if (!(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
|
||||
(cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0)) {
|
||||
qstat_inc(qstat_pv_lock_stealing, true);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -257,7 +260,6 @@ static struct pv_node *pv_unhash(struct qspinlock *lock)
|
||||
static inline bool
|
||||
pv_wait_early(struct pv_node *prev, int loop)
|
||||
{
|
||||
|
||||
if ((loop & PV_PREV_CHECK_MASK) != 0)
|
||||
return false;
|
||||
|
||||
@@ -286,12 +288,10 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
|
||||
{
|
||||
struct pv_node *pn = (struct pv_node *)node;
|
||||
struct pv_node *pp = (struct pv_node *)prev;
|
||||
int waitcnt = 0;
|
||||
int loop;
|
||||
bool wait_early;
|
||||
|
||||
/* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
|
||||
for (;; waitcnt++) {
|
||||
for (;;) {
|
||||
for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) {
|
||||
if (READ_ONCE(node->locked))
|
||||
return;
|
||||
@@ -315,7 +315,6 @@ static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev)
|
||||
|
||||
if (!READ_ONCE(node->locked)) {
|
||||
qstat_inc(qstat_pv_wait_node, true);
|
||||
qstat_inc(qstat_pv_wait_again, waitcnt);
|
||||
qstat_inc(qstat_pv_wait_early, wait_early);
|
||||
pv_wait(&pn->state, vcpu_halted);
|
||||
}
|
||||
@@ -456,12 +455,9 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
|
||||
pv_wait(&l->locked, _Q_SLOW_VAL);
|
||||
|
||||
/*
|
||||
* The unlocker should have freed the lock before kicking the
|
||||
* CPU. So if the lock is still not free, it is a spurious
|
||||
* wakeup or another vCPU has stolen the lock. The current
|
||||
* vCPU should spin again.
|
||||
* Because of lock stealing, the queue head vCPU may not be
|
||||
* able to acquire the lock before it has to wait again.
|
||||
*/
|
||||
qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -544,7 +540,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
|
||||
* unhash. Otherwise it would be possible to have multiple @lock
|
||||
* entries, which would be BAD.
|
||||
*/
|
||||
locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
|
||||
locked = cmpxchg_release(&l->locked, _Q_LOCKED_VAL, 0);
|
||||
if (likely(locked == _Q_LOCKED_VAL))
|
||||
return;
|
||||
|
||||
|
@@ -24,8 +24,8 @@
|
||||
* pv_latency_wake - average latency (ns) from vCPU kick to wakeup
|
||||
* pv_lock_slowpath - # of locking operations via the slowpath
|
||||
* pv_lock_stealing - # of lock stealing operations
|
||||
* pv_spurious_wakeup - # of spurious wakeups
|
||||
* pv_wait_again - # of vCPU wait's that happened after a vCPU kick
|
||||
* pv_spurious_wakeup - # of spurious wakeups in non-head vCPUs
|
||||
* pv_wait_again - # of wait's after a queue head vCPU kick
|
||||
* pv_wait_early - # of early vCPU wait's
|
||||
* pv_wait_head - # of vCPU wait's at the queue head
|
||||
* pv_wait_node - # of vCPU wait's at a non-head queue node
|
||||
|
@@ -121,16 +121,19 @@ enum rwsem_wake_type {
|
||||
* - woken process blocks are discarded from the list after having task zeroed
|
||||
* - writers are only marked woken if downgrading is false
|
||||
*/
|
||||
static struct rw_semaphore *
|
||||
__rwsem_mark_wake(struct rw_semaphore *sem,
|
||||
enum rwsem_wake_type wake_type, struct wake_q_head *wake_q)
|
||||
static void __rwsem_mark_wake(struct rw_semaphore *sem,
|
||||
enum rwsem_wake_type wake_type,
|
||||
struct wake_q_head *wake_q)
|
||||
{
|
||||
struct rwsem_waiter *waiter;
|
||||
struct task_struct *tsk;
|
||||
struct list_head *next;
|
||||
long oldcount, woken, loop, adjustment;
|
||||
struct rwsem_waiter *waiter, *tmp;
|
||||
long oldcount, woken = 0, adjustment = 0;
|
||||
|
||||
/*
|
||||
* Take a peek at the queue head waiter such that we can determine
|
||||
* the wakeup(s) to perform.
|
||||
*/
|
||||
waiter = list_first_entry(&sem->wait_list, struct rwsem_waiter, list);
|
||||
|
||||
waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
|
||||
if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
|
||||
if (wake_type == RWSEM_WAKE_ANY) {
|
||||
/*
|
||||
@@ -142,19 +145,19 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
|
||||
*/
|
||||
wake_q_add(wake_q, waiter->task);
|
||||
}
|
||||
goto out;
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
/* Writers might steal the lock before we grant it to the next reader.
|
||||
/*
|
||||
* Writers might steal the lock before we grant it to the next reader.
|
||||
* We prefer to do the first reader grant before counting readers
|
||||
* so we can bail out early if a writer stole the lock.
|
||||
*/
|
||||
adjustment = 0;
|
||||
if (wake_type != RWSEM_WAKE_READ_OWNED) {
|
||||
adjustment = RWSEM_ACTIVE_READ_BIAS;
|
||||
try_reader_grant:
|
||||
oldcount = atomic_long_fetch_add(adjustment, &sem->count);
|
||||
|
||||
if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
|
||||
/*
|
||||
* If the count is still less than RWSEM_WAITING_BIAS
|
||||
@@ -164,7 +167,8 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
|
||||
*/
|
||||
if (atomic_long_add_return(-adjustment, &sem->count) <
|
||||
RWSEM_WAITING_BIAS)
|
||||
goto out;
|
||||
return;
|
||||
|
||||
/* Last active locker left. Retry waking readers. */
|
||||
goto try_reader_grant;
|
||||
}
|
||||
@@ -176,38 +180,23 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
|
||||
rwsem_set_reader_owned(sem);
|
||||
}
|
||||
|
||||
/* Grant an infinite number of read locks to the readers at the front
|
||||
* of the queue. Note we increment the 'active part' of the count by
|
||||
* the number of readers before waking any processes up.
|
||||
/*
|
||||
* Grant an infinite number of read locks to the readers at the front
|
||||
* of the queue. We know that woken will be at least 1 as we accounted
|
||||
* for above. Note we increment the 'active part' of the count by the
|
||||
* number of readers before waking any processes up.
|
||||
*/
|
||||
woken = 0;
|
||||
do {
|
||||
woken++;
|
||||
list_for_each_entry_safe(waiter, tmp, &sem->wait_list, list) {
|
||||
struct task_struct *tsk;
|
||||
|
||||
if (waiter->list.next == &sem->wait_list)
|
||||
if (waiter->type == RWSEM_WAITING_FOR_WRITE)
|
||||
break;
|
||||
|
||||
waiter = list_entry(waiter->list.next,
|
||||
struct rwsem_waiter, list);
|
||||
|
||||
} while (waiter->type != RWSEM_WAITING_FOR_WRITE);
|
||||
|
||||
adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
|
||||
if (waiter->type != RWSEM_WAITING_FOR_WRITE)
|
||||
/* hit end of list above */
|
||||
adjustment -= RWSEM_WAITING_BIAS;
|
||||
|
||||
if (adjustment)
|
||||
atomic_long_add(adjustment, &sem->count);
|
||||
|
||||
next = sem->wait_list.next;
|
||||
loop = woken;
|
||||
do {
|
||||
waiter = list_entry(next, struct rwsem_waiter, list);
|
||||
next = waiter->list.next;
|
||||
woken++;
|
||||
tsk = waiter->task;
|
||||
|
||||
wake_q_add(wake_q, tsk);
|
||||
list_del(&waiter->list);
|
||||
/*
|
||||
* Ensure that the last operation is setting the reader
|
||||
* waiter to nil such that rwsem_down_read_failed() cannot
|
||||
@@ -215,13 +204,16 @@ __rwsem_mark_wake(struct rw_semaphore *sem,
|
||||
* to the task to wakeup.
|
||||
*/
|
||||
smp_store_release(&waiter->task, NULL);
|
||||
} while (--loop);
|
||||
}
|
||||
|
||||
sem->wait_list.next = next;
|
||||
next->prev = &sem->wait_list;
|
||||
adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
|
||||
if (list_empty(&sem->wait_list)) {
|
||||
/* hit end of list above */
|
||||
adjustment -= RWSEM_WAITING_BIAS;
|
||||
}
|
||||
|
||||
out:
|
||||
return sem;
|
||||
if (adjustment)
|
||||
atomic_long_add(adjustment, &sem->count);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -235,7 +227,6 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
|
||||
struct task_struct *tsk = current;
|
||||
WAKE_Q(wake_q);
|
||||
|
||||
/* set up my own style of waitqueue */
|
||||
waiter.task = tsk;
|
||||
waiter.type = RWSEM_WAITING_FOR_READ;
|
||||
|
||||
@@ -247,7 +238,8 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
|
||||
/* we're now waiting on the lock, but no longer actively locking */
|
||||
count = atomic_long_add_return(adjustment, &sem->count);
|
||||
|
||||
/* If there are no active locks, wake the front queued process(es).
|
||||
/*
|
||||
* If there are no active locks, wake the front queued process(es).
|
||||
*
|
||||
* If there are no writers and we are first in the queue,
|
||||
* wake our own waiter to join the existing active readers !
|
||||
@@ -255,7 +247,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
|
||||
if (count == RWSEM_WAITING_BIAS ||
|
||||
(count > RWSEM_WAITING_BIAS &&
|
||||
adjustment != -RWSEM_ACTIVE_READ_BIAS))
|
||||
sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
|
||||
__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
|
||||
|
||||
raw_spin_unlock_irq(&sem->wait_lock);
|
||||
wake_up_q(&wake_q);
|
||||
@@ -505,7 +497,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
|
||||
if (count > RWSEM_WAITING_BIAS) {
|
||||
WAKE_Q(wake_q);
|
||||
|
||||
sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
|
||||
__rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
|
||||
/*
|
||||
* The wakeup is normally called _after_ the wait_lock
|
||||
* is released, but given that we are proactively waking
|
||||
@@ -614,9 +606,8 @@ struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
|
||||
raw_spin_lock_irqsave(&sem->wait_lock, flags);
|
||||
locked:
|
||||
|
||||
/* do nothing if list empty */
|
||||
if (!list_empty(&sem->wait_list))
|
||||
sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
|
||||
__rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
|
||||
|
||||
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
|
||||
wake_up_q(&wake_q);
|
||||
@@ -638,9 +629,8 @@ struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
|
||||
|
||||
raw_spin_lock_irqsave(&sem->wait_lock, flags);
|
||||
|
||||
/* do nothing if list empty */
|
||||
if (!list_empty(&sem->wait_list))
|
||||
sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
|
||||
__rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
|
||||
|
||||
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
|
||||
wake_up_q(&wake_q);
|
||||
|
@@ -68,6 +68,8 @@ void rcu_sync_lockdep_assert(struct rcu_sync *rsp)
|
||||
RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
|
||||
"suspicious rcu_sync_is_idle() usage");
|
||||
}
|
||||
|
||||
EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert);
|
||||
#endif
|
||||
|
||||
/**
|
||||
@@ -82,6 +84,18 @@ void rcu_sync_init(struct rcu_sync *rsp, enum rcu_sync_type type)
|
||||
rsp->gp_type = type;
|
||||
}
|
||||
|
||||
/**
|
||||
* Must be called after rcu_sync_init() and before first use.
|
||||
*
|
||||
* Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
|
||||
* pairs turn into NO-OPs.
|
||||
*/
|
||||
void rcu_sync_enter_start(struct rcu_sync *rsp)
|
||||
{
|
||||
rsp->gp_count++;
|
||||
rsp->gp_state = GP_PASSED;
|
||||
}
|
||||
|
||||
/**
|
||||
* rcu_sync_enter() - Force readers onto slowpath
|
||||
* @rsp: Pointer to rcu_sync structure to use for synchronization
|
||||
|
@@ -20,7 +20,6 @@
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/smpboot.h>
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/lglock.h>
|
||||
#include <linux/nmi.h>
|
||||
|
||||
/*
|
||||
@@ -47,13 +46,9 @@ struct cpu_stopper {
|
||||
static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper);
|
||||
static bool stop_machine_initialized = false;
|
||||
|
||||
/*
|
||||
* Avoids a race between stop_two_cpus and global stop_cpus, where
|
||||
* the stoppers could get queued up in reverse order, leading to
|
||||
* system deadlock. Using an lglock means stop_two_cpus remains
|
||||
* relatively cheap.
|
||||
*/
|
||||
DEFINE_STATIC_LGLOCK(stop_cpus_lock);
|
||||
/* static data for stop_cpus */
|
||||
static DEFINE_MUTEX(stop_cpus_mutex);
|
||||
static bool stop_cpus_in_progress;
|
||||
|
||||
static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo)
|
||||
{
|
||||
@@ -230,14 +225,26 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
|
||||
struct cpu_stopper *stopper1 = per_cpu_ptr(&cpu_stopper, cpu1);
|
||||
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
|
||||
int err;
|
||||
|
||||
lg_double_lock(&stop_cpus_lock, cpu1, cpu2);
|
||||
retry:
|
||||
spin_lock_irq(&stopper1->lock);
|
||||
spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
|
||||
|
||||
err = -ENOENT;
|
||||
if (!stopper1->enabled || !stopper2->enabled)
|
||||
goto unlock;
|
||||
/*
|
||||
* Ensure that if we race with __stop_cpus() the stoppers won't get
|
||||
* queued up in reverse order leading to system deadlock.
|
||||
*
|
||||
* We can't miss stop_cpus_in_progress if queue_stop_cpus_work() has
|
||||
* queued a work on cpu1 but not on cpu2, we hold both locks.
|
||||
*
|
||||
* It can be falsely true but it is safe to spin until it is cleared,
|
||||
* queue_stop_cpus_work() does everything under preempt_disable().
|
||||
*/
|
||||
err = -EDEADLK;
|
||||
if (unlikely(stop_cpus_in_progress))
|
||||
goto unlock;
|
||||
|
||||
err = 0;
|
||||
__cpu_stop_queue_work(stopper1, work1);
|
||||
@@ -245,8 +252,12 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1,
|
||||
unlock:
|
||||
spin_unlock(&stopper2->lock);
|
||||
spin_unlock_irq(&stopper1->lock);
|
||||
lg_double_unlock(&stop_cpus_lock, cpu1, cpu2);
|
||||
|
||||
if (unlikely(err == -EDEADLK)) {
|
||||
while (stop_cpus_in_progress)
|
||||
cpu_relax();
|
||||
goto retry;
|
||||
}
|
||||
return err;
|
||||
}
|
||||
/**
|
||||
@@ -316,9 +327,6 @@ bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
|
||||
return cpu_stop_queue_work(cpu, work_buf);
|
||||
}
|
||||
|
||||
/* static data for stop_cpus */
|
||||
static DEFINE_MUTEX(stop_cpus_mutex);
|
||||
|
||||
static bool queue_stop_cpus_work(const struct cpumask *cpumask,
|
||||
cpu_stop_fn_t fn, void *arg,
|
||||
struct cpu_stop_done *done)
|
||||
@@ -332,7 +340,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
|
||||
* preempted by a stopper which might wait for other stoppers
|
||||
* to enter @fn which can lead to deadlock.
|
||||
*/
|
||||
lg_global_lock(&stop_cpus_lock);
|
||||
preempt_disable();
|
||||
stop_cpus_in_progress = true;
|
||||
for_each_cpu(cpu, cpumask) {
|
||||
work = &per_cpu(cpu_stopper.stop_work, cpu);
|
||||
work->fn = fn;
|
||||
@@ -341,7 +350,8 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask,
|
||||
if (cpu_stop_queue_work(cpu, work))
|
||||
queued = true;
|
||||
}
|
||||
lg_global_unlock(&stop_cpus_lock);
|
||||
stop_cpus_in_progress = false;
|
||||
preempt_enable();
|
||||
|
||||
return queued;
|
||||
}
|
||||
|
Reference in New Issue
Block a user