Merge branch 'odp_fixes' into hmm.git
From rdma.git Jason Gunthorpe says:

====================
This is a collection of general cleanups for ODP to clarify some of the
flows around umem creation and use of the interval tree.
====================

The branch is based on v5.3-rc5 due to dependencies, and is being taken
into hmm.git due to dependencies in the next patches.

* odp_fixes:
  RDMA/mlx5: Use odp instead of mr->umem in pagefault_mr
  RDMA/mlx5: Use ib_umem_start instead of umem.address
  RDMA/core: Make invalidate_range a device operation
  RDMA/odp: Use kvcalloc for the dma_list and page_list
  RDMA/odp: Check for overflow when computing the umem_odp end
  RDMA/odp: Provide ib_umem_odp_release() to undo the allocs
  RDMA/odp: Split creating a umem_odp from ib_umem_get
  RDMA/odp: Make the three ways to create a umem_odp clear
  RMDA/odp: Consolidate umem_odp initialization
  RDMA/odp: Make it clearer when a umem is an implicit ODP umem
  RDMA/odp: Iterate over the whole rbtree directly
  RDMA/odp: Use the common interval tree library instead of generic
  RDMA/mlx5: Fix MR npages calculation for IB_ACCESS_HUGETLB

Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
@@ -35,10 +35,10 @@ config PREEMPT_VOLUNTARY
Select this if you are building a kernel for a desktop system.
config PREEMPT_LL
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT
select PREEMPT
select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
help
This option reduces the latency of the kernel by making

@@ -58,7 +58,7 @@ config PREEMPT_LL
config PREEMPT_RT
bool "Fully Preemptible Kernel (Real-Time)"
depends on EXPERT && ARCH_SUPPORTS_RT
select PREEMPT
select PREEMPTION
help
This option turns the kernel into a real-time kernel by replacing
various locking primitives (spinlocks, rwlocks, etc.) with

@@ -77,6 +77,6 @@ endchoice
config PREEMPT_COUNT
bool
config PREEMPT
config PREEMPTION
bool
select PREEMPT_COUNT

@@ -111,7 +111,6 @@ obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
obj-$(CONFIG_HAS_IOMEM) += iomem.o
obj-$(CONFIG_ZONE_DEVICE) += memremap.o
obj-$(CONFIG_RSEQ) += rseq.o
obj-$(CONFIG_GCC_PLUGIN_STACKLEAK) += stackleak.o
@@ -8616,8 +8616,8 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env)
}
if (is_narrower_load && size < target_size) {
u8 shift = (off & (size_default - 1)) * 8;
u8 shift = bpf_ctx_narrow_load_shift(off, size,
size_default);
if (ctx_field_size <= 4) {
if (shift)
insn_buf[cnt++] = BPF_ALU32_IMM(BPF_RSH,
@@ -1,3 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* kernel/configs.c
* Echo the kernel .config file used to build the kernel

@@ -6,21 +7,6 @@
* Copyright (C) 2002 Randy Dunlap <rdunlap@xenotime.net>
* Copyright (C) 2002 Al Stone <ahs3@fc.hp.com>
* Copyright (C) 2002 Hewlett-Packard Company
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
* NON INFRINGEMENT. See the GNU General Public License for more
* details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
*/
#include <linux/kernel.h>
@@ -144,7 +144,10 @@ void __put_cred(struct cred *cred)
BUG_ON(cred == current->cred);
BUG_ON(cred == current->real_cred);
call_rcu(&cred->rcu, put_cred_rcu);
if (cred->non_rcu)
put_cred_rcu(&cred->rcu);
else
call_rcu(&cred->rcu, put_cred_rcu);
}
EXPORT_SYMBOL(__put_cred);

@@ -261,6 +264,7 @@ struct cred *prepare_creds(void)
old = task->cred;
memcpy(new, old, sizeof(struct cred));
new->non_rcu = 0;
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_group_info(new->group_info);

@@ -544,7 +548,19 @@ const struct cred *override_creds(const struct cred *new)
validate_creds(old);
validate_creds(new);
get_cred(new);
/*
* NOTE! This uses 'get_new_cred()' rather than 'get_cred()'.
*
* That means that we do not clear the 'non_rcu' flag, since
* we are only installing the cred into the thread-synchronous
* '->cred' pointer, not the '->real_cred' pointer that is
* visible to other threads under RCU.
*
* Also note that we did validate_creds() manually, not depending
* on the validation in 'get_cred()'.
*/
get_new_cred((struct cred *)new);
alter_cred_subscribers(new, 1);
rcu_assign_pointer(current->cred, new);
alter_cred_subscribers(old, -1);

@@ -681,6 +697,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
validate_creds(old);
*new = *old;
new->non_rcu = 0;
atomic_set(&new->usage, 1);
set_cred_subscribers(new, 0);
get_uid(new->user);
@@ -243,8 +243,9 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
/* CMA can be used only in the context which permits sleeping */
if (cma && gfpflags_allow_blocking(gfp)) {
align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
page = cma_alloc(cma, count, align, gfp & __GFP_NOWARN);
size_t cma_align = min_t(size_t, align, CONFIG_CMA_ALIGNMENT);
page = cma_alloc(cma, count, cma_align, gfp & __GFP_NOWARN);
}
/* Fallback allocation of normal pages */

@@ -266,7 +267,8 @@ struct page *dma_alloc_contiguous(struct device *dev, size_t size, gfp_t gfp)
*/
void dma_free_contiguous(struct device *dev, struct page *page, size_t size)
{
if (!cma_release(dev_get_cma_area(dev), page, size >> PAGE_SHIFT))
if (!cma_release(dev_get_cma_area(dev), page,
PAGE_ALIGN(size) >> PAGE_SHIFT))
__free_pages(page, get_order(size));
}
@@ -47,9 +47,6 @@ u64 dma_direct_get_required_mask(struct device *dev)
{
u64 max_dma = phys_to_dma_direct(dev, (max_pfn - 1) << PAGE_SHIFT);
if (dev->bus_dma_mask && dev->bus_dma_mask < max_dma)
max_dma = dev->bus_dma_mask;
return (1ULL << (fls64(max_dma) - 1)) * 2 - 1;
}

@@ -130,10 +127,12 @@ void *dma_direct_alloc_pages(struct device *dev, size_t size,
if (!page)
return NULL;
if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
!force_dma_unencrypted(dev)) {
/* remove any dirty cache lines on the kernel alias */
if (!PageHighMem(page))
arch_dma_prep_coherent(page, size);
*dma_handle = phys_to_dma(dev, page_to_phys(page));
/* return the page pointer as the opaque cookie */
return page;
}

@@ -178,7 +177,8 @@ void dma_direct_free_pages(struct device *dev, size_t size, void *cpu_addr,
{
unsigned int page_order = get_order(size);
if (attrs & DMA_ATTR_NO_KERNEL_MAPPING) {
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
!force_dma_unencrypted(dev)) {
/* cpu_addr is a struct page cookie, not a kernel address */
__dma_direct_free_pages(dev, size, cpu_addr);
return;
@@ -116,11 +116,16 @@ int dma_common_get_sgtable(struct device *dev, struct sg_table *sgt,
int ret;
if (!dev_is_dma_coherent(dev)) {
unsigned long pfn;
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN))
return -ENXIO;
page = pfn_to_page(arch_dma_coherent_to_pfn(dev, cpu_addr,
dma_addr));
/* If the PFN is not valid, we do not have a struct page */
pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr);
if (!pfn_valid(pfn))
return -ENXIO;
page = pfn_to_page(pfn);
} else {
page = virt_to_page(cpu_addr);
}

@@ -145,6 +150,23 @@ int dma_get_sgtable_attrs(struct device *dev, struct sg_table *sgt,
}
EXPORT_SYMBOL(dma_get_sgtable_attrs);
#ifdef CONFIG_MMU
/*
* Return the page attributes used for mapping dma_alloc_* memory, either in
* kernel space if remapping is needed, or to userspace through dma_mmap_*.
*/
pgprot_t dma_pgprot(struct device *dev, pgprot_t prot, unsigned long attrs)
{
if (dev_is_dma_coherent(dev) ||
(IS_ENABLED(CONFIG_DMA_NONCOHERENT_CACHE_SYNC) &&
(attrs & DMA_ATTR_NON_CONSISTENT)))
return prot;
if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_MMAP_PGPROT))
return arch_dma_mmap_pgprot(dev, prot, attrs);
return pgprot_noncached(prot);
}
#endif /* CONFIG_MMU */
/*
* Create userspace mapping for the DMA-coherent memory.
*/

@@ -159,7 +181,7 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
unsigned long pfn;
int ret = -ENXIO;
vma->vm_page_prot = arch_dma_mmap_pgprot(dev, vma->vm_page_prot, attrs);
vma->vm_page_prot = dma_pgprot(dev, vma->vm_page_prot, attrs);
if (dma_mmap_from_dev_coherent(dev, vma, cpu_addr, size, &ret))
return ret;

@@ -170,7 +192,11 @@ int dma_common_mmap(struct device *dev, struct vm_area_struct *vma,
if (!dev_is_dma_coherent(dev)) {
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_COHERENT_TO_PFN))
return -ENXIO;
/* If the PFN is not valid, we do not have a struct page */
pfn = arch_dma_coherent_to_pfn(dev, cpu_addr, dma_addr);
if (!pfn_valid(pfn))
return -ENXIO;
} else {
pfn = page_to_pfn(virt_to_page(cpu_addr));
}
@@ -218,7 +218,7 @@ void *arch_dma_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
/* create a coherent mapping */
ret = dma_common_contiguous_remap(page, size, VM_USERMAP,
arch_dma_mmap_pgprot(dev, PAGE_KERNEL, attrs),
dma_pgprot(dev, PAGE_KERNEL, attrs),
__builtin_return_address(0));
if (!ret) {
__dma_direct_free_pages(dev, size, page);
@@ -11274,7 +11274,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
goto err_unlock;
}
perf_install_in_context(ctx, event, cpu);
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
@@ -720,6 +720,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
if (group_dead)
kill_orphaned_pgrp(tsk->group_leader, NULL);
tsk->exit_state = EXIT_ZOMBIE;
if (unlikely(tsk->ptrace)) {
int sig = thread_group_leader(tsk) &&
thread_group_empty(tsk) &&

@@ -733,9 +734,10 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
autoreap = true;
}
tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
if (tsk->exit_state == EXIT_DEAD)
if (autoreap) {
tsk->exit_state = EXIT_DEAD;
list_add(&tsk->ptrace_entry, &dead);
}
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
@@ -726,7 +726,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(tsk == current);
cgroup_free(tsk);
task_numa_free(tsk);
task_numa_free(tsk, true);
security_task_free(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
@@ -251,11 +251,9 @@ irq_create_affinity_masks(unsigned int nvecs, struct irq_affinity *affd)
* Determine the number of vectors which need interrupt affinities
* assigned. If the pre/post request exhausts the available vectors
* then nothing to do here except for invoking the calc_sets()
* callback so the device driver can adjust to the situation. If there
* is only a single vector, then managing the queue is pointless as
* well.
* callback so the device driver can adjust to the situation.
*/
if (nvecs > 1 && nvecs > affd->pre_vectors + affd->post_vectors)
if (nvecs > affd->pre_vectors + affd->post_vectors)
affvecs = nvecs - affd->pre_vectors - affd->post_vectors;
else
affvecs = 0;
@@ -448,7 +448,7 @@ static void print_lockdep_off(const char *bug_msg)
unsigned long nr_stack_trace_entries;
#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
#ifdef CONFIG_PROVE_LOCKING
/*
* Stack-trace: tightly packed array of stack backtrace
* addresses. Protected by the graph_lock.

@@ -491,7 +491,7 @@ unsigned int max_lockdep_depth;
DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats);
#endif
#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
#ifdef CONFIG_PROVE_LOCKING
/*
* Locking printouts:
*/

@@ -2969,7 +2969,7 @@ static void check_chain_key(struct task_struct *curr)
#endif
}
#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
#ifdef CONFIG_PROVE_LOCKING
static int mark_lock(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit new_bit);

@@ -3608,7 +3608,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
return ret;
}
#else /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
#else /* CONFIG_PROVE_LOCKING */
static inline int
mark_usage(struct task_struct *curr, struct held_lock *hlock, int check)

@@ -3627,7 +3627,7 @@ static inline int separate_irq_context(struct task_struct *curr,
return 0;
}
#endif /* defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING) */
#endif /* CONFIG_PROVE_LOCKING */
/*
* Initialize a lock instance's lock-class mapping info:

@@ -4321,8 +4321,7 @@ static void __lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie cookie
*/
static void check_flags(unsigned long flags)
{
#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
defined(CONFIG_TRACE_IRQFLAGS)
#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP)
if (!debug_locks)
return;
@@ -200,7 +200,6 @@ static void lockdep_stats_debug_show(struct seq_file *m)
static int lockdep_stats_show(struct seq_file *m, void *v)
{
struct lock_class *class;
unsigned long nr_unused = 0, nr_uncategorized = 0,
nr_irq_safe = 0, nr_irq_unsafe = 0,
nr_softirq_safe = 0, nr_softirq_unsafe = 0,

@@ -211,6 +210,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
sum_forward_deps = 0;
#ifdef CONFIG_PROVE_LOCKING
struct lock_class *class;
list_for_each_entry(class, &all_lock_classes, lock_entry) {
if (class->usage_mask == 0)
@@ -908,6 +908,10 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
might_sleep();
#ifdef CONFIG_DEBUG_MUTEXES
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
#endif
ww = container_of(lock, struct ww_mutex, base);
if (use_ww_ctx && ww_ctx) {
if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))

@@ -1379,8 +1383,13 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock,
*/
int __sched mutex_trylock(struct mutex *lock)
{
bool locked = __mutex_trylock(lock);
bool locked;
#ifdef CONFIG_DEBUG_MUTEXES
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
#endif
locked = __mutex_trylock(lock);
if (locked)
mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
@@ -666,7 +666,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem,
preempt_disable();
rcu_read_lock();
owner = rwsem_owner_flags(sem, &flags);
if ((flags & nonspinnable) || (owner && !owner_on_cpu(owner)))
/*
* Don't check the read-owner as the entry may be stale.
*/
if ((flags & nonspinnable) ||
(owner && !(flags & RWSEM_READER_OWNED) && !owner_on_cpu(owner)))
ret = false;
rcu_read_unlock();
preempt_enable();

@@ -1000,6 +1004,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
atomic_long_add(-RWSEM_READER_BIAS, &sem->count);
adjustment = 0;
if (rwsem_optimistic_spin(sem, false)) {
/* rwsem_optimistic_spin() implies ACQUIRE on success */
/*
* Wake up other readers in the wait list if the front
* waiter is a reader.

@@ -1014,6 +1019,7 @@ rwsem_down_read_slowpath(struct rw_semaphore *sem, int state)
}
return sem;
} else if (rwsem_reader_phase_trylock(sem, waiter.last_rowner)) {
/* rwsem_reader_phase_trylock() implies ACQUIRE on success */
return sem;
}

@@ -1032,6 +1038,8 @@ queue:
*/
if (adjustment && !(atomic_long_read(&sem->count) &
(RWSEM_WRITER_MASK | RWSEM_FLAG_HANDOFF))) {
/* Provide lock ACQUIRE */
smp_acquire__after_ctrl_dep();
raw_spin_unlock_irq(&sem->wait_lock);
rwsem_set_reader_owned(sem);
lockevent_inc(rwsem_rlock_fast);

@@ -1065,15 +1073,18 @@ queue:
wake_up_q(&wake_q);
/* wait to be given the lock */
while (true) {
for (;;) {
set_current_state(state);
if (!waiter.task)
if (!smp_load_acquire(&waiter.task)) {
/* Matches rwsem_mark_wake()'s smp_store_release(). */
break;
}
if (signal_pending_state(state, current)) {
raw_spin_lock_irq(&sem->wait_lock);
if (waiter.task)
goto out_nolock;
raw_spin_unlock_irq(&sem->wait_lock);
/* Ordered by sem->wait_lock against rwsem_mark_wake(). */
break;
}
schedule();

@@ -1083,6 +1094,7 @@ queue:
__set_current_state(TASK_RUNNING);
lockevent_inc(rwsem_rlock);
return sem;
out_nolock:
list_del(&waiter.list);
if (list_empty(&sem->wait_list)) {

@@ -1123,8 +1135,10 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
/* do optimistic spinning and steal lock if possible */
if (rwsem_can_spin_on_owner(sem, RWSEM_WR_NONSPINNABLE) &&
rwsem_optimistic_spin(sem, true))
rwsem_optimistic_spin(sem, true)) {
/* rwsem_optimistic_spin() implies ACQUIRE on success */
return sem;
}
/*
* Disable reader optimistic spinning for this rwsem after

@@ -1184,9 +1198,11 @@ rwsem_down_write_slowpath(struct rw_semaphore *sem, int state)
wait:
/* wait until we successfully acquire the lock */
set_current_state(state);
while (true) {
if (rwsem_try_write_lock(sem, wstate))
for (;;) {
if (rwsem_try_write_lock(sem, wstate)) {
/* rwsem_try_write_lock() implies ACQUIRE on success */
break;
}
raw_spin_unlock_irq(&sem->wait_lock);
@@ -1,426 +0,0 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
|
||||
#include <linux/device.h>
|
||||
#include <linux/io.h>
|
||||
#include <linux/kasan.h>
|
||||
#include <linux/memory_hotplug.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/pfn_t.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/wait_bit.h>
|
||||
#include <linux/xarray.h>
|
||||
|
||||
static DEFINE_XARRAY(pgmap_array);
|
||||
#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1)
|
||||
#define SECTION_SIZE (1UL << PA_SECTION_SHIFT)
|
||||
|
||||
#ifdef CONFIG_DEV_PAGEMAP_OPS
|
||||
DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
|
||||
EXPORT_SYMBOL(devmap_managed_key);
|
||||
static atomic_t devmap_managed_enable;
|
||||
|
||||
static void devmap_managed_enable_put(void)
|
||||
{
|
||||
if (atomic_dec_and_test(&devmap_managed_enable))
|
||||
static_branch_disable(&devmap_managed_key);
|
||||
}
|
||||
|
||||
static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
|
||||
{
|
||||
if (!pgmap->ops || !pgmap->ops->page_free) {
|
||||
WARN(1, "Missing page_free method\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (atomic_inc_return(&devmap_managed_enable) == 1)
|
||||
static_branch_enable(&devmap_managed_key);
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
static void devmap_managed_enable_put(void)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_DEV_PAGEMAP_OPS */
|
||||
|
||||
static void pgmap_array_delete(struct resource *res)
|
||||
{
|
||||
xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
|
||||
NULL, GFP_KERNEL);
|
||||
synchronize_rcu();
|
||||
}
|
||||
|
||||
static unsigned long pfn_first(struct dev_pagemap *pgmap)
|
||||
{
|
||||
return PHYS_PFN(pgmap->res.start) +
|
||||
vmem_altmap_offset(pgmap_altmap(pgmap));
|
||||
}
|
||||
|
||||
static unsigned long pfn_end(struct dev_pagemap *pgmap)
|
||||
{
|
||||
const struct resource *res = &pgmap->res;
|
||||
|
||||
return (res->start + resource_size(res)) >> PAGE_SHIFT;
|
||||
}
|
||||
|
||||
static unsigned long pfn_next(unsigned long pfn)
|
||||
{
|
||||
if (pfn % 1024 == 0)
|
||||
cond_resched();
|
||||
return pfn + 1;
|
||||
}
|
||||
|
||||
#define for_each_device_pfn(pfn, map) \
|
||||
for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
|
||||
|
||||
static void dev_pagemap_kill(struct dev_pagemap *pgmap)
|
||||
{
|
||||
if (pgmap->ops && pgmap->ops->kill)
|
||||
pgmap->ops->kill(pgmap);
|
||||
else
|
||||
percpu_ref_kill(pgmap->ref);
|
||||
}
|
||||
|
||||
static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
|
||||
{
|
||||
if (pgmap->ops && pgmap->ops->cleanup) {
|
||||
pgmap->ops->cleanup(pgmap);
|
||||
} else {
|
||||
wait_for_completion(&pgmap->done);
|
||||
percpu_ref_exit(pgmap->ref);
|
||||
}
|
||||
}
|
||||
|
||||
void memunmap_pages(struct dev_pagemap *pgmap)
|
||||
{
|
||||
struct resource *res = &pgmap->res;
|
||||
unsigned long pfn;
|
||||
int nid;
|
||||
|
||||
dev_pagemap_kill(pgmap);
|
||||
for_each_device_pfn(pfn, pgmap)
|
||||
put_page(pfn_to_page(pfn));
|
||||
dev_pagemap_cleanup(pgmap);
|
||||
|
||||
/* pages are dead and unused, undo the arch mapping */
|
||||
nid = page_to_nid(pfn_to_page(PHYS_PFN(res->start)));
|
||||
|
||||
mem_hotplug_begin();
|
||||
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
|
||||
pfn = PHYS_PFN(res->start);
|
||||
__remove_pages(page_zone(pfn_to_page(pfn)), pfn,
|
||||
PHYS_PFN(resource_size(res)), NULL);
|
||||
} else {
|
||||
arch_remove_memory(nid, res->start, resource_size(res),
|
||||
pgmap_altmap(pgmap));
|
||||
kasan_remove_zero_shadow(__va(res->start), resource_size(res));
|
||||
}
|
||||
mem_hotplug_done();
|
||||
|
||||
untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
|
||||
pgmap_array_delete(res);
|
||||
WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
|
||||
devmap_managed_enable_put();
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(memunmap_pages);
|
||||
|
||||
static void devm_memremap_pages_release(void *data)
|
||||
{
|
||||
memunmap_pages(data);
|
||||
}
|
||||
|
||||
static void dev_pagemap_percpu_release(struct percpu_ref *ref)
|
||||
{
|
||||
struct dev_pagemap *pgmap =
|
||||
container_of(ref, struct dev_pagemap, internal_ref);
|
||||
|
||||
complete(&pgmap->done);
|
||||
}
|
||||
|
||||
/*
|
||||
* Not device managed version of dev_memremap_pages, undone by
|
||||
* memunmap_pages(). Please use dev_memremap_pages if you have a struct
|
||||
* device available.
|
||||
*/
|
||||
void *memremap_pages(struct dev_pagemap *pgmap, int nid)
|
||||
{
|
||||
struct resource *res = &pgmap->res;
|
||||
struct dev_pagemap *conflict_pgmap;
|
||||
struct mhp_restrictions restrictions = {
|
||||
/*
|
||||
* We do not want any optional features only our own memmap
|
||||
*/
|
||||
.altmap = pgmap_altmap(pgmap),
|
||||
};
|
||||
pgprot_t pgprot = PAGE_KERNEL;
|
||||
int error, is_ram;
|
||||
bool need_devmap_managed = true;
|
||||
|
||||
switch (pgmap->type) {
|
||||
case MEMORY_DEVICE_PRIVATE:
|
||||
if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
|
||||
WARN(1, "Device private memory not supported\n");
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
if (!pgmap->ops || !pgmap->ops->migrate_to_ram) {
|
||||
WARN(1, "Missing migrate_to_ram method\n");
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
break;
|
||||
case MEMORY_DEVICE_FS_DAX:
|
||||
if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
|
||||
IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
|
||||
WARN(1, "File system DAX not supported\n");
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
break;
|
||||
case MEMORY_DEVICE_DEVDAX:
|
||||
case MEMORY_DEVICE_PCI_P2PDMA:
|
||||
need_devmap_managed = false;
|
||||
break;
|
||||
default:
|
||||
WARN(1, "Invalid pgmap type %d\n", pgmap->type);
|
||||
break;
|
||||
}
|
||||
|
||||
if (!pgmap->ref) {
|
||||
if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
init_completion(&pgmap->done);
|
||||
error = percpu_ref_init(&pgmap->internal_ref,
|
||||
dev_pagemap_percpu_release, 0, GFP_KERNEL);
|
||||
if (error)
|
||||
return ERR_PTR(error);
|
||||
pgmap->ref = &pgmap->internal_ref;
|
||||
} else {
|
||||
if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
|
||||
WARN(1, "Missing reference count teardown definition\n");
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
}
|
||||
|
||||
if (need_devmap_managed) {
|
||||
error = devmap_managed_enable_get(pgmap);
|
||||
if (error)
|
||||
return ERR_PTR(error);
|
||||
}
|
||||
|
||||
conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL);
|
||||
if (conflict_pgmap) {
|
||||
WARN(1, "Conflicting mapping in same section\n");
|
||||
put_dev_pagemap(conflict_pgmap);
|
||||
error = -ENOMEM;
|
||||
goto err_array;
|
||||
}
|
||||
|
||||
conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL);
|
||||
if (conflict_pgmap) {
|
||||
WARN(1, "Conflicting mapping in same section\n");
|
||||
put_dev_pagemap(conflict_pgmap);
|
||||
error = -ENOMEM;
|
||||
goto err_array;
|
||||
}
|
||||
|
||||
is_ram = region_intersects(res->start, resource_size(res),
|
||||
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
|
||||
|
||||
if (is_ram != REGION_DISJOINT) {
|
||||
WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
|
||||
is_ram == REGION_MIXED ? "mixed" : "ram", res);
|
||||
error = -ENXIO;
|
||||
goto err_array;
|
||||
}
|
||||
|
||||
error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
|
||||
PHYS_PFN(res->end), pgmap, GFP_KERNEL));
|
||||
if (error)
|
||||
goto err_array;
|
||||
|
||||
if (nid < 0)
|
||||
nid = numa_mem_id();
|
||||
|
||||
error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(res->start), 0,
|
||||
resource_size(res));
|
||||
if (error)
|
||||
goto err_pfn_remap;
|
||||
|
||||
mem_hotplug_begin();
|
||||
|
||||
/*
|
||||
* For device private memory we call add_pages() as we only need to
|
||||
* allocate and initialize struct page for the device memory. More-
|
||||
* over the device memory is un-accessible thus we do not want to
|
||||
* create a linear mapping for the memory like arch_add_memory()
|
||||
* would do.
|
||||
*
|
||||
* For all other device memory types, which are accessible by
|
||||
* the CPU, we do want the linear mapping and thus use
|
||||
* arch_add_memory().
|
||||
*/
|
||||
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
|
||||
error = add_pages(nid, PHYS_PFN(res->start),
|
||||
PHYS_PFN(resource_size(res)), &restrictions);
|
||||
} else {
|
||||
error = kasan_add_zero_shadow(__va(res->start), resource_size(res));
|
||||
if (error) {
|
||||
mem_hotplug_done();
|
||||
goto err_kasan;
|
||||
}
|
||||
|
||||
error = arch_add_memory(nid, res->start, resource_size(res),
|
||||
&restrictions);
|
||||
}
|
||||
|
||||
if (!error) {
|
||||
struct zone *zone;
|
||||
|
||||
zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
|
||||
move_pfn_range_to_zone(zone, PHYS_PFN(res->start),
|
||||
PHYS_PFN(resource_size(res)), restrictions.altmap);
|
||||
}
|
||||
|
||||
mem_hotplug_done();
|
||||
if (error)
|
||||
goto err_add_memory;
|
||||
|
||||
/*
|
||||
* Initialization of the pages has been deferred until now in order
|
||||
* to allow us to do the work while not holding the hotplug lock.
|
||||
*/
|
||||
memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
|
||||
PHYS_PFN(res->start),
|
||||
PHYS_PFN(resource_size(res)), pgmap);
|
||||
percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
|
||||
return __va(res->start);
|
||||
|
||||
err_add_memory:
|
||||
kasan_remove_zero_shadow(__va(res->start), resource_size(res));
|
||||
err_kasan:
|
||||
untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
|
||||
err_pfn_remap:
|
||||
pgmap_array_delete(res);
|
||||
err_array:
|
||||
dev_pagemap_kill(pgmap);
|
||||
dev_pagemap_cleanup(pgmap);
|
||||
devmap_managed_enable_put();
|
||||
return ERR_PTR(error);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(memremap_pages);
|
||||
|
||||
/**
|
||||
* devm_memremap_pages - remap and provide memmap backing for the given resource
|
||||
* @dev: hosting device for @res
|
||||
* @pgmap: pointer to a struct dev_pagemap
|
||||
*
|
||||
* Notes:
|
||||
* 1/ At a minimum the res and type members of @pgmap must be initialized
|
||||
* by the caller before passing it to this function
|
||||
*
|
||||
* 2/ The altmap field may optionally be initialized, in which case
|
||||
* PGMAP_ALTMAP_VALID must be set in pgmap->flags.
|
||||
*
|
||||
* 3/ The ref field may optionally be provided, in which pgmap->ref must be
|
||||
* 'live' on entry and will be killed and reaped at
|
||||
* devm_memremap_pages_release() time, or if this routine fails.
|
||||
*
|
||||
* 4/ res is expected to be a host memory range that could feasibly be
|
||||
* treated as a "System RAM" range, i.e. not a device mmio range, but
|
||||
* this is not enforced.
|
||||
*/
|
||||
void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
|
||||
{
|
||||
int error;
|
||||
void *ret;
|
||||
|
||||
ret = memremap_pages(pgmap, dev_to_node(dev));
|
||||
if (IS_ERR(ret))
|
||||
return ret;
|
||||
|
||||
error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
|
||||
pgmap);
|
||||
if (error)
|
||||
return ERR_PTR(error);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(devm_memremap_pages);
|
||||
|
||||
void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
|
||||
{
|
||||
devm_release_action(dev, devm_memremap_pages_release, pgmap);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(devm_memunmap_pages);
|
||||
|
||||
unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
|
||||
{
|
||||
/* number of pfns from base where pfn_to_page() is valid */
|
||||
if (altmap)
|
||||
return altmap->reserve + altmap->free;
|
||||
return 0;
|
||||
}
|
||||
|
||||
void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
|
||||
{
|
||||
altmap->alloc -= nr_pfns;
|
||||
}
|
||||
|
||||
/**
|
||||
* get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
|
||||
* @pfn: page frame number to lookup page_map
|
||||
* @pgmap: optional known pgmap that already has a reference
|
||||
*
|
||||
* If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap
|
||||
* is non-NULL but does not cover @pfn the reference to it will be released.
|
||||
*/
|
||||
struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
|
||||
struct dev_pagemap *pgmap)
|
||||
{
|
||||
resource_size_t phys = PFN_PHYS(pfn);
|
||||
|
||||
/*
|
||||
* In the cached case we're already holding a live reference.
|
||||
*/
|
||||
if (pgmap) {
|
||||
if (phys >= pgmap->res.start && phys <= pgmap->res.end)
|
||||
return pgmap;
|
||||
put_dev_pagemap(pgmap);
|
||||
}
|
||||
|
||||
/* fall back to slow path lookup */
|
||||
rcu_read_lock();
|
||||
pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
|
||||
if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
|
||||
pgmap = NULL;
|
||||
rcu_read_unlock();
|
||||
|
||||
return pgmap;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(get_dev_pagemap);
|
||||
|
||||
#ifdef CONFIG_DEV_PAGEMAP_OPS
|
||||
void __put_devmap_managed_page(struct page *page)
|
||||
{
|
||||
int count = page_ref_dec_return(page);
|
||||
|
||||
/*
|
||||
* If refcount is 1 then page is freed and refcount is stable as nobody
|
||||
* holds a reference on the page.
|
||||
*/
|
||||
if (count == 1) {
|
||||
/* Clear Active bit in case of parallel mark_page_accessed */
|
||||
__ClearPageActive(page);
|
||||
__ClearPageWaiters(page);
|
||||
|
||||
mem_cgroup_uncharge(page);
|
||||
|
||||
page->pgmap->ops->page_free(page);
|
||||
} else if (!count)
|
||||
__put_page(page);
|
||||
}
|
||||
EXPORT_SYMBOL(__put_devmap_managed_page);
|
||||
#endif /* CONFIG_DEV_PAGEMAP_OPS */
|
@@ -40,6 +40,7 @@ struct sugov_policy {
|
||||
struct task_struct *thread;
|
||||
bool work_in_progress;
|
||||
|
||||
bool limits_changed;
|
||||
bool need_freq_update;
|
||||
};
|
||||
|
||||
@@ -89,8 +90,11 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
|
||||
!cpufreq_this_cpu_can_update(sg_policy->policy))
|
||||
return false;
|
||||
|
||||
if (unlikely(sg_policy->need_freq_update))
|
||||
if (unlikely(sg_policy->limits_changed)) {
|
||||
sg_policy->limits_changed = false;
|
||||
sg_policy->need_freq_update = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
delta_ns = time - sg_policy->last_freq_update_time;
|
||||
|
||||
@@ -437,7 +441,7 @@ static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; }
|
||||
static inline void ignore_dl_rate_limit(struct sugov_cpu *sg_cpu, struct sugov_policy *sg_policy)
|
||||
{
|
||||
if (cpu_bw_dl(cpu_rq(sg_cpu->cpu)) > sg_cpu->bw_dl)
|
||||
sg_policy->need_freq_update = true;
|
||||
sg_policy->limits_changed = true;
|
||||
}
|
||||
|
||||
static void sugov_update_single(struct update_util_data *hook, u64 time,
|
||||
@@ -457,7 +461,8 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
|
||||
if (!sugov_should_update_freq(sg_policy, time))
|
||||
return;
|
||||
|
||||
busy = sugov_cpu_is_busy(sg_cpu);
|
||||
/* Limits may have changed, don't skip frequency update */
|
||||
busy = !sg_policy->need_freq_update && sugov_cpu_is_busy(sg_cpu);
|
||||
|
||||
util = sugov_get_util(sg_cpu);
|
||||
max = sg_cpu->max;
|
||||
@@ -831,6 +836,7 @@ static int sugov_start(struct cpufreq_policy *policy)
|
||||
sg_policy->last_freq_update_time = 0;
|
||||
sg_policy->next_freq = 0;
|
||||
sg_policy->work_in_progress = false;
|
||||
sg_policy->limits_changed = false;
|
||||
sg_policy->need_freq_update = false;
|
||||
sg_policy->cached_raw_freq = 0;
|
||||
|
||||
@@ -879,7 +885,7 @@ static void sugov_limits(struct cpufreq_policy *policy)
|
||||
mutex_unlock(&sg_policy->work_lock);
|
||||
}
|
||||
|
||||
sg_policy->need_freq_update = true;
|
||||
sg_policy->limits_changed = true;
|
||||
}
|
||||
|
||||
struct cpufreq_governor schedutil_gov = {
|
||||
|
@@ -2088,17 +2088,13 @@ retry:
|
||||
}
|
||||
|
||||
deactivate_task(rq, next_task, 0);
|
||||
sub_running_bw(&next_task->dl, &rq->dl);
|
||||
sub_rq_bw(&next_task->dl, &rq->dl);
|
||||
set_task_cpu(next_task, later_rq->cpu);
|
||||
add_rq_bw(&next_task->dl, &later_rq->dl);
|
||||
|
||||
/*
|
||||
* Update the later_rq clock here, because the clock is used
|
||||
* by the cpufreq_update_util() inside __add_running_bw().
|
||||
*/
|
||||
update_rq_clock(later_rq);
|
||||
add_running_bw(&next_task->dl, &later_rq->dl);
|
||||
activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
|
||||
ret = 1;
|
||||
|
||||
@@ -2186,11 +2182,7 @@ static void pull_dl_task(struct rq *this_rq)
|
||||
resched = true;
|
||||
|
||||
deactivate_task(src_rq, p, 0);
|
||||
sub_running_bw(&p->dl, &src_rq->dl);
|
||||
sub_rq_bw(&p->dl, &src_rq->dl);
|
||||
set_task_cpu(p, this_cpu);
|
||||
add_rq_bw(&p->dl, &this_rq->dl);
|
||||
add_running_bw(&p->dl, &this_rq->dl);
|
||||
activate_task(this_rq, p, 0);
|
||||
dmin = p->dl.deadline;
|
||||
|
||||
|
@@ -1086,6 +1086,21 @@ struct numa_group {
|
||||
unsigned long faults[0];
|
||||
};
|
||||
|
||||
/*
|
||||
* For functions that can be called in multiple contexts that permit reading
|
||||
* ->numa_group (see struct task_struct for locking rules).
|
||||
*/
|
||||
static struct numa_group *deref_task_numa_group(struct task_struct *p)
|
||||
{
|
||||
return rcu_dereference_check(p->numa_group, p == current ||
|
||||
(lockdep_is_held(&task_rq(p)->lock) && !READ_ONCE(p->on_cpu)));
|
||||
}
|
||||
|
||||
static struct numa_group *deref_curr_numa_group(struct task_struct *p)
|
||||
{
|
||||
return rcu_dereference_protected(p->numa_group, p == current);
|
||||
}
|
||||
|
||||
static inline unsigned long group_faults_priv(struct numa_group *ng);
|
||||
static inline unsigned long group_faults_shared(struct numa_group *ng);
|
||||
|
||||
@@ -1129,10 +1144,12 @@ static unsigned int task_scan_start(struct task_struct *p)
|
||||
{
|
||||
unsigned long smin = task_scan_min(p);
|
||||
unsigned long period = smin;
|
||||
struct numa_group *ng;
|
||||
|
||||
/* Scale the maximum scan period with the amount of shared memory. */
|
||||
if (p->numa_group) {
|
||||
struct numa_group *ng = p->numa_group;
|
||||
rcu_read_lock();
|
||||
ng = rcu_dereference(p->numa_group);
|
||||
if (ng) {
|
||||
unsigned long shared = group_faults_shared(ng);
|
||||
unsigned long private = group_faults_priv(ng);
|
||||
|
||||
@@ -1140,6 +1157,7 @@ static unsigned int task_scan_start(struct task_struct *p)
|
||||
period *= shared + 1;
|
||||
period /= private + shared + 1;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
return max(smin, period);
|
||||
}
|
||||
@@ -1148,13 +1166,14 @@ static unsigned int task_scan_max(struct task_struct *p)
|
||||
{
|
||||
unsigned long smin = task_scan_min(p);
|
||||
unsigned long smax;
|
||||
struct numa_group *ng;
|
||||
|
||||
/* Watch for min being lower than max due to floor calculations */
|
||||
smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
|
||||
|
||||
/* Scale the maximum scan period with the amount of shared memory. */
|
||||
if (p->numa_group) {
|
||||
struct numa_group *ng = p->numa_group;
|
||||
ng = deref_curr_numa_group(p);
|
||||
if (ng) {
|
||||
unsigned long shared = group_faults_shared(ng);
|
||||
unsigned long private = group_faults_priv(ng);
|
||||
unsigned long period = smax;
|
||||
@@ -1186,7 +1205,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
|
||||
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
|
||||
p->numa_work.next = &p->numa_work;
|
||||
p->numa_faults = NULL;
|
||||
p->numa_group = NULL;
|
||||
RCU_INIT_POINTER(p->numa_group, NULL);
|
||||
p->last_task_numa_placement = 0;
|
||||
p->last_sum_exec_runtime = 0;
|
||||
|
||||
@@ -1233,7 +1252,16 @@ static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
|
||||
|
||||
pid_t task_numa_group_id(struct task_struct *p)
|
||||
{
|
||||
return p->numa_group ? p->numa_group->gid : 0;
|
||||
struct numa_group *ng;
|
||||
pid_t gid = 0;
|
||||
|
||||
rcu_read_lock();
|
||||
ng = rcu_dereference(p->numa_group);
|
||||
if (ng)
|
||||
gid = ng->gid;
|
||||
rcu_read_unlock();
|
||||
|
||||
return gid;
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -1258,11 +1286,13 @@ static inline unsigned long task_faults(struct task_struct *p, int nid)
|
||||
|
||||
static inline unsigned long group_faults(struct task_struct *p, int nid)
|
||||
{
|
||||
if (!p->numa_group)
|
||||
struct numa_group *ng = deref_task_numa_group(p);
|
||||
|
||||
if (!ng)
|
||||
return 0;
|
||||
|
||||
return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
|
||||
p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
|
||||
return ng->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
|
||||
ng->faults[task_faults_idx(NUMA_MEM, nid, 1)];
|
||||
}
|
||||
|
||||
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
|
||||
@@ -1400,12 +1430,13 @@ static inline unsigned long task_weight(struct task_struct *p, int nid,
|
||||
static inline unsigned long group_weight(struct task_struct *p, int nid,
|
||||
int dist)
|
||||
{
|
||||
struct numa_group *ng = deref_task_numa_group(p);
|
||||
unsigned long faults, total_faults;
|
||||
|
||||
if (!p->numa_group)
|
||||
if (!ng)
|
||||
return 0;
|
||||
|
||||
total_faults = p->numa_group->total_faults;
|
||||
total_faults = ng->total_faults;
|
||||
|
||||
if (!total_faults)
|
||||
return 0;
|
||||
@@ -1419,7 +1450,7 @@ static inline unsigned long group_weight(struct task_struct *p, int nid,
|
||||
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
|
||||
int src_nid, int dst_cpu)
|
||||
{
|
||||
struct numa_group *ng = p->numa_group;
|
||||
struct numa_group *ng = deref_curr_numa_group(p);
|
||||
int dst_nid = cpu_to_node(dst_cpu);
|
||||
int last_cpupid, this_cpupid;
|
||||
|
||||
@@ -1600,13 +1631,14 @@ static bool load_too_imbalanced(long src_load, long dst_load,
|
||||
static void task_numa_compare(struct task_numa_env *env,
|
||||
long taskimp, long groupimp, bool maymove)
|
||||
{
|
||||
struct numa_group *cur_ng, *p_ng = deref_curr_numa_group(env->p);
|
||||
struct rq *dst_rq = cpu_rq(env->dst_cpu);
|
||||
long imp = p_ng ? groupimp : taskimp;
|
||||
struct task_struct *cur;
|
||||
long src_load, dst_load;
|
||||
long load;
|
||||
long imp = env->p->numa_group ? groupimp : taskimp;
|
||||
long moveimp = imp;
|
||||
int dist = env->dist;
|
||||
long moveimp = imp;
|
||||
long load;
|
||||
|
||||
if (READ_ONCE(dst_rq->numa_migrate_on))
|
||||
return;
|
||||
@@ -1645,21 +1677,22 @@ static void task_numa_compare(struct task_numa_env *env,
|
||||
* If dst and source tasks are in the same NUMA group, or not
|
||||
* in any group then look only at task weights.
|
||||
*/
|
||||
if (cur->numa_group == env->p->numa_group) {
|
||||
cur_ng = rcu_dereference(cur->numa_group);
|
||||
if (cur_ng == p_ng) {
|
||||
imp = taskimp + task_weight(cur, env->src_nid, dist) -
|
||||
task_weight(cur, env->dst_nid, dist);
|
||||
/*
|
||||
* Add some hysteresis to prevent swapping the
|
||||
* tasks within a group over tiny differences.
|
||||
*/
|
||||
if (cur->numa_group)
|
||||
if (cur_ng)
|
||||
imp -= imp / 16;
|
||||
} else {
|
||||
/*
|
||||
* Compare the group weights. If a task is all by itself
|
||||
* (not part of a group), use the task weight instead.
|
||||
*/
|
||||
if (cur->numa_group && env->p->numa_group)
|
||||
if (cur_ng && p_ng)
|
||||
imp += group_weight(cur, env->src_nid, dist) -
|
||||
group_weight(cur, env->dst_nid, dist);
|
||||
else
|
||||
@@ -1757,11 +1790,12 @@ static int task_numa_migrate(struct task_struct *p)
|
||||
.best_imp = 0,
|
||||
.best_cpu = -1,
|
||||
};
|
||||
struct sched_domain *sd;
|
||||
struct rq *best_rq;
|
||||
unsigned long taskweight, groupweight;
|
||||
int nid, ret, dist;
|
||||
struct sched_domain *sd;
|
||||
long taskimp, groupimp;
|
||||
struct numa_group *ng;
|
||||
struct rq *best_rq;
|
||||
int nid, ret, dist;
|
||||
|
||||
/*
|
||||
* Pick the lowest SD_NUMA domain, as that would have the smallest
|
||||
@@ -1807,7 +1841,8 @@ static int task_numa_migrate(struct task_struct *p)
|
||||
* multiple NUMA nodes; in order to better consolidate the group,
|
||||
* we need to check other locations.
|
||||
*/
|
||||
if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
|
||||
ng = deref_curr_numa_group(p);
|
||||
if (env.best_cpu == -1 || (ng && ng->active_nodes > 1)) {
|
||||
for_each_online_node(nid) {
|
||||
if (nid == env.src_nid || nid == p->numa_preferred_nid)
|
||||
continue;
|
||||
@@ -1840,7 +1875,7 @@ static int task_numa_migrate(struct task_struct *p)
|
||||
* A task that migrated to a second choice node will be better off
|
||||
* trying for a better one later. Do not set the preferred node here.
|
||||
*/
|
||||
if (p->numa_group) {
|
||||
if (ng) {
|
||||
if (env.best_cpu == -1)
|
||||
nid = env.src_nid;
|
||||
else
|
||||
@@ -2135,6 +2170,7 @@ static void task_numa_placement(struct task_struct *p)
|
||||
unsigned long total_faults;
|
||||
u64 runtime, period;
|
||||
spinlock_t *group_lock = NULL;
|
||||
struct numa_group *ng;
|
||||
|
||||
/*
|
||||
* The p->mm->numa_scan_seq field gets updated without
|
||||
@@ -2152,8 +2188,9 @@ static void task_numa_placement(struct task_struct *p)
|
||||
runtime = numa_get_avg_runtime(p, &period);
|
||||
|
||||
/* If the task is part of a group prevent parallel updates to group stats */
|
||||
if (p->numa_group) {
|
||||
group_lock = &p->numa_group->lock;
|
||||
ng = deref_curr_numa_group(p);
|
||||
if (ng) {
|
||||
group_lock = &ng->lock;
|
||||
spin_lock_irq(group_lock);
|
||||
}
|
||||
|
||||
@@ -2194,7 +2231,7 @@ static void task_numa_placement(struct task_struct *p)
|
||||
p->numa_faults[cpu_idx] += f_diff;
|
||||
faults += p->numa_faults[mem_idx];
|
||||
p->total_numa_faults += diff;
|
||||
if (p->numa_group) {
|
||||
if (ng) {
|
||||
/*
|
||||
* safe because we can only change our own group
|
||||
*
|
||||
@@ -2202,14 +2239,14 @@ static void task_numa_placement(struct task_struct *p)
|
||||
* nid and priv in a specific region because it
|
||||
* is at the beginning of the numa_faults array.
|
||||
*/
|
||||
p->numa_group->faults[mem_idx] += diff;
|
||||
p->numa_group->faults_cpu[mem_idx] += f_diff;
|
||||
p->numa_group->total_faults += diff;
|
||||
group_faults += p->numa_group->faults[mem_idx];
|
||||
ng->faults[mem_idx] += diff;
|
||||
ng->faults_cpu[mem_idx] += f_diff;
|
||||
ng->total_faults += diff;
|
||||
group_faults += ng->faults[mem_idx];
|
||||
}
|
||||
}
|
||||
|
||||
if (!p->numa_group) {
|
||||
if (!ng) {
|
||||
if (faults > max_faults) {
|
||||
max_faults = faults;
|
||||
max_nid = nid;
|
||||
@@ -2220,8 +2257,8 @@ static void task_numa_placement(struct task_struct *p)
|
||||
}
|
||||
}
|
||||
|
||||
if (p->numa_group) {
|
||||
numa_group_count_active_nodes(p->numa_group);
|
||||
if (ng) {
|
||||
numa_group_count_active_nodes(ng);
|
||||
spin_unlock_irq(group_lock);
|
||||
max_nid = preferred_group_nid(p, max_nid);
|
||||
}
|
||||
@@ -2255,7 +2292,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
|
||||
int cpu = cpupid_to_cpu(cpupid);
|
||||
int i;
|
||||
|
||||
if (unlikely(!p->numa_group)) {
|
||||
if (unlikely(!deref_curr_numa_group(p))) {
|
||||
unsigned int size = sizeof(struct numa_group) +
|
||||
4*nr_node_ids*sizeof(unsigned long);
|
||||
|
||||
@@ -2291,7 +2328,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
|
||||
if (!grp)
|
||||
goto no_join;
|
||||
|
||||
my_grp = p->numa_group;
|
||||
my_grp = deref_curr_numa_group(p);
|
||||
if (grp == my_grp)
|
||||
goto no_join;
|
||||
|
||||
@@ -2353,13 +2390,24 @@ no_join:
|
||||
return;
|
||||
}
|
||||
|
||||
void task_numa_free(struct task_struct *p)
|
||||
/*
|
||||
* Get rid of NUMA staticstics associated with a task (either current or dead).
|
||||
* If @final is set, the task is dead and has reached refcount zero, so we can
|
||||
* safely free all relevant data structures. Otherwise, there might be
|
||||
* concurrent reads from places like load balancing and procfs, and we should
|
||||
* reset the data back to default state without freeing ->numa_faults.
|
||||
*/
|
||||
void task_numa_free(struct task_struct *p, bool final)
|
||||
{
|
||||
struct numa_group *grp = p->numa_group;
|
||||
void *numa_faults = p->numa_faults;
|
||||
/* safe: p either is current or is being freed by current */
|
||||
struct numa_group *grp = rcu_dereference_raw(p->numa_group);
|
||||
unsigned long *numa_faults = p->numa_faults;
|
||||
unsigned long flags;
|
||||
int i;
|
||||
|
||||
if (!numa_faults)
|
||||
return;
|
||||
|
||||
if (grp) {
|
||||
spin_lock_irqsave(&grp->lock, flags);
|
||||
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
|
||||
@@ -2372,8 +2420,14 @@ void task_numa_free(struct task_struct *p)
|
||||
put_numa_group(grp);
|
||||
}
|
||||
|
||||
p->numa_faults = NULL;
|
||||
kfree(numa_faults);
|
||||
if (final) {
|
||||
p->numa_faults = NULL;
|
||||
kfree(numa_faults);
|
||||
} else {
|
||||
p->total_numa_faults = 0;
|
||||
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
|
||||
numa_faults[i] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -2426,7 +2480,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
|
||||
* actively using should be counted as local. This allows the
|
||||
* scan rate to slow down when a workload has settled down.
|
||||
*/
|
||||
ng = p->numa_group;
|
||||
ng = deref_curr_numa_group(p);
|
||||
if (!priv && !local && ng && ng->active_nodes > 1 &&
|
||||
numa_is_active_node(cpu_node, ng) &&
|
||||
numa_is_active_node(mem_node, ng))
|
||||
@@ -10444,18 +10498,22 @@ void show_numa_stats(struct task_struct *p, struct seq_file *m)
|
||||
{
|
||||
int node;
|
||||
unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
|
||||
struct numa_group *ng;
|
||||
|
||||
rcu_read_lock();
|
||||
ng = rcu_dereference(p->numa_group);
|
||||
for_each_online_node(node) {
|
||||
if (p->numa_faults) {
|
||||
tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
|
||||
tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
|
||||
}
|
||||
if (p->numa_group) {
|
||||
gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
|
||||
gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
|
||||
if (ng) {
|
||||
gsf = ng->faults[task_faults_idx(NUMA_MEM, node, 0)],
|
||||
gpf = ng->faults[task_faults_idx(NUMA_MEM, node, 1)];
|
||||
}
|
||||
print_numa_stats(m, node, tsf, tpf, gsf, gpf);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
#endif /* CONFIG_NUMA_BALANCING */
|
||||
#endif /* CONFIG_SCHED_DEBUG */
|
||||
|
@@ -1051,7 +1051,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
|
||||
|
||||
if (!rcu_access_pointer(group->poll_kworker)) {
|
||||
struct sched_param param = {
|
||||
.sched_priority = MAX_RT_PRIO - 1,
|
||||
.sched_priority = 1,
|
||||
};
|
||||
struct kthread_worker *kworker;
|
||||
|
||||
@@ -1061,7 +1061,7 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
|
||||
mutex_unlock(&group->trigger_lock);
|
||||
return ERR_CAST(kworker);
|
||||
}
|
||||
sched_setscheduler(kworker->task, SCHED_FIFO, ¶m);
|
||||
sched_setscheduler_nocheck(kworker->task, SCHED_FIFO, ¶m);
|
||||
kthread_init_delayed_work(&group->poll_work,
|
||||
psi_poll_work);
|
||||
rcu_assign_pointer(group->poll_kworker, kworker);
|
||||
|
@@ -349,7 +349,7 @@ void task_clear_jobctl_pending(struct task_struct *task, unsigned long mask)
|
||||
* @task has %JOBCTL_STOP_PENDING set and is participating in a group stop.
|
||||
* Group stop states are cleared and the group stop count is consumed if
|
||||
* %JOBCTL_STOP_CONSUME was set. If the consumption completes the group
|
||||
* stop, the appropriate %SIGNAL_* flags are set.
|
||||
* stop, the appropriate `SIGNAL_*` flags are set.
|
||||
*
|
||||
* CONTEXT:
|
||||
* Must be called with @task->sighand->siglock held.
|
||||
@@ -1885,6 +1885,7 @@ static void do_notify_pidfd(struct task_struct *task)
|
||||
{
|
||||
struct pid *pid;
|
||||
|
||||
WARN_ON(task->exit_state == 0);
|
||||
pid = task_pid(task);
|
||||
wake_up_all(&pid->wait_pidfd);
|
||||
}
|
||||
|
@@ -137,6 +137,13 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
|
||||
if (trace_recursion_test(TRACE_GRAPH_NOTRACE_BIT))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Do not trace a function if it's filtered by set_graph_notrace.
|
||||
* Make the index of ret stack negative to indicate that it should
|
||||
* ignore further functions. But it needs its own ret stack entry
|
||||
* to recover the original index in order to continue tracing after
|
||||
* returning from the function.
|
||||
*/
|
||||
if (ftrace_graph_notrace_addr(trace->func)) {
|
||||
trace_recursion_set(TRACE_GRAPH_NOTRACE_BIT);
|
||||
/*
|
||||
@@ -155,16 +162,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
|
||||
if (ftrace_graph_ignore_irqs())
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* Do not trace a function if it's filtered by set_graph_notrace.
|
||||
* Make the index of ret stack negative to indicate that it should
|
||||
* ignore further functions. But it needs its own ret stack entry
|
||||
* to recover the original index in order to continue tracing after
|
||||
* returning from the function.
|
||||
*/
|
||||
if (ftrace_graph_notrace_addr(trace->func))
|
||||
return 1;
|
||||
|
||||
/*
|
||||
* Stop here if tracing_threshold is set. We only write function return
|
||||
* events to the ring buffer.
|
||||
|