Merge branch 'linus' into sched/urgent, to pick up dependencies

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Ingo Molnar
2016-03-21 10:47:40 +01:00
986 changed files with 26926 additions and 9456 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp)
} else {
kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
__func__, bp->bp_addr);
#ifdef CONFIG_DEBUG_RODATA
if (!bp->bp_type) {
kdb_printf("Software breakpoints are unavailable.\n"
" Change the kernel CONFIG_DEBUG_RODATA=n\n"
" Boot the kernel with rodata=off\n"
" OR use hw breaks: help bph\n");
}
#endif
return 1;
}
return 0;

View File

@@ -3112,17 +3112,6 @@ done:
return rotate;
}
#ifdef CONFIG_NO_HZ_FULL
bool perf_event_can_stop_tick(void)
{
if (atomic_read(&nr_freq_events) ||
__this_cpu_read(perf_throttled_count))
return false;
else
return true;
}
#endif
void perf_event_task_tick(void)
{
struct list_head *head = this_cpu_ptr(&active_ctx_list);
@@ -3133,6 +3122,7 @@ void perf_event_task_tick(void)
__this_cpu_inc(perf_throttled_seq);
throttled = __this_cpu_xchg(perf_throttled_count, 0);
tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
perf_adjust_freq_unthr_context(ctx, throttled);
@@ -3564,6 +3554,28 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
atomic_dec(&per_cpu(perf_cgroup_events, cpu));
}
#ifdef CONFIG_NO_HZ_FULL
static DEFINE_SPINLOCK(nr_freq_lock);
#endif
static void unaccount_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
spin_lock(&nr_freq_lock);
if (atomic_dec_and_test(&nr_freq_events))
tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
spin_unlock(&nr_freq_lock);
#endif
}
static void unaccount_freq_event(void)
{
if (tick_nohz_full_enabled())
unaccount_freq_event_nohz();
else
atomic_dec(&nr_freq_events);
}
static void unaccount_event(struct perf_event *event)
{
bool dec = false;
@@ -3580,7 +3592,7 @@ static void unaccount_event(struct perf_event *event)
if (event->attr.task)
atomic_dec(&nr_task_events);
if (event->attr.freq)
atomic_dec(&nr_freq_events);
unaccount_freq_event();
if (event->attr.context_switch) {
dec = true;
atomic_dec(&nr_switch_events);
@@ -6424,9 +6436,9 @@ static int __perf_event_overflow(struct perf_event *event,
if (unlikely(throttle
&& hwc->interrupts >= max_samples_per_tick)) {
__this_cpu_inc(perf_throttled_count);
tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
hwc->interrupts = MAX_INTERRUPTS;
perf_log_throttle(event, 0);
tick_nohz_full_kick();
ret = 1;
}
}
@@ -6785,7 +6797,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
kfree_rcu(hlist, rcu_head);
}
static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
static void swevent_hlist_put_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
@@ -6797,15 +6809,15 @@ static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
mutex_unlock(&swhash->hlist_mutex);
}
static void swevent_hlist_put(struct perf_event *event)
static void swevent_hlist_put(void)
{
int cpu;
for_each_possible_cpu(cpu)
swevent_hlist_put_cpu(event, cpu);
swevent_hlist_put_cpu(cpu);
}
static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
static int swevent_hlist_get_cpu(int cpu)
{
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
int err = 0;
@@ -6828,14 +6840,13 @@ exit:
return err;
}
static int swevent_hlist_get(struct perf_event *event)
static int swevent_hlist_get(void)
{
int err;
int cpu, failed_cpu;
int err, cpu, failed_cpu;
get_online_cpus();
for_each_possible_cpu(cpu) {
err = swevent_hlist_get_cpu(event, cpu);
err = swevent_hlist_get_cpu(cpu);
if (err) {
failed_cpu = cpu;
goto fail;
@@ -6848,7 +6859,7 @@ fail:
for_each_possible_cpu(cpu) {
if (cpu == failed_cpu)
break;
swevent_hlist_put_cpu(event, cpu);
swevent_hlist_put_cpu(cpu);
}
put_online_cpus();
@@ -6864,7 +6875,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
WARN_ON(event->parent);
static_key_slow_dec(&perf_swevent_enabled[event_id]);
swevent_hlist_put(event);
swevent_hlist_put();
}
static int perf_swevent_init(struct perf_event *event)
@@ -6895,7 +6906,7 @@ static int perf_swevent_init(struct perf_event *event)
if (!event->parent) {
int err;
err = swevent_hlist_get(event);
err = swevent_hlist_get();
if (err)
return err;
@@ -7816,6 +7827,27 @@ static void account_event_cpu(struct perf_event *event, int cpu)
atomic_inc(&per_cpu(perf_cgroup_events, cpu));
}
/* Freq events need the tick to stay alive (see perf_event_task_tick). */
static void account_freq_event_nohz(void)
{
#ifdef CONFIG_NO_HZ_FULL
/* Lock so we don't race with concurrent unaccount */
spin_lock(&nr_freq_lock);
if (atomic_inc_return(&nr_freq_events) == 1)
tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
spin_unlock(&nr_freq_lock);
#endif
}
static void account_freq_event(void)
{
if (tick_nohz_full_enabled())
account_freq_event_nohz();
else
atomic_inc(&nr_freq_events);
}
static void account_event(struct perf_event *event)
{
bool inc = false;
@@ -7831,10 +7863,8 @@ static void account_event(struct perf_event *event)
atomic_inc(&nr_comm_events);
if (event->attr.task)
atomic_inc(&nr_task_events);
if (event->attr.freq) {
if (atomic_inc_return(&nr_freq_events) == 1)
tick_nohz_full_kick_all();
}
if (event->attr.freq)
account_freq_event();
if (event->attr.context_switch) {
atomic_inc(&nr_switch_events);
inc = true;
@@ -8001,6 +8031,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
}
}
/* symmetric to unaccount_event() in _free_event() */
account_event(event);
return event;
err_per_task:
@@ -8364,8 +8397,6 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
account_event(event);
/*
* Special case software events and allow them to be part of
* any hardware group.
@@ -8662,8 +8693,6 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
/* Mark owner so we could distinguish it from user events. */
event->owner = TASK_TOMBSTONE;
account_event(event);
ctx = find_get_context(event->pmu, task, event);
if (IS_ERR(ctx)) {
err = PTR_ERR(ctx);
@@ -9447,6 +9476,7 @@ ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
return 0;
}
EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
static int __init perf_event_sysfs_init(void)
{

View File

@@ -1178,6 +1178,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr)
goto free_area;
area->xol_mapping.name = "[uprobes]";
area->xol_mapping.fault = NULL;
area->xol_mapping.pages = area->pages;
area->pages[0] = alloc_page(GFP_HIGHUSER);
if (!area->pages[0])

View File

@@ -124,16 +124,16 @@
* futex_wait(futex, val);
*
* waiters++; (a)
* mb(); (A) <-- paired with -.
* |
* lock(hash_bucket(futex)); |
* |
* uval = *futex; |
* | *futex = newval;
* | sys_futex(WAKE, futex);
* | futex_wake(futex);
* |
* `-------> mb(); (B)
* smp_mb(); (A) <-- paired with -.
* |
* lock(hash_bucket(futex)); |
* |
* uval = *futex; |
* | *futex = newval;
* | sys_futex(WAKE, futex);
* | futex_wake(futex);
* |
* `--------> smp_mb(); (B)
* if (uval == val)
* queue();
* unlock(hash_bucket(futex));
@@ -334,7 +334,7 @@ static inline void futex_get_mm(union futex_key *key)
/*
* Ensure futex_get_mm() implies a full barrier such that
* get_futex_key() implies a full barrier. This is relied upon
* as full barrier (B), see the ordering comment above.
* as smp_mb(); (B), see the ordering comment above.
*/
smp_mb__after_atomic();
}
@@ -407,10 +407,10 @@ static void get_futex_key_refs(union futex_key *key)
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
ihold(key->shared.inode); /* implies MB (B) */
ihold(key->shared.inode); /* implies smp_mb(); (B) */
break;
case FUT_OFF_MMSHARED:
futex_get_mm(key); /* implies MB (B) */
futex_get_mm(key); /* implies smp_mb(); (B) */
break;
default:
/*
@@ -418,7 +418,7 @@ static void get_futex_key_refs(union futex_key *key)
* mm, therefore the only purpose of calling get_futex_key_refs
* is because we need the barrier for the lockless waiter check.
*/
smp_mb(); /* explicit MB (B) */
smp_mb(); /* explicit smp_mb(); (B) */
}
}
@@ -497,7 +497,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
if (!fshared) {
key->private.mm = mm;
key->private.address = address;
get_futex_key_refs(key); /* implies MB (B) */
get_futex_key_refs(key); /* implies smp_mb(); (B) */
return 0;
}
@@ -520,7 +520,20 @@ again:
else
err = 0;
lock_page(page);
/*
* The treatment of mapping from this point on is critical. The page
* lock protects many things but in this context the page lock
* stabilizes mapping, prevents inode freeing in the shared
* file-backed region case and guards against movement to swap cache.
*
* Strictly speaking the page lock is not needed in all cases being
* considered here and page lock forces unnecessarily serialization
* From this point on, mapping will be re-verified if necessary and
* page lock will be acquired only if it is unavoidable
*/
page = compound_head(page);
mapping = READ_ONCE(page->mapping);
/*
* If page->mapping is NULL, then it cannot be a PageAnon
* page; but it might be the ZERO_PAGE or in the gate area or
@@ -536,19 +549,31 @@ again:
* shmem_writepage move it from filecache to swapcache beneath us:
* an unlikely race, but we do need to retry for page->mapping.
*/
mapping = compound_head(page)->mapping;
if (!mapping) {
int shmem_swizzled = PageSwapCache(page);
if (unlikely(!mapping)) {
int shmem_swizzled;
/*
* Page lock is required to identify which special case above
* applies. If this is really a shmem page then the page lock
* will prevent unexpected transitions.
*/
lock_page(page);
shmem_swizzled = PageSwapCache(page) || page->mapping;
unlock_page(page);
put_page(page);
if (shmem_swizzled)
goto again;
return -EFAULT;
}
/*
* Private mappings are handled in a simple way.
*
* If the futex key is stored on an anonymous page, then the associated
* object is the mm which is implicitly pinned by the calling process.
*
* NOTE: When userspace waits on a MAP_SHARED mapping, even if
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
@@ -566,16 +591,74 @@ again:
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
get_futex_key_refs(key); /* implies smp_mb(); (B) */
} else {
struct inode *inode;
/*
* The associated futex object in this case is the inode and
* the page->mapping must be traversed. Ordinarily this should
* be stabilised under page lock but it's not strictly
* necessary in this case as we just want to pin the inode, not
* update the radix tree or anything like that.
*
* The RCU read lock is taken as the inode is finally freed
* under RCU. If the mapping still matches expectations then the
* mapping->host can be safely accessed as being a valid inode.
*/
rcu_read_lock();
if (READ_ONCE(page->mapping) != mapping) {
rcu_read_unlock();
put_page(page);
goto again;
}
inode = READ_ONCE(mapping->host);
if (!inode) {
rcu_read_unlock();
put_page(page);
goto again;
}
/*
* Take a reference unless it is about to be freed. Previously
* this reference was taken by ihold under the page lock
* pinning the inode in place so i_lock was unnecessary. The
* only way for this check to fail is if the inode was
* truncated in parallel so warn for now if this happens.
*
* We are not calling into get_futex_key_refs() in file-backed
* cases, therefore a successful atomic_inc return below will
* guarantee that get_futex_key() will still imply smp_mb(); (B).
*/
if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
rcu_read_unlock();
put_page(page);
goto again;
}
/* Should be impossible but lets be paranoid for now */
if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
err = -EFAULT;
rcu_read_unlock();
iput(inode);
goto out;
}
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
key->shared.inode = mapping->host;
key->shared.inode = inode;
key->shared.pgoff = basepage_index(page);
rcu_read_unlock();
}
get_futex_key_refs(key); /* implies MB (B) */
out:
unlock_page(page);
put_page(page);
return err;
}
@@ -1864,7 +1947,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
q->lock_ptr = &hb->lock;
spin_lock(&hb->lock); /* implies MB (A) */
spin_lock(&hb->lock); /* implies smp_mb(); (A) */
return hb;
}
@@ -1927,8 +2010,12 @@ static int unqueue_me(struct futex_q *q)
/* In the common case we don't take the spinlock, which is nice. */
retry:
lock_ptr = q->lock_ptr;
barrier();
/*
* q->lock_ptr can change between this read and the following spin_lock.
* Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
* optimizing lock_ptr out of the logic below.
*/
lock_ptr = READ_ONCE(q->lock_ptr);
if (lock_ptr != NULL) {
spin_lock(lock_ptr);
/*

View File

@@ -64,6 +64,10 @@ config IRQ_DOMAIN_HIERARCHY
bool
select IRQ_DOMAIN
# Generic IRQ IPI support
config GENERIC_IRQ_IPI
bool
# Generic MSI interrupt support
config GENERIC_MSI_IRQ
bool

View File

@@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o
obj-$(CONFIG_PM_SLEEP) += pm.o
obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o
obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o

View File

@@ -961,6 +961,7 @@ void irq_chip_mask_parent(struct irq_data *data)
data = data->parent_data;
data->chip->irq_mask(data);
}
EXPORT_SYMBOL_GPL(irq_chip_mask_parent);
/**
* irq_chip_unmask_parent - Unmask the parent interrupt
@@ -971,6 +972,7 @@ void irq_chip_unmask_parent(struct irq_data *data)
data = data->parent_data;
data->chip->irq_unmask(data);
}
EXPORT_SYMBOL_GPL(irq_chip_unmask_parent);
/**
* irq_chip_eoi_parent - Invoke EOI on the parent interrupt
@@ -981,6 +983,7 @@ void irq_chip_eoi_parent(struct irq_data *data)
data = data->parent_data;
data->chip->irq_eoi(data);
}
EXPORT_SYMBOL_GPL(irq_chip_eoi_parent);
/**
* irq_chip_set_affinity_parent - Set affinity on the parent interrupt
@@ -1016,6 +1019,7 @@ int irq_chip_set_type_parent(struct irq_data *data, unsigned int type)
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(irq_chip_set_type_parent);
/**
* irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware

View File

@@ -136,10 +136,9 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval = IRQ_NONE;
unsigned int flags = 0, irq = desc->irq_data.irq;
struct irqaction *action = desc->action;
struct irqaction *action;
/* action might have become NULL since we dropped the lock */
while (action) {
for_each_action_of_desc(desc, action) {
irqreturn_t res;
trace_irq_handler_entry(irq, action);
@@ -173,7 +172,6 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
}
retval |= res;
action = action->next;
}
add_interrupt_randomness(irq, flags);

View File

@@ -131,6 +131,9 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
#define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK)
#define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU)
#define for_each_action_of_desc(desc, act) \
for (act = desc->act; act; act = act->next)
struct irq_desc *
__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus,
unsigned int check);
@@ -160,6 +163,8 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
__irq_put_desc_unlock(desc, flags, false);
}
#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors)
/*
* Manipulation functions for irq_data.state
*/
@@ -188,6 +193,8 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
return __irqd_to_state(d) & mask;
}
#undef __irqd_to_state
static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
__this_cpu_inc(*desc->kstat_irqs);

326
kernel/irq/ipi.c Normal file
View File

@@ -0,0 +1,326 @@
/*
* linux/kernel/irq/ipi.c
*
* Copyright (C) 2015 Imagination Technologies Ltd
* Author: Qais Yousef <qais.yousef@imgtec.com>
*
* This file contains driver APIs to the IPI subsystem.
*/
#define pr_fmt(fmt) "genirq/ipi: " fmt
#include <linux/irqdomain.h>
#include <linux/irq.h>
/**
* irq_reserve_ipi() - Setup an IPI to destination cpumask
* @domain: IPI domain
* @dest: cpumask of cpus which can receive the IPI
*
* Allocate a virq that can be used to send IPI to any CPU in dest mask.
*
* On success it'll return linux irq number and 0 on failure
*/
unsigned int irq_reserve_ipi(struct irq_domain *domain,
const struct cpumask *dest)
{
unsigned int nr_irqs, offset;
struct irq_data *data;
int virq, i;
if (!domain ||!irq_domain_is_ipi(domain)) {
pr_warn("Reservation on a non IPI domain\n");
return 0;
}
if (!cpumask_subset(dest, cpu_possible_mask)) {
pr_warn("Reservation is not in possible_cpu_mask\n");
return 0;
}
nr_irqs = cpumask_weight(dest);
if (!nr_irqs) {
pr_warn("Reservation for empty destination mask\n");
return 0;
}
if (irq_domain_is_ipi_single(domain)) {
/*
* If the underlying implementation uses a single HW irq on
* all cpus then we only need a single Linux irq number for
* it. We have no restrictions vs. the destination mask. The
* underlying implementation can deal with holes nicely.
*/
nr_irqs = 1;
offset = 0;
} else {
unsigned int next;
/*
* The IPI requires a seperate HW irq on each CPU. We require
* that the destination mask is consecutive. If an
* implementation needs to support holes, it can reserve
* several IPI ranges.
*/
offset = cpumask_first(dest);
/*
* Find a hole and if found look for another set bit after the
* hole. For now we don't support this scenario.
*/
next = cpumask_next_zero(offset, dest);
if (next < nr_cpu_ids)
next = cpumask_next(next, dest);
if (next < nr_cpu_ids) {
pr_warn("Destination mask has holes\n");
return 0;
}
}
virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE);
if (virq <= 0) {
pr_warn("Can't reserve IPI, failed to alloc descs\n");
return 0;
}
virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE,
(void *) dest, true);
if (virq <= 0) {
pr_warn("Can't reserve IPI, failed to alloc hw irqs\n");
goto free_descs;
}
for (i = 0; i < nr_irqs; i++) {
data = irq_get_irq_data(virq + i);
cpumask_copy(data->common->affinity, dest);
data->common->ipi_offset = offset;
}
return virq;
free_descs:
irq_free_descs(virq, nr_irqs);
return 0;
}
/**
* irq_destroy_ipi() - unreserve an IPI that was previously allocated
* @irq: linux irq number to be destroyed
*
* Return the IPIs allocated with irq_reserve_ipi() to the system destroying
* all virqs associated with them.
*/
void irq_destroy_ipi(unsigned int irq)
{
struct irq_data *data = irq_get_irq_data(irq);
struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
struct irq_domain *domain;
unsigned int nr_irqs;
if (!irq || !data || !ipimask)
return;
domain = data->domain;
if (WARN_ON(domain == NULL))
return;
if (!irq_domain_is_ipi(domain)) {
pr_warn("Trying to destroy a non IPI domain!\n");
return;
}
if (irq_domain_is_ipi_per_cpu(domain))
nr_irqs = cpumask_weight(ipimask);
else
nr_irqs = 1;
irq_domain_free_irqs(irq, nr_irqs);
}
/**
* ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu
* @irq: linux irq number
* @cpu: the target cpu
*
* When dealing with coprocessors IPI, we need to inform the coprocessor of
* the hwirq it needs to use to receive and send IPIs.
*
* Returns hwirq value on success and INVALID_HWIRQ on failure.
*/
irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
{
struct irq_data *data = irq_get_irq_data(irq);
struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
if (!data || !ipimask || cpu > nr_cpu_ids)
return INVALID_HWIRQ;
if (!cpumask_test_cpu(cpu, ipimask))
return INVALID_HWIRQ;
/*
* Get the real hardware irq number if the underlying implementation
* uses a seperate irq per cpu. If the underlying implementation uses
* a single hardware irq for all cpus then the IPI send mechanism
* needs to take care of the cpu destinations.
*/
if (irq_domain_is_ipi_per_cpu(data->domain))
data = irq_get_irq_data(irq + cpu - data->common->ipi_offset);
return data ? irqd_to_hwirq(data) : INVALID_HWIRQ;
}
EXPORT_SYMBOL_GPL(ipi_get_hwirq);
static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
const struct cpumask *dest, unsigned int cpu)
{
struct cpumask *ipimask = irq_data_get_affinity_mask(data);
if (!chip || !ipimask)
return -EINVAL;
if (!chip->ipi_send_single && !chip->ipi_send_mask)
return -EINVAL;
if (cpu > nr_cpu_ids)
return -EINVAL;
if (dest) {
if (!cpumask_subset(dest, ipimask))
return -EINVAL;
} else {
if (!cpumask_test_cpu(cpu, ipimask))
return -EINVAL;
}
return 0;
}
/**
* __ipi_send_single - send an IPI to a target Linux SMP CPU
* @desc: pointer to irq_desc of the IRQ
* @cpu: destination CPU, must in the destination mask passed to
* irq_reserve_ipi()
*
* This function is for architecture or core code to speed up IPI sending. Not
* usable from driver code.
*
* Returns zero on success and negative error number on failure.
*/
int __ipi_send_single(struct irq_desc *desc, unsigned int cpu)
{
struct irq_data *data = irq_desc_get_irq_data(desc);
struct irq_chip *chip = irq_data_get_irq_chip(data);
#ifdef DEBUG
/*
* Minimise the overhead by omitting the checks for Linux SMP IPIs.
* Since the callers should be arch or core code which is generally
* trusted, only check for errors when debugging.
*/
if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu)))
return -EINVAL;
#endif
if (!chip->ipi_send_single) {
chip->ipi_send_mask(data, cpumask_of(cpu));
return 0;
}
/* FIXME: Store this information in irqdata flags */
if (irq_domain_is_ipi_per_cpu(data->domain) &&
cpu != data->common->ipi_offset) {
/* use the correct data for that cpu */
unsigned irq = data->irq + cpu - data->common->ipi_offset;
data = irq_get_irq_data(irq);
}
chip->ipi_send_single(data, cpu);
return 0;
}
/**
* ipi_send_mask - send an IPI to target Linux SMP CPU(s)
* @desc: pointer to irq_desc of the IRQ
* @dest: dest CPU(s), must be a subset of the mask passed to
* irq_reserve_ipi()
*
* This function is for architecture or core code to speed up IPI sending. Not
* usable from driver code.
*
* Returns zero on success and negative error number on failure.
*/
int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest)
{
struct irq_data *data = irq_desc_get_irq_data(desc);
struct irq_chip *chip = irq_data_get_irq_chip(data);
unsigned int cpu;
#ifdef DEBUG
/*
* Minimise the overhead by omitting the checks for Linux SMP IPIs.
* Since the callers should be arch or core code which is generally
* trusted, only check for errors when debugging.
*/
if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0)))
return -EINVAL;
#endif
if (chip->ipi_send_mask) {
chip->ipi_send_mask(data, dest);
return 0;
}
if (irq_domain_is_ipi_per_cpu(data->domain)) {
unsigned int base = data->irq;
for_each_cpu(cpu, dest) {
unsigned irq = base + cpu - data->common->ipi_offset;
data = irq_get_irq_data(irq);
chip->ipi_send_single(data, cpu);
}
} else {
for_each_cpu(cpu, dest)
chip->ipi_send_single(data, cpu);
}
return 0;
}
/**
* ipi_send_single - Send an IPI to a single CPU
* @virq: linux irq number from irq_reserve_ipi()
* @cpu: destination CPU, must in the destination mask passed to
* irq_reserve_ipi()
*
* Returns zero on success and negative error number on failure.
*/
int ipi_send_single(unsigned int virq, unsigned int cpu)
{
struct irq_desc *desc = irq_to_desc(virq);
struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL;
struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL;
if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu)))
return -EINVAL;
return __ipi_send_single(desc, cpu);
}
EXPORT_SYMBOL_GPL(ipi_send_single);
/**
* ipi_send_mask - Send an IPI to target CPU(s)
* @virq: linux irq number from irq_reserve_ipi()
* @dest: dest CPU(s), must be a subset of the mask passed to
* irq_reserve_ipi()
*
* Returns zero on success and negative error number on failure.
*/
int ipi_send_mask(unsigned int virq, const struct cpumask *dest)
{
struct irq_desc *desc = irq_to_desc(virq);
struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL;
struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL;
if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0)))
return -EINVAL;
return __ipi_send_mask(desc, dest);
}
EXPORT_SYMBOL_GPL(ipi_send_mask);

View File

@@ -24,10 +24,27 @@
static struct lock_class_key irq_desc_lock_class;
#if defined(CONFIG_SMP)
static int __init irq_affinity_setup(char *str)
{
zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
cpulist_parse(str, irq_default_affinity);
/*
* Set at least the boot cpu. We don't want to end up with
* bugreports caused by random comandline masks
*/
cpumask_set_cpu(smp_processor_id(), irq_default_affinity);
return 1;
}
__setup("irqaffinity=", irq_affinity_setup);
static void __init init_irq_default_affinity(void)
{
alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
cpumask_setall(irq_default_affinity);
#ifdef CONFIG_CPUMASK_OFFSTACK
if (!irq_default_affinity)
zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
#endif
if (cpumask_empty(irq_default_affinity))
cpumask_setall(irq_default_affinity);
}
#else
static void __init init_irq_default_affinity(void)

View File

@@ -23,8 +23,6 @@ static DEFINE_MUTEX(irq_domain_mutex);
static DEFINE_MUTEX(revmap_trees_mutex);
static struct irq_domain *irq_default_domain;
static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs,
irq_hw_number_t hwirq, int node);
static void irq_domain_check_hierarchy(struct irq_domain *domain);
struct irqchip_fwid {
@@ -840,8 +838,8 @@ const struct irq_domain_ops irq_domain_simple_ops = {
};
EXPORT_SYMBOL_GPL(irq_domain_simple_ops);
static int irq_domain_alloc_descs(int virq, unsigned int cnt,
irq_hw_number_t hwirq, int node)
int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq,
int node)
{
unsigned int hint;
@@ -895,6 +893,7 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent,
return domain;
}
EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy);
static void irq_domain_insert_irq(int virq)
{
@@ -1045,6 +1044,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq,
return 0;
}
EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip);
/**
* irq_domain_set_info - Set the complete data for a @virq in @domain
@@ -1078,6 +1078,7 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data)
irq_data->chip = &no_irq_chip;
irq_data->chip_data = NULL;
}
EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data);
/**
* irq_domain_free_irqs_common - Clear irq_data and free the parent
@@ -1275,6 +1276,7 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain,
nr_irqs, arg);
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent);
/**
* irq_domain_free_irqs_parent - Free interrupts from parent domain
@@ -1292,6 +1294,7 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain,
irq_domain_free_irqs_recursive(domain->parent, irq_base,
nr_irqs);
}
EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent);
/**
* irq_domain_activate_irq - Call domain_ops->activate recursively to activate

View File

@@ -144,13 +144,11 @@ int irq_can_set_affinity(unsigned int irq)
*/
void irq_set_thread_affinity(struct irq_desc *desc)
{
struct irqaction *action = desc->action;
struct irqaction *action;
while (action) {
for_each_action_of_desc(desc, action)
if (action->thread)
set_bit(IRQTF_AFFINITY, &action->thread_flags);
action = action->next;
}
}
#ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -994,7 +992,7 @@ void irq_wake_thread(unsigned int irq, void *dev_id)
return;
raw_spin_lock_irqsave(&desc->lock, flags);
for (action = desc->action; action; action = action->next) {
for_each_action_of_desc(desc, action) {
if (action->dev_id == dev_id) {
if (action->thread)
__irq_wake_thread(desc, action);

View File

@@ -291,7 +291,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
int ret = 1;
raw_spin_lock_irqsave(&desc->lock, flags);
for (action = desc->action ; action; action = action->next) {
for_each_action_of_desc(desc, action) {
if ((action != new_action) && action->name &&
!strcmp(new_action->name, action->name)) {
ret = 0;

View File

@@ -211,14 +211,12 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret)
* desc->lock here. See synchronize_irq().
*/
raw_spin_lock_irqsave(&desc->lock, flags);
action = desc->action;
while (action) {
for_each_action_of_desc(desc, action) {
printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler);
if (action->thread_fn)
printk(KERN_CONT " threaded [<%p>] %pf",
action->thread_fn, action->thread_fn);
printk(KERN_CONT "\n");
action = action->next;
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
}

View File

@@ -66,13 +66,15 @@ struct resource crashk_res = {
.name = "Crash kernel",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
.desc = IORES_DESC_CRASH_KERNEL
};
struct resource crashk_low_res = {
.name = "Crash kernel",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
.flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM,
.desc = IORES_DESC_CRASH_KERNEL
};
int kexec_should_crash(struct task_struct *p)
@@ -959,7 +961,7 @@ int crash_shrink_memory(unsigned long new_size)
ram_res->start = end;
ram_res->end = crashk_res.end;
ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM;
ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM;
ram_res->name = "System RAM";
crashk_res.end = end - 1;

View File

@@ -524,10 +524,10 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz,
/* Walk the RAM ranges and allocate a suitable range for the buffer */
if (image->type == KEXEC_TYPE_CRASH)
ret = walk_iomem_res("Crash kernel",
IORESOURCE_MEM | IORESOURCE_BUSY,
crashk_res.start, crashk_res.end, kbuf,
locate_mem_hole_callback);
ret = walk_iomem_res_desc(crashk_res.desc,
IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY,
crashk_res.start, crashk_res.end, kbuf,
locate_mem_hole_callback);
else
ret = walk_system_ram_res(0, -1, kbuf,
locate_mem_hole_callback);

View File

@@ -47,12 +47,12 @@
* of times)
*/
#include <linux/latencytop.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/spinlock.h>
#include <linux/proc_fs.h>
#include <linux/latencytop.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/list.h>
@@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void)
proc_create("latency_stats", 0644, NULL, &lstats_fops);
return 0;
}
int sysctl_latencytop(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
int err;
err = proc_dointvec(table, write, buffer, lenp, ppos);
if (latencytop_enabled)
force_schedstat_enabled();
return err;
}
device_initcall(init_lstats_procfs);

View File

@@ -123,8 +123,6 @@ static inline int debug_locks_off_graph_unlock(void)
return ret;
}
static int lockdep_initialized;
unsigned long nr_list_entries;
static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
@@ -433,19 +431,6 @@ unsigned int nr_process_chains;
unsigned int max_lockdep_depth;
#ifdef CONFIG_DEBUG_LOCKDEP
/*
* We cannot printk in early bootup code. Not even early_printk()
* might work. So we mark any initialization errors and printk
* about it later on, in lockdep_info().
*/
static int lockdep_init_error;
static const char *lock_init_error;
static unsigned long lockdep_init_trace_data[20];
static struct stack_trace lockdep_init_trace = {
.max_entries = ARRAY_SIZE(lockdep_init_trace_data),
.entries = lockdep_init_trace_data,
};
/*
* Various lockdep statistics:
*/
@@ -669,20 +654,6 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
struct hlist_head *hash_head;
struct lock_class *class;
#ifdef CONFIG_DEBUG_LOCKDEP
/*
* If the architecture calls into lockdep before initializing
* the hashes then we'll warn about it later. (we cannot printk
* right now)
*/
if (unlikely(!lockdep_initialized)) {
lockdep_init();
lockdep_init_error = 1;
lock_init_error = lock->name;
save_stack_trace(&lockdep_init_trace);
}
#endif
if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
debug_locks_off();
printk(KERN_ERR
@@ -2010,6 +1981,53 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
return lock_classes + chain_hlocks[chain->base + i];
}
/*
* Returns the index of the first held_lock of the current chain
*/
static inline int get_first_held_lock(struct task_struct *curr,
struct held_lock *hlock)
{
int i;
struct held_lock *hlock_curr;
for (i = curr->lockdep_depth - 1; i >= 0; i--) {
hlock_curr = curr->held_locks + i;
if (hlock_curr->irq_context != hlock->irq_context)
break;
}
return ++i;
}
/*
* Checks whether the chain and the current held locks are consistent
* in depth and also in content. If they are not it most likely means
* that there was a collision during the calculation of the chain_key.
* Returns: 0 not passed, 1 passed
*/
static int check_no_collision(struct task_struct *curr,
struct held_lock *hlock,
struct lock_chain *chain)
{
#ifdef CONFIG_DEBUG_LOCKDEP
int i, j, id;
i = get_first_held_lock(curr, hlock);
if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1)))
return 0;
for (j = 0; j < chain->depth - 1; j++, i++) {
id = curr->held_locks[i].class_idx - 1;
if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id))
return 0;
}
#endif
return 1;
}
/*
* Look up a dependency chain. If the key is not present yet then
* add it and return 1 - in this case the new dependency chain is
@@ -2023,7 +2041,6 @@ static inline int lookup_chain_cache(struct task_struct *curr,
struct lock_class *class = hlock_class(hlock);
struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
struct held_lock *hlock_curr;
int i, j;
/*
@@ -2041,6 +2058,9 @@ static inline int lookup_chain_cache(struct task_struct *curr,
if (chain->chain_key == chain_key) {
cache_hit:
debug_atomic_inc(chain_lookup_hits);
if (!check_no_collision(curr, hlock, chain))
return 0;
if (very_verbose(class))
printk("\nhash chain already cached, key: "
"%016Lx tail class: [%p] %s\n",
@@ -2078,13 +2098,7 @@ cache_hit:
chain = lock_chains + nr_lock_chains++;
chain->chain_key = chain_key;
chain->irq_context = hlock->irq_context;
/* Find the first held_lock of current chain */
for (i = curr->lockdep_depth - 1; i >= 0; i--) {
hlock_curr = curr->held_locks + i;
if (hlock_curr->irq_context != hlock->irq_context)
break;
}
i++;
i = get_first_held_lock(curr, hlock);
chain->depth = curr->lockdep_depth + 1 - i;
if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
chain->base = nr_chain_hlocks;
@@ -2172,7 +2186,7 @@ static void check_chain_key(struct task_struct *curr)
{
#ifdef CONFIG_DEBUG_LOCKDEP
struct held_lock *hlock, *prev_hlock = NULL;
unsigned int i, id;
unsigned int i;
u64 chain_key = 0;
for (i = 0; i < curr->lockdep_depth; i++) {
@@ -2189,17 +2203,16 @@ static void check_chain_key(struct task_struct *curr)
(unsigned long long)hlock->prev_chain_key);
return;
}
id = hlock->class_idx - 1;
/*
* Whoops ran out of static storage again?
*/
if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS))
return;
if (prev_hlock && (prev_hlock->irq_context !=
hlock->irq_context))
chain_key = 0;
chain_key = iterate_chain_key(chain_key, id);
chain_key = iterate_chain_key(chain_key, hlock->class_idx);
prev_hlock = hlock;
}
if (chain_key != curr->curr_chain_key) {
@@ -3077,7 +3090,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
struct task_struct *curr = current;
struct lock_class *class = NULL;
struct held_lock *hlock;
unsigned int depth, id;
unsigned int depth;
int chain_head = 0;
int class_idx;
u64 chain_key;
@@ -3180,11 +3193,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
* The 'key ID' is what is the most compact key value to drive
* the hash, not class->key.
*/
id = class - lock_classes;
/*
* Whoops, we did it again.. ran straight out of our static allocation.
*/
if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS))
return 0;
chain_key = curr->curr_chain_key;
@@ -3202,7 +3214,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
chain_key = 0;
chain_head = 1;
}
chain_key = iterate_chain_key(chain_key, id);
chain_key = iterate_chain_key(chain_key, class_idx);
if (nest_lock && !__lock_is_held(nest_lock))
return print_lock_nested_lock_not_held(curr, hlock, ip);
@@ -4013,28 +4025,6 @@ out_restore:
raw_local_irq_restore(flags);
}
void lockdep_init(void)
{
int i;
/*
* Some architectures have their own start_kernel()
* code which calls lockdep_init(), while we also
* call lockdep_init() from the start_kernel() itself,
* and we want to initialize the hashes only once:
*/
if (lockdep_initialized)
return;
for (i = 0; i < CLASSHASH_SIZE; i++)
INIT_HLIST_HEAD(classhash_table + i);
for (i = 0; i < CHAINHASH_SIZE; i++)
INIT_HLIST_HEAD(chainhash_table + i);
lockdep_initialized = 1;
}
void __init lockdep_info(void)
{
printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
@@ -4061,14 +4051,6 @@ void __init lockdep_info(void)
printk(" per task-struct memory footprint: %lu bytes\n",
sizeof(struct held_lock) * MAX_LOCK_DEPTH);
#ifdef CONFIG_DEBUG_LOCKDEP
if (lockdep_init_error) {
printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error);
printk("Call stack leading to lockdep invocation was:\n");
print_stack_trace(&lockdep_init_trace, 0);
}
#endif
}
static void

View File

@@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
node->locked = 0;
node->next = NULL;
prev = xchg_acquire(lock, node);
/*
* We rely on the full barrier with global transitivity implied by the
* below xchg() to order the initialization stores above against any
* observation of @node. And to provide the ACQUIRE ordering associated
* with a LOCK primitive.
*/
prev = xchg(lock, node);
if (likely(prev == NULL)) {
/*
* Lock acquired, don't need to set node->locked to 1. Threads

View File

@@ -716,6 +716,7 @@ static inline void
__mutex_unlock_common_slowpath(struct mutex *lock, int nested)
{
unsigned long flags;
WAKE_Q(wake_q);
/*
* As a performance measurement, release the lock before doing other
@@ -743,11 +744,11 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested)
struct mutex_waiter, list);
debug_mutex_wake_waiter(lock, waiter);
wake_up_process(waiter->task);
wake_q_add(&wake_q, waiter->task);
}
spin_unlock_mutex(&lock->wait_lock, flags);
wake_up_q(&wake_q);
}
/*

View File

@@ -358,8 +358,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
* sequentiality; this is because not all clear_pending_set_locked()
* implementations imply full barriers.
*/
while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
cpu_relax();
smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK));
/*
* take ownership and clear the pending bit.
@@ -435,7 +434,7 @@ queue:
*
* The PV pv_wait_head_or_lock function, if active, will acquire
* the lock and return a non-zero value. So we have to skip the
* smp_load_acquire() call. As the next PV queue head hasn't been
* smp_cond_acquire() call. As the next PV queue head hasn't been
* designated yet, there is no way for the locked value to become
* _Q_SLOW_VAL. So both the set_locked() and the
* atomic_cmpxchg_relaxed() calls will be safe.
@@ -466,7 +465,7 @@ locked:
break;
}
/*
* The smp_load_acquire() call above has provided the necessary
* The smp_cond_acquire() call above has provided the necessary
* acquire semantics required for locking. At most two
* iterations of this loop may be ran.
*/

View File

@@ -54,6 +54,11 @@ struct pv_node {
u8 state;
};
/*
* Include queued spinlock statistics code
*/
#include "qspinlock_stat.h"
/*
* By replacing the regular queued_spin_trylock() with the function below,
* it will be called once when a lock waiter enter the PV slowpath before
@@ -65,9 +70,11 @@ struct pv_node {
static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
(cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
(cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
qstat_inc(qstat_pv_lock_stealing, ret);
return ret;
}
/*
@@ -137,11 +144,6 @@ static __always_inline int trylock_clear_pending(struct qspinlock *lock)
}
#endif /* _Q_PENDING_BITS == 8 */
/*
* Include queued spinlock statistics code
*/
#include "qspinlock_stat.h"
/*
* Lock and MCS node addresses hash table for fast lookup
*
@@ -398,6 +400,11 @@ pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
if (READ_ONCE(pn->state) == vcpu_hashed)
lp = (struct qspinlock **)1;
/*
* Tracking # of slowpath locking operations
*/
qstat_inc(qstat_pv_lock_slowpath, true);
for (;; waitcnt++) {
/*
* Set correct vCPU state to be used by queue node wait-early

View File

@@ -22,6 +22,7 @@
* pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake
* pv_latency_kick - average latency (ns) of vCPU kick operation
* pv_latency_wake - average latency (ns) from vCPU kick to wakeup
* pv_lock_slowpath - # of locking operations via the slowpath
* pv_lock_stealing - # of lock stealing operations
* pv_spurious_wakeup - # of spurious wakeups
* pv_wait_again - # of vCPU wait's that happened after a vCPU kick
@@ -45,6 +46,7 @@ enum qlock_stats {
qstat_pv_kick_wake,
qstat_pv_latency_kick,
qstat_pv_latency_wake,
qstat_pv_lock_slowpath,
qstat_pv_lock_stealing,
qstat_pv_spurious_wakeup,
qstat_pv_wait_again,
@@ -70,6 +72,7 @@ static const char * const qstat_names[qstat_num + 1] = {
[qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
[qstat_pv_latency_kick] = "pv_latency_kick",
[qstat_pv_latency_wake] = "pv_latency_wake",
[qstat_pv_lock_slowpath] = "pv_lock_slowpath",
[qstat_pv_lock_stealing] = "pv_lock_stealing",
[qstat_pv_wait_again] = "pv_wait_again",
[qstat_pv_wait_early] = "pv_wait_early",
@@ -279,19 +282,6 @@ static inline void __pv_wait(u8 *ptr, u8 val)
#define pv_kick(c) __pv_kick(c)
#define pv_wait(p, v) __pv_wait(p, v)
/*
* PV unfair trylock count tracking function
*/
static inline int qstat_spin_steal_lock(struct qspinlock *lock)
{
int ret = pv_queued_spin_steal_lock(lock);
qstat_inc(qstat_pv_lock_stealing, ret);
return ret;
}
#undef queued_spin_trylock
#define queued_spin_trylock(l) qstat_spin_steal_lock(l)
#else /* CONFIG_QUEUED_LOCK_STAT */
static inline void qstat_inc(enum qlock_stats stat, bool cond) { }

View File

@@ -29,10 +29,10 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
static void *try_ram_remap(resource_size_t offset, size_t size)
{
struct page *page = pfn_to_page(offset >> PAGE_SHIFT);
unsigned long pfn = PHYS_PFN(offset);
/* In the simple case just return the existing linear address */
if (!PageHighMem(page))
if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)))
return __va(offset);
return NULL; /* fallback to ioremap_cache */
}
@@ -47,7 +47,7 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
* being mapped does not have i/o side effects and the __iomem
* annotation is not applicable.
*
* MEMREMAP_WB - matches the default mapping for "System RAM" on
* MEMREMAP_WB - matches the default mapping for System RAM on
* the architecture. This is usually a read-allocate write-back cache.
* Morever, if MEMREMAP_WB is specified and the requested remap region is RAM
* memremap() will bypass establishing a new mapping and instead return
@@ -56,11 +56,12 @@ static void *try_ram_remap(resource_size_t offset, size_t size)
* MEMREMAP_WT - establish a mapping whereby writes either bypass the
* cache or are written through to memory and never exist in a
* cache-dirty state with respect to program visibility. Attempts to
* map "System RAM" with this mapping type will fail.
* map System RAM with this mapping type will fail.
*/
void *memremap(resource_size_t offset, size_t size, unsigned long flags)
{
int is_ram = region_intersects(offset, size, "System RAM");
int is_ram = region_intersects(offset, size,
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
void *addr = NULL;
if (is_ram == REGION_MIXED) {
@@ -76,7 +77,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
* MEMREMAP_WB is special in that it can be satisifed
* from the direct map. Some archs depend on the
* capability of memremap() to autodetect cases where
* the requested range is potentially in "System RAM"
* the requested range is potentially in System RAM.
*/
if (is_ram == REGION_INTERSECTS)
addr = try_ram_remap(offset, size);
@@ -88,7 +89,7 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags)
* If we don't have a mapping yet and more request flags are
* pending then we will be attempting to establish a new virtual
* address mapping. Enforce that this mapping is not aliasing
* "System RAM"
* System RAM.
*/
if (!addr && is_ram == REGION_INTERSECTS && flags) {
WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
@@ -270,13 +271,17 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys)
void *devm_memremap_pages(struct device *dev, struct resource *res,
struct percpu_ref *ref, struct vmem_altmap *altmap)
{
int is_ram = region_intersects(res->start, resource_size(res),
"System RAM");
resource_size_t key, align_start, align_size, align_end;
struct dev_pagemap *pgmap;
struct page_map *page_map;
int error, nid, is_ram;
unsigned long pfn;
int error, nid;
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE)
- align_start;
is_ram = region_intersects(align_start, align_size,
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
if (is_ram == REGION_MIXED) {
WARN_ONCE(1, "%s attempted on mixed region %pr\n",
@@ -314,8 +319,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
mutex_lock(&pgmap_lock);
error = 0;
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(resource_size(res), SECTION_SIZE);
align_end = align_start + align_size - 1;
for (key = align_start; key <= align_end; key += SECTION_SIZE) {
struct dev_pagemap *dup;
@@ -351,8 +354,13 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
for_each_device_pfn(pfn, page_map) {
struct page *page = pfn_to_page(pfn);
/* ZONE_DEVICE pages must never appear on a slab lru */
list_force_poison(&page->lru);
/*
* ZONE_DEVICE pages union ->lru with a ->pgmap back
* pointer. It is a bug if a ZONE_DEVICE page is ever
* freed or placed on a driver-private list. Seed the
* storage with LIST_POISON* values.
*/
list_del(&page->lru);
page->pgmap = pgmap;
}
devres_add(dev, page_map);

View File

@@ -59,6 +59,7 @@ int profile_setup(char *str)
if (!strncmp(str, sleepstr, strlen(sleepstr))) {
#ifdef CONFIG_SCHEDSTATS
force_schedstat_enabled();
prof_on = SLEEP_PROFILING;
if (str[strlen(sleepstr)] == ',')
str += strlen(sleepstr) + 1;

View File

@@ -932,12 +932,14 @@ rcu_torture_writer(void *arg)
int nsynctypes = 0;
VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
pr_alert("%s" TORTURE_FLAG
" Grace periods expedited from boot/sysfs for %s,\n",
torture_type, cur_ops->name);
pr_alert("%s" TORTURE_FLAG
" Testing of dynamic grace-period expediting diabled.\n",
torture_type);
if (!can_expedite) {
pr_alert("%s" TORTURE_FLAG
" Grace periods expedited from boot/sysfs for %s,\n",
torture_type, cur_ops->name);
pr_alert("%s" TORTURE_FLAG
" Disabled dynamic grace-period expediting.\n",
torture_type);
}
/* Initialize synctype[] array. If none set, take default. */
if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)

View File

@@ -23,7 +23,7 @@
*/
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -122,18 +122,7 @@ free_out:
debugfs_remove_recursive(rcudir);
return 1;
}
static void __exit rcutiny_trace_cleanup(void)
{
debugfs_remove_recursive(rcudir);
}
module_init(rcutiny_trace_init);
module_exit(rcutiny_trace_cleanup);
MODULE_AUTHOR("Paul E. McKenney");
MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation");
MODULE_LICENSE("GPL");
device_initcall(rcutiny_trace_init);
static void check_cpu_stall(struct rcu_ctrlblk *rcp)
{

View File

@@ -108,7 +108,6 @@ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
static struct rcu_state *const rcu_state_p;
static struct rcu_data __percpu *const rcu_data_p;
LIST_HEAD(rcu_struct_flavors);
/* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -1083,13 +1082,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp,
rcu_sysidle_check_cpu(rdp, isidle, maxj);
if ((rdp->dynticks_snap & 0x1) == 0) {
trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
return 1;
} else {
if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
rdp->mynode->gpnum))
WRITE_ONCE(rdp->gpwrap, true);
return 0;
return 1;
}
return 0;
}
/*
@@ -1173,15 +1171,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
smp_mb(); /* ->cond_resched_completed before *rcrmp. */
WRITE_ONCE(*rcrmp,
READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask);
resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
rdp->rsp->jiffies_resched += 5; /* Enable beating. */
} else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) {
/* Time to beat on that CPU again! */
resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
}
rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */
}
/* And if it has been a really long time, kick the CPU as well. */
if (ULONG_CMP_GE(jiffies,
rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) ||
ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs))
resched_cpu(rdp->cpu); /* Force CPU into scheduler. */
return 0;
}
@@ -1246,7 +1245,7 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp)
if (rnp->qsmask & (1UL << cpu))
dump_cpu_task(rnp->grplo + cpu);
}
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -1266,12 +1265,12 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
delta = jiffies - READ_ONCE(rsp->jiffies_stall);
if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
WRITE_ONCE(rsp->jiffies_stall,
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/*
* OK, time to rat on our buddy...
@@ -1292,7 +1291,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
ndetected++;
}
}
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
print_cpu_stall_info_end();
@@ -1357,7 +1356,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
WRITE_ONCE(rsp->jiffies_stall,
jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/*
* Attempt to revive the RCU machinery by forcing a context switch.
@@ -1595,7 +1594,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp,
}
unlock_out:
if (rnp != rnp_root)
raw_spin_unlock(&rnp_root->lock);
raw_spin_unlock_rcu_node(rnp_root);
out:
if (c_out != NULL)
*c_out = c;
@@ -1614,7 +1613,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
int needmore;
struct rcu_data *rdp = this_cpu_ptr(rsp->rda);
rcu_nocb_gp_cleanup(rsp, rnp);
rnp->need_future_gp[c & 0x1] = 0;
needmore = rnp->need_future_gp[(c + 1) & 0x1];
trace_rcu_future_gp(rnp, rdp, c,
@@ -1635,7 +1633,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp)
!READ_ONCE(rsp->gp_flags) ||
!rsp->gp_kthread)
return;
wake_up(&rsp->gp_wq);
swake_up(&rsp->gp_wq);
}
/*
@@ -1815,7 +1813,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
return;
}
needwake = __note_gp_changes(rsp, rnp, rdp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (needwake)
rcu_gp_kthread_wake(rsp);
}
@@ -1840,7 +1838,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
raw_spin_lock_irq_rcu_node(rnp);
if (!READ_ONCE(rsp->gp_flags)) {
/* Spurious wakeup, tell caller to go back to sleep. */
raw_spin_unlock_irq(&rnp->lock);
raw_spin_unlock_irq_rcu_node(rnp);
return false;
}
WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */
@@ -1850,7 +1848,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
* Grace period already in progress, don't start another.
* Not supposed to be able to happen.
*/
raw_spin_unlock_irq(&rnp->lock);
raw_spin_unlock_irq_rcu_node(rnp);
return false;
}
@@ -1859,7 +1857,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
/* Record GP times before starting GP, hence smp_store_release(). */
smp_store_release(&rsp->gpnum, rsp->gpnum + 1);
trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
raw_spin_unlock_irq(&rnp->lock);
raw_spin_unlock_irq_rcu_node(rnp);
/*
* Apply per-leaf buffered online and offline operations to the
@@ -1873,7 +1871,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
!rnp->wait_blkd_tasks) {
/* Nothing to do on this leaf rcu_node structure. */
raw_spin_unlock_irq(&rnp->lock);
raw_spin_unlock_irq_rcu_node(rnp);
continue;
}
@@ -1907,7 +1905,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
rcu_cleanup_dead_rnp(rnp);
}
raw_spin_unlock_irq(&rnp->lock);
raw_spin_unlock_irq_rcu_node(rnp);
}
/*
@@ -1938,7 +1936,7 @@ static bool rcu_gp_init(struct rcu_state *rsp)
trace_rcu_grace_period_init(rsp->name, rnp->gpnum,
rnp->level, rnp->grplo,
rnp->grphi, rnp->qsmask);
raw_spin_unlock_irq(&rnp->lock);
raw_spin_unlock_irq_rcu_node(rnp);
cond_resched_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
}
@@ -1996,7 +1994,7 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time)
raw_spin_lock_irq_rcu_node(rnp);
WRITE_ONCE(rsp->gp_flags,
READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS);
raw_spin_unlock_irq(&rnp->lock);
raw_spin_unlock_irq_rcu_node(rnp);
}
}
@@ -2010,6 +2008,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
int nocb = 0;
struct rcu_data *rdp;
struct rcu_node *rnp = rcu_get_root(rsp);
struct swait_queue_head *sq;
WRITE_ONCE(rsp->gp_activity, jiffies);
raw_spin_lock_irq_rcu_node(rnp);
@@ -2025,7 +2024,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
* safe for us to drop the lock in order to mark the grace
* period as completed in all of the rcu_node structures.
*/
raw_spin_unlock_irq(&rnp->lock);
raw_spin_unlock_irq_rcu_node(rnp);
/*
* Propagate new ->completed value to rcu_node structures so
@@ -2046,7 +2045,9 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
needgp = __note_gp_changes(rsp, rnp, rdp) || needgp;
/* smp_mb() provided by prior unlock-lock pair. */
nocb += rcu_future_gp_cleanup(rsp, rnp);
raw_spin_unlock_irq(&rnp->lock);
sq = rcu_nocb_gp_get(rnp);
raw_spin_unlock_irq_rcu_node(rnp);
rcu_nocb_gp_cleanup(sq);
cond_resched_rcu_qs();
WRITE_ONCE(rsp->gp_activity, jiffies);
rcu_gp_slow(rsp, gp_cleanup_delay);
@@ -2068,7 +2069,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
READ_ONCE(rsp->gpnum),
TPS("newreq"));
}
raw_spin_unlock_irq(&rnp->lock);
raw_spin_unlock_irq_rcu_node(rnp);
}
/*
@@ -2092,7 +2093,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("reqwait"));
rsp->gp_state = RCU_GP_WAIT_GPS;
wait_event_interruptible(rsp->gp_wq,
swait_event_interruptible(rsp->gp_wq,
READ_ONCE(rsp->gp_flags) &
RCU_GP_FLAG_INIT);
rsp->gp_state = RCU_GP_DONE_GPS;
@@ -2122,7 +2123,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
READ_ONCE(rsp->gpnum),
TPS("fqswait"));
rsp->gp_state = RCU_GP_WAIT_FQS;
ret = wait_event_interruptible_timeout(rsp->gp_wq,
ret = swait_event_interruptible_timeout(rsp->gp_wq,
rcu_gp_fqs_check_wake(rsp, &gf), j);
rsp->gp_state = RCU_GP_DOING_FQS;
/* Locking provides needed memory barriers. */
@@ -2234,19 +2235,21 @@ static bool rcu_start_gp(struct rcu_state *rsp)
}
/*
* Report a full set of quiescent states to the specified rcu_state
* data structure. This involves cleaning up after the prior grace
* period and letting rcu_start_gp() start up the next grace period
* if one is needed. Note that the caller must hold rnp->lock, which
* is released before return.
* Report a full set of quiescent states to the specified rcu_state data
* structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period
* kthread if another grace period is required. Whether we wake
* the grace-period kthread or it awakens itself for the next round
* of quiescent-state forcing, that kthread will clean up after the
* just-completed grace period. Note that the caller must hold rnp->lock,
* which is released before return.
*/
static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
rcu_gp_kthread_wake(rsp);
raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
}
/*
@@ -2275,7 +2278,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
* Our bit has already been cleared, or the
* relevant grace period is already over, so done.
*/
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
@@ -2287,7 +2290,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
/* Other bits still set at this level, so done. */
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
mask = rnp->grpmask;
@@ -2297,7 +2300,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
break;
}
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
rnp_c = rnp;
rnp = rnp->parent;
raw_spin_lock_irqsave_rcu_node(rnp, flags);
@@ -2329,7 +2332,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return; /* Still need more quiescent states! */
}
@@ -2346,19 +2349,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
/* Report up the rest of the hierarchy, tracking current ->gpnum. */
gps = rnp->gpnum;
mask = rnp->grpmask;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */
rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
}
/*
* Record a quiescent state for the specified CPU to that CPU's rcu_data
* structure. This must be either called from the specified CPU, or
* called when the specified CPU is known to be offline (and when it is
* also known that no other CPU is concurrently trying to help the offline
* CPU). The lastcomp argument is used to make sure we are still in the
* grace period of interest. We don't want to end the current grace period
* based on quiescent states detected in an earlier grace period!
* structure. This must be called from the specified CPU.
*/
static void
rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
@@ -2383,14 +2381,14 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
*/
rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */
rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
mask = rdp->grpmask;
if ((rnp->qsmask & mask) == 0) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
} else {
rdp->core_needs_qs = 0;
rdp->core_needs_qs = false;
/*
* This GP can't end until cpu checks in, so all of our
@@ -2599,35 +2597,14 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
rnp->qsmaskinit &= ~mask;
rnp->qsmask &= ~mask;
if (rnp->qsmaskinit) {
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
raw_spin_unlock_rcu_node(rnp);
/* irqs remain disabled. */
return;
}
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
}
}
/*
* The CPU is exiting the idle loop into the arch_cpu_idle_dead()
* function. We now remove it from the rcu_node tree's ->qsmaskinit
* bit masks.
*/
static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
unsigned long mask;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
return;
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rnp->qsmaskinitnext &= ~mask;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
/*
* The CPU has been completely removed, and some other CPU is reporting
* this fact from process context. Do the remainder of the cleanup,
@@ -2859,7 +2836,7 @@ static void force_qs_rnp(struct rcu_state *rsp,
rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
} else {
/* Nothing to do here, so just drop the lock. */
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
}
@@ -2895,12 +2872,12 @@ static void force_quiescent_state(struct rcu_state *rsp)
raw_spin_unlock(&rnp_old->fqslock);
if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) {
rsp->n_force_qs_lh++;
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
return; /* Someone beat us to it. */
}
WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS);
raw_spin_unlock_irqrestore(&rnp_old->lock, flags);
rcu_gp_kthread_wake(rsp);
raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */
}
/*
@@ -2925,7 +2902,7 @@ __rcu_process_callbacks(struct rcu_state *rsp)
if (cpu_needs_another_gp(rsp, rdp)) {
raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */
needwake = rcu_start_gp(rsp);
raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags);
if (needwake)
rcu_gp_kthread_wake(rsp);
} else {
@@ -3016,7 +2993,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
raw_spin_lock_rcu_node(rnp_root);
needwake = rcu_start_gp(rsp);
raw_spin_unlock(&rnp_root->lock);
raw_spin_unlock_rcu_node(rnp_root);
if (needwake)
rcu_gp_kthread_wake(rsp);
} else {
@@ -3436,14 +3413,14 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
rcu_for_each_leaf_node(rsp, rnp) {
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (rnp->expmaskinit == rnp->expmaskinitnext) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
continue; /* No new CPUs, nothing to do. */
}
/* Update this node's mask, track old value for propagation. */
oldmask = rnp->expmaskinit;
rnp->expmaskinit = rnp->expmaskinitnext;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* If was already nonzero, nothing to propagate. */
if (oldmask)
@@ -3458,7 +3435,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
if (rnp_up->expmaskinit)
done = true;
rnp_up->expmaskinit |= mask;
raw_spin_unlock_irqrestore(&rnp_up->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags);
if (done)
break;
mask = rnp_up->grpmask;
@@ -3481,7 +3458,7 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
WARN_ON_ONCE(rnp->expmask);
rnp->expmask = rnp->expmaskinit;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -3522,19 +3499,19 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
if (!rnp->expmask)
rcu_initiate_boost(rnp, flags);
else
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
break;
}
if (rnp->parent == NULL) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (wake) {
smp_mb(); /* EGP done before wake_up(). */
wake_up(&rsp->expedited_wq);
swake_up(&rsp->expedited_wq);
}
break;
}
mask = rnp->grpmask;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */
rnp = rnp->parent;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled */
WARN_ON_ONCE(!(rnp->expmask & mask));
@@ -3569,7 +3546,7 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp,
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (!(rnp->expmask & mask)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
rnp->expmask &= ~mask;
@@ -3730,7 +3707,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp,
*/
if (rcu_preempt_has_tasks(rnp))
rnp->exp_tasks = rnp->blkd_tasks.next;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* IPI the remaining CPUs for expedited quiescent state. */
mask = 1;
@@ -3747,7 +3724,7 @@ retry_ipi:
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (cpu_online(cpu) &&
(rnp->expmask & mask)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
schedule_timeout_uninterruptible(1);
if (cpu_online(cpu) &&
(rnp->expmask & mask))
@@ -3756,7 +3733,7 @@ retry_ipi:
}
if (!(rnp->expmask & mask))
mask_ofl_ipi &= ~mask;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/* Report quiescent states for those that went offline. */
mask_ofl_test |= mask_ofl_ipi;
@@ -3780,7 +3757,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
jiffies_start = jiffies;
for (;;) {
ret = wait_event_interruptible_timeout(
ret = swait_event_timeout(
rsp->expedited_wq,
sync_rcu_preempt_exp_done(rnp_root),
jiffies_stall);
@@ -3788,7 +3765,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp)
return;
if (ret < 0) {
/* Hit a signal, disable CPU stall warnings. */
wait_event(rsp->expedited_wq,
swait_event(rsp->expedited_wq,
sync_rcu_preempt_exp_done(rnp_root));
return;
}
@@ -4163,7 +4140,7 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
return;
raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */
rnp->qsmaskinit |= mask;
raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */
}
}
@@ -4187,7 +4164,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->rsp = rsp;
mutex_init(&rdp->exp_funnel_mutex);
rcu_boot_init_nocb_percpu_data(rdp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/*
@@ -4215,7 +4192,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rcu_sysidle_init_percpu_data(rdp->dynticks);
atomic_set(&rdp->dynticks->dynticks,
(atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
/*
* Add CPU to leaf rcu_node pending-online bitmask. Any needed
@@ -4236,7 +4213,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu);
rdp->core_needs_qs = false;
trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
static void rcu_prepare_cpu(int cpu)
@@ -4247,6 +4224,46 @@ static void rcu_prepare_cpu(int cpu)
rcu_init_percpu_data(cpu, rsp);
}
#ifdef CONFIG_HOTPLUG_CPU
/*
* The CPU is exiting the idle loop into the arch_cpu_idle_dead()
* function. We now remove it from the rcu_node tree's ->qsmaskinit
* bit masks.
* The CPU is exiting the idle loop into the arch_cpu_idle_dead()
* function. We now remove it from the rcu_node tree's ->qsmaskinit
* bit masks.
*/
static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
unsigned long mask;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */
if (!IS_ENABLED(CONFIG_HOTPLUG_CPU))
return;
/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
mask = rdp->grpmask;
raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
rnp->qsmaskinitnext &= ~mask;
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
void rcu_report_dead(unsigned int cpu)
{
struct rcu_state *rsp;
/* QS for any half-done expedited RCU-sched GP. */
preempt_disable();
rcu_report_exp_rdp(&rcu_sched_state,
this_cpu_ptr(rcu_sched_state.rda), true);
preempt_enable();
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_idle_cpu(cpu, rsp);
}
#endif
/*
* Handle CPU online/offline notification events.
*/
@@ -4278,17 +4295,6 @@ int rcu_cpu_notify(struct notifier_block *self,
for_each_rcu_flavor(rsp)
rcu_cleanup_dying_cpu(rsp);
break;
case CPU_DYING_IDLE:
/* QS for any half-done expedited RCU-sched GP. */
preempt_disable();
rcu_report_exp_rdp(&rcu_sched_state,
this_cpu_ptr(rcu_sched_state.rda), true);
preempt_enable();
for_each_rcu_flavor(rsp) {
rcu_cleanup_dying_idle_cpu(cpu, rsp);
}
break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
case CPU_UP_CANCELED:
@@ -4358,7 +4364,7 @@ static int __init rcu_spawn_gp_kthread(void)
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
}
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
wake_up_process(t);
}
rcu_spawn_nocb_kthreads();
@@ -4449,8 +4455,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
cpustride *= levelspread[i];
rnp = rsp->level[i];
for (j = 0; j < levelcnt[i]; j++, rnp++) {
raw_spin_lock_init(&rnp->lock);
lockdep_set_class_and_name(&rnp->lock,
raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock));
lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock),
&rcu_node_class[i], buf[i]);
raw_spin_lock_init(&rnp->fqslock);
lockdep_set_class_and_name(&rnp->fqslock,
@@ -4482,8 +4488,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
}
}
init_waitqueue_head(&rsp->gp_wq);
init_waitqueue_head(&rsp->expedited_wq);
init_swait_queue_head(&rsp->gp_wq);
init_swait_queue_head(&rsp->expedited_wq);
rnp = rsp->level[rcu_num_lvls - 1];
for_each_possible_cpu(i) {
while (i > rnp->grphi)

View File

@@ -27,6 +27,7 @@
#include <linux/threads.h>
#include <linux/cpumask.h>
#include <linux/seqlock.h>
#include <linux/swait.h>
#include <linux/stop_machine.h>
/*
@@ -149,8 +150,9 @@ struct rcu_dynticks {
* Definition for node within the RCU grace-period-detection hierarchy.
*/
struct rcu_node {
raw_spinlock_t lock; /* Root rcu_node's lock protects some */
/* rcu_state fields as well as following. */
raw_spinlock_t __private lock; /* Root rcu_node's lock protects */
/* some rcu_state fields as well as */
/* following. */
unsigned long gpnum; /* Current grace period for this node. */
/* This will either be equal to or one */
/* behind the root rcu_node's gpnum. */
@@ -243,7 +245,7 @@ struct rcu_node {
/* Refused to boost: not sure why, though. */
/* This can happen due to race conditions. */
#ifdef CONFIG_RCU_NOCB_CPU
wait_queue_head_t nocb_gp_wq[2];
struct swait_queue_head nocb_gp_wq[2];
/* Place for rcu_nocb_kthread() to wait GP. */
#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
int need_future_gp[2];
@@ -399,7 +401,7 @@ struct rcu_data {
atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */
struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */
struct rcu_head **nocb_follower_tail;
wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */
struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
struct task_struct *nocb_kthread;
int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
@@ -478,7 +480,7 @@ struct rcu_state {
unsigned long gpnum; /* Current gp number. */
unsigned long completed; /* # of last completed gp. */
struct task_struct *gp_kthread; /* Task for grace periods. */
wait_queue_head_t gp_wq; /* Where GP task waits. */
struct swait_queue_head gp_wq; /* Where GP task waits. */
short gp_flags; /* Commands for GP task. */
short gp_state; /* GP kthread sleep state. */
@@ -506,7 +508,7 @@ struct rcu_state {
unsigned long expedited_sequence; /* Take a ticket. */
atomic_long_t expedited_normal; /* # fallbacks to normal. */
atomic_t expedited_need_qs; /* # CPUs left to check in. */
wait_queue_head_t expedited_wq; /* Wait for check-ins. */
struct swait_queue_head expedited_wq; /* Wait for check-ins. */
int ncpus_snap; /* # CPUs seen last time. */
unsigned long jiffies_force_qs; /* Time at which to invoke */
@@ -621,7 +623,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp);
static void increment_cpu_stall_ticks(void);
static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu);
static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq);
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp);
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
static void rcu_init_one_nocb(struct rcu_node *rnp);
static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
bool lazy, unsigned long flags);
@@ -680,7 +683,7 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
#endif /* #else #ifdef CONFIG_PPC */
/*
* Wrappers for the rcu_node::lock acquire.
* Wrappers for the rcu_node::lock acquire and release.
*
* Because the rcu_nodes form a tree, the tree traversal locking will observe
* different lock values, this in turn means that an UNLOCK of one level
@@ -689,29 +692,48 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll)
*
* In order to restore full ordering between tree levels, augment the regular
* lock acquire functions with smp_mb__after_unlock_lock().
*
* As ->lock of struct rcu_node is a __private field, therefore one should use
* these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock.
*/
static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp)
{
raw_spin_lock(&rnp->lock);
raw_spin_lock(&ACCESS_PRIVATE(rnp, lock));
smp_mb__after_unlock_lock();
}
static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp)
{
raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock));
}
static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp)
{
raw_spin_lock_irq(&rnp->lock);
raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock));
smp_mb__after_unlock_lock();
}
#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \
do { \
typecheck(unsigned long, flags); \
raw_spin_lock_irqsave(&(rnp)->lock, flags); \
smp_mb__after_unlock_lock(); \
static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp)
{
raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock));
}
#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \
do { \
typecheck(unsigned long, flags); \
raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \
smp_mb__after_unlock_lock(); \
} while (0)
#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \
do { \
typecheck(unsigned long, flags); \
raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \
} while (0)
static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp)
{
bool locked = raw_spin_trylock(&rnp->lock);
bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock));
if (locked)
smp_mb__after_unlock_lock();

View File

@@ -235,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
rnp->gp_tasks = &t->rcu_node_entry;
if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
rnp->exp_tasks = &t->rcu_node_entry;
raw_spin_unlock(&rnp->lock); /* rrupts remain disabled. */
raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
/*
* Report the quiescent state for the expedited GP. This expedited
@@ -489,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t)
!!rnp->gp_tasks);
rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags);
} else {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/* Unboost if we were boosted. */
@@ -518,14 +518,14 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
if (!rcu_preempt_blocked_readers_cgp(rnp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
t = list_entry(rnp->gp_tasks->prev,
struct task_struct, rcu_node_entry);
list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
sched_show_task(t);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
/*
@@ -807,7 +807,6 @@ void exit_rcu(void)
#else /* #ifdef CONFIG_PREEMPT_RCU */
static struct rcu_state *const rcu_state_p = &rcu_sched_state;
static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data;
/*
* Tell them what RCU they are running.
@@ -991,7 +990,7 @@ static int rcu_boost(struct rcu_node *rnp)
* might exit their RCU read-side critical sections on their own.
*/
if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
}
@@ -1028,7 +1027,7 @@ static int rcu_boost(struct rcu_node *rnp)
*/
t = container_of(tb, struct task_struct, rcu_node_entry);
rt_mutex_init_proxy_locked(&rnp->boost_mtx, t);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
/* Lock only for side effect: boosts task t's priority. */
rt_mutex_lock(&rnp->boost_mtx);
rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */
@@ -1088,7 +1087,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
rnp->n_balk_exp_gp_tasks++;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return;
}
if (rnp->exp_tasks != NULL ||
@@ -1098,13 +1097,13 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
ULONG_CMP_GE(jiffies, rnp->boost_time))) {
if (rnp->exp_tasks == NULL)
rnp->boost_tasks = rnp->gp_tasks;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
t = rnp->boost_kthread_task;
if (t)
rcu_wake_cond(t, rnp->boost_kthread_status);
} else {
rcu_initiate_boost_trace(rnp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
}
@@ -1172,7 +1171,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
return PTR_ERR(t);
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rnp->boost_kthread_task = t;
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
sp.sched_priority = kthread_prio;
sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */
@@ -1308,7 +1307,7 @@ static void rcu_prepare_kthreads(int cpu)
static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
__releases(rnp->lock)
{
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
static void invoke_rcu_callbacks_kthread(void)
@@ -1559,7 +1558,7 @@ static void rcu_prepare_for_idle(void)
rnp = rdp->mynode;
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
if (needwake)
rcu_gp_kthread_wake(rsp);
}
@@ -1811,9 +1810,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
* Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
* grace period.
*/
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{
wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]);
swake_up_all(sq);
}
/*
@@ -1829,10 +1828,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq;
}
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
{
return &rnp->nocb_gp_wq[rnp->completed & 0x1];
}
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
init_waitqueue_head(&rnp->nocb_gp_wq[0]);
init_waitqueue_head(&rnp->nocb_gp_wq[1]);
init_swait_queue_head(&rnp->nocb_gp_wq[0]);
init_swait_queue_head(&rnp->nocb_gp_wq[1]);
}
#ifndef CONFIG_RCU_NOCB_CPU_ALL
@@ -1857,7 +1861,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
wake_up(&rdp_leader->nocb_wq);
swake_up(&rdp_leader->nocb_wq);
}
}
@@ -2059,7 +2063,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
raw_spin_lock_irqsave_rcu_node(rnp, flags);
needwake = rcu_start_future_gp(rnp, rdp, &c);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
if (needwake)
rcu_gp_kthread_wake(rdp->rsp);
@@ -2069,7 +2073,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
*/
trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait"));
for (;;) {
wait_event_interruptible(
swait_event_interruptible(
rnp->nocb_gp_wq[c & 0x1],
(d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c)));
if (likely(d))
@@ -2097,7 +2101,7 @@ wait_again:
/* Wait for callbacks to appear. */
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep");
wait_event_interruptible(my_rdp->nocb_wq,
swait_event_interruptible(my_rdp->nocb_wq,
!READ_ONCE(my_rdp->nocb_leader_sleep));
/* Memory barrier handled by smp_mb() calls below and repoll. */
} else if (firsttime) {
@@ -2172,7 +2176,7 @@ wait_again:
* List was empty, wake up the follower.
* Memory barriers supplied by atomic_long_add().
*/
wake_up(&rdp->nocb_wq);
swake_up(&rdp->nocb_wq);
}
}
@@ -2193,7 +2197,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
if (!rcu_nocb_poll) {
trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
"FollowerSleep");
wait_event_interruptible(rdp->nocb_wq,
swait_event_interruptible(rdp->nocb_wq,
READ_ONCE(rdp->nocb_follower_head));
} else if (firsttime) {
/* Don't drown trace log with "Poll"! */
@@ -2352,7 +2356,7 @@ void __init rcu_init_nohz(void)
static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
{
rdp->nocb_tail = &rdp->nocb_head;
init_waitqueue_head(&rdp->nocb_wq);
init_swait_queue_head(&rdp->nocb_wq);
rdp->nocb_follower_tail = &rdp->nocb_follower_head;
}
@@ -2502,7 +2506,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
return false;
}
static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
{
}
@@ -2510,6 +2514,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq)
{
}
static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
{
return NULL;
}
static void rcu_init_one_nocb(struct rcu_node *rnp)
{
}

View File

@@ -128,6 +128,7 @@ bool rcu_gp_is_normal(void)
{
return READ_ONCE(rcu_normal);
}
EXPORT_SYMBOL_GPL(rcu_gp_is_normal);
static atomic_t rcu_expedited_nesting =
ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);

View File

@@ -333,13 +333,13 @@ int release_resource(struct resource *old)
EXPORT_SYMBOL(release_resource);
/*
* Finds the lowest iomem reosurce exists with-in [res->start.res->end)
* the caller must specify res->start, res->end, res->flags and "name".
* If found, returns 0, res is overwritten, if not found, returns -1.
* This walks through whole tree and not just first level children
* until and unless first_level_children_only is true.
* Finds the lowest iomem resource existing within [res->start.res->end).
* The caller must specify res->start, res->end, res->flags, and optionally
* desc. If found, returns 0, res is overwritten, if not found, returns -1.
* This function walks the whole tree and not just first level children until
* and unless first_level_children_only is true.
*/
static int find_next_iomem_res(struct resource *res, char *name,
static int find_next_iomem_res(struct resource *res, unsigned long desc,
bool first_level_children_only)
{
resource_size_t start, end;
@@ -358,9 +358,9 @@ static int find_next_iomem_res(struct resource *res, char *name,
read_lock(&resource_lock);
for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) {
if (p->flags != res->flags)
if ((p->flags & res->flags) != res->flags)
continue;
if (name && strcmp(p->name, name))
if ((desc != IORES_DESC_NONE) && (desc != p->desc))
continue;
if (p->start > end) {
p = NULL;
@@ -385,15 +385,18 @@ static int find_next_iomem_res(struct resource *res, char *name,
* Walks through iomem resources and calls func() with matching resource
* ranges. This walks through whole tree and not just first level children.
* All the memory ranges which overlap start,end and also match flags and
* name are valid candidates.
* desc are valid candidates.
*
* @name: name of resource
* @flags: resource flags
* @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check.
* @flags: I/O resource flags
* @start: start addr
* @end: end addr
*
* NOTE: For a new descriptor search, define a new IORES_DESC in
* <linux/ioport.h> and set it in 'desc' of a target resource entry.
*/
int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
void *arg, int (*func)(u64, u64, void *))
int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start,
u64 end, void *arg, int (*func)(u64, u64, void *))
{
struct resource res;
u64 orig_end;
@@ -403,23 +406,27 @@ int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end,
res.end = end;
res.flags = flags;
orig_end = res.end;
while ((res.start < res.end) &&
(!find_next_iomem_res(&res, name, false))) {
(!find_next_iomem_res(&res, desc, false))) {
ret = (*func)(res.start, res.end, arg);
if (ret)
break;
res.start = res.end + 1;
res.end = orig_end;
}
return ret;
}
/*
* This function calls callback against all memory range of "System RAM"
* which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
* Now, this function is only for "System RAM". This function deals with
* full ranges and not pfn. If resources are not pfn aligned, dealing
* with pfn can truncate ranges.
* This function calls the @func callback against all memory ranges of type
* System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
* Now, this function is only for System RAM, it deals with full ranges and
* not PFNs. If resources are not PFN-aligned, dealing with PFNs can truncate
* ranges.
*/
int walk_system_ram_res(u64 start, u64 end, void *arg,
int (*func)(u64, u64, void *))
@@ -430,10 +437,10 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
res.start = start;
res.end = end;
res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
orig_end = res.end;
while ((res.start < res.end) &&
(!find_next_iomem_res(&res, "System RAM", true))) {
(!find_next_iomem_res(&res, IORES_DESC_NONE, true))) {
ret = (*func)(res.start, res.end, arg);
if (ret)
break;
@@ -446,9 +453,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg,
#if !defined(CONFIG_ARCH_HAS_WALK_MEMORY)
/*
* This function calls callback against all memory range of "System RAM"
* which are marked as IORESOURCE_MEM and IORESOUCE_BUSY.
* Now, this function is only for "System RAM".
* This function calls the @func callback against all memory ranges of type
* System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY.
* It is to be used only for System RAM.
*/
int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg, int (*func)(unsigned long, unsigned long, void *))
@@ -460,10 +467,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages,
res.start = (u64) start_pfn << PAGE_SHIFT;
res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
orig_end = res.end;
while ((res.start < res.end) &&
(find_next_iomem_res(&res, "System RAM", true) >= 0)) {
(find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) {
pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT;
end_pfn = (res.end + 1) >> PAGE_SHIFT;
if (end_pfn > pfn)
@@ -484,7 +491,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg)
}
/*
* This generic page_is_ram() returns true if specified address is
* registered as "System RAM" in iomem_resource list.
* registered as System RAM in iomem_resource list.
*/
int __weak page_is_ram(unsigned long pfn)
{
@@ -496,30 +503,34 @@ EXPORT_SYMBOL_GPL(page_is_ram);
* region_intersects() - determine intersection of region with known resources
* @start: region start address
* @size: size of region
* @name: name of resource (in iomem_resource)
* @flags: flags of resource (in iomem_resource)
* @desc: descriptor of resource (in iomem_resource) or IORES_DESC_NONE
*
* Check if the specified region partially overlaps or fully eclipses a
* resource identified by @name. Return REGION_DISJOINT if the region
* does not overlap @name, return REGION_MIXED if the region overlaps
* @type and another resource, and return REGION_INTERSECTS if the
* region overlaps @type and no other defined resource. Note, that
* REGION_INTERSECTS is also returned in the case when the specified
* region overlaps RAM and undefined memory holes.
* resource identified by @flags and @desc (optional with IORES_DESC_NONE).
* Return REGION_DISJOINT if the region does not overlap @flags/@desc,
* return REGION_MIXED if the region overlaps @flags/@desc and another
* resource, and return REGION_INTERSECTS if the region overlaps @flags/@desc
* and no other defined resource. Note that REGION_INTERSECTS is also
* returned in the case when the specified region overlaps RAM and undefined
* memory holes.
*
* region_intersect() is used by memory remapping functions to ensure
* the user is not remapping RAM and is a vast speed up over walking
* through the resource table page by page.
*/
int region_intersects(resource_size_t start, size_t size, const char *name)
int region_intersects(resource_size_t start, size_t size, unsigned long flags,
unsigned long desc)
{
unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
resource_size_t end = start + size - 1;
int type = 0; int other = 0;
struct resource *p;
read_lock(&resource_lock);
for (p = iomem_resource.child; p ; p = p->sibling) {
bool is_type = strcmp(p->name, name) == 0 && p->flags == flags;
bool is_type = (((p->flags & flags) == flags) &&
((desc == IORES_DESC_NONE) ||
(desc == p->desc)));
if (start >= p->start && start <= p->end)
is_type ? type++ : other++;
@@ -538,6 +549,7 @@ int region_intersects(resource_size_t start, size_t size, const char *name)
return REGION_DISJOINT;
}
EXPORT_SYMBOL_GPL(region_intersects);
void __weak arch_remove_reservations(struct resource *avail)
{
@@ -948,6 +960,7 @@ static void __init __reserve_region_with_split(struct resource *root,
res->start = start;
res->end = end;
res->flags = IORESOURCE_BUSY;
res->desc = IORES_DESC_NONE;
while (1) {
@@ -982,6 +995,7 @@ static void __init __reserve_region_with_split(struct resource *root,
next_res->start = conflict->end + 1;
next_res->end = end;
next_res->flags = IORESOURCE_BUSY;
next_res->desc = IORES_DESC_NONE;
}
} else {
res->start = conflict->end + 1;
@@ -1071,8 +1085,9 @@ struct resource * __request_region(struct resource *parent,
res->name = name;
res->start = start;
res->end = start + n - 1;
res->flags = resource_type(parent);
res->flags = resource_type(parent) | resource_ext_type(parent);
res->flags |= IORESOURCE_BUSY | flags;
res->desc = IORES_DESC_NONE;
write_lock(&resource_lock);
@@ -1238,6 +1253,7 @@ int release_mem_region_adjustable(struct resource *parent,
new_res->start = end + 1;
new_res->end = res->end;
new_res->flags = res->flags;
new_res->desc = res->desc;
new_res->parent = res->parent;
new_res->sibling = res->sibling;
new_res->child = NULL;
@@ -1413,6 +1429,7 @@ static int __init reserve_setup(char *str)
res->start = io_start;
res->end = io_start + io_num - 1;
res->flags = IORESOURCE_BUSY;
res->desc = IORES_DESC_NONE;
res->child = NULL;
if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0)
reserved = x+1;

View File

@@ -13,7 +13,7 @@ endif
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o completion.o idle.o
obj-y += wait.o swait.o completion.o idle.o
obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o

View File

@@ -61,6 +61,7 @@
#include <linux/static_key.h>
#include <linux/workqueue.h>
#include <linux/compiler.h>
#include <linux/tick.h>
/*
* Scheduler clock - returns current time in nanosec units.
@@ -89,6 +90,8 @@ static void __set_sched_clock_stable(void)
{
if (!sched_clock_stable())
static_key_slow_inc(&__sched_clock_stable);
tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
void set_sched_clock_stable(void)
@@ -108,6 +111,8 @@ static void __clear_sched_clock_stable(struct work_struct *work)
/* XXX worry about clock continuity */
if (sched_clock_stable())
static_key_slow_dec(&__sched_clock_stable);
tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
}
static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable);

View File

@@ -26,6 +26,7 @@
* Thomas Gleixner, Mike Kravetz
*/
#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
@@ -66,12 +67,10 @@
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
@@ -124,138 +123,6 @@ const_debug unsigned int sysctl_sched_features =
#undef SCHED_FEAT
#ifdef CONFIG_SCHED_DEBUG
#define SCHED_FEAT(name, enabled) \
#name ,
static const char * const sched_feat_names[] = {
#include "features.h"
};
#undef SCHED_FEAT
static int sched_feat_show(struct seq_file *m, void *v)
{
int i;
for (i = 0; i < __SCHED_FEAT_NR; i++) {
if (!(sysctl_sched_features & (1UL << i)))
seq_puts(m, "NO_");
seq_printf(m, "%s ", sched_feat_names[i]);
}
seq_puts(m, "\n");
return 0;
}
#ifdef HAVE_JUMP_LABEL
#define jump_label_key__true STATIC_KEY_INIT_TRUE
#define jump_label_key__false STATIC_KEY_INIT_FALSE
#define SCHED_FEAT(name, enabled) \
jump_label_key__##enabled ,
struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};
#undef SCHED_FEAT
static void sched_feat_disable(int i)
{
static_key_disable(&sched_feat_keys[i]);
}
static void sched_feat_enable(int i)
{
static_key_enable(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */
static int sched_feat_set(char *cmp)
{
int i;
int neg = 0;
if (strncmp(cmp, "NO_", 3) == 0) {
neg = 1;
cmp += 3;
}
for (i = 0; i < __SCHED_FEAT_NR; i++) {
if (strcmp(cmp, sched_feat_names[i]) == 0) {
if (neg) {
sysctl_sched_features &= ~(1UL << i);
sched_feat_disable(i);
} else {
sysctl_sched_features |= (1UL << i);
sched_feat_enable(i);
}
break;
}
}
return i;
}
static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[64];
char *cmp;
int i;
struct inode *inode;
if (cnt > 63)
cnt = 63;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
buf[cnt] = 0;
cmp = strstrip(buf);
/* Ensure the static_key remains in a consistent state */
inode = file_inode(filp);
inode_lock(inode);
i = sched_feat_set(cmp);
inode_unlock(inode);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
*ppos += cnt;
return cnt;
}
static int sched_feat_open(struct inode *inode, struct file *filp)
{
return single_open(filp, sched_feat_show, NULL);
}
static const struct file_operations sched_feat_fops = {
.open = sched_feat_open,
.write = sched_feat_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static __init int sched_init_debug(void)
{
debugfs_create_file("sched_features", 0644, NULL, NULL,
&sched_feat_fops);
return 0;
}
late_initcall(sched_init_debug);
#endif /* CONFIG_SCHED_DEBUG */
/*
* Number of tasks to iterate in a single balance run.
* Limited because this is done with IRQs disabled.
@@ -453,20 +320,6 @@ static inline void init_hrtick(void)
}
#endif /* CONFIG_SCHED_HRTICK */
/*
* cmpxchg based fetch_or, macro so it works for different integer types
*/
#define fetch_or(ptr, val) \
({ typeof(*(ptr)) __old, __val = *(ptr); \
for (;;) { \
__old = cmpxchg((ptr), __val, __val | (val)); \
if (__old == __val) \
break; \
__val = __old; \
} \
__old; \
})
#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
@@ -715,31 +568,36 @@ static inline bool got_nohz_idle_kick(void)
#endif /* CONFIG_NO_HZ_COMMON */
#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(void)
bool sched_can_stop_tick(struct rq *rq)
{
int fifo_nr_running;
/* Deadline tasks, even if single, need the tick */
if (rq->dl.dl_nr_running)
return false;
/*
* FIFO realtime policy runs the highest priority task. Other runnable
* tasks are of a lower priority. The scheduler tick does nothing.
* FIFO realtime policy runs the highest priority task (after DEADLINE).
* Other runnable tasks are of a lower priority. The scheduler tick
* isn't needed.
*/
if (current->policy == SCHED_FIFO)
fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
if (fifo_nr_running)
return true;
/*
* Round-robin realtime tasks time slice with other tasks at the same
* realtime priority. Is this task the only one at this priority?
* realtime priority.
*/
if (current->policy == SCHED_RR) {
struct sched_rt_entity *rt_se = &current->rt;
return list_is_singular(&rt_se->run_list);
if (rq->rt.rr_nr_running) {
if (rq->rt.rr_nr_running == 1)
return true;
else
return false;
}
/*
* More than one running task need preemption.
* nr_running update is assumed to be visible
* after IPI is sent from wakers.
*/
if (this_rq()->nr_running > 1)
/* Normal multitasking need periodic preemption checks */
if (rq->cfs.nr_running > 1)
return false;
return true;
@@ -2093,7 +1951,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
ttwu_queue(p, cpu);
stat:
ttwu_stat(p, cpu, wake_flags);
if (schedstat_enabled())
ttwu_stat(p, cpu, wake_flags);
out:
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -2141,7 +2000,8 @@ static void try_to_wake_up_local(struct task_struct *p)
ttwu_activate(rq, p, ENQUEUE_WAKEUP);
ttwu_do_wakeup(rq, p, 0);
ttwu_stat(p, smp_processor_id(), 0);
if (schedstat_enabled())
ttwu_stat(p, smp_processor_id(), 0);
out:
raw_spin_unlock(&p->pi_lock);
}
@@ -2183,7 +2043,6 @@ void __dl_clear_params(struct task_struct *p)
dl_se->dl_bw = 0;
dl_se->dl_throttled = 0;
dl_se->dl_new = 1;
dl_se->dl_yielded = 0;
}
@@ -2210,6 +2069,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
#endif
#ifdef CONFIG_SCHEDSTATS
/* Even if schedstat is disabled, there should not be garbage */
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
@@ -2218,6 +2078,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
__dl_clear_params(p);
INIT_LIST_HEAD(&p->rt.run_list);
p->rt.timeout = 0;
p->rt.time_slice = sched_rr_timeslice;
p->rt.on_rq = 0;
p->rt.on_list = 0;
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2281,6 +2145,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write,
#endif
#endif
DEFINE_STATIC_KEY_FALSE(sched_schedstats);
#ifdef CONFIG_SCHEDSTATS
static void set_schedstats(bool enabled)
{
if (enabled)
static_branch_enable(&sched_schedstats);
else
static_branch_disable(&sched_schedstats);
}
void force_schedstat_enabled(void)
{
if (!schedstat_enabled()) {
pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
static_branch_enable(&sched_schedstats);
}
}
static int __init setup_schedstats(char *str)
{
int ret = 0;
if (!str)
goto out;
if (!strcmp(str, "enable")) {
set_schedstats(true);
ret = 1;
} else if (!strcmp(str, "disable")) {
set_schedstats(false);
ret = 1;
}
out:
if (!ret)
pr_warn("Unable to parse schedstats=\n");
return ret;
}
__setup("schedstats=", setup_schedstats);
#ifdef CONFIG_PROC_SYSCTL
int sysctl_schedstats(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
struct ctl_table t;
int err;
int state = static_branch_likely(&sched_schedstats);
if (write && !capable(CAP_SYS_ADMIN))
return -EPERM;
t = *table;
t.data = &state;
err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
if (err < 0)
return err;
if (write)
set_schedstats(state);
return err;
}
#endif
#endif
/*
* fork()/clone()-time setup:
*/
@@ -3010,16 +2937,6 @@ u64 scheduler_tick_max_deferment(void)
}
#endif
notrace unsigned long get_parent_ip(unsigned long addr)
{
if (in_lock_functions(addr)) {
addr = CALLER_ADDR2;
if (in_lock_functions(addr))
addr = CALLER_ADDR3;
}
return addr;
}
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
@@ -3041,7 +2958,7 @@ void preempt_count_add(int val)
PREEMPT_MASK - 10);
#endif
if (preempt_count() == val) {
unsigned long ip = get_parent_ip(CALLER_ADDR1);
unsigned long ip = get_lock_parent_ip();
#ifdef CONFIG_DEBUG_PREEMPT
current->preempt_disable_ip = ip;
#endif
@@ -3068,7 +2985,7 @@ void preempt_count_sub(int val)
#endif
if (preempt_count() == val)
trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
@@ -3280,7 +3197,6 @@ static void __sched notrace __schedule(bool preempt)
trace_sched_switch(preempt, prev, next);
rq = context_switch(rq, prev, next); /* unlocks the rq */
cpu = cpu_of(rq);
} else {
lockdep_unpin_lock(&rq->lock);
raw_spin_unlock_irq(&rq->lock);
@@ -3466,7 +3382,7 @@ EXPORT_SYMBOL(default_wake_function);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
struct rq *rq;
const struct sched_class *prev_class;
@@ -3494,11 +3410,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
if (oldprio == prio)
queue_flag &= ~DEQUEUE_MOVE;
prev_class = p->sched_class;
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE);
dequeue_task(rq, p, queue_flag);
if (running)
put_prev_task(rq, p);
@@ -3516,7 +3436,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (!dl_prio(p->normal_prio) ||
(pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
enqueue_flag |= ENQUEUE_REPLENISH;
queue_flag |= ENQUEUE_REPLENISH;
} else
p->dl.dl_boosted = 0;
p->sched_class = &dl_sched_class;
@@ -3524,7 +3444,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (dl_prio(oldprio))
p->dl.dl_boosted = 0;
if (oldprio < prio)
enqueue_flag |= ENQUEUE_HEAD;
queue_flag |= ENQUEUE_HEAD;
p->sched_class = &rt_sched_class;
} else {
if (dl_prio(oldprio))
@@ -3539,7 +3459,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
if (queued)
enqueue_task(rq, p, enqueue_flag);
enqueue_task(rq, p, queue_flag);
check_class_changed(rq, p, prev_class, oldprio);
out_unlock:
@@ -3895,6 +3815,7 @@ static int __sched_setscheduler(struct task_struct *p,
const struct sched_class *prev_class;
struct rq *rq;
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
/* may grab non-irq protected spin_locks */
BUG_ON(in_interrupt());
@@ -4077,17 +3998,14 @@ change:
* itself.
*/
new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
if (new_effective_prio == oldprio) {
__setscheduler_params(p, attr);
task_rq_unlock(rq, p, &flags);
return 0;
}
if (new_effective_prio == oldprio)
queue_flags &= ~DEQUEUE_MOVE;
}
queued = task_on_rq_queued(p);
running = task_current(rq, p);
if (queued)
dequeue_task(rq, p, DEQUEUE_SAVE);
dequeue_task(rq, p, queue_flags);
if (running)
put_prev_task(rq, p);
@@ -4097,15 +4015,14 @@ change:
if (running)
p->sched_class->set_curr_task(rq);
if (queued) {
int enqueue_flags = ENQUEUE_RESTORE;
/*
* We enqueue to tail when the priority of a task is
* increased (user space view).
*/
if (oldprio <= p->prio)
enqueue_flags |= ENQUEUE_HEAD;
if (oldprio < p->prio)
queue_flags |= ENQUEUE_HEAD;
enqueue_task(rq, p, enqueue_flags);
enqueue_task(rq, p, queue_flags);
}
check_class_changed(rq, p, prev_class, oldprio);
@@ -5096,6 +5013,8 @@ void init_idle(struct task_struct *idle, int cpu)
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
kasan_unpoison_task_stack(idle);
#ifdef CONFIG_SMP
/*
* Its possible that init_idle() gets called multiple times on a task,
@@ -5405,183 +5324,6 @@ static void migrate_tasks(struct rq *dead_rq)
}
#endif /* CONFIG_HOTPLUG_CPU */
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
static struct ctl_table sd_ctl_dir[] = {
{
.procname = "sched_domain",
.mode = 0555,
},
{}
};
static struct ctl_table sd_ctl_root[] = {
{
.procname = "kernel",
.mode = 0555,
.child = sd_ctl_dir,
},
{}
};
static struct ctl_table *sd_alloc_ctl_entry(int n)
{
struct ctl_table *entry =
kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
return entry;
}
static void sd_free_ctl_entry(struct ctl_table **tablep)
{
struct ctl_table *entry;
/*
* In the intermediate directories, both the child directory and
* procname are dynamically allocated and could fail but the mode
* will always be set. In the lowest directory the names are
* static strings and all have proc handlers.
*/
for (entry = *tablep; entry->mode; entry++) {
if (entry->child)
sd_free_ctl_entry(&entry->child);
if (entry->proc_handler == NULL)
kfree(entry->procname);
}
kfree(*tablep);
*tablep = NULL;
}
static int min_load_idx = 0;
static int max_load_idx = CPU_LOAD_IDX_MAX-1;
static void
set_table_entry(struct ctl_table *entry,
const char *procname, void *data, int maxlen,
umode_t mode, proc_handler *proc_handler,
bool load_idx)
{
entry->procname = procname;
entry->data = data;
entry->maxlen = maxlen;
entry->mode = mode;
entry->proc_handler = proc_handler;
if (load_idx) {
entry->extra1 = &min_load_idx;
entry->extra2 = &max_load_idx;
}
}
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
struct ctl_table *table = sd_alloc_ctl_entry(14);
if (table == NULL)
return NULL;
set_table_entry(&table[0], "min_interval", &sd->min_interval,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[1], "max_interval", &sd->max_interval,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[9], "cache_nice_tries",
&sd->cache_nice_tries,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[11], "max_newidle_lb_cost",
&sd->max_newidle_lb_cost,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
/* &table[13] is terminator */
return table;
}
static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
{
struct ctl_table *entry, *table;
struct sched_domain *sd;
int domain_num = 0, i;
char buf[32];
for_each_domain(cpu, sd)
domain_num++;
entry = table = sd_alloc_ctl_entry(domain_num + 1);
if (table == NULL)
return NULL;
i = 0;
for_each_domain(cpu, sd) {
snprintf(buf, 32, "domain%d", i);
entry->procname = kstrdup(buf, GFP_KERNEL);
entry->mode = 0555;
entry->child = sd_alloc_ctl_domain_table(sd);
entry++;
i++;
}
return table;
}
static struct ctl_table_header *sd_sysctl_header;
static void register_sched_domain_sysctl(void)
{
int i, cpu_num = num_possible_cpus();
struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
char buf[32];
WARN_ON(sd_ctl_dir[0].child);
sd_ctl_dir[0].child = entry;
if (entry == NULL)
return;
for_each_possible_cpu(i) {
snprintf(buf, 32, "cpu%d", i);
entry->procname = kstrdup(buf, GFP_KERNEL);
entry->mode = 0555;
entry->child = sd_alloc_ctl_cpu_table(i);
entry++;
}
WARN_ON(sd_sysctl_header);
sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
/* may be called multiple times per register */
static void unregister_sched_domain_sysctl(void)
{
unregister_sysctl_table(sd_sysctl_header);
sd_sysctl_header = NULL;
if (sd_ctl_dir[0].child)
sd_free_ctl_entry(&sd_ctl_dir[0].child);
}
#else
static void register_sched_domain_sysctl(void)
{
}
static void unregister_sched_domain_sysctl(void)
{
}
#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */
static void set_rq_online(struct rq *rq)
{
if (!rq->online) {
@@ -5693,16 +5435,6 @@ static int sched_cpu_active(struct notifier_block *nfb,
set_cpu_rq_start_time();
return NOTIFY_OK;
case CPU_ONLINE:
/*
* At this point a starting CPU has marked itself as online via
* set_cpu_online(). But it might not yet have marked itself
* as active, which is essential from here on.
*/
set_cpu_active(cpu, true);
stop_machine_unpark(cpu);
return NOTIFY_OK;
case CPU_DOWN_FAILED:
set_cpu_active(cpu, true);
return NOTIFY_OK;
@@ -6174,11 +5906,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
/* Setup the mask of cpus configured for isolated domains */
static int __init isolated_cpu_setup(char *str)
{
int ret;
alloc_bootmem_cpumask_var(&cpu_isolated_map);
cpulist_parse(str, cpu_isolated_map);
ret = cpulist_parse(str, cpu_isolated_map);
if (ret) {
pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids);
return 0;
}
return 1;
}
__setup("isolcpus=", isolated_cpu_setup);
struct s_data {
@@ -7889,7 +7626,7 @@ void sched_move_task(struct task_struct *tsk)
queued = task_on_rq_queued(tsk);
if (queued)
dequeue_task(rq, tsk, DEQUEUE_SAVE);
dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
if (unlikely(running))
put_prev_task(rq, tsk);
@@ -7913,7 +7650,7 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
if (queued)
enqueue_task(rq, tsk, ENQUEUE_RESTORE);
enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
task_rq_unlock(rq, tsk, &flags);
}

View File

@@ -262,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void)
#ifdef CONFIG_PARAVIRT
if (static_key_false(&paravirt_steal_enabled)) {
u64 steal;
cputime_t steal_ct;
unsigned long steal_jiffies;
steal = paravirt_steal_clock(smp_processor_id());
steal -= this_rq()->prev_steal_time;
/*
* cputime_t may be less precise than nsecs (eg: if it's
* based on jiffies). Lets cast the result to cputime
* steal is in nsecs but our caller is expecting steal
* time in jiffies. Lets cast the result to jiffies
* granularity and account the rest on the next rounds.
*/
steal_ct = nsecs_to_cputime(steal);
this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct);
steal_jiffies = nsecs_to_jiffies(steal);
this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
account_steal_time(steal_ct);
return steal_ct;
account_steal_time(jiffies_to_cputime(steal_jiffies));
return steal_jiffies;
}
#endif
return false;
@@ -668,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime
#endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
static unsigned long long vtime_delta(struct task_struct *tsk)
static cputime_t vtime_delta(struct task_struct *tsk)
{
unsigned long long clock;
unsigned long now = READ_ONCE(jiffies);
clock = local_clock();
if (clock < tsk->vtime_snap)
if (time_before(now, (unsigned long)tsk->vtime_snap))
return 0;
return clock - tsk->vtime_snap;
return jiffies_to_cputime(now - tsk->vtime_snap);
}
static cputime_t get_vtime_delta(struct task_struct *tsk)
{
unsigned long long delta = vtime_delta(tsk);
unsigned long now = READ_ONCE(jiffies);
unsigned long delta = now - tsk->vtime_snap;
WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);
tsk->vtime_snap += delta;
tsk->vtime_snap = now;
/* CHECKME: always safe to convert nsecs to cputime? */
return nsecs_to_cputime(delta);
return jiffies_to_cputime(delta);
}
static void __vtime_account_system(struct task_struct *tsk)
@@ -699,6 +698,9 @@ static void __vtime_account_system(struct task_struct *tsk)
void vtime_account_system(struct task_struct *tsk)
{
if (!vtime_delta(tsk))
return;
write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
write_seqcount_end(&tsk->vtime_seqcount);
@@ -707,7 +709,8 @@ void vtime_account_system(struct task_struct *tsk)
void vtime_gen_account_irq_exit(struct task_struct *tsk)
{
write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
if (vtime_delta(tsk))
__vtime_account_system(tsk);
if (context_tracking_in_user())
tsk->vtime_snap_whence = VTIME_USER;
write_seqcount_end(&tsk->vtime_seqcount);
@@ -718,16 +721,19 @@ void vtime_account_user(struct task_struct *tsk)
cputime_t delta_cpu;
write_seqcount_begin(&tsk->vtime_seqcount);
delta_cpu = get_vtime_delta(tsk);
tsk->vtime_snap_whence = VTIME_SYS;
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
if (vtime_delta(tsk)) {
delta_cpu = get_vtime_delta(tsk);
account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
}
write_seqcount_end(&tsk->vtime_seqcount);
}
void vtime_user_enter(struct task_struct *tsk)
{
write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
if (vtime_delta(tsk))
__vtime_account_system(tsk);
tsk->vtime_snap_whence = VTIME_USER;
write_seqcount_end(&tsk->vtime_seqcount);
}
@@ -742,7 +748,8 @@ void vtime_guest_enter(struct task_struct *tsk)
* that can thus safely catch up with a tickless delta.
*/
write_seqcount_begin(&tsk->vtime_seqcount);
__vtime_account_system(tsk);
if (vtime_delta(tsk))
__vtime_account_system(tsk);
current->flags |= PF_VCPU;
write_seqcount_end(&tsk->vtime_seqcount);
}
@@ -772,7 +779,7 @@ void arch_vtime_task_switch(struct task_struct *prev)
write_seqcount_begin(&current->vtime_seqcount);
current->vtime_snap_whence = VTIME_SYS;
current->vtime_snap = sched_clock_cpu(smp_processor_id());
current->vtime_snap = jiffies;
write_seqcount_end(&current->vtime_seqcount);
}
@@ -783,7 +790,7 @@ void vtime_init_idle(struct task_struct *t, int cpu)
local_irq_save(flags);
write_seqcount_begin(&t->vtime_seqcount);
t->vtime_snap_whence = VTIME_SYS;
t->vtime_snap = sched_clock_cpu(cpu);
t->vtime_snap = jiffies;
write_seqcount_end(&t->vtime_seqcount);
local_irq_restore(flags);
}

View File

@@ -352,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
WARN_ON(!dl_se->dl_new || dl_se->dl_throttled);
WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline));
/*
* We are racing with the deadline timer. So, do nothing because
* the deadline timer handler will take care of properly recharging
* the runtime and postponing the deadline
*/
if (dl_se->dl_throttled)
return;
/*
* We use the regular wall clock time to set deadlines in the
@@ -361,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se,
*/
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
dl_se->dl_new = 0;
}
/*
@@ -503,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se,
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
/*
* The arrival of a new instance needs special treatment, i.e.,
* the actual scheduling parameters have to be "renewed".
*/
if (dl_se->dl_new) {
setup_new_dl_entity(dl_se, pi_se);
return;
}
if (dl_time_before(dl_se->deadline, rq_clock(rq)) ||
dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) {
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
@@ -607,16 +605,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
goto unlock;
}
/*
* This is possible if switched_from_dl() raced against a running
* callback that took the above !dl_task() path and we've since then
* switched back into SCHED_DEADLINE.
*
* There's nothing to do except drop our task reference.
*/
if (dl_se->dl_new)
goto unlock;
/*
* The task might have been boosted by someone else and might be in the
* boosting/deboosting path, its not throttled.
@@ -925,7 +913,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se,
* parameters of the task might need updating. Otherwise,
* we want a replenishment of its runtime.
*/
if (dl_se->dl_new || flags & ENQUEUE_WAKEUP)
if (flags & ENQUEUE_WAKEUP)
update_dl_entity(dl_se, pi_se);
else if (flags & ENQUEUE_REPLENISH)
replenish_dl_entity(dl_se, pi_se);
@@ -1726,6 +1714,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
*/
static void switched_to_dl(struct rq *rq, struct task_struct *p)
{
if (dl_time_before(p->dl.deadline, rq_clock(rq)))
setup_new_dl_entity(&p->dl, &p->dl);
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
@@ -1772,8 +1763,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
*/
resched_curr(rq);
#endif /* CONFIG_SMP */
} else
switched_to_dl(rq, p);
}
}
const struct sched_class dl_sched_class = {

View File

@@ -16,6 +16,7 @@
#include <linux/kallsyms.h>
#include <linux/utsname.h>
#include <linux/mempolicy.h>
#include <linux/debugfs.h>
#include "sched.h"
@@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec)
#define SPLIT_NS(x) nsec_high(x), nsec_low(x)
#define SCHED_FEAT(name, enabled) \
#name ,
static const char * const sched_feat_names[] = {
#include "features.h"
};
#undef SCHED_FEAT
static int sched_feat_show(struct seq_file *m, void *v)
{
int i;
for (i = 0; i < __SCHED_FEAT_NR; i++) {
if (!(sysctl_sched_features & (1UL << i)))
seq_puts(m, "NO_");
seq_printf(m, "%s ", sched_feat_names[i]);
}
seq_puts(m, "\n");
return 0;
}
#ifdef HAVE_JUMP_LABEL
#define jump_label_key__true STATIC_KEY_INIT_TRUE
#define jump_label_key__false STATIC_KEY_INIT_FALSE
#define SCHED_FEAT(name, enabled) \
jump_label_key__##enabled ,
struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
#include "features.h"
};
#undef SCHED_FEAT
static void sched_feat_disable(int i)
{
static_key_disable(&sched_feat_keys[i]);
}
static void sched_feat_enable(int i)
{
static_key_enable(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
static void sched_feat_enable(int i) { };
#endif /* HAVE_JUMP_LABEL */
static int sched_feat_set(char *cmp)
{
int i;
int neg = 0;
if (strncmp(cmp, "NO_", 3) == 0) {
neg = 1;
cmp += 3;
}
for (i = 0; i < __SCHED_FEAT_NR; i++) {
if (strcmp(cmp, sched_feat_names[i]) == 0) {
if (neg) {
sysctl_sched_features &= ~(1UL << i);
sched_feat_disable(i);
} else {
sysctl_sched_features |= (1UL << i);
sched_feat_enable(i);
}
break;
}
}
return i;
}
static ssize_t
sched_feat_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
char buf[64];
char *cmp;
int i;
struct inode *inode;
if (cnt > 63)
cnt = 63;
if (copy_from_user(&buf, ubuf, cnt))
return -EFAULT;
buf[cnt] = 0;
cmp = strstrip(buf);
/* Ensure the static_key remains in a consistent state */
inode = file_inode(filp);
inode_lock(inode);
i = sched_feat_set(cmp);
inode_unlock(inode);
if (i == __SCHED_FEAT_NR)
return -EINVAL;
*ppos += cnt;
return cnt;
}
static int sched_feat_open(struct inode *inode, struct file *filp)
{
return single_open(filp, sched_feat_show, NULL);
}
static const struct file_operations sched_feat_fops = {
.open = sched_feat_open,
.write = sched_feat_write,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
static __init int sched_init_debug(void)
{
debugfs_create_file("sched_features", 0644, NULL, NULL,
&sched_feat_fops);
return 0;
}
late_initcall(sched_init_debug);
#ifdef CONFIG_SMP
#ifdef CONFIG_SYSCTL
static struct ctl_table sd_ctl_dir[] = {
{
.procname = "sched_domain",
.mode = 0555,
},
{}
};
static struct ctl_table sd_ctl_root[] = {
{
.procname = "kernel",
.mode = 0555,
.child = sd_ctl_dir,
},
{}
};
static struct ctl_table *sd_alloc_ctl_entry(int n)
{
struct ctl_table *entry =
kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL);
return entry;
}
static void sd_free_ctl_entry(struct ctl_table **tablep)
{
struct ctl_table *entry;
/*
* In the intermediate directories, both the child directory and
* procname are dynamically allocated and could fail but the mode
* will always be set. In the lowest directory the names are
* static strings and all have proc handlers.
*/
for (entry = *tablep; entry->mode; entry++) {
if (entry->child)
sd_free_ctl_entry(&entry->child);
if (entry->proc_handler == NULL)
kfree(entry->procname);
}
kfree(*tablep);
*tablep = NULL;
}
static int min_load_idx = 0;
static int max_load_idx = CPU_LOAD_IDX_MAX-1;
static void
set_table_entry(struct ctl_table *entry,
const char *procname, void *data, int maxlen,
umode_t mode, proc_handler *proc_handler,
bool load_idx)
{
entry->procname = procname;
entry->data = data;
entry->maxlen = maxlen;
entry->mode = mode;
entry->proc_handler = proc_handler;
if (load_idx) {
entry->extra1 = &min_load_idx;
entry->extra2 = &max_load_idx;
}
}
static struct ctl_table *
sd_alloc_ctl_domain_table(struct sched_domain *sd)
{
struct ctl_table *table = sd_alloc_ctl_entry(14);
if (table == NULL)
return NULL;
set_table_entry(&table[0], "min_interval", &sd->min_interval,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[1], "max_interval", &sd->max_interval,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[2], "busy_idx", &sd->busy_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[3], "idle_idx", &sd->idle_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[5], "wake_idx", &sd->wake_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx,
sizeof(int), 0644, proc_dointvec_minmax, true);
set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[9], "cache_nice_tries",
&sd->cache_nice_tries,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[10], "flags", &sd->flags,
sizeof(int), 0644, proc_dointvec_minmax, false);
set_table_entry(&table[11], "max_newidle_lb_cost",
&sd->max_newidle_lb_cost,
sizeof(long), 0644, proc_doulongvec_minmax, false);
set_table_entry(&table[12], "name", sd->name,
CORENAME_MAX_SIZE, 0444, proc_dostring, false);
/* &table[13] is terminator */
return table;
}
static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
{
struct ctl_table *entry, *table;
struct sched_domain *sd;
int domain_num = 0, i;
char buf[32];
for_each_domain(cpu, sd)
domain_num++;
entry = table = sd_alloc_ctl_entry(domain_num + 1);
if (table == NULL)
return NULL;
i = 0;
for_each_domain(cpu, sd) {
snprintf(buf, 32, "domain%d", i);
entry->procname = kstrdup(buf, GFP_KERNEL);
entry->mode = 0555;
entry->child = sd_alloc_ctl_domain_table(sd);
entry++;
i++;
}
return table;
}
static struct ctl_table_header *sd_sysctl_header;
void register_sched_domain_sysctl(void)
{
int i, cpu_num = num_possible_cpus();
struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1);
char buf[32];
WARN_ON(sd_ctl_dir[0].child);
sd_ctl_dir[0].child = entry;
if (entry == NULL)
return;
for_each_possible_cpu(i) {
snprintf(buf, 32, "cpu%d", i);
entry->procname = kstrdup(buf, GFP_KERNEL);
entry->mode = 0555;
entry->child = sd_alloc_ctl_cpu_table(i);
entry++;
}
WARN_ON(sd_sysctl_header);
sd_sysctl_header = register_sysctl_table(sd_ctl_root);
}
/* may be called multiple times per register */
void unregister_sched_domain_sysctl(void)
{
unregister_sysctl_table(sd_sysctl_header);
sd_sysctl_header = NULL;
if (sd_ctl_dir[0].child)
sd_free_ctl_entry(&sd_ctl_dir[0].child);
}
#endif /* CONFIG_SYSCTL */
#endif /* CONFIG_SMP */
#ifdef CONFIG_FAIR_GROUP_SCHED
static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg)
{
@@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
PN(se->vruntime);
PN(se->sum_exec_runtime);
#ifdef CONFIG_SCHEDSTATS
PN(se->statistics.wait_start);
PN(se->statistics.sleep_start);
PN(se->statistics.block_start);
PN(se->statistics.sleep_max);
PN(se->statistics.block_max);
PN(se->statistics.exec_max);
PN(se->statistics.slice_max);
PN(se->statistics.wait_max);
PN(se->statistics.wait_sum);
P(se->statistics.wait_count);
if (schedstat_enabled()) {
PN(se->statistics.wait_start);
PN(se->statistics.sleep_start);
PN(se->statistics.block_start);
PN(se->statistics.sleep_max);
PN(se->statistics.block_max);
PN(se->statistics.exec_max);
PN(se->statistics.slice_max);
PN(se->statistics.wait_max);
PN(se->statistics.wait_sum);
P(se->statistics.wait_count);
}
#endif
P(se->load.weight);
#ifdef CONFIG_SMP
@@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
(long long)(p->nvcsw + p->nivcsw),
p->prio);
#ifdef CONFIG_SCHEDSTATS
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
SPLIT_NS(p->se.statistics.wait_sum),
SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(p->se.statistics.sum_sleep_runtime));
if (schedstat_enabled()) {
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
SPLIT_NS(p->se.statistics.wait_sum),
SPLIT_NS(p->se.sum_exec_runtime),
SPLIT_NS(p->se.statistics.sum_sleep_runtime));
}
#else
SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld",
0LL, 0L,
@@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
{
struct dl_bw *dl_bw;
SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
#ifdef CONFIG_SMP
dl_bw = &cpu_rq(cpu)->rd->dl_bw;
#else
dl_bw = &dl_rq->dl_bw;
#endif
SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw);
SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw);
}
extern __read_mostly int sched_clock_running;
@@ -313,17 +630,18 @@ do { \
#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
P(yld_count);
P(sched_count);
P(sched_goidle);
#ifdef CONFIG_SMP
P64(avg_idle);
P64(max_idle_balance_cost);
#endif
P(ttwu_count);
P(ttwu_local);
if (schedstat_enabled()) {
P(yld_count);
P(sched_count);
P(sched_goidle);
P(ttwu_count);
P(ttwu_local);
}
#undef P
#undef P64
@@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
nr_switches = p->nvcsw + p->nivcsw;
#ifdef CONFIG_SCHEDSTATS
PN(se.statistics.sum_sleep_runtime);
PN(se.statistics.wait_start);
PN(se.statistics.sleep_start);
PN(se.statistics.block_start);
PN(se.statistics.sleep_max);
PN(se.statistics.block_max);
PN(se.statistics.exec_max);
PN(se.statistics.slice_max);
PN(se.statistics.wait_max);
PN(se.statistics.wait_sum);
P(se.statistics.wait_count);
PN(se.statistics.iowait_sum);
P(se.statistics.iowait_count);
P(se.nr_migrations);
P(se.statistics.nr_migrations_cold);
P(se.statistics.nr_failed_migrations_affine);
P(se.statistics.nr_failed_migrations_running);
P(se.statistics.nr_failed_migrations_hot);
P(se.statistics.nr_forced_migrations);
P(se.statistics.nr_wakeups);
P(se.statistics.nr_wakeups_sync);
P(se.statistics.nr_wakeups_migrate);
P(se.statistics.nr_wakeups_local);
P(se.statistics.nr_wakeups_remote);
P(se.statistics.nr_wakeups_affine);
P(se.statistics.nr_wakeups_affine_attempts);
P(se.statistics.nr_wakeups_passive);
P(se.statistics.nr_wakeups_idle);
{
if (schedstat_enabled()) {
u64 avg_atom, avg_per_cpu;
PN(se.statistics.sum_sleep_runtime);
PN(se.statistics.wait_start);
PN(se.statistics.sleep_start);
PN(se.statistics.block_start);
PN(se.statistics.sleep_max);
PN(se.statistics.block_max);
PN(se.statistics.exec_max);
PN(se.statistics.slice_max);
PN(se.statistics.wait_max);
PN(se.statistics.wait_sum);
P(se.statistics.wait_count);
PN(se.statistics.iowait_sum);
P(se.statistics.iowait_count);
P(se.statistics.nr_migrations_cold);
P(se.statistics.nr_failed_migrations_affine);
P(se.statistics.nr_failed_migrations_running);
P(se.statistics.nr_failed_migrations_hot);
P(se.statistics.nr_forced_migrations);
P(se.statistics.nr_wakeups);
P(se.statistics.nr_wakeups_sync);
P(se.statistics.nr_wakeups_migrate);
P(se.statistics.nr_wakeups_local);
P(se.statistics.nr_wakeups_remote);
P(se.statistics.nr_wakeups_affine);
P(se.statistics.nr_wakeups_affine_attempts);
P(se.statistics.nr_wakeups_passive);
P(se.statistics.nr_wakeups_idle);
avg_atom = p->se.sum_exec_runtime;
if (nr_switches)
avg_atom = div64_ul(avg_atom, nr_switches);

View File

@@ -20,8 +20,8 @@
* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
*/
#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/latencytop.h>
#include <linux/cpumask.h>
#include <linux/cpuidle.h>
#include <linux/slab.h>
@@ -755,7 +755,9 @@ static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
struct task_struct *p;
u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
u64 delta;
delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start;
if (entity_is_task(se)) {
p = task_of(se);
@@ -776,22 +778,12 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
se->statistics.wait_sum += delta;
se->statistics.wait_start = 0;
}
#else
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
#endif
/*
* Task is being enqueued - update stats:
*/
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
static inline void
update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
/*
* Are we enqueueing a waiting task? (for current tasks
@@ -802,7 +794,7 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
/*
* Mark the end of the wait period if dequeueing a
@@ -810,7 +802,40 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
*/
if (se != cfs_rq->curr)
update_stats_wait_end(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
if (tsk->state & TASK_UNINTERRUPTIBLE)
se->statistics.block_start = rq_clock(rq_of(cfs_rq));
}
}
}
#else
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
static inline void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
static inline void
update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
}
static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
}
#endif
/*
* We are picking a new current task - update its stats:
@@ -907,10 +932,11 @@ struct numa_group {
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
int active_nodes;
struct rcu_head rcu;
nodemask_t active_nodes;
unsigned long total_faults;
unsigned long max_faults_cpu;
/*
* Faults_cpu is used to decide whether memory should move
* towards the CPU. As a consequence, these stats are weighted
@@ -969,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
}
/*
* A node triggering more than 1/3 as many NUMA faults as the maximum is
* considered part of a numa group's pseudo-interleaving set. Migrations
* between these nodes are slowed down, to allow things to settle down.
*/
#define ACTIVE_NODE_FRACTION 3
static bool numa_is_active_node(int nid, struct numa_group *ng)
{
return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu;
}
/* Handle placement on systems where not all nodes are directly connected. */
static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
int maxdist, bool task)
@@ -1118,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
return true;
/*
* Do not migrate if the destination is not a node that
* is actively used by this numa group.
* Destination node is much more heavily used than the source
* node? Allow migration.
*/
if (!node_isset(dst_nid, ng->active_nodes))
return false;
/*
* Source is a node that is not actively used by this
* numa group, while the destination is. Migrate.
*/
if (!node_isset(src_nid, ng->active_nodes))
if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) *
ACTIVE_NODE_FRACTION)
return true;
/*
* Both source and destination are nodes in active
* use by this numa group. Maximize memory bandwidth
* by migrating from more heavily used groups, to less
* heavily used ones, spreading the load around.
* Use a 1/4 hysteresis to avoid spurious page movement.
* Distribute memory according to CPU & memory use on each node,
* with 3/4 hysteresis to avoid unnecessary memory migrations:
*
* faults_cpu(dst) 3 faults_cpu(src)
* --------------- * - > ---------------
* faults_mem(dst) 4 faults_mem(src)
*/
return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 >
group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4;
}
static unsigned long weighted_cpuload(const int cpu);
@@ -1484,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p)
.best_task = NULL,
.best_imp = 0,
.best_cpu = -1
.best_cpu = -1,
};
struct sched_domain *sd;
unsigned long taskweight, groupweight;
@@ -1536,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p)
* multiple NUMA nodes; in order to better consolidate the group,
* we need to check other locations.
*/
if (env.best_cpu == -1 || (p->numa_group &&
nodes_weight(p->numa_group->active_nodes) > 1)) {
if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
@@ -1572,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p)
* trying for a better one later. Do not set the preferred node here.
*/
if (p->numa_group) {
struct numa_group *ng = p->numa_group;
if (env.best_cpu == -1)
nid = env.src_nid;
else
nid = env.dst_nid;
if (node_isset(nid, p->numa_group->active_nodes))
if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng))
sched_setnuma(p, env.dst_nid);
}
@@ -1627,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p)
}
/*
* Find the nodes on which the workload is actively running. We do this by
* Find out how many nodes on the workload is actively running on. Do this by
* tracking the nodes from which NUMA hinting faults are triggered. This can
* be different from the set of nodes where the workload's memory is currently
* located.
*
* The bitmask is used to make smarter decisions on when to do NUMA page
* migrations, To prevent flip-flopping, and excessive page migrations, nodes
* are added when they cause over 6/16 of the maximum number of faults, but
* only removed when they drop below 3/16.
*/
static void update_numa_active_node_mask(struct numa_group *numa_group)
static void numa_group_count_active_nodes(struct numa_group *numa_group)
{
unsigned long faults, max_faults = 0;
int nid;
int nid, active_nodes = 0;
for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid);
@@ -1650,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group)
for_each_online_node(nid) {
faults = group_faults_cpu(numa_group, nid);
if (!node_isset(nid, numa_group->active_nodes)) {
if (faults > max_faults * 6 / 16)
node_set(nid, numa_group->active_nodes);
} else if (faults < max_faults * 3 / 16)
node_clear(nid, numa_group->active_nodes);
if (faults * ACTIVE_NODE_FRACTION > max_faults)
active_nodes++;
}
numa_group->max_faults_cpu = max_faults;
numa_group->active_nodes = active_nodes;
}
/*
@@ -1946,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p)
update_task_scan_period(p, fault_types[0], fault_types[1]);
if (p->numa_group) {
update_numa_active_node_mask(p->numa_group);
numa_group_count_active_nodes(p->numa_group);
spin_unlock_irq(group_lock);
max_nid = preferred_group_nid(p, max_group_nid);
}
@@ -1990,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
return;
atomic_set(&grp->refcount, 1);
grp->active_nodes = 1;
grp->max_faults_cpu = 0;
spin_lock_init(&grp->lock);
grp->gid = p->pid;
/* Second half of the array tracks nids where faults happen */
grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
nr_node_ids;
node_set(task_node(current), grp->active_nodes);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
grp->faults[i] = p->numa_faults[i];
@@ -2111,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
bool migrated = flags & TNF_MIGRATED;
int cpu_node = task_node(current);
int local = !!(flags & TNF_FAULT_LOCAL);
struct numa_group *ng;
int priv;
if (!static_branch_likely(&sched_numa_balancing))
@@ -2151,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
* actively using should be counted as local. This allows the
* scan rate to slow down when a workload has settled down.
*/
if (!priv && !local && p->numa_group &&
node_isset(cpu_node, p->numa_group->active_nodes) &&
node_isset(mem_node, p->numa_group->active_nodes))
ng = p->numa_group;
if (!priv && !local && ng && ng->active_nodes > 1 &&
numa_is_active_node(cpu_node, ng) &&
numa_is_active_node(mem_node, ng))
local = 1;
task_numa_placement(p);
@@ -3102,6 +3134,26 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
static inline void check_schedstat_required(void)
{
#ifdef CONFIG_SCHEDSTATS
if (schedstat_enabled())
return;
/* Force schedstat enabled if a dependent tracepoint is active */
if (trace_sched_stat_wait_enabled() ||
trace_sched_stat_sleep_enabled() ||
trace_sched_stat_iowait_enabled() ||
trace_sched_stat_blocked_enabled() ||
trace_sched_stat_runtime_enabled()) {
pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, "
"stat_blocked and stat_runtime require the "
"kernel parameter schedstats=enabled or "
"kernel.sched_schedstats=1\n");
}
#endif
}
static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
@@ -3122,11 +3174,15 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
enqueue_sleeper(cfs_rq, se);
if (schedstat_enabled())
enqueue_sleeper(cfs_rq, se);
}
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
check_schedstat_required();
if (schedstat_enabled()) {
update_stats_enqueue(cfs_rq, se);
check_spread(cfs_rq, se);
}
if (se != cfs_rq->curr)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
@@ -3193,19 +3249,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
update_curr(cfs_rq);
dequeue_entity_load_avg(cfs_rq, se);
update_stats_dequeue(cfs_rq, se);
if (flags & DEQUEUE_SLEEP) {
#ifdef CONFIG_SCHEDSTATS
if (entity_is_task(se)) {
struct task_struct *tsk = task_of(se);
if (tsk->state & TASK_INTERRUPTIBLE)
se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
if (tsk->state & TASK_UNINTERRUPTIBLE)
se->statistics.block_start = rq_clock(rq_of(cfs_rq));
}
#endif
}
if (schedstat_enabled())
update_stats_dequeue(cfs_rq, se, flags);
clear_buddies(cfs_rq, se);
@@ -3279,7 +3324,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* a CPU. So account for the time it spent waiting on the
* runqueue.
*/
update_stats_wait_end(cfs_rq, se);
if (schedstat_enabled())
update_stats_wait_end(cfs_rq, se);
__dequeue_entity(cfs_rq, se);
update_load_avg(se, 1);
}
@@ -3292,7 +3338,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
* least twice that of our own weight (i.e. dont track it
* when there are only lesser-weight tasks around):
*/
if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
se->statistics.slice_max = max(se->statistics.slice_max,
se->sum_exec_runtime - se->prev_sum_exec_runtime);
}
@@ -3375,9 +3421,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
/* throttle cfs_rqs exceeding runtime */
check_cfs_rq_runtime(cfs_rq);
check_spread(cfs_rq, prev);
if (schedstat_enabled()) {
check_spread(cfs_rq, prev);
if (prev->on_rq)
update_stats_wait_start(cfs_rq, prev);
}
if (prev->on_rq) {
update_stats_wait_start(cfs_rq, prev);
/* Put 'current' back into the tree. */
__enqueue_entity(cfs_rq, prev);
/* in !on_rq case, update occurred at dequeue */
@@ -4459,9 +4509,17 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
/* scale is effectively 1 << i now, and >> i divides by scale */
old_load = this_rq->cpu_load[i] - tickless_load;
old_load = this_rq->cpu_load[i];
old_load = decay_load_missed(old_load, pending_updates - 1, i);
old_load += tickless_load;
if (tickless_load) {
old_load -= decay_load_missed(tickless_load, pending_updates - 1, i);
/*
* old_load can never be a negative value because a
* decayed tickless_load cannot be greater than the
* original tickless_load.
*/
old_load += tickless_load;
}
new_load = this_load;
/*
* Round up the averaging division if load is increasing. This
@@ -4484,6 +4542,25 @@ static unsigned long weighted_cpuload(const int cpu)
}
#ifdef CONFIG_NO_HZ_COMMON
static void __update_cpu_load_nohz(struct rq *this_rq,
unsigned long curr_jiffies,
unsigned long load,
int active)
{
unsigned long pending_updates;
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
if (pending_updates) {
this_rq->last_load_update_tick = curr_jiffies;
/*
* In the regular NOHZ case, we were idle, this means load 0.
* In the NOHZ_FULL case, we were non-idle, we should consider
* its weighted load.
*/
__update_cpu_load(this_rq, load, pending_updates, active);
}
}
/*
* There is no sane way to deal with nohz on smp when using jiffies because the
* cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
@@ -4501,22 +4578,15 @@ static unsigned long weighted_cpuload(const int cpu)
* Called from nohz_idle_balance() to update the load ratings before doing the
* idle balance.
*/
static void update_idle_cpu_load(struct rq *this_rq)
static void update_cpu_load_idle(struct rq *this_rq)
{
unsigned long curr_jiffies = READ_ONCE(jiffies);
unsigned long load = weighted_cpuload(cpu_of(this_rq));
unsigned long pending_updates;
/*
* bail if there's load or we're actually up-to-date.
*/
if (load || curr_jiffies == this_rq->last_load_update_tick)
if (weighted_cpuload(cpu_of(this_rq)))
return;
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
this_rq->last_load_update_tick = curr_jiffies;
__update_cpu_load(this_rq, load, pending_updates, 0);
__update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0);
}
/*
@@ -4527,22 +4597,12 @@ void update_cpu_load_nohz(int active)
struct rq *this_rq = this_rq();
unsigned long curr_jiffies = READ_ONCE(jiffies);
unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0;
unsigned long pending_updates;
if (curr_jiffies == this_rq->last_load_update_tick)
return;
raw_spin_lock(&this_rq->lock);
pending_updates = curr_jiffies - this_rq->last_load_update_tick;
if (pending_updates) {
this_rq->last_load_update_tick = curr_jiffies;
/*
* In the regular NOHZ case, we were idle, this means load 0.
* In the NOHZ_FULL case, we were non-idle, we should consider
* its weighted load.
*/
__update_cpu_load(this_rq, load, pending_updates, active);
}
__update_cpu_load_nohz(this_rq, curr_jiffies, load, active);
raw_spin_unlock(&this_rq->lock);
}
#endif /* CONFIG_NO_HZ */
@@ -4554,7 +4614,7 @@ void update_cpu_load_active(struct rq *this_rq)
{
unsigned long load = weighted_cpuload(cpu_of(this_rq));
/*
* See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
* See the mess around update_cpu_load_idle() / update_cpu_load_nohz().
*/
this_rq->last_load_update_tick = jiffies;
__update_cpu_load(this_rq, load, 1, 1);
@@ -7848,7 +7908,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
if (time_after_eq(jiffies, rq->next_balance)) {
raw_spin_lock_irq(&rq->lock);
update_rq_clock(rq);
update_idle_cpu_load(rq);
update_cpu_load_idle(rq);
raw_spin_unlock_irq(&rq->lock);
rebalance_domains(rq, CPU_IDLE);
}

View File

@@ -4,6 +4,7 @@
#include <linux/sched.h>
#include <linux/cpu.h>
#include <linux/cpuidle.h>
#include <linux/cpuhotplug.h>
#include <linux/tick.h>
#include <linux/mm.h>
#include <linux/stackprotector.h>
@@ -193,8 +194,6 @@ exit_idle:
rcu_idle_exit();
}
DEFINE_PER_CPU(bool, cpu_dead_idle);
/*
* Generic idle loop implementation
*
@@ -221,10 +220,7 @@ static void cpu_idle_loop(void)
rmb();
if (cpu_is_offline(smp_processor_id())) {
rcu_cpu_notify(NULL, CPU_DYING_IDLE,
(void *)(long)smp_processor_id());
smp_mb(); /* all activity before dead. */
this_cpu_write(cpu_dead_idle, true);
cpuhp_report_idle_dead();
arch_cpu_idle_dead();
}
@@ -291,5 +287,6 @@ void cpu_startup_entry(enum cpuhp_state state)
boot_init_stack_canary();
#endif
arch_cpu_idle_prepare();
cpuhp_online_idle(state);
cpu_idle_loop();
}

View File

@@ -58,7 +58,15 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_lock(&rt_b->rt_runtime_lock);
if (!rt_b->rt_period_active) {
rt_b->rt_period_active = 1;
hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
/*
* SCHED_DEADLINE updates the bandwidth, as a run away
* RT task with a DL task could hog a CPU. But DL does
* not reset the period. If a deadline task was running
* without an RT task running, it can cause RT tasks to
* throttle when they start up. Kick the timer right away
* to update the period.
*/
hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
}
raw_spin_unlock(&rt_b->rt_runtime_lock);
@@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
return !list_empty(&rt_se->run_list);
return rt_se->on_rq;
}
#ifdef CONFIG_RT_GROUP_SCHED
@@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
return rt_se->my_q;
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
@@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
if (!rt_se)
enqueue_top_rt_rq(rt_rq);
else if (!on_rt_rq(rt_se))
enqueue_rt_entity(rt_se, false);
enqueue_rt_entity(rt_se, 0);
if (rt_rq->highest_prio.curr < curr->prio)
resched_curr(rq);
@@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
if (!rt_se)
dequeue_top_rt_rq(rt_rq);
else if (on_rt_rq(rt_se))
dequeue_rt_entity(rt_se);
dequeue_rt_entity(rt_se, 0);
}
static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -1141,6 +1149,20 @@ unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
return 1;
}
static inline
unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
{
struct rt_rq *group_rq = group_rt_rq(rt_se);
struct task_struct *tsk;
if (group_rq)
return group_rq->rr_nr_running;
tsk = rt_task_of(rt_se);
return (tsk->policy == SCHED_RR) ? 1 : 0;
}
static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
@@ -1148,6 +1170,7 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
WARN_ON(!rt_prio(prio));
rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
inc_rt_prio(rt_rq, prio);
inc_rt_migration(rt_se, rt_rq);
@@ -1160,13 +1183,37 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
WARN_ON(!rt_prio(rt_se_prio(rt_se)));
WARN_ON(!rt_rq->rt_nr_running);
rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
dec_rt_prio(rt_rq, rt_se_prio(rt_se));
dec_rt_migration(rt_se, rt_rq);
dec_rt_group(rt_se, rt_rq);
}
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
/*
* Change rt_se->run_list location unless SAVE && !MOVE
*
* assumes ENQUEUE/DEQUEUE flags match
*/
static inline bool move_entity(unsigned int flags)
{
if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
return false;
return true;
}
static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
{
list_del_init(&rt_se->run_list);
if (list_empty(array->queue + rt_se_prio(rt_se)))
__clear_bit(rt_se_prio(rt_se), array->bitmap);
rt_se->on_list = 0;
}
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
@@ -1179,26 +1226,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
* get throttled and the current group doesn't have any other
* active members.
*/
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
if (rt_se->on_list)
__delist_rt_entity(rt_se, array);
return;
}
if (head)
list_add(&rt_se->run_list, queue);
else
list_add_tail(&rt_se->run_list, queue);
__set_bit(rt_se_prio(rt_se), array->bitmap);
if (move_entity(flags)) {
WARN_ON_ONCE(rt_se->on_list);
if (flags & ENQUEUE_HEAD)
list_add(&rt_se->run_list, queue);
else
list_add_tail(&rt_se->run_list, queue);
__set_bit(rt_se_prio(rt_se), array->bitmap);
rt_se->on_list = 1;
}
rt_se->on_rq = 1;
inc_rt_tasks(rt_se, rt_rq);
}
static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
list_del_init(&rt_se->run_list);
if (list_empty(array->queue + rt_se_prio(rt_se)))
__clear_bit(rt_se_prio(rt_se), array->bitmap);
if (move_entity(flags)) {
WARN_ON_ONCE(!rt_se->on_list);
__delist_rt_entity(rt_se, array);
}
rt_se->on_rq = 0;
dec_rt_tasks(rt_se, rt_rq);
}
@@ -1207,7 +1265,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
* Because the prio of an upper entry depends on the lower
* entries, we must remove entries top - down.
*/
static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct sched_rt_entity *back = NULL;
@@ -1220,31 +1278,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
for (rt_se = back; rt_se; rt_se = rt_se->back) {
if (on_rt_rq(rt_se))
__dequeue_rt_entity(rt_se);
__dequeue_rt_entity(rt_se, flags);
}
}
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
dequeue_rt_stack(rt_se);
dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se)
__enqueue_rt_entity(rt_se, head);
__enqueue_rt_entity(rt_se, flags);
enqueue_top_rt_rq(&rq->rt);
}
static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
struct rq *rq = rq_of_rt_se(rt_se);
dequeue_rt_stack(rt_se);
dequeue_rt_stack(rt_se, flags);
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = group_rt_rq(rt_se);
if (rt_rq && rt_rq->rt_nr_running)
__enqueue_rt_entity(rt_se, false);
__enqueue_rt_entity(rt_se, flags);
}
enqueue_top_rt_rq(&rq->rt);
}
@@ -1260,7 +1318,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
enqueue_rt_entity(rt_se, flags);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
@@ -1271,7 +1329,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
struct sched_rt_entity *rt_se = &p->rt;
update_curr_rt(rq);
dequeue_rt_entity(rt_se);
dequeue_rt_entity(rt_se, flags);
dequeue_pushable_task(rq, p);
}

View File

@@ -3,6 +3,7 @@
#include <linux/sched/sysctl.h>
#include <linux/sched/rt.h>
#include <linux/sched/deadline.h>
#include <linux/binfmts.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/stop_machine.h>
@@ -318,7 +319,6 @@ extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
struct sched_entity *se, int cpu,
struct sched_entity *parent);
extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
@@ -450,6 +450,7 @@ static inline int rt_bandwidth_enabled(void)
struct rt_rq {
struct rt_prio_array active;
unsigned int rt_nr_running;
unsigned int rr_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
struct {
int curr; /* highest queued rt task prio */
@@ -909,6 +910,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg);
#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
void register_sched_domain_sysctl(void);
void unregister_sched_domain_sysctl(void);
#else
static inline void register_sched_domain_sysctl(void)
{
}
static inline void unregister_sched_domain_sysctl(void)
{
}
#endif
#else
static inline void sched_ttwu_pending(void) { }
@@ -1022,6 +1035,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
extern struct static_key_false sched_numa_balancing;
extern struct static_key_false sched_schedstats;
static inline u64 global_rt_period(void)
{
@@ -1130,18 +1144,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
extern const int sched_prio_to_weight[40];
extern const u32 sched_prio_to_wmult[40];
/*
* {de,en}queue flags:
*
* DEQUEUE_SLEEP - task is no longer runnable
* ENQUEUE_WAKEUP - task just became runnable
*
* SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
* are in a known state which allows modification. Such pairs
* should preserve as much state as possible.
*
* MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
* in the runqueue.
*
* ENQUEUE_HEAD - place at front of runqueue (tail if not specified)
* ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
* ENQUEUE_WAKING - sched_class::task_waking was called
*
*/
#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */
#define ENQUEUE_WAKEUP 0x01
#define ENQUEUE_HEAD 0x02
#define ENQUEUE_RESTORE 0x02
#define ENQUEUE_MOVE 0x04
#define ENQUEUE_HEAD 0x08
#define ENQUEUE_REPLENISH 0x10
#ifdef CONFIG_SMP
#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */
#define ENQUEUE_WAKING 0x20
#else
#define ENQUEUE_WAKING 0x00
#endif
#define ENQUEUE_REPLENISH 0x08
#define ENQUEUE_RESTORE 0x10
#define DEQUEUE_SLEEP 0x01
#define DEQUEUE_SAVE 0x02
#define RETRY_TASK ((void *)-1UL)
@@ -1278,6 +1314,35 @@ unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
#ifdef CONFIG_NO_HZ_FULL
extern bool sched_can_stop_tick(struct rq *rq);
/*
* Tick may be needed by tasks in the runqueue depending on their policy and
* requirements. If tick is needed, lets send the target an IPI to kick it out of
* nohz mode if necessary.
*/
static inline void sched_update_tick_dependency(struct rq *rq)
{
int cpu;
if (!tick_nohz_full_enabled())
return;
cpu = cpu_of(rq);
if (!tick_nohz_full_cpu(cpu))
return;
if (sched_can_stop_tick(rq))
tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED);
else
tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
}
#else
static inline void sched_update_tick_dependency(struct rq *rq) { }
#endif
static inline void add_nr_running(struct rq *rq, unsigned count)
{
unsigned prev_nr = rq->nr_running;
@@ -1289,26 +1354,16 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
if (!rq->rd->overload)
rq->rd->overload = true;
#endif
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_cpu(rq->cpu)) {
/*
* Tick is needed if more than one task runs on a CPU.
* Send the target an IPI to kick it out of nohz mode.
*
* We assume that IPI implies full memory barrier and the
* new value of rq->nr_running is visible on reception
* from the target.
*/
tick_nohz_full_kick_cpu(rq->cpu);
}
#endif
}
sched_update_tick_dependency(rq);
}
static inline void sub_nr_running(struct rq *rq, unsigned count)
{
rq->nr_running -= count;
/* Check if we still need preemption */
sched_update_tick_dependency(rq);
}
static inline void rq_last_tick_reset(struct rq *rq)

View File

@@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
if (rq)
rq->rq_sched_info.run_delay += delta;
}
# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
# define schedstat_set(var, val) do { var = (val); } while (0)
# define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0)
# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0)
# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
@@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long long delta)
{}
# define schedstat_enabled() 0
# define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
# define schedstat_set(var, val) do { } while (0)

123
kernel/sched/swait.c Normal file
View File

@@ -0,0 +1,123 @@
#include <linux/sched.h>
#include <linux/swait.h>
void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
struct lock_class_key *key)
{
raw_spin_lock_init(&q->lock);
lockdep_set_class_and_name(&q->lock, key, name);
INIT_LIST_HEAD(&q->task_list);
}
EXPORT_SYMBOL(__init_swait_queue_head);
/*
* The thing about the wake_up_state() return value; I think we can ignore it.
*
* If for some reason it would return 0, that means the previously waiting
* task is already running, so it will observe condition true (or has already).
*/
void swake_up_locked(struct swait_queue_head *q)
{
struct swait_queue *curr;
if (list_empty(&q->task_list))
return;
curr = list_first_entry(&q->task_list, typeof(*curr), task_list);
wake_up_process(curr->task);
list_del_init(&curr->task_list);
}
EXPORT_SYMBOL(swake_up_locked);
void swake_up(struct swait_queue_head *q)
{
unsigned long flags;
if (!swait_active(q))
return;
raw_spin_lock_irqsave(&q->lock, flags);
swake_up_locked(q);
raw_spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(swake_up);
/*
* Does not allow usage from IRQ disabled, since we must be able to
* release IRQs to guarantee bounded hold time.
*/
void swake_up_all(struct swait_queue_head *q)
{
struct swait_queue *curr;
LIST_HEAD(tmp);
if (!swait_active(q))
return;
raw_spin_lock_irq(&q->lock);
list_splice_init(&q->task_list, &tmp);
while (!list_empty(&tmp)) {
curr = list_first_entry(&tmp, typeof(*curr), task_list);
wake_up_state(curr->task, TASK_NORMAL);
list_del_init(&curr->task_list);
if (list_empty(&tmp))
break;
raw_spin_unlock_irq(&q->lock);
raw_spin_lock_irq(&q->lock);
}
raw_spin_unlock_irq(&q->lock);
}
EXPORT_SYMBOL(swake_up_all);
void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait)
{
wait->task = current;
if (list_empty(&wait->task_list))
list_add(&wait->task_list, &q->task_list);
}
void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state)
{
unsigned long flags;
raw_spin_lock_irqsave(&q->lock, flags);
__prepare_to_swait(q, wait);
set_current_state(state);
raw_spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(prepare_to_swait);
long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state)
{
if (signal_pending_state(state, current))
return -ERESTARTSYS;
prepare_to_swait(q, wait, state);
return 0;
}
EXPORT_SYMBOL(prepare_to_swait_event);
void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
{
__set_current_state(TASK_RUNNING);
if (!list_empty(&wait->task_list))
list_del_init(&wait->task_list);
}
void finish_swait(struct swait_queue_head *q, struct swait_queue *wait)
{
unsigned long flags;
__set_current_state(TASK_RUNNING);
if (!list_empty_careful(&wait->task_list)) {
raw_spin_lock_irqsave(&q->lock, flags);
list_del_init(&wait->task_list);
raw_spin_unlock_irqrestore(&q->lock, flags);
}
}
EXPORT_SYMBOL(finish_swait);

View File

@@ -105,13 +105,12 @@ void __init call_function_init(void)
* previous function call. For multi-cpu calls its even more interesting
* as we'll have to ensure no other cpu is observing our csd.
*/
static void csd_lock_wait(struct call_single_data *csd)
static __always_inline void csd_lock_wait(struct call_single_data *csd)
{
while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK)
cpu_relax();
smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK));
}
static void csd_lock(struct call_single_data *csd)
static __always_inline void csd_lock(struct call_single_data *csd)
{
csd_lock_wait(csd);
csd->flags |= CSD_FLAG_LOCK;
@@ -124,7 +123,7 @@ static void csd_lock(struct call_single_data *csd)
smp_wmb();
}
static void csd_unlock(struct call_single_data *csd)
static __always_inline void csd_unlock(struct call_single_data *csd)
{
WARN_ON(!(csd->flags & CSD_FLAG_LOCK));
@@ -569,6 +568,7 @@ void __init smp_init(void)
unsigned int cpu;
idle_threads_init();
cpuhp_threads_init();
/* FIXME: This should be done in userspace --RR */
for_each_present_cpu(cpu) {

View File

@@ -226,7 +226,7 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
kthread_unpark(tsk);
}
void smpboot_unpark_threads(unsigned int cpu)
int smpboot_unpark_threads(unsigned int cpu)
{
struct smp_hotplug_thread *cur;
@@ -235,6 +235,7 @@ void smpboot_unpark_threads(unsigned int cpu)
if (cpumask_test_cpu(cpu, cur->cpumask))
smpboot_unpark_thread(cur, cpu);
mutex_unlock(&smpboot_threads_lock);
return 0;
}
static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
@@ -245,7 +246,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
kthread_park(tsk);
}
void smpboot_park_threads(unsigned int cpu)
int smpboot_park_threads(unsigned int cpu)
{
struct smp_hotplug_thread *cur;
@@ -253,6 +254,7 @@ void smpboot_park_threads(unsigned int cpu)
list_for_each_entry_reverse(cur, &hotplug_threads, list)
smpboot_park_thread(cur, cpu);
mutex_unlock(&smpboot_threads_lock);
return 0;
}
static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)

View File

@@ -14,7 +14,9 @@ static inline void idle_threads_init(void) { }
#endif
int smpboot_create_threads(unsigned int cpu);
void smpboot_park_threads(unsigned int cpu);
void smpboot_unpark_threads(unsigned int cpu);
int smpboot_park_threads(unsigned int cpu);
int smpboot_unpark_threads(unsigned int cpu);
void __init cpuhp_threads_init(void);
#endif

View File

@@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
if (preempt_count() == cnt) {
#ifdef CONFIG_DEBUG_PREEMPT
current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1);
current->preempt_disable_ip = get_lock_parent_ip();
#endif
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip());
}
}
EXPORT_SYMBOL(__local_bh_disable_ip);

View File

@@ -350,6 +350,17 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
#ifdef CONFIG_SCHEDSTATS
{
.procname = "sched_schedstats",
.data = NULL,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = sysctl_schedstats,
.extra1 = &zero,
.extra2 = &one,
},
#endif /* CONFIG_SCHEDSTATS */
#endif /* CONFIG_SMP */
#ifdef CONFIG_NUMA_BALANCING
{
@@ -505,7 +516,7 @@ static struct ctl_table kern_table[] = {
.data = &latencytop_enabled,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
.proc_handler = sysctl_latencytop,
},
#endif
#ifdef CONFIG_BLK_DEV_INITRD

View File

@@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
/* cs is a watchdog. */
if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
/* Pick the best watchdog. */
if (!watchdog || cs->rating > watchdog->rating) {
watchdog = cs;
/* Reset watchdog cycles */
clocksource_reset_watchdog();
}
}
spin_unlock_irqrestore(&watchdog_lock, flags);
}
static void clocksource_select_watchdog(bool fallback)
{
struct clocksource *cs, *old_wd;
unsigned long flags;
spin_lock_irqsave(&watchdog_lock, flags);
/* save current watchdog */
old_wd = watchdog;
if (fallback)
watchdog = NULL;
list_for_each_entry(cs, &clocksource_list, list) {
/* cs is a clocksource to be watched. */
if (cs->flags & CLOCK_SOURCE_MUST_VERIFY)
continue;
/* Skip current if we were requested for a fallback. */
if (fallback && cs == old_wd)
continue;
/* Pick the best watchdog. */
if (!watchdog || cs->rating > watchdog->rating)
watchdog = cs;
}
/* If we failed to find a fallback restore the old one. */
if (!watchdog)
watchdog = old_wd;
/* If we changed the watchdog we need to reset cycles. */
if (watchdog != old_wd)
clocksource_reset_watchdog();
/* Check if the watchdog timer needs to be started. */
clocksource_start_watchdog();
spin_unlock_irqrestore(&watchdog_lock, flags);
@@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES;
}
static void clocksource_select_watchdog(bool fallback) { }
static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
static inline void clocksource_resume_watchdog(void) { }
static inline int __clocksource_watchdog_kthread(void) { return 0; }
@@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
clocksource_enqueue(cs);
clocksource_enqueue_watchdog(cs);
clocksource_select();
clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
return 0;
}
@@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
mutex_lock(&clocksource_mutex);
__clocksource_change_rating(cs, rating);
clocksource_select();
clocksource_select_watchdog(false);
mutex_unlock(&clocksource_mutex);
}
EXPORT_SYMBOL(clocksource_change_rating);
@@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating);
*/
static int clocksource_unbind(struct clocksource *cs)
{
/*
* I really can't convince myself to support this on hardware
* designed by lobotomized monkeys.
*/
if (clocksource_is_watchdog(cs))
return -EBUSY;
if (clocksource_is_watchdog(cs)) {
/* Select and try to install a replacement watchdog. */
clocksource_select_watchdog(true);
if (clocksource_is_watchdog(cs))
return -EBUSY;
}
if (cs == curr_clocksource) {
/* Select and try to install a replacement clock source */

View File

@@ -68,7 +68,7 @@ static struct clocksource clocksource_jiffies = {
.name = "jiffies",
.rating = 1, /* lowest valid rating*/
.read = jiffies_read,
.mask = 0xffffffff, /*32bits*/
.mask = CLOCKSOURCE_MASK(32),
.mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
.shift = JIFFIES_SHIFT,
.max_cycles = 10,

View File

@@ -333,7 +333,6 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
return err;
}
/*
* Validate the clockid_t for a new CPU-clock timer, and initialize the timer.
* This is called from sys_timer_create() and do_cpu_nanosleep() with the
@@ -517,6 +516,10 @@ static void arm_timer(struct k_itimer *timer)
cputime_expires->sched_exp = exp;
break;
}
if (CPUCLOCK_PERTHREAD(timer->it_clock))
tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER);
else
tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER);
}
}
@@ -582,39 +585,6 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
return 0;
}
#ifdef CONFIG_NO_HZ_FULL
static void nohz_kick_work_fn(struct work_struct *work)
{
tick_nohz_full_kick_all();
}
static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn);
/*
* We need the IPIs to be sent from sane process context.
* The posix cpu timers are always set with irqs disabled.
*/
static void posix_cpu_timer_kick_nohz(void)
{
if (context_tracking_is_enabled())
schedule_work(&nohz_kick_work);
}
bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk)
{
if (!task_cputime_zero(&tsk->cputime_expires))
return false;
/* Check if cputimer is running. This is accessed without locking. */
if (READ_ONCE(tsk->signal->cputimer.running))
return false;
return true;
}
#else
static inline void posix_cpu_timer_kick_nohz(void) { }
#endif
/*
* Guts of sys_timer_settime for CPU timers.
* This is called with the timer locked and interrupts disabled.
@@ -761,8 +731,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags,
sample_to_timespec(timer->it_clock,
old_incr, &old->it_interval);
}
if (!ret)
posix_cpu_timer_kick_nohz();
return ret;
}
@@ -911,6 +880,8 @@ static void check_thread_timers(struct task_struct *tsk,
__group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
}
}
if (task_cputime_zero(tsk_expires))
tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER);
}
static inline void stop_process_timers(struct signal_struct *sig)
@@ -919,6 +890,7 @@ static inline void stop_process_timers(struct signal_struct *sig)
/* Turn off cputimer->running. This is done without locking. */
WRITE_ONCE(cputimer->running, false);
tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER);
}
static u32 onecputick;
@@ -1095,8 +1067,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
arm_timer(timer);
unlock_task_sighand(p, &flags);
/* Kick full dynticks CPUs in case they need to tick on the new timer */
posix_cpu_timer_kick_nohz();
out:
timer->it_overrun_last = timer->it_overrun;
timer->it_overrun = -1;
@@ -1270,7 +1240,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
}
if (!*newval)
goto out;
return;
*newval += now;
}
@@ -1288,8 +1258,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
tsk->signal->cputime_expires.virt_exp = *newval;
break;
}
out:
posix_cpu_timer_kick_nohz();
tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER);
}
static int do_cpu_nanosleep(const clockid_t which_clock, int flags,

View File

@@ -22,7 +22,6 @@
#include <linux/module.h>
#include <linux/irq_work.h>
#include <linux/posix-timers.h>
#include <linux/perf_event.h>
#include <linux/context_tracking.h>
#include <asm/irq_regs.h>
@@ -158,54 +157,63 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
cpumask_var_t tick_nohz_full_mask;
cpumask_var_t housekeeping_mask;
bool tick_nohz_full_running;
static unsigned long tick_dep_mask;
static bool can_stop_full_tick(void)
static void trace_tick_dependency(unsigned long dep)
{
if (dep & TICK_DEP_MASK_POSIX_TIMER) {
trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER);
return;
}
if (dep & TICK_DEP_MASK_PERF_EVENTS) {
trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS);
return;
}
if (dep & TICK_DEP_MASK_SCHED) {
trace_tick_stop(0, TICK_DEP_MASK_SCHED);
return;
}
if (dep & TICK_DEP_MASK_CLOCK_UNSTABLE)
trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE);
}
static bool can_stop_full_tick(struct tick_sched *ts)
{
WARN_ON_ONCE(!irqs_disabled());
if (!sched_can_stop_tick()) {
trace_tick_stop(0, "more than 1 task in runqueue\n");
if (tick_dep_mask) {
trace_tick_dependency(tick_dep_mask);
return false;
}
if (!posix_cpu_timers_can_stop_tick(current)) {
trace_tick_stop(0, "posix timers running\n");
if (ts->tick_dep_mask) {
trace_tick_dependency(ts->tick_dep_mask);
return false;
}
if (!perf_event_can_stop_tick()) {
trace_tick_stop(0, "perf events running\n");
if (current->tick_dep_mask) {
trace_tick_dependency(current->tick_dep_mask);
return false;
}
/* sched_clock_tick() needs us? */
#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
/*
* TODO: kick full dynticks CPUs when
* sched_clock_stable is set.
*/
if (!sched_clock_stable()) {
trace_tick_stop(0, "unstable sched clock\n");
/*
* Don't allow the user to think they can get
* full NO_HZ with this machine.
*/
WARN_ONCE(tick_nohz_full_running,
"NO_HZ FULL will not work with unstable sched clock");
if (current->signal->tick_dep_mask) {
trace_tick_dependency(current->signal->tick_dep_mask);
return false;
}
#endif
return true;
}
static void nohz_full_kick_work_func(struct irq_work *work)
static void nohz_full_kick_func(struct irq_work *work)
{
/* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
.func = nohz_full_kick_work_func,
.func = nohz_full_kick_func,
};
/*
@@ -214,7 +222,7 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = {
* This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(),
* is NMI safe.
*/
void tick_nohz_full_kick(void)
static void tick_nohz_full_kick(void)
{
if (!tick_nohz_full_cpu(smp_processor_id()))
return;
@@ -234,27 +242,112 @@ void tick_nohz_full_kick_cpu(int cpu)
irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu);
}
static void nohz_full_kick_ipi(void *info)
{
/* Empty, the tick restart happens on tick_nohz_irq_exit() */
}
/*
* Kick all full dynticks CPUs in order to force these to re-evaluate
* their dependency on the tick and restart it if necessary.
*/
void tick_nohz_full_kick_all(void)
static void tick_nohz_full_kick_all(void)
{
int cpu;
if (!tick_nohz_full_running)
return;
preempt_disable();
smp_call_function_many(tick_nohz_full_mask,
nohz_full_kick_ipi, NULL, false);
tick_nohz_full_kick();
for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask)
tick_nohz_full_kick_cpu(cpu);
preempt_enable();
}
static void tick_nohz_dep_set_all(unsigned long *dep,
enum tick_dep_bits bit)
{
unsigned long prev;
prev = fetch_or(dep, BIT_MASK(bit));
if (!prev)
tick_nohz_full_kick_all();
}
/*
* Set a global tick dependency. Used by perf events that rely on freq and
* by unstable clock.
*/
void tick_nohz_dep_set(enum tick_dep_bits bit)
{
tick_nohz_dep_set_all(&tick_dep_mask, bit);
}
void tick_nohz_dep_clear(enum tick_dep_bits bit)
{
clear_bit(bit, &tick_dep_mask);
}
/*
* Set per-CPU tick dependency. Used by scheduler and perf events in order to
* manage events throttling.
*/
void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit)
{
unsigned long prev;
struct tick_sched *ts;
ts = per_cpu_ptr(&tick_cpu_sched, cpu);
prev = fetch_or(&ts->tick_dep_mask, BIT_MASK(bit));
if (!prev) {
preempt_disable();
/* Perf needs local kick that is NMI safe */
if (cpu == smp_processor_id()) {
tick_nohz_full_kick();
} else {
/* Remote irq work not NMI-safe */
if (!WARN_ON_ONCE(in_nmi()))
tick_nohz_full_kick_cpu(cpu);
}
preempt_enable();
}
}
void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit)
{
struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
clear_bit(bit, &ts->tick_dep_mask);
}
/*
* Set a per-task tick dependency. Posix CPU timers need this in order to elapse
* per task timers.
*/
void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
/*
* We could optimize this with just kicking the target running the task
* if that noise matters for nohz full users.
*/
tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit);
}
void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit)
{
clear_bit(bit, &tsk->tick_dep_mask);
}
/*
* Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse
* per process timers.
*/
void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
tick_nohz_dep_set_all(&sig->tick_dep_mask, bit);
}
void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit)
{
clear_bit(bit, &sig->tick_dep_mask);
}
/*
* Re-evaluate the need for the tick as we switch the current task.
* It might need the tick due to per task/process properties:
@@ -263,15 +356,19 @@ void tick_nohz_full_kick_all(void)
void __tick_nohz_task_switch(void)
{
unsigned long flags;
struct tick_sched *ts;
local_irq_save(flags);
if (!tick_nohz_full_cpu(smp_processor_id()))
goto out;
if (tick_nohz_tick_stopped() && !can_stop_full_tick())
tick_nohz_full_kick();
ts = this_cpu_ptr(&tick_cpu_sched);
if (ts->tick_stopped) {
if (current->tick_dep_mask || current->signal->tick_dep_mask)
tick_nohz_full_kick();
}
out:
local_irq_restore(flags);
}
@@ -689,7 +786,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
ts->last_tick = hrtimer_get_expires(&ts->sched_timer);
ts->tick_stopped = 1;
trace_tick_stop(1, " ");
trace_tick_stop(1, TICK_DEP_MASK_NONE);
}
/*
@@ -740,7 +837,7 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts)
if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE)
return;
if (can_stop_full_tick())
if (can_stop_full_tick(ts))
tick_nohz_stop_sched_tick(ts, ktime_get(), cpu);
else if (ts->tick_stopped)
tick_nohz_restart_sched_tick(ts, ktime_get(), 1);

View File

@@ -60,6 +60,7 @@ struct tick_sched {
u64 next_timer;
ktime_t idle_expires;
int do_timer_last;
unsigned long tick_dep_mask;
};
extern struct tick_sched *tick_get_tick_sched(int cpu);

View File

@@ -233,6 +233,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
u64 tmp, ntpinterval;
struct clocksource *old_clock;
++tk->cs_was_changed_seq;
old_clock = tk->tkr_mono.clock;
tk->tkr_mono.clock = clock;
tk->tkr_mono.read = clock->read;
@@ -298,19 +299,36 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset;
static inline u32 arch_gettimeoffset(void) { return 0; }
#endif
static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr,
cycle_t delta)
{
cycle_t delta;
s64 nsec;
delta = timekeeping_get_delta(tkr);
nsec = (delta * tkr->mult + tkr->xtime_nsec) >> tkr->shift;
nsec = delta * tkr->mult + tkr->xtime_nsec;
nsec >>= tkr->shift;
/* If arch requires, add in get_arch_timeoffset() */
return nsec + arch_gettimeoffset();
}
static inline s64 timekeeping_get_ns(struct tk_read_base *tkr)
{
cycle_t delta;
delta = timekeeping_get_delta(tkr);
return timekeeping_delta_to_ns(tkr, delta);
}
static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr,
cycle_t cycles)
{
cycle_t delta;
/* calculate the delta since the last update_wall_time */
delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask);
return timekeeping_delta_to_ns(tkr, delta);
}
/**
* update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper.
* @tkr: Timekeeping readout base from which we take the update
@@ -857,44 +875,262 @@ time64_t __ktime_get_real_seconds(void)
return tk->xtime_sec;
}
#ifdef CONFIG_NTP_PPS
/**
* ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format
* @ts_raw: pointer to the timespec to be set to raw monotonic time
* @ts_real: pointer to the timespec to be set to the time of day
*
* This function reads both the time of day and raw monotonic time at the
* same time atomically and stores the resulting timestamps in timespec
* format.
* ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter
* @systime_snapshot: pointer to struct receiving the system time snapshot
*/
void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real)
void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
{
struct timekeeper *tk = &tk_core.timekeeper;
unsigned long seq;
s64 nsecs_raw, nsecs_real;
ktime_t base_raw;
ktime_t base_real;
s64 nsec_raw;
s64 nsec_real;
cycle_t now;
WARN_ON_ONCE(timekeeping_suspended);
do {
seq = read_seqcount_begin(&tk_core.seq);
*ts_raw = tk->raw_time;
ts_real->tv_sec = tk->xtime_sec;
ts_real->tv_nsec = 0;
nsecs_raw = timekeeping_get_ns(&tk->tkr_raw);
nsecs_real = timekeeping_get_ns(&tk->tkr_mono);
now = tk->tkr_mono.read(tk->tkr_mono.clock);
systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
base_real = ktime_add(tk->tkr_mono.base,
tk_core.timekeeper.offs_real);
base_raw = tk->tkr_raw.base;
nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now);
nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now);
} while (read_seqcount_retry(&tk_core.seq, seq));
timespec64_add_ns(ts_raw, nsecs_raw);
timespec64_add_ns(ts_real, nsecs_real);
systime_snapshot->cycles = now;
systime_snapshot->real = ktime_add_ns(base_real, nsec_real);
systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw);
}
EXPORT_SYMBOL(ktime_get_raw_and_real_ts64);
EXPORT_SYMBOL_GPL(ktime_get_snapshot);
#endif /* CONFIG_NTP_PPS */
/* Scale base by mult/div checking for overflow */
static int scale64_check_overflow(u64 mult, u64 div, u64 *base)
{
u64 tmp, rem;
tmp = div64_u64_rem(*base, div, &rem);
if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) ||
((int)sizeof(u64)*8 - fls64(mult) < fls64(rem)))
return -EOVERFLOW;
tmp *= mult;
rem *= mult;
do_div(rem, div);
*base = tmp + rem;
return 0;
}
/**
* adjust_historical_crosststamp - adjust crosstimestamp previous to current interval
* @history: Snapshot representing start of history
* @partial_history_cycles: Cycle offset into history (fractional part)
* @total_history_cycles: Total history length in cycles
* @discontinuity: True indicates clock was set on history period
* @ts: Cross timestamp that should be adjusted using
* partial/total ratio
*
* Helper function used by get_device_system_crosststamp() to correct the
* crosstimestamp corresponding to the start of the current interval to the
* system counter value (timestamp point) provided by the driver. The
* total_history_* quantities are the total history starting at the provided
* reference point and ending at the start of the current interval. The cycle
* count between the driver timestamp point and the start of the current
* interval is partial_history_cycles.
*/
static int adjust_historical_crosststamp(struct system_time_snapshot *history,
cycle_t partial_history_cycles,
cycle_t total_history_cycles,
bool discontinuity,
struct system_device_crosststamp *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
u64 corr_raw, corr_real;
bool interp_forward;
int ret;
if (total_history_cycles == 0 || partial_history_cycles == 0)
return 0;
/* Interpolate shortest distance from beginning or end of history */
interp_forward = partial_history_cycles > total_history_cycles/2 ?
true : false;
partial_history_cycles = interp_forward ?
total_history_cycles - partial_history_cycles :
partial_history_cycles;
/*
* Scale the monotonic raw time delta by:
* partial_history_cycles / total_history_cycles
*/
corr_raw = (u64)ktime_to_ns(
ktime_sub(ts->sys_monoraw, history->raw));
ret = scale64_check_overflow(partial_history_cycles,
total_history_cycles, &corr_raw);
if (ret)
return ret;
/*
* If there is a discontinuity in the history, scale monotonic raw
* correction by:
* mult(real)/mult(raw) yielding the realtime correction
* Otherwise, calculate the realtime correction similar to monotonic
* raw calculation
*/
if (discontinuity) {
corr_real = mul_u64_u32_div
(corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult);
} else {
corr_real = (u64)ktime_to_ns(
ktime_sub(ts->sys_realtime, history->real));
ret = scale64_check_overflow(partial_history_cycles,
total_history_cycles, &corr_real);
if (ret)
return ret;
}
/* Fixup monotonic raw and real time time values */
if (interp_forward) {
ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw);
ts->sys_realtime = ktime_add_ns(history->real, corr_real);
} else {
ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw);
ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real);
}
return 0;
}
/*
* cycle_between - true if test occurs chronologically between before and after
*/
static bool cycle_between(cycle_t before, cycle_t test, cycle_t after)
{
if (test > before && test < after)
return true;
if (test < before && before > after)
return true;
return false;
}
/**
* get_device_system_crosststamp - Synchronously capture system/device timestamp
* @get_time_fn: Callback to get simultaneous device time and
* system counter from the device driver
* @ctx: Context passed to get_time_fn()
* @history_begin: Historical reference point used to interpolate system
* time when counter provided by the driver is before the current interval
* @xtstamp: Receives simultaneously captured system and device time
*
* Reads a timestamp from a device and correlates it to system time
*/
int get_device_system_crosststamp(int (*get_time_fn)
(ktime_t *device_time,
struct system_counterval_t *sys_counterval,
void *ctx),
void *ctx,
struct system_time_snapshot *history_begin,
struct system_device_crosststamp *xtstamp)
{
struct system_counterval_t system_counterval;
struct timekeeper *tk = &tk_core.timekeeper;
cycle_t cycles, now, interval_start;
unsigned int clock_was_set_seq = 0;
ktime_t base_real, base_raw;
s64 nsec_real, nsec_raw;
u8 cs_was_changed_seq;
unsigned long seq;
bool do_interp;
int ret;
do {
seq = read_seqcount_begin(&tk_core.seq);
/*
* Try to synchronously capture device time and a system
* counter value calling back into the device driver
*/
ret = get_time_fn(&xtstamp->device, &system_counterval, ctx);
if (ret)
return ret;
/*
* Verify that the clocksource associated with the captured
* system counter value is the same as the currently installed
* timekeeper clocksource
*/
if (tk->tkr_mono.clock != system_counterval.cs)
return -ENODEV;
cycles = system_counterval.cycles;
/*
* Check whether the system counter value provided by the
* device driver is on the current timekeeping interval.
*/
now = tk->tkr_mono.read(tk->tkr_mono.clock);
interval_start = tk->tkr_mono.cycle_last;
if (!cycle_between(interval_start, cycles, now)) {
clock_was_set_seq = tk->clock_was_set_seq;
cs_was_changed_seq = tk->cs_was_changed_seq;
cycles = interval_start;
do_interp = true;
} else {
do_interp = false;
}
base_real = ktime_add(tk->tkr_mono.base,
tk_core.timekeeper.offs_real);
base_raw = tk->tkr_raw.base;
nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono,
system_counterval.cycles);
nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw,
system_counterval.cycles);
} while (read_seqcount_retry(&tk_core.seq, seq));
xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real);
xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw);
/*
* Interpolate if necessary, adjusting back from the start of the
* current interval
*/
if (do_interp) {
cycle_t partial_history_cycles, total_history_cycles;
bool discontinuity;
/*
* Check that the counter value occurs after the provided
* history reference and that the history doesn't cross a
* clocksource change
*/
if (!history_begin ||
!cycle_between(history_begin->cycles,
system_counterval.cycles, cycles) ||
history_begin->cs_was_changed_seq != cs_was_changed_seq)
return -EINVAL;
partial_history_cycles = cycles - system_counterval.cycles;
total_history_cycles = cycles - history_begin->cycles;
discontinuity =
history_begin->clock_was_set_seq != clock_was_set_seq;
ret = adjust_historical_crosststamp(history_begin,
partial_history_cycles,
total_history_cycles,
discontinuity, xtstamp);
if (ret)
return ret;
}
return 0;
}
EXPORT_SYMBOL_GPL(get_device_system_crosststamp);
/**
* do_gettimeofday - Returns the time of day in a timeval

View File

@@ -97,16 +97,16 @@ trace_find_event_field(struct trace_event_call *call, char *name)
struct ftrace_event_field *field;
struct list_head *head;
head = trace_get_fields(call);
field = __find_event_field(head, name);
if (field)
return field;
field = __find_event_field(&ftrace_generic_fields, name);
if (field)
return field;
field = __find_event_field(&ftrace_common_fields, name);
if (field)
return field;
head = trace_get_fields(call);
return __find_event_field(head, name);
return __find_event_field(&ftrace_common_fields, name);
}
static int __trace_define_field(struct list_head *head, const char *type,
@@ -171,8 +171,10 @@ static int trace_define_generic_fields(void)
{
int ret;
__generic_field(int, cpu, FILTER_OTHER);
__generic_field(char *, comm, FILTER_PTR_STRING);
__generic_field(int, CPU, FILTER_CPU);
__generic_field(int, cpu, FILTER_CPU);
__generic_field(char *, COMM, FILTER_COMM);
__generic_field(char *, comm, FILTER_COMM);
return ret;
}

View File

@@ -1043,13 +1043,14 @@ static int init_pred(struct filter_parse_state *ps,
return -EINVAL;
}
if (is_string_field(field)) {
if (field->filter_type == FILTER_COMM) {
filter_build_regex(pred);
fn = filter_pred_comm;
pred->regex.field_len = TASK_COMM_LEN;
} else if (is_string_field(field)) {
filter_build_regex(pred);
if (!strcmp(field->name, "comm")) {
fn = filter_pred_comm;
pred->regex.field_len = TASK_COMM_LEN;
} else if (field->filter_type == FILTER_STATIC_STRING) {
if (field->filter_type == FILTER_STATIC_STRING) {
fn = filter_pred_string;
pred->regex.field_len = field->size;
} else if (field->filter_type == FILTER_DYN_STRING)
@@ -1072,7 +1073,7 @@ static int init_pred(struct filter_parse_state *ps,
}
pred->val = val;
if (!strcmp(field->name, "cpu"))
if (field->filter_type == FILTER_CPU)
fn = filter_pred_cpu;
else
fn = select_comparison_fn(pred->op, field->size,

View File

@@ -30,7 +30,7 @@
struct trace_kprobe {
struct list_head list;
struct kretprobe rp; /* Use rp.kp for kprobe use */
unsigned long nhit;
unsigned long __percpu *nhit;
const char *symbol; /* symbol name */
struct trace_probe tp;
};
@@ -274,6 +274,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
if (!tk)
return ERR_PTR(ret);
tk->nhit = alloc_percpu(unsigned long);
if (!tk->nhit)
goto error;
if (symbol) {
tk->symbol = kstrdup(symbol, GFP_KERNEL);
if (!tk->symbol)
@@ -313,6 +317,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group,
error:
kfree(tk->tp.call.name);
kfree(tk->symbol);
free_percpu(tk->nhit);
kfree(tk);
return ERR_PTR(ret);
}
@@ -327,6 +332,7 @@ static void free_trace_kprobe(struct trace_kprobe *tk)
kfree(tk->tp.call.class->system);
kfree(tk->tp.call.name);
kfree(tk->symbol);
free_percpu(tk->nhit);
kfree(tk);
}
@@ -874,9 +880,14 @@ static const struct file_operations kprobe_events_ops = {
static int probes_profile_seq_show(struct seq_file *m, void *v)
{
struct trace_kprobe *tk = v;
unsigned long nhit = 0;
int cpu;
for_each_possible_cpu(cpu)
nhit += *per_cpu_ptr(tk->nhit, cpu);
seq_printf(m, " %-44s %15lu %15lu\n",
trace_event_name(&tk->tp.call), tk->nhit,
trace_event_name(&tk->tp.call), nhit,
tk->rp.kp.nmissed);
return 0;
@@ -1225,7 +1236,7 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
{
struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
tk->nhit++;
raw_cpu_inc(*tk->nhit);
if (tk->tp.flags & TP_FLAG_TRACE)
kprobe_trace_func(tk, regs);
@@ -1242,7 +1253,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
tk->nhit++;
raw_cpu_inc(*tk->nhit);
if (tk->tp.flags & TP_FLAG_TRACE)
kretprobe_trace_func(tk, ri, regs);

View File

@@ -186,11 +186,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags,
extern char *__bad_type_size(void);
#define SYSCALL_FIELD(type, name) \
sizeof(type) != sizeof(trace.name) ? \
#define SYSCALL_FIELD(type, field, name) \
sizeof(type) != sizeof(trace.field) ? \
__bad_type_size() : \
#type, #name, offsetof(typeof(trace), name), \
sizeof(trace.name), is_signed_type(type)
#type, #name, offsetof(typeof(trace), field), \
sizeof(trace.field), is_signed_type(type)
static int __init
__set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len)
@@ -261,7 +261,8 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call)
int i;
int offset = offsetof(typeof(trace), args);
ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
FILTER_OTHER);
if (ret)
return ret;
@@ -281,11 +282,12 @@ static int __init syscall_exit_define_fields(struct trace_event_call *call)
struct syscall_trace_exit trace;
int ret;
ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER);
ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr),
FILTER_OTHER);
if (ret)
return ret;
ret = trace_define_field(call, SYSCALL_FIELD(long, ret),
ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret),
FILTER_OTHER);
return ret;

View File

@@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
{
struct mm_struct *mm;
/* convert pages-usec to Mbyte-usec */
stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
/* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */
stats->coremem = p->acct_rss_mem1 * PAGE_SIZE;
do_div(stats->coremem, 1000 * KB);
stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE;
do_div(stats->virtmem, 1000 * KB);
mm = get_task_mm(p);
if (mm) {
/* adjust to KB unit */
@@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
static void __acct_update_integrals(struct task_struct *tsk,
cputime_t utime, cputime_t stime)
{
if (likely(tsk->mm)) {
cputime_t time, dtime;
struct timeval value;
unsigned long flags;
u64 delta;
cputime_t time, dtime;
u64 delta;
local_irq_save(flags);
time = stime + utime;
dtime = time - tsk->acct_timexpd;
jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
delta = value.tv_sec;
delta = delta * USEC_PER_SEC + value.tv_usec;
if (!likely(tsk->mm))
return;
if (delta == 0)
goto out;
tsk->acct_timexpd = time;
tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
out:
local_irq_restore(flags);
}
time = stime + utime;
dtime = time - tsk->acct_timexpd;
/* Avoid division: cputime_t is often in nanoseconds already. */
delta = cputime_to_nsecs(dtime);
if (delta < TICK_NSEC)
return;
tsk->acct_timexpd = time;
/*
* Divide by 1024 to avoid overflow, and to avoid division.
* The final unit reported to userspace is Mbyte-usecs,
* the rest of the math is done in xacct_add_tsk.
*/
tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10;
tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10;
}
/**
@@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk,
void acct_update_integrals(struct task_struct *tsk)
{
cputime_t utime, stime;
unsigned long flags;
local_irq_save(flags);
task_cputime(tsk, &utime, &stime);
__acct_update_integrals(tsk, utime, stime);
local_irq_restore(flags);
}
/**