Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (29 commits)
  sched: Export account_system_vtime()
  sched: Call tick_check_idle before __irq_enter
  sched: Remove irq time from available CPU power
  sched: Do not account irq time to current task
  x86: Add IRQ_TIME_ACCOUNTING
  sched: Add IRQ_TIME_ACCOUNTING, finer accounting of irq time
  sched: Add a PF flag for ksoftirqd identification
  sched: Consolidate account_system_vtime extern declaration
  sched: Fix softirq time accounting
  sched: Drop group_capacity to 1 only if local group has extra capacity
  sched: Force balancing on newidle balance if local group has capacity
  sched: Set group_imb only a task can be pulled from the busiest cpu
  sched: Do not consider SCHED_IDLE tasks to be cache hot
  sched: Drop all load weight manipulation for RT tasks
  sched: Create special class for stop/migrate work
  sched: Unindent labels
  sched: Comment updates: fix default latency and granularity numbers
  tracing/sched: Add sched_pi_setprio tracepoint
  sched: Give CPU bound RT tasks preference
  sched: Try not to migrate higher priority RT tasks
  ...
@@ -14,25 +14,39 @@ to /proc/cpuinfo.
 identifier (rather than the kernel's). The actual value is
 architecture and platform dependent.
 
-3) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
+3) /sys/devices/system/cpu/cpuX/topology/book_id:
 
+the book ID of cpuX. Typically it is the hardware platform's
+identifier (rather than the kernel's). The actual value is
+architecture and platform dependent.
+
+4) /sys/devices/system/cpu/cpuX/topology/thread_siblings:
+
 internel kernel map of cpuX's hardware threads within the same
 core as cpuX
 
-4) /sys/devices/system/cpu/cpuX/topology/core_siblings:
+5) /sys/devices/system/cpu/cpuX/topology/core_siblings:
 
 internal kernel map of cpuX's hardware threads within the same
 physical_package_id.
 
+6) /sys/devices/system/cpu/cpuX/topology/book_siblings:
+
+internal kernel map of cpuX's hardware threads within the same
+book_id.
+
 To implement it in an architecture-neutral way, a new source file,
-drivers/base/topology.c, is to export the 4 attributes.
+drivers/base/topology.c, is to export the 4 or 6 attributes. The two book
+related sysfs files will only be created if CONFIG_SCHED_BOOK is selected.
 
 For an architecture to support this feature, it must define some of
 these macros in include/asm-XXX/topology.h:
 #define topology_physical_package_id(cpu)
 #define topology_core_id(cpu)
+#define topology_book_id(cpu)
 #define topology_thread_cpumask(cpu)
 #define topology_core_cpumask(cpu)
+#define topology_book_cpumask(cpu)
 
 The type of **_id is int.
 The type of siblings is (const) struct cpumask *.
@@ -45,6 +59,9 @@ not defined by include/asm-XXX/topology.h:
 3) thread_siblings: just the given CPU
 4) core_siblings: just the given CPU
 
+For architectures that don't support books (CONFIG_SCHED_BOOK) there are no
+default definitions for topology_book_id() and topology_book_cpumask().
+
 Additionally, CPU topology information is provided under
 /sys/devices/system/cpu and includes these files. The internal
 source for the output is in brackets ("[]").
@@ -2435,6 +2435,10 @@ and is between 256 and 4096 characters. It is defined in the file
 disables clocksource verification at runtime.
 Used to enable high-resolution timer mode on older
 hardware, and in virtualized environment.
+[x86] noirqtime: Do not use TSC to do irq accounting.
+Used to run time disable IRQ_TIME_ACCOUNTING on any
+platforms where RDTSC is slow and this accounting
+can add overhead.
 
 turbografx.map[2|3]= [HW,JOY]
 TurboGraFX parallel port interface
@@ -272,10 +272,6 @@ void cpu_idle_wait(void);
 
 void default_idle(void);
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void account_system_vtime(struct task_struct *);
-#endif
-
 #endif /* __KERNEL__ */
 
 #endif /* __ASSEMBLY__ */
@@ -542,10 +542,6 @@ extern void reloc_got2(unsigned long);
 
 #define PTRRELOC(x) ((typeof(x)) add_reloc_offset((unsigned long)(x)))
 
-#ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void account_system_vtime(struct task_struct *);
-#endif
-
 extern struct dentry *powerpc_debugfs_root;
 
 #endif /* __KERNEL__ */
@@ -199,6 +199,13 @@ config HOTPLUG_CPU
 can be controlled through /sys/devices/system/cpu/cpu#.
 Say N if you want to disable CPU hotplug.
 
+config SCHED_BOOK
+bool "Book scheduler support"
+depends on SMP
+help
+Book scheduler support improves the CPU scheduler's decision making
+when dealing with machines that have several books.
+
 config MATHEMU
 bool "IEEE FPU emulation"
 depends on MARCH_G5
@@ -97,7 +97,6 @@ static inline void restore_access_regs(unsigned int *acrs)
 
 extern void account_vtime(struct task_struct *, struct task_struct *);
 extern void account_tick_vtime(struct task_struct *);
-extern void account_system_vtime(struct task_struct *);
 
 #ifdef CONFIG_PFAULT
 extern void pfault_irq_init(void);
@@ -3,15 +3,32 @@
 
 #include <linux/cpumask.h>
 
-#define mc_capable() (1)
-
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu);
-
 extern unsigned char cpu_core_id[NR_CPUS];
 extern cpumask_t cpu_core_map[NR_CPUS];
 
+static inline const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
+{
+return &cpu_core_map[cpu];
+}
+
 #define topology_core_id(cpu) (cpu_core_id[cpu])
 #define topology_core_cpumask(cpu) (&cpu_core_map[cpu])
+#define mc_capable() (1)
+
+#ifdef CONFIG_SCHED_BOOK
+
+extern unsigned char cpu_book_id[NR_CPUS];
+extern cpumask_t cpu_book_map[NR_CPUS];
+
+static inline const struct cpumask *cpu_book_mask(unsigned int cpu)
+{
+return &cpu_book_map[cpu];
+}
+
+#define topology_book_id(cpu) (cpu_book_id[cpu])
+#define topology_book_cpumask(cpu) (&cpu_book_map[cpu])
+
+#endif /* CONFIG_SCHED_BOOK */
 
 int topology_set_cpu_management(int fc);
 void topology_schedule_update(void);
@@ -30,6 +47,8 @@ static inline void s390_init_cpu_topology(void)
 };
 #endif
 
+#define SD_BOOK_INIT SD_CPU_INIT
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_S390_TOPOLOGY_H */
@@ -57,8 +57,8 @@ struct tl_info {
 union tl_entry tle[0];
 };
 
-struct core_info {
-struct core_info *next;
+struct mask_info {
+struct mask_info *next;
 unsigned char id;
 cpumask_t mask;
 };
@@ -66,7 +66,6 @@ struct core_info {
 static int topology_enabled;
 static void topology_work_fn(struct work_struct *work);
 static struct tl_info *tl_info;
-static struct core_info core_info;
 static int machine_has_topology;
 static struct timer_list topology_timer;
 static void set_topology_timer(void);
@@ -74,38 +73,37 @@ static DECLARE_WORK(topology_work, topology_work_fn);
 /* topology_lock protects the core linked list */
 static DEFINE_SPINLOCK(topology_lock);
 
+static struct mask_info core_info;
 cpumask_t cpu_core_map[NR_CPUS];
 unsigned char cpu_core_id[NR_CPUS];
 
-static cpumask_t cpu_coregroup_map(unsigned int cpu)
+#ifdef CONFIG_SCHED_BOOK
+static struct mask_info book_info;
+cpumask_t cpu_book_map[NR_CPUS];
+unsigned char cpu_book_id[NR_CPUS];
+#endif
+
+static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu)
 {
-struct core_info *core = &core_info;
-unsigned long flags;
 cpumask_t mask;
 
 cpus_clear(mask);
 if (!topology_enabled || !machine_has_topology)
 return cpu_possible_map;
-spin_lock_irqsave(&topology_lock, flags);
-while (core) {
-if (cpu_isset(cpu, core->mask)) {
-mask = core->mask;
+while (info) {
+if (cpu_isset(cpu, info->mask)) {
+mask = info->mask;
 break;
 }
-core = core->next;
+info = info->next;
 }
-spin_unlock_irqrestore(&topology_lock, flags);
 if (cpus_empty(mask))
 mask = cpumask_of_cpu(cpu);
 return mask;
 }
 
-const struct cpumask *cpu_coregroup_mask(unsigned int cpu)
-{
-return &cpu_core_map[cpu];
-}
-
-static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
+static void add_cpus_to_mask(struct tl_cpu *tl_cpu, struct mask_info *book,
+struct mask_info *core)
 {
 unsigned int cpu;
 
@@ -117,23 +115,35 @@ static void add_cpus_to_core(struct tl_cpu *tl_cpu, struct core_info *core)
 
 rcpu = CPU_BITS - 1 - cpu + tl_cpu->origin;
 for_each_present_cpu(lcpu) {
-if (cpu_logical_map(lcpu) == rcpu) {
-cpu_set(lcpu, core->mask);
-cpu_core_id[lcpu] = core->id;
-smp_cpu_polarization[lcpu] = tl_cpu->pp;
-}
+if (cpu_logical_map(lcpu) != rcpu)
+continue;
+#ifdef CONFIG_SCHED_BOOK
+cpu_set(lcpu, book->mask);
+cpu_book_id[lcpu] = book->id;
+#endif
+cpu_set(lcpu, core->mask);
+cpu_core_id[lcpu] = core->id;
+smp_cpu_polarization[lcpu] = tl_cpu->pp;
 }
 }
 }
 
-static void clear_cores(void)
+static void clear_masks(void)
 {
-struct core_info *core = &core_info;
+struct mask_info *info;
 
-while (core) {
-cpus_clear(core->mask);
-core = core->next;
+info = &core_info;
+while (info) {
+cpus_clear(info->mask);
+info = info->next;
 }
+#ifdef CONFIG_SCHED_BOOK
+info = &book_info;
+while (info) {
+cpus_clear(info->mask);
+info = info->next;
+}
+#endif
 }
 
 static union tl_entry *next_tle(union tl_entry *tle)
@@ -146,29 +156,36 @@ static union tl_entry *next_tle(union tl_entry *tle)
 
 static void tl_to_cores(struct tl_info *info)
 {
+#ifdef CONFIG_SCHED_BOOK
+struct mask_info *book = &book_info;
+#else
+struct mask_info *book = NULL;
+#endif
+struct mask_info *core = &core_info;
 union tl_entry *tle, *end;
-struct core_info *core = &core_info;
 
 spin_lock_irq(&topology_lock);
-clear_cores();
+clear_masks();
 tle = info->tle;
 end = (union tl_entry *)((unsigned long)info + info->length);
 while (tle < end) {
 switch (tle->nl) {
-case 5:
-case 4:
-case 3:
+#ifdef CONFIG_SCHED_BOOK
 case 2:
+book = book->next;
+book->id = tle->container.id;
 break;
+#endif
 case 1:
 core = core->next;
 core->id = tle->container.id;
 break;
 case 0:
-add_cpus_to_core(&tle->cpu, core);
+add_cpus_to_mask(&tle->cpu, book, core);
 break;
 default:
-clear_cores();
+clear_masks();
 machine_has_topology = 0;
 goto out;
 }
@@ -221,10 +238,29 @@ int topology_set_cpu_management(int fc)
 
 static void update_cpu_core_map(void)
 {
+unsigned long flags;
 int cpu;
 
-for_each_possible_cpu(cpu)
-cpu_core_map[cpu] = cpu_coregroup_map(cpu);
+spin_lock_irqsave(&topology_lock, flags);
+for_each_possible_cpu(cpu) {
+cpu_core_map[cpu] = cpu_group_map(&core_info, cpu);
+#ifdef CONFIG_SCHED_BOOK
+cpu_book_map[cpu] = cpu_group_map(&book_info, cpu);
+#endif
+}
+spin_unlock_irqrestore(&topology_lock, flags);
+}
+
+static void store_topology(struct tl_info *info)
+{
+#ifdef CONFIG_SCHED_BOOK
+int rc;
+
+rc = stsi(info, 15, 1, 3);
+if (rc != -ENOSYS)
+return;
+#endif
+stsi(info, 15, 1, 2);
 }
 
 int arch_update_cpu_topology(void)
@@ -238,7 +274,7 @@ int arch_update_cpu_topology(void)
 topology_update_polarization_simple();
 return 0;
 }
-stsi(info, 15, 1, 2);
+store_topology(info);
 tl_to_cores(info);
 update_cpu_core_map();
 for_each_online_cpu(cpu) {
@@ -299,12 +335,24 @@ out:
 }
 __initcall(init_topology_update);
 
+static void alloc_masks(struct tl_info *info, struct mask_info *mask, int offset)
+{
+int i, nr_masks;
+
+nr_masks = info->mag[NR_MAG - offset];
+for (i = 0; i < info->mnest - offset; i++)
+nr_masks *= info->mag[NR_MAG - offset - 1 - i];
+nr_masks = max(nr_masks, 1);
+for (i = 0; i < nr_masks; i++) {
+mask->next = alloc_bootmem(sizeof(struct mask_info));
+mask = mask->next;
+}
+}
+
 void __init s390_init_cpu_topology(void)
 {
 unsigned long long facility_bits;
 struct tl_info *info;
-struct core_info *core;
-int nr_cores;
 int i;
 
 if (stfle(&facility_bits, 1) <= 0)
@@ -315,25 +363,13 @@ void __init s390_init_cpu_topology(void)
 
 tl_info = alloc_bootmem_pages(PAGE_SIZE);
 info = tl_info;
-stsi(info, 15, 1, 2);
-
-nr_cores = info->mag[NR_MAG - 2];
-for (i = 0; i < info->mnest - 2; i++)
-nr_cores *= info->mag[NR_MAG - 3 - i];
-
+store_topology(info);
 pr_info("The CPU configuration topology of the machine is:");
 for (i = 0; i < NR_MAG; i++)
 printk(" %d", info->mag[i]);
 printk(" / %d\n", info->mnest);
-
-core = &core_info;
-for (i = 0; i < nr_cores; i++) {
-core->next = alloc_bootmem(sizeof(struct core_info));
-core = core->next;
-if (!core)
-goto error;
-}
-return;
-error:
-machine_has_topology = 0;
+alloc_masks(info, &core_info, 2);
+#ifdef CONFIG_SCHED_BOOK
+alloc_masks(info, &book_info, 3);
+#endif
 }
@@ -799,6 +799,17 @@ config SCHED_MC
 making when dealing with multi-core CPU chips at a cost of slightly
 increased overhead in some places. If unsure say N here.
 
+config IRQ_TIME_ACCOUNTING
+bool "Fine granularity task level IRQ time accounting"
+default n
+---help---
+Select this option to enable fine granularity task irq time
+accounting. This is done by reading a timestamp on each
+transitions between softirq and hardirq state, so there can be a
+small performance impact.
+
+If in doubt, say N here.
+
 source "kernel/Kconfig.preempt"
 
 config X86_UP_APIC
@@ -104,10 +104,14 @@ int __init notsc_setup(char *str)
 
 __setup("notsc", notsc_setup);
 
+static int no_sched_irq_time;
+
 static int __init tsc_setup(char *str)
 {
 if (!strcmp(str, "reliable"))
 tsc_clocksource_reliable = 1;
+if (!strncmp(str, "noirqtime", 9))
+no_sched_irq_time = 1;
 return 1;
 }
 
@@ -801,6 +805,7 @@ void mark_tsc_unstable(char *reason)
 if (!tsc_unstable) {
 tsc_unstable = 1;
 sched_clock_stable = 0;
+disable_sched_clock_irqtime();
 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
 /* Change only the rating, when not registered */
 if (clocksource_tsc.mult)
@@ -987,6 +992,9 @@ void __init tsc_init(void)
 /* now allow native_sched_clock() to use rdtsc */
 tsc_disabled = 0;
 
+if (!no_sched_irq_time)
+enable_sched_clock_irqtime();
+
 lpj = ((u64)tsc_khz * 1000);
 do_div(lpj, HZ);
 lpj_fine = lpj;
@@ -45,7 +45,8 @@ static ssize_t show_##name(struct sys_device *dev, \
 return sprintf(buf, "%d\n", topology_##name(cpu)); \
 }
 
-#if defined(topology_thread_cpumask) || defined(topology_core_cpumask)
+#if defined(topology_thread_cpumask) || defined(topology_core_cpumask) || \
+defined(topology_book_cpumask)
 static ssize_t show_cpumap(int type, const struct cpumask *mask, char *buf)
 {
 ptrdiff_t len = PTR_ALIGN(buf + PAGE_SIZE - 1, PAGE_SIZE) - buf;
@@ -114,6 +115,14 @@ define_siblings_show_func(core_cpumask);
 define_one_ro_named(core_siblings, show_core_cpumask);
 define_one_ro_named(core_siblings_list, show_core_cpumask_list);
 
+#ifdef CONFIG_SCHED_BOOK
+define_id_show_func(book_id);
+define_one_ro(book_id);
+define_siblings_show_func(book_cpumask);
+define_one_ro_named(book_siblings, show_book_cpumask);
+define_one_ro_named(book_siblings_list, show_book_cpumask_list);
+#endif
+
 static struct attribute *default_attrs[] = {
 &attr_physical_package_id.attr,
 &attr_core_id.attr,
@@ -121,6 +130,11 @@ static struct attribute *default_attrs[] = {
 &attr_thread_siblings_list.attr,
 &attr_core_siblings.attr,
 &attr_core_siblings_list.attr,
+#ifdef CONFIG_SCHED_BOOK
+&attr_book_id.attr,
+&attr_book_siblings.attr,
+&attr_book_siblings_list.attr,
+#endif
 NULL
 };
 
@@ -64,6 +64,8 @@
 #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT)
 #define NMI_OFFSET (1UL << NMI_SHIFT)
 
+#define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET)
+
 #ifndef PREEMPT_ACTIVE
 #define PREEMPT_ACTIVE_BITS 1
 #define PREEMPT_ACTIVE_SHIFT (NMI_SHIFT + NMI_BITS)
@@ -82,10 +84,13 @@
 /*
 * Are we doing bottom half or hardware interrupt processing?
 * Are we in a softirq context? Interrupt context?
+* in_softirq - Are we currently processing softirq or have bh disabled?
+* in_serving_softirq - Are we currently processing softirq?
 */
 #define in_irq() (hardirq_count())
 #define in_softirq() (softirq_count())
 #define in_interrupt() (irq_count())
+#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET)
 
 /*
 * Are we in NMI context?
@@ -132,10 +137,12 @@ extern void synchronize_irq(unsigned int irq);
 
 struct task_struct;
 
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING)
 static inline void account_system_vtime(struct task_struct *tsk)
 {
 }
+#else
+extern void account_system_vtime(struct task_struct *tsk);
 #endif
 
 #if defined(CONFIG_NO_HZ)
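A minimal sketch of the distinction the new comment draws (illustrative only, not part of this commit; the demo function name is an assumption and module boilerplate is omitted): with bottom halves merely disabled, softirq_count() carries SOFTIRQ_DISABLE_OFFSET rather than SOFTIRQ_OFFSET, so in_softirq() is already true while in_serving_softirq() stays false; only while a softirq handler is actually running do both return true.

#include <linux/kernel.h>
#include <linux/interrupt.h>
#include <linux/hardirq.h>

/* Illustrative sketch: call from process context, e.g. a module init. */
static void demo_softirq_predicates(void)
{
	local_bh_disable();
	/* bh disabled, but no softirq handler is being served */
	printk(KERN_INFO "in_softirq=%d in_serving_softirq=%d\n",
	       !!in_softirq(), !!in_serving_softirq());
	/* expected output: in_softirq=1 in_serving_softirq=0 */
	local_bh_enable();
}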
@@ -875,6 +875,7 @@ enum sched_domain_level {
 SD_LV_NONE = 0,
 SD_LV_SIBLING,
 SD_LV_MC,
+SD_LV_BOOK,
 SD_LV_CPU,
 SD_LV_NODE,
 SD_LV_ALLNODES,
@@ -1690,8 +1691,7 @@ extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *
 /*
 * Per process flags
 */
-#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */
-/* Not implemented yet, only for 486*/
+#define PF_KSOFTIRQD 0x00000001 /* I am ksoftirqd */
 #define PF_STARTING 0x00000002 /* being created */
 #define PF_EXITING 0x00000004 /* getting shut down */
 #define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
@@ -1837,6 +1837,19 @@ extern void sched_clock_idle_sleep_event(void);
 extern void sched_clock_idle_wakeup_event(u64 delta_ns);
 #endif
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+* An i/f to runtime opt-in for irq time accounting based off of sched_clock.
+* The reason for this explicit opt-in is not to have perf penalty with
+* slow sched_clocks.
+*/
+extern void enable_sched_clock_irqtime(void);
+extern void disable_sched_clock_irqtime(void);
+#else
+static inline void enable_sched_clock_irqtime(void) {}
+static inline void disable_sched_clock_irqtime(void) {}
+#endif
+
 extern unsigned long long
 task_sched_runtime(struct task_struct *task);
 extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
@@ -2378,9 +2391,9 @@ extern int __cond_resched_lock(spinlock_t *lock);
 
 extern int __cond_resched_softirq(void);
 
 #define cond_resched_softirq() ({ \
-__might_sleep(__FILE__, __LINE__, SOFTIRQ_OFFSET); \
+__might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
 __cond_resched_softirq(); \
 })
 
 /*
@@ -201,6 +201,12 @@ int arch_update_cpu_topology(void);
 .balance_interval = 64, \
 }
 
+#ifdef CONFIG_SCHED_BOOK
+#ifndef SD_BOOK_INIT
+#error Please define an appropriate SD_BOOK_INIT in include/asm/topology.h!!!
+#endif
+#endif /* CONFIG_SCHED_BOOK */
+
 #ifdef CONFIG_NUMA
 #ifndef SD_NODE_INIT
 #error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
@@ -362,6 +362,35 @@ TRACE_EVENT(sched_stat_runtime,
 (unsigned long long)__entry->vruntime)
 );
 
+/*
+* Tracepoint for showing priority inheritance modifying a tasks
+* priority.
+*/
+TRACE_EVENT(sched_pi_setprio,
+
+TP_PROTO(struct task_struct *tsk, int newprio),
+
+TP_ARGS(tsk, newprio),
+
+TP_STRUCT__entry(
+__array( char, comm, TASK_COMM_LEN )
+__field( pid_t, pid )
+__field( int, oldprio )
+__field( int, newprio )
+),
+
+TP_fast_assign(
+memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+__entry->pid = tsk->pid;
+__entry->oldprio = tsk->prio;
+__entry->newprio = newprio;
+),
+
+TP_printk("comm=%s pid=%d oldprio=%d newprio=%d",
+__entry->comm, __entry->pid,
+__entry->oldprio, __entry->newprio)
+);
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
kernel/sched.c: 293 lines changed
@@ -426,9 +426,7 @@ struct root_domain {
 */
 cpumask_var_t rto_mask;
 atomic_t rto_count;
-#ifdef CONFIG_SMP
 struct cpupri cpupri;
-#endif
 };
 
 /*
@@ -437,7 +435,7 @@ struct root_domain {
 */
 static struct root_domain def_root_domain;
 
-#endif
+#endif /* CONFIG_SMP */
 
 /*
 * This is the main, per-CPU runqueue data structure.
@@ -488,11 +486,12 @@ struct rq {
 */
 unsigned long nr_uninterruptible;
 
-struct task_struct *curr, *idle;
+struct task_struct *curr, *idle, *stop;
 unsigned long next_balance;
 struct mm_struct *prev_mm;
 
 u64 clock;
+u64 clock_task;
 
 atomic_t nr_iowait;
 
@@ -520,6 +519,10 @@ struct rq {
 u64 avg_idle;
 #endif
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+u64 prev_irq_time;
+#endif
+
 /* calc_load related fields */
 unsigned long calc_load_update;
 long calc_load_active;
@@ -643,10 +646,22 @@ static inline struct task_group *task_group(struct task_struct *p)
 
 #endif /* CONFIG_CGROUP_SCHED */
 
+static u64 irq_time_cpu(int cpu);
+static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+
 inline void update_rq_clock(struct rq *rq)
 {
-if (!rq->skip_clock_update)
-rq->clock = sched_clock_cpu(cpu_of(rq));
+if (!rq->skip_clock_update) {
+int cpu = cpu_of(rq);
+u64 irq_time;
+
+rq->clock = sched_clock_cpu(cpu);
+irq_time = irq_time_cpu(cpu);
+if (rq->clock - irq_time > rq->clock_task)
+rq->clock_task = rq->clock - irq_time;
+
+sched_irq_time_avg_update(rq, irq_time);
+}
 }
 
 /*
@@ -723,7 +738,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 size_t cnt, loff_t *ppos)
 {
 char buf[64];
-char *cmp = buf;
+char *cmp;
 int neg = 0;
 int i;
 
@@ -734,6 +749,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 return -EFAULT;
 
 buf[cnt] = 0;
+cmp = strstrip(buf);
 
 if (strncmp(buf, "NO_", 3) == 0) {
 neg = 1;
@@ -741,9 +757,7 @@ sched_feat_write(struct file *filp, const char __user *ubuf,
 }
 
 for (i = 0; sched_feat_names[i]; i++) {
-int len = strlen(sched_feat_names[i]);
-
-if (strncmp(cmp, sched_feat_names[i], len) == 0) {
+if (strcmp(cmp, sched_feat_names[i]) == 0) {
 if (neg)
 sysctl_sched_features &= ~(1UL << i);
 else
@@ -1840,7 +1854,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 static const struct sched_class rt_sched_class;
 
-#define sched_class_highest (&rt_sched_class)
+#define sched_class_highest (&stop_sched_class)
 #define for_each_class(class) \
 for (class = sched_class_highest; class; class = class->next)
 
@@ -1858,12 +1872,6 @@ static void dec_nr_running(struct rq *rq)
 
 static void set_load_weight(struct task_struct *p)
 {
-if (task_has_rt_policy(p)) {
-p->se.load.weight = 0;
-p->se.load.inv_weight = WMULT_CONST;
-return;
-}
-
 /*
 * SCHED_IDLE tasks get minimal weight:
 */
@@ -1917,13 +1925,132 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 dec_nr_running(rq);
 }
 
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+* There are no locks covering percpu hardirq/softirq time.
+* They are only modified in account_system_vtime, on corresponding CPU
+* with interrupts disabled. So, writes are safe.
+* They are read and saved off onto struct rq in update_rq_clock().
+* This may result in other CPU reading this CPU's irq time and can
+* race with irq/account_system_vtime on this CPU. We would either get old
+* or new value (or semi updated value on 32 bit) with a side effect of
+* accounting a slice of irq time to wrong task when irq is in progress
+* while we read rq->clock. That is a worthy compromise in place of having
+* locks on each irq in account_system_time.
+*/
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+sched_clock_irqtime = 0;
+}
+
+static u64 irq_time_cpu(int cpu)
+{
+if (!sched_clock_irqtime)
+return 0;
+
+return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+
+void account_system_vtime(struct task_struct *curr)
+{
+unsigned long flags;
+int cpu;
+u64 now, delta;
+
+if (!sched_clock_irqtime)
+return;
+
+local_irq_save(flags);
+
+cpu = smp_processor_id();
+now = sched_clock_cpu(cpu);
+delta = now - per_cpu(irq_start_time, cpu);
+per_cpu(irq_start_time, cpu) = now;
+/*
+* We do not account for softirq time from ksoftirqd here.
+* We want to continue accounting softirq time to ksoftirqd thread
+* in that case, so as not to confuse scheduler with a special task
+* that do not consume any time, but still wants to run.
+*/
+if (hardirq_count())
+per_cpu(cpu_hardirq_time, cpu) += delta;
+else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+per_cpu(cpu_softirq_time, cpu) += delta;
+
+local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+{
+if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
+u64 delta_irq = curr_irq_time - rq->prev_irq_time;
+rq->prev_irq_time = curr_irq_time;
+sched_rt_avg_update(rq, delta_irq);
+}
+}
+
+#else
+
+static u64 irq_time_cpu(int cpu)
+{
+return 0;
+}
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+
+#endif
+
 #include "sched_idletask.c"
 #include "sched_fair.c"
 #include "sched_rt.c"
+#include "sched_stoptask.c"
 #ifdef CONFIG_SCHED_DEBUG
 # include "sched_debug.c"
 #endif
 
+void sched_set_stop_task(int cpu, struct task_struct *stop)
+{
+struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
+struct task_struct *old_stop = cpu_rq(cpu)->stop;
+
+if (stop) {
+/*
+* Make it appear like a SCHED_FIFO task, its something
+* userspace knows about and won't get confused about.
+*
+* Also, it will make PI more or less work without too
+* much confusion -- but then, stop work should not
+* rely on PI working anyway.
+*/
+sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
+
+stop->sched_class = &stop_sched_class;
+}
+
+cpu_rq(cpu)->stop = stop;
+
+if (old_stop) {
+/*
+* Reset it back to a normal scheduling class so that
+* it can die in pieces.
+*/
+old_stop->sched_class = &rt_sched_class;
+}
+}
+
 /*
 * __normal_prio - return the priority that is based on the static prio
 */
@@ -2003,6 +2130,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 if (p->sched_class != &fair_sched_class)
 return 0;
 
+if (unlikely(p->policy == SCHED_IDLE))
+return 0;
+
 /*
 * Buddy candidates are cache hot:
 */
@@ -2852,14 +2982,14 @@ context_switch(struct rq *rq, struct task_struct *prev,
 */
 arch_start_context_switch(prev);
 
-if (likely(!mm)) {
+if (!mm) {
 next->active_mm = oldmm;
 atomic_inc(&oldmm->mm_count);
 enter_lazy_tlb(oldmm, next);
 } else
 switch_mm(oldmm, mm, next);
 
-if (likely(!prev->mm)) {
+if (!prev->mm) {
 prev->active_mm = NULL;
 rq->prev_mm = oldmm;
 }
@@ -3248,7 +3378,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
 
 if (task_current(rq, p)) {
 update_rq_clock(rq);
-ns = rq->clock - p->se.exec_start;
+ns = rq->clock_task - p->se.exec_start;
 if ((s64)ns < 0)
 ns = 0;
 }
@@ -3397,7 +3527,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
 tmp = cputime_to_cputime64(cputime);
 if (hardirq_count() - hardirq_offset)
 cpustat->irq = cputime64_add(cpustat->irq, tmp);
-else if (softirq_count())
+else if (in_serving_softirq())
 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
 else
 cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3723,17 +3853,13 @@ pick_next_task(struct rq *rq)
 return p;
 }
 
-class = sched_class_highest;
-for ( ; ; ) {
+for_each_class(class) {
 p = class->pick_next_task(rq);
 if (p)
 return p;
-/*
-* Will never be NULL as the idle class always
-* returns a non-NULL p:
-*/
-class = class->next;
 }
+
+BUG(); /* the idle class will always have a runnable task */
 }
 
 /*
@@ -4358,6 +4484,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
 rq = task_rq_lock(p, &flags);
 
+trace_sched_pi_setprio(p, prio);
 oldprio = p->prio;
 prev_class = p->sched_class;
 on_rq = p->se.on_rq;
@@ -4661,6 +4788,15 @@ recheck:
 */
 rq = __task_rq_lock(p);
 
+/*
+* Changing the policy of the stop threads its a very bad idea
+*/
+if (p == rq->stop) {
+__task_rq_unlock(rq);
+raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+return -EINVAL;
+}
+
 #ifdef CONFIG_RT_GROUP_SCHED
 if (user) {
 /*
@@ -4893,7 +5029,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 
 cpuset_cpus_allowed(p, cpus_allowed);
 cpumask_and(new_mask, in_mask, cpus_allowed);
 again:
 retval = set_cpus_allowed_ptr(p, new_mask);
 
 if (!retval) {
@@ -6526,6 +6662,7 @@ struct s_data {
 cpumask_var_t nodemask;
 cpumask_var_t this_sibling_map;
 cpumask_var_t this_core_map;
+cpumask_var_t this_book_map;
 cpumask_var_t send_covered;
 cpumask_var_t tmpmask;
 struct sched_group **sched_group_nodes;
@@ -6537,6 +6674,7 @@ enum s_alloc {
 sa_rootdomain,
 sa_tmpmask,
 sa_send_covered,
+sa_this_book_map,
 sa_this_core_map,
 sa_this_sibling_map,
 sa_nodemask,
@@ -6572,31 +6710,48 @@ cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
-#endif /* CONFIG_SCHED_MC */
 
-#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
 static int
 cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
 struct sched_group **sg, struct cpumask *mask)
 {
 int group;
+#ifdef CONFIG_SCHED_SMT
 cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
 group = cpumask_first(mask);
+#else
+group = cpu;
+#endif
 if (sg)
 *sg = &per_cpu(sched_group_core, group).sg;
 return group;
 }
-#elif defined(CONFIG_SCHED_MC)
+#endif /* CONFIG_SCHED_MC */
+
+/*
+* book sched-domains:
+*/
+#ifdef CONFIG_SCHED_BOOK
+static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
+static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
+
 static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
-struct sched_group **sg, struct cpumask *unused)
+cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
+struct sched_group **sg, struct cpumask *mask)
 {
-if (sg)
-*sg = &per_cpu(sched_group_core, cpu).sg;
-return cpu;
-}
+int group = cpu;
+#ifdef CONFIG_SCHED_MC
+cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
+group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_SMT)
+cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
+group = cpumask_first(mask);
 #endif
+if (sg)
+*sg = &per_cpu(sched_group_book, group).sg;
+return group;
+}
+#endif /* CONFIG_SCHED_BOOK */
 
 static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
 static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
@@ -6606,7 +6761,10 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
 struct sched_group **sg, struct cpumask *mask)
 {
 int group;
-#ifdef CONFIG_SCHED_MC
+#ifdef CONFIG_SCHED_BOOK
+cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
+group = cpumask_first(mask);
+#elif defined(CONFIG_SCHED_MC)
 cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
 group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
@@ -6867,6 +7025,9 @@ SD_INIT_FUNC(CPU)
 #ifdef CONFIG_SCHED_MC
 SD_INIT_FUNC(MC)
 #endif
+#ifdef CONFIG_SCHED_BOOK
+SD_INIT_FUNC(BOOK)
+#endif
 
 static int default_relax_domain_level = -1;
 
@@ -6916,6 +7077,8 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
 free_cpumask_var(d->tmpmask); /* fall through */
 case sa_send_covered:
 free_cpumask_var(d->send_covered); /* fall through */
+case sa_this_book_map:
+free_cpumask_var(d->this_book_map); /* fall through */
 case sa_this_core_map:
 free_cpumask_var(d->this_core_map); /* fall through */
 case sa_this_sibling_map:
@@ -6962,8 +7125,10 @@ static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
 return sa_nodemask;
 if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
 return sa_this_sibling_map;
-if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
 return sa_this_core_map;
+if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
+return sa_this_book_map;
 if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
 return sa_send_covered;
 d->rd = alloc_rootdomain();
@@ -7021,6 +7186,23 @@ static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
 return sd;
 }
 
+static struct sched_domain *__build_book_sched_domain(struct s_data *d,
+const struct cpumask *cpu_map, struct sched_domain_attr *attr,
+struct sched_domain *parent, int i)
+{
+struct sched_domain *sd = parent;
+#ifdef CONFIG_SCHED_BOOK
+sd = &per_cpu(book_domains, i).sd;
+SD_INIT(sd, BOOK);
+set_domain_attribute(sd, attr);
+cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
+sd->parent = parent;
+parent->child = sd;
+cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
+#endif
+return sd;
+}
+
 static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
 struct sched_domain *parent, int i)
@@ -7077,6 +7259,15 @@ static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
 &cpu_to_core_group,
 d->send_covered, d->tmpmask);
 break;
+#endif
+#ifdef CONFIG_SCHED_BOOK
+case SD_LV_BOOK: /* set up book groups */
+cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
+if (cpu == cpumask_first(d->this_book_map))
+init_sched_build_groups(d->this_book_map, cpu_map,
+&cpu_to_book_group,
+d->send_covered, d->tmpmask);
+break;
 #endif
 case SD_LV_CPU: /* set up physical groups */
 cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
@@ -7125,12 +7316,14 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 
 sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
 sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
+sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
 sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
 sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
 }
 
 for_each_cpu(i, cpu_map) {
 build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
+build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
 build_sched_groups(&d, SD_LV_MC, cpu_map, i);
 }
 
@@ -7161,6 +7354,12 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 init_sched_groups_power(i, sd);
 }
 #endif
+#ifdef CONFIG_SCHED_BOOK
+for_each_cpu(i, cpu_map) {
+sd = &per_cpu(book_domains, i).sd;
+init_sched_groups_power(i, sd);
+}
+#endif
 
 for_each_cpu(i, cpu_map) {
 sd = &per_cpu(phys_domains, i).sd;
@@ -7186,6 +7385,8 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 sd = &per_cpu(cpu_domains, i).sd;
 #elif defined(CONFIG_SCHED_MC)
 sd = &per_cpu(core_domains, i).sd;
+#elif defined(CONFIG_SCHED_BOOK)
+sd = &per_cpu(book_domains, i).sd;
 #else
 sd = &per_cpu(phys_domains, i).sd;
 #endif
@@ -8090,9 +8291,9 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
 
 return 1;
 
 err_free_rq:
 kfree(cfs_rq);
 err:
 return 0;
 }
 
@@ -8180,9 +8381,9 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 
 return 1;
 
 err_free_rq:
 kfree(rt_rq);
 err:
 return 0;
 }
 
@@ -8540,7 +8741,7 @@ static int tg_set_bandwidth(struct task_group *tg,
|
|||||||
raw_spin_unlock(&rt_rq->rt_runtime_lock);
|
raw_spin_unlock(&rt_rq->rt_runtime_lock);
|
||||||
}
|
}
|
||||||
raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
|
raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
|
||||||
unlock:
|
unlock:
|
||||||
read_unlock(&tasklist_lock);
|
read_unlock(&tasklist_lock);
|
||||||
mutex_unlock(&rt_constraints_mutex);
|
mutex_unlock(&rt_constraints_mutex);
|
||||||
|
|
||||||
@@ -25,7 +25,7 @@

  /*
  * Targeted preemption latency for CPU-bound tasks:
- * (default: 5ms * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  *
  * NOTE: this latency value is not the same as the concept of
  * 'timeslice length' - timeslices in CFS are of variable length
@@ -52,7 +52,7 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling

  /*
  * Minimal preemption granularity for CPU-bound tasks:
- * (default: 2 msec * (1 + ilog(ncpus)), units: nanoseconds)
+ * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  */
  unsigned int sysctl_sched_min_granularity = 750000ULL;
  unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
  static void update_curr(struct cfs_rq *cfs_rq)
  {
  struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_of(cfs_rq)->clock;
+ u64 now = rq_of(cfs_rq)->clock_task;
  unsigned long delta_exec;

  if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  /*
  * We are starting a new run period:
  */
- se->exec_start = rq_of(cfs_rq)->clock;
+ se->exec_start = rq_of(cfs_rq)->clock_task;
  }

  /**************************************************
@@ -1764,6 +1764,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
  set_task_cpu(p, this_cpu);
  activate_task(this_rq, p, 0);
  check_preempt_curr(this_rq, p, 0);
+
+ /* re-arm NEWIDLE balancing when moving tasks */
+ src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
+ this_rq->idle_stamp = 0;
  }

  /*
@@ -1798,7 +1802,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
  * 2) too many balance attempts have failed.
  */

- tsk_cache_hot = task_hot(p, rq->clock, sd);
+ tsk_cache_hot = task_hot(p, rq->clock_task, sd);
  if (!tsk_cache_hot ||
  sd->nr_balance_failed > sd->cache_nice_tries) {
  #ifdef CONFIG_SCHEDSTATS
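The switch from rq->clock to rq->clock_task above only changes which clock feeds runtime accounting and cache-hot decisions. A toy userspace model of the intended split, with invented names and numbers; the assumption is that clock_task advances only by the share of time not spent in IRQ context:

#include <stdio.h>
#include <stdint.h>

/* Toy model (not kernel code): the runqueue keeps two clocks. "clock" follows
 * wall time; "clock_task" advances only by the non-IRQ share, so per-task
 * runtime and cache-hot checks do not charge interrupt time to the task. */
struct toy_rq { uint64_t clock, clock_task; };

static void toy_update_rq_clock(struct toy_rq *rq, uint64_t delta, uint64_t irq_delta)
{
	rq->clock += delta;
	if (irq_delta > delta)
		irq_delta = delta;	/* clamp: cannot lose more than elapsed */
	rq->clock_task += delta - irq_delta;
}

int main(void)
{
	struct toy_rq rq = { 0, 0 };
	toy_update_rq_clock(&rq, 1000, 0);	/* quiet tick */
	toy_update_rq_clock(&rq, 1000, 400);	/* 40% of this tick went to IRQs */
	printf("clock=%llu clock_task=%llu\n",
	       (unsigned long long)rq.clock, (unsigned long long)rq.clock_task);
	/* prints: clock=2000 clock_task=1600 */
	return 0;
}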
@@ -2030,12 +2034,14 @@ struct sd_lb_stats {
  unsigned long this_load;
  unsigned long this_load_per_task;
  unsigned long this_nr_running;
+ unsigned long this_has_capacity;

  /* Statistics of the busiest group */
  unsigned long max_load;
  unsigned long busiest_load_per_task;
  unsigned long busiest_nr_running;
  unsigned long busiest_group_capacity;
+ unsigned long busiest_has_capacity;

  int group_imb; /* Is there imbalance in this sd */
  #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,6 +2064,7 @@ struct sg_lb_stats {
  unsigned long sum_weighted_load; /* Weighted load of group's tasks */
  unsigned long group_capacity;
  int group_imb; /* Is there an imbalance in the group ? */
+ int group_has_capacity; /* Is there extra capacity in the group? */
  };

  /**
@@ -2268,7 +2275,13 @@ unsigned long scale_rt_power(int cpu)
  u64 total, available;

  total = sched_avg_period() + (rq->clock - rq->age_stamp);
- available = total - rq->rt_avg;
+ if (unlikely(total < rq->rt_avg)) {
+ /* Ensures that power won't end up being negative */
+ available = 0;
+ } else {
+ available = total - rq->rt_avg;
+ }

  if (unlikely((s64)total < SCHED_LOAD_SCALE))
  total = SCHED_LOAD_SCALE;
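The new branch guards a u64 subtraction: if rt_avg ever exceeds the sampled total, "total - rq->rt_avg" would wrap to a huge value instead of going negative. A minimal standalone illustration of the same clamp, with hypothetical values and not kernel code:

#include <stdio.h>
#include <stdint.h>

/* Sketch: unsigned subtraction must be order-checked, otherwise an
 * IRQ/RT-time spike that exceeds the sampled period yields a wrapped,
 * enormous "available power" instead of zero. */
static uint64_t available_power(uint64_t total, uint64_t rt_avg)
{
	if (total < rt_avg)
		return 0;		/* would underflow: treat as no capacity left */
	return total - rt_avg;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)available_power(1000, 250));  /* 750 */
	printf("%llu\n", (unsigned long long)available_power(1000, 1200)); /* 0, not ~2^64 */
	return 0;
}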
@@ -2378,7 +2391,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
  int local_group, const struct cpumask *cpus,
  int *balance, struct sg_lb_stats *sgs)
  {
- unsigned long load, max_cpu_load, min_cpu_load;
+ unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
  int i;
  unsigned int balance_cpu = -1, first_idle_cpu = 0;
  unsigned long avg_load_per_task = 0;
@@ -2389,6 +2402,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
  /* Tally up the load of all CPUs in the group */
  max_cpu_load = 0;
  min_cpu_load = ~0UL;
+ max_nr_running = 0;

  for_each_cpu_and(i, sched_group_cpus(group), cpus) {
  struct rq *rq = cpu_rq(i);
@@ -2406,8 +2420,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
  load = target_load(i, load_idx);
  } else {
  load = source_load(i, load_idx);
- if (load > max_cpu_load)
+ if (load > max_cpu_load) {
  max_cpu_load = load;
+ max_nr_running = rq->nr_running;
+ }
  if (min_cpu_load > load)
  min_cpu_load = load;
  }
@@ -2447,13 +2463,15 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
  if (sgs->sum_nr_running)
  avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

- if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+ if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
  sgs->group_imb = 1;

- sgs->group_capacity =
- DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+ sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
  if (!sgs->group_capacity)
  sgs->group_capacity = fix_small_capacity(sd, group);

+ if (sgs->group_capacity > sgs->sum_nr_running)
+ sgs->group_has_capacity = 1;
  }

  /**
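group_capacity above is cpu_power rounded to whole task slots, and group_has_capacity is set when the group runs fewer tasks than it has slots. A small userspace sketch of that arithmetic, with an invented cpu_power value:

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL
#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

/* Sketch only: capacity is cpu_power expressed in "full CPU" units; a group
 * has spare capacity when it is running fewer tasks than those units. */
int main(void)
{
	unsigned long cpu_power = 2 * SCHED_LOAD_SCALE - 100;	/* e.g. 2 CPUs, some power eaten by irq */
	unsigned long nr_running = 1;

	unsigned long capacity = DIV_ROUND_CLOSEST(cpu_power, SCHED_LOAD_SCALE);
	int has_capacity = nr_running < capacity;

	printf("capacity=%lu has_capacity=%d\n", capacity, has_capacity);
	/* prints: capacity=2 has_capacity=1 */
	return 0;
}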
@@ -2542,9 +2560,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
  /*
  * In case the child domain prefers tasks go to siblings
  * first, lower the sg capacity to one so that we'll try
- * and move all the excess tasks away.
+ * and move all the excess tasks away. We lower the capacity
+ * of a group only if the local group has the capacity to fit
+ * these excess tasks, i.e. nr_running < group_capacity. The
+ * extra check prevents the case where you always pull from the
+ * heaviest group when it is already under-utilized (possible
+ * with a large weight task outweighs the tasks on the system).
  */
- if (prefer_sibling)
+ if (prefer_sibling && !local_group && sds->this_has_capacity)
  sgs.group_capacity = min(sgs.group_capacity, 1UL);

  if (local_group) {
@@ -2552,12 +2575,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
  sds->this = sg;
  sds->this_nr_running = sgs.sum_nr_running;
  sds->this_load_per_task = sgs.sum_weighted_load;
+ sds->this_has_capacity = sgs.group_has_capacity;
  } else if (update_sd_pick_busiest(sd, sds, sg, &sgs, this_cpu)) {
  sds->max_load = sgs.avg_load;
  sds->busiest = sg;
  sds->busiest_nr_running = sgs.sum_nr_running;
  sds->busiest_group_capacity = sgs.group_capacity;
  sds->busiest_load_per_task = sgs.sum_weighted_load;
+ sds->busiest_has_capacity = sgs.group_has_capacity;
  sds->group_imb = sgs.group_imb;
  }

@@ -2754,6 +2779,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
  return fix_small_imbalance(sds, this_cpu, imbalance);

  }

  /******* find_busiest_group() helpers end here *********************/

  /**
@@ -2805,6 +2831,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
  * 4) This group is more busy than the avg busieness at this
  * sched_domain.
  * 5) The imbalance is within the specified limit.
+ *
+ * Note: when doing newidle balance, if the local group has excess
+ * capacity (i.e. nr_running < group_capacity) and the busiest group
+ * does not have any capacity, we force a load balance to pull tasks
+ * to the local group. In this case, we skip past checks 3, 4 and 5.
  */
  if (!(*balance))
  goto ret;
@@ -2816,6 +2847,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
  if (!sds.busiest || sds.busiest_nr_running == 0)
  goto out_balanced;

+ /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+ !sds.busiest_has_capacity)
+ goto force_balance;
+
  if (sds.this_load >= sds.max_load)
  goto out_balanced;

@@ -2827,6 +2863,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
  if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
  goto out_balanced;

+ force_balance:
  /* Looks like there is an imbalance. Compute it */
  calculate_imbalance(&sds, this_cpu, imbalance);
  return sds.busiest;
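The force_balance path reduces to one predicate over the gathered statistics. A standalone sketch of that decision, with field names shortened and values invented:

#include <stdbool.h>
#include <stdio.h>

enum cpu_idle_type { CPU_IDLE, CPU_NOT_IDLE, CPU_NEWLY_IDLE };

/* Sketch of the check added above (not kernel code): a newly idle CPU with
 * spare task slots pulls from a group that has none, even when the usual
 * load comparisons would call the domain balanced. */
static bool force_newidle_balance(enum cpu_idle_type idle,
				  bool this_has_capacity,
				  bool busiest_has_capacity)
{
	return idle == CPU_NEWLY_IDLE && this_has_capacity && !busiest_has_capacity;
}

int main(void)
{
	printf("%d\n", force_newidle_balance(CPU_NEWLY_IDLE, true, false)); /* 1: skip checks, balance */
	printf("%d\n", force_newidle_balance(CPU_NOT_IDLE, true, false));   /* 0: normal path */
	return 0;
}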
@@ -3031,7 +3068,14 @@ redo:

  if (!ld_moved) {
  schedstat_inc(sd, lb_failed[idle]);
- sd->nr_balance_failed++;
+ /*
+ * Increment the failure counter only on periodic balance.
+ * We do not want newidle balance, which can be very
+ * frequent, pollute the failure counter causing
+ * excessive cache_hot migrations and active balances.
+ */
+ if (idle != CPU_NEWLY_IDLE)
+ sd->nr_balance_failed++;

  if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
  this_cpu)) {
@@ -3153,10 +3197,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
  interval = msecs_to_jiffies(sd->balance_interval);
  if (time_after(next_balance, sd->last_balance + interval))
  next_balance = sd->last_balance + interval;
- if (pulled_task) {
- this_rq->idle_stamp = 0;
+ if (pulled_task)
  break;
- }
  }

  raw_spin_lock(&this_rq->lock);
@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
  * release the lock. Decreases scheduling overhead.
  */
  SCHED_FEAT(OWNER_SPIN, 1)
+
+ /*
+ * Decrement CPU power based on irq activity
+ */
+ SCHED_FEAT(NONIRQ_POWER, 1)
@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq)
  if (!task_has_rt_policy(curr))
  return;

- delta_exec = rq->clock - curr->se.exec_start;
+ delta_exec = rq->clock_task - curr->se.exec_start;
  if (unlikely((s64)delta_exec < 0))
  delta_exec = 0;

@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq)
  curr->se.sum_exec_runtime += delta_exec;
  account_group_exec_runtime(curr, delta_exec);

- curr->se.exec_start = rq->clock;
+ curr->se.exec_start = rq->clock_task;
  cpuacct_charge(curr, delta_exec);

  sched_rt_avg_update(rq, delta_exec);
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
  * runqueue. Otherwise simply start this RT task
  * on its current runqueue.
  *
- * We want to avoid overloading runqueues. Even if
- * the RT task is of higher priority than the current RT task.
- * RT tasks behave differently than other tasks. If
- * one gets preempted, we try to push it off to another queue.
- * So trying to keep a preempting RT task on the same
- * cache hot CPU will force the running RT task to
- * a cold CPU. So we waste all the cache for the lower
- * RT task in hopes of saving some of a RT task
- * that is just being woken and probably will have
- * cold cache anyway.
+ * We want to avoid overloading runqueues. If the woken
+ * task is a higher priority, then it will stay on this CPU
+ * and the lower prio task should be moved to another CPU.
+ * Even though this will probably make the lower prio task
+ * lose its cache, we do not want to bounce a higher task
+ * around just because it gave up its CPU, perhaps for a
+ * lock?
+ *
+ * For equal prio tasks, we just let the scheduler sort it out.
  */
  if (unlikely(rt_task(rq->curr)) &&
+ (rq->curr->rt.nr_cpus_allowed < 2 ||
+ rq->curr->prio < p->prio) &&
  (p->rt.nr_cpus_allowed > 1)) {
  int cpu = find_lowest_rq(p);

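The rewritten condition above decides whether a waking RT task should be placed on another CPU. A plain C sketch of the same test, using a hypothetical struct instead of the runqueue; as in the kernel, a lower numeric prio means higher priority:

#include <stdbool.h>
#include <stdio.h>

/* Sketch: move the waking task p elsewhere only if the currently running RT
 * task either cannot migrate (affined to fewer than 2 CPUs) or outranks p,
 * and p itself is allowed to run on more than one CPU. */
struct rt_info {
	bool curr_is_rt;
	int curr_nr_cpus_allowed, curr_prio;
	int p_nr_cpus_allowed, p_prio;
};

static bool place_on_other_cpu(const struct rt_info *i)
{
	return i->curr_is_rt &&
	       (i->curr_nr_cpus_allowed < 2 || i->curr_prio < i->p_prio) &&
	       i->p_nr_cpus_allowed > 1;
}

int main(void)
{
	struct rt_info woken_lower  = { true, 4, 10, 4, 20 };	/* curr outranks p: look for another CPU */
	struct rt_info woken_higher = { true, 4, 20, 4, 10 };	/* p outranks curr: keep p here, it preempts */
	printf("%d %d\n", place_on_other_cpu(&woken_lower), place_on_other_cpu(&woken_higher));
	/* prints: 1 0 */
	return 0;
}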
@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
  } while (rt_rq);

  p = rt_task_of(rt_se);
- p->se.exec_start = rq->clock;
+ p->se.exec_start = rq->clock_task;

  return p;
  }
@@ -1139,7 +1140,7 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
  for_each_leaf_rt_rq(rt_rq, rq) {
  array = &rt_rq->active;
  idx = sched_find_first_bit(array->bitmap);
-  next_idx:
+ next_idx:
  if (idx >= MAX_RT_PRIO)
  continue;
  if (next && next->prio < idx)
@@ -1315,7 +1316,7 @@ static int push_rt_task(struct rq *rq)
  if (!next_task)
  return 0;

-  retry:
+ retry:
  if (unlikely(next_task == rq->curr)) {
  WARN_ON(1);
  return 0;
@@ -1463,7 +1464,7 @@ static int pull_rt_task(struct rq *this_rq)
  * but possible)
  */
  }
-  skip:
+ skip:
  double_unlock_balance(this_rq, src_rq);
  }

@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
  if (!task_running(rq, p) &&
  !test_tsk_need_resched(rq->curr) &&
  has_pushable_tasks(rq) &&
- p->rt.nr_cpus_allowed > 1)
+ p->rt.nr_cpus_allowed > 1 &&
+ rt_task(rq->curr) &&
+ (rq->curr->rt.nr_cpus_allowed < 2 ||
+ rq->curr->prio < p->prio))
  push_rt_tasks(rq);
  }

@@ -1709,7 +1713,7 @@ static void set_curr_task_rt(struct rq *rq)
  {
  struct task_struct *p = rq->curr;

- p->se.exec_start = rq->clock;
+ p->se.exec_start = rq->clock_task;

  /* The running task is never eligible for pushing */
  dequeue_pushable_task(rq, p);
kernel/sched_stoptask.c (new file, 108 lines)
@@ -0,0 +1,108 @@
+ /*
+ * stop-task scheduling class.
+ *
+ * The stop task is the highest priority task in the system, it preempts
+ * everything and will be preempted by nothing.
+ *
+ * See kernel/stop_machine.c
+ */
+
+ #ifdef CONFIG_SMP
+ static int
+ select_task_rq_stop(struct rq *rq, struct task_struct *p,
+ int sd_flag, int flags)
+ {
+ return task_cpu(p); /* stop tasks as never migrate */
+ }
+ #endif /* CONFIG_SMP */
+
+ static void
+ check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags)
+ {
+ resched_task(rq->curr); /* we preempt everything */
+ }
+
+ static struct task_struct *pick_next_task_stop(struct rq *rq)
+ {
+ struct task_struct *stop = rq->stop;
+
+ if (stop && stop->state == TASK_RUNNING)
+ return stop;
+
+ return NULL;
+ }
+
+ static void
+ enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+ {
+ }
+
+ static void
+ dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
+ {
+ }
+
+ static void yield_task_stop(struct rq *rq)
+ {
+ BUG(); /* the stop task should never yield, its pointless. */
+ }
+
+ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
+ {
+ }
+
+ static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
+ {
+ }
+
+ static void set_curr_task_stop(struct rq *rq)
+ {
+ }
+
+ static void switched_to_stop(struct rq *rq, struct task_struct *p,
+ int running)
+ {
+ BUG(); /* its impossible to change to this class */
+ }
+
+ static void prio_changed_stop(struct rq *rq, struct task_struct *p,
+ int oldprio, int running)
+ {
+ BUG(); /* how!?, what priority? */
+ }
+
+ static unsigned int
+ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
+ {
+ return 0;
+ }
+
+ /*
+ * Simple, special scheduling class for the per-CPU stop tasks:
+ */
+ static const struct sched_class stop_sched_class = {
+ .next = &rt_sched_class,
+
+ .enqueue_task = enqueue_task_stop,
+ .dequeue_task = dequeue_task_stop,
+ .yield_task = yield_task_stop,
+
+ .check_preempt_curr = check_preempt_curr_stop,
+
+ .pick_next_task = pick_next_task_stop,
+ .put_prev_task = put_prev_task_stop,
+
+ #ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_stop,
+ #endif
+
+ .set_curr_task = set_curr_task_stop,
+ .task_tick = task_tick_stop,
+
+ .get_rr_interval = get_rr_interval_stop,
+
+ .prio_changed = prio_changed_stop,
+ .switched_to = switched_to_stop,
+
+ /* no .task_new for stop tasks */
+ };
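The stop class chains to rt_sched_class via .next, so the core picks from the highest class that has something runnable. A toy, non-kernel model of that walk; the class names, rq fields, and task strings here are invented for illustration:

#include <stdio.h>
#include <stddef.h>

/* Toy model of the class chain (stop -> rt -> fair): each class either
 * offers a task name or defers to the next class in the list. */
struct rq;
struct sched_class {
	const char *name;
	const struct sched_class *next;
	const char *(*pick_next)(struct rq *rq);
};

struct rq { int stop_runnable; int rt_runnable; };

static const char *pick_stop(struct rq *rq) { return rq->stop_runnable ? "migration/N" : NULL; }
static const char *pick_rt(struct rq *rq)   { return rq->rt_runnable ? "rt task" : NULL; }
static const char *pick_fair(struct rq *rq) { (void)rq; return "cfs task"; }

static const struct sched_class fair_class = { "fair", NULL, pick_fair };
static const struct sched_class rt_class   = { "rt",   &fair_class, pick_rt };
static const struct sched_class stop_class = { "stop", &rt_class, pick_stop };

static const char *pick_next_task(struct rq *rq)
{
	for (const struct sched_class *c = &stop_class; c; c = c->next) {
		const char *p = c->pick_next(rq);
		if (p)
			return p;
	}
	return NULL;
}

int main(void)
{
	struct rq rq = { .stop_runnable = 1, .rt_runnable = 1 };
	printf("%s\n", pick_next_task(&rq));	/* stop task preempts everything */
	rq.stop_runnable = 0;
	printf("%s\n", pick_next_task(&rq));	/* falls through to rt */
	return 0;
}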
@@ -76,12 +76,22 @@ void wakeup_softirqd(void)
  wake_up_process(tsk);
  }

+ /*
+ * preempt_count and SOFTIRQ_OFFSET usage:
+ * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
+ * softirq processing.
+ * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
+ * on local_bh_disable or local_bh_enable.
+ * This lets us distinguish between whether we are currently processing
+ * softirq and whether we just have bh disabled.
+ */
+
  /*
  * This one is for softirq.c-internal use,
  * where hardirqs are disabled legitimately:
  */
  #ifdef CONFIG_TRACE_IRQFLAGS
- static void __local_bh_disable(unsigned long ip)
+ static void __local_bh_disable(unsigned long ip, unsigned int cnt)
  {
  unsigned long flags;

@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
  * We must manually increment preempt_count here and manually
  * call the trace_preempt_off later.
  */
- preempt_count() += SOFTIRQ_OFFSET;
+ preempt_count() += cnt;
  /*
  * Were softirqs turned off above:
  */
- if (softirq_count() == SOFTIRQ_OFFSET)
+ if (softirq_count() == cnt)
  trace_softirqs_off(ip);
  raw_local_irq_restore(flags);

- if (preempt_count() == SOFTIRQ_OFFSET)
+ if (preempt_count() == cnt)
  trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
  }
  #else /* !CONFIG_TRACE_IRQFLAGS */
- static inline void __local_bh_disable(unsigned long ip)
+ static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
  {
- add_preempt_count(SOFTIRQ_OFFSET);
+ add_preempt_count(cnt);
  barrier();
  }
  #endif /* CONFIG_TRACE_IRQFLAGS */

  void local_bh_disable(void)
  {
- __local_bh_disable((unsigned long)__builtin_return_address(0));
+ __local_bh_disable((unsigned long)__builtin_return_address(0),
+ SOFTIRQ_DISABLE_OFFSET);
  }

  EXPORT_SYMBOL(local_bh_disable);

+ static void __local_bh_enable(unsigned int cnt)
+ {
+ WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (softirq_count() == cnt)
+ trace_softirqs_on((unsigned long)__builtin_return_address(0));
+ sub_preempt_count(cnt);
+ }
+
  /*
  * Special-case - softirqs can safely be enabled in
  * cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
  */
  void _local_bh_enable(void)
  {
- WARN_ON_ONCE(in_irq());
- WARN_ON_ONCE(!irqs_disabled());
-
- if (softirq_count() == SOFTIRQ_OFFSET)
- trace_softirqs_on((unsigned long)__builtin_return_address(0));
- sub_preempt_count(SOFTIRQ_OFFSET);
+ __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
  }

  EXPORT_SYMBOL(_local_bh_enable);
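The split between SOFTIRQ_OFFSET and SOFTIRQ_DISABLE_OFFSET is what the new helpers encode. A standalone model of the bookkeeping; the constants mirror the kernel's layout, but this is plain arithmetic, not kernel code:

#include <stdio.h>

/* Sketch: softirq service adds SOFTIRQ_OFFSET, local_bh_disable() adds twice
 * that, so the low bit of the softirq field tells "serving softirq" apart
 * from "bh merely disabled". */
#define SOFTIRQ_SHIFT		8
#define SOFTIRQ_OFFSET		(1UL << SOFTIRQ_SHIFT)
#define SOFTIRQ_DISABLE_OFFSET	(2 * SOFTIRQ_OFFSET)
#define SOFTIRQ_MASK		(255UL << SOFTIRQ_SHIFT)

static unsigned long preempt_count;

#define softirq_count()		(preempt_count & SOFTIRQ_MASK)
#define in_serving_softirq()	(softirq_count() & SOFTIRQ_OFFSET)

int main(void)
{
	preempt_count += SOFTIRQ_DISABLE_OFFSET;	/* like local_bh_disable() */
	printf("bh disabled:  serving=%d\n", (int)!!in_serving_softirq());	/* 0 */
	preempt_count += SOFTIRQ_OFFSET;		/* like entering __do_softirq() */
	printf("in softirq:   serving=%d\n", (int)!!in_serving_softirq());	/* 1 */
	preempt_count -= SOFTIRQ_OFFSET;		/* leaving __do_softirq() */
	preempt_count -= SOFTIRQ_DISABLE_OFFSET;	/* like local_bh_enable() */
	printf("back to zero: count=%lu\n", preempt_count);			/* 0 */
	return 0;
}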
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
  /*
  * Are softirqs going to be turned on now:
  */
- if (softirq_count() == SOFTIRQ_OFFSET)
+ if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
  trace_softirqs_on(ip);
  /*
  * Keep preemption disabled until we are done with
  * softirq processing:
  */
- sub_preempt_count(SOFTIRQ_OFFSET - 1);
+ sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);

  if (unlikely(!in_interrupt() && local_softirq_pending()))
  do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
  pending = local_softirq_pending();
  account_system_vtime(current);

- __local_bh_disable((unsigned long)__builtin_return_address(0));
+ __local_bh_disable((unsigned long)__builtin_return_address(0),
+ SOFTIRQ_OFFSET);
  lockdep_softirq_enter();

  cpu = smp_processor_id();
@@ -245,7 +262,7 @@ restart:
  lockdep_softirq_exit();

  account_system_vtime(current);
- _local_bh_enable();
+ __local_bh_enable(SOFTIRQ_OFFSET);
  }

  #ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,10 +296,16 @@ void irq_enter(void)

  rcu_irq_enter();
  if (idle_cpu(cpu) && !in_interrupt()) {
- __irq_enter();
+ /*
+ * Prevent raise_softirq from needlessly waking up ksoftirqd
+ * here, as softirq will be serviced on return from interrupt.
+ */
+ local_bh_disable();
  tick_check_idle(cpu);
- } else
- __irq_enter();
+ _local_bh_enable();
+ }
+
+ __irq_enter();
  }

  #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -696,6 +719,7 @@ static int run_ksoftirqd(void * __bind_cpu)
  {
  set_current_state(TASK_INTERRUPTIBLE);

+ current->flags |= PF_KSOFTIRQD;
  while (!kthread_should_stop()) {
  preempt_disable();
  if (!local_softirq_pending()) {
@@ -287,11 +287,12 @@ repeat:
  goto repeat;
  }

+ extern void sched_set_stop_task(int cpu, struct task_struct *stop);
+
  /* manage stopper for a cpu, mostly lifted from sched migration thread mgmt */
  static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
  unsigned long action, void *hcpu)
  {
- struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
  unsigned int cpu = (unsigned long)hcpu;
  struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu);
  struct task_struct *p;
@@ -304,13 +305,13 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
  cpu);
  if (IS_ERR(p))
  return NOTIFY_BAD;
- sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
  get_task_struct(p);
+ kthread_bind(p, cpu);
+ sched_set_stop_task(cpu, p);
  stopper->thread = p;
  break;

  case CPU_ONLINE:
- kthread_bind(stopper->thread, cpu);
  /* strictly unnecessary, as first user will wake it */
  wake_up_process(stopper->thread);
  /* mark enabled */
@@ -325,6 +326,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
  {
  struct cpu_stop_work *work;

+ sched_set_stop_task(cpu, NULL);
  /* kill the stopper */
  kthread_stop(stopper->thread);
  /* drain remaining works */
@@ -123,7 +123,7 @@ static int cls_cgroup_classify(struct sk_buff *skb, struct tcf_proto *tp,
  * calls by looking at the number of nested bh disable calls because
  * softirqs always disables bh.
  */
- if (softirq_count() != SOFTIRQ_OFFSET) {
+ if (in_serving_softirq()) {
  /* If there is an sk_classid we'll use that. */
  if (!skb->sk)
  return -1;