sparse irq_desc[] array: core kernel and x86 changes

Impact: new feature

Problem on distro kernels: irq_desc[NR_IRQS] takes megabytes of RAM with
NR_CPUS set to large values. The goal is to be able to scale up to much
larger NR_IRQS value without impacting the (important) common case.

To solve this, we generalize irq_desc[NR_IRQS] to an (optional) array of
irq_desc pointers.

When CONFIG_SPARSE_IRQ=y is used, we use kzalloc_node to get irq_desc,
this also makes the IRQ descriptors NUMA-local (to the site that calls
request_irq()).

This gets rid of the irq_cfg[] static array on x86 as well: irq_cfg now
uses desc->chip_data for x86 to store irq_cfg.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
This commit is contained in:
Yinghai Lu
2008-12-05 18:58:31 -08:00
committed by Ingo Molnar
parent 218d11a8b0
commit 0b8f1efad3
23 changed files with 659 additions and 173 deletions

View File

@@ -108,55 +108,6 @@ static int __init parse_noapic(char *str)
early_param("noapic", parse_noapic);
struct irq_pin_list;
struct irq_cfg {
unsigned int irq;
struct irq_pin_list *irq_2_pin;
cpumask_t domain;
cpumask_t old_domain;
unsigned move_cleanup_count;
u8 vector;
u8 move_in_progress : 1;
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
static struct irq_cfg irq_cfgx[NR_IRQS] = {
[0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
[1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
[2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
[3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
[4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
[5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
[6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
[7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
[8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
[9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
[10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
[11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
[12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
[13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
[14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
[15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
};
#define for_each_irq_cfg(irq, cfg) \
for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
static struct irq_cfg *irq_cfg(unsigned int irq)
{
return irq < nr_irqs ? irq_cfgx + irq : NULL;
}
static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
{
return irq_cfg(irq);
}
/*
* Rough estimation of how many shared IRQs there are, can be changed
* anytime.
*/
#define MAX_PLUS_SHARED_IRQS NR_IRQS
#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
/*
* This is performance-critical, we want to do it O(1)
@@ -170,32 +121,116 @@ struct irq_pin_list {
struct irq_pin_list *next;
};
static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
static struct irq_pin_list *irq_2_pin_ptr;
static void __init irq_2_pin_init(void)
static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
{
struct irq_pin_list *pin = irq_2_pin_head;
int i;
struct irq_pin_list *pin;
int node;
for (i = 1; i < PIN_MAP_SIZE; i++)
pin[i-1].next = &pin[i];
node = cpu_to_node(cpu);
irq_2_pin_ptr = &pin[0];
}
pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
printk(KERN_DEBUG " alloc irq_2_pin on cpu %d node %d\n", cpu, node);
static struct irq_pin_list *get_one_free_irq_2_pin(void)
{
struct irq_pin_list *pin = irq_2_pin_ptr;
if (!pin)
panic("can not get more irq_2_pin\n");
irq_2_pin_ptr = pin->next;
pin->next = NULL;
return pin;
}
struct irq_cfg {
struct irq_pin_list *irq_2_pin;
cpumask_t domain;
cpumask_t old_domain;
unsigned move_cleanup_count;
u8 vector;
u8 move_in_progress : 1;
};
/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
#ifdef CONFIG_SPARSE_IRQ
static struct irq_cfg irq_cfgx[] = {
#else
static struct irq_cfg irq_cfgx[NR_IRQS] = {
#endif
[0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
[1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
[2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
[3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
[4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
[5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
[6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
[7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
[8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
[9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
[10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
[11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
[12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
[13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
[14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
[15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
};
void __init arch_early_irq_init(void)
{
struct irq_cfg *cfg;
struct irq_desc *desc;
int count;
int i;
cfg = irq_cfgx;
count = ARRAY_SIZE(irq_cfgx);
for (i = 0; i < count; i++) {
desc = irq_to_desc(i);
desc->chip_data = &cfg[i];
}
}
#ifdef CONFIG_SPARSE_IRQ
static struct irq_cfg *irq_cfg(unsigned int irq)
{
struct irq_cfg *cfg = NULL;
struct irq_desc *desc;
desc = irq_to_desc(irq);
if (desc)
cfg = desc->chip_data;
return cfg;
}
static struct irq_cfg *get_one_free_irq_cfg(int cpu)
{
struct irq_cfg *cfg;
int node;
node = cpu_to_node(cpu);
cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
printk(KERN_DEBUG " alloc irq_cfg on cpu %d node %d\n", cpu, node);
return cfg;
}
void arch_init_chip_data(struct irq_desc *desc, int cpu)
{
struct irq_cfg *cfg;
cfg = desc->chip_data;
if (!cfg) {
desc->chip_data = get_one_free_irq_cfg(cpu);
if (!desc->chip_data) {
printk(KERN_ERR "can not alloc irq_cfg\n");
BUG_ON(1);
}
}
}
#else
static struct irq_cfg *irq_cfg(unsigned int irq)
{
return irq < nr_irqs ? irq_cfgx + irq : NULL;
}
#endif
struct io_apic {
unsigned int index;
unsigned int unused[3];
@@ -397,16 +432,19 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
* shared ISA-space IRQs, so we have to support them. We are super
* fast in the common case, and fast for shared ISA-space IRQs.
*/
static void add_pin_to_irq(unsigned int irq, int apic, int pin)
static void add_pin_to_irq_cpu(unsigned int irq, int cpu, int apic, int pin)
{
struct irq_cfg *cfg;
struct irq_pin_list *entry;
struct irq_cfg *cfg = irq_cfg(irq);
/* first time to refer irq_cfg, so with new */
cfg = irq_cfg_alloc(irq);
entry = cfg->irq_2_pin;
if (!entry) {
entry = get_one_free_irq_2_pin();
entry = get_one_free_irq_2_pin(cpu);
if (!entry) {
printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
apic, pin);
return;
}
cfg->irq_2_pin = entry;
entry->apic = apic;
entry->pin = pin;
@@ -421,7 +459,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
entry = entry->next;
}
entry->next = get_one_free_irq_2_pin();
entry->next = get_one_free_irq_2_pin(cpu);
entry = entry->next;
entry->apic = apic;
entry->pin = pin;
@@ -430,7 +468,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin)
/*
* Reroute an IRQ to a different pin.
*/
static void __init replace_pin_at_irq(unsigned int irq,
static void __init replace_pin_at_irq(unsigned int irq, int cpu,
int oldapic, int oldpin,
int newapic, int newpin)
{
@@ -451,7 +489,7 @@ static void __init replace_pin_at_irq(unsigned int irq,
/* why? call replace before add? */
if (!replaced)
add_pin_to_irq(irq, newapic, newpin);
add_pin_to_irq_cpu(irq, cpu, newapic, newpin);
}
static inline void io_apic_modify_irq(unsigned int irq,
@@ -1162,9 +1200,13 @@ void __setup_vector_irq(int cpu)
/* This function must be called with vector_lock held */
int irq, vector;
struct irq_cfg *cfg;
struct irq_desc *desc;
/* Mark the inuse vectors */
for_each_irq_cfg(irq, cfg) {
for_each_irq_desc(irq, desc) {
if (!desc)
continue;
cfg = desc->chip_data;
if (!cpu_isset(cpu, cfg->domain))
continue;
vector = cfg->vector;
@@ -1356,6 +1398,8 @@ static void __init setup_IO_APIC_irqs(void)
{
int apic, pin, idx, irq;
int notcon = 0;
struct irq_desc *desc;
int cpu = boot_cpu_id;
apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
@@ -1387,7 +1431,12 @@ static void __init setup_IO_APIC_irqs(void)
if (multi_timer_check(apic, irq))
continue;
#endif
add_pin_to_irq(irq, apic, pin);
desc = irq_to_desc_alloc_cpu(irq, cpu);
if (!desc) {
printk(KERN_INFO "can not get irq_desc for %d\n", irq);
continue;
}
add_pin_to_irq_cpu(irq, cpu, apic, pin);
setup_IO_APIC_irq(apic, pin, irq,
irq_trigger(idx), irq_polarity(idx));
@@ -1448,6 +1497,7 @@ __apicdebuginit(void) print_IO_APIC(void)
union IO_APIC_reg_03 reg_03;
unsigned long flags;
struct irq_cfg *cfg;
struct irq_desc *desc;
unsigned int irq;
if (apic_verbosity == APIC_QUIET)
@@ -1537,8 +1587,13 @@ __apicdebuginit(void) print_IO_APIC(void)
}
}
printk(KERN_DEBUG "IRQ to pin mappings:\n");
for_each_irq_cfg(irq, cfg) {
struct irq_pin_list *entry = cfg->irq_2_pin;
for_each_irq_desc(irq, desc) {
struct irq_pin_list *entry;
if (!desc)
continue;
cfg = desc->chip_data;
entry = cfg->irq_2_pin;
if (!entry)
continue;
printk(KERN_DEBUG "IRQ%d ", irq);
@@ -2022,6 +2077,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
{
int was_pending = 0;
unsigned long flags;
struct irq_cfg *cfg;
spin_lock_irqsave(&ioapic_lock, flags);
if (irq < 16) {
@@ -2029,6 +2085,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
if (i8259A_irq_pending(irq))
was_pending = 1;
}
cfg = irq_cfg(irq);
__unmask_IO_APIC_irq(irq);
spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2178,6 +2235,9 @@ static void ir_irq_migration(struct work_struct *work)
struct irq_desc *desc;
for_each_irq_desc(irq, desc) {
if (!desc)
continue;
if (desc->status & IRQ_MOVE_PENDING) {
unsigned long flags;
@@ -2229,6 +2289,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
struct irq_cfg *cfg;
irq = __get_cpu_var(vector_irq)[vector];
if (irq == -1)
continue;
desc = irq_to_desc(irq);
if (!desc)
continue;
@@ -2430,8 +2493,12 @@ static inline void init_IO_APIC_traps(void)
* Also, we've got to be careful not to trash gate
* 0x80, because int 0x80 is hm, kind of importantish. ;)
*/
for_each_irq_cfg(irq, cfg) {
if (IO_APIC_IRQ(irq) && !cfg->vector) {
for_each_irq_desc(irq, desc) {
if (!desc)
continue;
cfg = desc->chip_data;
if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
/*
* Hmm.. We don't have an entry for this,
* so default to an old-fashioned 8259
@@ -2439,11 +2506,9 @@ static inline void init_IO_APIC_traps(void)
*/
if (irq < 16)
make_8259A_irq(irq);
else {
desc = irq_to_desc(irq);
else
/* Strange. Oh, well.. */
desc->chip = &no_irq_chip;
}
}
}
}
@@ -2654,7 +2719,7 @@ static inline void __init check_timer(void)
* Ok, does IRQ0 through the IOAPIC work?
*/
if (no_pin1) {
add_pin_to_irq(0, apic1, pin1);
add_pin_to_irq_cpu(0, boot_cpu_id, apic1, pin1);
setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
}
unmask_IO_APIC_irq(0);
@@ -2683,7 +2748,7 @@ static inline void __init check_timer(void)
/*
* legacy devices should be connected to IO APIC #0
*/
replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
replace_pin_at_irq(0, boot_cpu_id, apic1, pin1, apic2, pin2);
setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
unmask_IO_APIC_irq(0);
enable_8259A_irq(0);
@@ -2902,21 +2967,25 @@ unsigned int create_irq_nr(unsigned int irq_want)
unsigned int irq;
unsigned int new;
unsigned long flags;
struct irq_cfg *cfg_new;
irq_want = nr_irqs - 1;
struct irq_cfg *cfg_new = NULL;
int cpu = boot_cpu_id;
struct irq_desc *desc_new = NULL;
irq = 0;
spin_lock_irqsave(&vector_lock, flags);
for (new = irq_want; new > 0; new--) {
if (platform_legacy_irq(new))
continue;
cfg_new = irq_cfg(new);
if (cfg_new && cfg_new->vector != 0)
desc_new = irq_to_desc_alloc_cpu(new, cpu);
if (!desc_new) {
printk(KERN_INFO "can not get irq_desc for %d\n", new);
continue;
}
cfg_new = desc_new->chip_data;
if (cfg_new->vector != 0)
continue;
/* check if need to create one */
if (!cfg_new)
cfg_new = irq_cfg_alloc(new);
if (__assign_irq_vector(new, TARGET_CPUS) == 0)
irq = new;
break;
@@ -2925,6 +2994,9 @@ unsigned int create_irq_nr(unsigned int irq_want)
if (irq > 0) {
dynamic_irq_init(irq);
/* restore it, in case dynamic_irq_init clear it */
if (desc_new)
desc_new->chip_data = cfg_new;
}
return irq;
}
@@ -2944,8 +3016,16 @@ int create_irq(void)
void destroy_irq(unsigned int irq)
{
unsigned long flags;
struct irq_cfg *cfg;
struct irq_desc *desc;
/* store it, in case dynamic_irq_cleanup clear it */
desc = irq_to_desc(irq);
cfg = desc->chip_data;
dynamic_irq_cleanup(irq);
/* connect back irq_cfg */
if (desc)
desc->chip_data = cfg;
#ifdef CONFIG_INTR_REMAP
free_irte(irq);
@@ -3195,26 +3275,13 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
return 0;
}
static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
{
unsigned int irq;
irq = dev->bus->number;
irq <<= 8;
irq |= dev->devfn;
irq <<= 12;
return irq;
}
int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc)
{
unsigned int irq;
int ret;
unsigned int irq_want;
irq_want = build_irq_for_pci_dev(dev) + 0x100;
irq_want = nr_irqs - 1;
irq = create_irq_nr(irq_want);
if (irq == 0)
return -1;
@@ -3228,7 +3295,7 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
goto error;
no_ir:
#endif
ret = setup_msi_irq(dev, desc, irq);
ret = setup_msi_irq(dev, msidesc, irq);
if (ret < 0) {
destroy_irq(irq);
return ret;
@@ -3246,7 +3313,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
unsigned int irq;
int ret, sub_handle;
struct msi_desc *desc;
struct msi_desc *msidesc;
unsigned int irq_want;
#ifdef CONFIG_INTR_REMAP
@@ -3254,10 +3321,11 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
int index = 0;
#endif
irq_want = build_irq_for_pci_dev(dev) + 0x100;
irq_want = nr_irqs - 1;
sub_handle = 0;
list_for_each_entry(desc, &dev->msi_list, list) {
irq = create_irq_nr(irq_want--);
list_for_each_entry(msidesc, &dev->msi_list, list) {
irq = create_irq_nr(irq_want);
irq_want--;
if (irq == 0)
return -1;
#ifdef CONFIG_INTR_REMAP
@@ -3289,7 +3357,7 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
}
no_ir:
#endif
ret = setup_msi_irq(dev, desc, irq);
ret = setup_msi_irq(dev, msidesc, irq);
if (ret < 0)
goto error;
sub_handle++;
@@ -3707,17 +3775,29 @@ int __init io_apic_get_version(int ioapic)
int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
{
struct irq_desc *desc;
struct irq_cfg *cfg;
int cpu = boot_cpu_id;
if (!IO_APIC_IRQ(irq)) {
apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
ioapic);
return -EINVAL;
}
desc = irq_to_desc_alloc_cpu(irq, cpu);
if (!desc) {
printk(KERN_INFO "can not get irq_desc %d\n", irq);
return 0;
}
/*
* IRQs < 16 are already in the irq_2_pin[] map
*/
if (irq >= 16)
add_pin_to_irq(irq, ioapic, pin);
if (irq >= 16) {
cfg = desc->chip_data;
add_pin_to_irq_cpu(irq, cpu, ioapic, pin);
}
setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity);
@@ -3773,7 +3853,8 @@ void __init setup_ioapic_dest(void)
* when you have too many devices, because at that time only boot
* cpu is online.
*/
cfg = irq_cfg(irq);
desc = irq_to_desc(irq);
cfg = desc->chip_data;
if (!cfg->vector) {
setup_IO_APIC_irq(ioapic, pin, irq,
irq_trigger(irq_entry),
@@ -3785,7 +3866,6 @@ void __init setup_ioapic_dest(void)
/*
* Honour affinities which have been set in early boot
*/
desc = irq_to_desc(irq);
if (desc->status &
(IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
mask = desc->affinity;
@@ -3846,7 +3926,6 @@ void __init ioapic_init_mappings(void)
struct resource *ioapic_res;
int i;
irq_2_pin_init();
ioapic_res = ioapic_setup_resources();
for (i = 0; i < nr_ioapics; i++) {
if (smp_found_config) {

View File

@@ -118,6 +118,9 @@ int show_interrupts(struct seq_file *p, void *v)
}
desc = irq_to_desc(i);
if (!desc)
return 0;
spin_lock_irqsave(&desc->lock, flags);
#ifndef CONFIG_SMP
any_count = kstat_irqs(i);

View File

@@ -242,6 +242,8 @@ void fixup_irqs(cpumask_t map)
for_each_irq_desc(irq, desc) {
cpumask_t mask;
if (!desc)
continue;
if (irq == 2)
continue;

View File

@@ -94,6 +94,8 @@ void fixup_irqs(cpumask_t map)
int break_affinity = 0;
int set_affinity = 1;
if (!desc)
continue;
if (irq == 2)
continue;

View File

@@ -69,7 +69,6 @@ void __init init_ISA_irqs (void)
* 16 old-style INTA-cycle interrupts:
*/
for (i = 0; i < 16; i++) {
/* first time call this irq_desc */
struct irq_desc *desc = irq_to_desc(i);
desc->status = IRQ_DISABLED;

View File

@@ -143,7 +143,6 @@ void __init init_ISA_irqs(void)
init_8259A(0);
for (i = 0; i < 16; i++) {
/* first time call this irq_desc */
struct irq_desc *desc = irq_to_desc(i);
desc->status = IRQ_DISABLED;