Merge branch 'irq/for-block' into irq/core

Add the new irq spreading infrastructure.
2016-09-15 20:54:40 +02:00
parent 16217dc79d ee8d41e53e
commit 0a30d69195
9 changed files with 295 additions and 125 deletions
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -4,60 +4,151 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>

-static int get_first_sibling(unsigned int cpu)
+static void irq_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
+				int cpus_per_vec)
 {
-	unsigned int ret;
+	const struct cpumask *siblmsk;
+	int cpu, sibl;

-	ret = cpumask_first(topology_sibling_cpumask(cpu));
-	if (ret < nr_cpu_ids)
-		return ret;
-	return cpu;
+	for ( ; cpus_per_vec > 0; ) {
+		cpu = cpumask_first(nmsk);
+
+		/* Should not happen, but I'm too lazy to think about it */
+		if (cpu >= nr_cpu_ids)
+			return;
+
+		cpumask_clear_cpu(cpu, nmsk);
+		cpumask_set_cpu(cpu, irqmsk);
+		cpus_per_vec--;
+
+		/* If the cpu has siblings, use them first */
+		siblmsk = topology_sibling_cpumask(cpu);
+		for (sibl = -1; cpus_per_vec > 0; ) {
+			sibl = cpumask_next(sibl, siblmsk);
+			if (sibl >= nr_cpu_ids)
+				break;
+			if (!cpumask_test_and_clear_cpu(sibl, nmsk))
+				continue;
+			cpumask_set_cpu(sibl, irqmsk);
+			cpus_per_vec--;
+		}
+	}
 }

-/*
- * Take a map of online CPUs and the number of available interrupt vectors
- * and generate an output cpumask suitable for spreading MSI/MSI-X vectors
- * so that they are distributed as good as possible around the CPUs.  If
- * more vectors than CPUs are available we'll map one to each CPU,
- * otherwise we map one to the first sibling of each socket.
- *
- * If there are more vectors than CPUs we will still only have one bit
- * set per CPU, but interrupt code will keep on assigning the vectors from
- * the start of the bitmap until we run out of vectors.
- */
-struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
+static int get_nodes_in_cpumask(const struct cpumask *mask, nodemask_t *nodemsk)
 {
-	struct cpumask *affinity_mask;
-	unsigned int max_vecs = *nr_vecs;
+	int n, nodes;

-	if (max_vecs == 1)
-		return NULL;
-
-	affinity_mask = kzalloc(cpumask_size(), GFP_KERNEL);
-	if (!affinity_mask) {
-		*nr_vecs = 1;
-		return NULL;
+	/* Calculate the number of nodes in the supplied affinity mask */
+	for (n = 0, nodes = 0; n < num_online_nodes(); n++) {
+		if (cpumask_intersects(mask, cpumask_of_node(n))) {
+			node_set(n, *nodemsk);
+			nodes++;
+		}
 	}
+	return nodes;
+}

+/**
+ * irq_create_affinity_masks - Create affinity masks for multiqueue spreading
+ * @affinity:		The affinity mask to spread. If NULL cpu_online_mask
+ *			is used
+ * @nvecs:		The number of vectors
+ *
+ * Returns the masks pointer or NULL if allocation failed.
+ */
+struct cpumask *irq_create_affinity_masks(const struct cpumask *affinity,
+					  int nvec)
+{
+	int n, nodes, vecs_per_node, cpus_per_vec, extra_vecs, curvec = 0;
+	nodemask_t nodemsk = NODE_MASK_NONE;
+	struct cpumask *masks;
+	cpumask_var_t nmsk;
+
+	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
+		return NULL;
+
+	masks = kzalloc(nvec * sizeof(*masks), GFP_KERNEL);
+	if (!masks)
+		goto out;
+
+	/* Stabilize the cpumasks */
 	get_online_cpus();
-	if (max_vecs >= num_online_cpus()) {
-		cpumask_copy(affinity_mask, cpu_online_mask);
-		*nr_vecs = num_online_cpus();
-	} else {
-		unsigned int vecs = 0, cpu;
+	/* If the supplied affinity mask is NULL, use cpu online mask */
+	if (!affinity)
+		affinity = cpu_online_mask;

-		for_each_online_cpu(cpu) {
-			if (cpu == get_first_sibling(cpu)) {
-				cpumask_set_cpu(cpu, affinity_mask);
-				vecs++;
-			}
+	nodes = get_nodes_in_cpumask(affinity, &nodemsk);

-			if (--max_vecs == 0)
+	/*
+	 * If the number of nodes in the mask is less than or equal the
+	 * number of vectors we just spread the vectors across the nodes.
+	 */
+	if (nvec <= nodes) {
+		for_each_node_mask(n, nodemsk) {
+			cpumask_copy(masks + curvec, cpumask_of_node(n));
+			if (++curvec == nvec)
 				break;
 		}
-		*nr_vecs = vecs;
+		goto outonl;
 	}
-	put_online_cpus();

-	return affinity_mask;
+	/* Spread the vectors per node */
+	vecs_per_node = nvec / nodes;
+	/* Account for rounding errors */
+	extra_vecs = nvec - (nodes * vecs_per_node);
+
+	for_each_node_mask(n, nodemsk) {
+		int ncpus, v, vecs_to_assign = vecs_per_node;
+
+		/* Get the cpus on this node which are in the mask */
+		cpumask_and(nmsk, affinity, cpumask_of_node(n));
+
+		/* Calculate the number of cpus per vector */
+		ncpus = cpumask_weight(nmsk);
+
+		for (v = 0; curvec < nvec && v < vecs_to_assign; curvec++, v++) {
+			cpus_per_vec = ncpus / vecs_to_assign;
+
+			/* Account for extra vectors to compensate rounding errors */
+			if (extra_vecs) {
+				cpus_per_vec++;
+				if (!--extra_vecs)
+					vecs_per_node++;
+			}
+			irq_spread_init_one(masks + curvec, nmsk, cpus_per_vec);
+		}
+
+		if (curvec >= nvec)
+			break;
+	}
+
+outonl:
+	put_online_cpus();
+out:
+	free_cpumask_var(nmsk);
+	return masks;
+}
+
+/**
+ * irq_calc_affinity_vectors - Calculate to optimal number of vectors for a given affinity mask
+ * @affinity:		The affinity mask to spread. If NULL cpu_online_mask
+ *			is used
+ * @maxvec:		The maximum number of vectors available
+ */
+int irq_calc_affinity_vectors(const struct cpumask *affinity, int maxvec)
+{
+	int cpus, ret;
+
+	/* Stabilize the cpumasks */
+	get_online_cpus();
+	/* If the supplied affinity mask is NULL, use cpu online mask */
+	if (!affinity)
+		affinity = cpu_online_mask;
+
+	cpus = cpumask_weight(affinity);
+	ret = (cpus < maxvec) ? cpus : maxvec;
+
+	put_online_cpus();
+	return ret;
 }
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -424,25 +424,24 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
 	const struct cpumask *mask = NULL;
 	struct irq_desc *desc;
 	unsigned int flags;
-	int i, cpu = -1;
+	int i;

-	if (affinity && cpumask_empty(affinity))
-		return -EINVAL;
+	/* Validate affinity mask(s) */
+	if (affinity) {
+		for (i = 0, mask = affinity; i < cnt; i++, mask++) {
+			if (cpumask_empty(mask))
+				return -EINVAL;
+		}
+	}

 	flags = affinity ? IRQD_AFFINITY_MANAGED : 0;
+	mask = NULL;

 	for (i = 0; i < cnt; i++) {
 		if (affinity) {
-			cpu = cpumask_next(cpu, affinity);
-			if (cpu >= nr_cpu_ids)
-				cpu = cpumask_first(affinity);
-			node = cpu_to_node(cpu);
-
-			/*
-			 * For single allocations we use the caller provided
-			 * mask otherwise we use the mask of the target cpu
-			 */
-			mask = cnt == 1 ? affinity : cpumask_of(cpu);
+			node = cpu_to_node(cpumask_first(affinity));
+			mask = affinity;
+			affinity++;
 		}
 		desc = alloc_desc(start + i, node, flags, mask, owner);
 		if (!desc)
@@ -670,9 +669,9 @@ EXPORT_SYMBOL_GPL(irq_free_descs);
 * @cnt:	Number of consecutive irqs to allocate.
 * @node:	Preferred node on which the irq descriptor should be allocated
 * @owner:	Owning module (can be NULL)
- * @affinity:	Optional pointer to an affinity mask which hints where the
- *		irq descriptors should be allocated and which default
- *		affinities to use
+ * @affinity:	Optional pointer to an affinity mask array of size @cnt which
+ *		hints where the irq descriptors should be allocated and which
+ *		default affinities to use
 *
 * Returns the first irq number or error code
 */
--- a/kernel/irq/msi.c
+++ b/kernel/irq/msi.c
@@ -18,20 +18,42 @@
 /* Temparory solution for building, will be removed later */
 #include <linux/pci.h>

-struct msi_desc *alloc_msi_entry(struct device *dev)
+/**
+ * alloc_msi_entry - Allocate an initialize msi_entry
+ * @dev:	Pointer to the device for which this is allocated
+ * @nvec:	The number of vectors used in this entry
+ * @affinity:	Optional pointer to an affinity mask array size of @nvec
+ *
+ * If @affinity is not NULL then a an affinity array[@nvec] is allocated
+ * and the affinity masks from @affinity are copied.
+ */
+struct msi_desc *
+alloc_msi_entry(struct device *dev, int nvec, const struct cpumask *affinity)
 {
-	struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+	struct msi_desc *desc;
+
+	desc = kzalloc(sizeof(*desc), GFP_KERNEL);
 	if (!desc)
 		return NULL;

 	INIT_LIST_HEAD(&desc->list);
 	desc->dev = dev;
+	desc->nvec_used = nvec;
+	if (affinity) {
+		desc->affinity = kmemdup(affinity,
+			nvec * sizeof(*desc->affinity), GFP_KERNEL);
+		if (!desc->affinity) {
+			kfree(desc);
+			return NULL;
+		}
+	}

 	return desc;
 }

 void free_msi_entry(struct msi_desc *entry)
 {
+	kfree(entry->affinity);
 	kfree(entry);
 }