kernel: Add snapshot of changes to support cpu isolation

This snapshot is taken from msm-4.19 as of commit 5debecbe7195
("trace: filter out spurious preemption and IRQs disable traces").

Change-Id: I222aa448ac68f7365065f62dba9db94925da38a0
Signed-off-by: Satya Durga Srinivasu Prabhala <satyap@codeaurora.org>
Author:  Satya Durga Srinivasu Prabhala <satyap@codeaurora.org>
Date:    2019-09-17 10:34:18 -07:00
Commit:  201ea48219
Parent:  8573d7bfaa
20 changed files with 398 additions and 77 deletions

drivers/base/core.c

@@ -1156,6 +1156,11 @@ int lock_device_hotplug_sysfs(void)
 	return restart_syscall();
 }
 
+void lock_device_hotplug_assert(void)
+{
+	lockdep_assert_held(&device_hotplug_lock);
+}
+
 #ifdef CONFIG_BLOCK
 static inline int device_is_not_partition(struct device *dev)
 {

drivers/base/cpu.c

@@ -183,6 +183,32 @@ static struct attribute_group crash_note_cpu_attr_group = {
 };
 #endif
 
+#ifdef CONFIG_HOTPLUG_CPU
+static ssize_t isolate_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
+	ssize_t rc;
+	int cpuid = cpu->dev.id;
+	unsigned int isolated = cpu_isolated(cpuid);
+
+	rc = scnprintf(buf, PAGE_SIZE-2, "%d\n", isolated);
+
+	return rc;
+}
+
+static DEVICE_ATTR_RO(isolate);
+
+static struct attribute *cpu_isolated_attrs[] = {
+	&dev_attr_isolate.attr,
+	NULL
+};
+
+static struct attribute_group cpu_isolated_attr_group = {
+	.attrs = cpu_isolated_attrs,
+};
+#endif
+
 #ifdef CONFIG_SCHED_WALT
 static ssize_t sched_load_boost_show(struct device *dev,
 				     struct device_attribute *attr, char *buf)

@@ -240,6 +266,9 @@ static const struct attribute_group *common_cpu_attr_groups[] = {
 #ifdef CONFIG_KEXEC
 	&crash_note_cpu_attr_group,
 #endif
+#ifdef CONFIG_HOTPLUG_CPU
+	&cpu_isolated_attr_group,
+#endif
 #ifdef CONFIG_SCHED_WALT
 	&sched_cpu_attr_group,
 #endif

@@ -250,6 +279,9 @@ static const struct attribute_group *hotplugable_cpu_attr_groups[] = {
 #ifdef CONFIG_KEXEC
 	&crash_note_cpu_attr_group,
 #endif
+#ifdef CONFIG_HOTPLUG_CPU
+	&cpu_isolated_attr_group,
+#endif
 #ifdef CONFIG_SCHED_WALT
 	&sched_cpu_attr_group,
 #endif

@@ -282,6 +314,7 @@ static struct cpu_attr cpu_attrs[] = {
 	_CPU_ATTR(online, &__cpu_online_mask),
 	_CPU_ATTR(possible, &__cpu_possible_mask),
 	_CPU_ATTR(present, &__cpu_present_mask),
+	_CPU_ATTR(core_ctl_isolated, &__cpu_isolated_mask),
 };
 
 /*

@@ -531,6 +564,7 @@ static struct attribute *cpu_root_attrs[] = {
 	&cpu_attrs[0].attr.attr,
 	&cpu_attrs[1].attr.attr,
 	&cpu_attrs[2].attr.attr,
+	&cpu_attrs[3].attr.attr,
 	&dev_attr_kernel_max.attr,
 	&dev_attr_offline.attr,
 	&dev_attr_isolated.attr,
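
(Aside, not part of the commit: the registration above means each CPU device should gain an "isolate" node and the new cpu_attrs[] entry a global "core_ctl_isolated" mask file, presumably /sys/devices/system/cpu/cpuN/isolate and /sys/devices/system/cpu/core_ctl_isolated. A minimal user-space sketch that reads the per-CPU node; the exact path and CPU number are assumptions, not spelled out in this patch.)

    /* Hypothetical reader for the per-CPU "isolate" attribute added above. */
    #include <stdio.h>

    int main(void)
    {
            int isolated = -1;
            FILE *f = fopen("/sys/devices/system/cpu/cpu2/isolate", "r");

            if (!f)
                    return 1;               /* attribute not present */
            if (fscanf(f, "%d", &isolated) == 1)
                    printf("cpu2 isolated: %d\n", isolated);
            fclose(f);
            return 0;
    }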

include/linux/cpumask.h

@@ -55,6 +55,7 @@ extern unsigned int nr_cpu_ids;
  *  cpu_present_mask - has bit 'cpu' set iff cpu is populated
  *  cpu_online_mask  - has bit 'cpu' set iff cpu available to scheduler
  *  cpu_active_mask  - has bit 'cpu' set iff cpu available to migration
+ *  cpu_isolated_mask- has bit 'cpu' set iff cpu isolated
  *
  *  If !CONFIG_HOTPLUG_CPU, present == possible, and active == online.
  *

@@ -91,10 +92,12 @@ extern struct cpumask __cpu_possible_mask;
 extern struct cpumask __cpu_online_mask;
 extern struct cpumask __cpu_present_mask;
 extern struct cpumask __cpu_active_mask;
+extern struct cpumask __cpu_isolated_mask;
 #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask)
 #define cpu_online_mask   ((const struct cpumask *)&__cpu_online_mask)
 #define cpu_present_mask  ((const struct cpumask *)&__cpu_present_mask)
 #define cpu_active_mask   ((const struct cpumask *)&__cpu_active_mask)
+#define cpu_isolated_mask ((const struct cpumask *)&__cpu_isolated_mask)
 
 extern atomic_t __num_online_cpus;

@@ -114,19 +117,31 @@ static inline unsigned int num_online_cpus(void)
 #define num_possible_cpus()	cpumask_weight(cpu_possible_mask)
 #define num_present_cpus()	cpumask_weight(cpu_present_mask)
 #define num_active_cpus()	cpumask_weight(cpu_active_mask)
+#define num_isolated_cpus()	cpumask_weight(cpu_isolated_mask)
+#define num_online_uniso_cpus()						\
+({									\
+	cpumask_t mask;							\
+									\
+	cpumask_andnot(&mask, cpu_online_mask, cpu_isolated_mask);	\
+	cpumask_weight(&mask);						\
+})
 #define cpu_online(cpu)		cpumask_test_cpu((cpu), cpu_online_mask)
 #define cpu_possible(cpu)	cpumask_test_cpu((cpu), cpu_possible_mask)
 #define cpu_present(cpu)	cpumask_test_cpu((cpu), cpu_present_mask)
 #define cpu_active(cpu)		cpumask_test_cpu((cpu), cpu_active_mask)
+#define cpu_isolated(cpu)	cpumask_test_cpu((cpu), cpu_isolated_mask)
 #else
 #define num_online_cpus()	1U
 #define num_possible_cpus()	1U
 #define num_present_cpus()	1U
 #define num_active_cpus()	1U
+#define num_isolated_cpus()	0U
+#define num_online_uniso_cpus()	1U
 #define cpu_online(cpu)		((cpu) == 0)
 #define cpu_possible(cpu)	((cpu) == 0)
 #define cpu_present(cpu)	((cpu) == 0)
 #define cpu_active(cpu)		((cpu) == 0)
+#define cpu_isolated(cpu)	((cpu) != 0)
 #endif
 
 extern cpumask_t cpus_booted_once_mask;

@@ -806,6 +821,7 @@ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS);
 #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask)
 #define for_each_online_cpu(cpu)   for_each_cpu((cpu), cpu_online_mask)
 #define for_each_present_cpu(cpu)  for_each_cpu((cpu), cpu_present_mask)
+#define for_each_isolated_cpu(cpu) for_each_cpu((cpu), cpu_isolated_mask)
 
 /* Wrappers for arch boot code to manipulate normally-constant masks */
 void init_cpu_present(const struct cpumask *src);

@@ -846,6 +862,15 @@ set_cpu_active(unsigned int cpu, bool active)
 		cpumask_clear_cpu(cpu, &__cpu_active_mask);
 }
 
+static inline void
+set_cpu_isolated(unsigned int cpu, bool isolated)
+{
+	if (isolated)
+		cpumask_set_cpu(cpu, &__cpu_isolated_mask);
+	else
+		cpumask_clear_cpu(cpu, &__cpu_isolated_mask);
+}
+
 /**
  * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask *
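
(Aside, not part of the commit: a sketch of how kernel code is expected to consume the new mask helpers; the function below is illustrative only and does not exist in the patch.)

    /* Illustrative only: prefer an online, un-isolated CPU for deferred work. */
    #include <linux/cpumask.h>

    static int pick_unisolated_cpu(void)
    {
            int cpu;

            for_each_online_cpu(cpu)
                    if (!cpu_isolated(cpu))
                            return cpu;     /* num_online_uniso_cpus() > 0 */

            /* Every online CPU is isolated; fall back to the first one. */
            return cpumask_first(cpu_online_mask);
    }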

include/linux/device.h

@@ -1551,6 +1551,7 @@ static inline bool device_supports_offline(struct device *dev)
 extern void lock_device_hotplug(void);
 extern void unlock_device_hotplug(void);
 extern int lock_device_hotplug_sysfs(void);
+extern void lock_device_hotplug_assert(void);
 extern int device_offline(struct device *dev);
 extern int device_online(struct device *dev);
 extern void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode);

include/linux/nmi.h

@@ -13,6 +13,9 @@
 #ifdef CONFIG_LOCKUP_DETECTOR
 void lockup_detector_init(void);
+extern void watchdog_enable(unsigned int cpu);
+extern void watchdog_disable(unsigned int cpu);
+extern bool watchdog_configured(unsigned int cpu);
 void lockup_detector_soft_poweroff(void);
 void lockup_detector_cleanup(void);
 bool is_hardlockup(void);

@@ -37,6 +40,20 @@ extern int sysctl_hardlockup_all_cpu_backtrace;
 static inline void lockup_detector_init(void) { }
 static inline void lockup_detector_soft_poweroff(void) { }
 static inline void lockup_detector_cleanup(void) { }
+static inline void watchdog_enable(unsigned int cpu)
+{
+}
+static inline void watchdog_disable(unsigned int cpu)
+{
+}
+static inline bool watchdog_configured(unsigned int cpu)
+{
+	/*
+	 * Pretend the watchdog is always configured.
+	 * We will be waiting for the watchdog to be enabled in core isolation
+	 */
+	return true;
+}
 #endif /* !CONFIG_LOCKUP_DETECTOR */
 
 #ifdef CONFIG_SOFTLOCKUP_DETECTOR
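
(Aside, not part of the commit: the comment in watchdog_configured() above refers to core isolation waiting for the watchdog. A sketch of what that wait might look like on the isolation side; the helper below is an assumption, not code from this snapshot.)

    /* Assumed caller: spin until the per-CPU softlockup watchdog is up. */
    #include <linux/nmi.h>
    #include <linux/delay.h>

    static void wait_for_watchdog_ready(unsigned int cpu)
    {
            /* With !CONFIG_LOCKUP_DETECTOR this returns true immediately. */
            while (!watchdog_configured(cpu))
                    usleep_range(100, 200);
    }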

include/linux/sched.h

@@ -239,6 +239,27 @@ enum migrate_types {
 	RQ_TO_GROUP,
 };
 
+#ifdef CONFIG_HOTPLUG_CPU
+extern int __weak sched_isolate_cpu(int cpu);
+extern int __weak sched_unisolate_cpu(int cpu);
+extern int __weak sched_unisolate_cpu_unlocked(int cpu);
+#else
+static inline int sched_isolate_cpu(int cpu)
+{
+	return 0;
+}
+
+static inline int sched_unisolate_cpu(int cpu)
+{
+	return 0;
+}
+
+static inline int sched_unisolate_cpu_unlocked(int cpu)
+{
+	return 0;
+}
+#endif
+
 extern void scheduler_tick(void);
 
 #define	MAX_SCHEDULE_TIMEOUT	LONG_MAX
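
(Aside, not part of the commit: the declarations above are the entry points a caller such as core_ctl would use; their definitions come from elsewhere in the msm snapshot and are not shown in these hunks. A hedged usage sketch:)

    /* Illustrative only: keep 'cpu' free of scheduler activity for a while. */
    #include <linux/sched.h>

    static int run_isolated(int cpu)
    {
            int ret = sched_isolate_cpu(cpu);       /* 0 on success (assumed) */

            if (ret)
                    return ret;

            /* ... latency-sensitive work relies on 'cpu' being left alone ... */

            return sched_unisolate_cpu(cpu);
    }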

include/linux/sched/isolation.h

@@ -28,7 +28,15 @@ extern void __init housekeeping_init(void);
 static inline int housekeeping_any_cpu(enum hk_flags flags)
 {
-	return smp_processor_id();
+	cpumask_t available;
+	int cpu;
+
+	cpumask_andnot(&available, cpu_online_mask, cpu_isolated_mask);
+	cpu = cpumask_any(&available);
+	if (cpu >= nr_cpu_ids)
+		cpu = smp_processor_id();
+
+	return cpu;
 }
 
 static inline const struct cpumask *housekeeping_cpumask(enum hk_flags flags)

@@ -52,7 +60,7 @@ static inline bool housekeeping_cpu(int cpu, enum hk_flags flags)
 	if (static_branch_unlikely(&housekeeping_overridden))
 		return housekeeping_test_cpu(cpu, flags);
 #endif
-	return true;
+	return !cpu_isolated(cpu);
 }
 #endif /* _LINUX_SCHED_ISOLATION_H */

kernel/cpu.c

@@ -990,6 +990,9 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen,
 	if (!cpu_present(cpu))
 		return -EINVAL;
 
+	if (!tasks_frozen && !cpu_isolated(cpu) && num_online_uniso_cpus() == 1)
+		return -EBUSY;
+
 	cpus_write_lock();
 	if (trace_cpuhp_latency_enabled())
 		start_time = sched_clock();

@@ -2377,6 +2380,9 @@ EXPORT_SYMBOL(__cpu_present_mask);
 struct cpumask __cpu_active_mask __read_mostly;
 EXPORT_SYMBOL(__cpu_active_mask);
 
+struct cpumask __cpu_isolated_mask __read_mostly;
+EXPORT_SYMBOL(__cpu_isolated_mask);
+
 atomic_t __num_online_cpus __read_mostly;
 EXPORT_SYMBOL(__num_online_cpus);

@@ -2395,6 +2401,11 @@ void init_cpu_online(const struct cpumask *src)
 	cpumask_copy(&__cpu_online_mask, src);
 }
 
+void init_cpu_isolated(const struct cpumask *src)
+{
+	cpumask_copy(&__cpu_isolated_mask, src);
+}
+
 void set_cpu_online(unsigned int cpu, bool online)
 {
 	/*

kernel/irq/cpuhotplug.c

@@ -12,6 +12,7 @@
 #include <linux/interrupt.h>
 #include <linux/ratelimit.h>
 #include <linux/irq.h>
+#include <linux/cpumask.h>
 
 #include "internals.h"

@@ -57,6 +58,7 @@ static bool migrate_one_irq(struct irq_desc *desc)
 	const struct cpumask *affinity;
 	bool brokeaff = false;
 	int err;
+	struct cpumask available_cpus;
 
 	/*
 	 * IRQ chip might be already torn down, but the irq descriptor is

@@ -109,6 +111,10 @@ static bool migrate_one_irq(struct irq_desc *desc)
 	if (maskchip && chip->irq_mask)
 		chip->irq_mask(d);
 
+	cpumask_copy(&available_cpus, affinity);
+	cpumask_andnot(&available_cpus, &available_cpus, cpu_isolated_mask);
+	affinity = &available_cpus;
+
 	if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
 		/*
 		 * If the interrupt is managed, then shut it down and leave

@@ -119,16 +125,41 @@ static bool migrate_one_irq(struct irq_desc *desc)
 			irq_shutdown_and_deactivate(desc);
 			return false;
 		}
-		affinity = cpu_online_mask;
+		/*
+		 * The order of preference for selecting a fallback CPU is
+		 *
+		 * (1) online and un-isolated CPU from default affinity
+		 * (2) online and un-isolated CPU
+		 * (3) online CPU
+		 */
+		cpumask_andnot(&available_cpus, cpu_online_mask,
+			       cpu_isolated_mask);
+		if (cpumask_intersects(&available_cpus, irq_default_affinity))
+			cpumask_and(&available_cpus, &available_cpus,
+				    irq_default_affinity);
+		else if (cpumask_empty(&available_cpus))
+			affinity = cpu_online_mask;
+
+		/*
+		 * We are overriding the affinity with all online and
+		 * un-isolated cpus. irq_set_affinity_locked() call
+		 * below notify this mask to PM QOS affinity listener.
+		 * That results in applying the CPU_DMA_LATENCY QOS
+		 * to all the CPUs specified in the mask. But the low
+		 * level irqchip driver sets the affinity of an irq
+		 * to only one CPU. So pick only one CPU from the
+		 * prepared mask while overriding the user affinity.
+		 */
+		affinity = cpumask_of(cpumask_any(affinity));
 		brokeaff = true;
 	}
 
 	/*
-	 * Do not set the force argument of irq_do_set_affinity() as this
+	 * Do not set the force argument of irq_set_affinity_locked() as this
 	 * disables the masking of offline CPUs from the supplied affinity
 	 * mask and therefore might keep/reassign the irq to the outgoing
 	 * CPU.
 	 */
-	err = irq_do_set_affinity(d, affinity, false);
+	err = irq_set_affinity_locked(d, affinity, false);
 	if (err) {
 		pr_warn_ratelimited("IRQ%u: set affinity failed(%d).\n",
 				    d->irq, err);

kernel/irq/proc.c

@@ -132,6 +132,11 @@ static ssize_t write_irq_affinity(int type, struct file *file,
 	if (err)
 		goto free_cpumask;
 
+	if (cpumask_subset(new_value, cpu_isolated_mask)) {
+		err = -EINVAL;
+		goto free_cpumask;
+	}
+
 	/*
 	 * Do not allow disabling IRQs completely - it's a too easy
 	 * way to make the system unusable accidentally :-) At least
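
(Aside, not part of the commit: with the check above, a /proc/irq/N/smp_affinity write whose mask is a subset of the isolated CPUs now fails with EINVAL. An illustrative user-space probe; the IRQ number and mask below are made up.)

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            FILE *f = fopen("/proc/irq/100/smp_affinity", "w");

            if (!f)
                    return 1;
            /* "4" = CPU2 only; rejected if CPU2 is isolated. */
            if (fprintf(f, "4\n") < 0 || fflush(f) == EOF)
                    printf("write rejected: %s\n", strerror(errno));
            fclose(f);
            return 0;
    }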

kernel/sched/core.c

@@ -11,6 +11,7 @@
 #include <linux/nospec.h>
 #include <linux/kcov.h>
+#include <linux/irq.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>

@@ -1649,6 +1650,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 	struct rq_flags rf;
 	struct rq *rq;
 	int ret = 0;
+	cpumask_t allowed_mask;
 
 	rq = task_rq_lock(p, &rf);
 	update_rq_clock(rq);

@@ -1672,10 +1674,17 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 	if (cpumask_equal(p->cpus_ptr, new_mask))
 		goto out;
 
-	dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
+	cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
+	cpumask_and(&allowed_mask, &allowed_mask, cpu_valid_mask);
+
+	dest_cpu = cpumask_any(&allowed_mask);
 	if (dest_cpu >= nr_cpu_ids) {
-		ret = -EINVAL;
-		goto out;
+		cpumask_and(&allowed_mask, cpu_valid_mask, new_mask);
+		dest_cpu = cpumask_any(&allowed_mask);
+		if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
+			ret = -EINVAL;
+			goto out;
+		}
 	}
 
 	do_set_cpus_allowed(p, new_mask);

@@ -1691,7 +1700,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
 	}
 
 	/* Can the task run on the task's current CPU? If so, we're done */
-	if (cpumask_test_cpu(task_cpu(p), new_mask))
+	if (cpumask_test_cpu(task_cpu(p), &allowed_mask))
 		goto out;
 
 	if (task_running(rq, p) || p->state == TASK_WAKING) {
@@ -2043,12 +2052,13 @@ EXPORT_SYMBOL_GPL(kick_process);
  * select_task_rq() below may allow selection of !active CPUs in order
  * to satisfy the above rules.
  */
-static int select_fallback_rq(int cpu, struct task_struct *p)
+static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso)
 {
 	int nid = cpu_to_node(cpu);
 	const struct cpumask *nodemask = NULL;
-	enum { cpuset, possible, fail } state = cpuset;
+	enum { cpuset, possible, fail, bug } state = cpuset;
 	int dest_cpu;
+	int isolated_candidate = -1;
 
 	/*
 	 * If the node that the CPU is on has been offlined, cpu_to_node()

@@ -2062,6 +2072,8 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 		for_each_cpu(dest_cpu, nodemask) {
 			if (!cpu_active(dest_cpu))
 				continue;
+			if (cpu_isolated(dest_cpu))
+				continue;
 			if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
 				return dest_cpu;
 		}

@@ -2072,7 +2084,16 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 		for_each_cpu(dest_cpu, p->cpus_ptr) {
 			if (!is_cpu_allowed(p, dest_cpu))
 				continue;
+			if (cpu_isolated(dest_cpu)) {
+				if (allow_iso)
+					isolated_candidate = dest_cpu;
+				continue;
+			}
+			goto out;
+		}
+
+		if (isolated_candidate != -1) {
+			dest_cpu = isolated_candidate;
 			goto out;
 		}

@@ -2091,6 +2112,11 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
 			break;
 
 		case fail:
+			allow_iso = true;
+			state = bug;
+			break;
+
+		case bug:
 			BUG();
 			break;
 		}

@@ -2118,6 +2144,8 @@ out:
 static inline
 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 {
+	bool allow_isolated = (p->flags & PF_KTHREAD);
+
 	lockdep_assert_held(&p->pi_lock);
 
 	if (p->nr_cpus_allowed > 1)

@@ -2135,8 +2163,9 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
 	 * [ this allows ->select_task() to simply return task_cpu(p) and
 	 *   not worry about this generic constraint ]
 	 */
-	if (unlikely(!is_cpu_allowed(p, cpu)))
-		cpu = select_fallback_rq(task_cpu(p), p);
+	if (unlikely(!is_cpu_allowed(p, cpu)) ||
+	    (cpu_isolated(cpu) && !allow_isolated))
+		cpu = select_fallback_rq(task_cpu(p), p, allow_isolated);
 
 	return cpu;
 }
@@ -2327,6 +2356,7 @@ void sched_ttwu_pending(void)
 void scheduler_ipi(void)
 {
+	int cpu = smp_processor_id();
 	/*
 	 * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
 	 * TIF_NEED_RESCHED remotely (for the first time) will also send

@@ -2356,7 +2386,7 @@ void scheduler_ipi(void)
 	/*
 	 * Check if someone kicked us for doing the nohz idle load balance.
 	 */
-	if (unlikely(got_nohz_idle_kick())) {
+	if (unlikely(got_nohz_idle_kick()) && !cpu_isolated(cpu)) {
 		this_rq()->idle_balance = 1;
 		raise_softirq_irqoff(SCHED_SOFTIRQ);
 	}

@@ -3542,7 +3572,7 @@ void sched_exec(void)
 	if (dest_cpu == smp_processor_id())
 		goto unlock;
 
-	if (likely(cpu_active(dest_cpu))) {
+	if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) {
 		struct migration_arg arg = { p, dest_cpu };
 
 		raw_spin_unlock_irqrestore(&p->pi_lock, flags);

@@ -5463,6 +5493,8 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	cpumask_var_t cpus_allowed, new_mask;
 	struct task_struct *p;
 	int retval;
+	int dest_cpu;
+	cpumask_t allowed_mask;
 
 	rcu_read_lock();

@@ -5524,20 +5556,26 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	}
 #endif
 again:
-	retval = __set_cpus_allowed_ptr(p, new_mask, true);
-
-	if (!retval) {
-		cpuset_cpus_allowed(p, cpus_allowed);
-		if (!cpumask_subset(new_mask, cpus_allowed)) {
-			/*
-			 * We must have raced with a concurrent cpuset
-			 * update. Just reset the cpus_allowed to the
-			 * cpuset's cpus_allowed
-			 */
-			cpumask_copy(new_mask, cpus_allowed);
-			goto again;
-		}
-	}
+	cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
+	dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask);
+	if (dest_cpu < nr_cpu_ids) {
+		retval = __set_cpus_allowed_ptr(p, new_mask, true);
+		if (!retval) {
+			cpuset_cpus_allowed(p, cpus_allowed);
+			if (!cpumask_subset(new_mask, cpus_allowed)) {
+				/*
+				 * We must have raced with a concurrent cpuset
+				 * update. Just reset the cpus_allowed to the
+				 * cpuset's cpus_allowed
+				 */
+				cpumask_copy(new_mask, cpus_allowed);
+				goto again;
+			}
+		}
+	} else {
+		retval = -EINVAL;
+	}
 out_free_new_mask:
 	free_cpumask_var(new_mask);
 out_free_cpus_allowed:

@@ -5655,6 +5693,14 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
+
+	/* The userspace tasks are forbidden to run on
+	 * isolated CPUs. So exclude isolated CPUs from
+	 * the getaffinity.
+	 */
+	if (!(p->flags & PF_KTHREAD))
+		cpumask_andnot(mask, mask, cpu_isolated_mask);
+
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 out_unlock:
@@ -6351,19 +6397,25 @@ static struct task_struct *__pick_migrate_task(struct rq *rq)
 }
 
 /*
- * Migrate all tasks from the rq, sleeping tasks will be migrated by
- * try_to_wake_up()->select_task_rq().
+ * Migrate all tasks (not pinned if pinned argument say so) from the rq,
+ * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq().
  *
  * Called with rq->lock held even though we'er in stop_machine() and
  * there's no concurrency possible, we hold the required locks anyway
  * because of lock validation efforts.
  */
-void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
+void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf,
+		   bool migrate_pinned_tasks)
 {
 	struct rq *rq = dead_rq;
 	struct task_struct *next, *stop = rq->stop;
 	struct rq_flags orf = *rf;
 	int dest_cpu;
+	unsigned int num_pinned_kthreads = 1; /* this thread */
+	LIST_HEAD(tasks);
+	cpumask_t avail_cpus;
+
+	cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
 
 	/*
 	 * Fudge the rq selection such that the below task selection loop

@@ -6386,13 +6438,20 @@ void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
 	for (;;) {
 		/*
 		 * There's this thread running, bail when that's the only
-		 * remaining thread:
+		 * remaining thread.
 		 */
 		if (rq->nr_running == 1)
 			break;
 
 		next = __pick_migrate_task(rq);
+		if (!migrate_pinned_tasks && next->flags & PF_KTHREAD &&
+		    !cpumask_intersects(&avail_cpus, &next->cpus_mask)) {
+			detach_one_task_core(next, rq, &tasks);
+			num_pinned_kthreads += 1;
+			continue;
+		}
 
 		/*
 		 * Rules for changing task_struct::cpus_mask are holding
 		 * both pi_lock and rq->lock, such that holding either

@@ -6405,31 +6464,43 @@ void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
 		rq_unlock(rq, rf);
 		raw_spin_lock(&next->pi_lock);
 		rq_relock(rq, rf);
+		if (!(rq->clock_update_flags & RQCF_UPDATED))
+			update_rq_clock(rq);
 
 		/*
 		 * Since we're inside stop-machine, _nothing_ should have
 		 * changed the task, WARN if weird stuff happened, because in
 		 * that case the above rq->lock drop is a fail too.
+		 * However, during cpu isolation the load balancer might have
+		 * interferred since we don't stop all CPUs. Ignore warning for
+		 * this case.
 		 */
-		if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
+		if (task_rq(next) != rq || !task_on_rq_queued(next)) {
+			WARN_ON(migrate_pinned_tasks);
 			raw_spin_unlock(&next->pi_lock);
 			continue;
 		}
 
 		/* Find suitable destination for @next, with force if needed. */
-		dest_cpu = select_fallback_rq(dead_rq->cpu, next);
+		dest_cpu = select_fallback_rq(dead_rq->cpu, next, false);
 		rq = __migrate_task(rq, rf, next, dest_cpu);
 		if (rq != dead_rq) {
 			rq_unlock(rq, rf);
 			rq = dead_rq;
 			*rf = orf;
 			rq_relock(rq, rf);
+			if (!(rq->clock_update_flags & RQCF_UPDATED))
+				update_rq_clock(rq);
 		}
 		raw_spin_unlock(&next->pi_lock);
 	}
 
 	rq->stop = stop;
+
+	if (num_pinned_kthreads > 1)
+		attach_tasks_core(&tasks, rq);
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
 void set_rq_online(struct rq *rq)

@@ -6619,7 +6690,7 @@ int sched_cpu_dying(unsigned int cpu)
 		BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 		set_rq_offline(rq);
 	}
-	migrate_tasks(rq, &rf);
+	migrate_tasks(rq, &rf, true);
 	BUG_ON(rq->nr_running != 1);
 	rq_unlock_irqrestore(rq, &rf);

kernel/sched/fair.c

@@ -5983,6 +5983,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 			return si_cpu;
 		if (!cpumask_test_cpu(cpu, p->cpus_ptr))
 			continue;
+		if (cpu_isolated(cpu))
+			continue;
 		if (available_idle_cpu(cpu))
 			break;
 		if (si_cpu == -1 && sched_idle_cpu(cpu))

@@ -6005,14 +6007,16 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 	struct sched_domain *sd;
 	int i, recent_used_cpu;
 
-	if (available_idle_cpu(target) || sched_idle_cpu(target))
+	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+	    !cpu_isolated(target))
 		return target;
 
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
 	if (prev != target && cpus_share_cache(prev, target) &&
-	    (available_idle_cpu(prev) || sched_idle_cpu(prev)))
+	    ((available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+	    !cpu_isolated(prev)))
 		return prev;
 
 	/* Check a recently used CPU as a potential idle candidate: */
@@ -7892,6 +7896,8 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 			struct sched_group_capacity *sgc;
 			struct rq *rq = cpu_rq(cpu);
 
+			if (cpumask_test_cpu(cpu, cpu_isolated_mask))
+				continue;
+
 			/*
 			 * build_sched_domains() -> init_sched_groups_capacity()
 			 * gets here before we've attached the domains to the

@@ -7922,10 +7928,15 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 		group = child->groups;
 		do {
 			struct sched_group_capacity *sgc = group->sgc;
+			cpumask_t *cpus = sched_group_span(group);
 
-			capacity += sgc->capacity;
-			min_capacity = min(sgc->min_capacity, min_capacity);
-			max_capacity = max(sgc->max_capacity, max_capacity);
+			if (!cpu_isolated(cpumask_first(cpus))) {
+				capacity += sgc->capacity;
+				min_capacity = min(sgc->min_capacity,
+						   min_capacity);
+				max_capacity = max(sgc->max_capacity,
+						   max_capacity);
+			}
 			group = group->next;
 		} while (group != child->groups);
 	}

@@ -8129,6 +8140,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
 
+		if (cpu_isolated(i))
+			continue;
+
 		if ((env->flags & LBF_NOHZ_STATS) && update_nohz_stats(rq, false))
 			env->flags |= LBF_NOHZ_AGAIN;

@@ -8160,17 +8174,27 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		}
 	}
 
-	/* Adjust by relative CPU capacity of the group */
-	sgs->group_capacity = group->sgc->capacity;
-	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
+	/* Isolated CPU has no weight */
+	if (!group->group_weight) {
+		sgs->group_capacity = 0;
+		sgs->avg_load = 0;
+		sgs->group_no_capacity = 1;
+		sgs->group_type = group_other;
+		sgs->group_weight = group->group_weight;
+	} else {
+		/* Adjust by relative CPU capacity of the group */
+		sgs->group_capacity = group->sgc->capacity;
+		sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) /
+						sgs->group_capacity;
+
+		sgs->group_weight = group->group_weight;
+
+		sgs->group_no_capacity = group_is_overloaded(env, sgs);
+		sgs->group_type = group_classify(group, sgs);
+	}
 
 	if (sgs->sum_nr_running)
 		sgs->load_per_task = sgs->group_load / sgs->sum_nr_running;
-
-	sgs->group_weight = group->group_weight;
-
-	sgs->group_no_capacity = group_is_overloaded(env, sgs);
-	sgs->group_type = group_classify(group, sgs);
 }
 
 /**
@@ -8910,7 +8934,7 @@ static int should_we_balance(struct lb_env *env)
 	/* Try to find first idle CPU */
 	for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
-		if (!idle_cpu(cpu))
+		if (!idle_cpu(cpu) || cpu_isolated(cpu))
 			continue;
 
 		balance_cpu = cpu;

@@ -8918,7 +8942,7 @@ static int should_we_balance(struct lb_env *env)
 	}
 
 	if (balance_cpu == -1)
-		balance_cpu = group_balance_cpu(sg);
+		balance_cpu = group_balance_cpu_not_isolated(sg);
 
 	/*
 	 * First idle CPU or the first CPU(busiest) in this sched group

@@ -9127,7 +9151,8 @@ more_balance:
 			 * ->active_balance_work. Once set, it's cleared
 			 * only after active load balance is finished.
 			 */
-			if (!busiest->active_balance) {
+			if (!busiest->active_balance &&
+			    !cpu_isolated(cpu_of(busiest))) {
 				busiest->active_balance = 1;
 				busiest->push_cpu = this_cpu;
 				active_balance = 1;
@@ -9333,7 +9358,13 @@ static DEFINE_SPINLOCK(balancing);
  */
 void update_max_interval(void)
 {
-	max_load_balance_interval = HZ*num_online_cpus()/10;
+	cpumask_t avail_mask;
+	unsigned int available_cpus;
+
+	cpumask_andnot(&avail_mask, cpu_online_mask, cpu_isolated_mask);
+	available_cpus = cpumask_weight(&avail_mask);
+
+	max_load_balance_interval = HZ*available_cpus/10;
 }
 
 /*

@@ -9510,6 +9541,7 @@ static void nohz_balancer_kick(struct rq *rq)
 	struct sched_domain *sd;
 	int nr_busy, i, cpu = rq->cpu;
 	unsigned int flags = 0;
+	cpumask_t cpumask;
 
 	if (unlikely(rq->idle_balance))
 		return;

@@ -9524,7 +9556,8 @@ static void nohz_balancer_kick(struct rq *rq)
 	 * None are in tickless mode and hence no need for NOHZ idle load
 	 * balancing.
 	 */
-	if (likely(!atomic_read(&nohz.nr_cpus)))
+	cpumask_andnot(&cpumask, nohz.idle_cpus_mask, cpu_isolated_mask);
+	if (cpumask_empty(&cpumask))
 		return;
 
 	if (READ_ONCE(nohz.has_blocked) &&

@@ -9561,7 +9594,7 @@ static void nohz_balancer_kick(struct rq *rq)
 		 * currently idle; in which case, kick the ILB to move tasks
 		 * around.
 		 */
-		for_each_cpu_and(i, sched_domain_span(sd), nohz.idle_cpus_mask) {
+		for_each_cpu_and(i, sched_domain_span(sd), &cpumask) {
 			if (sched_asym_prefer(i, cpu)) {
 				flags = NOHZ_KICK_MASK;
 				goto unlock;

@@ -9739,6 +9772,7 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	int balance_cpu;
 	int ret = false;
 	struct rq *rq;
+	cpumask_t cpus;
 
 	SCHED_WARN_ON((flags & NOHZ_KICK_MASK) == NOHZ_BALANCE_KICK);

@@ -9758,7 +9792,9 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
 	 */
 	smp_mb();
 
-	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
+	cpumask_andnot(&cpus, nohz.idle_cpus_mask, cpu_isolated_mask);
+
+	for_each_cpu(balance_cpu, &cpus) {
 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
 			continue;

@@ -9910,6 +9946,9 @@ int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 	int pulled_task = 0;
 	u64 curr_cost = 0;
 
+	if (cpu_isolated(this_cpu))
+		return 0;
+
 	update_misfit_status(NULL, this_rq);
 	/*
 	 * We must set idle_stamp _before_ calling idle_balance(), such that we

@@ -10026,6 +10065,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
 	enum cpu_idle_type idle = this_rq->idle_balance ?
 						CPU_IDLE : CPU_NOT_IDLE;
 
+	/*
+	 * Since core isolation doesn't update nohz.idle_cpus_mask, there
+	 * is a possibility this nohz kicked cpu could be isolated. Hence
+	 * return if the cpu is isolated.
+	 */
+	if (cpu_isolated(this_rq->cpu))
+		return;
+
 	/*
 	 * If this CPU has a pending nohz_balance_kick, then do the
 	 * balancing on behalf of the other idle CPUs whose ticks are

@@ -10047,8 +10094,10 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
  */
 void trigger_load_balance(struct rq *rq)
 {
-	/* Don't need to rebalance while attached to NULL domain */
-	if (unlikely(on_null_domain(rq)))
+	/* Don't need to rebalance while attached to NULL domain or
+	 * cpu is isolated.
+	 */
+	if (unlikely(on_null_domain(rq)) || cpu_isolated(cpu_of(rq)))
 		return;
 
 	if (time_after_eq(jiffies, rq->next_balance))

kernel/sched/rt.c

@@ -263,8 +263,12 @@ static void pull_rt_task(struct rq *this_rq);
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 {
-	/* Try to pull RT tasks here if we lower this rq's prio */
-	return rq->rt.highest_prio.curr > prev->prio;
+	/*
+	 * Try to pull RT tasks here if we lower this rq's prio and cpu is not
+	 * isolated
+	 */
+	return rq->rt.highest_prio.curr > prev->prio &&
+	       !cpu_isolated(cpu_of(rq));
 }
 
 static inline int rt_overloaded(struct rq *rq)

@@ -2192,7 +2196,8 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
 	 * we may need to handle the pulling of RT tasks
 	 * now.
 	 */
-	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
+	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running ||
+	    cpu_isolated(cpu_of(rq)))
 		return;
 
 	rt_queue_pull_task(rq);

kernel/sched/sched.h

@@ -163,6 +163,10 @@ extern atomic_long_t calc_load_tasks;
 extern void calc_global_load_tick(struct rq *this_rq);
 extern long calc_load_fold_active(struct rq *this_rq, long adjust);
 
+#ifdef CONFIG_SMP
+extern void init_sched_groups_capacity(int cpu, struct sched_domain *sd);
+#endif
+
 /*
  * Helpers for converting nanosecond timing to jiffy resolution
  */

@@ -3307,7 +3311,8 @@ extern int active_load_balance_cpu_stop(void *data);
 #ifdef CONFIG_HOTPLUG_CPU
 extern void set_rq_online(struct rq *rq);
 extern void set_rq_offline(struct rq *rq);
-extern void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf);
+extern void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf,
+			  bool migrate_pinned_tasks);
 extern void calc_load_migrate(struct rq *rq);
 #ifdef CONFIG_SCHED_WALT
 extern void __weak

kernel/sched/topology.c

@@ -1163,16 +1163,19 @@ build_sched_groups(struct sched_domain *sd, int cpu)
  * group having more cpu_capacity will pickup more load compared to the
  * group having less cpu_capacity.
  */
-static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
+void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 {
 	struct sched_group *sg = sd->groups;
+	cpumask_t avail_mask;
 
 	WARN_ON(!sg);
 
 	do {
 		int cpu, max_cpu = -1;
 
-		sg->group_weight = cpumask_weight(sched_group_span(sg));
+		cpumask_andnot(&avail_mask, sched_group_span(sg),
+			       cpu_isolated_mask);
+		sg->group_weight = cpumask_weight(&avail_mask);
 
 		if (!(sd->flags & SD_ASYM_PACKING))
 			goto next;

kernel/smp.c

@@ -792,7 +792,8 @@ void wake_up_all_idle_cpus(void)
 		if (cpu == smp_processor_id())
 			continue;
 
-		wake_up_if_idle(cpu);
+		if (!cpu_isolated(cpu))
+			wake_up_if_idle(cpu);
 	}
 	preempt_enable();
 }

kernel/time/hrtimer.c

@@ -2014,7 +2014,7 @@ int hrtimers_prepare_cpu(unsigned int cpu)
 	return 0;
 }
 
-#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_CPUSETS)
+#ifdef CONFIG_HOTPLUG_CPU
 static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 				 struct hrtimer_clock_base *new_base,
 				 bool remove_pinned)

@@ -2023,12 +2023,14 @@ static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
 	struct timerqueue_node *node;
 	struct timerqueue_head pinned;
 	int is_pinned;
+	bool is_hotplug = !cpu_online(old_base->cpu_base->cpu);
 
 	timerqueue_init_head(&pinned);
 
 	while ((node = timerqueue_getnext(&old_base->active))) {
 		timer = container_of(node, struct hrtimer, node);
-		BUG_ON(hrtimer_callback_running(timer));
+		if (is_hotplug)
+			BUG_ON(hrtimer_callback_running(timer));
 		debug_deactivate(timer);
 
 		/*

@@ -2106,9 +2108,7 @@ static void __migrate_hrtimers(unsigned int scpu, bool remove_pinned)
 	local_irq_restore(flags);
 	local_bh_enable();
 }
-#endif /* CONFIG_HOTPLUG_CPU || CONFIG_CPUSETS */
 
-#ifdef CONFIG_HOTPLUG_CPU
 int hrtimers_dead_cpu(unsigned int scpu)
 {
 	BUG_ON(cpu_online(scpu));

@@ -2117,14 +2117,13 @@ int hrtimers_dead_cpu(unsigned int scpu)
 	__migrate_hrtimers(scpu, true);
 	return 0;
 }
-#endif /* CONFIG_HOTPLUG_CPU */
 
-#ifdef CONFIG_CPUSETS
 void hrtimer_quiesce_cpu(void *cpup)
 {
 	__migrate_hrtimers(*(int *)cpup, false);
 }
-#endif /* CONFIG_CPUSETS */
+#endif /* CONFIG_HOTPLUG_CPU */
 
 void __init hrtimers_init(void)
 {

kernel/time/timer.c

@@ -2038,7 +2038,8 @@ static void __migrate_timers(unsigned int cpu, bool remove_pinned)
 	 */
 	forward_timer_base(new_base);
 
-	BUG_ON(old_base->running_timer);
+	if (!cpu_online(cpu))
+		BUG_ON(old_base->running_timer);
 
 	for (i = 0; i < WHEEL_SIZE; i++)
 		migrate_timer_list(new_base, old_base->vectors + i,

@@ -2057,12 +2058,10 @@
 	return 0;
 }
 
-#ifdef CONFIG_CPUSETS
 void timer_quiesce_cpu(void *cpup)
 {
 	__migrate_timers(*(unsigned int *)cpup, false);
 }
-#endif /* CONFIG_CPUSETS */
 
 #endif /* CONFIG_HOTPLUG_CPU */

kernel/watchdog.c

@@ -14,6 +14,7 @@
 #include <linux/mm.h>
 #include <linux/cpu.h>
+#include <linux/device.h>
 #include <linux/nmi.h>
 #include <linux/init.h>
 #include <linux/module.h>

@@ -170,6 +171,7 @@ static u64 __read_mostly sample_period;
 static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
 static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
+static DEFINE_PER_CPU(unsigned int, watchdog_en);
 static DEFINE_PER_CPU(bool, softlockup_touch_sync);
 static DEFINE_PER_CPU(bool, soft_watchdog_warn);
 static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);

@@ -476,16 +478,20 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
 	return HRTIMER_RESTART;
 }
 
-static void watchdog_enable(unsigned int cpu)
+void watchdog_enable(unsigned int cpu)
 {
 	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
 	struct completion *done = this_cpu_ptr(&softlockup_completion);
+	unsigned int *enabled = this_cpu_ptr(&watchdog_en);
 
 	WARN_ON_ONCE(cpu != smp_processor_id());
 
 	init_completion(done);
 	complete(done);
 
+	if (*enabled)
+		return;
+
 	/*
 	 * Start the timer first to prevent the NMI watchdog triggering
 	 * before the timer has a chance to fire.

@@ -500,11 +506,24 @@ static void watchdog_enable(unsigned int cpu)
 	/* Enable the perf event */
 	if (watchdog_enabled & NMI_WATCHDOG_ENABLED)
 		watchdog_nmi_enable(cpu);
+
+	/*
+	 * Need to ensure above operations are observed by other CPUs before
+	 * indicating that timer is enabled. This is to synchronize core
+	 * isolation and hotplug. Core isolation will wait for this flag to be
+	 * set.
+	 */
+	mb();
+	*enabled = 1;
 }
 
-static void watchdog_disable(unsigned int cpu)
+void watchdog_disable(unsigned int cpu)
 {
 	struct hrtimer *hrtimer = this_cpu_ptr(&watchdog_hrtimer);
+	unsigned int *enabled = this_cpu_ptr(&watchdog_en);
+
+	if (!*enabled)
+		return;
 
 	WARN_ON_ONCE(cpu != smp_processor_id());

@@ -516,6 +535,17 @@ static void watchdog_disable(unsigned int cpu)
 	watchdog_nmi_disable(cpu);
 	hrtimer_cancel(hrtimer);
 	wait_for_completion(this_cpu_ptr(&softlockup_completion));
+
+	/*
+	 * No need for barrier here since disabling the watchdog is
+	 * synchronized with hotplug lock
+	 */
+	*enabled = 0;
+}
+
+bool watchdog_configured(unsigned int cpu)
+{
+	return *per_cpu_ptr(&watchdog_en, cpu);
 }
 
 static int softlockup_stop_fn(void *data)

mm/vmstat.c

@@ -1807,7 +1807,7 @@ int vmstat_refresh(struct ctl_table *table, int write,
 static void vmstat_update(struct work_struct *w)
 {
-	if (refresh_cpu_vm_stats(true)) {
+	if (refresh_cpu_vm_stats(true) && !cpu_isolated(smp_processor_id())) {
 		/*
 		 * Counters were updated so we expect more updates
 		 * to occur in the future. Keep on running the

@@ -1899,7 +1899,8 @@ static void vmstat_shepherd(struct work_struct *w)
 	for_each_online_cpu(cpu) {
 		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
 
-		if (!delayed_work_pending(dw) && need_update(cpu))
+		if (!delayed_work_pending(dw) && need_update(cpu) &&
+		    !cpu_isolated(cpu))
 			queue_delayed_work_on(cpu, mm_percpu_wq, dw, 0);
 	}
 	put_online_cpus();