Merge branch 'linus' into x86/urgent, to be able to merge a dependent fix

Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-09-05 09:00:30 +02:00
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,8 +23,10 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
 CFLAGS_irq.o := -I$(src)/../include/asm/trace

 obj-y			:= process_$(BITS).o signal.o
+obj-$(CONFIG_COMPAT)	+= signal_compat.o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y			+= time.o ioport.o ldt.o dumpstack.o nmi.o
+obj-y			+= time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_MODIFY_LDT_SYSCALL)	+= ldt.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
 obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-y			+= probe_roms.o
@@ -107,8 +109,6 @@ obj-$(CONFIG_EFI)			+= sysfb_efi.o

 obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
-obj-$(CONFIG_IOSF_MBI)			+= iosf_mbi.o
-obj-$(CONFIG_PMC_ATOM)			+= pmc_atom.o

 ###
 # 64 bit specific files
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -710,7 +710,7 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 #endif
 }

-static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
+int acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
 {
 	int cpu;

@@ -726,12 +726,6 @@ static int _acpi_map_lsapic(acpi_handle handle, int physid, int *pcpu)
 	*pcpu = cpu;
 	return 0;
 }
-
-/* wrapper to silence section mismatch warning */
-int __ref acpi_map_cpu(acpi_handle handle, phys_cpuid_t physid, int *pcpu)
-{
-	return _acpi_map_lsapic(handle, physid, pcpu);
-}
 EXPORT_SYMBOL(acpi_map_cpu);

 int acpi_unmap_cpu(int cpu)
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -263,7 +263,7 @@ static int apbt_clocksource_register(void)

 	/* Verify whether apbt counter works */
 	t1 = dw_apb_clocksource_read(clocksource_apbt);
-	rdtscll(start);
+	start = rdtsc();

 	/*
 	 * We don't know the TSC frequency yet, but waiting for
@@ -273,7 +273,7 @@ static int apbt_clocksource_register(void)
 	 */
 	do {
 		rep_nop();
-		rdtscll(now);
+		now = rdtsc();
 	} while ((now - start) < 200000UL);

 	/* APBT is the only always on clocksource, it has to work! */
@@ -390,13 +390,13 @@ unsigned long apbt_quick_calibrate(void)
 	old = dw_apb_clocksource_read(clocksource_apbt);
 	old += loop;

-	t1 = __native_read_tsc();
+	t1 = rdtsc();

 	do {
 		new = dw_apb_clocksource_read(clocksource_apbt);
 	} while (new < old);

-	t2 = __native_read_tsc();
+	t2 = rdtsc();

 	shift = 5;
 	if (unlikely(loop >> shift == 0)) {
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -457,45 +457,45 @@ static int lapic_next_deadline(unsigned long delta,
 {
 	u64 tsc;

-	rdtscll(tsc);
+	tsc = rdtsc();
 	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
 	return 0;
 }

-/*
- * Setup the lapic timer in periodic or oneshot mode
- */
-static void lapic_timer_setup(enum clock_event_mode mode,
-			      struct clock_event_device *evt)
+static int lapic_timer_shutdown(struct clock_event_device *evt)
 {
-	unsigned long flags;
 	unsigned int v;

 	/* Lapic used as dummy for broadcast ? */
 	if (evt->features & CLOCK_EVT_FEAT_DUMMY)
-		return;
+		return 0;

-	local_irq_save(flags);
+	v = apic_read(APIC_LVTT);
+	v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
+	apic_write(APIC_LVTT, v);
+	apic_write(APIC_TMICT, 0);
+	return 0;
+}

-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-	case CLOCK_EVT_MODE_ONESHOT:
-		__setup_APIC_LVTT(lapic_timer_frequency,
-				  mode != CLOCK_EVT_MODE_PERIODIC, 1);
-		break;
-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		v = apic_read(APIC_LVTT);
-		v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
-		apic_write(APIC_LVTT, v);
-		apic_write(APIC_TMICT, 0);
-		break;
-	case CLOCK_EVT_MODE_RESUME:
-		/* Nothing to do here */
-		break;
-	}
+static inline int
+lapic_timer_set_periodic_oneshot(struct clock_event_device *evt, bool oneshot)
+{
+	/* Lapic used as dummy for broadcast ? */
+	if (evt->features & CLOCK_EVT_FEAT_DUMMY)
+		return 0;

-	local_irq_restore(flags);
+	__setup_APIC_LVTT(lapic_timer_frequency, oneshot, 1);
+	return 0;
+}
+
+static int lapic_timer_set_periodic(struct clock_event_device *evt)
+{
+	return lapic_timer_set_periodic_oneshot(evt, false);
+}
+
+static int lapic_timer_set_oneshot(struct clock_event_device *evt)
+{
+	return lapic_timer_set_periodic_oneshot(evt, true);
 }

 /*
@@ -513,15 +513,18 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
 * The local apic timer can be used for any function which is CPU local.
 */
 static struct clock_event_device lapic_clockevent = {
-	.name		= "lapic",
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
-			| CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
-	.shift		= 32,
-	.set_mode	= lapic_timer_setup,
-	.set_next_event	= lapic_next_event,
-	.broadcast	= lapic_timer_broadcast,
-	.rating		= 100,
-	.irq		= -1,
+	.name			= "lapic",
+	.features		= CLOCK_EVT_FEAT_PERIODIC |
+				  CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP
+				  | CLOCK_EVT_FEAT_DUMMY,
+	.shift			= 32,
+	.set_state_shutdown	= lapic_timer_shutdown,
+	.set_state_periodic	= lapic_timer_set_periodic,
+	.set_state_oneshot	= lapic_timer_set_oneshot,
+	.set_next_event		= lapic_next_event,
+	.broadcast		= lapic_timer_broadcast,
+	.rating			= 100,
+	.irq			= -1,
 };
 static DEFINE_PER_CPU(struct clock_event_device, lapic_events);

@@ -592,7 +595,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
 	unsigned long pm = acpi_pm_read_early();

 	if (cpu_has_tsc)
-		rdtscll(tsc);
+		tsc = rdtsc();

 	switch (lapic_cal_loops++) {
 	case 0:
@@ -778,7 +781,7 @@ static int __init calibrate_APIC_clock(void)
 		 * Setup the apic timer manually
 		 */
 		levt->event_handler = lapic_cal_handler;
-		lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt);
+		lapic_timer_set_periodic(levt);
 		lapic_cal_loops = -1;

 		/* Let the interrupts run */
@@ -788,7 +791,8 @@ static int __init calibrate_APIC_clock(void)
 			cpu_relax();

 		/* Stop the lapic timer */
-		lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt);
+		local_irq_disable();
+		lapic_timer_shutdown(levt);

 		/* Jiffies delta */
 		deltaj = lapic_cal_j2 - lapic_cal_j1;
@@ -799,8 +803,8 @@ static int __init calibrate_APIC_clock(void)
 			apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
 		else
 			levt->features |= CLOCK_EVT_FEAT_DUMMY;
-	} else
-		local_irq_enable();
+	}
+	local_irq_enable();

 	if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
 		pr_warning("APIC timer disabled due to verification failure\n");
@@ -878,7 +882,7 @@ static void local_apic_timer_interrupt(void)
 	if (!evt->event_handler) {
 		pr_warning("Spurious LAPIC timer interrupt on cpu %d\n", cpu);
 		/* Switch it off */
-		lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
+		lapic_timer_shutdown(evt);
 		return;
 	}

@@ -1209,7 +1213,7 @@ void setup_local_APIC(void)
 	long long max_loops = cpu_khz ? cpu_khz : 1000000;

 	if (cpu_has_tsc)
-		rdtscll(tsc);
+		tsc = rdtsc();

 	if (disable_apic) {
 		disable_ioapic_support();
@@ -1293,7 +1297,7 @@ void setup_local_APIC(void)
 		}
 		if (queued) {
 			if (cpu_has_tsc && cpu_khz) {
-				rdtscll(ntsc);
+				ntsc = rdtsc();
 				max_loops = (cpu_khz << 10) - (ntsc - tsc);
 			} else
 				max_loops--;
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -191,7 +191,6 @@ static struct apic apic_flat =  {
 	.send_IPI_all			= flat_send_IPI_all,
 	.send_IPI_self			= apic_send_IPI_self,

-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= default_inquire_remote_apic,

 	.read				= native_apic_mem_read,
@@ -299,7 +298,6 @@ static struct apic apic_physflat =  {
 	.send_IPI_all			= physflat_send_IPI_all,
 	.send_IPI_self			= apic_send_IPI_self,

-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= default_inquire_remote_apic,

 	.read				= native_apic_mem_read,
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -152,7 +152,6 @@ struct apic apic_noop = {

 	.wakeup_secondary_cpu		= noop_wakeup_secondary_cpu,

-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL,

 	.read				= noop_apic_read,
--- a/arch/x86/kernel/apic/apic_numachip.c
+++ b/arch/x86/kernel/apic/apic_numachip.c
@@ -92,7 +92,6 @@ static int numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip)

 	write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v);

-	atomic_set(&init_deasserted, 1);
 	return 0;
 }

@@ -235,7 +234,6 @@ static const struct apic apic_numachip __refconst = {
 	.send_IPI_self			= numachip_send_IPI_self,

 	.wakeup_secondary_cpu		= numachip_wakeup_secondary,
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL, /* REMRD not supported */

 	.read				= native_apic_mem_read,
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -186,7 +186,6 @@ static struct apic apic_bigsmp = {
 	.send_IPI_all			= bigsmp_send_IPI_all,
 	.send_IPI_self			= default_send_IPI_self,

-	.wait_for_init_deassert		= true,
 	.inquire_remote_apic		= default_inquire_remote_apic,

 	.read				= native_apic_mem_read,
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2541,7 +2541,7 @@ void __init setup_ioapic_dest(void)
 		 * Honour affinities which have been set in early boot
 		 */
 		if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
-			mask = idata->affinity;
+			mask = irq_data_get_affinity_mask(idata);
 		else
 			mask = apic->target_cpus();

--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -264,7 +264,7 @@ static inline int hpet_dev_id(struct irq_domain *domain)

 static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg)
 {
-	hpet_msi_write(data->handler_data, msg);
+	hpet_msi_write(irq_data_get_irq_handler_data(data), msg);
 }

 static struct irq_chip hpet_msi_controller = {
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -111,7 +111,6 @@ static struct apic apic_default = {
 	.send_IPI_all			= default_send_IPI_all,
 	.send_IPI_self			= default_send_IPI_self,

-	.wait_for_init_deassert		= true,
 	.inquire_remote_apic		= default_inquire_remote_apic,

 	.read				= native_apic_mem_read,
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -169,8 +169,7 @@ next:
 			goto next;

 		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask) {
-			if (per_cpu(vector_irq, new_cpu)[vector] >
-			    VECTOR_UNDEFINED)
+			if (!IS_ERR_OR_NULL(per_cpu(vector_irq, new_cpu)[vector]))
 				goto next;
 		}
 		/* Found one! */
@@ -182,7 +181,7 @@ next:
 			   cpumask_intersects(d->old_domain, cpu_online_mask);
 		}
 		for_each_cpu_and(new_cpu, vector_cpumask, cpu_online_mask)
-			per_cpu(vector_irq, new_cpu)[vector] = irq;
+			per_cpu(vector_irq, new_cpu)[vector] = irq_to_desc(irq);
 		d->cfg.vector = vector;
 		cpumask_copy(d->domain, vector_cpumask);
 		err = 0;
@@ -224,15 +223,16 @@ static int assign_irq_vector_policy(int irq, int node,

 static void clear_irq_vector(int irq, struct apic_chip_data *data)
 {
-	int cpu, vector;
+	struct irq_desc *desc;
 	unsigned long flags;
+	int cpu, vector;

 	raw_spin_lock_irqsave(&vector_lock, flags);
 	BUG_ON(!data->cfg.vector);

 	vector = data->cfg.vector;
 	for_each_cpu_and(cpu, data->domain, cpu_online_mask)
-		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+		per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;

 	data->cfg.vector = 0;
 	cpumask_clear(data->domain);
@@ -242,12 +242,13 @@ static void clear_irq_vector(int irq, struct apic_chip_data *data)
 		return;
 	}

+	desc = irq_to_desc(irq);
 	for_each_cpu_and(cpu, data->old_domain, cpu_online_mask) {
 		for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS;
 		     vector++) {
-			if (per_cpu(vector_irq, cpu)[vector] != irq)
+			if (per_cpu(vector_irq, cpu)[vector] != desc)
 				continue;
-			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
 			break;
 		}
 	}
@@ -296,7 +297,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
 	struct irq_alloc_info *info = arg;
 	struct apic_chip_data *data;
 	struct irq_data *irq_data;
-	int i, err;
+	int i, err, node;

 	if (disable_apic)
 		return -ENXIO;
@@ -308,12 +309,13 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
 	for (i = 0; i < nr_irqs; i++) {
 		irq_data = irq_domain_get_irq_data(domain, virq + i);
 		BUG_ON(!irq_data);
+		node = irq_data_get_node(irq_data);
 #ifdef	CONFIG_X86_IO_APIC
 		if (virq + i < nr_legacy_irqs() && legacy_irq_data[virq + i])
 			data = legacy_irq_data[virq + i];
 		else
 #endif
-			data = alloc_apic_chip_data(irq_data->node);
+			data = alloc_apic_chip_data(node);
 		if (!data) {
 			err = -ENOMEM;
 			goto error;
@@ -322,8 +324,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
 		irq_data->chip = &lapic_controller;
 		irq_data->chip_data = data;
 		irq_data->hwirq = virq + i;
-		err = assign_irq_vector_policy(virq + i, irq_data->node, data,
-					       info);
+		err = assign_irq_vector_policy(virq + i, node, data, info);
 		if (err)
 			goto error;
 	}
@@ -403,32 +404,32 @@ int __init arch_early_irq_init(void)
 	return arch_early_ioapic_init();
 }

+/* Initialize vector_irq on a new cpu */
 static void __setup_vector_irq(int cpu)
 {
-	/* Initialize vector_irq on a new cpu */
-	int irq, vector;
 	struct apic_chip_data *data;
+	struct irq_desc *desc;
+	int irq, vector;

 	/* Mark the inuse vectors */
-	for_each_active_irq(irq) {
-		data = apic_chip_data(irq_get_irq_data(irq));
-		if (!data)
-			continue;
+	for_each_irq_desc(irq, desc) {
+		struct irq_data *idata = irq_desc_get_irq_data(desc);

-		if (!cpumask_test_cpu(cpu, data->domain))
+		data = apic_chip_data(idata);
+		if (!data || !cpumask_test_cpu(cpu, data->domain))
 			continue;
 		vector = data->cfg.vector;
-		per_cpu(vector_irq, cpu)[vector] = irq;
+		per_cpu(vector_irq, cpu)[vector] = desc;
 	}
 	/* Mark the free vectors */
 	for (vector = 0; vector < NR_VECTORS; ++vector) {
-		irq = per_cpu(vector_irq, cpu)[vector];
-		if (irq <= VECTOR_UNDEFINED)
+		desc = per_cpu(vector_irq, cpu)[vector];
+		if (IS_ERR_OR_NULL(desc))
 			continue;

-		data = apic_chip_data(irq_get_irq_data(irq));
+		data = apic_chip_data(irq_desc_get_irq_data(desc));
 		if (!cpumask_test_cpu(cpu, data->domain))
-			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED;
+			per_cpu(vector_irq, cpu)[vector] = VECTOR_UNUSED;
 	}
 }

@@ -448,7 +449,7 @@ void setup_vector_irq(int cpu)
 	 * legacy vector to irq mapping:
 	 */
 	for (irq = 0; irq < nr_legacy_irqs(); irq++)
-		per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq;
+		per_cpu(vector_irq, cpu)[ISA_IRQ_VECTOR(irq)] = irq_to_desc(irq);

 	__setup_vector_irq(cpu);
 }
@@ -490,7 +491,8 @@ static int apic_set_affinity(struct irq_data *irq_data,
 	if (err) {
 		struct irq_data *top = irq_get_irq_data(irq);

-		if (assign_irq_vector(irq, data, top->affinity))
+		if (assign_irq_vector(irq, data,
+				      irq_data_get_affinity_mask(top)))
 			pr_err("Failed to recover vector for irq %d\n", irq);
 		return err;
 	}
@@ -538,27 +540,30 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)

 	entering_ack_irq();

+	/* Prevent vectors vanishing under us */
+	raw_spin_lock(&vector_lock);
+
 	me = smp_processor_id();
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
-		int irq;
-		unsigned int irr;
-		struct irq_desc *desc;
 		struct apic_chip_data *data;
+		struct irq_desc *desc;
+		unsigned int irr;

-		irq = __this_cpu_read(vector_irq[vector]);
-
-		if (irq <= VECTOR_UNDEFINED)
+	retry:
+		desc = __this_cpu_read(vector_irq[vector]);
+		if (IS_ERR_OR_NULL(desc))
 			continue;

-		desc = irq_to_desc(irq);
-		if (!desc)
-			continue;
+		if (!raw_spin_trylock(&desc->lock)) {
+			raw_spin_unlock(&vector_lock);
+			cpu_relax();
+			raw_spin_lock(&vector_lock);
+			goto retry;
+		}

-		data = apic_chip_data(&desc->irq_data);
+		data = apic_chip_data(irq_desc_get_irq_data(desc));
 		if (!data)
-			continue;
-
-		raw_spin_lock(&desc->lock);
+			goto unlock;

 		/*
 		 * Check if the irq migration is in progress. If so, we
@@ -583,11 +588,13 @@ asmlinkage __visible void smp_irq_move_cleanup_interrupt(void)
 			apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
 			goto unlock;
 		}
-		__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+		__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
 unlock:
 		raw_spin_unlock(&desc->lock);
 	}

+	raw_spin_unlock(&vector_lock);
+
 	exiting_irq();
 }

--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -182,7 +182,7 @@ update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
 	return notifier_from_errno(err);
 }

-static struct notifier_block __refdata x2apic_cpu_notifier = {
+static struct notifier_block x2apic_cpu_notifier = {
 	.notifier_call = update_clusterinfo,
 };

@@ -272,7 +272,6 @@ static struct apic apic_x2apic_cluster = {
 	.send_IPI_all			= x2apic_send_IPI_all,
 	.send_IPI_self			= x2apic_send_IPI_self,

-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL,

 	.read				= native_apic_msr_read,
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -128,7 +128,6 @@ static struct apic apic_x2apic_phys = {
 	.send_IPI_all			= x2apic_send_IPI_all,
 	.send_IPI_self			= x2apic_send_IPI_self,

-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL,

 	.read				= native_apic_msr_read,
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -248,7 +248,6 @@ static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
 	    APIC_DM_STARTUP;
 	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);

-	atomic_set(&init_deasserted, 1);
 	return 0;
 }

@@ -414,7 +413,6 @@ static struct apic __refdata apic_x2apic_uv_x = {
 	.send_IPI_self			= uv_send_IPI_self,

 	.wakeup_secondary_cpu		= uv_wakeup_secondary,
-	.wait_for_init_deassert		= false,
 	.inquire_remote_apic		= NULL,

 	.read				= native_apic_msr_read,
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -919,7 +919,7 @@ recalc:
 	} else if (jiffies_since_last_check > idle_period) {
 		unsigned int idle_percentage;

-		idle_percentage = stime - last_stime;
+		idle_percentage = cputime_to_jiffies(stime - last_stime);
 		idle_percentage *= 100;
 		idle_percentage /= jiffies_since_last_check;
 		use_apm_idle = (idle_percentage > idle_threshold);
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -1,4 +1,4 @@
-#include <linux/module.h>
+#include <linux/init.h>
 #include <linux/sched.h>
 #include <linux/kthread.h>
 #include <linux/workqueue.h>
@@ -163,6 +163,5 @@ static int start_periodic_check_for_corruption(void)
 	schedule_delayed_work(&bios_check_work, 0);
 	return 0;
 }
-
-module_init(start_periodic_check_for_corruption);
+device_initcall(start_periodic_check_for_corruption);

--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -46,6 +46,8 @@ obj-$(CONFIG_PERF_EVENTS_INTEL_UNCORE)	+= perf_event_intel_uncore.o \
 					   perf_event_intel_uncore_snb.o \
 					   perf_event_intel_uncore_snbep.o \
 					   perf_event_intel_uncore_nhmex.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= perf_event_msr.o
+obj-$(CONFIG_CPU_SUP_AMD)		+= perf_event_msr.o
 endif


--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -11,6 +11,7 @@
 #include <asm/cpu.h>
 #include <asm/smp.h>
 #include <asm/pci-direct.h>
+#include <asm/delay.h>

 #ifdef CONFIG_X86_64
 # include <asm/mmconfig.h>
@@ -114,7 +115,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 		const int K6_BUG_LOOP = 1000000;
 		int n;
 		void (*f_vide)(void);
-		unsigned long d, d2;
+		u64 d, d2;

 		printk(KERN_INFO "AMD K6 stepping B detected - ");

@@ -125,10 +126,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c)

 		n = K6_BUG_LOOP;
 		f_vide = vide;
-		rdtscl(d);
+		d = rdtsc();
 		while (n--)
 			f_vide();
-		rdtscl(d2);
+		d2 = rdtsc();
 		d = d2-d;

 		if (d > 20*K6_BUG_LOOP)
@@ -506,6 +507,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 		/* A random value per boot for bit slice [12:upper_bit) */
 		va_align.bits = get_random_int() & va_align.mask;
 	}
+
+	if (cpu_has(c, X86_FEATURE_MWAITX))
+		use_mwaitx_delay();
 }

 static void early_init_amd(struct cpuinfo_x86 *c)
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -13,6 +13,7 @@
 #include <linux/kgdb.h>
 #include <linux/smp.h>
 #include <linux/io.h>
+#include <linux/syscore_ops.h>

 #include <asm/stackprotector.h>
 #include <asm/perf_event.h>
@@ -1185,10 +1186,10 @@ void syscall_init(void)
 	 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
 	 */
 	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
-	wrmsrl(MSR_LSTAR, entry_SYSCALL_64);
+	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);

 #ifdef CONFIG_IA32_EMULATION
-	wrmsrl(MSR_CSTAR, entry_SYSCALL_compat);
+	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
 	/*
 	 * This only works on Intel CPUs.
 	 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1199,7 +1200,7 @@ void syscall_init(void)
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
-	wrmsrl(MSR_CSTAR, ignore_sysret);
+	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
@@ -1488,3 +1489,20 @@ inline bool __static_cpu_has_safe(u16 bit)
 	return boot_cpu_has(bit);
 }
 EXPORT_SYMBOL_GPL(__static_cpu_has_safe);
+
+static void bsp_resume(void)
+{
+	if (this_cpu->c_bsp_resume)
+		this_cpu->c_bsp_resume(&boot_cpu_data);
+}
+
+static struct syscore_ops cpu_syscore_ops = {
+	.resume		= bsp_resume,
+};
+
+static int __init init_cpu_syscore(void)
+{
+	register_syscore_ops(&cpu_syscore_ops);
+	return 0;
+}
+core_initcall(init_cpu_syscore);
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -13,6 +13,7 @@ struct cpu_dev {
 	void		(*c_init)(struct cpuinfo_x86 *);
 	void		(*c_identify)(struct cpuinfo_x86 *);
 	void		(*c_detect_tlb)(struct cpuinfo_x86 *);
+	void		(*c_bsp_resume)(struct cpuinfo_x86 *);
 	int		c_x86_vendor;
 #ifdef CONFIG_X86_32
 	/* Optional vendor specific routine to obtain the cache size. */
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -371,6 +371,36 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
 	}
 }

+static void init_intel_energy_perf(struct cpuinfo_x86 *c)
+{
+	u64 epb;
+
+	/*
+	 * Initialize MSR_IA32_ENERGY_PERF_BIAS if not already initialized.
+	 * (x86_energy_perf_policy(8) is available to change it at run-time.)
+	 */
+	if (!cpu_has(c, X86_FEATURE_EPB))
+		return;
+
+	rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+	if ((epb & 0xF) != ENERGY_PERF_BIAS_PERFORMANCE)
+		return;
+
+	pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
+	pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
+	epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
+	wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
+}
+
+static void intel_bsp_resume(struct cpuinfo_x86 *c)
+{
+	/*
+	 * MSR_IA32_ENERGY_PERF_BIAS is lost across suspend/resume,
+	 * so reinitialize it properly like during bootup:
+	 */
+	init_intel_energy_perf(c);
+}
+
 static void init_intel(struct cpuinfo_x86 *c)
 {
 	unsigned int l2 = 0;
@@ -478,21 +508,7 @@ static void init_intel(struct cpuinfo_x86 *c)
 	if (cpu_has(c, X86_FEATURE_VMX))
 		detect_vmx_virtcap(c);

-	/*
-	 * Initialize MSR_IA32_ENERGY_PERF_BIAS if BIOS did not.
-	 * x86_energy_perf_policy(8) is available to change it at run-time
-	 */
-	if (cpu_has(c, X86_FEATURE_EPB)) {
-		u64 epb;
-
-		rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
-		if ((epb & 0xF) == ENERGY_PERF_BIAS_PERFORMANCE) {
-			pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n");
-			pr_warn_once("ENERGY_PERF_BIAS: View and update with x86_energy_perf_policy(8)\n");
-			epb = (epb & ~0xF) | ENERGY_PERF_BIAS_NORMAL;
-			wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
-		}
-	}
+	init_intel_energy_perf(c);
 }

 #ifdef CONFIG_X86_32
@@ -747,6 +763,7 @@ static const struct cpu_dev intel_cpu_dev = {
 	.c_detect_tlb	= intel_detect_tlb,
 	.c_early_init   = early_init_intel,
 	.c_init		= init_intel,
+	.c_bsp_resume	= intel_bsp_resume,
 	.c_x86_vendor	= X86_VENDOR_INTEL,
 };

--- a/arch/x86/kernel/cpu/intel_pt.h
+++ b/arch/x86/kernel/cpu/intel_pt.h
@@ -25,32 +25,11 @@
 */
 #define TOPA_PMI_MARGIN 512

-/*
- * Table of Physical Addresses bits
- */
-enum topa_sz {
-	TOPA_4K	= 0,
-	TOPA_8K,
-	TOPA_16K,
-	TOPA_32K,
-	TOPA_64K,
-	TOPA_128K,
-	TOPA_256K,
-	TOPA_512K,
-	TOPA_1MB,
-	TOPA_2MB,
-	TOPA_4MB,
-	TOPA_8MB,
-	TOPA_16MB,
-	TOPA_32MB,
-	TOPA_64MB,
-	TOPA_128MB,
-	TOPA_SZ_END,
-};
+#define TOPA_SHIFT 12

-static inline unsigned int sizes(enum topa_sz tsz)
+static inline unsigned int sizes(unsigned int tsz)
 {
-	return 1 << (tsz + 12);
+	return 1 << (tsz + TOPA_SHIFT);
 };

 struct topa_entry {
@@ -66,20 +45,26 @@ struct topa_entry {
 	u64	rsvd4	: 16;
 };

-#define TOPA_SHIFT 12
-#define PT_CPUID_LEAVES 2
+#define PT_CPUID_LEAVES		2
+#define PT_CPUID_REGS_NUM	4 /* number of regsters (eax, ebx, ecx, edx) */

 enum pt_capabilities {
 	PT_CAP_max_subleaf = 0,
 	PT_CAP_cr3_filtering,
+	PT_CAP_psb_cyc,
+	PT_CAP_mtc,
 	PT_CAP_topa_output,
 	PT_CAP_topa_multiple_entries,
+	PT_CAP_single_range_output,
 	PT_CAP_payloads_lip,
+	PT_CAP_mtc_periods,
+	PT_CAP_cycle_thresholds,
+	PT_CAP_psb_periods,
 };

 struct pt_pmu {
 	struct pmu		pmu;
-	u32			caps[4 * PT_CPUID_LEAVES];
+	u32			caps[PT_CPUID_REGS_NUM * PT_CPUID_LEAVES];
 };

 /**
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -1,4 +1,4 @@
-obj-y				=  mce.o mce-severity.o
+obj-y				=  mce.o mce-severity.o mce-genpool.o

 obj-$(CONFIG_X86_ANCIENT_MCE)	+= winchip.o p5.o
 obj-$(CONFIG_X86_MCE_INTEL)	+= mce_intel.o
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -57,7 +57,6 @@ void apei_mce_report_mem_error(int severity, struct cper_sec_mem_err *mem_err)

 	m.addr = mem_err->physical_addr;
 	mce_log(&m);
-	mce_notify_irq();
 }
 EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);

--- a/arch/x86/kernel/cpu/mcheck/mce-genpool.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-genpool.c
@@ -0,0 +1,99 @@
+/*
+ * MCE event pool management in MCE context
+ *
+ * Copyright (C) 2015 Intel Corp.
+ * Author: Chen, Gong <gong.chen@linux.intel.com>
+ *
+ * This file is licensed under GPLv2.
+ */
+#include <linux/smp.h>
+#include <linux/mm.h>
+#include <linux/genalloc.h>
+#include <linux/llist.h>
+#include "mce-internal.h"
+
+/*
+ * printk() is not safe in MCE context. This is a lock-less memory allocator
+ * used to save error information organized in a lock-less list.
+ *
+ * This memory pool is only to be used to save MCE records in MCE context.
+ * MCE events are rare, so a fixed size memory pool should be enough. Use
+ * 2 pages to save MCE events for now (~80 MCE records at most).
+ */
+#define MCE_POOLSZ	(2 * PAGE_SIZE)
+
+static struct gen_pool *mce_evt_pool;
+static LLIST_HEAD(mce_event_llist);
+static char gen_pool_buf[MCE_POOLSZ];
+
+void mce_gen_pool_process(void)
+{
+	struct llist_node *head;
+	struct mce_evt_llist *node;
+	struct mce *mce;
+
+	head = llist_del_all(&mce_event_llist);
+	if (!head)
+		return;
+
+	head = llist_reverse_order(head);
+	llist_for_each_entry(node, head, llnode) {
+		mce = &node->mce;
+		atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+		gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
+	}
+}
+
+bool mce_gen_pool_empty(void)
+{
+	return llist_empty(&mce_event_llist);
+}
+
+int mce_gen_pool_add(struct mce *mce)
+{
+	struct mce_evt_llist *node;
+
+	if (!mce_evt_pool)
+		return -EINVAL;
+
+	node = (void *)gen_pool_alloc(mce_evt_pool, sizeof(*node));
+	if (!node) {
+		pr_warn_ratelimited("MCE records pool full!\n");
+		return -ENOMEM;
+	}
+
+	memcpy(&node->mce, mce, sizeof(*mce));
+	llist_add(&node->llnode, &mce_event_llist);
+
+	return 0;
+}
+
+static int mce_gen_pool_create(void)
+{
+	struct gen_pool *tmpp;
+	int ret = -ENOMEM;
+
+	tmpp = gen_pool_create(ilog2(sizeof(struct mce_evt_llist)), -1);
+	if (!tmpp)
+		goto out;
+
+	ret = gen_pool_add(tmpp, (unsigned long)gen_pool_buf, MCE_POOLSZ, -1);
+	if (ret) {
+		gen_pool_destroy(tmpp);
+		goto out;
+	}
+
+	mce_evt_pool = tmpp;
+
+out:
+	return ret;
+}
+
+int mce_gen_pool_init(void)
+{
+	/* Just init mce_gen_pool once. */
+	if (mce_evt_pool)
+		return 0;
+
+	return mce_gen_pool_create();
+}
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -13,6 +13,8 @@ enum severity_level {
 	MCE_PANIC_SEVERITY,
 };

+extern struct atomic_notifier_head x86_mce_decoder_chain;
+
 #define ATTR_LEN		16
 #define INITIAL_CHECK_INTERVAL	5 * 60 /* 5 minutes */

@@ -24,6 +26,16 @@ struct mce_bank {
 	char			attrname[ATTR_LEN];	/* attribute name */
 };

+struct mce_evt_llist {
+	struct llist_node llnode;
+	struct mce mce;
+};
+
+void mce_gen_pool_process(void);
+bool mce_gen_pool_empty(void);
+int mce_gen_pool_add(struct mce *mce);
+int mce_gen_pool_init(void);
+
 extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
 struct dentry *mce_get_debugfs_dir(void);

@@ -67,3 +79,5 @@ static inline int apei_clear_mce(u64 record_id)
 	return -EINVAL;
 }
 #endif
+
+void mce_inject_log(struct mce *m);
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -52,11 +52,11 @@

 static DEFINE_MUTEX(mce_chrdev_read_mutex);

-#define rcu_dereference_check_mce(p) \
+#define mce_log_get_idx_check(p) \
 ({ \
-	rcu_lockdep_assert(rcu_read_lock_sched_held() || \
-			   lockdep_is_held(&mce_chrdev_read_mutex), \
-			   "suspicious rcu_dereference_check_mce() usage"); \
+	RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \
+			 !lockdep_is_held(&mce_chrdev_read_mutex), \
+			 "suspicious mce_log_get_idx_check() usage"); \
 	smp_load_acquire(&(p)); \
 })

@@ -110,22 +110,24 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
 */
 mce_banks_t mce_banks_ce_disabled;

-static DEFINE_PER_CPU(struct work_struct, mce_work);
+static struct work_struct mce_work;
+static struct irq_work mce_irq_work;

 static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
+static int mce_usable_address(struct mce *m);

 /*
 * CPU/chipset specific EDAC code can register a notifier call here to print
 * MCE errors in a human-readable form.
 */
-static ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
+ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);

 /* Do initial initialization of a struct mce */
 void mce_setup(struct mce *m)
 {
 	memset(m, 0, sizeof(struct mce));
 	m->cpu = m->extcpu = smp_processor_id();
-	rdtscll(m->tsc);
+	m->tsc = rdtsc();
 	/* We hope get_seconds stays lockless */
 	m->time = get_seconds();
 	m->cpuvendor = boot_cpu_data.x86_vendor;
@@ -157,12 +159,13 @@ void mce_log(struct mce *mce)
 	/* Emit the trace record: */
 	trace_mce_record(mce);

-	atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
+	if (!mce_gen_pool_add(mce))
+		irq_work_queue(&mce_irq_work);

 	mce->finished = 0;
 	wmb();
 	for (;;) {
-		entry = rcu_dereference_check_mce(mcelog.next);
+		entry = mce_log_get_idx_check(mcelog.next);
 		for (;;) {

 			/*
@@ -196,48 +199,23 @@ void mce_log(struct mce *mce)
 	set_bit(0, &mce_need_notify);
 }

-static void drain_mcelog_buffer(void)
+void mce_inject_log(struct mce *m)
 {
-	unsigned int next, i, prev = 0;
-
-	next = ACCESS_ONCE(mcelog.next);
-
-	do {
-		struct mce *m;
-
-		/* drain what was logged during boot */
-		for (i = prev; i < next; i++) {
-			unsigned long start = jiffies;
-			unsigned retries = 1;
-
-			m = &mcelog.entry[i];
-
-			while (!m->finished) {
-				if (time_after_eq(jiffies, start + 2*retries))
-					retries++;
-
-				cpu_relax();
-
-				if (!m->finished && retries >= 4) {
-					pr_err("skipping error being logged currently!\n");
-					break;
-				}
-			}
-			smp_rmb();
-			atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
-		}
-
-		memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m));
-		prev = next;
-		next = cmpxchg(&mcelog.next, prev, 0);
-	} while (next != prev);
+	mutex_lock(&mce_chrdev_read_mutex);
+	mce_log(m);
+	mutex_unlock(&mce_chrdev_read_mutex);
 }
+EXPORT_SYMBOL_GPL(mce_inject_log);

+static struct notifier_block mce_srao_nb;

 void mce_register_decode_chain(struct notifier_block *nb)
 {
+	/* Ensure SRAO notifier has the highest priority in the decode chain. */
+	if (nb != &mce_srao_nb && nb->priority == INT_MAX)
+		nb->priority -= 1;
+
 	atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
-	drain_mcelog_buffer();
 }
 EXPORT_SYMBOL_GPL(mce_register_decode_chain);

@@ -461,61 +439,6 @@ static inline void mce_gather_info(struct mce *m, struct pt_regs *regs)
 	}
 }

-/*
- * Simple lockless ring to communicate PFNs from the exception handler with the
- * process context work function. This is vastly simplified because there's
- * only a single reader and a single writer.
- */
-#define MCE_RING_SIZE 16	/* we use one entry less */
-
-struct mce_ring {
-	unsigned short start;
-	unsigned short end;
-	unsigned long ring[MCE_RING_SIZE];
-};
-static DEFINE_PER_CPU(struct mce_ring, mce_ring);
-
-/* Runs with CPU affinity in workqueue */
-static int mce_ring_empty(void)
-{
-	struct mce_ring *r = this_cpu_ptr(&mce_ring);
-
-	return r->start == r->end;
-}
-
-static int mce_ring_get(unsigned long *pfn)
-{
-	struct mce_ring *r;
-	int ret = 0;
-
-	*pfn = 0;
-	get_cpu();
-	r = this_cpu_ptr(&mce_ring);
-	if (r->start == r->end)
-		goto out;
-	*pfn = r->ring[r->start];
-	r->start = (r->start + 1) % MCE_RING_SIZE;
-	ret = 1;
-out:
-	put_cpu();
-	return ret;
-}
-
-/* Always runs in MCE context with preempt off */
-static int mce_ring_add(unsigned long pfn)
-{
-	struct mce_ring *r = this_cpu_ptr(&mce_ring);
-	unsigned next;
-
-	next = (r->end + 1) % MCE_RING_SIZE;
-	if (next == r->start)
-		return -1;
-	r->ring[r->end] = pfn;
-	wmb();
-	r->end = next;
-	return 0;
-}
-
 int mce_available(struct cpuinfo_x86 *c)
 {
 	if (mca_cfg.disabled)
@@ -525,12 +448,10 @@ int mce_available(struct cpuinfo_x86 *c)

 static void mce_schedule_work(void)
 {
-	if (!mce_ring_empty())
-		schedule_work(this_cpu_ptr(&mce_work));
+	if (!mce_gen_pool_empty() && keventd_up())
+		schedule_work(&mce_work);
 }

-static DEFINE_PER_CPU(struct irq_work, mce_irq_work);
-
 static void mce_irq_work_cb(struct irq_work *entry)
 {
 	mce_notify_irq();
@@ -551,9 +472,30 @@ static void mce_report_event(struct pt_regs *regs)
 		return;
 	}

-	irq_work_queue(this_cpu_ptr(&mce_irq_work));
+	irq_work_queue(&mce_irq_work);
 }

+static int srao_decode_notifier(struct notifier_block *nb, unsigned long val,
+				void *data)
+{
+	struct mce *mce = (struct mce *)data;
+	unsigned long pfn;
+
+	if (!mce)
+		return NOTIFY_DONE;
+
+	if (mce->usable_addr && (mce->severity == MCE_AO_SEVERITY)) {
+		pfn = mce->addr >> PAGE_SHIFT;
+		memory_failure(pfn, MCE_VECTOR, 0);
+	}
+
+	return NOTIFY_OK;
+}
+static struct notifier_block mce_srao_nb = {
+	.notifier_call	= srao_decode_notifier,
+	.priority = INT_MAX,
+};
+
 /*
 * Read ADDR and MISC registers.
 */
@@ -672,8 +614,11 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
 		 */
 		if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) {
 			if (m.status & MCI_STATUS_ADDRV) {
-				mce_ring_add(m.addr >> PAGE_SHIFT);
-				mce_schedule_work();
+				m.severity = severity;
+				m.usable_addr = mce_usable_address(&m);
+
+				if (!mce_gen_pool_add(&m))
+					mce_schedule_work();
 			}
 		}

@@ -1029,7 +974,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 {
 	struct mca_config *cfg = &mca_cfg;
 	struct mce m, *final;
-	enum ctx_state prev_state;
 	int i;
 	int worst = 0;
 	int severity;
@@ -1055,7 +999,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	int flags = MF_ACTION_REQUIRED;
 	int lmce = 0;

-	prev_state = ist_enter(regs);
+	ist_enter(regs);

 	this_cpu_inc(mce_exception_count);

@@ -1143,15 +1087,9 @@ void do_machine_check(struct pt_regs *regs, long error_code)

 		mce_read_aux(&m, i);

-		/*
-		 * Action optional error. Queue address for later processing.
-		 * When the ring overflows we just ignore the AO error.
-		 * RED-PEN add some logging mechanism when
-		 * usable_address or mce_add_ring fails.
-		 * RED-PEN don't ignore overflow for mca_cfg.tolerant == 0
-		 */
-		if (severity == MCE_AO_SEVERITY && mce_usable_address(&m))
-			mce_ring_add(m.addr >> PAGE_SHIFT);
+		/* assuming valid severity level != 0 */
+		m.severity = severity;
+		m.usable_addr = mce_usable_address(&m);

 		mce_log(&m);

@@ -1227,7 +1165,7 @@ out:
 	local_irq_disable();
 	ist_end_non_atomic();
 done:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 EXPORT_SYMBOL_GPL(do_machine_check);

@@ -1247,14 +1185,11 @@ int memory_failure(unsigned long pfn, int vector, int flags)
 /*
 * Action optional processing happens here (picking up
 * from the list of faulting pages that do_machine_check()
- * placed into the "ring").
+ * placed into the genpool).
 */
 static void mce_process_work(struct work_struct *dummy)
 {
-	unsigned long pfn;
-
-	while (mce_ring_get(&pfn))
-		memory_failure(pfn, MCE_VECTOR, 0);
+	mce_gen_pool_process();
 }

 #ifdef CONFIG_X86_MCE_INTEL
@@ -1678,6 +1613,17 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
 	}
 }

+static void __mcheck_cpu_clear_vendor(struct cpuinfo_x86 *c)
+{
+	switch (c->x86_vendor) {
+	case X86_VENDOR_INTEL:
+		mce_intel_feature_clear(c);
+		break;
+	default:
+		break;
+	}
+}
+
 static void mce_start_timer(unsigned int cpu, struct timer_list *t)
 {
 	unsigned long iv = check_interval * HZ;
@@ -1731,13 +1677,36 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c)
 		return;
 	}

+	if (mce_gen_pool_init()) {
+		mca_cfg.disabled = true;
+		pr_emerg("Couldn't allocate MCE records pool!\n");
+		return;
+	}
+
 	machine_check_vector = do_machine_check;

 	__mcheck_cpu_init_generic();
 	__mcheck_cpu_init_vendor(c);
 	__mcheck_cpu_init_timer();
-	INIT_WORK(this_cpu_ptr(&mce_work), mce_process_work);
-	init_irq_work(this_cpu_ptr(&mce_irq_work), &mce_irq_work_cb);
+}
+
+/*
+ * Called for each booted CPU to clear some machine checks opt-ins
+ */
+void mcheck_cpu_clear(struct cpuinfo_x86 *c)
+{
+	if (mca_cfg.disabled)
+		return;
+
+	if (!mce_available(c))
+		return;
+
+	/*
+	 * Possibly to clear general settings generic to x86
+	 * __mcheck_cpu_clear_generic(c);
+	 */
+	__mcheck_cpu_clear_vendor(c);
+
 }

 /*
@@ -1784,7 +1753,7 @@ static void collect_tscs(void *data)
 {
 	unsigned long *cpu_tsc = (unsigned long *)data;

-	rdtscll(cpu_tsc[smp_processor_id()]);
+	cpu_tsc[smp_processor_id()] = rdtsc();
 }

 static int mce_apei_read_done;
@@ -1850,7 +1819,7 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
 			goto out;
 	}

-	next = rcu_dereference_check_mce(mcelog.next);
+	next = mce_log_get_idx_check(mcelog.next);

 	/* Only supports full reads right now */
 	err = -EINVAL;
@@ -2056,8 +2025,12 @@ __setup("mce", mcheck_enable);
 int __init mcheck_init(void)
 {
 	mcheck_intel_therm_init();
+	mce_register_decode_chain(&mce_srao_nb);
 	mcheck_vendor_init_severity();

+	INIT_WORK(&mce_work, mce_process_work);
+	init_irq_work(&mce_irq_work, mce_irq_work_cb);
+
 	return 0;
 }

@@ -2591,5 +2564,20 @@ static int __init mcheck_debugfs_init(void)

 	return 0;
 }
-late_initcall(mcheck_debugfs_init);
+#else
+static int __init mcheck_debugfs_init(void) { return -EINVAL; }
 #endif
+
+static int __init mcheck_late_init(void)
+{
+	mcheck_debugfs_init();
+
+	/*
+	 * Flush out everything that has been logged during early boot, now that
+	 * everything has been initialized (workqueues, decoders, ...).
+	 */
+	mce_schedule_work();
+
+	return 0;
+}
+late_initcall(mcheck_late_init);
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -146,6 +146,27 @@ void mce_intel_hcpu_update(unsigned long cpu)
 	per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
 }

+static void cmci_toggle_interrupt_mode(bool on)
+{
+	unsigned long flags, *owned;
+	int bank;
+	u64 val;
+
+	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
+	owned = this_cpu_ptr(mce_banks_owned);
+	for_each_set_bit(bank, owned, MAX_NR_BANKS) {
+		rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
+
+		if (on)
+			val |= MCI_CTL2_CMCI_EN;
+		else
+			val &= ~MCI_CTL2_CMCI_EN;
+
+		wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
+	}
+	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
+}
+
 unsigned long cmci_intel_adjust_timer(unsigned long interval)
 {
 	if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
@@ -175,7 +196,7 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
 		 */
 		if (!atomic_read(&cmci_storm_on_cpus)) {
 			__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
-			cmci_reenable();
+			cmci_toggle_interrupt_mode(true);
 			cmci_recheck();
 		}
 		return CMCI_POLL_INTERVAL;
@@ -186,22 +207,6 @@ unsigned long cmci_intel_adjust_timer(unsigned long interval)
 	}
 }

-static void cmci_storm_disable_banks(void)
-{
-	unsigned long flags, *owned;
-	int bank;
-	u64 val;
-
-	raw_spin_lock_irqsave(&cmci_discover_lock, flags);
-	owned = this_cpu_ptr(mce_banks_owned);
-	for_each_set_bit(bank, owned, MAX_NR_BANKS) {
-		rdmsrl(MSR_IA32_MCx_CTL2(bank), val);
-		val &= ~MCI_CTL2_CMCI_EN;
-		wrmsrl(MSR_IA32_MCx_CTL2(bank), val);
-	}
-	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags);
-}
-
 static bool cmci_storm_detect(void)
 {
 	unsigned int cnt = __this_cpu_read(cmci_storm_cnt);
@@ -223,7 +228,7 @@ static bool cmci_storm_detect(void)
 	if (cnt <= CMCI_STORM_THRESHOLD)
 		return false;

-	cmci_storm_disable_banks();
+	cmci_toggle_interrupt_mode(false);
 	__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
 	r = atomic_add_return(1, &cmci_storm_on_cpus);
 	mce_timer_kick(CMCI_STORM_INTERVAL);
@@ -246,7 +251,6 @@ static void intel_threshold_interrupt(void)
 		return;

 	machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
-	mce_notify_irq();
 }

 /*
@@ -435,7 +439,7 @@ static void intel_init_cmci(void)
 	cmci_recheck();
 }

-void intel_init_lmce(void)
+static void intel_init_lmce(void)
 {
 	u64 val;

@@ -448,9 +452,26 @@ void intel_init_lmce(void)
 		wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN);
 }

+static void intel_clear_lmce(void)
+{
+	u64 val;
+
+	if (!lmce_supported())
+		return;
+
+	rdmsrl(MSR_IA32_MCG_EXT_CTL, val);
+	val &= ~MCG_EXT_CTL_LMCE_EN;
+	wrmsrl(MSR_IA32_MCG_EXT_CTL, val);
+}
+
 void mce_intel_feature_init(struct cpuinfo_x86 *c)
 {
 	intel_init_thermal(c);
 	intel_init_cmci();
 	intel_init_lmce();
 }
+
+void mce_intel_feature_clear(struct cpuinfo_x86 *c)
+{
+	intel_clear_lmce();
+}
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -19,10 +19,9 @@ int mce_p5_enabled __read_mostly;
 /* Machine check handler for Pentium class Intel CPUs: */
 static void pentium_machine_check(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
 	u32 loaddr, hi, lotype;

-	prev_state = ist_enter(regs);
+	ist_enter(regs);

 	rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
 	rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -39,7 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)

 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }

 /* Set up machine check reporting for processors with Intel style MCE: */
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -15,12 +15,12 @@
 /* Machine check handler for WinChip C6: */
 static void winchip_machine_check(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state = ist_enter(regs);
+	ist_enter(regs);

 	printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);

-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }

 /* Set up machine check reporting on the Winchip C6 series */
--- a/arch/x86/kernel/cpu/microcode/core.c
+++ b/arch/x86/kernel/cpu/microcode/core.c
@@ -377,17 +377,16 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif)
 	return err;
 }

-static int mc_device_remove(struct device *dev, struct subsys_interface *sif)
+static void mc_device_remove(struct device *dev, struct subsys_interface *sif)
 {
 	int cpu = dev->id;

 	if (!cpu_online(cpu))
-		return 0;
+		return;

 	pr_debug("CPU%d removed\n", cpu);
 	microcode_fini_cpu(cpu);
 	sysfs_remove_group(&dev->kobj, &mc_attr_group);
-	return 0;
 }

 static struct subsys_interface mc_cpu_interface = {
@@ -460,7 +459,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
 	return NOTIFY_OK;
 }

-static struct notifier_block __refdata mc_cpu_notifier = {
+static struct notifier_block mc_cpu_notifier = {
 	.notifier_call	= mc_cpu_callback,
 };

--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -390,7 +390,7 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
 }

 #ifdef DEBUG
-static void __ref show_saved_mc(void)
+static void show_saved_mc(void)
 {
 	int i, j;
 	unsigned int sig, pf, rev, total_size, data_size, date;
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -18,6 +18,7 @@
 #include <linux/efi.h>
 #include <linux/interrupt.h>
 #include <linux/irq.h>
+#include <linux/kexec.h>
 #include <asm/processor.h>
 #include <asm/hypervisor.h>
 #include <asm/hyperv.h>
@@ -28,10 +29,14 @@
 #include <asm/i8259.h>
 #include <asm/apic.h>
 #include <asm/timer.h>
+#include <asm/reboot.h>

 struct ms_hyperv_info ms_hyperv;
 EXPORT_SYMBOL_GPL(ms_hyperv);

+static void (*hv_kexec_handler)(void);
+static void (*hv_crash_handler)(struct pt_regs *regs);
+
 #if IS_ENABLED(CONFIG_HYPERV)
 static void (*vmbus_handler)(void);

@@ -67,8 +72,47 @@ void hv_remove_vmbus_irq(void)
 }
 EXPORT_SYMBOL_GPL(hv_setup_vmbus_irq);
 EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq);
+
+void hv_setup_kexec_handler(void (*handler)(void))
+{
+	hv_kexec_handler = handler;
+}
+EXPORT_SYMBOL_GPL(hv_setup_kexec_handler);
+
+void hv_remove_kexec_handler(void)
+{
+	hv_kexec_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_remove_kexec_handler);
+
+void hv_setup_crash_handler(void (*handler)(struct pt_regs *regs))
+{
+	hv_crash_handler = handler;
+}
+EXPORT_SYMBOL_GPL(hv_setup_crash_handler);
+
+void hv_remove_crash_handler(void)
+{
+	hv_crash_handler = NULL;
+}
+EXPORT_SYMBOL_GPL(hv_remove_crash_handler);
 #endif

+static void hv_machine_shutdown(void)
+{
+	if (kexec_in_progress && hv_kexec_handler)
+		hv_kexec_handler();
+	native_machine_shutdown();
+}
+
+static void hv_machine_crash_shutdown(struct pt_regs *regs)
+{
+	if (hv_crash_handler)
+		hv_crash_handler(regs);
+	native_machine_crash_shutdown(regs);
+}
+
+
 static uint32_t  __init ms_hyperv_platform(void)
 {
 	u32 eax;
@@ -114,6 +158,7 @@ static void __init ms_hyperv_init_platform(void)
 	 * Extract the features and hints
 	 */
 	ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+	ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES);
 	ms_hyperv.hints    = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);

 	printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
@@ -141,6 +186,9 @@ static void __init ms_hyperv_init_platform(void)
 	no_timer_check = 1;
 #endif

+	machine_ops.shutdown = hv_machine_shutdown;
+	machine_ops.crash_shutdown = hv_machine_crash_shutdown;
+	mark_tsc_unstable("running on Hyper-V");
 }

 const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -448,7 +448,6 @@ int mtrr_add(unsigned long base, unsigned long size, unsigned int type,
 	return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
 			     increment);
 }
-EXPORT_SYMBOL(mtrr_add);

 /**
 * mtrr_del_page - delete a memory type region
@@ -537,7 +536,6 @@ int mtrr_del(int reg, unsigned long base, unsigned long size)
 		return -EINVAL;
 	return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
 }
-EXPORT_SYMBOL(mtrr_del);

 /**
 * arch_phys_wc_add - add a WC MTRR and handle errors if PAT is unavailable
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -1551,7 +1551,7 @@ static void __init filter_events(struct attribute **attrs)
 }

 /* Merge two pointer arrays */
-static __init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
+__init struct attribute **merge_attr(struct attribute **a, struct attribute **b)
 {
 	struct attribute **new;
 	int j, i;
@@ -2179,6 +2179,7 @@ static unsigned long get_segment_base(unsigned int segment)
 	int idx = segment >> 3;

 	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 		struct ldt_struct *ldt;

 		if (idx > LDT_ENTRIES)
@@ -2190,6 +2191,9 @@ static unsigned long get_segment_base(unsigned int segment)
 			return 0;

 		desc = &ldt->entries[idx];
+#else
+		return 0;
+#endif
 	} else {
 		if (idx > GDT_ENTRIES)
 			return 0;
@@ -2200,7 +2204,7 @@ static unsigned long get_segment_base(unsigned int segment)
 	return get_desc_base(desc);
 }

-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_IA32_EMULATION

 #include <asm/compat.h>

--- a/arch/x86/kernel/cpu/perf_event.h
+++ b/arch/x86/kernel/cpu/perf_event.h
@@ -165,7 +165,7 @@ struct intel_excl_cntrs {
 	unsigned	core_id;	/* per-core: core id */
 };

-#define MAX_LBR_ENTRIES		16
+#define MAX_LBR_ENTRIES		32

 enum {
 	X86_PERF_KFREE_SHARED = 0,
@@ -594,6 +594,7 @@ struct x86_pmu {
 	struct event_constraint *pebs_constraints;
 	void		(*pebs_aliases)(struct perf_event *event);
 	int 		max_pebs_events;
+	unsigned long	free_running_flags;

 	/*
 	 * Intel LBR
@@ -624,6 +625,7 @@ struct x86_pmu {
 struct x86_perf_task_context {
 	u64 lbr_from[MAX_LBR_ENTRIES];
 	u64 lbr_to[MAX_LBR_ENTRIES];
+	u64 lbr_info[MAX_LBR_ENTRIES];
 	int lbr_callstack_users;
 	int lbr_stack_state;
 };
@@ -793,6 +795,8 @@ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip)
 ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event);
 ssize_t intel_event_sysfs_show(char *page, u64 config);

+struct attribute **merge_attr(struct attribute **a, struct attribute **b);
+
 #ifdef CONFIG_CPU_SUP_AMD

 int amd_pmu_init(void);
@@ -808,20 +812,6 @@ static inline int amd_pmu_init(void)

 #ifdef CONFIG_CPU_SUP_INTEL

-static inline bool intel_pmu_needs_lbr_smpl(struct perf_event *event)
-{
-	/* user explicitly requested branch sampling */
-	if (has_branch_stack(event))
-		return true;
-
-	/* implicit branch sampling to correct PEBS skid */
-	if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1 &&
-	    x86_pmu.intel_cap.pebs_format < 2)
-		return true;
-
-	return false;
-}
-
 static inline bool intel_pmu_has_bts(struct perf_event *event)
 {
 	if (event->attr.config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
@@ -873,6 +863,8 @@ extern struct event_constraint intel_ivb_pebs_event_constraints[];

 extern struct event_constraint intel_hsw_pebs_event_constraints[];

+extern struct event_constraint intel_skl_pebs_event_constraints[];
+
 struct event_constraint *intel_pebs_constraints(struct perf_event *event);

 void intel_pmu_pebs_enable(struct perf_event *event);
@@ -911,6 +903,8 @@ void intel_pmu_lbr_init_snb(void);

 void intel_pmu_lbr_init_hsw(void);

+void intel_pmu_lbr_init_skl(void);
+
 int intel_pmu_setup_lbr_filter(struct perf_event *event);

 void intel_pt_interrupt(void);
@@ -934,6 +928,7 @@ static inline int is_ht_workaround_enabled(void)
 {
 	return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED);
 }
+
 #else /* CONFIG_CPU_SUP_INTEL */

 static inline void reserve_ds_buffers(void)
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -177,6 +177,14 @@ static struct event_constraint intel_slm_event_constraints[] __read_mostly =
 	EVENT_CONSTRAINT_END
 };

+struct event_constraint intel_skl_event_constraints[] = {
+	FIXED_EVENT_CONSTRAINT(0x00c0, 0),	/* INST_RETIRED.ANY */
+	FIXED_EVENT_CONSTRAINT(0x003c, 1),	/* CPU_CLK_UNHALTED.CORE */
+	FIXED_EVENT_CONSTRAINT(0x0300, 2),	/* CPU_CLK_UNHALTED.REF */
+	INTEL_UEVENT_CONSTRAINT(0x1c0, 0x2),	/* INST_RETIRED.PREC_DIST */
+	EVENT_CONSTRAINT_END
+};
+
 static struct extra_reg intel_snb_extra_regs[] __read_mostly = {
 	/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
 	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3f807f8fffull, RSP_0),
@@ -193,6 +201,13 @@ static struct extra_reg intel_snbep_extra_regs[] __read_mostly = {
 	EVENT_EXTRA_END
 };

+static struct extra_reg intel_skl_extra_regs[] __read_mostly = {
+	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x3fffff8fffull, RSP_0),
+	INTEL_UEVENT_EXTRA_REG(0x01bb, MSR_OFFCORE_RSP_1, 0x3fffff8fffull, RSP_1),
+	INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd),
+	EVENT_EXTRA_END
+};
+
 EVENT_ATTR_STR(mem-loads,	mem_ld_nhm,	"event=0x0b,umask=0x10,ldlat=3");
 EVENT_ATTR_STR(mem-loads,	mem_ld_snb,	"event=0xcd,umask=0x1,ldlat=3");
 EVENT_ATTR_STR(mem-stores,	mem_st_snb,	"event=0xcd,umask=0x2");
@@ -244,6 +259,200 @@ static u64 intel_pmu_event_map(int hw_event)
 	return intel_perfmon_event_map[hw_event];
 }

+/*
+ * Notes on the events:
+ * - data reads do not include code reads (comparable to earlier tables)
+ * - data counts include speculative execution (except L1 write, dtlb, bpu)
+ * - remote node access includes remote memory, remote cache, remote mmio.
+ * - prefetches are not included in the counts.
+ * - icache miss does not include decoded icache
+ */
+
+#define SKL_DEMAND_DATA_RD		BIT_ULL(0)
+#define SKL_DEMAND_RFO			BIT_ULL(1)
+#define SKL_ANY_RESPONSE		BIT_ULL(16)
+#define SKL_SUPPLIER_NONE		BIT_ULL(17)
+#define SKL_L3_MISS_LOCAL_DRAM		BIT_ULL(26)
+#define SKL_L3_MISS_REMOTE_HOP0_DRAM	BIT_ULL(27)
+#define SKL_L3_MISS_REMOTE_HOP1_DRAM	BIT_ULL(28)
+#define SKL_L3_MISS_REMOTE_HOP2P_DRAM	BIT_ULL(29)
+#define SKL_L3_MISS			(SKL_L3_MISS_LOCAL_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+#define SKL_SPL_HIT			BIT_ULL(30)
+#define SKL_SNOOP_NONE			BIT_ULL(31)
+#define SKL_SNOOP_NOT_NEEDED		BIT_ULL(32)
+#define SKL_SNOOP_MISS			BIT_ULL(33)
+#define SKL_SNOOP_HIT_NO_FWD		BIT_ULL(34)
+#define SKL_SNOOP_HIT_WITH_FWD		BIT_ULL(35)
+#define SKL_SNOOP_HITM			BIT_ULL(36)
+#define SKL_SNOOP_NON_DRAM		BIT_ULL(37)
+#define SKL_ANY_SNOOP			(SKL_SPL_HIT|SKL_SNOOP_NONE| \
+					 SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+					 SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+					 SKL_SNOOP_HITM|SKL_SNOOP_NON_DRAM)
+#define SKL_DEMAND_READ			SKL_DEMAND_DATA_RD
+#define SKL_SNOOP_DRAM			(SKL_SNOOP_NONE| \
+					 SKL_SNOOP_NOT_NEEDED|SKL_SNOOP_MISS| \
+					 SKL_SNOOP_HIT_NO_FWD|SKL_SNOOP_HIT_WITH_FWD| \
+					 SKL_SNOOP_HITM|SKL_SPL_HIT)
+#define SKL_DEMAND_WRITE		SKL_DEMAND_RFO
+#define SKL_LLC_ACCESS			SKL_ANY_RESPONSE
+#define SKL_L3_MISS_REMOTE		(SKL_L3_MISS_REMOTE_HOP0_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP1_DRAM| \
+					 SKL_L3_MISS_REMOTE_HOP2P_DRAM)
+
+static __initconst const u64 skl_hw_cache_event_ids
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_INST_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x151,	/* L1D.REPLACEMENT */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_INST_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(L1I ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x283,	/* ICACHE_64B.MISS */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x81d0,	/* MEM_INST_RETIRED.ALL_LOADS */
+		[ C(RESULT_MISS)   ] = 0x608,	/* DTLB_LOAD_MISSES.WALK_COMPLETED */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x82d0,	/* MEM_INST_RETIRED.ALL_STORES */
+		[ C(RESULT_MISS)   ] = 0x649,	/* DTLB_STORE_MISSES.WALK_COMPLETED */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x2085,	/* ITLB_MISSES.STLB_HIT */
+		[ C(RESULT_MISS)   ] = 0xe85,	/* ITLB_MISSES.WALK_COMPLETED */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0xc4,	/* BR_INST_RETIRED.ALL_BRANCHES */
+		[ C(RESULT_MISS)   ] = 0xc5,	/* BR_MISP_RETIRED.ALL_BRANCHES */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0x1b7,	/* OFFCORE_RESPONSE */
+		[ C(RESULT_MISS)   ] = 0x1b7,	/* OFFCORE_RESPONSE */
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
+static __initconst const u64 skl_hw_cache_extra_regs
+				[PERF_COUNT_HW_CACHE_MAX]
+				[PERF_COUNT_HW_CACHE_OP_MAX]
+				[PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+				       SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+		[ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
+				       SKL_L3_MISS|SKL_ANY_SNOOP|
+				       SKL_SUPPLIER_NONE,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+				       SKL_LLC_ACCESS|SKL_ANY_SNOOP,
+		[ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
+				       SKL_L3_MISS|SKL_ANY_SNOOP|
+				       SKL_SUPPLIER_NONE,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+ [ C(NODE) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = SKL_DEMAND_READ|
+				       SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = SKL_DEMAND_READ|
+				       SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = SKL_DEMAND_WRITE|
+				       SKL_L3_MISS_LOCAL_DRAM|SKL_SNOOP_DRAM,
+		[ C(RESULT_MISS)   ] = SKL_DEMAND_WRITE|
+				       SKL_L3_MISS_REMOTE|SKL_SNOOP_DRAM,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0,
+		[ C(RESULT_MISS)   ] = 0x0,
+	},
+ },
+};
+
 #define SNB_DMND_DATA_RD	(1ULL << 0)
 #define SNB_DMND_RFO		(1ULL << 1)
 #define SNB_DMND_IFETCH		(1ULL << 2)
@@ -1114,7 +1323,7 @@ static struct extra_reg intel_slm_extra_regs[] __read_mostly =
 {
 	/* must define OFFCORE_RSP_X first, see intel_fixup_er() */
 	INTEL_UEVENT_EXTRA_REG(0x01b7, MSR_OFFCORE_RSP_0, 0x768005ffffull, RSP_0),
-	INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x768005ffffull, RSP_1),
+	INTEL_UEVENT_EXTRA_REG(0x02b7, MSR_OFFCORE_RSP_1, 0x368005ffffull, RSP_1),
 	EVENT_EXTRA_END
 };

@@ -1594,6 +1803,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)

 	loops = 0;
 again:
+	intel_pmu_lbr_read();
 	intel_pmu_ack_status(status);
 	if (++loops > 100) {
 		static bool warned = false;
@@ -1608,16 +1818,16 @@ again:

 	inc_irq_stat(apic_perf_irqs);

-	intel_pmu_lbr_read();

 	/*
-	 * CondChgd bit 63 doesn't mean any overflow status. Ignore
-	 * and clear the bit.
+	 * Ignore a range of extra bits in status that do not indicate
+	 * overflow by themselves.
 	 */
-	if (__test_and_clear_bit(63, (unsigned long *)&status)) {
-		if (!status)
-			goto done;
-	}
+	status &= ~(GLOBAL_STATUS_COND_CHG |
+		    GLOBAL_STATUS_ASIF |
+		    GLOBAL_STATUS_LBRS_FROZEN);
+	if (!status)
+		goto done;

 	/*
 	 * PEBS overflow sets bit 62 in the global status register
@@ -1699,18 +1909,22 @@ intel_bts_constraints(struct perf_event *event)
 	return NULL;
 }

-static int intel_alt_er(int idx)
+static int intel_alt_er(int idx, u64 config)
 {
+	int alt_idx;
 	if (!(x86_pmu.flags & PMU_FL_HAS_RSP_1))
 		return idx;

 	if (idx == EXTRA_REG_RSP_0)
-		return EXTRA_REG_RSP_1;
+		alt_idx = EXTRA_REG_RSP_1;

 	if (idx == EXTRA_REG_RSP_1)
-		return EXTRA_REG_RSP_0;
+		alt_idx = EXTRA_REG_RSP_0;

-	return idx;
+	if (config & ~x86_pmu.extra_regs[alt_idx].valid_mask)
+		return idx;
+
+	return alt_idx;
 }

 static void intel_fixup_er(struct perf_event *event, int idx)
@@ -1799,7 +2013,7 @@ again:
 		 */
 		c = NULL;
 	} else {
-		idx = intel_alt_er(idx);
+		idx = intel_alt_er(idx, reg->config);
 		if (idx != reg->idx) {
 			raw_spin_unlock_irqrestore(&era->lock, flags);
 			goto again;
@@ -2253,6 +2467,15 @@ static void intel_pebs_aliases_snb(struct perf_event *event)
 	}
 }

+static unsigned long intel_pmu_free_running_flags(struct perf_event *event)
+{
+	unsigned long flags = x86_pmu.free_running_flags;
+
+	if (event->attr.use_clockid)
+		flags &= ~PERF_SAMPLE_TIME;
+	return flags;
+}
+
 static int intel_pmu_hw_config(struct perf_event *event)
 {
 	int ret = x86_pmu_hw_config(event);
@@ -2263,7 +2486,8 @@ static int intel_pmu_hw_config(struct perf_event *event)
 	if (event->attr.precise_ip) {
 		if (!event->attr.freq) {
 			event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD;
-			if (!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS))
+			if (!(event->attr.sample_type &
+			      ~intel_pmu_free_running_flags(event)))
 				event->hw.flags |= PERF_X86_EVENT_FREERUNNING;
 		}
 		if (x86_pmu.pebs_aliases)
@@ -2694,6 +2918,8 @@ static __initconst const struct x86_pmu core_pmu = {
 	.event_map		= intel_pmu_event_map,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
+	.free_running_flags	= PEBS_FREERUNNING_FLAGS,
+
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32-bit width,
 	 * so we install an artificial 1<<31 period regardless of
@@ -2732,6 +2958,7 @@ static __initconst const struct x86_pmu intel_pmu = {
 	.event_map		= intel_pmu_event_map,
 	.max_events		= ARRAY_SIZE(intel_perfmon_event_map),
 	.apic			= 1,
+	.free_running_flags	= PEBS_FREERUNNING_FLAGS,
 	/*
 	 * Intel PMCs cannot be accessed sanely above 32 bit width,
 	 * so we install an artificial 1<<31 period regardless of
@@ -3269,6 +3496,29 @@ __init int intel_pmu_init(void)
 		pr_cont("Broadwell events, ");
 		break;

+	case 78: /* 14nm Skylake Mobile */
+	case 94: /* 14nm Skylake Desktop */
+		x86_pmu.late_ack = true;
+		memcpy(hw_cache_event_ids, skl_hw_cache_event_ids, sizeof(hw_cache_event_ids));
+		memcpy(hw_cache_extra_regs, skl_hw_cache_extra_regs, sizeof(hw_cache_extra_regs));
+		intel_pmu_lbr_init_skl();
+
+		x86_pmu.event_constraints = intel_skl_event_constraints;
+		x86_pmu.pebs_constraints = intel_skl_pebs_event_constraints;
+		x86_pmu.extra_regs = intel_skl_extra_regs;
+		x86_pmu.pebs_aliases = intel_pebs_aliases_snb;
+		/* all extra regs are per-cpu when HT is on */
+		x86_pmu.flags |= PMU_FL_HAS_RSP_1;
+		x86_pmu.flags |= PMU_FL_NO_HT_SHARING;
+
+		x86_pmu.hw_config = hsw_hw_config;
+		x86_pmu.get_event_constraints = hsw_get_event_constraints;
+		x86_pmu.cpu_events = hsw_events_attrs;
+		WARN_ON(!x86_pmu.format_attrs);
+		x86_pmu.cpu_events = hsw_events_attrs;
+		pr_cont("Skylake events, ");
+		break;
+
 	default:
 		switch (x86_pmu.version) {
 		case 1:
@@ -3338,7 +3588,7 @@ __init int intel_pmu_init(void)
 	 */
 	if (x86_pmu.extra_regs) {
 		for (er = x86_pmu.extra_regs; er->msr; er++) {
-			er->extra_msr_access = check_msr(er->msr, 0x1ffUL);
+			er->extra_msr_access = check_msr(er->msr, 0x11UL);
 			/* Disable LBR select mapping */
 			if ((er->idx == EXTRA_REG_LBR) && !er->extra_msr_access)
 				x86_pmu.lbr_sel_map = NULL;
--- a/arch/x86/kernel/cpu/perf_event_intel_bts.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c
@@ -62,9 +62,6 @@ struct bts_buffer {

 struct pmu bts_pmu;

-void intel_pmu_enable_bts(u64 config);
-void intel_pmu_disable_bts(void);
-
 static size_t buf_size(struct page *page)
 {
 	return 1 << (PAGE_SHIFT + page_private(page));
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -224,6 +224,19 @@ union hsw_tsx_tuning {

 #define PEBS_HSW_TSX_FLAGS	0xff00000000ULL

+/* Same as HSW, plus TSC */
+
+struct pebs_record_skl {
+	u64 flags, ip;
+	u64 ax, bx, cx, dx;
+	u64 si, di, bp, sp;
+	u64 r8,  r9,  r10, r11;
+	u64 r12, r13, r14, r15;
+	u64 status, dla, dse, lat;
+	u64 real_ip, tsx_tuning;
+	u64 tsc;
+};
+
 void init_debug_store_on_cpu(int cpu)
 {
 	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -675,6 +688,28 @@ struct event_constraint intel_hsw_pebs_event_constraints[] = {
 	EVENT_CONSTRAINT_END
 };

+struct event_constraint intel_skl_pebs_event_constraints[] = {
+	INTEL_FLAGS_UEVENT_CONSTRAINT(0x1c0, 0x2),	/* INST_RETIRED.PREC_DIST */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
+	/* UOPS_RETIRED.ALL, inv=1, cmask=16 (cycles:p). */
+	INTEL_FLAGS_EVENT_CONSTRAINT(0x108001c2, 0xf),
+	INTEL_PLD_CONSTRAINT(0x1cd, 0xf),		      /* MEM_TRANS_RETIRED.* */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x11d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x12d0, 0xf), /* MEM_INST_RETIRED.STLB_MISS_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x21d0, 0xf), /* MEM_INST_RETIRED.LOCK_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x22d0, 0xf), /* MEM_INST_RETIRED.LOCK_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x41d0, 0xf), /* MEM_INST_RETIRED.SPLIT_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x42d0, 0xf), /* MEM_INST_RETIRED.SPLIT_STORES */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(0x81d0, 0xf), /* MEM_INST_RETIRED.ALL_LOADS */
+	INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(0x82d0, 0xf), /* MEM_INST_RETIRED.ALL_STORES */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd1, 0xf),    /* MEM_LOAD_RETIRED.* */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd2, 0xf),    /* MEM_LOAD_L3_HIT_RETIRED.* */
+	INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(0xd3, 0xf),    /* MEM_LOAD_L3_MISS_RETIRED.* */
+	/* Allow all events as PEBS with no flags */
+	INTEL_ALL_EVENT_CONSTRAINT(0, 0xf),
+	EVENT_CONSTRAINT_END
+};
+
 struct event_constraint *intel_pebs_constraints(struct perf_event *event)
 {
 	struct event_constraint *c;
@@ -754,6 +789,11 @@ void intel_pmu_pebs_disable(struct perf_event *event)
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	struct hw_perf_event *hwc = &event->hw;
 	struct debug_store *ds = cpuc->ds;
+	bool large_pebs = ds->pebs_interrupt_threshold >
+		ds->pebs_buffer_base + x86_pmu.pebs_record_size;
+
+	if (large_pebs)
+		intel_pmu_drain_pebs_buffer();

 	cpuc->pebs_enabled &= ~(1ULL << hwc->idx);

@@ -762,12 +802,8 @@ void intel_pmu_pebs_disable(struct perf_event *event)
 	else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST)
 		cpuc->pebs_enabled &= ~(1ULL << 63);

-	if (ds->pebs_interrupt_threshold >
-	    ds->pebs_buffer_base + x86_pmu.pebs_record_size) {
-		intel_pmu_drain_pebs_buffer();
-		if (!pebs_is_enabled(cpuc))
-			perf_sched_cb_dec(event->ctx->pmu);
-	}
+	if (large_pebs && !pebs_is_enabled(cpuc))
+		perf_sched_cb_dec(event->ctx->pmu);

 	if (cpuc->enabled)
 		wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
@@ -885,7 +921,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
 	return 0;
 }

-static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
+static inline u64 intel_hsw_weight(struct pebs_record_skl *pebs)
 {
 	if (pebs->tsx_tuning) {
 		union hsw_tsx_tuning tsx = { .value = pebs->tsx_tuning };
@@ -894,7 +930,7 @@ static inline u64 intel_hsw_weight(struct pebs_record_hsw *pebs)
 	return 0;
 }

-static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs)
+static inline u64 intel_hsw_transaction(struct pebs_record_skl *pebs)
 {
 	u64 txn = (pebs->tsx_tuning & PEBS_HSW_TSX_FLAGS) >> 32;

@@ -918,7 +954,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
 	 * unconditionally access the 'extra' entries.
 	 */
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
-	struct pebs_record_hsw *pebs = __pebs;
+	struct pebs_record_skl *pebs = __pebs;
 	u64 sample_type;
 	int fll, fst, dsrc;
 	int fl = event->hw.flags;
@@ -1016,6 +1052,16 @@ static void setup_pebs_sample_data(struct perf_event *event,
 			data->txn = intel_hsw_transaction(pebs);
 	}

+	/*
+	 * v3 supplies an accurate time stamp, so we use that
+	 * for the time stamp.
+	 *
+	 * We can only do this for the default trace clock.
+	 */
+	if (x86_pmu.intel_cap.pebs_format >= 3 &&
+		event->attr.use_clockid == 0)
+		data->time = native_sched_clock_from_tsc(pebs->tsc);
+
 	if (has_branch_stack(event))
 		data->br_stack = &cpuc->lbr_stack;
 }
@@ -1142,6 +1188,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)

 	for (at = base; at < top; at += x86_pmu.pebs_record_size) {
 		struct pebs_record_nhm *p = at;
+		u64 pebs_status;

 		/* PEBS v3 has accurate status bits */
 		if (x86_pmu.intel_cap.pebs_format >= 3) {
@@ -1152,12 +1199,17 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 			continue;
 		}

-		bit = find_first_bit((unsigned long *)&p->status,
+		pebs_status = p->status & cpuc->pebs_enabled;
+		pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1;
+
+		bit = find_first_bit((unsigned long *)&pebs_status,
 					x86_pmu.max_pebs_events);
-		if (bit >= x86_pmu.max_pebs_events)
-			continue;
-		if (!test_bit(bit, cpuc->active_mask))
+		if (WARN(bit >= x86_pmu.max_pebs_events,
+			 "PEBS record without PEBS event! status=%Lx pebs_enabled=%Lx active_mask=%Lx",
+			 (unsigned long long)p->status, (unsigned long long)cpuc->pebs_enabled,
+			 *(unsigned long long *)cpuc->active_mask))
 			continue;
+
 		/*
 		 * The PEBS hardware does not deal well with the situation
 		 * when events happen near to each other and multiple bits
@@ -1172,27 +1224,21 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
 		 * one, and it's not possible to reconstruct all events
 		 * that caused the PEBS record. It's called collision.
 		 * If collision happened, the record will be dropped.
-		 *
 		 */
-		if (p->status != (1 << bit)) {
-			u64 pebs_status;
-
-			/* slow path */
-			pebs_status = p->status & cpuc->pebs_enabled;
-			pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1;
-			if (pebs_status != (1 << bit)) {
-				for_each_set_bit(i, (unsigned long *)&pebs_status,
-						 MAX_PEBS_EVENTS)
-					error[i]++;
-				continue;
-			}
+		if (p->status != (1ULL << bit)) {
+			for_each_set_bit(i, (unsigned long *)&pebs_status,
+					 x86_pmu.max_pebs_events)
+				error[i]++;
+			continue;
 		}
+
 		counts[bit]++;
 	}

 	for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) {
 		if ((counts[bit] == 0) && (error[bit] == 0))
 			continue;
+
 		event = cpuc->events[bit];
 		WARN_ON_ONCE(!event);
 		WARN_ON_ONCE(!event->attr.precise_ip);
@@ -1245,6 +1291,14 @@ void __init intel_ds_init(void)
 			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
 			break;

+		case 3:
+			pr_cont("PEBS fmt3%c, ", pebs_type);
+			x86_pmu.pebs_record_size =
+						sizeof(struct pebs_record_skl);
+			x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+			x86_pmu.free_running_flags |= PERF_SAMPLE_TIME;
+			break;
+
 		default:
 			printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
 			x86_pmu.pebs = 0;
--- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -13,7 +13,8 @@ enum {
 	LBR_FORMAT_EIP		= 0x02,
 	LBR_FORMAT_EIP_FLAGS	= 0x03,
 	LBR_FORMAT_EIP_FLAGS2	= 0x04,
-	LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_EIP_FLAGS2,
+	LBR_FORMAT_INFO		= 0x05,
+	LBR_FORMAT_MAX_KNOWN    = LBR_FORMAT_INFO,
 };

 static enum {
@@ -139,6 +140,13 @@ static void __intel_pmu_lbr_enable(bool pmi)
 	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
 	u64 debugctl, lbr_select = 0, orig_debugctl;

+	/*
+	 * No need to unfreeze manually, as v4 can do that as part
+	 * of the GLOBAL_STATUS ack.
+	 */
+	if (pmi && x86_pmu.version >= 4)
+		return;
+
 	/*
 	 * No need to reprogram LBR_SELECT in a PMI, as it
 	 * did not change.
@@ -186,6 +194,8 @@ static void intel_pmu_lbr_reset_64(void)
 	for (i = 0; i < x86_pmu.lbr_nr; i++) {
 		wrmsrl(x86_pmu.lbr_from + i, 0);
 		wrmsrl(x86_pmu.lbr_to   + i, 0);
+		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+			wrmsrl(MSR_LBR_INFO_0 + i, 0);
 	}
 }

@@ -230,10 +240,12 @@ static void __intel_pmu_lbr_restore(struct x86_perf_task_context *task_ctx)

 	mask = x86_pmu.lbr_nr - 1;
 	tos = intel_pmu_lbr_tos();
-	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+	for (i = 0; i < tos; i++) {
 		lbr_idx = (tos - i) & mask;
 		wrmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
 		wrmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+			wrmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
 	}
 	task_ctx->lbr_stack_state = LBR_NONE;
 }
@@ -251,10 +263,12 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx)

 	mask = x86_pmu.lbr_nr - 1;
 	tos = intel_pmu_lbr_tos();
-	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+	for (i = 0; i < tos; i++) {
 		lbr_idx = (tos - i) & mask;
 		rdmsrl(x86_pmu.lbr_from + lbr_idx, task_ctx->lbr_from[i]);
 		rdmsrl(x86_pmu.lbr_to + lbr_idx, task_ctx->lbr_to[i]);
+		if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_INFO)
+			rdmsrl(MSR_LBR_INFO_0 + lbr_idx, task_ctx->lbr_info[i]);
 	}
 	task_ctx->lbr_stack_state = LBR_VALID;
 }
@@ -411,16 +425,31 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 	u64 tos = intel_pmu_lbr_tos();
 	int i;
 	int out = 0;
+	int num = x86_pmu.lbr_nr;

-	for (i = 0; i < x86_pmu.lbr_nr; i++) {
+	if (cpuc->lbr_sel->config & LBR_CALL_STACK)
+		num = tos;
+
+	for (i = 0; i < num; i++) {
 		unsigned long lbr_idx = (tos - i) & mask;
 		u64 from, to, mis = 0, pred = 0, in_tx = 0, abort = 0;
 		int skip = 0;
+		u16 cycles = 0;
 		int lbr_flags = lbr_desc[lbr_format];

 		rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
 		rdmsrl(x86_pmu.lbr_to   + lbr_idx, to);

+		if (lbr_format == LBR_FORMAT_INFO) {
+			u64 info;
+
+			rdmsrl(MSR_LBR_INFO_0 + lbr_idx, info);
+			mis = !!(info & LBR_INFO_MISPRED);
+			pred = !mis;
+			in_tx = !!(info & LBR_INFO_IN_TX);
+			abort = !!(info & LBR_INFO_ABORT);
+			cycles = (info & LBR_INFO_CYCLES);
+		}
 		if (lbr_flags & LBR_EIP_FLAGS) {
 			mis = !!(from & LBR_FROM_FLAG_MISPRED);
 			pred = !mis;
@@ -450,6 +479,7 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
 		cpuc->lbr_entries[out].predicted = pred;
 		cpuc->lbr_entries[out].in_tx	 = in_tx;
 		cpuc->lbr_entries[out].abort	 = abort;
+		cpuc->lbr_entries[out].cycles	 = cycles;
 		cpuc->lbr_entries[out].reserved	 = 0;
 		out++;
 	}
@@ -947,6 +977,26 @@ void intel_pmu_lbr_init_hsw(void)
 	pr_cont("16-deep LBR, ");
 }

+/* skylake */
+__init void intel_pmu_lbr_init_skl(void)
+{
+	x86_pmu.lbr_nr	 = 32;
+	x86_pmu.lbr_tos	 = MSR_LBR_TOS;
+	x86_pmu.lbr_from = MSR_LBR_NHM_FROM;
+	x86_pmu.lbr_to   = MSR_LBR_NHM_TO;
+
+	x86_pmu.lbr_sel_mask = LBR_SEL_MASK;
+	x86_pmu.lbr_sel_map  = hsw_lbr_sel_map;
+
+	/*
+	 * SW branch filter usage:
+	 * - support syscall, sysret capture.
+	 *   That requires LBR_FAR but that means far
+	 *   jmp need to be filtered out
+	 */
+	pr_cont("32-deep LBR, ");
+}
+
 /* atom */
 void __init intel_pmu_lbr_init_atom(void)
 {
--- a/arch/x86/kernel/cpu/perf_event_intel_pt.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c
@@ -65,15 +65,21 @@ static struct pt_cap_desc {
 } pt_caps[] = {
 	PT_CAP(max_subleaf,		0, CR_EAX, 0xffffffff),
 	PT_CAP(cr3_filtering,		0, CR_EBX, BIT(0)),
+	PT_CAP(psb_cyc,			0, CR_EBX, BIT(1)),
+	PT_CAP(mtc,			0, CR_EBX, BIT(3)),
 	PT_CAP(topa_output,		0, CR_ECX, BIT(0)),
 	PT_CAP(topa_multiple_entries,	0, CR_ECX, BIT(1)),
+	PT_CAP(single_range_output,	0, CR_ECX, BIT(2)),
 	PT_CAP(payloads_lip,		0, CR_ECX, BIT(31)),
+	PT_CAP(mtc_periods,		1, CR_EAX, 0xffff0000),
+	PT_CAP(cycle_thresholds,	1, CR_EBX, 0xffff),
+	PT_CAP(psb_periods,		1, CR_EBX, 0xffff0000),
 };

 static u32 pt_cap_get(enum pt_capabilities cap)
 {
 	struct pt_cap_desc *cd = &pt_caps[cap];
-	u32 c = pt_pmu.caps[cd->leaf * 4 + cd->reg];
+	u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
 	unsigned int shift = __ffs(cd->mask);

 	return (c & cd->mask) >> shift;
@@ -94,12 +100,22 @@ static struct attribute_group pt_cap_group = {
 	.name	= "caps",
 };

+PMU_FORMAT_ATTR(cyc,		"config:1"	);
+PMU_FORMAT_ATTR(mtc,		"config:9"	);
 PMU_FORMAT_ATTR(tsc,		"config:10"	);
 PMU_FORMAT_ATTR(noretcomp,	"config:11"	);
+PMU_FORMAT_ATTR(mtc_period,	"config:14-17"	);
+PMU_FORMAT_ATTR(cyc_thresh,	"config:19-22"	);
+PMU_FORMAT_ATTR(psb_period,	"config:24-27"	);

 static struct attribute *pt_formats_attr[] = {
+	&format_attr_cyc.attr,
+	&format_attr_mtc.attr,
 	&format_attr_tsc.attr,
 	&format_attr_noretcomp.attr,
+	&format_attr_mtc_period.attr,
+	&format_attr_cyc_thresh.attr,
+	&format_attr_psb_period.attr,
 	NULL,
 };

@@ -129,10 +145,10 @@ static int __init pt_pmu_hw_init(void)

 	for (i = 0; i < PT_CPUID_LEAVES; i++) {
 		cpuid_count(20, i,
-			    &pt_pmu.caps[CR_EAX + i*4],
-			    &pt_pmu.caps[CR_EBX + i*4],
-			    &pt_pmu.caps[CR_ECX + i*4],
-			    &pt_pmu.caps[CR_EDX + i*4]);
+			    &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM],
+			    &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM],
+			    &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM],
+			    &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);
 	}

 	ret = -ENOMEM;
@@ -170,15 +186,65 @@ fail:
 	return ret;
 }

-#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC)
+#define RTIT_CTL_CYC_PSB (RTIT_CTL_CYCLEACC	| \
+			  RTIT_CTL_CYC_THRESH	| \
+			  RTIT_CTL_PSB_FREQ)
+
+#define RTIT_CTL_MTC	(RTIT_CTL_MTC_EN	| \
+			 RTIT_CTL_MTC_RANGE)
+
+#define PT_CONFIG_MASK (RTIT_CTL_TSC_EN		| \
+			RTIT_CTL_DISRETC	| \
+			RTIT_CTL_CYC_PSB	| \
+			RTIT_CTL_MTC)

 static bool pt_event_valid(struct perf_event *event)
 {
 	u64 config = event->attr.config;
+	u64 allowed, requested;

 	if ((config & PT_CONFIG_MASK) != config)
 		return false;

+	if (config & RTIT_CTL_CYC_PSB) {
+		if (!pt_cap_get(PT_CAP_psb_cyc))
+			return false;
+
+		allowed = pt_cap_get(PT_CAP_psb_periods);
+		requested = (config & RTIT_CTL_PSB_FREQ) >>
+			RTIT_CTL_PSB_FREQ_OFFSET;
+		if (requested && (!(allowed & BIT(requested))))
+			return false;
+
+		allowed = pt_cap_get(PT_CAP_cycle_thresholds);
+		requested = (config & RTIT_CTL_CYC_THRESH) >>
+			RTIT_CTL_CYC_THRESH_OFFSET;
+		if (requested && (!(allowed & BIT(requested))))
+			return false;
+	}
+
+	if (config & RTIT_CTL_MTC) {
+		/*
+		 * In the unlikely case that CPUID lists valid mtc periods,
+		 * but not the mtc capability, drop out here.
+		 *
+		 * Spec says that setting mtc period bits while mtc bit in
+		 * CPUID is 0 will #GP, so better safe than sorry.
+		 */
+		if (!pt_cap_get(PT_CAP_mtc))
+			return false;
+
+		allowed = pt_cap_get(PT_CAP_mtc_periods);
+		if (!allowed)
+			return false;
+
+		requested = (config & RTIT_CTL_MTC_RANGE) >>
+			RTIT_CTL_MTC_RANGE_OFFSET;
+
+		if (!(allowed & BIT(requested)))
+			return false;
+	}
+
 	return true;
 }

@@ -191,6 +257,11 @@ static void pt_config(struct perf_event *event)
 {
 	u64 reg;

+	if (!event->hw.itrace_started) {
+		event->hw.itrace_started = 1;
+		wrmsrl(MSR_IA32_RTIT_STATUS, 0);
+	}
+
 	reg = RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;

 	if (!event->attr.exclude_kernel)
@@ -910,7 +981,6 @@ void intel_pt_interrupt(void)

 		pt_config_buffer(buf->cur->table, buf->cur_idx,
 				 buf->output_off);
-		wrmsrl(MSR_IA32_RTIT_STATUS, 0);
 		pt_config(event);
 	}
 }
@@ -934,7 +1004,6 @@ static void pt_event_start(struct perf_event *event, int mode)

 	pt_config_buffer(buf->cur->table, buf->cur_idx,
 			 buf->output_off);
-	wrmsrl(MSR_IA32_RTIT_STATUS, 0);
 	pt_config(event);
 }

--- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c
@@ -86,6 +86,10 @@ static const char *rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
 			 1<<RAPL_IDX_RAM_NRG_STAT|\
 			 1<<RAPL_IDX_PP1_NRG_STAT)

+/* Knights Landing has PKG, RAM */
+#define RAPL_IDX_KNL	(1<<RAPL_IDX_PKG_NRG_STAT|\
+			 1<<RAPL_IDX_RAM_NRG_STAT)
+
 /*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
@@ -486,6 +490,18 @@ static struct attribute *rapl_events_hsw_attr[] = {
 	NULL,
 };

+static struct attribute *rapl_events_knl_attr[] = {
+	EVENT_PTR(rapl_pkg),
+	EVENT_PTR(rapl_ram),
+
+	EVENT_PTR(rapl_pkg_unit),
+	EVENT_PTR(rapl_ram_unit),
+
+	EVENT_PTR(rapl_pkg_scale),
+	EVENT_PTR(rapl_ram_scale),
+	NULL,
+};
+
 static struct attribute_group rapl_pmu_events_group = {
 	.name = "events",
 	.attrs = NULL, /* patched at runtime */
@@ -730,6 +746,10 @@ static int __init rapl_pmu_init(void)
 		rapl_cntr_mask = RAPL_IDX_SRV;
 		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
 		break;
+	case 87: /* Knights Landing */
+		rapl_add_quirk(rapl_hsw_server_quirk);
+		rapl_cntr_mask = RAPL_IDX_KNL;
+		rapl_pmu_events_group.attrs = rapl_events_knl_attr;

 	default:
 		/* unsupported */
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c
@@ -911,6 +911,9 @@ static int __init uncore_pci_init(void)
 	case 63: /* Haswell-EP */
 		ret = hswep_uncore_pci_init();
 		break;
+	case 86: /* BDX-DE */
+		ret = bdx_uncore_pci_init();
+		break;
 	case 42: /* Sandy Bridge */
 		ret = snb_uncore_pci_init();
 		break;
@@ -1209,6 +1212,11 @@ static int __init uncore_cpu_init(void)
 		break;
 	case 42: /* Sandy Bridge */
 	case 58: /* Ivy Bridge */
+	case 60: /* Haswell */
+	case 69: /* Haswell */
+	case 70: /* Haswell */
+	case 61: /* Broadwell */
+	case 71: /* Broadwell */
 		snb_uncore_cpu_init();
 		break;
 	case 45: /* Sandy Bridge-EP */
@@ -1224,6 +1232,9 @@ static int __init uncore_cpu_init(void)
 	case 63: /* Haswell-EP */
 		hswep_uncore_cpu_init();
 		break;
+	case 86: /* BDX-DE */
+		bdx_uncore_cpu_init();
+		break;
 	default:
 		return 0;
 	}
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h
@@ -336,6 +336,8 @@ int ivbep_uncore_pci_init(void);
 void ivbep_uncore_cpu_init(void);
 int hswep_uncore_pci_init(void);
 void hswep_uncore_cpu_init(void);
+int bdx_uncore_pci_init(void);
+void bdx_uncore_cpu_init(void);

 /* perf_event_intel_uncore_nhmex.c */
 void nhmex_uncore_cpu_init(void);
--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c
@@ -45,6 +45,11 @@
 #define SNB_UNC_CBO_0_PER_CTR0                  0x706
 #define SNB_UNC_CBO_MSR_OFFSET                  0x10

+/* SNB ARB register */
+#define SNB_UNC_ARB_PER_CTR0			0x3b0
+#define SNB_UNC_ARB_PERFEVTSEL0			0x3b2
+#define SNB_UNC_ARB_MSR_OFFSET			0x10
+
 /* NHM global control register */
 #define NHM_UNC_PERF_GLOBAL_CTL                 0x391
 #define NHM_UNC_FIXED_CTR                       0x394
@@ -115,7 +120,7 @@ static struct intel_uncore_ops snb_uncore_msr_ops = {
 	.read_counter	= uncore_msr_read_counter,
 };

-static struct event_constraint snb_uncore_cbox_constraints[] = {
+static struct event_constraint snb_uncore_arb_constraints[] = {
 	UNCORE_EVENT_CONSTRAINT(0x80, 0x1),
 	UNCORE_EVENT_CONSTRAINT(0x83, 0x1),
 	EVENT_CONSTRAINT_END
@@ -134,14 +139,28 @@ static struct intel_uncore_type snb_uncore_cbox = {
 	.single_fixed	= 1,
 	.event_mask	= SNB_UNC_RAW_EVENT_MASK,
 	.msr_offset	= SNB_UNC_CBO_MSR_OFFSET,
-	.constraints	= snb_uncore_cbox_constraints,
 	.ops		= &snb_uncore_msr_ops,
 	.format_group	= &snb_uncore_format_group,
 	.event_descs	= snb_uncore_events,
 };

+static struct intel_uncore_type snb_uncore_arb = {
+	.name		= "arb",
+	.num_counters   = 2,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 44,
+	.perf_ctr	= SNB_UNC_ARB_PER_CTR0,
+	.event_ctl	= SNB_UNC_ARB_PERFEVTSEL0,
+	.event_mask	= SNB_UNC_RAW_EVENT_MASK,
+	.msr_offset	= SNB_UNC_ARB_MSR_OFFSET,
+	.constraints	= snb_uncore_arb_constraints,
+	.ops		= &snb_uncore_msr_ops,
+	.format_group	= &snb_uncore_format_group,
+};
+
 static struct intel_uncore_type *snb_msr_uncores[] = {
 	&snb_uncore_cbox,
+	&snb_uncore_arb,
 	NULL,
 };

--- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snbep.c
@@ -2215,7 +2215,7 @@ static struct intel_uncore_type *hswep_pci_uncores[] = {
 	NULL,
 };

-static DEFINE_PCI_DEVICE_TABLE(hswep_uncore_pci_ids) = {
+static const struct pci_device_id hswep_uncore_pci_ids[] = {
 	{ /* Home Agent 0 */
 		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2f30),
 		.driver_data = UNCORE_PCI_DEV_DATA(HSWEP_PCI_UNCORE_HA, 0),
@@ -2321,3 +2321,167 @@ int hswep_uncore_pci_init(void)
 	return 0;
 }
 /* end of Haswell-EP uncore support */
+
+/* BDX-DE uncore support */
+
+static struct intel_uncore_type bdx_uncore_ubox = {
+	.name			= "ubox",
+	.num_counters		= 2,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.fixed_ctr_bits		= 48,
+	.perf_ctr		= HSWEP_U_MSR_PMON_CTR0,
+	.event_ctl		= HSWEP_U_MSR_PMON_CTL0,
+	.event_mask		= SNBEP_U_MSR_PMON_RAW_EVENT_MASK,
+	.fixed_ctr		= HSWEP_U_MSR_PMON_UCLK_FIXED_CTR,
+	.fixed_ctl		= HSWEP_U_MSR_PMON_UCLK_FIXED_CTL,
+	.num_shared_regs	= 1,
+	.ops			= &ivbep_uncore_msr_ops,
+	.format_group		= &ivbep_uncore_ubox_format_group,
+};
+
+static struct event_constraint bdx_uncore_cbox_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x09, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x11, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x36, 0x1),
+	EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_cbox = {
+	.name			= "cbox",
+	.num_counters		= 4,
+	.num_boxes		= 8,
+	.perf_ctr_bits		= 48,
+	.event_ctl		= HSWEP_C0_MSR_PMON_CTL0,
+	.perf_ctr		= HSWEP_C0_MSR_PMON_CTR0,
+	.event_mask		= SNBEP_CBO_MSR_PMON_RAW_EVENT_MASK,
+	.box_ctl		= HSWEP_C0_MSR_PMON_BOX_CTL,
+	.msr_offset		= HSWEP_CBO_MSR_OFFSET,
+	.num_shared_regs	= 1,
+	.constraints		= bdx_uncore_cbox_constraints,
+	.ops			= &hswep_uncore_cbox_ops,
+	.format_group		= &hswep_uncore_cbox_format_group,
+};
+
+static struct intel_uncore_type *bdx_msr_uncores[] = {
+	&bdx_uncore_ubox,
+	&bdx_uncore_cbox,
+	&hswep_uncore_pcu,
+	NULL,
+};
+
+void bdx_uncore_cpu_init(void)
+{
+	if (bdx_uncore_cbox.num_boxes > boot_cpu_data.x86_max_cores)
+		bdx_uncore_cbox.num_boxes = boot_cpu_data.x86_max_cores;
+	uncore_msr_uncores = bdx_msr_uncores;
+}
+
+static struct intel_uncore_type bdx_uncore_ha = {
+	.name		= "ha",
+	.num_counters   = 4,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 48,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_imc = {
+	.name		= "imc",
+	.num_counters   = 5,
+	.num_boxes	= 2,
+	.perf_ctr_bits	= 48,
+	.fixed_ctr_bits	= 48,
+	.fixed_ctr	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTR,
+	.fixed_ctl	= SNBEP_MC_CHy_PCI_PMON_FIXED_CTL,
+	.event_descs	= hswep_uncore_imc_events,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+static struct intel_uncore_type bdx_uncore_irp = {
+	.name			= "irp",
+	.num_counters		= 4,
+	.num_boxes		= 1,
+	.perf_ctr_bits		= 48,
+	.event_mask		= SNBEP_PMON_RAW_EVENT_MASK,
+	.box_ctl		= SNBEP_PCI_PMON_BOX_CTL,
+	.ops			= &hswep_uncore_irp_ops,
+	.format_group		= &snbep_uncore_format_group,
+};
+
+
+static struct event_constraint bdx_uncore_r2pcie_constraints[] = {
+	UNCORE_EVENT_CONSTRAINT(0x10, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x11, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x13, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x23, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x25, 0x1),
+	UNCORE_EVENT_CONSTRAINT(0x26, 0x3),
+	UNCORE_EVENT_CONSTRAINT(0x2d, 0x3),
+	EVENT_CONSTRAINT_END
+};
+
+static struct intel_uncore_type bdx_uncore_r2pcie = {
+	.name		= "r2pcie",
+	.num_counters   = 4,
+	.num_boxes	= 1,
+	.perf_ctr_bits	= 48,
+	.constraints	= bdx_uncore_r2pcie_constraints,
+	SNBEP_UNCORE_PCI_COMMON_INIT(),
+};
+
+enum {
+	BDX_PCI_UNCORE_HA,
+	BDX_PCI_UNCORE_IMC,
+	BDX_PCI_UNCORE_IRP,
+	BDX_PCI_UNCORE_R2PCIE,
+};
+
+static struct intel_uncore_type *bdx_pci_uncores[] = {
+	[BDX_PCI_UNCORE_HA]	= &bdx_uncore_ha,
+	[BDX_PCI_UNCORE_IMC]	= &bdx_uncore_imc,
+	[BDX_PCI_UNCORE_IRP]	= &bdx_uncore_irp,
+	[BDX_PCI_UNCORE_R2PCIE]	= &bdx_uncore_r2pcie,
+	NULL,
+};
+
+static DEFINE_PCI_DEVICE_TABLE(bdx_uncore_pci_ids) = {
+	{ /* Home Agent 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f30),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_HA, 0),
+	},
+	{ /* MC0 Channel 0 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb0),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 0),
+	},
+	{ /* MC0 Channel 1 */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6fb1),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IMC, 1),
+	},
+	{ /* IRP */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f39),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_IRP, 0),
+	},
+	{ /* R2PCIe */
+		PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x6f34),
+		.driver_data = UNCORE_PCI_DEV_DATA(BDX_PCI_UNCORE_R2PCIE, 0),
+	},
+	{ /* end: all zeroes */ }
+};
+
+static struct pci_driver bdx_uncore_pci_driver = {
+	.name		= "bdx_uncore",
+	.id_table	= bdx_uncore_pci_ids,
+};
+
+int bdx_uncore_pci_init(void)
+{
+	int ret = snbep_pci2phy_map_init(0x6f1e);
+
+	if (ret)
+		return ret;
+	uncore_pci_uncores = bdx_pci_uncores;
+	uncore_pci_driver = &bdx_uncore_pci_driver;
+	return 0;
+}
+
+/* end of BDX-DE uncore support */
--- a/arch/x86/kernel/cpu/perf_event_msr.c
+++ b/arch/x86/kernel/cpu/perf_event_msr.c
@@ -0,0 +1,242 @@
+#include <linux/perf_event.h>
+
+enum perf_msr_id {
+	PERF_MSR_TSC			= 0,
+	PERF_MSR_APERF			= 1,
+	PERF_MSR_MPERF			= 2,
+	PERF_MSR_PPERF			= 3,
+	PERF_MSR_SMI			= 4,
+
+	PERF_MSR_EVENT_MAX,
+};
+
+bool test_aperfmperf(int idx)
+{
+	return boot_cpu_has(X86_FEATURE_APERFMPERF);
+}
+
+bool test_intel(int idx)
+{
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ||
+	    boot_cpu_data.x86 != 6)
+		return false;
+
+	switch (boot_cpu_data.x86_model) {
+	case 30: /* 45nm Nehalem    */
+	case 26: /* 45nm Nehalem-EP */
+	case 46: /* 45nm Nehalem-EX */
+
+	case 37: /* 32nm Westmere    */
+	case 44: /* 32nm Westmere-EP */
+	case 47: /* 32nm Westmere-EX */
+
+	case 42: /* 32nm SandyBridge         */
+	case 45: /* 32nm SandyBridge-E/EN/EP */
+
+	case 58: /* 22nm IvyBridge       */
+	case 62: /* 22nm IvyBridge-EP/EX */
+
+	case 60: /* 22nm Haswell Core */
+	case 63: /* 22nm Haswell Server */
+	case 69: /* 22nm Haswell ULT */
+	case 70: /* 22nm Haswell + GT3e (Intel Iris Pro graphics) */
+
+	case 61: /* 14nm Broadwell Core-M */
+	case 86: /* 14nm Broadwell Xeon D */
+	case 71: /* 14nm Broadwell + GT3e (Intel Iris Pro graphics) */
+	case 79: /* 14nm Broadwell Server */
+
+	case 55: /* 22nm Atom "Silvermont"                */
+	case 77: /* 22nm Atom "Silvermont Avoton/Rangely" */
+	case 76: /* 14nm Atom "Airmont"                   */
+		if (idx == PERF_MSR_SMI)
+			return true;
+		break;
+
+	case 78: /* 14nm Skylake Mobile */
+	case 94: /* 14nm Skylake Desktop */
+		if (idx == PERF_MSR_SMI || idx == PERF_MSR_PPERF)
+			return true;
+		break;
+	}
+
+	return false;
+}
+
+struct perf_msr {
+	u64	msr;
+	struct	perf_pmu_events_attr *attr;
+	bool	(*test)(int idx);
+};
+
+PMU_EVENT_ATTR_STRING(tsc,   evattr_tsc,   "event=0x00");
+PMU_EVENT_ATTR_STRING(aperf, evattr_aperf, "event=0x01");
+PMU_EVENT_ATTR_STRING(mperf, evattr_mperf, "event=0x02");
+PMU_EVENT_ATTR_STRING(pperf, evattr_pperf, "event=0x03");
+PMU_EVENT_ATTR_STRING(smi,   evattr_smi,   "event=0x04");
+
+static struct perf_msr msr[] = {
+	[PERF_MSR_TSC]   = { 0,			&evattr_tsc,	NULL,		 },
+	[PERF_MSR_APERF] = { MSR_IA32_APERF,	&evattr_aperf,	test_aperfmperf, },
+	[PERF_MSR_MPERF] = { MSR_IA32_MPERF,	&evattr_mperf,	test_aperfmperf, },
+	[PERF_MSR_PPERF] = { MSR_PPERF,		&evattr_pperf,	test_intel,	 },
+	[PERF_MSR_SMI]   = { MSR_SMI_COUNT,	&evattr_smi,	test_intel,	 },
+};
+
+static struct attribute *events_attrs[PERF_MSR_EVENT_MAX + 1] = {
+	NULL,
+};
+
+static struct attribute_group events_attr_group = {
+	.name = "events",
+	.attrs = events_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-63");
+static struct attribute *format_attrs[] = {
+	&format_attr_event.attr,
+	NULL,
+};
+static struct attribute_group format_attr_group = {
+	.name = "format",
+	.attrs = format_attrs,
+};
+
+static const struct attribute_group *attr_groups[] = {
+	&events_attr_group,
+	&format_attr_group,
+	NULL,
+};
+
+static int msr_event_init(struct perf_event *event)
+{
+	u64 cfg = event->attr.config;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	if (cfg >= PERF_MSR_EVENT_MAX)
+		return -EINVAL;
+
+	/* unsupported modes and filters */
+	if (event->attr.exclude_user   ||
+	    event->attr.exclude_kernel ||
+	    event->attr.exclude_hv     ||
+	    event->attr.exclude_idle   ||
+	    event->attr.exclude_host   ||
+	    event->attr.exclude_guest  ||
+	    event->attr.sample_period) /* no sampling */
+		return -EINVAL;
+
+	if (!msr[cfg].attr)
+		return -EINVAL;
+
+	event->hw.idx = -1;
+	event->hw.event_base = msr[cfg].msr;
+	event->hw.config = cfg;
+
+	return 0;
+}
+
+static inline u64 msr_read_counter(struct perf_event *event)
+{
+	u64 now;
+
+	if (event->hw.event_base)
+		rdmsrl(event->hw.event_base, now);
+	else
+		rdtscll(now);
+
+	return now;
+}
+static void msr_event_update(struct perf_event *event)
+{
+	u64 prev, now;
+	s64 delta;
+
+	/* Careful, an NMI might modify the previous event value. */
+again:
+	prev = local64_read(&event->hw.prev_count);
+	now = msr_read_counter(event);
+
+	if (local64_cmpxchg(&event->hw.prev_count, prev, now) != prev)
+		goto again;
+
+	delta = now - prev;
+	if (unlikely(event->hw.event_base == MSR_SMI_COUNT)) {
+		delta <<= 32;
+		delta >>= 32; /* sign extend */
+	}
+	local64_add(now - prev, &event->count);
+}
+
+static void msr_event_start(struct perf_event *event, int flags)
+{
+	u64 now;
+
+	now = msr_read_counter(event);
+	local64_set(&event->hw.prev_count, now);
+}
+
+static void msr_event_stop(struct perf_event *event, int flags)
+{
+	msr_event_update(event);
+}
+
+static void msr_event_del(struct perf_event *event, int flags)
+{
+	msr_event_stop(event, PERF_EF_UPDATE);
+}
+
+static int msr_event_add(struct perf_event *event, int flags)
+{
+	if (flags & PERF_EF_START)
+		msr_event_start(event, flags);
+
+	return 0;
+}
+
+static struct pmu pmu_msr = {
+	.task_ctx_nr	= perf_sw_context,
+	.attr_groups	= attr_groups,
+	.event_init	= msr_event_init,
+	.add		= msr_event_add,
+	.del		= msr_event_del,
+	.start		= msr_event_start,
+	.stop		= msr_event_stop,
+	.read		= msr_event_update,
+	.capabilities	= PERF_PMU_CAP_NO_INTERRUPT,
+};
+
+static int __init msr_init(void)
+{
+	int i, j = 0;
+
+	if (!boot_cpu_has(X86_FEATURE_TSC)) {
+		pr_cont("no MSR PMU driver.\n");
+		return 0;
+	}
+
+	/* Probe the MSRs. */
+	for (i = PERF_MSR_TSC + 1; i < PERF_MSR_EVENT_MAX; i++) {
+		u64 val;
+
+		/*
+		 * Virt sucks arse; you cannot tell if a R/O MSR is present :/
+		 */
+		if (!msr[i].test(i) || rdmsrl_safe(msr[i].msr, &val))
+			msr[i].attr = NULL;
+	}
+
+	/* List remaining MSRs in the sysfs attrs. */
+	for (i = 0; i < PERF_MSR_EVENT_MAX; i++) {
+		if (msr[i].attr)
+			events_attrs[j++] = &msr[i].attr->attr.attr;
+	}
+	events_attrs[j] = NULL;
+
+	perf_pmu_register(&pmu_msr, "msr", -1);
+
+	return 0;
+}
+device_initcall(msr_init);
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -170,7 +170,7 @@ static int cpuid_class_cpu_callback(struct notifier_block *nfb,
 	return notifier_from_errno(err);
 }

-static struct notifier_block __refdata cpuid_class_cpu_notifier =
+static struct notifier_block cpuid_class_cpu_notifier =
 {
 	.notifier_call = cpuid_class_cpu_callback,
 };
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -110,7 +110,7 @@ static void init_espfix_random(void)
 	 */
 	if (!arch_get_random_long(&rand)) {
 		/* The constant is an arbitrary large prime */
-		rdtscll(rand);
+		rand = rdtsc();
 		rand *= 0xc345c6b72fd16123UL;
 	}

--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -226,22 +226,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
 */
 static unsigned long hpet_freq;

-static void hpet_legacy_set_mode(enum clock_event_mode mode,
-			  struct clock_event_device *evt);
-static int hpet_legacy_next_event(unsigned long delta,
-			   struct clock_event_device *evt);
-
-/*
- * The hpet clock event device
- */
-static struct clock_event_device hpet_clockevent = {
-	.name		= "hpet",
-	.features	= CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
-	.set_mode	= hpet_legacy_set_mode,
-	.set_next_event = hpet_legacy_next_event,
-	.irq		= 0,
-	.rating		= 50,
-};
+static struct clock_event_device hpet_clockevent;

 static void hpet_stop_counter(void)
 {
@@ -306,64 +291,74 @@ static void hpet_legacy_clockevent_register(void)
 	printk(KERN_DEBUG "hpet clockevent registered\n");
 }

-static void hpet_set_mode(enum clock_event_mode mode,
-			  struct clock_event_device *evt, int timer)
+static int hpet_set_periodic(struct clock_event_device *evt, int timer)
 {
 	unsigned int cfg, cmp, now;
 	uint64_t delta;

-	switch (mode) {
-	case CLOCK_EVT_MODE_PERIODIC:
-		hpet_stop_counter();
-		delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
-		delta >>= evt->shift;
-		now = hpet_readl(HPET_COUNTER);
-		cmp = now + (unsigned int) delta;
-		cfg = hpet_readl(HPET_Tn_CFG(timer));
-		cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
-		       HPET_TN_SETVAL | HPET_TN_32BIT;
-		hpet_writel(cfg, HPET_Tn_CFG(timer));
-		hpet_writel(cmp, HPET_Tn_CMP(timer));
-		udelay(1);
-		/*
-		 * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
-		 * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
-		 * bit is automatically cleared after the first write.
-		 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
-		 * Publication # 24674)
-		 */
-		hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
-		hpet_start_counter();
-		hpet_print_config();
-		break;
+	hpet_stop_counter();
+	delta = ((uint64_t)(NSEC_PER_SEC / HZ)) * evt->mult;
+	delta >>= evt->shift;
+	now = hpet_readl(HPET_COUNTER);
+	cmp = now + (unsigned int)delta;
+	cfg = hpet_readl(HPET_Tn_CFG(timer));
+	cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL |
+	       HPET_TN_32BIT;
+	hpet_writel(cfg, HPET_Tn_CFG(timer));
+	hpet_writel(cmp, HPET_Tn_CMP(timer));
+	udelay(1);
+	/*
+	 * HPET on AMD 81xx needs a second write (with HPET_TN_SETVAL
+	 * cleared) to T0_CMP to set the period. The HPET_TN_SETVAL
+	 * bit is automatically cleared after the first write.
+	 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
+	 * Publication # 24674)
+	 */
+	hpet_writel((unsigned int)delta, HPET_Tn_CMP(timer));
+	hpet_start_counter();
+	hpet_print_config();

-	case CLOCK_EVT_MODE_ONESHOT:
-		cfg = hpet_readl(HPET_Tn_CFG(timer));
-		cfg &= ~HPET_TN_PERIODIC;
-		cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
-		hpet_writel(cfg, HPET_Tn_CFG(timer));
-		break;
+	return 0;
+}

-	case CLOCK_EVT_MODE_UNUSED:
-	case CLOCK_EVT_MODE_SHUTDOWN:
-		cfg = hpet_readl(HPET_Tn_CFG(timer));
-		cfg &= ~HPET_TN_ENABLE;
-		hpet_writel(cfg, HPET_Tn_CFG(timer));
-		break;
+static int hpet_set_oneshot(struct clock_event_device *evt, int timer)
+{
+	unsigned int cfg;

-	case CLOCK_EVT_MODE_RESUME:
-		if (timer == 0) {
-			hpet_enable_legacy_int();
-		} else {
-			struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-			irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
-			disable_irq(hdev->irq);
-			irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
-			enable_irq(hdev->irq);
-		}
-		hpet_print_config();
-		break;
+	cfg = hpet_readl(HPET_Tn_CFG(timer));
+	cfg &= ~HPET_TN_PERIODIC;
+	cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
+	hpet_writel(cfg, HPET_Tn_CFG(timer));
+
+	return 0;
+}
+
+static int hpet_shutdown(struct clock_event_device *evt, int timer)
+{
+	unsigned int cfg;
+
+	cfg = hpet_readl(HPET_Tn_CFG(timer));
+	cfg &= ~HPET_TN_ENABLE;
+	hpet_writel(cfg, HPET_Tn_CFG(timer));
+
+	return 0;
+}
+
+static int hpet_resume(struct clock_event_device *evt, int timer)
+{
+	if (!timer) {
+		hpet_enable_legacy_int();
+	} else {
+		struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+		irq_domain_activate_irq(irq_get_irq_data(hdev->irq));
+		disable_irq(hdev->irq);
+		irq_set_affinity(hdev->irq, cpumask_of(hdev->cpu));
+		enable_irq(hdev->irq);
 	}
+	hpet_print_config();
+
+	return 0;
 }

 static int hpet_next_event(unsigned long delta,
@@ -403,10 +398,24 @@ static int hpet_next_event(unsigned long delta,
 	return res < HPET_MIN_CYCLES ? -ETIME : 0;
 }

-static void hpet_legacy_set_mode(enum clock_event_mode mode,
-			struct clock_event_device *evt)
+static int hpet_legacy_shutdown(struct clock_event_device *evt)
 {
-	hpet_set_mode(mode, evt, 0);
+	return hpet_shutdown(evt, 0);
+}
+
+static int hpet_legacy_set_oneshot(struct clock_event_device *evt)
+{
+	return hpet_set_oneshot(evt, 0);
+}
+
+static int hpet_legacy_set_periodic(struct clock_event_device *evt)
+{
+	return hpet_set_periodic(evt, 0);
+}
+
+static int hpet_legacy_resume(struct clock_event_device *evt)
+{
+	return hpet_resume(evt, 0);
 }

 static int hpet_legacy_next_event(unsigned long delta,
@@ -415,6 +424,22 @@ static int hpet_legacy_next_event(unsigned long delta,
 	return hpet_next_event(delta, evt, 0);
 }

+/*
+ * The hpet clock event device
+ */
+static struct clock_event_device hpet_clockevent = {
+	.name			= "hpet",
+	.features		= CLOCK_EVT_FEAT_PERIODIC |
+				  CLOCK_EVT_FEAT_ONESHOT,
+	.set_state_periodic	= hpet_legacy_set_periodic,
+	.set_state_oneshot	= hpet_legacy_set_oneshot,
+	.set_state_shutdown	= hpet_legacy_shutdown,
+	.tick_resume		= hpet_legacy_resume,
+	.set_next_event		= hpet_legacy_next_event,
+	.irq			= 0,
+	.rating			= 50,
+};
+
 /*
 * HPET MSI Support
 */
@@ -426,7 +451,7 @@ static struct irq_domain *hpet_domain;

 void hpet_msi_unmask(struct irq_data *data)
 {
-	struct hpet_dev *hdev = data->handler_data;
+	struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
 	unsigned int cfg;

 	/* unmask it */
@@ -437,7 +462,7 @@ void hpet_msi_unmask(struct irq_data *data)

 void hpet_msi_mask(struct irq_data *data)
 {
-	struct hpet_dev *hdev = data->handler_data;
+	struct hpet_dev *hdev = irq_data_get_irq_handler_data(data);
 	unsigned int cfg;

 	/* mask it */
@@ -459,11 +484,32 @@ void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
 	msg->address_hi = 0;
 }

-static void hpet_msi_set_mode(enum clock_event_mode mode,
-				struct clock_event_device *evt)
+static int hpet_msi_shutdown(struct clock_event_device *evt)
 {
 	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
-	hpet_set_mode(mode, evt, hdev->num);
+
+	return hpet_shutdown(evt, hdev->num);
+}
+
+static int hpet_msi_set_oneshot(struct clock_event_device *evt)
+{
+	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+	return hpet_set_oneshot(evt, hdev->num);
+}
+
+static int hpet_msi_set_periodic(struct clock_event_device *evt)
+{
+	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+	return hpet_set_periodic(evt, hdev->num);
+}
+
+static int hpet_msi_resume(struct clock_event_device *evt)
+{
+	struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
+
+	return hpet_resume(evt, hdev->num);
 }

 static int hpet_msi_next_event(unsigned long delta,
@@ -523,10 +569,14 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)

 	evt->rating = 110;
 	evt->features = CLOCK_EVT_FEAT_ONESHOT;
-	if (hdev->flags & HPET_DEV_PERI_CAP)
+	if (hdev->flags & HPET_DEV_PERI_CAP) {
 		evt->features |= CLOCK_EVT_FEAT_PERIODIC;
+		evt->set_state_periodic = hpet_msi_set_periodic;
+	}

-	evt->set_mode = hpet_msi_set_mode;
+	evt->set_state_shutdown = hpet_msi_shutdown;
+	evt->set_state_oneshot = hpet_msi_set_oneshot;
+	evt->tick_resume = hpet_msi_resume;
 	evt->set_next_event = hpet_msi_next_event;
 	evt->cpumask = cpumask_of(hdev->cpu);

@@ -735,7 +785,7 @@ static int hpet_clocksource_register(void)

 	/* Verify whether hpet counter works */
 	t1 = hpet_readl(HPET_COUNTER);
-	rdtscll(start);
+	start = rdtsc();

 	/*
 	 * We don't know the TSC frequency yet, but waiting for
@@ -745,7 +795,7 @@ static int hpet_clocksource_register(void)
 	 */
 	do {
 		rep_nop();
-		rdtscll(now);
+		now = rdtsc();
 	} while ((now - start) < 200000UL);

 	if (t1 == hpet_readl(HPET_COUNTER)) {
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -32,6 +32,7 @@
 #include <linux/irqflags.h>
 #include <linux/notifier.h>
 #include <linux/kallsyms.h>
+#include <linux/kprobes.h>
 #include <linux/percpu.h>
 #include <linux/kdebug.h>
 #include <linux/kernel.h>
@@ -179,7 +180,11 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp)
 	va = info->address;
 	len = bp->attr.bp_len;

-	return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
+	/*
+	 * We don't need to worry about va + len - 1 overflowing:
+	 * we already require that va is aligned to a multiple of len.
+	 */
+	return (va >= TASK_SIZE_MAX) || ((va + len - 1) >= TASK_SIZE_MAX);
 }

 int arch_bp_generic_fields(int x86_len, int x86_type,
@@ -243,6 +248,20 @@ static int arch_build_bp_info(struct perf_event *bp)
 		info->type = X86_BREAKPOINT_RW;
 		break;
 	case HW_BREAKPOINT_X:
+		/*
+		 * We don't allow kernel breakpoints in places that are not
+		 * acceptable for kprobes.  On non-kprobes kernels, we don't
+		 * allow kernel breakpoints at all.
+		 */
+		if (bp->attr.bp_addr >= TASK_SIZE_MAX) {
+#ifdef CONFIG_KPROBES
+			if (within_kprobe_blacklist(bp->attr.bp_addr))
+				return -EINVAL;
+#else
+			return -EINVAL;
+#endif
+		}
+
 		info->type = X86_BREAKPOINT_EXECUTE;
 		/*
 		 * x86 inst breakpoints need to have a specific undefined len.
@@ -276,8 +295,18 @@ static int arch_build_bp_info(struct perf_event *bp)
 		break;
 #endif
 	default:
+		/* AMD range breakpoint */
 		if (!is_power_of_2(bp->attr.bp_len))
 			return -EINVAL;
+		if (bp->attr.bp_addr & (bp->attr.bp_len - 1))
+			return -EINVAL;
+		/*
+		 * It's impossible to use a range breakpoint to fake out
+		 * user vs kernel detection because bp_len - 1 can't
+		 * have the high bit set.  If we ever allow range instruction
+		 * breakpoints, then we'll have to check for kprobe-blacklisted
+		 * addresses anywhere in the range.
+		 */
 		if (!cpu_has_bpext)
 			return -EOPNOTSUPP;
 		info->mask = bp->attr.bp_len - 1;
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -34,7 +34,7 @@ static int __init init_pit_clocksource(void)
 	  * - when local APIC timer is active (PIT is switched off)
 	  */
 	if (num_possible_cpus() > 1 || is_hpet_enabled() ||
-	    i8253_clockevent.mode != CLOCK_EVT_MODE_PERIODIC)
+	    !clockevent_state_periodic(&i8253_clockevent))
 		return 0;

 	return clocksource_i8253_init();
--- a/arch/x86/kernel/iosf_mbi.c
+++ b/arch/x86/kernel/iosf_mbi.c
@@ -1,328 +0,0 @@
-/*
- * IOSF-SB MailBox Interface Driver
- * Copyright (c) 2013, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- *
- * The IOSF-SB is a fabric bus available on Atom based SOC's that uses a
- * mailbox interface (MBI) to communicate with mutiple devices. This
- * driver implements access to this interface for those platforms that can
- * enumerate the device using PCI.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/spinlock.h>
-#include <linux/pci.h>
-#include <linux/debugfs.h>
-#include <linux/capability.h>
-
-#include <asm/iosf_mbi.h>
-
-#define PCI_DEVICE_ID_BAYTRAIL		0x0F00
-#define PCI_DEVICE_ID_BRASWELL		0x2280
-#define PCI_DEVICE_ID_QUARK_X1000	0x0958
-
-static DEFINE_SPINLOCK(iosf_mbi_lock);
-
-static inline u32 iosf_mbi_form_mcr(u8 op, u8 port, u8 offset)
-{
-	return (op << 24) | (port << 16) | (offset << 8) | MBI_ENABLE;
-}
-
-static struct pci_dev *mbi_pdev;	/* one mbi device */
-
-static int iosf_mbi_pci_read_mdr(u32 mcrx, u32 mcr, u32 *mdr)
-{
-	int result;
-
-	if (!mbi_pdev)
-		return -ENODEV;
-
-	if (mcrx) {
-		result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
-						mcrx);
-		if (result < 0)
-			goto fail_read;
-	}
-
-	result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
-	if (result < 0)
-		goto fail_read;
-
-	result = pci_read_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
-	if (result < 0)
-		goto fail_read;
-
-	return 0;
-
-fail_read:
-	dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
-	return result;
-}
-
-static int iosf_mbi_pci_write_mdr(u32 mcrx, u32 mcr, u32 mdr)
-{
-	int result;
-
-	if (!mbi_pdev)
-		return -ENODEV;
-
-	result = pci_write_config_dword(mbi_pdev, MBI_MDR_OFFSET, mdr);
-	if (result < 0)
-		goto fail_write;
-
-	if (mcrx) {
-		result = pci_write_config_dword(mbi_pdev, MBI_MCRX_OFFSET,
-						mcrx);
-		if (result < 0)
-			goto fail_write;
-	}
-
-	result = pci_write_config_dword(mbi_pdev, MBI_MCR_OFFSET, mcr);
-	if (result < 0)
-		goto fail_write;
-
-	return 0;
-
-fail_write:
-	dev_err(&mbi_pdev->dev, "PCI config access failed with %d\n", result);
-	return result;
-}
-
-int iosf_mbi_read(u8 port, u8 opcode, u32 offset, u32 *mdr)
-{
-	u32 mcr, mcrx;
-	unsigned long flags;
-	int ret;
-
-	/*Access to the GFX unit is handled by GPU code */
-	if (port == BT_MBI_UNIT_GFX) {
-		WARN_ON(1);
-		return -EPERM;
-	}
-
-	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
-	mcrx = offset & MBI_MASK_HI;
-
-	spin_lock_irqsave(&iosf_mbi_lock, flags);
-	ret = iosf_mbi_pci_read_mdr(mcrx, mcr, mdr);
-	spin_unlock_irqrestore(&iosf_mbi_lock, flags);
-
-	return ret;
-}
-EXPORT_SYMBOL(iosf_mbi_read);
-
-int iosf_mbi_write(u8 port, u8 opcode, u32 offset, u32 mdr)
-{
-	u32 mcr, mcrx;
-	unsigned long flags;
-	int ret;
-
-	/*Access to the GFX unit is handled by GPU code */
-	if (port == BT_MBI_UNIT_GFX) {
-		WARN_ON(1);
-		return -EPERM;
-	}
-
-	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
-	mcrx = offset & MBI_MASK_HI;
-
-	spin_lock_irqsave(&iosf_mbi_lock, flags);
-	ret = iosf_mbi_pci_write_mdr(mcrx, mcr, mdr);
-	spin_unlock_irqrestore(&iosf_mbi_lock, flags);
-
-	return ret;
-}
-EXPORT_SYMBOL(iosf_mbi_write);
-
-int iosf_mbi_modify(u8 port, u8 opcode, u32 offset, u32 mdr, u32 mask)
-{
-	u32 mcr, mcrx;
-	u32 value;
-	unsigned long flags;
-	int ret;
-
-	/*Access to the GFX unit is handled by GPU code */
-	if (port == BT_MBI_UNIT_GFX) {
-		WARN_ON(1);
-		return -EPERM;
-	}
-
-	mcr = iosf_mbi_form_mcr(opcode, port, offset & MBI_MASK_LO);
-	mcrx = offset & MBI_MASK_HI;
-
-	spin_lock_irqsave(&iosf_mbi_lock, flags);
-
-	/* Read current mdr value */
-	ret = iosf_mbi_pci_read_mdr(mcrx, mcr & MBI_RD_MASK, &value);
-	if (ret < 0) {
-		spin_unlock_irqrestore(&iosf_mbi_lock, flags);
-		return ret;
-	}
-
-	/* Apply mask */
-	value &= ~mask;
-	mdr &= mask;
-	value |= mdr;
-
-	/* Write back */
-	ret = iosf_mbi_pci_write_mdr(mcrx, mcr | MBI_WR_MASK, value);
-
-	spin_unlock_irqrestore(&iosf_mbi_lock, flags);
-
-	return ret;
-}
-EXPORT_SYMBOL(iosf_mbi_modify);
-
-bool iosf_mbi_available(void)
-{
-	/* Mbi isn't hot-pluggable. No remove routine is provided */
-	return mbi_pdev;
-}
-EXPORT_SYMBOL(iosf_mbi_available);
-
-#ifdef CONFIG_IOSF_MBI_DEBUG
-static u32	dbg_mdr;
-static u32	dbg_mcr;
-static u32	dbg_mcrx;
-
-static int mcr_get(void *data, u64 *val)
-{
-	*val = *(u32 *)data;
-	return 0;
-}
-
-static int mcr_set(void *data, u64 val)
-{
-	u8 command = ((u32)val & 0xFF000000) >> 24,
-	   port	   = ((u32)val & 0x00FF0000) >> 16,
-	   offset  = ((u32)val & 0x0000FF00) >> 8;
-	int err;
-
-	*(u32 *)data = val;
-
-	if (!capable(CAP_SYS_RAWIO))
-		return -EACCES;
-
-	if (command & 1u)
-		err = iosf_mbi_write(port,
-			       command,
-			       dbg_mcrx | offset,
-			       dbg_mdr);
-	else
-		err = iosf_mbi_read(port,
-			      command,
-			      dbg_mcrx | offset,
-			      &dbg_mdr);
-
-	return err;
-}
-DEFINE_SIMPLE_ATTRIBUTE(iosf_mcr_fops, mcr_get, mcr_set , "%llx\n");
-
-static struct dentry *iosf_dbg;
-
-static void iosf_sideband_debug_init(void)
-{
-	struct dentry *d;
-
-	iosf_dbg = debugfs_create_dir("iosf_sb", NULL);
-	if (IS_ERR_OR_NULL(iosf_dbg))
-		return;
-
-	/* mdr */
-	d = debugfs_create_x32("mdr", 0660, iosf_dbg, &dbg_mdr);
-	if (IS_ERR_OR_NULL(d))
-		goto cleanup;
-
-	/* mcrx */
-	debugfs_create_x32("mcrx", 0660, iosf_dbg, &dbg_mcrx);
-	if (IS_ERR_OR_NULL(d))
-		goto cleanup;
-
-	/* mcr - initiates mailbox tranaction */
-	debugfs_create_file("mcr", 0660, iosf_dbg, &dbg_mcr, &iosf_mcr_fops);
-	if (IS_ERR_OR_NULL(d))
-		goto cleanup;
-
-	return;
-
-cleanup:
-	debugfs_remove_recursive(d);
-}
-
-static void iosf_debugfs_init(void)
-{
-	iosf_sideband_debug_init();
-}
-
-static void iosf_debugfs_remove(void)
-{
-	debugfs_remove_recursive(iosf_dbg);
-}
-#else
-static inline void iosf_debugfs_init(void) { }
-static inline void iosf_debugfs_remove(void) { }
-#endif /* CONFIG_IOSF_MBI_DEBUG */
-
-static int iosf_mbi_probe(struct pci_dev *pdev,
-			  const struct pci_device_id *unused)
-{
-	int ret;
-
-	ret = pci_enable_device(pdev);
-	if (ret < 0) {
-		dev_err(&pdev->dev, "error: could not enable device\n");
-		return ret;
-	}
-
-	mbi_pdev = pci_dev_get(pdev);
-	return 0;
-}
-
-static const struct pci_device_id iosf_mbi_pci_ids[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BAYTRAIL) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_BRASWELL) },
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_QUARK_X1000) },
-	{ 0, },
-};
-MODULE_DEVICE_TABLE(pci, iosf_mbi_pci_ids);
-
-static struct pci_driver iosf_mbi_pci_driver = {
-	.name		= "iosf_mbi_pci",
-	.probe		= iosf_mbi_probe,
-	.id_table	= iosf_mbi_pci_ids,
-};
-
-static int __init iosf_mbi_init(void)
-{
-	iosf_debugfs_init();
-
-	return pci_register_driver(&iosf_mbi_pci_driver);
-}
-
-static void __exit iosf_mbi_exit(void)
-{
-	iosf_debugfs_remove();
-
-	pci_unregister_driver(&iosf_mbi_pci_driver);
-	if (mbi_pdev) {
-		pci_dev_put(mbi_pdev);
-		mbi_pdev = NULL;
-	}
-}
-
-module_init(iosf_mbi_init);
-module_exit(iosf_mbi_exit);
-
-MODULE_AUTHOR("David E. Box <david.e.box@linux.intel.com>");
-MODULE_DESCRIPTION("IOSF Mailbox Interface accessor");
-MODULE_LICENSE("GPL v2");
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -139,10 +139,13 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 	seq_puts(p, "  Machine check polls\n");
 #endif
 #if IS_ENABLED(CONFIG_HYPERV) || defined(CONFIG_XEN)
-	seq_printf(p, "%*s: ", prec, "HYP");
-	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", irq_stats(j)->irq_hv_callback_count);
-	seq_puts(p, "  Hypervisor callback interrupts\n");
+	if (test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) {
+		seq_printf(p, "%*s: ", prec, "HYP");
+		for_each_online_cpu(j)
+			seq_printf(p, "%10u ",
+				   irq_stats(j)->irq_hv_callback_count);
+		seq_puts(p, "  Hypervisor callback interrupts\n");
+	}
 #endif
 	seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count));
 #if defined(CONFIG_X86_IO_APIC)
@@ -211,24 +214,38 @@ u64 arch_irq_stat(void)
 __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 {
 	struct pt_regs *old_regs = set_irq_regs(regs);
-
+	struct irq_desc * desc;
 	/* high bit used in ret_from_ code  */
 	unsigned vector = ~regs->orig_ax;
-	unsigned irq;
+
+	/*
+	 * NB: Unlike exception entries, IRQ entries do not reliably
+	 * handle context tracking in the low-level entry code.  This is
+	 * because syscall entries execute briefly with IRQs on before
+	 * updating context tracking state, so we can take an IRQ from
+	 * kernel mode with CONTEXT_USER.  The low-level entry code only
+	 * updates the context if we came from user mode, so we won't
+	 * switch to CONTEXT_KERNEL.  We'll fix that once the syscall
+	 * code is cleaned up enough that we can cleanly defer enabling
+	 * IRQs.
+	 */

 	entering_irq();

-	irq = __this_cpu_read(vector_irq[vector]);
+	/* entering_irq() tells RCU that we're not quiescent.  Check it. */
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "IRQ failed to wake up RCU");

-	if (!handle_irq(irq, regs)) {
+	desc = __this_cpu_read(vector_irq[vector]);
+
+	if (!handle_irq(desc, regs)) {
 		ack_APIC_irq();

-		if (irq != VECTOR_RETRIGGERED) {
-			pr_emerg_ratelimited("%s: %d.%d No irq handler for vector (irq %d)\n",
+		if (desc != VECTOR_RETRIGGERED) {
+			pr_emerg_ratelimited("%s: %d.%d No irq handler for vector\n",
 					     __func__, smp_processor_id(),
-					     vector, irq);
+					     vector);
 		} else {
-			__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+			__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
 		}
 	}

@@ -330,10 +347,10 @@ static struct cpumask affinity_new, online_new;
 */
 int check_irq_vectors_for_cpu_disable(void)
 {
-	int irq, cpu;
 	unsigned int this_cpu, vector, this_count, count;
 	struct irq_desc *desc;
 	struct irq_data *data;
+	int cpu;

 	this_cpu = smp_processor_id();
 	cpumask_copy(&online_new, cpu_online_mask);
@@ -341,47 +358,43 @@ int check_irq_vectors_for_cpu_disable(void)

 	this_count = 0;
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
-		irq = __this_cpu_read(vector_irq[vector]);
-		if (irq >= 0) {
-			desc = irq_to_desc(irq);
-			if (!desc)
-				continue;
-
-			/*
-			 * Protect against concurrent action removal,
-			 * affinity changes etc.
-			 */
-			raw_spin_lock(&desc->lock);
-			data = irq_desc_get_irq_data(desc);
-			cpumask_copy(&affinity_new, data->affinity);
-			cpumask_clear_cpu(this_cpu, &affinity_new);
-
-			/* Do not count inactive or per-cpu irqs. */
-			if (!irq_has_action(irq) || irqd_is_per_cpu(data)) {
-				raw_spin_unlock(&desc->lock);
-				continue;
-			}
+		desc = __this_cpu_read(vector_irq[vector]);
+		if (IS_ERR_OR_NULL(desc))
+			continue;
+		/*
+		 * Protect against concurrent action removal, affinity
+		 * changes etc.
+		 */
+		raw_spin_lock(&desc->lock);
+		data = irq_desc_get_irq_data(desc);
+		cpumask_copy(&affinity_new,
+			     irq_data_get_affinity_mask(data));
+		cpumask_clear_cpu(this_cpu, &affinity_new);

+		/* Do not count inactive or per-cpu irqs. */
+		if (!irq_desc_has_action(desc) || irqd_is_per_cpu(data)) {
 			raw_spin_unlock(&desc->lock);
-			/*
-			 * A single irq may be mapped to multiple
-			 * cpu's vector_irq[] (for example IOAPIC cluster
-			 * mode).  In this case we have two
-			 * possibilities:
-			 *
-			 * 1) the resulting affinity mask is empty; that is
-			 * this the down'd cpu is the last cpu in the irq's
-			 * affinity mask, or
-			 *
-			 * 2) the resulting affinity mask is no longer
-			 * a subset of the online cpus but the affinity
-			 * mask is not zero; that is the down'd cpu is the
-			 * last online cpu in a user set affinity mask.
-			 */
-			if (cpumask_empty(&affinity_new) ||
-			    !cpumask_subset(&affinity_new, &online_new))
-				this_count++;
+			continue;
 		}
+
+		raw_spin_unlock(&desc->lock);
+		/*
+		 * A single irq may be mapped to multiple cpu's
+		 * vector_irq[] (for example IOAPIC cluster mode).  In
+		 * this case we have two possibilities:
+		 *
+		 * 1) the resulting affinity mask is empty; that is
+		 * this the down'd cpu is the last cpu in the irq's
+		 * affinity mask, or
+		 *
+		 * 2) the resulting affinity mask is no longer a
+		 * subset of the online cpus but the affinity mask is
+		 * not zero; that is the down'd cpu is the last online
+		 * cpu in a user set affinity mask.
+		 */
+		if (cpumask_empty(&affinity_new) ||
+		    !cpumask_subset(&affinity_new, &online_new))
+			this_count++;
 	}

 	count = 0;
@@ -400,8 +413,8 @@ int check_irq_vectors_for_cpu_disable(void)
 		for (vector = FIRST_EXTERNAL_VECTOR;
 		     vector < first_system_vector; vector++) {
 			if (!test_bit(vector, used_vectors) &&
-			    per_cpu(vector_irq, cpu)[vector] < 0)
-					count++;
+			    IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
+			    count++;
 		}
 	}

@@ -437,7 +450,7 @@ void fixup_irqs(void)
 		raw_spin_lock(&desc->lock);

 		data = irq_desc_get_irq_data(desc);
-		affinity = data->affinity;
+		affinity = irq_data_get_affinity_mask(data);
 		if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
 		    cpumask_subset(affinity, cpu_online_mask)) {
 			raw_spin_unlock(&desc->lock);
@@ -505,14 +518,13 @@ void fixup_irqs(void)
 	for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
 		unsigned int irr;

-		if (__this_cpu_read(vector_irq[vector]) <= VECTOR_UNDEFINED)
+		if (IS_ERR_OR_NULL(__this_cpu_read(vector_irq[vector])))
 			continue;

 		irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
 		if (irr  & (1 << (vector % 32))) {
-			irq = __this_cpu_read(vector_irq[vector]);
+			desc = __this_cpu_read(vector_irq[vector]);

-			desc = irq_to_desc(irq);
 			raw_spin_lock(&desc->lock);
 			data = irq_desc_get_irq_data(desc);
 			chip = irq_data_get_irq_chip(data);
@@ -523,7 +535,7 @@ void fixup_irqs(void)
 			raw_spin_unlock(&desc->lock);
 		}
 		if (__this_cpu_read(vector_irq[vector]) != VECTOR_RETRIGGERED)
-			__this_cpu_write(vector_irq[vector], VECTOR_UNDEFINED);
+			__this_cpu_write(vector_irq[vector], VECTOR_UNUSED);
 	}
 }
 #endif
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -148,21 +148,21 @@ void do_softirq_own_stack(void)
 	call_on_stack(__do_softirq, isp);
 }

-bool handle_irq(unsigned irq, struct pt_regs *regs)
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
 {
-	struct irq_desc *desc;
+	unsigned int irq;
 	int overflow;

 	overflow = check_stack_overflow();

-	desc = irq_to_desc(irq);
-	if (unlikely(!desc))
+	if (IS_ERR_OR_NULL(desc))
 		return false;

+	irq = irq_desc_get_irq(desc);
 	if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
 		if (unlikely(overflow))
 			print_stack_overflow();
-		desc->handle_irq(irq, desc);
+		generic_handle_irq_desc(irq, desc);
 	}

 	return true;
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -68,16 +68,13 @@ static inline void stack_overflow_check(struct pt_regs *regs)
 #endif
 }

-bool handle_irq(unsigned irq, struct pt_regs *regs)
+bool handle_irq(struct irq_desc *desc, struct pt_regs *regs)
 {
-	struct irq_desc *desc;
-
 	stack_overflow_check(regs);

-	desc = irq_to_desc(irq);
-	if (unlikely(!desc))
+	if (unlikely(IS_ERR_OR_NULL(desc)))
 		return false;

-	generic_handle_irq_desc(irq, desc);
+	generic_handle_irq_desc(irq_desc_get_irq(desc), desc);
 	return true;
 }
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -52,7 +52,7 @@ static struct irqaction irq2 = {
 };

 DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-	[0 ... NR_VECTORS - 1] = VECTOR_UNDEFINED,
+	[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
 };

 int vector_used_by_percpu_irq(unsigned int vector)
@@ -60,7 +60,7 @@ int vector_used_by_percpu_irq(unsigned int vector)
 	int cpu;

 	for_each_online_cpu(cpu) {
-		if (per_cpu(vector_irq, cpu)[vector] > VECTOR_UNDEFINED)
+		if (!IS_ERR_OR_NULL(per_cpu(vector_irq, cpu)[vector]))
 			return 1;
 	}

@@ -94,7 +94,7 @@ void __init init_IRQ(void)
 	 * irq's migrate etc.
 	 */
 	for (i = 0; i < nr_legacy_irqs(); i++)
-		per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = i;
+		per_cpu(vector_irq, 0)[ISA_IRQ_VECTOR(i)] = irq_to_desc(i);

 	x86_init.irqs.intr_init();
 }
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -223,9 +223,6 @@ setup_boot_parameters(struct kimage *image, struct boot_params *params,
 	memset(&params->hd0_info, 0, sizeof(params->hd0_info));
 	memset(&params->hd1_info, 0, sizeof(params->hd1_info));

-	/* Default sysdesc table */
-	params->sys_desc_table.length = 0;
-
 	if (image->type == KEXEC_TYPE_CRASH) {
 		ret = crash_setup_memmap_entries(image, params);
 		if (ret)
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -110,7 +110,7 @@ static void nmi_max_handler(struct irq_work *w)
 		a->handler, whole_msecs, decimal_msecs);
 }

-static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+static int nmi_handle(unsigned int type, struct pt_regs *regs)
 {
 	struct nmi_desc *desc = nmi_to_desc(type);
 	struct nmiaction *a;
@@ -213,7 +213,7 @@ static void
 pci_serr_error(unsigned char reason, struct pt_regs *regs)
 {
 	/* check to see if anyone registered against these types of errors */
-	if (nmi_handle(NMI_SERR, regs, false))
+	if (nmi_handle(NMI_SERR, regs))
 		return;

 	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
@@ -247,7 +247,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
 	unsigned long i;

 	/* check to see if anyone registered against these types of errors */
-	if (nmi_handle(NMI_IO_CHECK, regs, false))
+	if (nmi_handle(NMI_IO_CHECK, regs))
 		return;

 	pr_emerg(
@@ -284,7 +284,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 	 * as only the first one is ever run (unless it can actually determine
 	 * if it caused the NMI)
 	 */
-	handled = nmi_handle(NMI_UNKNOWN, regs, false);
+	handled = nmi_handle(NMI_UNKNOWN, regs);
 	if (handled) {
 		__this_cpu_add(nmi_stats.unknown, handled);
 		return;
@@ -332,7 +332,7 @@ static void default_do_nmi(struct pt_regs *regs)

 	__this_cpu_write(last_nmi_rip, regs->ip);

-	handled = nmi_handle(NMI_LOCAL, regs, b2b);
+	handled = nmi_handle(NMI_LOCAL, regs);
 	__this_cpu_add(nmi_stats.normal, handled);
 	if (handled) {
 		/*
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -351,9 +351,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
 	.wbinvd = native_wbinvd,
 	.read_msr = native_read_msr_safe,
 	.write_msr = native_write_msr_safe,
-	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
-	.read_tscp = native_read_tscp,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
 	.load_gdt = native_load_gdt,
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
-DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");

 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
 DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
@@ -52,7 +51,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_mmu_ops, read_cr3);
 		PATCH_SITE(pv_mmu_ops, write_cr3);
 		PATCH_SITE(pv_cpu_ops, clts);
-		PATCH_SITE(pv_cpu_ops, read_tsc);
 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
 		case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
 			if (pv_is_native_spin_unlock()) {
--- a/arch/x86/kernel/pmc_atom.c
+++ b/arch/x86/kernel/pmc_atom.c
@@ -1,371 +0,0 @@
-/*
- * Intel Atom SOC Power Management Controller Driver
- * Copyright (c) 2014, Intel Corporation.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
- * more details.
- *
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/pci.h>
-#include <linux/device.h>
-#include <linux/debugfs.h>
-#include <linux/seq_file.h>
-#include <linux/io.h>
-
-#include <asm/pmc_atom.h>
-
-struct pmc_dev {
-	u32 base_addr;
-	void __iomem *regmap;
-#ifdef CONFIG_DEBUG_FS
-	struct dentry *dbgfs_dir;
-#endif /* CONFIG_DEBUG_FS */
-};
-
-static struct pmc_dev pmc_device;
-static u32 acpi_base_addr;
-
-struct pmc_bit_map {
-	const char *name;
-	u32 bit_mask;
-};
-
-static const struct pmc_bit_map dev_map[] = {
-	{"0  - LPSS1_F0_DMA",		BIT_LPSS1_F0_DMA},
-	{"1  - LPSS1_F1_PWM1",		BIT_LPSS1_F1_PWM1},
-	{"2  - LPSS1_F2_PWM2",		BIT_LPSS1_F2_PWM2},
-	{"3  - LPSS1_F3_HSUART1",	BIT_LPSS1_F3_HSUART1},
-	{"4  - LPSS1_F4_HSUART2",	BIT_LPSS1_F4_HSUART2},
-	{"5  - LPSS1_F5_SPI",		BIT_LPSS1_F5_SPI},
-	{"6  - LPSS1_F6_Reserved",	BIT_LPSS1_F6_XXX},
-	{"7  - LPSS1_F7_Reserved",	BIT_LPSS1_F7_XXX},
-	{"8  - SCC_EMMC",		BIT_SCC_EMMC},
-	{"9  - SCC_SDIO",		BIT_SCC_SDIO},
-	{"10 - SCC_SDCARD",		BIT_SCC_SDCARD},
-	{"11 - SCC_MIPI",		BIT_SCC_MIPI},
-	{"12 - HDA",			BIT_HDA},
-	{"13 - LPE",			BIT_LPE},
-	{"14 - OTG",			BIT_OTG},
-	{"15 - USH",			BIT_USH},
-	{"16 - GBE",			BIT_GBE},
-	{"17 - SATA",			BIT_SATA},
-	{"18 - USB_EHCI",		BIT_USB_EHCI},
-	{"19 - SEC",			BIT_SEC},
-	{"20 - PCIE_PORT0",		BIT_PCIE_PORT0},
-	{"21 - PCIE_PORT1",		BIT_PCIE_PORT1},
-	{"22 - PCIE_PORT2",		BIT_PCIE_PORT2},
-	{"23 - PCIE_PORT3",		BIT_PCIE_PORT3},
-	{"24 - LPSS2_F0_DMA",		BIT_LPSS2_F0_DMA},
-	{"25 - LPSS2_F1_I2C1",		BIT_LPSS2_F1_I2C1},
-	{"26 - LPSS2_F2_I2C2",		BIT_LPSS2_F2_I2C2},
-	{"27 - LPSS2_F3_I2C3",		BIT_LPSS2_F3_I2C3},
-	{"28 - LPSS2_F3_I2C4",		BIT_LPSS2_F4_I2C4},
-	{"29 - LPSS2_F5_I2C5",		BIT_LPSS2_F5_I2C5},
-	{"30 - LPSS2_F6_I2C6",		BIT_LPSS2_F6_I2C6},
-	{"31 - LPSS2_F7_I2C7",		BIT_LPSS2_F7_I2C7},
-	{"32 - SMB",			BIT_SMB},
-	{"33 - OTG_SS_PHY",		BIT_OTG_SS_PHY},
-	{"34 - USH_SS_PHY",		BIT_USH_SS_PHY},
-	{"35 - DFX",			BIT_DFX},
-};
-
-static const struct pmc_bit_map pss_map[] = {
-	{"0  - GBE",			PMC_PSS_BIT_GBE},
-	{"1  - SATA",			PMC_PSS_BIT_SATA},
-	{"2  - HDA",			PMC_PSS_BIT_HDA},
-	{"3  - SEC",			PMC_PSS_BIT_SEC},
-	{"4  - PCIE",			PMC_PSS_BIT_PCIE},
-	{"5  - LPSS",			PMC_PSS_BIT_LPSS},
-	{"6  - LPE",			PMC_PSS_BIT_LPE},
-	{"7  - DFX",			PMC_PSS_BIT_DFX},
-	{"8  - USH_CTRL",		PMC_PSS_BIT_USH_CTRL},
-	{"9  - USH_SUS",		PMC_PSS_BIT_USH_SUS},
-	{"10 - USH_VCCS",		PMC_PSS_BIT_USH_VCCS},
-	{"11 - USH_VCCA",		PMC_PSS_BIT_USH_VCCA},
-	{"12 - OTG_CTRL",		PMC_PSS_BIT_OTG_CTRL},
-	{"13 - OTG_VCCS",		PMC_PSS_BIT_OTG_VCCS},
-	{"14 - OTG_VCCA_CLK",		PMC_PSS_BIT_OTG_VCCA_CLK},
-	{"15 - OTG_VCCA",		PMC_PSS_BIT_OTG_VCCA},
-	{"16 - USB",			PMC_PSS_BIT_USB},
-	{"17 - USB_SUS",		PMC_PSS_BIT_USB_SUS},
-};
-
-static inline u32 pmc_reg_read(struct pmc_dev *pmc, int reg_offset)
-{
-	return readl(pmc->regmap + reg_offset);
-}
-
-static inline void pmc_reg_write(struct pmc_dev *pmc, int reg_offset, u32 val)
-{
-	writel(val, pmc->regmap + reg_offset);
-}
-
-static void pmc_power_off(void)
-{
-	u16	pm1_cnt_port;
-	u32	pm1_cnt_value;
-
-	pr_info("Preparing to enter system sleep state S5\n");
-
-	pm1_cnt_port = acpi_base_addr + PM1_CNT;
-
-	pm1_cnt_value = inl(pm1_cnt_port);
-	pm1_cnt_value &= SLEEP_TYPE_MASK;
-	pm1_cnt_value |= SLEEP_TYPE_S5;
-	pm1_cnt_value |= SLEEP_ENABLE;
-
-	outl(pm1_cnt_value, pm1_cnt_port);
-}
-
-static void pmc_hw_reg_setup(struct pmc_dev *pmc)
-{
-	/*
-	 * Disable PMC S0IX_WAKE_EN events coming from:
-	 * - LPC clock run
-	 * - GPIO_SUS ored dedicated IRQs
-	 * - GPIO_SCORE ored dedicated IRQs
-	 * - GPIO_SUS shared IRQ
-	 * - GPIO_SCORE shared IRQ
-	 */
-	pmc_reg_write(pmc, PMC_S0IX_WAKE_EN, (u32)PMC_WAKE_EN_SETTING);
-}
-
-#ifdef CONFIG_DEBUG_FS
-static int pmc_dev_state_show(struct seq_file *s, void *unused)
-{
-	struct pmc_dev *pmc = s->private;
-	u32 func_dis, func_dis_2, func_dis_index;
-	u32 d3_sts_0, d3_sts_1, d3_sts_index;
-	int dev_num, dev_index, reg_index;
-
-	func_dis = pmc_reg_read(pmc, PMC_FUNC_DIS);
-	func_dis_2 = pmc_reg_read(pmc, PMC_FUNC_DIS_2);
-	d3_sts_0 = pmc_reg_read(pmc, PMC_D3_STS_0);
-	d3_sts_1 = pmc_reg_read(pmc, PMC_D3_STS_1);
-
-	dev_num = ARRAY_SIZE(dev_map);
-
-	for (dev_index = 0; dev_index < dev_num; dev_index++) {
-		reg_index = dev_index / PMC_REG_BIT_WIDTH;
-		if (reg_index) {
-			func_dis_index = func_dis_2;
-			d3_sts_index = d3_sts_1;
-		} else {
-			func_dis_index = func_dis;
-			d3_sts_index = d3_sts_0;
-		}
-
-		seq_printf(s, "Dev: %-32s\tState: %s [%s]\n",
-			dev_map[dev_index].name,
-			dev_map[dev_index].bit_mask & func_dis_index ?
-			"Disabled" : "Enabled ",
-			dev_map[dev_index].bit_mask & d3_sts_index ?
-			"D3" : "D0");
-	}
-	return 0;
-}
-
-static int pmc_dev_state_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pmc_dev_state_show, inode->i_private);
-}
-
-static const struct file_operations pmc_dev_state_ops = {
-	.open		= pmc_dev_state_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int pmc_pss_state_show(struct seq_file *s, void *unused)
-{
-	struct pmc_dev *pmc = s->private;
-	u32 pss = pmc_reg_read(pmc, PMC_PSS);
-	int pss_index;
-
-	for (pss_index = 0; pss_index < ARRAY_SIZE(pss_map); pss_index++) {
-		seq_printf(s, "Island: %-32s\tState: %s\n",
-			pss_map[pss_index].name,
-			pss_map[pss_index].bit_mask & pss ? "Off" : "On");
-	}
-	return 0;
-}
-
-static int pmc_pss_state_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pmc_pss_state_show, inode->i_private);
-}
-
-static const struct file_operations pmc_pss_state_ops = {
-	.open		= pmc_pss_state_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static int pmc_sleep_tmr_show(struct seq_file *s, void *unused)
-{
-	struct pmc_dev *pmc = s->private;
-	u64 s0ir_tmr, s0i1_tmr, s0i2_tmr, s0i3_tmr, s0_tmr;
-
-	s0ir_tmr = (u64)pmc_reg_read(pmc, PMC_S0IR_TMR) << PMC_TMR_SHIFT;
-	s0i1_tmr = (u64)pmc_reg_read(pmc, PMC_S0I1_TMR) << PMC_TMR_SHIFT;
-	s0i2_tmr = (u64)pmc_reg_read(pmc, PMC_S0I2_TMR) << PMC_TMR_SHIFT;
-	s0i3_tmr = (u64)pmc_reg_read(pmc, PMC_S0I3_TMR) << PMC_TMR_SHIFT;
-	s0_tmr = (u64)pmc_reg_read(pmc, PMC_S0_TMR) << PMC_TMR_SHIFT;
-
-	seq_printf(s, "S0IR Residency:\t%lldus\n", s0ir_tmr);
-	seq_printf(s, "S0I1 Residency:\t%lldus\n", s0i1_tmr);
-	seq_printf(s, "S0I2 Residency:\t%lldus\n", s0i2_tmr);
-	seq_printf(s, "S0I3 Residency:\t%lldus\n", s0i3_tmr);
-	seq_printf(s, "S0   Residency:\t%lldus\n", s0_tmr);
-	return 0;
-}
-
-static int pmc_sleep_tmr_open(struct inode *inode, struct file *file)
-{
-	return single_open(file, pmc_sleep_tmr_show, inode->i_private);
-}
-
-static const struct file_operations pmc_sleep_tmr_ops = {
-	.open		= pmc_sleep_tmr_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= single_release,
-};
-
-static void pmc_dbgfs_unregister(struct pmc_dev *pmc)
-{
-	debugfs_remove_recursive(pmc->dbgfs_dir);
-}
-
-static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
-{
-	struct dentry *dir, *f;
-
-	dir = debugfs_create_dir("pmc_atom", NULL);
-	if (!dir)
-		return -ENOMEM;
-
-	pmc->dbgfs_dir = dir;
-
-	f = debugfs_create_file("dev_state", S_IFREG | S_IRUGO,
-				dir, pmc, &pmc_dev_state_ops);
-	if (!f) {
-		dev_err(&pdev->dev, "dev_state register failed\n");
-		goto err;
-	}
-
-	f = debugfs_create_file("pss_state", S_IFREG | S_IRUGO,
-				dir, pmc, &pmc_pss_state_ops);
-	if (!f) {
-		dev_err(&pdev->dev, "pss_state register failed\n");
-		goto err;
-	}
-
-	f = debugfs_create_file("sleep_state", S_IFREG | S_IRUGO,
-				dir, pmc, &pmc_sleep_tmr_ops);
-	if (!f) {
-		dev_err(&pdev->dev, "sleep_state register failed\n");
-		goto err;
-	}
-
-	return 0;
-err:
-	pmc_dbgfs_unregister(pmc);
-	return -ENODEV;
-}
-#else
-static int pmc_dbgfs_register(struct pmc_dev *pmc, struct pci_dev *pdev)
-{
-	return 0;
-}
-#endif /* CONFIG_DEBUG_FS */
-
-static int pmc_setup_dev(struct pci_dev *pdev)
-{
-	struct pmc_dev *pmc = &pmc_device;
-	int ret;
-
-	/* Obtain ACPI base address */
-	pci_read_config_dword(pdev, ACPI_BASE_ADDR_OFFSET, &acpi_base_addr);
-	acpi_base_addr &= ACPI_BASE_ADDR_MASK;
-
-	/* Install power off function */
-	if (acpi_base_addr != 0 && pm_power_off == NULL)
-		pm_power_off = pmc_power_off;
-
-	pci_read_config_dword(pdev, PMC_BASE_ADDR_OFFSET, &pmc->base_addr);
-	pmc->base_addr &= PMC_BASE_ADDR_MASK;
-
-	pmc->regmap = ioremap_nocache(pmc->base_addr, PMC_MMIO_REG_LEN);
-	if (!pmc->regmap) {
-		dev_err(&pdev->dev, "error: ioremap failed\n");
-		return -ENOMEM;
-	}
-
-	/* PMC hardware registers setup */
-	pmc_hw_reg_setup(pmc);
-
-	ret = pmc_dbgfs_register(pmc, pdev);
-	if (ret) {
-		iounmap(pmc->regmap);
-	}
-
-	return ret;
-}
-
-/*
- * Data for PCI driver interface
- *
- * This data only exists for exporting the supported
- * PCI ids via MODULE_DEVICE_TABLE.  We do not actually
- * register a pci_driver, because lpc_ich will register
- * a driver on the same PCI id.
- */
-static const struct pci_device_id pmc_pci_ids[] = {
-	{ PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_VLV_PMC) },
-	{ 0, },
-};
-
-MODULE_DEVICE_TABLE(pci, pmc_pci_ids);
-
-static int __init pmc_atom_init(void)
-{
-	struct pci_dev *pdev = NULL;
-	const struct pci_device_id *ent;
-
-	/* We look for our device - PCU PMC
-	 * we assume that there is max. one device.
-	 *
-	 * We can't use plain pci_driver mechanism,
-	 * as the device is really a multiple function device,
-	 * main driver that binds to the pci_device is lpc_ich
-	 * and have to find & bind to the device this way.
-	 */
-	for_each_pci_dev(pdev) {
-		ent = pci_match_id(pmc_pci_ids, pdev);
-		if (ent)
-			return pmc_setup_dev(pdev);
-	}
-	/* Device not found. */
-	return -ENODEV;
-}
-
-module_init(pmc_atom_init);
-/* no module_exit, this driver shouldn't be unloaded */
-
-MODULE_AUTHOR("Aubrey Li <aubrey.li@linux.intel.com>");
-MODULE_DESCRIPTION("Intel Atom SOC Power Management Controller Interface");
-MODULE_LICENSE("GPL v2");
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -29,6 +29,8 @@
 #include <asm/debugreg.h>
 #include <asm/nmi.h>
 #include <asm/tlbflush.h>
+#include <asm/mce.h>
+#include <asm/vm86.h>

 /*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -110,6 +112,8 @@ void exit_thread(void)
 		kfree(bp);
 	}

+	free_vm86(t);
+
 	fpu__drop(fpu);
 }

@@ -319,6 +323,7 @@ void stop_this_cpu(void *dummy)
 	 */
 	set_cpu_online(smp_processor_id(), false);
 	disable_local_APIC();
+	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

 	for (;;)
 		halt();
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -53,6 +53,7 @@
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
 #include <asm/switch_to.h>
+#include <asm/vm86.h>

 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -121,6 +121,7 @@ void __show_regs(struct pt_regs *regs, int all)
 void release_thread(struct task_struct *dead_task)
 {
 	if (dead_task->mm) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 		if (dead_task->mm->context.ldt) {
 			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
 				dead_task->comm,
@@ -128,6 +129,7 @@ void release_thread(struct task_struct *dead_task)
 				dead_task->mm->context.ldt->size);
 			BUG();
 		}
+#endif
 	}
 }

@@ -248,8 +250,8 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 			    __USER_CS, __USER_DS, 0);
 }

-#ifdef CONFIG_IA32_EMULATION
-void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
+#ifdef CONFIG_COMPAT
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
 {
 	start_thread_common(regs, new_ip, new_sp,
 			    test_thread_flag(TIF_X32)
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -37,12 +37,10 @@
 #include <asm/proto.h>
 #include <asm/hw_breakpoint.h>
 #include <asm/traps.h>
+#include <asm/syscall.h>

 #include "tls.h"

-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
 enum x86_regset {
 	REGSET_GENERAL,
 	REGSET_FP,
@@ -1123,6 +1121,73 @@ static int genregs32_set(struct task_struct *target,
 	return ret;
 }

+static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request,
+			     compat_ulong_t caddr, compat_ulong_t cdata)
+{
+	unsigned long addr = caddr;
+	unsigned long data = cdata;
+	void __user *datap = compat_ptr(data);
+	int ret;
+	__u32 val;
+
+	switch (request) {
+	case PTRACE_PEEKUSR:
+		ret = getreg32(child, addr, &val);
+		if (ret == 0)
+			ret = put_user(val, (__u32 __user *)datap);
+		break;
+
+	case PTRACE_POKEUSR:
+		ret = putreg32(child, addr, data);
+		break;
+
+	case PTRACE_GETREGS:	/* Get all gp regs from the child. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_GENERAL,
+					   0, sizeof(struct user_regs_struct32),
+					   datap);
+
+	case PTRACE_SETREGS:	/* Set all gp regs in the child. */
+		return copy_regset_from_user(child, &user_x86_32_view,
+					     REGSET_GENERAL, 0,
+					     sizeof(struct user_regs_struct32),
+					     datap);
+
+	case PTRACE_GETFPREGS:	/* Get the child FPU state. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_FP, 0,
+					   sizeof(struct user_i387_ia32_struct),
+					   datap);
+
+	case PTRACE_SETFPREGS:	/* Set the child FPU state. */
+		return copy_regset_from_user(
+			child, &user_x86_32_view, REGSET_FP,
+			0, sizeof(struct user_i387_ia32_struct), datap);
+
+	case PTRACE_GETFPXREGS:	/* Get the child extended FPU state. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_XFP, 0,
+					   sizeof(struct user32_fxsr_struct),
+					   datap);
+
+	case PTRACE_SETFPXREGS:	/* Set the child extended FPU state. */
+		return copy_regset_from_user(child, &user_x86_32_view,
+					     REGSET_XFP, 0,
+					     sizeof(struct user32_fxsr_struct),
+					     datap);
+
+	case PTRACE_GET_THREAD_AREA:
+	case PTRACE_SET_THREAD_AREA:
+		return arch_ptrace(child, request, addr, data);
+
+	default:
+		return compat_ptrace_request(child, request, addr, data);
+	}
+
+	return ret;
+}
+#endif /* CONFIG_IA32_EMULATION */
+
 #ifdef CONFIG_X86_X32_ABI
 static long x32_arch_ptrace(struct task_struct *child,
 			    compat_long_t request, compat_ulong_t caddr,
@@ -1211,78 +1276,21 @@ static long x32_arch_ptrace(struct task_struct *child,
 }
 #endif

+#ifdef CONFIG_COMPAT
 long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 			compat_ulong_t caddr, compat_ulong_t cdata)
 {
-	unsigned long addr = caddr;
-	unsigned long data = cdata;
-	void __user *datap = compat_ptr(data);
-	int ret;
-	__u32 val;
-
 #ifdef CONFIG_X86_X32_ABI
 	if (!is_ia32_task())
 		return x32_arch_ptrace(child, request, caddr, cdata);
 #endif
-
-	switch (request) {
-	case PTRACE_PEEKUSR:
-		ret = getreg32(child, addr, &val);
-		if (ret == 0)
-			ret = put_user(val, (__u32 __user *)datap);
-		break;
-
-	case PTRACE_POKEUSR:
-		ret = putreg32(child, addr, data);
-		break;
-
-	case PTRACE_GETREGS:	/* Get all gp regs from the child. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_GENERAL,
-					   0, sizeof(struct user_regs_struct32),
-					   datap);
-
-	case PTRACE_SETREGS:	/* Set all gp regs in the child. */
-		return copy_regset_from_user(child, &user_x86_32_view,
-					     REGSET_GENERAL, 0,
-					     sizeof(struct user_regs_struct32),
-					     datap);
-
-	case PTRACE_GETFPREGS:	/* Get the child FPU state. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_FP, 0,
-					   sizeof(struct user_i387_ia32_struct),
-					   datap);
-
-	case PTRACE_SETFPREGS:	/* Set the child FPU state. */
-		return copy_regset_from_user(
-			child, &user_x86_32_view, REGSET_FP,
-			0, sizeof(struct user_i387_ia32_struct), datap);
-
-	case PTRACE_GETFPXREGS:	/* Get the child extended FPU state. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_XFP, 0,
-					   sizeof(struct user32_fxsr_struct),
-					   datap);
-
-	case PTRACE_SETFPXREGS:	/* Set the child extended FPU state. */
-		return copy_regset_from_user(child, &user_x86_32_view,
-					     REGSET_XFP, 0,
-					     sizeof(struct user32_fxsr_struct),
-					     datap);
-
-	case PTRACE_GET_THREAD_AREA:
-	case PTRACE_SET_THREAD_AREA:
-		return arch_ptrace(child, request, addr, data);
-
-	default:
-		return compat_ptrace_request(child, request, addr, data);
-	}
-
-	return ret;
+#ifdef CONFIG_IA32_EMULATION
+	return ia32_arch_ptrace(child, request, caddr, cdata);
+#else
+	return 0;
+#endif
 }
-
-#endif	/* CONFIG_IA32_EMULATION */
+#endif	/* CONFIG_COMPAT */

 #ifdef CONFIG_X86_64

@@ -1434,201 +1442,3 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 	/* Send us the fake SIGTRAP */
 	force_sig_info(SIGTRAP, &info, tsk);
 }
-
-static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
-{
-#ifdef CONFIG_X86_64
-	if (arch == AUDIT_ARCH_X86_64) {
-		audit_syscall_entry(regs->orig_ax, regs->di,
-				    regs->si, regs->dx, regs->r10);
-	} else
-#endif
-	{
-		audit_syscall_entry(regs->orig_ax, regs->bx,
-				    regs->cx, regs->dx, regs->si);
-	}
-}
-
-/*
- * We can return 0 to resume the syscall or anything else to go to phase
- * 2.  If we resume the syscall, we need to put something appropriate in
- * regs->orig_ax.
- *
- * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
- * are fully functional.
- *
- * For phase 2's benefit, our return value is:
- * 0:			resume the syscall
- * 1:			go to phase 2; no seccomp phase 2 needed
- * anything else:	go to phase 2; pass return value to seccomp
- */
-unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
-{
-	unsigned long ret = 0;
-	u32 work;
-
-	BUG_ON(regs != task_pt_regs(current));
-
-	work = ACCESS_ONCE(current_thread_info()->flags) &
-		_TIF_WORK_SYSCALL_ENTRY;
-
-	/*
-	 * If TIF_NOHZ is set, we are required to call user_exit() before
-	 * doing anything that could touch RCU.
-	 */
-	if (work & _TIF_NOHZ) {
-		user_exit();
-		work &= ~_TIF_NOHZ;
-	}
-
-#ifdef CONFIG_SECCOMP
-	/*
-	 * Do seccomp first -- it should minimize exposure of other
-	 * code, and keeping seccomp fast is probably more valuable
-	 * than the rest of this.
-	 */
-	if (work & _TIF_SECCOMP) {
-		struct seccomp_data sd;
-
-		sd.arch = arch;
-		sd.nr = regs->orig_ax;
-		sd.instruction_pointer = regs->ip;
-#ifdef CONFIG_X86_64
-		if (arch == AUDIT_ARCH_X86_64) {
-			sd.args[0] = regs->di;
-			sd.args[1] = regs->si;
-			sd.args[2] = regs->dx;
-			sd.args[3] = regs->r10;
-			sd.args[4] = regs->r8;
-			sd.args[5] = regs->r9;
-		} else
-#endif
-		{
-			sd.args[0] = regs->bx;
-			sd.args[1] = regs->cx;
-			sd.args[2] = regs->dx;
-			sd.args[3] = regs->si;
-			sd.args[4] = regs->di;
-			sd.args[5] = regs->bp;
-		}
-
-		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
-		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
-
-		ret = seccomp_phase1(&sd);
-		if (ret == SECCOMP_PHASE1_SKIP) {
-			regs->orig_ax = -1;
-			ret = 0;
-		} else if (ret != SECCOMP_PHASE1_OK) {
-			return ret;  /* Go directly to phase 2 */
-		}
-
-		work &= ~_TIF_SECCOMP;
-	}
-#endif
-
-	/* Do our best to finish without phase 2. */
-	if (work == 0)
-		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
-
-#ifdef CONFIG_AUDITSYSCALL
-	if (work == _TIF_SYSCALL_AUDIT) {
-		/*
-		 * If there is no more work to be done except auditing,
-		 * then audit in phase 1.  Phase 2 always audits, so, if
-		 * we audit here, then we can't go on to phase 2.
-		 */
-		do_audit_syscall_entry(regs, arch);
-		return 0;
-	}
-#endif
-
-	return 1;  /* Something is enabled that we can't handle in phase 1 */
-}
-
-/* Returns the syscall nr to run (which should match regs->orig_ax). */
-long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
-				unsigned long phase1_result)
-{
-	long ret = 0;
-	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
-		_TIF_WORK_SYSCALL_ENTRY;
-
-	BUG_ON(regs != task_pt_regs(current));
-
-	/*
-	 * If we stepped into a sysenter/syscall insn, it trapped in
-	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
-	 * If user-mode had set TF itself, then it's still clear from
-	 * do_debug() and we need to set it again to restore the user
-	 * state.  If we entered on the slow path, TF was already set.
-	 */
-	if (work & _TIF_SINGLESTEP)
-		regs->flags |= X86_EFLAGS_TF;
-
-#ifdef CONFIG_SECCOMP
-	/*
-	 * Call seccomp_phase2 before running the other hooks so that
-	 * they can see any changes made by a seccomp tracer.
-	 */
-	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
-		/* seccomp failures shouldn't expose any additional code. */
-		return -1;
-	}
-#endif
-
-	if (unlikely(work & _TIF_SYSCALL_EMU))
-		ret = -1L;
-
-	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
-	    tracehook_report_syscall_entry(regs))
-		ret = -1L;
-
-	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
-		trace_sys_enter(regs, regs->orig_ax);
-
-	do_audit_syscall_entry(regs, arch);
-
-	return ret ?: regs->orig_ax;
-}
-
-long syscall_trace_enter(struct pt_regs *regs)
-{
-	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
-	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
-
-	if (phase1_result == 0)
-		return regs->orig_ax;
-	else
-		return syscall_trace_enter_phase2(regs, arch, phase1_result);
-}
-
-void syscall_trace_leave(struct pt_regs *regs)
-{
-	bool step;
-
-	/*
-	 * We may come here right after calling schedule_user()
-	 * or do_notify_resume(), in which case we can be in RCU
-	 * user mode.
-	 */
-	user_exit();
-
-	audit_syscall_exit(regs);
-
-	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
-		trace_sys_exit(regs, regs->ax);
-
-	/*
-	 * If TIF_SYSCALL_EMU is set, we only get here because of
-	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
-	 * We already reported this syscall instruction in
-	 * syscall_trace_enter().
-	 */
-	step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
-			!test_thread_flag(TIF_SYSCALL_EMU);
-	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, step);
-
-	user_enter();
-}
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -916,11 +916,6 @@ void __init setup_arch(char **cmdline_p)
 #ifdef CONFIG_X86_32
 	apm_info.bios = boot_params.apm_bios_info;
 	ist_info = boot_params.ist_info;
-	if (boot_params.sys_desc_table.length != 0) {
-		machine_id = boot_params.sys_desc_table.table[0];
-		machine_submodel_id = boot_params.sys_desc_table.table[1];
-		BIOS_revision = boot_params.sys_desc_table.table[2];
-	}
 #endif
 	saved_video_mode = boot_params.hdr.vid_mode;
 	bootloader_type = boot_params.hdr.type_of_loader;
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -31,11 +31,11 @@
 #include <asm/vdso.h>
 #include <asm/mce.h>
 #include <asm/sighandling.h>
+#include <asm/vm86.h>

 #ifdef CONFIG_X86_64
 #include <asm/proto.h>
 #include <asm/ia32_unistd.h>
-#include <asm/sys_ia32.h>
 #endif /* CONFIG_X86_64 */

 #include <asm/syscall.h>
@@ -632,6 +632,9 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 	bool stepping, failed;
 	struct fpu *fpu = &current->thread.fpu;

+	if (v8086_mode(regs))
+		save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
+
 	/* Are we from a system call? */
 	if (syscall_get_nr(current, regs) >= 0) {
 		/* If so, check system call restarting.. */
@@ -697,7 +700,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 * want to handle. Thus you cannot kill init even with a SIGKILL even by
 * mistake.
 */
-static void do_signal(struct pt_regs *regs)
+void do_signal(struct pt_regs *regs)
 {
 	struct ksignal ksig;

@@ -732,32 +735,6 @@ static void do_signal(struct pt_regs *regs)
 	restore_saved_sigmask();
 }

-/*
- * notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
- */
-__visible void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
-{
-	user_exit();
-
-	if (thread_info_flags & _TIF_UPROBE)
-		uprobe_notify_resume(regs);
-
-	/* deal with pending signal delivery */
-	if (thread_info_flags & _TIF_SIGPENDING)
-		do_signal(regs);
-
-	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
-		clear_thread_flag(TIF_NOTIFY_RESUME);
-		tracehook_notify_resume(regs);
-	}
-	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
-		fire_user_return_notifiers();
-
-	user_enter();
-}
-
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 {
 	struct task_struct *me = current;
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -0,0 +1,95 @@
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
+{
+	int err = 0;
+	bool ia32 = test_thread_flag(TIF_IA32);
+
+	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
+		return -EFAULT;
+
+	put_user_try {
+		/* If you change siginfo_t structure, please make sure that
+		   this code is fixed accordingly.
+		   It should never copy any pad contained in the structure
+		   to avoid security leaks, but must copy the generic
+		   3 ints plus the relevant union member.  */
+		put_user_ex(from->si_signo, &to->si_signo);
+		put_user_ex(from->si_errno, &to->si_errno);
+		put_user_ex((short)from->si_code, &to->si_code);
+
+		if (from->si_code < 0) {
+			put_user_ex(from->si_pid, &to->si_pid);
+			put_user_ex(from->si_uid, &to->si_uid);
+			put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
+		} else {
+			/*
+			 * First 32bits of unions are always present:
+			 * si_pid === si_band === si_tid === si_addr(LS half)
+			 */
+			put_user_ex(from->_sifields._pad[0],
+					  &to->_sifields._pad[0]);
+			switch (from->si_code >> 16) {
+			case __SI_FAULT >> 16:
+				break;
+			case __SI_SYS >> 16:
+				put_user_ex(from->si_syscall, &to->si_syscall);
+				put_user_ex(from->si_arch, &to->si_arch);
+				break;
+			case __SI_CHLD >> 16:
+				if (ia32) {
+					put_user_ex(from->si_utime, &to->si_utime);
+					put_user_ex(from->si_stime, &to->si_stime);
+				} else {
+					put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
+					put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
+				}
+				put_user_ex(from->si_status, &to->si_status);
+				/* FALL THROUGH */
+			default:
+			case __SI_KILL >> 16:
+				put_user_ex(from->si_uid, &to->si_uid);
+				break;
+			case __SI_POLL >> 16:
+				put_user_ex(from->si_fd, &to->si_fd);
+				break;
+			case __SI_TIMER >> 16:
+				put_user_ex(from->si_overrun, &to->si_overrun);
+				put_user_ex(ptr_to_compat(from->si_ptr),
+					    &to->si_ptr);
+				break;
+				 /* This is not generated by the kernel as of now.  */
+			case __SI_RT >> 16:
+			case __SI_MESGQ >> 16:
+				put_user_ex(from->si_uid, &to->si_uid);
+				put_user_ex(from->si_int, &to->si_int);
+				break;
+			}
+		}
+	} put_user_catch(err);
+
+	return err;
+}
+
+int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
+{
+	int err = 0;
+	u32 ptr32;
+
+	if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
+		return -EFAULT;
+
+	get_user_try {
+		get_user_ex(to->si_signo, &from->si_signo);
+		get_user_ex(to->si_errno, &from->si_errno);
+		get_user_ex(to->si_code, &from->si_code);
+
+		get_user_ex(to->si_pid, &from->si_pid);
+		get_user_ex(to->si_uid, &from->si_uid);
+		get_user_ex(ptr32, &from->si_ptr);
+		to->si_ptr = compat_ptr(ptr32);
+	} get_user_catch(err);
+
+	return err;
+}
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -30,6 +30,7 @@
 #include <asm/proto.h>
 #include <asm/apic.h>
 #include <asm/nmi.h>
+#include <asm/mce.h>
 #include <asm/trace/irq_vectors.h>
 /*
 *	Some notes on x86 processor bugs affecting SMP operation:
@@ -243,6 +244,7 @@ static void native_stop_other_cpus(int wait)
 finish:
 	local_irq_save(flags);
 	disable_local_APIC();
+	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
 	local_irq_restore(flags);
 }

--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -97,8 +97,6 @@ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map);
 DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
 EXPORT_PER_CPU_SYMBOL(cpu_info);

-atomic_t init_deasserted;
-
 static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
 {
 	unsigned long flags;
@@ -146,16 +144,11 @@ static void smp_callin(void)

 	/*
 	 * If waken up by an INIT in an 82489DX configuration
-	 * we may get here before an INIT-deassert IPI reaches
-	 * our local APIC.  We have to wait for the IPI or we'll
-	 * lock up on an APIC access.
-	 *
-	 * Since CPU0 is not wakened up by INIT, it doesn't wait for the IPI.
+	 * cpu_callout_mask guarantees we don't get here before
+	 * an INIT_deassert IPI reaches our local APIC, so it is
+	 * now safe to touch our local APIC.
 	 */
 	cpuid = smp_processor_id();
-	if (apic->wait_for_init_deassert && cpuid)
-		while (!atomic_read(&init_deasserted))
-			cpu_relax();

 	/*
 	 * (This works even if the APIC is not enabled.)
@@ -620,7 +613,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 	send_status = safe_apic_wait_icr_idle();

 	mb();
-	atomic_set(&init_deasserted, 1);

 	/*
 	 * Should we send STARTUP IPIs ?
@@ -665,7 +657,8 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 		/*
 		 * Give the other CPU some time to accept the IPI.
 		 */
-		udelay(300);
+		if (init_udelay)
+			udelay(300);

 		pr_debug("Startup point 1\n");

@@ -675,7 +668,8 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 		/*
 		 * Give the other CPU some time to accept the IPI.
 		 */
-		udelay(200);
+		if (init_udelay)
+			udelay(200);

 		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP.  */
 			apic_write(APIC_ESR, 0);
@@ -859,8 +853,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 	 * the targeted processor.
 	 */

-	atomic_set(&init_deasserted, 0);
-
 	if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {

 		pr_debug("Setting warm reset code and vector.\n");
@@ -898,7 +890,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)

 	if (!boot_error) {
 		/*
-		 * Wait 10s total for a response from AP
+		 * Wait 10s total for first sign of life from AP
 		 */
 		boot_error = -1;
 		timeout = jiffies + 10*HZ;
@@ -911,7 +903,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 				boot_error = 0;
 				break;
 			}
-			udelay(100);
 			schedule();
 		}
 	}
@@ -927,7 +918,6 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
 			 * for the MTRR work(triggered by the AP coming online)
 			 * to be completed in the stop machine context.
 			 */
-			udelay(100);
 			schedule();
 		}
 	}
@@ -1358,7 +1348,7 @@ static void remove_siblinginfo(int cpu)
 	cpumask_clear_cpu(cpu, cpu_sibling_setup_mask);
 }

-static void __ref remove_cpu_from_maps(int cpu)
+static void remove_cpu_from_maps(int cpu)
 {
 	set_cpu_online(cpu, false);
 	cpumask_clear_cpu(cpu, cpu_callout_mask);
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -18,6 +18,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 		return addr;
 	}

+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 	/*
 	 * We'll assume that the code segments in the GDT
 	 * are all zero-based. That is largely true: the
@@ -45,6 +46,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 		}
 		mutex_unlock(&child->mm->context.lock);
 	}
+#endif

 	return addr;
 }
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -57,7 +57,7 @@ __setup("cpu0_hotplug", enable_cpu0_hotplug);
 *
 * This is only called for debugging CPU offline/online feature.
 */
-int __ref _debug_hotplug_cpu(int cpu, int action)
+int _debug_hotplug_cpu(int cpu, int action)
 {
 	struct device *dev = get_cpu_device(cpu);
 	int ret;
@@ -104,7 +104,7 @@ static int __init debug_hotplug_cpu(void)
 late_initcall_sync(debug_hotplug_cpu);
 #endif /* CONFIG_DEBUG_HOTPLUG_CPU0 */

-int __ref arch_register_cpu(int num)
+int arch_register_cpu(int num)
 {
 	struct cpuinfo_x86 *c = &cpu_data(num);

--- a/arch/x86/kernel/trace_clock.c
+++ b/arch/x86/kernel/trace_clock.c
@@ -12,10 +12,5 @@
 */
 u64 notrace trace_clock_x86_tsc(void)
 {
-	u64 ret;
-
-	rdtsc_barrier();
-	rdtscll(ret);
-
-	return ret;
+	return rdtsc_ordered();
 }
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -62,6 +62,7 @@
 #include <asm/fpu/xstate.h>
 #include <asm/trace/mpx.h>
 #include <asm/mpx.h>
+#include <asm/vm86.h>

 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
@@ -108,13 +109,10 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	preempt_count_dec();
 }

-enum ctx_state ist_enter(struct pt_regs *regs)
+void ist_enter(struct pt_regs *regs)
 {
-	enum ctx_state prev_state;
-
 	if (user_mode(regs)) {
-		/* Other than that, we're just an exception. */
-		prev_state = exception_enter();
+		RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	} else {
 		/*
 		 * We might have interrupted pretty much anything.  In
@@ -123,32 +121,25 @@ enum ctx_state ist_enter(struct pt_regs *regs)
 		 * but we need to notify RCU.
 		 */
 		rcu_nmi_enter();
-		prev_state = CONTEXT_KERNEL;  /* the value is irrelevant. */
 	}

 	/*
-	 * We are atomic because we're on the IST stack (or we're on x86_32,
-	 * in which case we still shouldn't schedule).
-	 *
-	 * This must be after exception_enter(), because exception_enter()
-	 * won't do anything if in_interrupt() returns true.
+	 * We are atomic because we're on the IST stack; or we're on
+	 * x86_32, in which case we still shouldn't schedule; or we're
+	 * on x86_64 and entered from user mode, in which case we're
+	 * still atomic unless ist_begin_non_atomic is called.
 	 */
 	preempt_count_add(HARDIRQ_OFFSET);

 	/* This code is a bit fragile.  Test it. */
-	rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
-
-	return prev_state;
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "ist_enter didn't work");
 }

-void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
+void ist_exit(struct pt_regs *regs)
 {
-	/* Must be before exception_exit. */
 	preempt_count_sub(HARDIRQ_OFFSET);

-	if (user_mode(regs))
-		return exception_exit(prev_state);
-	else
+	if (!user_mode(regs))
 		rcu_nmi_exit();
 }

@@ -162,7 +153,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
 * a double fault, it can be safe to schedule.  ist_begin_non_atomic()
 * begins a non-atomic section within an ist_enter()/ist_exit() region.
 * Callers are responsible for enabling interrupts themselves inside
- * the non-atomic section, and callers must call is_end_non_atomic()
+ * the non-atomic section, and callers must call ist_end_non_atomic()
 * before ist_exit().
 */
 void ist_begin_non_atomic(struct pt_regs *regs)
@@ -289,17 +280,16 @@ NOKPROBE_SYMBOL(do_trap);
 static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 			  unsigned long trapnr, int signr)
 {
-	enum ctx_state prev_state = exception_enter();
 	siginfo_t info;

+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
+
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
 			NOTIFY_STOP) {
 		conditional_sti(regs);
 		do_trap(trapnr, signr, str, regs, error_code,
 			fill_trap_info(regs, signr, trapnr, &info));
 	}
-
-	exception_exit(prev_state);
 }

 #define DO_ERROR(trapnr, signr, str, name)				\
@@ -351,7 +341,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	}
 #endif

-	ist_enter(regs);  /* Discard prev_state because we won't return. */
+	ist_enter(regs);
 	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);

 	tsk->thread.error_code = error_code;
@@ -371,14 +361,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)

 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
 	const struct bndcsr *bndcsr;
 	siginfo_t *info;

-	prev_state = exception_enter();
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	if (notify_die(DIE_TRAP, "bounds", regs, error_code,
 			X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
-		goto exit;
+		return;
 	conditional_sti(regs);

 	if (!user_mode(regs))
@@ -435,9 +424,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 		die("bounds", regs, error_code);
 	}

-exit:
-	exception_exit(prev_state);
 	return;
+
 exit_trap:
 	/*
 	 * This path out is for all the cases where we could not
@@ -447,35 +435,33 @@ exit_trap:
 	 * time..
 	 */
 	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
-	exception_exit(prev_state);
 }

 dotraplinkage void
 do_general_protection(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk;
-	enum ctx_state prev_state;

-	prev_state = exception_enter();
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	conditional_sti(regs);

 	if (v8086_mode(regs)) {
 		local_irq_enable();
 		handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
-		goto exit;
+		return;
 	}

 	tsk = current;
 	if (!user_mode(regs)) {
 		if (fixup_exception(regs))
-			goto exit;
+			return;

 		tsk->thread.error_code = error_code;
 		tsk->thread.trap_nr = X86_TRAP_GP;
 		if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
 			       X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
 			die("general protection fault", regs, error_code);
-		goto exit;
+		return;
 	}

 	tsk->thread.error_code = error_code;
@@ -491,16 +477,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	}

 	force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
-exit:
-	exception_exit(prev_state);
 }
 NOKPROBE_SYMBOL(do_general_protection);

 /* May run on IST stack. */
 dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
 #ifdef CONFIG_DYNAMIC_FTRACE
 	/*
 	 * ftrace must be first, everything else may cause a recursive crash.
@@ -513,7 +495,8 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 	if (poke_int3_handler(regs))
 		return;

-	prev_state = ist_enter(regs);
+	ist_enter(regs);
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 				SIGTRAP) == NOTIFY_STOP)
@@ -539,7 +522,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 	preempt_conditional_cli(regs);
 	debug_stack_usage_dec();
 exit:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 NOKPROBE_SYMBOL(do_int3);

@@ -615,12 +598,11 @@ NOKPROBE_SYMBOL(fixup_bad_iret);
 dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk = current;
-	enum ctx_state prev_state;
 	int user_icebp = 0;
 	unsigned long dr6;
 	int si_code;

-	prev_state = ist_enter(regs);
+	ist_enter(regs);

 	get_debugreg(dr6, 6);

@@ -695,7 +677,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	debug_stack_usage_dec();

 exit:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug);

@@ -747,21 +729,15 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)

 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	math_error(regs, error_code, X86_TRAP_MF);
-	exception_exit(prev_state);
 }

 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	math_error(regs, error_code, X86_TRAP_XF);
-	exception_exit(prev_state);
 }

 dotraplinkage void
@@ -773,9 +749,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	BUG_ON(use_eager_fpu());

 #ifdef CONFIG_MATH_EMULATION
@@ -786,7 +760,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)

 		info.regs = regs;
 		math_emulate(&info);
-		exception_exit(prev_state);
 		return;
 	}
 #endif
@@ -794,7 +767,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 #ifdef CONFIG_X86_32
 	conditional_sti(regs);
 #endif
-	exception_exit(prev_state);
 }
 NOKPROBE_SYMBOL(do_device_not_available);

@@ -802,9 +774,8 @@ NOKPROBE_SYMBOL(do_device_not_available);
 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 {
 	siginfo_t info;
-	enum ctx_state prev_state;

-	prev_state = exception_enter();
+	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	local_irq_enable();

 	info.si_signo = SIGILL;
@@ -816,7 +787,6 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 		do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
 			&info);
 	}
-	exception_exit(prev_state);
 }
 #endif

--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -248,7 +248,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)

 	data = cyc2ns_write_begin(cpu);

-	rdtscll(tsc_now);
+	tsc_now = rdtsc();
 	ns_now = cycles_2_ns(tsc_now);

 	/*
@@ -290,12 +290,20 @@ u64 native_sched_clock(void)
 	}

 	/* read the Time Stamp Counter: */
-	rdtscll(tsc_now);
+	tsc_now = rdtsc();

 	/* return the value in ns */
 	return cycles_2_ns(tsc_now);
 }

+/*
+ * Generate a sched_clock if you already have a TSC value.
+ */
+u64 native_sched_clock_from_tsc(u64 tsc)
+{
+	return cycles_2_ns(tsc);
+}
+
 /* We need to define a real function for sched_clock, to override the
   weak default version */
 #ifdef CONFIG_PARAVIRT
@@ -308,12 +316,6 @@ unsigned long long
 sched_clock(void) __attribute__((alias("native_sched_clock")));
 #endif

-unsigned long long native_read_tsc(void)
-{
-	return __native_read_tsc();
-}
-EXPORT_SYMBOL(native_read_tsc);
-
 int check_tsc_unstable(void)
 {
 	return tsc_unstable;
@@ -976,7 +978,7 @@ static struct clocksource clocksource_tsc;
 */
 static cycle_t read_tsc(struct clocksource *cs)
 {
-	return (cycle_t)get_cycles();
+	return (cycle_t)rdtsc_ordered();
 }

 /*
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -39,16 +39,15 @@ static cycles_t max_warp;
 static int nr_warps;

 /*
- * TSC-warp measurement loop running on both CPUs:
+ * TSC-warp measurement loop running on both CPUs.  This is not called
+ * if there is no TSC.
 */
 static void check_tsc_warp(unsigned int timeout)
 {
 	cycles_t start, now, prev, end;
 	int i;

-	rdtsc_barrier();
-	start = get_cycles();
-	rdtsc_barrier();
+	start = rdtsc_ordered();
 	/*
 	 * The measurement runs for 'timeout' msecs:
 	 */
@@ -63,9 +62,7 @@ static void check_tsc_warp(unsigned int timeout)
 		 */
 		arch_spin_lock(&sync_lock);
 		prev = last_tsc;
-		rdtsc_barrier();
-		now = get_cycles();
-		rdtsc_barrier();
+		now = rdtsc_ordered();
 		last_tsc = now;
 		arch_spin_unlock(&sync_lock);

@@ -126,7 +123,7 @@ void check_tsc_sync_source(int cpu)

 	/*
 	 * No need to check if we already know that the TSC is not
-	 * synchronized:
+	 * synchronized or if we have no TSC.
 	 */
 	if (unsynchronized_tsc())
 		return;
@@ -190,6 +187,7 @@ void check_tsc_sync_target(void)
 {
 	int cpus = 2;

+	/* Also aborts if there is no TSC. */
 	if (unsynchronized_tsc() || tsc_clocksource_reliable)
 		return;

--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -985,3 +985,12 @@ arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs

 	return -1;
 }
+
+bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx,
+				struct pt_regs *regs)
+{
+	if (ctx == RP_CHECK_CALL) /* sp was just decremented by "call" insn */
+		return regs->sp < ret->stack;
+	else
+		return regs->sp <= ret->stack;
+}
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -44,11 +44,14 @@
 #include <linux/ptrace.h>
 #include <linux/audit.h>
 #include <linux/stddef.h>
+#include <linux/slab.h>

 #include <asm/uaccess.h>
 #include <asm/io.h>
 #include <asm/tlbflush.h>
 #include <asm/irq.h>
+#include <asm/traps.h>
+#include <asm/vm86.h>

 /*
 * Known problems:
@@ -66,10 +69,6 @@
 */


-#define KVM86	((struct kernel_vm86_struct *)regs)
-#define VMPI	KVM86->vm86plus
-
-
 /*
 * 8- and 16-bit register defines..
 */
@@ -81,8 +80,8 @@
 /*
 * virtual flags (16 and 32-bit versions)
 */
-#define VFLAGS	(*(unsigned short *)&(current->thread.v86flags))
-#define VEFLAGS	(current->thread.v86flags)
+#define VFLAGS	(*(unsigned short *)&(current->thread.vm86->veflags))
+#define VEFLAGS	(current->thread.vm86->veflags)

 #define set_flags(X, new, mask) \
 ((X) = ((X) & ~(mask)) | ((new) & (mask)))
@@ -90,46 +89,13 @@
 #define SAFE_MASK	(0xDD5)
 #define RETURN_MASK	(0xDFF)

-/* convert kernel_vm86_regs to vm86_regs */
-static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
-				  const struct kernel_vm86_regs *regs)
-{
-	int ret = 0;
-
-	/*
-	 * kernel_vm86_regs is missing gs, so copy everything up to
-	 * (but not including) orig_eax, and then rest including orig_eax.
-	 */
-	ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax));
-	ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax,
-			    sizeof(struct kernel_vm86_regs) -
-			    offsetof(struct kernel_vm86_regs, pt.orig_ax));
-
-	return ret;
-}
-
-/* convert vm86_regs to kernel_vm86_regs */
-static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
-				    const struct vm86_regs __user *user,
-				    unsigned extra)
-{
-	int ret = 0;
-
-	/* copy ax-fs inclusive */
-	ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax));
-	/* copy orig_ax-__gsh+extra */
-	ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax,
-			      sizeof(struct kernel_vm86_regs) -
-			      offsetof(struct kernel_vm86_regs, pt.orig_ax) +
-			      extra);
-	return ret;
-}
-
-struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
+void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 {
 	struct tss_struct *tss;
-	struct pt_regs *ret;
-	unsigned long tmp;
+	struct task_struct *tsk = current;
+	struct vm86plus_struct __user *user;
+	struct vm86 *vm86 = current->thread.vm86;
+	long err = 0;

 	/*
 	 * This gets called from entry.S with interrupts disabled, but
@@ -138,31 +104,57 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	 */
 	local_irq_enable();

-	if (!current->thread.vm86_info) {
-		pr_alert("no vm86_info: BAD\n");
+	if (!vm86 || !vm86->user_vm86) {
+		pr_alert("no user_vm86: BAD\n");
 		do_exit(SIGSEGV);
 	}
-	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
-	tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
-	tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
-	if (tmp) {
-		pr_alert("could not access userspace vm86_info\n");
+	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
+	user = vm86->user_vm86;
+
+	if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
+		       sizeof(struct vm86plus_struct) :
+		       sizeof(struct vm86_struct))) {
+		pr_alert("could not access userspace vm86 info\n");
+		do_exit(SIGSEGV);
+	}
+
+	put_user_try {
+		put_user_ex(regs->pt.bx, &user->regs.ebx);
+		put_user_ex(regs->pt.cx, &user->regs.ecx);
+		put_user_ex(regs->pt.dx, &user->regs.edx);
+		put_user_ex(regs->pt.si, &user->regs.esi);
+		put_user_ex(regs->pt.di, &user->regs.edi);
+		put_user_ex(regs->pt.bp, &user->regs.ebp);
+		put_user_ex(regs->pt.ax, &user->regs.eax);
+		put_user_ex(regs->pt.ip, &user->regs.eip);
+		put_user_ex(regs->pt.cs, &user->regs.cs);
+		put_user_ex(regs->pt.flags, &user->regs.eflags);
+		put_user_ex(regs->pt.sp, &user->regs.esp);
+		put_user_ex(regs->pt.ss, &user->regs.ss);
+		put_user_ex(regs->es, &user->regs.es);
+		put_user_ex(regs->ds, &user->regs.ds);
+		put_user_ex(regs->fs, &user->regs.fs);
+		put_user_ex(regs->gs, &user->regs.gs);
+
+		put_user_ex(vm86->screen_bitmap, &user->screen_bitmap);
+	} put_user_catch(err);
+	if (err) {
+		pr_alert("could not access userspace vm86 info\n");
 		do_exit(SIGSEGV);
 	}

 	tss = &per_cpu(cpu_tss, get_cpu());
-	current->thread.sp0 = current->thread.saved_sp0;
-	current->thread.sysenter_cs = __KERNEL_CS;
-	load_sp0(tss, &current->thread);
-	current->thread.saved_sp0 = 0;
+	tsk->thread.sp0 = vm86->saved_sp0;
+	tsk->thread.sysenter_cs = __KERNEL_CS;
+	load_sp0(tss, &tsk->thread);
+	vm86->saved_sp0 = 0;
 	put_cpu();

-	ret = KVM86->regs32;
+	memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));

-	ret->fs = current->thread.saved_fs;
-	set_user_gs(ret, current->thread.saved_gs);
+	lazy_load_gs(vm86->regs32.gs);

-	return ret;
+	regs->pt.ax = retval;
 }

 static void mark_screen_rdonly(struct mm_struct *mm)
@@ -200,45 +192,16 @@ out:


 static int do_vm86_irq_handling(int subfunction, int irqnumber);
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);

-SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86)
+SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86)
 {
-	struct kernel_vm86_struct info; /* declare this _on top_,
-					 * this avoids wasting of stack space.
-					 * This remains on the stack until we
-					 * return to 32 bit user space.
-					 */
-	struct task_struct *tsk = current;
-	int tmp;
-
-	if (tsk->thread.saved_sp0)
-		return -EPERM;
-	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
-				       offsetof(struct kernel_vm86_struct, vm86plus) -
-				       sizeof(info.regs));
-	if (tmp)
-		return -EFAULT;
-	memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
-	info.regs32 = current_pt_regs();
-	tsk->thread.vm86_info = v86;
-	do_sys_vm86(&info, tsk);
-	return 0;	/* we never return here */
+	return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false);
 }


 SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 {
-	struct kernel_vm86_struct info; /* declare this _on top_,
-					 * this avoids wasting of stack space.
-					 * This remains on the stack until we
-					 * return to 32 bit user space.
-					 */
-	struct task_struct *tsk;
-	int tmp;
-	struct vm86plus_struct __user *v86;
-
-	tsk = current;
 	switch (cmd) {
 	case VM86_REQUEST_IRQ:
 	case VM86_FREE_IRQ:
@@ -256,114 +219,133 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 	}

 	/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
-	if (tsk->thread.saved_sp0)
-		return -EPERM;
-	v86 = (struct vm86plus_struct __user *)arg;
-	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
-				       offsetof(struct kernel_vm86_struct, regs32) -
-				       sizeof(info.regs));
-	if (tmp)
-		return -EFAULT;
-	info.regs32 = current_pt_regs();
-	info.vm86plus.is_vm86pus = 1;
-	tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
-	do_sys_vm86(&info, tsk);
-	return 0;	/* we never return here */
+	return do_sys_vm86((struct vm86plus_struct __user *) arg, true);
 }


-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 {
 	struct tss_struct *tss;
-/*
- * make sure the vm86() system call doesn't try to do anything silly
- */
-	info->regs.pt.ds = 0;
-	info->regs.pt.es = 0;
-	info->regs.pt.fs = 0;
-#ifndef CONFIG_X86_32_LAZY_GS
-	info->regs.pt.gs = 0;
-#endif
+	struct task_struct *tsk = current;
+	struct vm86 *vm86 = tsk->thread.vm86;
+	struct kernel_vm86_regs vm86regs;
+	struct pt_regs *regs = current_pt_regs();
+	unsigned long err = 0;
+
+	if (!vm86) {
+		if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL)))
+			return -ENOMEM;
+		tsk->thread.vm86 = vm86;
+	}
+	if (vm86->saved_sp0)
+		return -EPERM;
+
+	if (!access_ok(VERIFY_READ, user_vm86, plus ?
+		       sizeof(struct vm86_struct) :
+		       sizeof(struct vm86plus_struct)))
+		return -EFAULT;
+
+	memset(&vm86regs, 0, sizeof(vm86regs));
+	get_user_try {
+		unsigned short seg;
+		get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx);
+		get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx);
+		get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx);
+		get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi);
+		get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi);
+		get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp);
+		get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax);
+		get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip);
+		get_user_ex(seg, &user_vm86->regs.cs);
+		vm86regs.pt.cs = seg;
+		get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags);
+		get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp);
+		get_user_ex(seg, &user_vm86->regs.ss);
+		vm86regs.pt.ss = seg;
+		get_user_ex(vm86regs.es, &user_vm86->regs.es);
+		get_user_ex(vm86regs.ds, &user_vm86->regs.ds);
+		get_user_ex(vm86regs.fs, &user_vm86->regs.fs);
+		get_user_ex(vm86regs.gs, &user_vm86->regs.gs);
+
+		get_user_ex(vm86->flags, &user_vm86->flags);
+		get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap);
+		get_user_ex(vm86->cpu_type, &user_vm86->cpu_type);
+	} get_user_catch(err);
+	if (err)
+		return err;
+
+	if (copy_from_user(&vm86->int_revectored,
+			   &user_vm86->int_revectored,
+			   sizeof(struct revectored_struct)))
+		return -EFAULT;
+	if (copy_from_user(&vm86->int21_revectored,
+			   &user_vm86->int21_revectored,
+			   sizeof(struct revectored_struct)))
+		return -EFAULT;
+	if (plus) {
+		if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus,
+				   sizeof(struct vm86plus_info_struct)))
+			return -EFAULT;
+		vm86->vm86plus.is_vm86pus = 1;
+	} else
+		memset(&vm86->vm86plus, 0,
+		       sizeof(struct vm86plus_info_struct));
+
+	memcpy(&vm86->regs32, regs, sizeof(struct pt_regs));
+	vm86->user_vm86 = user_vm86;

 /*
 * The flags register is also special: we cannot trust that the user
 * has set it up safely, so this makes sure interrupt etc flags are
 * inherited from protected mode.
 */
-	VEFLAGS = info->regs.pt.flags;
-	info->regs.pt.flags &= SAFE_MASK;
-	info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
-	info->regs.pt.flags |= X86_VM_MASK;
+	VEFLAGS = vm86regs.pt.flags;
+	vm86regs.pt.flags &= SAFE_MASK;
+	vm86regs.pt.flags |= regs->flags & ~SAFE_MASK;
+	vm86regs.pt.flags |= X86_VM_MASK;

-	switch (info->cpu_type) {
+	vm86regs.pt.orig_ax = regs->orig_ax;
+
+	switch (vm86->cpu_type) {
 	case CPU_286:
-		tsk->thread.v86mask = 0;
+		vm86->veflags_mask = 0;
 		break;
 	case CPU_386:
-		tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	case CPU_486:
-		tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	default:
-		tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	}

 /*
- * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
+ * Save old state
 */
-	info->regs32->ax = VM86_SIGNAL;
-	tsk->thread.saved_sp0 = tsk->thread.sp0;
-	tsk->thread.saved_fs = info->regs32->fs;
-	tsk->thread.saved_gs = get_user_gs(info->regs32);
+	vm86->saved_sp0 = tsk->thread.sp0;
+	lazy_save_gs(vm86->regs32.gs);

 	tss = &per_cpu(cpu_tss, get_cpu());
-	tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
+	/* make room for real-mode segments */
+	tsk->thread.sp0 += 16;
 	if (cpu_has_sep)
 		tsk->thread.sysenter_cs = 0;
 	load_sp0(tss, &tsk->thread);
 	put_cpu();

-	tsk->thread.screen_bitmap = info->screen_bitmap;
-	if (info->flags & VM86_SCREEN_BITMAP)
+	if (vm86->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);

-	/*call __audit_syscall_exit since we do not exit via the normal paths */
-#ifdef CONFIG_AUDITSYSCALL
-	if (unlikely(current->audit_context))
-		__audit_syscall_exit(1, 0);
-#endif
-
-	__asm__ __volatile__(
-		"movl %0,%%esp\n\t"
-		"movl %1,%%ebp\n\t"
-#ifdef CONFIG_X86_32_LAZY_GS
-		"mov  %2, %%gs\n\t"
-#endif
-		"jmp resume_userspace"
-		: /* no outputs */
-		:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
-	/* we never return here */
-}
-
-static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval)
-{
-	struct pt_regs *regs32;
-
-	regs32 = save_v86_state(regs16);
-	regs32->ax = retval;
-	__asm__ __volatile__("movl %0,%%esp\n\t"
-		"movl %1,%%ebp\n\t"
-		"jmp resume_userspace"
-		: : "r" (regs32), "r" (current_thread_info()));
+	memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
+	force_iret();
+	return regs->ax;
 }

 static inline void set_IF(struct kernel_vm86_regs *regs)
 {
 	VEFLAGS |= X86_EFLAGS_VIF;
-	if (VEFLAGS & X86_EFLAGS_VIP)
-		return_to_32bit(regs, VM86_STI);
 }

 static inline void clear_IF(struct kernel_vm86_regs *regs)
@@ -395,7 +377,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs)

 static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
 {
-	set_flags(VEFLAGS, flags, current->thread.v86mask);
+	set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	if (flags & X86_EFLAGS_IF)
 		set_IF(regs);
@@ -405,7 +387,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs

 static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
 {
-	set_flags(VFLAGS, flags, current->thread.v86mask);
+	set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	if (flags & X86_EFLAGS_IF)
 		set_IF(regs);
@@ -420,7 +402,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
 	if (VEFLAGS & X86_EFLAGS_VIF)
 		flags |= X86_EFLAGS_IF;
 	flags |= X86_EFLAGS_IOPL;
-	return flags | (VEFLAGS & current->thread.v86mask);
+	return flags | (VEFLAGS & current->thread.vm86->veflags_mask);
 }

 static inline int is_revectored(int nr, struct revectored_struct *bitmap)
@@ -518,12 +500,13 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 {
 	unsigned long __user *intr_ptr;
 	unsigned long segoffs;
+	struct vm86 *vm86 = current->thread.vm86;

 	if (regs->pt.cs == BIOSSEG)
 		goto cannot_handle;
-	if (is_revectored(i, &KVM86->int_revectored))
+	if (is_revectored(i, &vm86->int_revectored))
 		goto cannot_handle;
-	if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored))
+	if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored))
 		goto cannot_handle;
 	intr_ptr = (unsigned long __user *) (i << 2);
 	if (get_user(segoffs, intr_ptr))
@@ -542,18 +525,16 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 	return;

 cannot_handle:
-	return_to_32bit(regs, VM86_INTx + (i << 8));
+	save_v86_state(regs, VM86_INTx + (i << 8));
 }

 int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
 {
-	if (VMPI.is_vm86pus) {
+	struct vm86 *vm86 = current->thread.vm86;
+
+	if (vm86->vm86plus.is_vm86pus) {
 		if ((trapno == 3) || (trapno == 1)) {
-			KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
-			/* setting this flag forces the code in entry_32.S to
-			   the path where we call save_v86_state() and change
-			   the stack pointer to KVM86->regs32 */
-			set_thread_flag(TIF_NOTIFY_RESUME);
+			save_v86_state(regs, VM86_TRAP + (trapno << 8));
 			return 0;
 		}
 		do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
@@ -574,16 +555,11 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 	unsigned char __user *ssp;
 	unsigned short ip, sp, orig_flags;
 	int data32, pref_done;
+	struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus;

 #define CHECK_IF_IN_TRAP \
-	if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
+	if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \
 		newflags |= X86_EFLAGS_TF
-#define VM86_FAULT_RETURN do { \
-	if (VMPI.force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
-		return_to_32bit(regs, VM86_PICRETURN); \
-	if (orig_flags & X86_EFLAGS_TF) \
-		handle_vm86_trap(regs, 0, 1); \
-	return; } while (0)

 	orig_flags = *(unsigned short *)&regs->pt.flags;

@@ -622,7 +598,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 			SP(regs) -= 2;
 		}
 		IP(regs) = ip;
-		VM86_FAULT_RETURN;
+		goto vm86_fault_return;

 	/* popf */
 	case 0x9d:
@@ -642,16 +618,18 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 		else
 			set_vflags_short(newflags, regs);

-		VM86_FAULT_RETURN;
+		goto check_vip;
 		}

 	/* int xx */
 	case 0xcd: {
 		int intno = popb(csp, ip, simulate_sigsegv);
 		IP(regs) = ip;
-		if (VMPI.vm86dbg_active) {
-			if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3])
-				return_to_32bit(regs, VM86_INTx + (intno << 8));
+		if (vmpi->vm86dbg_active) {
+			if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) {
+				save_v86_state(regs, VM86_INTx + (intno << 8));
+				return;
+			}
 		}
 		do_int(regs, intno, ssp, sp);
 		return;
@@ -682,14 +660,14 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 		} else {
 			set_vflags_short(newflags, regs);
 		}
-		VM86_FAULT_RETURN;
+		goto check_vip;
 		}

 	/* cli */
 	case 0xfa:
 		IP(regs) = ip;
 		clear_IF(regs);
-		VM86_FAULT_RETURN;
+		goto vm86_fault_return;

 	/* sti */
 	/*
@@ -701,14 +679,29 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 	case 0xfb:
 		IP(regs) = ip;
 		set_IF(regs);
-		VM86_FAULT_RETURN;
+		goto check_vip;

 	default:
-		return_to_32bit(regs, VM86_UNKNOWN);
+		save_v86_state(regs, VM86_UNKNOWN);
 	}

 	return;

+check_vip:
+	if (VEFLAGS & X86_EFLAGS_VIP) {
+		save_v86_state(regs, VM86_STI);
+		return;
+	}
+
+vm86_fault_return:
+	if (vmpi->force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) {
+		save_v86_state(regs, VM86_PICRETURN);
+		return;
+	}
+	if (orig_flags & X86_EFLAGS_TF)
+		handle_vm86_trap(regs, 0, X86_TRAP_DB);
+	return;
+
 simulate_sigsegv:
 	/* FIXME: After a long discussion with Stas we finally
 	 *        agreed, that this is wrong. Here we should
@@ -720,7 +713,7 @@ simulate_sigsegv:
 	 *        should be a mixture of the two, but how do we
 	 *        get the information? [KD]
 	 */
-	return_to_32bit(regs, VM86_UNKNOWN);
+	save_v86_state(regs, VM86_UNKNOWN);
 }

 /* ---------------- vm86 special IRQ passing stuff ----------------- */