Merge branch 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull perf updates from Ingo Molnar: "Kernel side changes: - Intel Knights Landing support. (Harish Chegondi) - Intel Broadwell-EP uncore PMU support. (Kan Liang) - Core code improvements. (Peter Zijlstra.) - Event filter, LBR and PEBS fixes. (Stephane Eranian) - Enable cycles:pp on Intel Atom. (Stephane Eranian) - Add cycles:ppp support for Skylake. (Andi Kleen) - Various x86 NMI overhead optimizations. (Andi Kleen) - Intel PT enhancements. (Takao Indoh) - AMD cache events fix. (Vince Weaver) Tons of tooling changes: - Show random perf tool tips in the 'perf report' bottom line (Namhyung Kim) - perf report now defaults to --group if the perf.data file has grouped events, try it with: # perf record -e '{cycles,instructions}' -a sleep 1 [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 1.093 MB perf.data (1247 samples) ] # perf report # Samples: 1K of event 'anon group { cycles, instructions }' # Event count (approx.): 1955219195 # # Overhead Command Shared Object Symbol 2.86% 0.22% swapper [kernel.kallsyms] [k] intel_idle 1.05% 0.33% firefox libxul.so [.] js::SetObjectElement 1.05% 0.00% kworker/0:3 [kernel.kallsyms] [k] gen6_ring_get_seqno 0.88% 0.17% chrome chrome [.] 0x0000000000ee27ab 0.65% 0.86% firefox libxul.so [.] js::ValueToId<(js::AllowGC)1> 0.64% 0.23% JS Helper libxul.so [.] js::SplayTree<js::jit::LiveRange*, js::jit::LiveRange>::splay 0.62% 1.27% firefox libxul.so [.] js::GetIterator 0.61% 1.74% firefox libxul.so [.] js::NativeSetProperty 0.61% 0.31% firefox libxul.so [.] js::SetPropertyByDefining - Introduce the 'perf stat record/report' workflow: Generate perf.data files from 'perf stat', to tap into the scripting capabilities perf has instead of defining a 'perf stat' specific scripting support to calculate event ratios, etc. Simple example: $ perf stat record -e cycles usleep 1 Performance counter stats for 'usleep 1': 1,134,996 cycles 0.000670644 seconds time elapsed $ perf stat report Performance counter stats for '/home/acme/bin/perf stat record -e cycles usleep 1': 1,134,996 cycles 0.000670644 seconds time elapsed $ It generates PERF_RECORD_ userspace records to store the details: $ perf report -D | grep PERF_RECORD 0xf0 [0x28]: PERF_RECORD_THREAD_MAP nr: 1 thread: 27637 0x118 [0x12]: PERF_RECORD_CPU_MAP nr: 1 cpu: 65535 0x12a [0x40]: PERF_RECORD_STAT_CONFIG 0x16a [0x30]: PERF_RECORD_STAT -1 -1 0x19a [0x40]: PERF_RECORD_MMAP -1/0: [0xffffffff81000000(0x1f000000) @ 0xffffffff81000000]: x [kernel.kallsyms]_text 0x1da [0x18]: PERF_RECORD_STAT_ROUND [acme@ssdandy linux]$ An effort was made to make perf.data files generated like this to not generate cryptic messages when processed by older tools. The 'perf script' bits need rebasing, will go up later. - Make command line options always available, even when they depend on some feature being enabled, warning the user about use of such options (Wang Nan) - Support hw breakpoint events (mem:0xAddress) in the default output mode in 'perf script' (Wang Nan) - Fixes and improvements for supporting annotating ARM binaries, support ARM call and jump instructions, more work needed to have arch specific stuff separated into tools/perf/arch/*/annotate/ (Russell King) - Add initial 'perf config' command, for now just with a --list command to the contents of the configuration file in use and a basic man page describing its format, commands for doing edits and detailed documentation are being reviewed and proof-read. (Taeung Song) - Allows BPF scriptlets specify arguments to be fetched using DWARF info, using a prologue generated at compile/build time (He Kuang, Wang Nan) - Allow attaching BPF scriptlets to module symbols (Wang Nan) - Allow attaching BPF scriptlets to userspace code using uprobe (Wang Nan) - BPF programs now can specify 'perf probe' tunables via its section name, separating key=val values using semicolons (Wang Nan) Testing some of these new BPF features: Use case: get callchains when receiving SSL packets, filter then in the kernel, at arbitrary place. # cat ssl.bpf.c #define SEC(NAME) __attribute__((section(NAME), used)) struct pt_regs; SEC("func=__inet_lookup_established hnum") int func(struct pt_regs *ctx, int err, unsigned short port) { return err == 0 && port == 443; } char _license[] SEC("license") = "GPL"; int _version SEC("version") = LINUX_VERSION_CODE; # # perf record -a -g -e ssl.bpf.c ^C[ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.787 MB perf.data (3 samples) ] # perf script | head -30 swapper 0 [000] 58783.268118: perf_bpf_probe:func: (ffffffff816a0f60) hnum=0x1bb 8a0f61 __inet_lookup_established (/lib/modules/4.3.0+/build/vmlinux) 896def ip_rcv_finish (/lib/modules/4.3.0+/build/vmlinux) 8976c2 ip_rcv (/lib/modules/4.3.0+/build/vmlinux) 855eba __netif_receive_skb_core (/lib/modules/4.3.0+/build/vmlinux) 8565d8 __netif_receive_skb (/lib/modules/4.3.0+/build/vmlinux) 8572a8 process_backlog (/lib/modules/4.3.0+/build/vmlinux) 856b11 net_rx_action (/lib/modules/4.3.0+/build/vmlinux) 2a284b __do_softirq (/lib/modules/4.3.0+/build/vmlinux) 2a2ba3 irq_exit (/lib/modules/4.3.0+/build/vmlinux) 96b7a4 do_IRQ (/lib/modules/4.3.0+/build/vmlinux) 969807 ret_from_intr (/lib/modules/4.3.0+/build/vmlinux) 2dede5 cpu_startup_entry (/lib/modules/4.3.0+/build/vmlinux) 95d5bc rest_init (/lib/modules/4.3.0+/build/vmlinux) 1163ffa start_kernel ([kernel.vmlinux].init.text) 11634d7 x86_64_start_reservations ([kernel.vmlinux].init.text) 1163623 x86_64_start_kernel ([kernel.vmlinux].init.text) qemu-system-x86 9178 [003] 58785.792417: perf_bpf_probe:func: (ffffffff816a0f60) hnum=0x1bb 8a0f61 __inet_lookup_established (/lib/modules/4.3.0+/build/vmlinux) 896def ip_rcv_finish (/lib/modules/4.3.0+/build/vmlinux) 8976c2 ip_rcv (/lib/modules/4.3.0+/build/vmlinux) 855eba __netif_receive_skb_core (/lib/modules/4.3.0+/build/vmlinux) 8565d8 __netif_receive_skb (/lib/modules/4.3.0+/build/vmlinux) 856660 netif_receive_skb_internal (/lib/modules/4.3.0+/build/vmlinux) 8566ec netif_receive_skb_sk (/lib/modules/4.3.0+/build/vmlinux) 430a br_handle_frame_finish ([bridge]) 48bc br_handle_frame ([bridge]) 855f44 __netif_receive_skb_core (/lib/modules/4.3.0+/build/vmlinux) 8565d8 __netif_receive_skb (/lib/modules/4.3.0+/build/vmlinux) # - Use 'perf probe' various options to list functions, see what variables can be collected at any given point, experiment first collecting without a filter, then filter, use it together with 'perf trace', 'perf top', with or without callchains, if it explodes, please tell us! - Introduce a new callchain mode: "folded", that will list per line representations of all callchains for a give histogram entry, facilitating 'perf report' output processing by other tools, such as Brendan Gregg's flamegraph tools (Namhyung Kim) E.g: # perf report | grep -v ^# | head 18.37% 0.00% swapper [kernel.kallsyms] [k] cpu_startup_entry | ---cpu_startup_entry | |--12.07%--start_secondary | --6.30%--rest_init start_kernel x86_64_start_reservations x86_64_start_kernel # Becomes, in "folded" mode: # perf report -g folded | grep -v ^# | head -5 18.37% 0.00% swapper [kernel.kallsyms] [k] cpu_startup_entry 12.07% cpu_startup_entry;start_secondary 6.30% cpu_startup_entry;rest_init;start_kernel;x86_64_start_reservations;x86_64_start_kernel 16.90% 0.00% swapper [kernel.kallsyms] [k] call_cpuidle 11.23% call_cpuidle;cpu_startup_entry;start_secondary 5.67% call_cpuidle;cpu_startup_entry;rest_init;start_kernel;x86_64_start_reservations;x86_64_start_kernel 16.90% 0.00% swapper [kernel.kallsyms] [k] cpuidle_enter 11.23% cpuidle_enter;call_cpuidle;cpu_startup_entry;start_secondary 5.67% cpuidle_enter;call_cpuidle;cpu_startup_entry;rest_init;start_kernel;x86_64_start_reservations;x86_64_start_kernel 15.12% 0.00% swapper [kernel.kallsyms] [k] cpuidle_enter_state # The user can also select one of "count", "period" or "percent" as the first column. ... and lots of infrastructure enhancements, plus fixes and other changes, features I failed to list - see the shortlog and the git log for details" * 'perf-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (271 commits) perf evlist: Add --trace-fields option to show trace fields perf record: Store data mmaps for dwarf unwind perf libdw: Check for mmaps also in MAP__VARIABLE tree perf unwind: Check for mmaps also in MAP__VARIABLE tree perf unwind: Use find_map function in access_dso_mem perf evlist: Remove perf_evlist__(enable|disable)_event functions perf evlist: Make perf_evlist__open() open evsels with their cpus and threads (like perf record does) perf report: Show random usage tip on the help line perf hists: Export a couple of hist functions perf diff: Use perf_hpp__register_sort_field interface perf tools: Add overhead/overhead_children keys defaults via string perf tools: Remove list entry from struct sort_entry perf tools: Include all tools/lib directory for tags/cscope/TAGS targets perf script: Align event name properly perf tools: Add missing headers in perf's MANIFEST perf tools: Do not show trace command if it's not compiled in perf report: Change default to use event group view perf top: Decay periods in callchains tools lib: Move bitmap.[ch] from tools/perf/ to tools/{lib,include}/ tools lib: Sync tools/lib/find_bit.c with the kernel ...
2016-01-11 14:39:17 -08:00
parent 24af98c4cf 3eb9ede23b
commit 5cb52b5e16
240 changed files with 9158 additions and 1887 deletions
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -126,6 +126,37 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
 	return data.ret;
 }

+static void event_function_call(struct perf_event *event,
+				int (*active)(void *),
+				void (*inactive)(void *),
+				void *data)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct task_struct *task = ctx->task;
+
+	if (!task) {
+		cpu_function_call(event->cpu, active, data);
+		return;
+	}
+
+again:
+	if (!task_function_call(task, active, data))
+		return;
+
+	raw_spin_lock_irq(&ctx->lock);
+	if (ctx->is_active) {
+		/*
+		 * Reload the task pointer, it might have been changed by
+		 * a concurrent perf_event_context_sched_out().
+		 */
+		task = ctx->task;
+		raw_spin_unlock_irq(&ctx->lock);
+		goto again;
+	}
+	inactive(data);
+	raw_spin_unlock_irq(&ctx->lock);
+}
+
 #define EVENT_OWNER_KERNEL ((void *) -1)

 static bool is_kernel_event(struct perf_event *event)
@@ -1629,6 +1660,17 @@ struct remove_event {
 	bool detach_group;
 };

+static void ___perf_remove_from_context(void *info)
+{
+	struct remove_event *re = info;
+	struct perf_event *event = re->event;
+	struct perf_event_context *ctx = event->ctx;
+
+	if (re->detach_group)
+		perf_group_detach(event);
+	list_del_event(event, ctx);
+}
+
 /*
 * Cross CPU call to remove a performance event
 *
@@ -1656,7 +1698,6 @@ static int __perf_remove_from_context(void *info)
 	return 0;
 }

-
 /*
 * Remove the event from a task's (or a CPU's) list of events.
 *
@@ -1673,7 +1714,6 @@ static int __perf_remove_from_context(void *info)
 static void perf_remove_from_context(struct perf_event *event, bool detach_group)
 {
 	struct perf_event_context *ctx = event->ctx;
-	struct task_struct *task = ctx->task;
 	struct remove_event re = {
 		.event = event,
 		.detach_group = detach_group,
@@ -1681,44 +1721,8 @@ static void perf_remove_from_context(struct perf_event *event, bool detach_group

 	lockdep_assert_held(&ctx->mutex);

-	if (!task) {
-		/*
-		 * Per cpu events are removed via an smp call. The removal can
-		 * fail if the CPU is currently offline, but in that case we
-		 * already called __perf_remove_from_context from
-		 * perf_event_exit_cpu.
-		 */
-		cpu_function_call(event->cpu, __perf_remove_from_context, &re);
-		return;
-	}
-
-retry:
-	if (!task_function_call(task, __perf_remove_from_context, &re))
-		return;
-
-	raw_spin_lock_irq(&ctx->lock);
-	/*
-	 * If we failed to find a running task, but find the context active now
-	 * that we've acquired the ctx->lock, retry.
-	 */
-	if (ctx->is_active) {
-		raw_spin_unlock_irq(&ctx->lock);
-		/*
-		 * Reload the task pointer, it might have been changed by
-		 * a concurrent perf_event_context_sched_out().
-		 */
-		task = ctx->task;
-		goto retry;
-	}
-
-	/*
-	 * Since the task isn't running, its safe to remove the event, us
-	 * holding the ctx->lock ensures the task won't get scheduled in.
-	 */
-	if (detach_group)
-		perf_group_detach(event);
-	list_del_event(event, ctx);
-	raw_spin_unlock_irq(&ctx->lock);
+	event_function_call(event, __perf_remove_from_context,
+			    ___perf_remove_from_context, &re);
 }

 /*
@@ -1762,6 +1766,20 @@ int __perf_event_disable(void *info)
 	return 0;
 }

+void ___perf_event_disable(void *info)
+{
+	struct perf_event *event = info;
+
+	/*
+	 * Since we have the lock this context can't be scheduled
+	 * in, so we can change the state safely.
+	 */
+	if (event->state == PERF_EVENT_STATE_INACTIVE) {
+		update_group_times(event);
+		event->state = PERF_EVENT_STATE_OFF;
+	}
+}
+
 /*
 * Disable a event.
 *
@@ -1778,43 +1796,16 @@ int __perf_event_disable(void *info)
 static void _perf_event_disable(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
-	struct task_struct *task = ctx->task;
-
-	if (!task) {
-		/*
-		 * Disable the event on the cpu that it's on
-		 */
-		cpu_function_call(event->cpu, __perf_event_disable, event);
-		return;
-	}
-
-retry:
-	if (!task_function_call(task, __perf_event_disable, event))
-		return;

 	raw_spin_lock_irq(&ctx->lock);
-	/*
-	 * If the event is still active, we need to retry the cross-call.
-	 */
-	if (event->state == PERF_EVENT_STATE_ACTIVE) {
+	if (event->state <= PERF_EVENT_STATE_OFF) {
 		raw_spin_unlock_irq(&ctx->lock);
-		/*
-		 * Reload the task pointer, it might have been changed by
-		 * a concurrent perf_event_context_sched_out().
-		 */
-		task = ctx->task;
-		goto retry;
-	}
-
-	/*
-	 * Since we have the lock this context can't be scheduled
-	 * in, so we can change the state safely.
-	 */
-	if (event->state == PERF_EVENT_STATE_INACTIVE) {
-		update_group_times(event);
-		event->state = PERF_EVENT_STATE_OFF;
+		return;
 	}
 	raw_spin_unlock_irq(&ctx->lock);
+
+	event_function_call(event, __perf_event_disable,
+			    ___perf_event_disable, event);
 }

 /*
@@ -2067,6 +2058,18 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
 		ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
 }

+static void ___perf_install_in_context(void *info)
+{
+	struct perf_event *event = info;
+	struct perf_event_context *ctx = event->ctx;
+
+	/*
+	 * Since the task isn't running, its safe to add the event, us holding
+	 * the ctx->lock ensures the task won't get scheduled in.
+	 */
+	add_event_to_ctx(event, ctx);
+}
+
 /*
 * Cross CPU call to install and enable a performance event
 *
@@ -2143,48 +2146,14 @@ perf_install_in_context(struct perf_event_context *ctx,
 			struct perf_event *event,
 			int cpu)
 {
-	struct task_struct *task = ctx->task;
-
 	lockdep_assert_held(&ctx->mutex);

 	event->ctx = ctx;
 	if (event->cpu != -1)
 		event->cpu = cpu;

-	if (!task) {
-		/*
-		 * Per cpu events are installed via an smp call and
-		 * the install is always successful.
-		 */
-		cpu_function_call(cpu, __perf_install_in_context, event);
-		return;
-	}
-
-retry:
-	if (!task_function_call(task, __perf_install_in_context, event))
-		return;
-
-	raw_spin_lock_irq(&ctx->lock);
-	/*
-	 * If we failed to find a running task, but find the context active now
-	 * that we've acquired the ctx->lock, retry.
-	 */
-	if (ctx->is_active) {
-		raw_spin_unlock_irq(&ctx->lock);
-		/*
-		 * Reload the task pointer, it might have been changed by
-		 * a concurrent perf_event_context_sched_out().
-		 */
-		task = ctx->task;
-		goto retry;
-	}
-
-	/*
-	 * Since the task isn't running, its safe to add the event, us holding
-	 * the ctx->lock ensures the task won't get scheduled in.
-	 */
-	add_event_to_ctx(event, ctx);
-	raw_spin_unlock_irq(&ctx->lock);
+	event_function_call(event, __perf_install_in_context,
+			    ___perf_install_in_context, event);
 }

 /*
@@ -2287,6 +2256,11 @@ unlock:
 	return 0;
 }

+void ___perf_event_enable(void *info)
+{
+	__perf_event_mark_enabled((struct perf_event *)info);
+}
+
 /*
 * Enable a event.
 *
@@ -2299,58 +2273,26 @@ unlock:
 static void _perf_event_enable(struct perf_event *event)
 {
 	struct perf_event_context *ctx = event->ctx;
-	struct task_struct *task = ctx->task;
-
-	if (!task) {
-		/*
-		 * Enable the event on the cpu that it's on
-		 */
-		cpu_function_call(event->cpu, __perf_event_enable, event);
-		return;
-	}

 	raw_spin_lock_irq(&ctx->lock);
-	if (event->state >= PERF_EVENT_STATE_INACTIVE)
-		goto out;
+	if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+		raw_spin_unlock_irq(&ctx->lock);
+		return;
+	}

 	/*
 	 * If the event is in error state, clear that first.
-	 * That way, if we see the event in error state below, we
-	 * know that it has gone back into error state, as distinct
-	 * from the task having been scheduled away before the
-	 * cross-call arrived.
+	 *
+	 * That way, if we see the event in error state below, we know that it
+	 * has gone back into error state, as distinct from the task having
+	 * been scheduled away before the cross-call arrived.
 	 */
 	if (event->state == PERF_EVENT_STATE_ERROR)
 		event->state = PERF_EVENT_STATE_OFF;
-
-retry:
-	if (!ctx->is_active) {
-		__perf_event_mark_enabled(event);
-		goto out;
-	}
-
 	raw_spin_unlock_irq(&ctx->lock);

-	if (!task_function_call(task, __perf_event_enable, event))
-		return;
-
-	raw_spin_lock_irq(&ctx->lock);
-
-	/*
-	 * If the context is active and the event is still off,
-	 * we need to retry the cross-call.
-	 */
-	if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
-		/*
-		 * task could have been flipped by a concurrent
-		 * perf_event_context_sched_out()
-		 */
-		task = ctx->task;
-		goto retry;
-	}
-
-out:
-	raw_spin_unlock_irq(&ctx->lock);
+	event_function_call(event, __perf_event_enable,
+			    ___perf_event_enable, event);
 }

 /*
@@ -4149,6 +4091,22 @@ struct period_event {
 	u64 value;
 };

+static void ___perf_event_period(void *info)
+{
+	struct period_event *pe = info;
+	struct perf_event *event = pe->event;
+	u64 value = pe->value;
+
+	if (event->attr.freq) {
+		event->attr.sample_freq = value;
+	} else {
+		event->attr.sample_period = value;
+		event->hw.sample_period = value;
+	}
+
+	local64_set(&event->hw.period_left, 0);
+}
+
 static int __perf_event_period(void *info)
 {
 	struct period_event *pe = info;
@@ -4185,8 +4143,6 @@ static int __perf_event_period(void *info)
 static int perf_event_period(struct perf_event *event, u64 __user *arg)
 {
 	struct period_event pe = { .event = event, };
-	struct perf_event_context *ctx = event->ctx;
-	struct task_struct *task;
 	u64 value;

 	if (!is_sampling_event(event))
@@ -4201,34 +4157,10 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
 	if (event->attr.freq && value > sysctl_perf_event_sample_rate)
 		return -EINVAL;

-	task = ctx->task;
 	pe.value = value;

-	if (!task) {
-		cpu_function_call(event->cpu, __perf_event_period, &pe);
-		return 0;
-	}
-
-retry:
-	if (!task_function_call(task, __perf_event_period, &pe))
-		return 0;
-
-	raw_spin_lock_irq(&ctx->lock);
-	if (ctx->is_active) {
-		raw_spin_unlock_irq(&ctx->lock);
-		task = ctx->task;
-		goto retry;
-	}
-
-	if (event->attr.freq) {
-		event->attr.sample_freq = value;
-	} else {
-		event->attr.sample_period = value;
-		event->hw.sample_period = value;
-	}
-
-	local64_set(&event->hw.period_left, 0);
-	raw_spin_unlock_irq(&ctx->lock);
+	event_function_call(event, __perf_event_period,
+			    ___perf_event_period, &pe);

 	return 0;
 }