Merge branch 'linus' into sched/core, to pick up fixes

Signed-off-by: Ingo Molnar <mingo@kernel.org>
This commit is contained in:
Ingo Molnar
2017-08-25 11:07:13 +02:00
483 changed files with 3902 additions and 1807 deletions

View File

@@ -66,7 +66,7 @@ static struct fsnotify_group *audit_watch_group;
/* fsnotify events we care about. */
#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
FS_MOVE_SELF | FS_EVENT_ON_CHILD)
FS_MOVE_SELF | FS_EVENT_ON_CHILD | FS_UNMOUNT)
static void audit_free_parent(struct audit_parent *parent)
{
@@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule)
list_del(&krule->rlist);
if (list_empty(&watch->rules)) {
/*
* audit_remove_watch() drops our reference to 'parent' which
* can get freed. Grab our own reference to be safe.
*/
audit_get_parent(parent);
audit_remove_watch(watch);
if (list_empty(&parent->watches)) {
audit_get_parent(parent);
if (list_empty(&parent->watches))
fsnotify_destroy_mark(&parent->mark, audit_watch_group);
audit_put_parent(parent);
}
audit_put_parent(parent);
}
}

View File

@@ -2217,6 +2217,33 @@ static int group_can_go_on(struct perf_event *event,
return can_add_hw;
}
/*
* Complement to update_event_times(). This computes the tstamp_* values to
* continue 'enabled' state from @now, and effectively discards the time
* between the prior tstamp_stopped and now (as we were in the OFF state, or
* just switched (context) time base).
*
* This further assumes '@event->state == INACTIVE' (we just came from OFF) and
* cannot have been scheduled in yet. And going into INACTIVE state means
* '@event->tstamp_stopped = @now'.
*
* Thus given the rules of update_event_times():
*
* total_time_enabled = tstamp_stopped - tstamp_enabled
* total_time_running = tstamp_stopped - tstamp_running
*
* We can insert 'tstamp_stopped == now' and reverse them to compute new
* tstamp_* values.
*/
static void __perf_event_enable_time(struct perf_event *event, u64 now)
{
WARN_ON_ONCE(event->state != PERF_EVENT_STATE_INACTIVE);
event->tstamp_stopped = now;
event->tstamp_enabled = now - event->total_time_enabled;
event->tstamp_running = now - event->total_time_running;
}
static void add_event_to_ctx(struct perf_event *event,
struct perf_event_context *ctx)
{
@@ -2224,9 +2251,12 @@ static void add_event_to_ctx(struct perf_event *event,
list_add_event(event, ctx);
perf_group_attach(event);
event->tstamp_enabled = tstamp;
event->tstamp_running = tstamp;
event->tstamp_stopped = tstamp;
/*
* We can be called with event->state == STATE_OFF when we create with
* .disabled = 1. In that case the IOC_ENABLE will call this function.
*/
if (event->state == PERF_EVENT_STATE_INACTIVE)
__perf_event_enable_time(event, tstamp);
}
static void ctx_sched_out(struct perf_event_context *ctx,
@@ -2471,10 +2501,11 @@ static void __perf_event_mark_enabled(struct perf_event *event)
u64 tstamp = perf_event_time(event);
event->state = PERF_EVENT_STATE_INACTIVE;
event->tstamp_enabled = tstamp - event->total_time_enabled;
__perf_event_enable_time(event, tstamp);
list_for_each_entry(sub, &event->sibling_list, group_entry) {
/* XXX should not be > INACTIVE if event isn't */
if (sub->state >= PERF_EVENT_STATE_INACTIVE)
sub->tstamp_enabled = tstamp - sub->total_time_enabled;
__perf_event_enable_time(sub, tstamp);
}
}
@@ -5090,7 +5121,7 @@ static void perf_mmap_open(struct vm_area_struct *vma)
atomic_inc(&event->rb->aux_mmap_count);
if (event->pmu->event_mapped)
event->pmu->event_mapped(event);
event->pmu->event_mapped(event, vma->vm_mm);
}
static void perf_pmu_output_stop(struct perf_event *event);
@@ -5113,7 +5144,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
unsigned long size = perf_data_size(rb);
if (event->pmu->event_unmapped)
event->pmu->event_unmapped(event);
event->pmu->event_unmapped(event, vma->vm_mm);
/*
* rb->aux_mmap_count will always drop before rb->mmap_count and
@@ -5411,7 +5442,7 @@ aux_unlock:
vma->vm_ops = &perf_mmap_vmops;
if (event->pmu->event_mapped)
event->pmu->event_mapped(event);
event->pmu->event_mapped(event, vma->vm_mm);
return ret;
}

View File

@@ -807,7 +807,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_aio(mm);
mm_init_owner(mm, p);
mmu_notifier_mm_init(mm);
clear_tlb_flush_pending(mm);
init_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif

View File

@@ -1000,7 +1000,7 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
unsigned long flags;
unsigned long flags, trigger, tmp;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
@@ -1014,6 +1014,8 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
irq_settings_clr_and_set(desc, clr, set);
trigger = irqd_get_trigger_type(&desc->irq_data);
irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
if (irq_settings_has_no_balance_set(desc))
@@ -1025,7 +1027,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
if (irq_settings_is_level(desc))
irqd_set(&desc->irq_data, IRQD_LEVEL);
irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
tmp = irq_settings_get_trigger_mask(desc);
if (tmp != IRQ_TYPE_NONE)
trigger = tmp;
irqd_set(&desc->irq_data, trigger);
irq_put_desc_unlock(desc, flags);
}

View File

@@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
struct irq_data *data = irq_get_irq_data(irq);
struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
if (!data || !ipimask || cpu > nr_cpu_ids)
if (!data || !ipimask || cpu >= nr_cpu_ids)
return INVALID_HWIRQ;
if (!cpumask_test_cpu(cpu, ipimask))
@@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
if (!chip->ipi_send_single && !chip->ipi_send_mask)
return -EINVAL;
if (cpu > nr_cpu_ids)
if (cpu >= nr_cpu_ids)
return -EINVAL;
if (dest) {

View File

@@ -70,6 +70,18 @@ static DECLARE_RWSEM(umhelper_sem);
static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
/*
* This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
* running at the same time without returning. When this happens we
* believe you've somehow ended up with a recursive module dependency
* creating a loop.
*
* We have no option but to fail.
*
* Userspace should proactively try to detect and prevent these.
*/
#define MAX_KMOD_ALL_BUSY_TIMEOUT 5
/*
modprobe_path is set via /proc/sys.
*/
@@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...)
pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
atomic_read(&kmod_concurrent_max),
MAX_KMOD_CONCURRENT, module_name);
wait_event_interruptible(kmod_wq,
atomic_dec_if_positive(&kmod_concurrent_max) >= 0);
ret = wait_event_killable_timeout(kmod_wq,
atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
if (!ret) {
pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
return -ETIME;
} else if (ret == -ERESTARTSYS) {
pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
return ret;
}
}
trace_module_request(module_name, wait, _RET_IP_);

View File

@@ -527,8 +527,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (!ns)
ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
if (type != PIDTYPE_PID)
if (type != PIDTYPE_PID) {
if (type == __PIDTYPE_TGID)
type = PIDTYPE_PID;
task = task->group_leader;
}
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
}
rcu_read_unlock();
@@ -537,12 +540,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
}
EXPORT_SYMBOL(__task_pid_nr_ns);
pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
{
return pid_nr_ns(task_tgid(tsk), ns);
}
EXPORT_SYMBOL(task_tgid_nr_ns);
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));

View File

@@ -1650,7 +1650,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
{
unsigned long size;
size = global_page_state(NR_SLAB_RECLAIMABLE)
size = global_node_page_state(NR_SLAB_RECLAIMABLE)
+ global_node_page_state(NR_ACTIVE_ANON)
+ global_node_page_state(NR_INACTIVE_ANON)
+ global_node_page_state(NR_ACTIVE_FILE)

View File

@@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
recalc_sigpending_and_wake(t);
}
}
if (action->sa.sa_handler == SIG_DFL)
/*
* Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
* debugging to leave init killable.
*/
if (action->sa.sa_handler == SIG_DFL && !t->ptrace)
t->signal->flags &= ~SIGNAL_UNKILLABLE;
ret = specific_send_sig_info(sig, info, t);
spin_unlock_irqrestore(&t->sighand->siglock, flags);

View File

@@ -204,10 +204,36 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
fmt_cnt++;
}
return __trace_printk(1/* fake ip will not be printed */, fmt,
mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1,
mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2,
mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3);
/* Horrid workaround for getting va_list handling working with different
* argument type combinations generically for 32 and 64 bit archs.
*/
#define __BPF_TP_EMIT() __BPF_ARG3_TP()
#define __BPF_TP(...) \
__trace_printk(1 /* Fake ip will not be printed. */, \
fmt, ##__VA_ARGS__)
#define __BPF_ARG1_TP(...) \
((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
? __BPF_TP(arg1, ##__VA_ARGS__) \
: ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \
? __BPF_TP((long)arg1, ##__VA_ARGS__) \
: __BPF_TP((u32)arg1, ##__VA_ARGS__)))
#define __BPF_ARG2_TP(...) \
((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \
? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \
: ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \
? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \
: __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
#define __BPF_ARG3_TP(...) \
((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \
? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \
: ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \
? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \
: __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
return __BPF_TP_EMIT();
}
static const struct bpf_func_proto bpf_trace_printk_proto = {

View File

@@ -889,6 +889,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
function_profile_call(trace->func, 0, NULL, NULL);
/* If function graph is shutting down, ret_stack can be NULL */
if (!current->ret_stack)
return 0;
if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
current->ret_stack[index].subtime = 0;

View File

@@ -4386,15 +4386,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* the page that was allocated, with the read page of the buffer.
*
* Returns:
* The page allocated, or NULL on error.
* The page allocated, or ERR_PTR
*/
void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_per_cpu *cpu_buffer;
struct buffer_data_page *bpage = NULL;
unsigned long flags;
struct page *page;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return ERR_PTR(-ENODEV);
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
@@ -4412,7 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu)
page = alloc_pages_node(cpu_to_node(cpu),
GFP_KERNEL | __GFP_NORETRY, 0);
if (!page)
return NULL;
return ERR_PTR(-ENOMEM);
bpage = page_address(page);
@@ -4467,8 +4471,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
*
* for example:
* rpage = ring_buffer_alloc_read_page(buffer, cpu);
* if (!rpage)
* return error;
* if (IS_ERR(rpage))
* return PTR_ERR(rpage);
* ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
* if (ret >= 0)
* process_page(rpage, ret);

View File

@@ -113,7 +113,7 @@ static enum event_status read_page(int cpu)
int i;
bpage = ring_buffer_alloc_read_page(buffer, cpu);
if (!bpage)
if (IS_ERR(bpage))
return EVENT_DROPPED;
ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);

View File

@@ -6598,7 +6598,7 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
ssize_t ret;
ssize_t ret = 0;
ssize_t size;
if (!count)
@@ -6612,10 +6612,15 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
if (!info->spare) {
info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
iter->cpu_file);
info->spare_cpu = iter->cpu_file;
if (IS_ERR(info->spare)) {
ret = PTR_ERR(info->spare);
info->spare = NULL;
} else {
info->spare_cpu = iter->cpu_file;
}
}
if (!info->spare)
return -ENOMEM;
return ret;
/* Do we have previous read data to read? */
if (info->read < PAGE_SIZE)
@@ -6790,8 +6795,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
ref->ref = 1;
ref->buffer = iter->trace_buffer->buffer;
ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
if (!ref->page) {
ret = -ENOMEM;
if (IS_ERR(ref->page)) {
ret = PTR_ERR(ref->page);
ref->page = NULL;
kfree(ref);
break;
}
@@ -8293,6 +8299,7 @@ __init static int tracer_alloc_buffers(void)
if (ret < 0)
goto out_free_cpumask;
/* Used for event triggers */
ret = -ENOMEM;
temp_buffer = ring_buffer_alloc(PAGE_SIZE, RB_FL_OVERWRITE);
if (!temp_buffer)
goto out_rm_hp_state;
@@ -8407,4 +8414,4 @@ __init static int clear_boot_tracer(void)
}
fs_initcall(tracer_init_tracefs);
late_initcall(clear_boot_tracer);
late_initcall_sync(clear_boot_tracer);

View File

@@ -1959,6 +1959,10 @@ static int create_filter(struct trace_event_call *call,
if (err && set_str)
append_filter_err(ps, filter);
}
if (err && !set_str) {
free_event_filter(filter);
filter = NULL;
}
create_filter_finish(ps);
*filterp = filter;

View File

@@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a)
if (!a)
return;
if (!a->pages) {
kfree(a);
return;
}
if (!a->pages)
goto free;
for (i = 0; i < a->n_pages; i++) {
if (!a->pages[i])
break;
free_page((unsigned long)a->pages[i]);
}
kfree(a->pages);
free:
kfree(a);
}
struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,

View File

@@ -240,6 +240,7 @@ static void set_sample_period(void)
* hardlockup detector generates a warning
*/
sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
watchdog_update_hrtimer_threshold(sample_period);
}
/* Commands for resetting the watchdog */

View File

@@ -37,6 +37,62 @@ void arch_touch_nmi_watchdog(void)
}
EXPORT_SYMBOL(arch_touch_nmi_watchdog);
#ifdef CONFIG_HARDLOCKUP_CHECK_TIMESTAMP
static DEFINE_PER_CPU(ktime_t, last_timestamp);
static DEFINE_PER_CPU(unsigned int, nmi_rearmed);
static ktime_t watchdog_hrtimer_sample_threshold __read_mostly;
void watchdog_update_hrtimer_threshold(u64 period)
{
/*
* The hrtimer runs with a period of (watchdog_threshold * 2) / 5
*
* So it runs effectively with 2.5 times the rate of the NMI
* watchdog. That means the hrtimer should fire 2-3 times before
* the NMI watchdog expires. The NMI watchdog on x86 is based on
* unhalted CPU cycles, so if Turbo-Mode is enabled the CPU cycles
* might run way faster than expected and the NMI fires in a
* smaller period than the one deduced from the nominal CPU
* frequency. Depending on the Turbo-Mode factor this might be fast
* enough to get the NMI period smaller than the hrtimer watchdog
* period and trigger false positives.
*
* The sample threshold is used to check in the NMI handler whether
* the minimum time between two NMI samples has elapsed. That
* prevents false positives.
*
* Set this to 4/5 of the actual watchdog threshold period so the
* hrtimer is guaranteed to fire at least once within the real
* watchdog threshold.
*/
watchdog_hrtimer_sample_threshold = period * 2;
}
static bool watchdog_check_timestamp(void)
{
ktime_t delta, now = ktime_get_mono_fast_ns();
delta = now - __this_cpu_read(last_timestamp);
if (delta < watchdog_hrtimer_sample_threshold) {
/*
* If ktime is jiffies based, a stalled timer would prevent
* jiffies from being incremented and the filter would look
* at a stale timestamp and never trigger.
*/
if (__this_cpu_inc_return(nmi_rearmed) < 10)
return false;
}
__this_cpu_write(nmi_rearmed, 0);
__this_cpu_write(last_timestamp, now);
return true;
}
#else
static inline bool watchdog_check_timestamp(void)
{
return true;
}
#endif
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES,
@@ -61,6 +117,9 @@ static void watchdog_overflow_callback(struct perf_event *event,
return;
}
if (!watchdog_check_timestamp())
return;
/* check for a hardlockup
* This is done by making sure our timer interrupt
* is incrementing. The timer interrupt should have