Merge branch 'locking/urgent' into locking/core, to pick up fixes
Signed-off-by: Ingo Molnar <mingo@kernel.org>

@@ -19,6 +19,7 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include <linux/file.h>
 #include <linux/kernel.h>
 #include <linux/audit.h>
 #include <linux/kthread.h>
@@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
 	unsigned long ino;
 	dev_t dev;
 
-	rcu_read_lock();
-	exe_file = rcu_dereference(tsk->mm->exe_file);
+	exe_file = get_task_exe_file(tsk);
+	if (!exe_file)
+		return 0;
 	ino = exe_file->f_inode->i_ino;
 	dev = exe_file->f_inode->i_sb->s_dev;
-	rcu_read_unlock();
+	fput(exe_file);
 	return audit_mark_compare(mark, ino, dev);
 }

@@ -26,11 +26,18 @@ struct bpf_htab {
 	struct bucket *buckets;
 	void *elems;
 	struct pcpu_freelist freelist;
+	void __percpu *extra_elems;
 	atomic_t count;	/* number of elements in this hashtable */
 	u32 n_buckets;	/* number of hash buckets */
 	u32 elem_size;	/* size of each element in bytes */
 };
 
+enum extra_elem_state {
+	HTAB_NOT_AN_EXTRA_ELEM = 0,
+	HTAB_EXTRA_ELEM_FREE,
+	HTAB_EXTRA_ELEM_USED
+};
+
 /* each htab element is struct htab_elem + key + value */
 struct htab_elem {
 	union {
@@ -38,7 +45,10 @@ struct htab_elem {
 		struct bpf_htab *htab;
 		struct pcpu_freelist_node fnode;
 	};
-	struct rcu_head rcu;
+	union {
+		struct rcu_head rcu;
+		enum extra_elem_state state;
+	};
 	u32 hash;
 	char key[0] __aligned(8);
 };
@@ -113,6 +123,23 @@ free_elems:
 	return err;
 }
 
+static int alloc_extra_elems(struct bpf_htab *htab)
+{
+	void __percpu *pptr;
+	int cpu;
+
+	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
+	if (!pptr)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
+			HTAB_EXTRA_ELEM_FREE;
+	}
+	htab->extra_elems = pptr;
+	return 0;
+}
+
 /* Called from syscall */
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 {
@@ -185,6 +212,8 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 	if (percpu)
 		cost += (u64) round_up(htab->map.value_size, 8) *
 			num_possible_cpus() * htab->map.max_entries;
+	else
+		cost += (u64) htab->elem_size * num_possible_cpus();
 
 	if (cost >= U32_MAX - PAGE_SIZE)
 		/* make sure page count doesn't overflow */
@@ -212,14 +241,22 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
 		raw_spin_lock_init(&htab->buckets[i].lock);
 	}
 
-	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
-		err = prealloc_elems_and_freelist(htab);
+	if (!percpu) {
+		err = alloc_extra_elems(htab);
 		if (err)
 			goto free_buckets;
 	}
 
+	if (!(attr->map_flags & BPF_F_NO_PREALLOC)) {
+		err = prealloc_elems_and_freelist(htab);
+		if (err)
+			goto free_extra_elems;
+	}
+
 	return &htab->map;
 
+free_extra_elems:
+	free_percpu(htab->extra_elems);
 free_buckets:
 	kvfree(htab->buckets);
 free_htab:
@@ -349,7 +386,6 @@ static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
 	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
 		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
 	kfree(l);
-
 }
 
 static void htab_elem_free_rcu(struct rcu_head *head)
@@ -370,6 +406,11 @@ static void htab_elem_free_rcu(struct rcu_head *head)
 
 static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 {
+	if (l->state == HTAB_EXTRA_ELEM_USED) {
+		l->state = HTAB_EXTRA_ELEM_FREE;
+		return;
+	}
+
 	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
 		pcpu_freelist_push(&htab->freelist, &l->fnode);
 	} else {
@@ -381,25 +422,44 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
 
 static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
-					 bool percpu, bool onallcpus)
+					 bool percpu, bool onallcpus,
+					 bool old_elem_exists)
 {
 	u32 size = htab->map.value_size;
 	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
 	struct htab_elem *l_new;
 	void __percpu *pptr;
+	int err = 0;
 
 	if (prealloc) {
 		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
 		if (!l_new)
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
 	} else {
 		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
 			atomic_dec(&htab->count);
-			return ERR_PTR(-E2BIG);
+			err = -E2BIG;
+		} else {
+			l_new = kmalloc(htab->elem_size,
+					GFP_ATOMIC | __GFP_NOWARN);
+			if (!l_new)
+				return ERR_PTR(-ENOMEM);
 		}
-		l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
-		if (!l_new)
-			return ERR_PTR(-ENOMEM);
+	}
+
+	if (err) {
+		if (!old_elem_exists)
+			return ERR_PTR(err);
+
+		/* if we're updating the existing element and the hash table
+		 * is full, use per-cpu extra elems
+		 */
+		l_new = this_cpu_ptr(htab->extra_elems);
+		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
+			return ERR_PTR(-E2BIG);
+		l_new->state = HTAB_EXTRA_ELEM_USED;
+	} else {
+		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
 	}
 
 	memcpy(l_new->key, key, key_size);
@@ -489,7 +549,8 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
 	if (ret)
 		goto err;
 
-	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
+	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
+				!!l_old);
 	if (IS_ERR(l_new)) {
 		/* all pre-allocated elements are in use or memory exhausted */
 		ret = PTR_ERR(l_new);
@@ -563,7 +624,7 @@ static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
 		}
 	} else {
 		l_new = alloc_htab_elem(htab, key, value, key_size,
-					hash, true, onallcpus);
+					hash, true, onallcpus, false);
 		if (IS_ERR(l_new)) {
 			ret = PTR_ERR(l_new);
 			goto err;
@@ -652,6 +713,7 @@ static void htab_map_free(struct bpf_map *map)
 		htab_free_elems(htab);
 		pcpu_freelist_destroy(&htab->freelist);
 	}
+	free_percpu(htab->extra_elems);
 	kvfree(htab->buckets);
 	kfree(htab);
 }

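The user-visible effect of the extra_elems scheme: overwriting an existing key in a full preallocated hash map succeeds again, while inserting a new key still fails with E2BIG. A minimal userspace sketch, assuming a libbpf-style bpf_map_update_elem() wrapper and a map_fd for a BPF_MAP_TYPE_HASH created with max_entries == 1:

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <linux/bpf.h>		/* BPF_ANY */

/* assumed libbpf-style wrapper around the BPF_MAP_UPDATE_ELEM syscall */
extern int bpf_map_update_elem(int fd, const void *key, const void *value,
			       unsigned long long flags);

void demo(int map_fd)
{
	int key = 1, other = 2, val = 42;

	/* first insert fills the single-entry map */
	bpf_map_update_elem(map_fd, &key, &val, BPF_ANY);

	/* the map is full, so a second key is still rejected */
	if (bpf_map_update_elem(map_fd, &other, &val, BPF_ANY) < 0)
		printf("new key rejected: %s\n", strerror(errno));

	/* overwriting the existing key now succeeds: the update is staged
	 * in this CPU's extra element instead of failing when the
	 * freelist is empty */
	val = 43;
	if (bpf_map_update_elem(map_fd, &key, &val, BPF_ANY) == 0)
		printf("in-place update ok\n");
}
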
@@ -194,6 +194,7 @@ struct verifier_env {
 	struct verifier_state_list **explored_states; /* search pruning optimization */
 	struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
 	u32 used_map_cnt;		/* number of used maps */
+	u32 id_gen;			/* used to generate unique reg IDs */
 	bool allow_ptr_leaks;
 };
 
@@ -1052,7 +1053,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 			goto error;
 		break;
 	case BPF_MAP_TYPE_CGROUP_ARRAY:
-		if (func_id != BPF_FUNC_skb_in_cgroup)
+		if (func_id != BPF_FUNC_skb_under_cgroup)
 			goto error;
 		break;
 	default:
@@ -1074,7 +1075,7 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id)
 		if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
 			goto error;
 		break;
-	case BPF_FUNC_skb_in_cgroup:
+	case BPF_FUNC_skb_under_cgroup:
 		if (map->map_type != BPF_MAP_TYPE_CGROUP_ARRAY)
 			goto error;
 		break;
@@ -1301,7 +1302,7 @@ add_imm:
 		/* dst_reg stays as pkt_ptr type and since some positive
 		 * integer value was added to the pointer, increment its 'id'
 		 */
-		dst_reg->id++;
+		dst_reg->id = ++env->id_gen;
 
 		/* something was added to pkt_ptr, set range and off to zero */
 		dst_reg->off = 0;

@@ -6276,6 +6276,12 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd)
 	if (cgroup_sk_alloc_disabled)
 		return;
 
+	/* Socket clone path */
+	if (skcd->val) {
+		cgroup_get(sock_cgroup_ptr(skcd));
+		return;
+	}
+
 	rcu_read_lock();
 
 	while (true) {

@@ -1,4 +1,12 @@
|
||||
# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set
|
||||
CONFIG_CC_OPTIMIZE_FOR_SIZE=y
|
||||
# CONFIG_KERNEL_GZIP is not set
|
||||
# CONFIG_KERNEL_BZIP2 is not set
|
||||
# CONFIG_KERNEL_LZMA is not set
|
||||
CONFIG_KERNEL_XZ=y
|
||||
# CONFIG_KERNEL_LZO is not set
|
||||
# CONFIG_KERNEL_LZ4 is not set
|
||||
CONFIG_OPTIMIZE_INLINING=y
|
||||
# CONFIG_SLAB is not set
|
||||
# CONFIG_SLUB is not set
|
||||
CONFIG_SLOB=y
|
||||
|
@@ -2069,6 +2069,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 	mutex_unlock(&cpuset_mutex);
 }
 
+/*
+ * Make sure the new task conform to the current state of its parent,
+ * which could have been changed by cpuset just after it inherits the
+ * state from the parent and before it sits on the cgroup's task list.
+ */
+void cpuset_fork(struct task_struct *task)
+{
+	if (task_css_is_root(task, cpuset_cgrp_id))
+		return;
+
+	set_cpus_allowed_ptr(task, &current->cpus_allowed);
+	task->mems_allowed = current->mems_allowed;
+}
+
 struct cgroup_subsys cpuset_cgrp_subsys = {
 	.css_alloc	= cpuset_css_alloc,
 	.css_online	= cpuset_css_online,
@@ -2079,6 +2093,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = {
 	.attach		= cpuset_attach,
 	.post_attach	= cpuset_post_attach,
 	.bind		= cpuset_bind,
+	.fork		= cpuset_fork,
 	.legacy_cftypes	= files,
 	.early_init	= true,
 };

@@ -242,18 +242,6 @@ unlock:
 	return ret;
 }
 
-static void event_function_local(struct perf_event *event, event_f func, void *data)
-{
-	struct event_function_struct efs = {
-		.event = event,
-		.func = func,
-		.data = data,
-	};
-
-	int ret = event_function(&efs);
-	WARN_ON_ONCE(ret);
-}
-
 static void event_function_call(struct perf_event *event, event_f func, void *data)
 {
 	struct perf_event_context *ctx = event->ctx;
@@ -303,6 +291,54 @@ again:
 	raw_spin_unlock_irq(&ctx->lock);
 }
 
+/*
+ * Similar to event_function_call() + event_function(), but hard assumes IRQs
+ * are already disabled and we're on the right CPU.
+ */
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+	struct perf_event_context *ctx = event->ctx;
+	struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+	struct task_struct *task = READ_ONCE(ctx->task);
+	struct perf_event_context *task_ctx = NULL;
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	if (task) {
+		if (task == TASK_TOMBSTONE)
+			return;
+
+		task_ctx = ctx;
+	}
+
+	perf_ctx_lock(cpuctx, task_ctx);
+
+	task = ctx->task;
+	if (task == TASK_TOMBSTONE)
+		goto unlock;
+
+	if (task) {
+		/*
+		 * We must be either inactive or active and the right task,
+		 * otherwise we're screwed, since we cannot IPI to somewhere
+		 * else.
+		 */
+		if (ctx->is_active) {
+			if (WARN_ON_ONCE(task != current))
+				goto unlock;
+
+			if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
+				goto unlock;
+		}
+	} else {
+		WARN_ON_ONCE(&cpuctx->ctx != ctx);
+	}
+
+	func(event, cpuctx, ctx, data);
+unlock:
+	perf_ctx_unlock(cpuctx, task_ctx);
+}
+
 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
		       PERF_FLAG_FD_OUTPUT  |\
		       PERF_FLAG_PID_CGROUP |\
@@ -843,6 +879,32 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 		}
 	}
 }
 
+/*
+ * Update cpuctx->cgrp so that it is set when first cgroup event is added and
+ * cleared when last cgroup event is removed.
+ */
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+			 struct perf_event_context *ctx, bool add)
+{
+	struct perf_cpu_context *cpuctx;
+
+	if (!is_cgroup_event(event))
+		return;
+
+	if (add && ctx->nr_cgroups++)
+		return;
+	else if (!add && --ctx->nr_cgroups)
+		return;
+	/*
+	 * Because cgroup events are always per-cpu events,
+	 * this will always be called from the right CPU.
+	 */
+	cpuctx = __get_cpu_context(ctx);
+	cpuctx->cgrp = add ? event->cgrp : NULL;
+}
+
 #else /* !CONFIG_CGROUP_PERF */
 
 static inline bool
@@ -920,6 +982,13 @@ perf_cgroup_mark_enabled(struct perf_event *event,
			 struct perf_event_context *ctx)
 {
 }
 
+static inline void
+list_update_cgroup_event(struct perf_event *event,
+			 struct perf_event_context *ctx, bool add)
+{
+}
+
 #endif
 
 /*
@@ -1392,6 +1461,7 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
 static void
 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 {
+
 	lockdep_assert_held(&ctx->lock);
 
 	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
@@ -1412,8 +1482,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 		list_add_tail(&event->group_entry, list);
 	}
 
-	if (is_cgroup_event(event))
-		ctx->nr_cgroups++;
+	list_update_cgroup_event(event, ctx, true);
 
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
@@ -1581,8 +1650,6 @@ static void perf_group_attach(struct perf_event *event)
 static void
 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 {
-	struct perf_cpu_context *cpuctx;
-
 	WARN_ON_ONCE(event->ctx != ctx);
 	lockdep_assert_held(&ctx->lock);
 
@@ -1594,20 +1661,7 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
-	if (is_cgroup_event(event)) {
-		ctx->nr_cgroups--;
-		/*
-		 * Because cgroup events are always per-cpu events, this will
-		 * always be called from the right CPU.
-		 */
-		cpuctx = __get_cpu_context(ctx);
-		/*
-		 * If there are no more cgroup events then clear cgrp to avoid
-		 * stale pointer in update_cgrp_time_from_cpuctx().
-		 */
-		if (!ctx->nr_cgroups)
-			cpuctx->cgrp = NULL;
-	}
+	list_update_cgroup_event(event, ctx, false);
 
 	ctx->nr_events--;
 	if (event->attr.inherit_stat)
@@ -1716,8 +1770,8 @@ static inline int pmu_filter_match(struct perf_event *event)
 static inline int
 event_filter_match(struct perf_event *event)
 {
-	return (event->cpu == -1 || event->cpu == smp_processor_id())
-	    && perf_cgroup_match(event) && pmu_filter_match(event);
+	return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
+	       perf_cgroup_match(event) && pmu_filter_match(event);
 }
 
 static void
@@ -1737,8 +1791,8 @@ event_sched_out(struct perf_event *event,
	 * maintained, otherwise bogus information is return
	 * via read() for time_enabled, time_running:
	 */
-	if (event->state == PERF_EVENT_STATE_INACTIVE
-	    && !event_filter_match(event)) {
+	if (event->state == PERF_EVENT_STATE_INACTIVE &&
+	    !event_filter_match(event)) {
 		delta = tstamp - event->tstamp_stopped;
 		event->tstamp_running += delta;
 		event->tstamp_stopped = tstamp;
@@ -2236,10 +2290,15 @@ perf_install_in_context(struct perf_event_context *ctx,
 
 	lockdep_assert_held(&ctx->mutex);
 
-	event->ctx = ctx;
 	if (event->cpu != -1)
 		event->cpu = cpu;
 
+	/*
+	 * Ensures that if we can observe event->ctx, both the event and ctx
+	 * will be 'complete'. See perf_iterate_sb_cpu().
+	 */
+	smp_store_release(&event->ctx, ctx);
+
 	if (!task) {
 		cpu_function_call(cpu, __perf_install_in_context, event);
 		return;
@@ -2437,11 +2496,11 @@ static int __perf_event_stop(void *info)
 	return 0;
 }
 
-static int perf_event_restart(struct perf_event *event)
+static int perf_event_stop(struct perf_event *event, int restart)
 {
 	struct stop_event_data sd = {
 		.event		= event,
-		.restart	= 1,
+		.restart	= restart,
 	};
 	int ret = 0;
 
@@ -3490,8 +3549,17 @@ static int perf_event_read(struct perf_event *event, bool group)
			.group = group,
			.ret = 0,
 		};
-		smp_call_function_single(event->oncpu,
-					 __perf_event_read, &data, 1);
+		/*
+		 * Purposely ignore the smp_call_function_single() return
+		 * value.
+		 *
+		 * If event->oncpu isn't a valid CPU it means the event got
+		 * scheduled out and that will have updated the event count.
+		 *
+		 * Therefore, either way, we'll have an up-to-date event count
+		 * after this.
+		 */
+		(void)smp_call_function_single(event->oncpu, __perf_event_read, &data, 1);
 		ret = data.ret;
 	} else if (event->state == PERF_EVENT_STATE_INACTIVE) {
 		struct perf_event_context *ctx = event->ctx;
@@ -4777,6 +4845,19 @@ static void ring_buffer_attach(struct perf_event *event,
 		spin_unlock_irqrestore(&rb->event_lock, flags);
 	}
 
+	/*
+	 * Avoid racing with perf_mmap_close(AUX): stop the event
+	 * before swizzling the event::rb pointer; if it's getting
+	 * unmapped, its aux_mmap_count will be 0 and it won't
+	 * restart. See the comment in __perf_pmu_output_stop().
+	 *
+	 * Data will inevitably be lost when set_output is done in
+	 * mid-air, but then again, whoever does it like this is
+	 * not in for the data anyway.
+	 */
+	if (has_aux(event))
+		perf_event_stop(event, 0);
+
 	rcu_assign_pointer(event->rb, rb);
 
 	if (old_rb) {
@@ -5969,6 +6050,14 @@ static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
 	struct perf_event *event;
 
 	list_for_each_entry_rcu(event, &pel->list, sb_list) {
+		/*
+		 * Skip events that are not fully formed yet; ensure that
+		 * if we observe event->ctx, both event and ctx will be
+		 * complete enough. See perf_install_in_context().
+		 */
+		if (!smp_load_acquire(&event->ctx))
+			continue;
+
 		if (event->state < PERF_EVENT_STATE_INACTIVE)
 			continue;
 		if (!event_filter_match(event))
@@ -6044,7 +6133,7 @@ static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
 	raw_spin_unlock_irqrestore(&ifh->lock, flags);
 
 	if (restart)
-		perf_event_restart(event);
+		perf_event_stop(event, 1);
 }
 
 void perf_event_exec(void)
@@ -6088,7 +6177,13 @@ static void __perf_event_output_stop(struct perf_event *event, void *data)
 
 	/*
	 * In case of inheritance, it will be the parent that links to the
-	 * ring-buffer, but it will be the child that's actually using it:
+	 * ring-buffer, but it will be the child that's actually using it.
+	 *
+	 * We are using event::rb to determine if the event should be stopped,
+	 * however this may race with ring_buffer_attach() (through set_output),
+	 * which will make us skip the event that actually needs to be stopped.
+	 * So ring_buffer_attach() has to stop an aux event before re-assigning
+	 * its rb pointer.
	 */
 	if (rcu_dereference(parent->rb) == rb)
 		ro->err = __perf_event_stop(&sd);
@@ -6098,7 +6193,7 @@ static int __perf_pmu_output_stop(void *info)
 {
 	struct perf_event *event = info;
 	struct pmu *pmu = event->pmu;
-	struct perf_cpu_context *cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
+	struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
 	struct remote_output ro = {
 		.rb	= event->rb,
 	};
@@ -6552,15 +6647,6 @@ got_name:
 	kfree(buf);
 }
 
-/*
- * Whether this @filter depends on a dynamic object which is not loaded
- * yet or its load addresses are not known.
- */
-static bool perf_addr_filter_needs_mmap(struct perf_addr_filter *filter)
-{
-	return filter->filter && filter->inode;
-}
-
 /*
  * Check whether inode and address range match filter criteria.
  */
@@ -6611,7 +6697,7 @@ static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
 	raw_spin_unlock_irqrestore(&ifh->lock, flags);
 
 	if (restart)
-		perf_event_restart(event);
+		perf_event_stop(event, 1);
 }
 
 /*
@@ -6622,6 +6708,13 @@ static void perf_addr_filters_adjust(struct vm_area_struct *vma)
 	struct perf_event_context *ctx;
 	int ctxn;
 
+	/*
+	 * Data tracing isn't supported yet and as such there is no need
+	 * to keep track of anything that isn't related to executable code:
+	 */
+	if (!(vma->vm_flags & VM_EXEC))
+		return;
+
 	rcu_read_lock();
 	for_each_task_context_nr(ctxn) {
 		ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
@@ -7774,7 +7867,11 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
 	list_for_each_entry(filter, &ifh->list, entry) {
 		event->addr_filters_offs[count] = 0;
 
-		if (perf_addr_filter_needs_mmap(filter))
+		/*
+		 * Adjust base offset if the filter is associated to a binary
+		 * that needs to be mapped:
+		 */
+		if (filter->inode)
			event->addr_filters_offs[count] =
				perf_addr_filter_apply(filter, mm);
 
@@ -7789,7 +7886,7 @@ static void perf_event_addr_filters_apply(struct perf_event *event)
 	mmput(mm);
 
 restart:
-	perf_event_restart(event);
+	perf_event_stop(event, 1);
 }
 
 /*
@@ -7905,8 +8002,10 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
				goto fail;
			}
 
-			if (token == IF_SRC_FILE) {
-				filename = match_strdup(&args[2]);
+			if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
+				int fpos = filter->range ? 2 : 1;
+
+				filename = match_strdup(&args[fpos]);
				if (!filename) {
					ret = -ENOMEM;
					goto fail;

@@ -330,15 +330,22 @@ void *perf_aux_output_begin(struct perf_output_handle *handle,
 	if (!rb)
 		return NULL;
 
-	if (!rb_has_aux(rb) || !atomic_inc_not_zero(&rb->aux_refcount))
+	if (!rb_has_aux(rb))
 		goto err;
 
 	/*
-	 * If rb::aux_mmap_count is zero (and rb_has_aux() above went through),
-	 * the aux buffer is in perf_mmap_close(), about to get freed.
+	 * If aux_mmap_count is zero, the aux buffer is in perf_mmap_close(),
+	 * about to get freed, so we leave immediately.
+	 *
+	 * Checking rb::aux_mmap_count and rb::refcount has to be done in
+	 * the same order, see perf_mmap_close. Otherwise we end up freeing
+	 * aux pages in this path, which is a bug, because in_atomic().
	 */
 	if (!atomic_read(&rb->aux_mmap_count))
-		goto err_put;
+		goto err;
+
+	if (!atomic_inc_not_zero(&rb->aux_refcount))
+		goto err;
 
 	/*
	 * Nesting is not supported for AUX area, make sure nested

@@ -172,8 +172,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	err = -EAGAIN;
 	ptep = page_check_address(page, mm, addr, &ptl, 0);
-	if (!ptep)
+	if (!ptep) {
+		mem_cgroup_cancel_charge(kpage, memcg, false);
 		goto unlock;
+	}
 
 	get_page(kpage);
 	page_add_new_anon_rmap(kpage, vma, addr, false);
@@ -200,7 +202,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	err = 0;
  unlock:
-	mem_cgroup_cancel_charge(kpage, memcg, false);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	unlock_page(page);
 	return err;

@@ -848,12 +848,7 @@ void do_exit(long code)
 	TASKS_RCU(preempt_enable());
 	exit_notify(tsk, group_dead);
 	proc_exit_connector(tsk);
-#ifdef CONFIG_NUMA
-	task_lock(tsk);
-	mpol_put(tsk->mempolicy);
-	tsk->mempolicy = NULL;
-	task_unlock(tsk);
-#endif
+	mpol_put_task_policy(tsk);
 #ifdef CONFIG_FUTEX
 	if (unlikely(current->pi_state_cache))
 		kfree(current->pi_state_cache);

@@ -798,6 +798,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm)
 }
 EXPORT_SYMBOL(get_mm_exe_file);
 
+/**
+ * get_task_exe_file - acquire a reference to the task's executable file
+ *
+ * Returns %NULL if task's mm (if any) has no associated executable file or
+ * this is a kernel thread with borrowed mm (see the comment above get_task_mm).
+ * User must release file via fput().
+ */
+struct file *get_task_exe_file(struct task_struct *task)
+{
+	struct file *exe_file = NULL;
+	struct mm_struct *mm;
+
+	task_lock(task);
+	mm = task->mm;
+	if (mm) {
+		if (!(task->flags & PF_KTHREAD))
+			exe_file = get_mm_exe_file(mm);
+	}
+	task_unlock(task);
+	return exe_file;
+}
+EXPORT_SYMBOL(get_task_exe_file);
+
 /**
  * get_task_mm - acquire a reference to the task's mm
  *
@@ -913,14 +936,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 	deactivate_mm(tsk, mm);
 
 	/*
-	 * If we're exiting normally, clear a user-space tid field if
-	 * requested.  We leave this alone when dying by signal, to leave
-	 * the value intact in a core dump, and to save the unnecessary
-	 * trouble, say, a killed vfork parent shouldn't touch this mm.
-	 * Userland only wants this done for a sys_exit.
+	 * Signal userspace if we're not exiting with a core dump
+	 * because we want to leave the value intact for debugging
+	 * purposes.
	 */
 	if (tsk->clear_child_tid) {
-		if (!(tsk->flags & PF_SIGNALED) &&
+		if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
		    atomic_read(&mm->mm_users) > 1) {
			/*
			 * We don't check the error code - if userspace has
@@ -1404,7 +1425,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	p->real_start_time = ktime_get_boot_ns();
 	p->io_context = NULL;
 	p->audit_context = NULL;
-	threadgroup_change_begin(current);
 	cgroup_fork(p);
 #ifdef CONFIG_NUMA
 	p->mempolicy = mpol_dup(p->mempolicy);
@@ -1556,6 +1576,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 	INIT_LIST_HEAD(&p->thread_group);
 	p->task_works = NULL;
 
+	threadgroup_change_begin(current);
 	/*
	 * Ensure that the cgroup subsystem policies allow the new process to be
	 * forked. It should be noted the the new process's css_set can be changed
@@ -1656,6 +1677,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 bad_fork_cancel_cgroup:
 	cgroup_cancel_fork(p);
 bad_fork_free_pid:
+	threadgroup_change_end(current);
 	if (pid != &init_struct_pid)
 		free_pid(pid);
 bad_fork_cleanup_thread:
@@ -1688,7 +1710,6 @@ bad_fork_cleanup_policy:
 	mpol_put(p->mempolicy);
 bad_fork_cleanup_threadgroup_lock:
 #endif
-	threadgroup_change_end(current);
 	delayacct_tsk_free(p);
 bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);

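The audit change at the top of this merge is the first user of this helper; the calling convention it establishes is reference-counted rather than RCU-based. A minimal sketch of a caller:

/* illustrative only: return the executable's inode number for a task,
 * or 0 if it has no mm or no associated executable file */
static unsigned long task_exe_ino(struct task_struct *tsk)
{
	struct file *exe_file = get_task_exe_file(tsk);
	unsigned long ino = 0;

	if (exe_file) {
		ino = exe_file->f_inode->i_ino;
		fput(exe_file);		/* drop the reference the helper took */
	}
	return ino;
}
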
@@ -39,6 +39,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 		return NULL;
 	}
 
+	get_online_cpus();
 	if (max_vecs >= num_online_cpus()) {
 		cpumask_copy(affinity_mask, cpu_online_mask);
 		*nr_vecs = num_online_cpus();
@@ -56,6 +57,7 @@ struct cpumask *irq_create_affinity_mask(unsigned int *nr_vecs)
 		}
 		*nr_vecs = vecs;
 	}
+	put_online_cpus();
 
 	return affinity_mask;
 }

@@ -820,6 +820,17 @@ __irq_do_set_handler(struct irq_desc *desc, irq_flow_handler_t handle,
 	desc->name = name;
 
 	if (handle != handle_bad_irq && is_chained) {
+		/*
+		 * We're about to start this interrupt immediately,
+		 * hence the need to set the trigger configuration.
+		 * But the .set_type callback may have overridden the
+		 * flow handler, ignoring that we're dealing with a
+		 * chained interrupt. Reset it immediately because we
+		 * do know better.
+		 */
+		__irq_set_trigger(desc, irqd_get_trigger_type(&desc->irq_data));
+		desc->handle_irq = handle;
+
 		irq_settings_set_noprobe(desc);
 		irq_settings_set_norequest(desc);
 		irq_settings_set_nothread(desc);

@@ -1681,8 +1681,10 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
 	action->dev_id = dev_id;
 
 	retval = irq_chip_pm_get(&desc->irq_data);
-	if (retval < 0)
+	if (retval < 0) {
+		kfree(action);
 		return retval;
+	}
 
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);
@@ -1985,8 +1987,10 @@ int request_percpu_irq(unsigned int irq, irq_handler_t handler,
 	action->percpu_dev_id = dev_id;
 
 	retval = irq_chip_pm_get(&desc->irq_data);
-	if (retval < 0)
+	if (retval < 0) {
+		kfree(action);
 		return retval;
+	}
 
 	chip_bus_lock(desc);
 	retval = __setup_irq(irq, desc, action);

@@ -359,6 +359,17 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
 		else
			dev_dbg(dev, "irq [%d-%d] for MSI\n",
				virq, virq + desc->nvec_used - 1);
+		/*
+		 * This flag is set by the PCI layer as we need to activate
+		 * the MSI entries before the PCI layer enables MSI in the
+		 * card. Otherwise the card latches a random msi message.
+		 */
+		if (info->flags & MSI_FLAG_ACTIVATE_EARLY) {
+			struct irq_data *irq_data;
+
+			irq_data = irq_domain_get_irq_data(domain, desc->irq);
+			irq_domain_activate_irq(irq_data);
+		}
 	}
 
 	return 0;

@@ -887,7 +887,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min,
 	return 0;
 out:
 	vfree(pi->sechdrs);
+	pi->sechdrs = NULL;
+
 	vfree(pi->purgatory_buf);
+	pi->purgatory_buf = NULL;
 	return ret;
 }

@@ -247,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
 	align_start = res->start & ~(SECTION_SIZE - 1);
 	align_size = ALIGN(resource_size(res), SECTION_SIZE);
 	arch_remove_memory(align_start, align_size);
+	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
 	pgmap_radix_release(res);
 	dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
			"%s: failed to free all reserved pages\n", __func__);
@@ -282,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 		struct percpu_ref *ref, struct vmem_altmap *altmap)
 {
 	resource_size_t key, align_start, align_size, align_end;
+	pgprot_t pgprot = PAGE_KERNEL;
 	struct dev_pagemap *pgmap;
 	struct page_map *page_map;
 	int error, nid, is_ram;
@@ -351,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 	if (nid < 0)
 		nid = numa_mem_id();
 
+	error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0,
+			align_size);
+	if (error)
+		goto err_pfn_remap;
+
 	error = arch_add_memory(nid, align_start, align_size, true);
 	if (error)
 		goto err_add_memory;
@@ -371,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
 	return __va(res->start);
 
  err_add_memory:
+	untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
+ err_pfn_remap:
  err_radix:
 	pgmap_radix_release(res);
 	devres_free(page_map);

@@ -300,12 +300,12 @@ static int create_image(int platform_mode)
 	save_processor_state();
 	trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
 	error = swsusp_arch_suspend();
+	/* Restore control flow magically appears here */
+	restore_processor_state();
 	trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
 	if (error)
 		printk(KERN_ERR "PM: Error %d creating hibernation image\n",
			error);
-	/* Restore control flow magically appears here */
-	restore_processor_state();
 	if (!in_suspend)
 		events_check_enabled = false;

@@ -482,7 +482,16 @@ void pm_qos_update_request(struct pm_qos_request *req,
 		return;
 	}
 
-	cancel_delayed_work_sync(&req->work);
+	/*
+	 * This function may be called very early during boot, for example,
+	 * from of_clk_init(), where irq needs to stay disabled.
+	 * cancel_delayed_work_sync() assumes that irq is enabled on
+	 * invocation and re-enables it on return.  Avoid calling it until
+	 * workqueue is initialized.
+	 */
+	if (keventd_up())
+		cancel_delayed_work_sync(&req->work);
+
 	__pm_qos_update_request(req, new_value);
 }
 EXPORT_SYMBOL_GPL(pm_qos_update_request);

@@ -835,9 +835,9 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
  */
 static bool rtree_next_node(struct memory_bitmap *bm)
 {
-	bm->cur.node = list_entry(bm->cur.node->list.next,
-				  struct rtree_node, list);
-	if (&bm->cur.node->list != &bm->cur.zone->leaves) {
+	if (!list_is_last(&bm->cur.node->list, &bm->cur.zone->leaves)) {
+		bm->cur.node = list_entry(bm->cur.node->list.next,
+					  struct rtree_node, list);
 		bm->cur.node_pfn += BM_BITS_PER_BLOCK;
 		bm->cur.node_bit = 0;
 		touch_softlockup_watchdog();
@@ -845,9 +845,9 @@ static bool rtree_next_node(struct memory_bitmap *bm)
 	}
 
 	/* No more nodes, goto next zone */
-	bm->cur.zone = list_entry(bm->cur.zone->list.next,
+	if (!list_is_last(&bm->cur.zone->list, &bm->zones)) {
+		bm->cur.zone = list_entry(bm->cur.zone->list.next,
				  struct mem_zone_bm_rtree, list);
-	if (&bm->cur.zone->list != &bm->zones) {
 		bm->cur.node = list_entry(bm->cur.zone->leaves.next,
					  struct rtree_node, list);
 		bm->cur.node_pfn = 0;

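The pattern being fixed, in isolation: advancing a list cursor before checking for the end clobbers the cursor with a pointer computed from the list head, which is not a real node. A sketch with hypothetical names (next_leaf):

/* Buggy shape: 'node' is overwritten even when it was the last leaf,
 * so later code runs with a bogus cursor:
 *
 *	node = list_entry(node->list.next, struct rtree_node, list);
 *	if (&node->list != &zone->leaves)
 *		return node;
 *
 * Fixed shape: test first with list_is_last(), advance only when a
 * successor actually exists. */
static struct rtree_node *next_leaf(struct mem_zone_bm_rtree *zone,
				    struct rtree_node *node)
{
	if (!list_is_last(&node->list, &zone->leaves))
		return list_entry(node->list.next, struct rtree_node, list);
	return NULL;
}
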
@@ -9,10 +9,10 @@
 
 char *_braille_console_setup(char **str, char **brl_options)
 {
-	if (!memcmp(*str, "brl,", 4)) {
+	if (!strncmp(*str, "brl,", 4)) {
 		*brl_options = "";
 		*str += 4;
-	} else if (!memcmp(str, "brl=", 4)) {
+	} else if (!strncmp(*str, "brl=", 4)) {
 		*brl_options = *str + 4;
 		*str = strchr(*brl_options, ',');
 		if (!*str)

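Two bugs in one line are fixed here: the second comparison passed str (a char **) instead of *str, and memcmp() always compares the full four bytes, reading past the terminator of a shorter string, while strncmp() stops at the first NUL. A sketch:

#include <string.h>

int looks_like_brl_option(const char *s)
{
	/* for s == "b", memcmp(s, "brl=", 4) would read three bytes past
	 * the NUL; strncmp() stops comparing once either string ends */
	return !strncmp(s, "brl=", 4);
}
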
@@ -99,26 +99,32 @@ again:
 	return add;
 }
 
-/*
- * printk one line from the temporary buffer from @start index until
- * and including the @end index.
- */
-static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end)
+static void printk_nmi_flush_line(const char *text, int len)
 {
-	const char *buf = s->buffer + start;
-
 	/*
	 * The buffers are flushed in NMI only on panic.  The messages must
	 * go only into the ring buffer at this stage.  Consoles will get
	 * explicitly called later when a crashdump is not generated.
	 */
 	if (in_nmi())
-		printk_deferred("%.*s", (end - start) + 1, buf);
+		printk_deferred("%.*s", len, text);
 	else
-		printk("%.*s", (end - start) + 1, buf);
+		printk("%.*s", len, text);
 
 }
 
+/*
+ * printk one line from the temporary buffer from @start index until
+ * and including the @end index.
+ */
+static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s,
+					int start, int end)
+{
+	const char *buf = s->buffer + start;
+
+	printk_nmi_flush_line(buf, (end - start) + 1);
+}
+
 /*
  * Flush data from the associated per_CPU buffer. The function
  * can be called either via IRQ work or independently.
@@ -150,9 +156,11 @@ more:
	 * the buffer an unexpected way. If we printed something then
	 * @len must only increase.
	 */
-	if (i && i >= len)
-		pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n",
-		       i, len);
+	if (i && i >= len) {
+		const char *msg = "printk_nmi_flush: internal error\n";
+
+		printk_nmi_flush_line(msg, strlen(msg));
+	}
 
 	if (!len)
 		goto out; /* Someone else has already flushed the buffer. */
@@ -166,14 +174,14 @@ more:
 	/* Print line by line. */
 	for (; i < size; i++) {
 		if (s->buffer[i] == '\n') {
-			print_nmi_seq_line(s, last_i, i);
+			printk_nmi_flush_seq_line(s, last_i, i);
 			last_i = i + 1;
 		}
 	}
 	/* Check if there was a partial line. */
 	if (last_i < size) {
-		print_nmi_seq_line(s, last_i, size - 1);
-		pr_cont("\n");
+		printk_nmi_flush_seq_line(s, last_i, size - 1);
+		printk_nmi_flush_line("\n", strlen("\n"));
 	}
 
 	/*

@@ -74,6 +74,7 @@
 #include <linux/context_tracking.h>
 #include <linux/compiler.h>
 #include <linux/frame.h>
+#include <linux/prefetch.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -2015,6 +2016,28 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	success = 1; /* we're going to change ->state */
 	cpu = task_cpu(p);
 
+	/*
+	 * Ensure we load p->on_rq _after_ p->state, otherwise it would
+	 * be possible to, falsely, observe p->on_rq == 0 and get stuck
+	 * in smp_cond_load_acquire() below.
+	 *
+	 * sched_ttwu_pending()			try_to_wake_up()
+	 *   [S] p->on_rq = 1;			[L] P->state
+	 *       UNLOCK rq->lock  -----.
+	 *                              \
+	 *                               +---   RMB
+	 * schedule()                   /
+	 *       LOCK rq->lock    -----'
+	 *       UNLOCK rq->lock
+	 *
+	 * [task p]
+	 *   [S] p->state = UNINTERRUPTIBLE	[L] p->on_rq
+	 *
+	 * Pairs with the UNLOCK+LOCK on rq->lock from the
+	 * last wakeup of our task and the schedule that got our task
+	 * current.
+	 */
+	smp_rmb();
 	if (p->on_rq && ttwu_remote(p, wake_flags))
 		goto stat;
 
@@ -2971,6 +2994,23 @@ DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
 EXPORT_PER_CPU_SYMBOL(kstat);
 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
 
+/*
+ * The function fair_sched_class.update_curr accesses the struct curr
+ * and its field curr->exec_start; when called from task_sched_runtime(),
+ * we observe a high rate of cache misses in practice.
+ * Prefetching this data results in improved performance.
+ */
+static inline void prefetch_curr_exec_start(struct task_struct *p)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	struct sched_entity *curr = (&p->se)->cfs_rq->curr;
+#else
+	struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
+#endif
+	prefetch(curr);
+	prefetch(&curr->exec_start);
+}
+
 /*
  * Return accounted runtime for the task.
  * In case the task is currently running, return the runtime plus current's
@@ -3005,6 +3045,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
	 * thread, breaking clock_gettime().
	 */
 	if (task_current(rq, p) && task_on_rq_queued(p)) {
+		prefetch_curr_exec_start(p);
 		update_rq_clock(rq);
 		p->sched_class->update_curr(rq);
 	}

@@ -168,7 +168,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid)
 
 	if (old_idx == IDX_INVALID) {
 		cp->size++;
-		cp->elements[cp->size - 1].dl = 0;
+		cp->elements[cp->size - 1].dl = dl;
 		cp->elements[cp->size - 1].cpu = cpu;
 		cp->elements[cpu].idx = cp->size - 1;
 		cpudl_change_key(cp, cp->size - 1, dl);

@@ -263,6 +263,11 @@ void account_idle_time(cputime_t cputime)
 	cpustat[CPUTIME_IDLE] += (__force u64) cputime;
 }
 
+/*
+ * When a guest is interrupted for a longer amount of time, missed clock
+ * ticks are not redelivered later. Due to that, this function may on
+ * occasion account more time than the calling functions think elapsed.
+ */
 static __always_inline cputime_t steal_account_process_time(cputime_t maxtime)
 {
 #ifdef CONFIG_PARAVIRT
@@ -371,7 +376,7 @@ static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
	 * idle, or potentially user or system time. Due to rounding,
	 * other time can exceed ticks occasionally.
	 */
-	other = account_other_time(cputime);
+	other = account_other_time(ULONG_MAX);
 	if (other >= cputime)
 		return;
 	cputime -= other;
@@ -486,7 +491,7 @@ void account_process_tick(struct task_struct *p, int user_tick)
 	}
 
 	cputime = cputime_one_jiffy;
-	steal = steal_account_process_time(cputime);
+	steal = steal_account_process_time(ULONG_MAX);
 
 	if (steal >= cputime)
 		return;
@@ -508,13 +513,21 @@ void account_process_tick(struct task_struct *p, int user_tick)
  */
 void account_idle_ticks(unsigned long ticks)
 {
+	cputime_t cputime, steal;
 
 	if (sched_clock_irqtime) {
 		irqtime_account_idle_ticks(ticks);
 		return;
 	}
 
-	account_idle_time(jiffies_to_cputime(ticks));
+	cputime = jiffies_to_cputime(ticks);
+	steal = steal_account_process_time(ULONG_MAX);
+
+	if (steal >= cputime)
+		return;
+
+	cputime -= steal;
+	account_idle_time(cputime);
 }
 
 /*
@@ -606,19 +619,25 @@ static void cputime_adjust(struct task_cputime *curr,
 	stime = curr->stime;
 	utime = curr->utime;
 
-	if (utime == 0) {
-		stime = rtime;
+	/*
+	 * If either stime or both stime and utime are 0, assume all runtime is
+	 * userspace. Once a task gets some ticks, the monotonicy code at
+	 * 'update' will ensure things converge to the observed ratio.
+	 */
+	if (stime == 0) {
+		utime = rtime;
 		goto update;
 	}
 
-	if (stime == 0) {
-		utime = rtime;
+	if (utime == 0) {
+		stime = rtime;
 		goto update;
 	}
 
 	stime = scale_stime((__force u64)stime, (__force u64)rtime,
			    (__force u64)(stime + utime));
 
+update:
 	/*
	 * Make sure stime doesn't go backwards; this preserves monotonicity
	 * for utime because rtime is monotonic.
@@ -641,7 +660,6 @@ static void cputime_adjust(struct task_cputime *curr,
 		stime = rtime - utime;
 	}
 
-update:
 	prev->stime = stime;
 	prev->utime = utime;
 out:
@@ -686,6 +704,13 @@ static cputime_t get_vtime_delta(struct task_struct *tsk)
 	unsigned long now = READ_ONCE(jiffies);
 	cputime_t delta, other;
 
+	/*
+	 * Unlike tick based timing, vtime based timing never has lost
+	 * ticks, and no need for steal time accounting to make up for
+	 * lost ticks. Vtime accounts a rounded version of actual
+	 * elapsed time. Limit account_other_time to prevent rounding
+	 * errors from causing elapsed vtime to go negative.
+	 */
 	delta = jiffies_to_cputime(now - tsk->vtime_snap);
 	other = account_other_time(delta);
 	WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE);

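For reference, the adjustment both reordered branches feed into splits the measured runtime rtime in the sampled stime:utime ratio. A sketch in plain 64-bit arithmetic (the kernel's scale_stime() is an overflow-safe version of the same division):

static u64 adjust_stime(u64 stime, u64 utime, u64 rtime)
{
	if (stime == 0)			/* checked first after this change */
		return 0;		/* all runtime attributed to userspace */
	if (utime == 0)
		return rtime;		/* all runtime attributed to the kernel */
	/* may overflow for very large inputs; scale_stime() avoids that */
	return rtime * stime / (stime + utime);
}
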
@@ -658,8 +658,11 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
	 *
	 * XXX figure out if select_task_rq_dl() deals with offline cpus.
	 */
-	if (unlikely(!rq->online))
+	if (unlikely(!rq->online)) {
+		lockdep_unpin_lock(&rq->lock, rf.cookie);
 		rq = dl_task_offline_migration(rq, p);
+		rf.cookie = lockdep_pin_lock(&rq->lock);
+	}
 
 	/*
	 * Queueing this task back might have overloaded rq, check if we need

@@ -4269,7 +4269,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
 	pcfs_rq = tg->parent->cfs_rq[cpu];
 
 	cfs_rq->throttle_count = pcfs_rq->throttle_count;
-	pcfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+	cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
 }
 
 /* conditionally throttle active cfs_rq's from put_prev_entity() */

@@ -605,12 +605,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
 		ptrace_event(PTRACE_EVENT_SECCOMP, data);
 		/*
		 * The delivery of a fatal signal during event
-		 * notification may silently skip tracer notification.
-		 * Terminating the task now avoids executing a system
-		 * call that may not be intended.
+		 * notification may silently skip tracer notification,
+		 * which could leave us with a potentially unmodified
+		 * syscall that the tracer would have liked to have
+		 * changed. Since the process is about to die, we just
+		 * force the syscall to be skipped and let the signal
+		 * kill the process and correctly handle any tracer exit
+		 * notifications.
		 */
 		if (fatal_signal_pending(current))
-			do_exit(SIGSYS);
+			goto skip;
 		/* Check if the tracer forced the syscall to be skipped. */
 		this_syscall = syscall_get_nr(current, task_pt_regs(current));
 		if (this_syscall < 0)

@@ -2140,6 +2140,21 @@ static int do_proc_dointvec_conv(bool *negp, unsigned long *lvalp,
 	return 0;
 }
 
+static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
+				 int *valp,
+				 int write, void *data)
+{
+	if (write) {
+		if (*negp)
+			return -EINVAL;
+		*valp = *lvalp;
+	} else {
+		unsigned int val = *valp;
+		*lvalp = (unsigned long)val;
+	}
+	return 0;
+}
+
 static const char proc_wspace_sep[] = { ' ', '\t', '\n' };
 
 static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
@@ -2259,8 +2274,27 @@ static int do_proc_dointvec(struct ctl_table *table, int write,
 int proc_dointvec(struct ctl_table *table, int write,
		  void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-    return do_proc_dointvec(table,write,buffer,lenp,ppos,
-			    NULL,NULL);
+	return do_proc_dointvec(table, write, buffer, lenp, ppos, NULL, NULL);
+}
+
+/**
+ * proc_douintvec - read a vector of unsigned integers
+ * @table: the sysctl table
+ * @write: %TRUE if this is a write to the sysctl file
+ * @buffer: the user buffer
+ * @lenp: the size of the user buffer
+ * @ppos: file position
+ *
+ * Reads/writes up to table->maxlen/sizeof(unsigned int) unsigned integer
+ * values from/to the user buffer, treated as an ASCII string.
+ *
+ * Returns 0 on success.
+ */
+int proc_douintvec(struct ctl_table *table, int write,
+		   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return do_proc_dointvec(table, write, buffer, lenp, ppos,
+				do_proc_douintvec_conv, NULL);
 }
 
 /*
@@ -2858,6 +2892,12 @@ int proc_dointvec(struct ctl_table *table, int write,
 	return -ENOSYS;
 }
 
+int proc_douintvec(struct ctl_table *table, int write,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	return -ENOSYS;
+}
+
 int proc_dointvec_minmax(struct ctl_table *table, int write,
		void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -2903,6 +2943,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
  * exception granted :-)
  */
 EXPORT_SYMBOL(proc_dointvec);
+EXPORT_SYMBOL(proc_douintvec);
 EXPORT_SYMBOL(proc_dointvec_jiffies);
 EXPORT_SYMBOL(proc_dointvec_minmax);
 EXPORT_SYMBOL(proc_dointvec_userhz_jiffies);

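A sketch of how the new handler is wired into a table, with a hypothetical tunable; unlike proc_dointvec, writing a negative value is rejected with -EINVAL instead of being stored as a large unsigned number:

static unsigned int example_threshold;	/* hypothetical variable */

static struct ctl_table example_table[] = {
	{
		.procname	= "example_threshold",
		.data		= &example_threshold,
		.maxlen		= sizeof(unsigned int),
		.mode		= 0644,
		.proc_handler	= proc_douintvec,
	},
	{ }
};
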
@@ -908,10 +908,11 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts)
 	ktime_t now, expires;
 	int cpu = smp_processor_id();
 
+	now = tick_nohz_start_idle(ts);
+
 	if (can_stop_idle_tick(cpu, ts)) {
 		int was_stopped = ts->tick_stopped;
 
-		now = tick_nohz_start_idle(ts);
 		ts->idle_calls++;
 
 		expires = tick_nohz_stop_sched_tick(ts, now, cpu);

@@ -401,7 +401,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 	do {
 		seq = raw_read_seqcount_latch(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
-		now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+		now = ktime_to_ns(tkr->base);
+
+		now += clocksource_delta(tkr->read(tkr->clock),
+					 tkr->cycle_last, tkr->mask);
 	} while (read_seqcount_retry(&tkf->seq, seq));
 
 	return now;

@@ -23,7 +23,9 @@
 
 #include "timekeeping_internal.h"
 
-static unsigned int sleep_time_bin[32] = {0};
+#define NUM_BINS 32
+
+static unsigned int sleep_time_bin[NUM_BINS] = {0};
 
 static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
 {
@@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init);
 
 void tk_debug_account_sleep_time(struct timespec64 *t)
 {
-	sleep_time_bin[fls(t->tv_sec)]++;
+	/* Cap bin index so we don't overflow the array */
+	int bin = min(fls(t->tv_sec), NUM_BINS-1);
+
+	sleep_time_bin[bin]++;
 }

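The overflow being fixed: fls() returns a 1-based index of the highest set bit, so any tv_sec with bit 31 set makes it return 32, one past the end of the 32-entry array. A sketch of the capping:

/* fls(0x40000000) == 31 is the largest in-range result; any value with a
 * higher bit set must be folded into the last bin (index NUM_BINS - 1) */
static unsigned int sleep_bin(int sec)
{
	unsigned int bin = fls(sec);	/* 0..32 */

	return bin < 32 ? bin : 31;	/* same effect as min(bin, NUM_BINS - 1) */
}
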
@@ -1496,6 +1496,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 	struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
 	u64 expires = KTIME_MAX;
 	unsigned long nextevt;
+	bool is_max_delta;
 
 	/*
	 * Pretend that there is no timer pending if the cpu is offline.
@@ -1506,6 +1507,7 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 
 	spin_lock(&base->lock);
 	nextevt = __next_timer_interrupt(base);
+	is_max_delta = (nextevt == base->clk + NEXT_TIMER_MAX_DELTA);
 	base->next_expiry = nextevt;
 	/*
	 * We have a fresh next event. Check whether we can forward the base:
@@ -1519,7 +1521,8 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
 		expires = basem;
 		base->is_idle = false;
 	} else {
-		expires = basem + (nextevt - basej) * TICK_NSEC;
+		if (!is_max_delta)
+			expires = basem + (nextevt - basej) * TICK_NSEC;
 		/*
		 * If we expect to sleep more than a tick, mark the base idle:
		 */

@@ -223,7 +223,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	what |= MASK_TC_BIT(op_flags, META);
 	what |= MASK_TC_BIT(op_flags, PREFLUSH);
 	what |= MASK_TC_BIT(op_flags, FUA);
-	if (op == REQ_OP_DISCARD)
+	if (op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)
 		what |= BLK_TC_ACT(BLK_TC_DISCARD);
 	if (op == REQ_OP_FLUSH)
 		what |= BLK_TC_ACT(BLK_TC_FLUSH);