Merge branch 'perf/urgent' into perf/core, to pick up the latest fixes

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Author: Ingo Molnar
Date:   2020-03-06 11:56:40 +01:00
975 changed files with 16181 additions and 11423 deletions


@@ -1101,13 +1101,11 @@ static void audit_log_feature_change(int which, u32 old_feature, u32 new_feature
audit_log_end(ab);
}
static int audit_set_feature(struct sk_buff *skb)
static int audit_set_feature(struct audit_features *uaf)
{
struct audit_features *uaf;
int i;
BUILD_BUG_ON(AUDIT_LAST_FEATURE + 1 > ARRAY_SIZE(audit_feature_names));
uaf = nlmsg_data(nlmsg_hdr(skb));
/* if there is ever a version 2 we should handle that here */
@@ -1175,6 +1173,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
u32 seq;
void *data;
int data_len;
int err;
struct audit_buffer *ab;
u16 msg_type = nlh->nlmsg_type;
@@ -1188,6 +1187,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
seq = nlh->nlmsg_seq;
data = nlmsg_data(nlh);
data_len = nlmsg_len(nlh);
switch (msg_type) {
case AUDIT_GET: {
@@ -1211,7 +1211,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
struct audit_status s;
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
if (s.mask & AUDIT_STATUS_ENABLED) {
err = audit_set_enabled(s.enabled);
if (err < 0)
@@ -1315,7 +1315,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return err;
break;
case AUDIT_SET_FEATURE:
err = audit_set_feature(skb);
if (data_len < sizeof(struct audit_features))
return -EINVAL;
err = audit_set_feature(data);
if (err)
return err;
break;
@@ -1327,6 +1329,8 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
err = audit_filter(msg_type, AUDIT_FILTER_USER);
if (err == 1) { /* match or error */
char *str = data;
err = 0;
if (msg_type == AUDIT_USER_TTY) {
err = tty_audit_push();
@@ -1334,26 +1338,24 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
break;
}
audit_log_user_recv_msg(&ab, msg_type);
if (msg_type != AUDIT_USER_TTY)
if (msg_type != AUDIT_USER_TTY) {
/* ensure NULL termination */
str[data_len - 1] = '\0';
audit_log_format(ab, " msg='%.*s'",
AUDIT_MESSAGE_TEXT_MAX,
(char *)data);
else {
int size;
str);
} else {
audit_log_format(ab, " data=");
size = nlmsg_len(nlh);
if (size > 0 &&
((unsigned char *)data)[size - 1] == '\0')
size--;
audit_log_n_untrustedstring(ab, data, size);
if (data_len > 0 && str[data_len - 1] == '\0')
data_len--;
audit_log_n_untrustedstring(ab, str, data_len);
}
audit_log_end(ab);
}
break;
case AUDIT_ADD_RULE:
case AUDIT_DEL_RULE:
if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
if (data_len < sizeof(struct audit_rule_data))
return -EINVAL;
if (audit_enabled == AUDIT_LOCKED) {
audit_log_common_recv_msg(audit_context(), &ab,
@@ -1365,7 +1367,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
audit_log_end(ab);
return -EPERM;
}
err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh));
err = audit_rule_change(msg_type, seq, data, data_len);
break;
case AUDIT_LIST_RULES:
err = audit_list_rules_send(skb, seq);
@@ -1380,7 +1382,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_MAKE_EQUIV: {
void *bufp = data;
u32 sizes[2];
size_t msglen = nlmsg_len(nlh);
size_t msglen = data_len;
char *old, *new;
err = -EINVAL;
@@ -1456,7 +1458,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
memset(&s, 0, sizeof(s));
/* guard against past and future API changes */
memcpy(&s, data, min_t(size_t, sizeof(s), nlmsg_len(nlh)));
memcpy(&s, data, min_t(size_t, sizeof(s), data_len));
/* check if new data is valid */
if ((s.enabled != 0 && s.enabled != 1) ||
(s.log_passwd != 0 && s.log_passwd != 1))

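Note on the audit changes above: the fix threads nlmsg_len(nlh) through a single data_len value and length-checks fixed-size payloads (AUDIT_SET_FEATURE, AUDIT_ADD_RULE) before dereferencing them, while user-message text is forcibly NUL-terminated. A minimal sketch of the same bounded-copy pattern, with a hypothetical demo_features payload standing in for struct audit_features:

struct demo_features {
        u32 vers;
        u32 mask;
        u32 features;
        u32 lock;
};

int demo_apply_features(const struct demo_features *uf);  /* hypothetical */

static int demo_set_feature(const void *data, size_t data_len)
{
        struct demo_features uf;

        /* Too short to contain the structure at all: reject. */
        if (data_len < sizeof(uf))
                return -EINVAL;

        /* Guard against past and future API changes: copy no more than
         * we understand and no more than the sender actually provided. */
        memset(&uf, 0, sizeof(uf));
        memcpy(&uf, data, min_t(size_t, sizeof(uf), data_len));

        return demo_apply_features(&uf);
}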

@@ -456,6 +456,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
bufp = data->buf;
for (i = 0; i < data->field_count; i++) {
struct audit_field *f = &entry->rule.fields[i];
u32 f_val;
err = -EINVAL;
@@ -464,12 +465,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
goto exit_free;
f->type = data->fields[i];
f->val = data->values[i];
f_val = data->values[i];
/* Support legacy tests for a valid loginuid */
if ((f->type == AUDIT_LOGINUID) && (f->val == AUDIT_UID_UNSET)) {
if ((f->type == AUDIT_LOGINUID) && (f_val == AUDIT_UID_UNSET)) {
f->type = AUDIT_LOGINUID_SET;
f->val = 0;
f_val = 0;
entry->rule.pflags |= AUDIT_LOGINUID_LEGACY;
}
@@ -485,7 +486,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_SUID:
case AUDIT_FSUID:
case AUDIT_OBJ_UID:
f->uid = make_kuid(current_user_ns(), f->val);
f->uid = make_kuid(current_user_ns(), f_val);
if (!uid_valid(f->uid))
goto exit_free;
break;
@@ -494,11 +495,12 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_SGID:
case AUDIT_FSGID:
case AUDIT_OBJ_GID:
f->gid = make_kgid(current_user_ns(), f->val);
f->gid = make_kgid(current_user_ns(), f_val);
if (!gid_valid(f->gid))
goto exit_free;
break;
case AUDIT_ARCH:
f->val = f_val;
entry->rule.arch_f = f;
break;
case AUDIT_SUBJ_USER:
@@ -511,11 +513,13 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
case AUDIT_OBJ_TYPE:
case AUDIT_OBJ_LEV_LOW:
case AUDIT_OBJ_LEV_HIGH:
str = audit_unpack_string(&bufp, &remain, f->val);
if (IS_ERR(str))
str = audit_unpack_string(&bufp, &remain, f_val);
if (IS_ERR(str)) {
err = PTR_ERR(str);
goto exit_free;
entry->rule.buflen += f->val;
}
entry->rule.buflen += f_val;
f->lsm_str = str;
err = security_audit_rule_init(f->type, f->op, str,
(void **)&f->lsm_rule);
/* Keep currently invalid fields around in case they
@@ -524,68 +528,71 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
pr_warn("audit rule for LSM \'%s\' is invalid\n",
str);
err = 0;
}
if (err) {
kfree(str);
} else if (err)
goto exit_free;
} else
f->lsm_str = str;
break;
case AUDIT_WATCH:
str = audit_unpack_string(&bufp, &remain, f->val);
if (IS_ERR(str))
str = audit_unpack_string(&bufp, &remain, f_val);
if (IS_ERR(str)) {
err = PTR_ERR(str);
goto exit_free;
entry->rule.buflen += f->val;
err = audit_to_watch(&entry->rule, str, f->val, f->op);
}
err = audit_to_watch(&entry->rule, str, f_val, f->op);
if (err) {
kfree(str);
goto exit_free;
}
entry->rule.buflen += f_val;
break;
case AUDIT_DIR:
str = audit_unpack_string(&bufp, &remain, f->val);
if (IS_ERR(str))
str = audit_unpack_string(&bufp, &remain, f_val);
if (IS_ERR(str)) {
err = PTR_ERR(str);
goto exit_free;
entry->rule.buflen += f->val;
}
err = audit_make_tree(&entry->rule, str, f->op);
kfree(str);
if (err)
goto exit_free;
entry->rule.buflen += f_val;
break;
case AUDIT_INODE:
f->val = f_val;
err = audit_to_inode(&entry->rule, f);
if (err)
goto exit_free;
break;
case AUDIT_FILTERKEY:
if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
if (entry->rule.filterkey || f_val > AUDIT_MAX_KEY_LEN)
goto exit_free;
str = audit_unpack_string(&bufp, &remain, f->val);
if (IS_ERR(str))
goto exit_free;
entry->rule.buflen += f->val;
entry->rule.filterkey = str;
break;
case AUDIT_EXE:
if (entry->rule.exe || f->val > PATH_MAX)
goto exit_free;
str = audit_unpack_string(&bufp, &remain, f->val);
str = audit_unpack_string(&bufp, &remain, f_val);
if (IS_ERR(str)) {
err = PTR_ERR(str);
goto exit_free;
}
entry->rule.buflen += f->val;
audit_mark = audit_alloc_mark(&entry->rule, str, f->val);
entry->rule.buflen += f_val;
entry->rule.filterkey = str;
break;
case AUDIT_EXE:
if (entry->rule.exe || f_val > PATH_MAX)
goto exit_free;
str = audit_unpack_string(&bufp, &remain, f_val);
if (IS_ERR(str)) {
err = PTR_ERR(str);
goto exit_free;
}
audit_mark = audit_alloc_mark(&entry->rule, str, f_val);
if (IS_ERR(audit_mark)) {
kfree(str);
err = PTR_ERR(audit_mark);
goto exit_free;
}
entry->rule.buflen += f_val;
entry->rule.exe = audit_mark;
break;
default:
f->val = f_val;
break;
}
}


@@ -4142,9 +4142,9 @@ int btf_distill_func_proto(struct bpf_verifier_log *log,
* EFAULT - verifier bug
* 0 - 99% match. The last 1% is validated by the verifier.
*/
int btf_check_func_type_match(struct bpf_verifier_log *log,
struct btf *btf1, const struct btf_type *t1,
struct btf *btf2, const struct btf_type *t2)
static int btf_check_func_type_match(struct bpf_verifier_log *log,
struct btf *btf1, const struct btf_type *t1,
struct btf *btf2, const struct btf_type *t2)
{
const struct btf_param *args1, *args2;
const char *fn1, *fn2, *s1, *s2;


@@ -56,6 +56,7 @@ struct htab_elem {
union {
struct bpf_htab *htab;
struct pcpu_freelist_node fnode;
struct htab_elem *batch_flink;
};
};
};
@@ -126,6 +127,17 @@ free_elems:
bpf_map_area_free(htab->elems);
}
/* The LRU list has a lock (lru_lock). Each htab bucket has a lock
* (bucket_lock). If both locks need to be acquired together, the lock
* order is always lru_lock -> bucket_lock and this only happens in
* bpf_lru_list.c logic. For example, certain code path of
* bpf_lru_pop_free(), which is called by function prealloc_lru_pop(),
* will acquire lru_lock first followed by acquiring bucket_lock.
*
* In hashtab.c, to avoid deadlock, lock acquisition of
* bucket_lock followed by lru_lock is not allowed. In such cases,
* bucket_lock needs to be released first before acquiring lru_lock.
*/
static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
u32 hash)
{
@@ -1256,10 +1268,12 @@ __htab_map_lookup_and_delete_batch(struct bpf_map *map,
void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
void *ubatch = u64_to_user_ptr(attr->batch.in_batch);
u32 batch, max_count, size, bucket_size;
struct htab_elem *node_to_free = NULL;
u64 elem_map_flags, map_flags;
struct hlist_nulls_head *head;
struct hlist_nulls_node *n;
unsigned long flags;
unsigned long flags = 0;
bool locked = false;
struct htab_elem *l;
struct bucket *b;
int ret = 0;
@@ -1319,15 +1333,25 @@ again_nocopy:
dst_val = values;
b = &htab->buckets[batch];
head = &b->head;
raw_spin_lock_irqsave(&b->lock, flags);
/* do not grab the lock unless we need it (bucket_cnt > 0). */
if (locked)
raw_spin_lock_irqsave(&b->lock, flags);
bucket_cnt = 0;
hlist_nulls_for_each_entry_rcu(l, n, head, hash_node)
bucket_cnt++;
if (bucket_cnt && !locked) {
locked = true;
goto again_nocopy;
}
if (bucket_cnt > (max_count - total)) {
if (total == 0)
ret = -ENOSPC;
/* Note that since bucket_cnt > 0 here, it is implicit
* that the lock was grabbed, so release it.
*/
raw_spin_unlock_irqrestore(&b->lock, flags);
rcu_read_unlock();
this_cpu_dec(bpf_prog_active);
@@ -1337,6 +1361,9 @@ again_nocopy:
if (bucket_cnt > bucket_size) {
bucket_size = bucket_cnt;
/* Note that since bucket_cnt > 0 here, it is implicit
* that the lock was grabbed, so release it.
*/
raw_spin_unlock_irqrestore(&b->lock, flags);
rcu_read_unlock();
this_cpu_dec(bpf_prog_active);
@@ -1346,6 +1373,10 @@ again_nocopy:
goto alloc;
}
/* Next block is only safe to run if you have grabbed the lock */
if (!locked)
goto next_batch;
hlist_nulls_for_each_entry_safe(l, n, head, hash_node) {
memcpy(dst_key, l->key, key_size);
@@ -1370,16 +1401,33 @@ again_nocopy:
}
if (do_delete) {
hlist_nulls_del_rcu(&l->hash_node);
if (is_lru_map)
bpf_lru_push_free(&htab->lru, &l->lru_node);
else
/* bpf_lru_push_free() will acquire lru_lock, which
* may cause deadlock. See comments in function
* prealloc_lru_pop(). Let us do bpf_lru_push_free()
* after releasing the bucket lock.
*/
if (is_lru_map) {
l->batch_flink = node_to_free;
node_to_free = l;
} else {
free_htab_elem(htab, l);
}
}
dst_key += key_size;
dst_val += value_size;
}
raw_spin_unlock_irqrestore(&b->lock, flags);
locked = false;
while (node_to_free) {
l = node_to_free;
node_to_free = node_to_free->batch_flink;
bpf_lru_push_free(&htab->lru, &l->lru_node);
}
next_batch:
/* If we are not copying data, we can go to the next bucket and avoid
* unlocking the RCU.
*/

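Note on the hashtab batch-lookup hunk above: two patterns are combined. The bucket lock is taken only once a non-empty bucket is observed (the locked flag and the goto again_nocopy retry), and LRU elements are never freed under the bucket lock; they are chained through batch_flink and handed to bpf_lru_push_free() only after raw_spin_unlock_irqrestore(), preserving the documented lru_lock -> bucket_lock order. The deferred-free shape, reduced to a hedged sketch (all demo_* names are hypothetical):

struct demo_node {
        struct demo_node *batch_flink;  /* chain of deferred frees */
};

struct demo_bucket {
        raw_spinlock_t lock;
};

struct demo_node *demo_unlink_one(struct demo_bucket *b);
void demo_destroy(struct demo_node *n);  /* may take another lock */

static void demo_flush_bucket(struct demo_bucket *b)
{
        struct demo_node *node_to_free = NULL, *n;
        unsigned long flags;

        raw_spin_lock_irqsave(&b->lock, flags);
        while ((n = demo_unlink_one(b)) != NULL) {
                /* Defer destruction: demo_destroy() could otherwise
                 * acquire its own lock while we hold b->lock. */
                n->batch_flink = node_to_free;
                node_to_free = n;
        }
        raw_spin_unlock_irqrestore(&b->lock, flags);

        while (node_to_free) {          /* bucket lock dropped: safe */
                n = node_to_free;
                node_to_free = node_to_free->batch_flink;
                demo_destroy(n);
        }
}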

@@ -321,7 +321,7 @@ int bpf_prog_offload_info_fill(struct bpf_prog_info *info,
ulen = info->jited_prog_len;
info->jited_prog_len = aux->offload->jited_len;
if (info->jited_prog_len & ulen) {
if (info->jited_prog_len && ulen) {
uinsns = u64_to_user_ptr(info->jited_prog_insns);
ulen = min_t(u32, info->jited_prog_len, ulen);
if (copy_to_user(uinsns, aux->offload->jited_image, ulen)) {


@@ -26,70 +26,6 @@
#include <linux/uaccess.h>
static int __compat_get_timeval(struct timeval *tv, const struct old_timeval32 __user *ctv)
{
return (!access_ok(ctv, sizeof(*ctv)) ||
__get_user(tv->tv_sec, &ctv->tv_sec) ||
__get_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
}
static int __compat_put_timeval(const struct timeval *tv, struct old_timeval32 __user *ctv)
{
return (!access_ok(ctv, sizeof(*ctv)) ||
__put_user(tv->tv_sec, &ctv->tv_sec) ||
__put_user(tv->tv_usec, &ctv->tv_usec)) ? -EFAULT : 0;
}
static int __compat_get_timespec(struct timespec *ts, const struct old_timespec32 __user *cts)
{
return (!access_ok(cts, sizeof(*cts)) ||
__get_user(ts->tv_sec, &cts->tv_sec) ||
__get_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
static int __compat_put_timespec(const struct timespec *ts, struct old_timespec32 __user *cts)
{
return (!access_ok(cts, sizeof(*cts)) ||
__put_user(ts->tv_sec, &cts->tv_sec) ||
__put_user(ts->tv_nsec, &cts->tv_nsec)) ? -EFAULT : 0;
}
int compat_get_timeval(struct timeval *tv, const void __user *utv)
{
if (COMPAT_USE_64BIT_TIME)
return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0;
else
return __compat_get_timeval(tv, utv);
}
EXPORT_SYMBOL_GPL(compat_get_timeval);
int compat_put_timeval(const struct timeval *tv, void __user *utv)
{
if (COMPAT_USE_64BIT_TIME)
return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0;
else
return __compat_put_timeval(tv, utv);
}
EXPORT_SYMBOL_GPL(compat_put_timeval);
int compat_get_timespec(struct timespec *ts, const void __user *uts)
{
if (COMPAT_USE_64BIT_TIME)
return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0;
else
return __compat_get_timespec(ts, uts);
}
EXPORT_SYMBOL_GPL(compat_get_timespec);
int compat_put_timespec(const struct timespec *ts, void __user *uts)
{
if (COMPAT_USE_64BIT_TIME)
return copy_to_user(uts, ts, sizeof(*ts)) ? -EFAULT : 0;
else
return __compat_put_timespec(ts, uts);
}
EXPORT_SYMBOL_GPL(compat_put_timespec);
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
/*


@@ -302,9 +302,16 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
phys_addr_t align = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
phys_addr_t mask = align - 1;
unsigned long node = rmem->fdt_node;
bool default_cma = of_get_flat_dt_prop(node, "linux,cma-default", NULL);
struct cma *cma;
int err;
if (size_cmdline != -1 && default_cma) {
pr_info("Reserved memory: bypass %s node, using cmdline CMA params instead\n",
rmem->name);
return -EBUSY;
}
if (!of_get_flat_dt_prop(node, "reusable", NULL) ||
of_get_flat_dt_prop(node, "no-map", NULL))
return -EINVAL;
@@ -322,7 +329,7 @@ static int __init rmem_cma_setup(struct reserved_mem *rmem)
/* Architecture specific contiguous memory fixup. */
dma_contiguous_early_fixup(rmem->base, rmem->size);
if (of_get_flat_dt_prop(node, "linux,cma-default", NULL))
if (default_cma)
dma_contiguous_set_default(cma);
rmem->ops = &rmem_cma_ops;


@@ -23,18 +23,6 @@
*/
unsigned int zone_dma_bits __ro_after_init = 24;
static void report_addr(struct device *dev, dma_addr_t dma_addr, size_t size)
{
if (!dev->dma_mask) {
dev_err_once(dev, "DMA map on device without dma_mask\n");
} else if (*dev->dma_mask >= DMA_BIT_MASK(32) || dev->bus_dma_limit) {
dev_err_once(dev,
"overflow %pad+%zu of DMA mask %llx bus limit %llx\n",
&dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
}
WARN_ON_ONCE(1);
}
static inline dma_addr_t phys_to_dma_direct(struct device *dev,
phys_addr_t phys)
{
@@ -357,13 +345,6 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
EXPORT_SYMBOL(dma_direct_unmap_sg);
#endif
static inline bool dma_direct_possible(struct device *dev, dma_addr_t dma_addr,
size_t size)
{
return swiotlb_force != SWIOTLB_FORCE &&
dma_capable(dev, dma_addr, size, true);
}
dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
unsigned long offset, size_t size, enum dma_data_direction dir,
unsigned long attrs)
@@ -371,9 +352,16 @@ dma_addr_t dma_direct_map_page(struct device *dev, struct page *page,
phys_addr_t phys = page_to_phys(page) + offset;
dma_addr_t dma_addr = phys_to_dma(dev, phys);
if (unlikely(!dma_direct_possible(dev, dma_addr, size)) &&
!swiotlb_map(dev, &phys, &dma_addr, size, dir, attrs)) {
report_addr(dev, dma_addr, size);
if (unlikely(swiotlb_force == SWIOTLB_FORCE))
return swiotlb_map(dev, phys, size, dir, attrs);
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
if (swiotlb_force != SWIOTLB_NO_FORCE)
return swiotlb_map(dev, phys, size, dir, attrs);
dev_WARN_ONCE(dev, 1,
"DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
&dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
return DMA_MAPPING_ERROR;
}
@@ -411,7 +399,10 @@ dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
dma_addr_t dma_addr = paddr;
if (unlikely(!dma_capable(dev, dma_addr, size, false))) {
report_addr(dev, dma_addr, size);
dev_err_once(dev,
"DMA addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
&dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
WARN_ON_ONCE(1);
return DMA_MAPPING_ERROR;
}
@@ -472,28 +463,26 @@ int dma_direct_mmap(struct device *dev, struct vm_area_struct *vma,
}
#endif /* CONFIG_MMU */
/*
* Because 32-bit DMA masks are so common we expect every architecture to be
* able to satisfy them - either by not supporting more physical memory, or by
* providing a ZONE_DMA32. If neither is the case, the architecture needs to
* use an IOMMU instead of the direct mapping.
*/
int dma_direct_supported(struct device *dev, u64 mask)
{
u64 min_mask;
u64 min_mask = (max_pfn - 1) << PAGE_SHIFT;
if (IS_ENABLED(CONFIG_ZONE_DMA))
min_mask = DMA_BIT_MASK(zone_dma_bits);
else
min_mask = DMA_BIT_MASK(32);
min_mask = min_t(u64, min_mask, (max_pfn - 1) << PAGE_SHIFT);
/*
* Because 32-bit DMA masks are so common we expect every architecture
* to be able to satisfy them - either by not supporting more physical
* memory, or by providing a ZONE_DMA32. If neither is the case, the
* architecture needs to use an IOMMU instead of the direct mapping.
*/
if (mask >= DMA_BIT_MASK(32))
return 1;
/*
* This check needs to be against the actual bit mask value, so
* use __phys_to_dma() here so that the SME encryption mask isn't
* part of the check.
*/
if (IS_ENABLED(CONFIG_ZONE_DMA))
min_mask = min_t(u64, min_mask, DMA_BIT_MASK(zone_dma_bits));
return mask >= __phys_to_dma(dev, min_mask);
}


@@ -22,6 +22,7 @@
#include <linux/cache.h>
#include <linux/dma-direct.h>
#include <linux/dma-noncoherent.h>
#include <linux/mm.h>
#include <linux/export.h>
#include <linux/spinlock.h>
@@ -656,35 +657,38 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr,
}
/*
* Create a swiotlb mapping for the buffer at @phys, and in case of DMAing
* Create a swiotlb mapping for the buffer at @paddr, and in case of DMAing
* to the device copy the data into it as well.
*/
bool swiotlb_map(struct device *dev, phys_addr_t *phys, dma_addr_t *dma_addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
enum dma_data_direction dir, unsigned long attrs)
{
trace_swiotlb_bounced(dev, *dma_addr, size, swiotlb_force);
phys_addr_t swiotlb_addr;
dma_addr_t dma_addr;
if (unlikely(swiotlb_force == SWIOTLB_NO_FORCE)) {
dev_warn_ratelimited(dev,
"Cannot do DMA to address %pa\n", phys);
return false;
}
trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size,
swiotlb_force);
/* Oh well, have to allocate and map a bounce buffer. */
*phys = swiotlb_tbl_map_single(dev, __phys_to_dma(dev, io_tlb_start),
*phys, size, size, dir, attrs);
if (*phys == (phys_addr_t)DMA_MAPPING_ERROR)
return false;
swiotlb_addr = swiotlb_tbl_map_single(dev,
__phys_to_dma(dev, io_tlb_start),
paddr, size, size, dir, attrs);
if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
/* Ensure that the address returned is DMA'ble */
*dma_addr = __phys_to_dma(dev, *phys);
if (unlikely(!dma_capable(dev, *dma_addr, size, true))) {
swiotlb_tbl_unmap_single(dev, *phys, size, size, dir,
dma_addr = __phys_to_dma(dev, swiotlb_addr);
if (unlikely(!dma_capable(dev, dma_addr, size, true))) {
swiotlb_tbl_unmap_single(dev, swiotlb_addr, size, size, dir,
attrs | DMA_ATTR_SKIP_CPU_SYNC);
return false;
dev_WARN_ONCE(dev, 1,
"swiotlb addr %pad+%zu overflow (mask %llx, bus limit %llx).\n",
&dma_addr, size, *dev->dma_mask, dev->bus_dma_limit);
return DMA_MAPPING_ERROR;
}
return true;
if (!dev_is_dma_coherent(dev) && !(attrs & DMA_ATTR_SKIP_CPU_SYNC))
arch_sync_dma_for_device(swiotlb_addr, size, dir);
return dma_addr;
}
size_t swiotlb_max_mapping_size(struct device *dev)


@@ -128,8 +128,6 @@ static inline void unregister_handler_proc(unsigned int irq,
extern bool irq_can_set_affinity_usr(unsigned int irq);
extern int irq_select_affinity_usr(unsigned int irq);
extern void irq_set_thread_affinity(struct irq_desc *desc);
extern int irq_do_set_affinity(struct irq_data *data,


@@ -481,23 +481,9 @@ int irq_setup_affinity(struct irq_desc *desc)
{
return irq_select_affinity(irq_desc_get_irq(desc));
}
#endif
#endif /* CONFIG_AUTO_IRQ_AFFINITY */
#endif /* CONFIG_SMP */
/*
* Called when a bogus affinity is set via /proc/irq
*/
int irq_select_affinity_usr(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
int ret;
raw_spin_lock_irqsave(&desc->lock, flags);
ret = irq_setup_affinity(desc);
raw_spin_unlock_irqrestore(&desc->lock, flags);
return ret;
}
#endif
/**
* irq_set_vcpu_affinity - Set vcpu affinity for the interrupt


@@ -111,6 +111,28 @@ static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
return show_irq_affinity(AFFINITY_LIST, m);
}
#ifndef CONFIG_AUTO_IRQ_AFFINITY
static inline int irq_select_affinity_usr(unsigned int irq)
{
/*
* If the interrupt is started up already then this fails. The
* interrupt is assigned to an online CPU already. There is no
* point to move it around randomly. Tell user space that the
* selected mask is bogus.
*
* If not then any change to the affinity is pointless because the
* startup code invokes irq_setup_affinity() which will select
* an online CPU anyway.
*/
return -EINVAL;
}
#else
/* ALPHA magic affinity auto selector. Keep it for historical reasons. */
static inline int irq_select_affinity_usr(unsigned int irq)
{
return irq_select_affinity(irq);
}
#endif
static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)


@@ -1681,7 +1681,7 @@ static unsigned long minimum_image_size(unsigned long saveable)
* hibernation for allocations made while saving the image and for device
* drivers, in case they need to allocate memory from their hibernation
* callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough
* estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through
* estimate) and reserved_size divided by PAGE_SIZE (which is tunable through
* /sys/power/reserved_size, respectively). To make this happen, we compute the
* total number of available page frames and allocate at least
*


@@ -131,11 +131,12 @@ static void s2idle_loop(void)
* to avoid them upfront.
*/
for (;;) {
if (s2idle_ops && s2idle_ops->wake)
s2idle_ops->wake();
if (pm_wakeup_pending())
if (s2idle_ops && s2idle_ops->wake) {
if (s2idle_ops->wake())
break;
} else if (pm_wakeup_pending()) {
break;
}
pm_wakeup_clear(false);


@@ -552,27 +552,32 @@ void resched_cpu(int cpu)
*/
int get_nohz_timer_target(void)
{
int i, cpu = smp_processor_id();
int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd;
if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER))
return cpu;
if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
if (!idle_cpu(cpu))
return cpu;
default_cpu = cpu;
}
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
for_each_cpu_and(i, sched_domain_span(sd),
housekeeping_cpumask(HK_FLAG_TIMER)) {
if (cpu == i)
continue;
if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) {
if (!idle_cpu(i)) {
cpu = i;
goto unlock;
}
}
}
if (!housekeeping_cpu(cpu, HK_FLAG_TIMER))
cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
if (default_cpu == -1)
default_cpu = housekeeping_any_cpu(HK_FLAG_TIMER);
cpu = default_cpu;
unlock:
rcu_read_unlock();
return cpu;
@@ -1442,17 +1447,6 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
#ifdef CONFIG_SMP
static inline bool is_per_cpu_kthread(struct task_struct *p)
{
if (!(p->flags & PF_KTHREAD))
return false;
if (p->nr_cpus_allowed != 1)
return false;
return true;
}
/*
* Per-CPU kthreads are allowed to run on !active && online CPUs, see
* __set_cpus_allowed_ptr() and select_fallback_rq().
@@ -3669,28 +3663,32 @@ static void sched_tick_remote(struct work_struct *work)
* statistics and checks timeslices in a time-independent way, regardless
* of when exactly it is running.
*/
if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu))
if (!tick_nohz_tick_stopped_cpu(cpu))
goto out_requeue;
rq_lock_irq(rq, &rf);
curr = rq->curr;
if (is_idle_task(curr) || cpu_is_offline(cpu))
if (cpu_is_offline(cpu))
goto out_unlock;
curr = rq->curr;
update_rq_clock(rq);
delta = rq_clock_task(rq) - curr->se.exec_start;
/*
* Make sure the next tick runs within a reasonable
* amount of time.
*/
WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
if (!is_idle_task(curr)) {
/*
* Make sure the next tick runs within a reasonable
* amount of time.
*/
delta = rq_clock_task(rq) - curr->se.exec_start;
WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
}
curr->sched_class->task_tick(rq, curr, 0);
calc_load_nohz_remote(rq);
out_unlock:
rq_unlock_irq(rq, &rf);
out_requeue:
/*
* Run the remote tick once per second (1Hz). This arbitrary
* frequency is large enough to avoid overload but short enough
@@ -7063,8 +7061,15 @@ void sched_move_task(struct task_struct *tsk)
if (queued)
enqueue_task(rq, tsk, queue_flags);
if (running)
if (running) {
set_next_task(rq, tsk);
/*
* After changing group, the running task may have joined a
* throttled one but it's still the running task. Trigger a
* resched to make sure that task can still run.
*/
resched_curr(rq);
}
task_rq_unlock(rq, tsk, &rf);
}
@@ -7260,7 +7265,7 @@ capacity_from_percent(char *buf)
&req.percent);
if (req.ret)
return req;
if (req.percent > UCLAMP_PERCENT_SCALE) {
if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
req.ret = -ERANGE;
return req;
}


@@ -3516,7 +3516,6 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
* attach_entity_load_avg - attach this entity to its cfs_rq load avg
* @cfs_rq: cfs_rq to attach to
* @se: sched_entity to attach
* @flags: migration hints
*
* Must call update_cfs_rq_load_avg() before this, since we rely on
* cfs_rq->avg.last_update_time being current.
@@ -5912,6 +5911,20 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
(available_idle_cpu(prev) || sched_idle_cpu(prev)))
return prev;
/*
* Allow a per-cpu kthread to stack with the wakee if the
* kworker thread's CPU and the task's previous CPU are the same.
* The assumption is that the wakee queued work for the
* per-cpu kthread that is now complete and the wakeup is
* essentially a sync wakeup. An obvious example of this
* pattern is IO completions.
*/
if (is_per_cpu_kthread(current) &&
prev == smp_processor_id() &&
this_rq()->nr_running <= 1) {
return prev;
}
/* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
if (recent_used_cpu != prev &&
@@ -8324,6 +8337,8 @@ static inline void update_sg_wakeup_stats(struct sched_domain *sd,
sgs->group_capacity = group->sgc->capacity;
sgs->group_weight = group->group_weight;
sgs->group_type = group_classify(sd->imbalance_pct, group, sgs);
/*
@@ -8658,10 +8673,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/*
* Try to use spare capacity of local group without overloading it or
* emptying busiest.
* XXX Spreading tasks across NUMA nodes is not always the best policy
* and special care should be taken for SD_NUMA domain level before
* spreading the tasks. For now, load_balance() fully relies on
* NUMA_BALANCING and fbq_classify_group/rq to override the decision.
*/
if (local->group_type == group_has_spare) {
if (busiest->group_type > group_fully_busy) {
@@ -8701,16 +8712,37 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
env->migration_type = migrate_task;
lsub_positive(&nr_diff, local->sum_nr_running);
env->imbalance = nr_diff >> 1;
return;
} else {
/*
* If there is no overload, we just want to even the number of
* idle cpus.
*/
env->migration_type = migrate_task;
env->imbalance = max_t(long, 0, (local->idle_cpus -
busiest->idle_cpus) >> 1);
}
/* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA) {
unsigned int imbalance_min;
/*
* Compute an allowed imbalance based on a simple
* pair of communicating tasks that should remain
* local and ignore them.
*
* NOTE: Generally this would have been based on
* the domain size and this was evaluated. However,
* the benefit is similar across a range of workloads
* and machines but scaling by the domain size adds
* the risk that lower domains have to be rebalanced.
*/
imbalance_min = 2;
if (busiest->sum_nr_running <= imbalance_min)
env->imbalance = 0;
}
/*
* If there is no overload, we just want to even the number of
* idle cpus.
*/
env->migration_type = migrate_task;
env->imbalance = max_t(long, 0, (local->idle_cpus -
busiest->idle_cpus) >> 1);
return;
}


@@ -231,16 +231,11 @@ static inline int calc_load_read_idx(void)
return calc_load_idx & 1;
}
void calc_load_nohz_start(void)
static void calc_load_nohz_fold(struct rq *rq)
{
struct rq *this_rq = this_rq();
long delta;
/*
* We're going into NO_HZ mode, if there's any pending delta, fold it
* into the pending NO_HZ delta.
*/
delta = calc_load_fold_active(this_rq, 0);
delta = calc_load_fold_active(rq, 0);
if (delta) {
int idx = calc_load_write_idx();
@@ -248,6 +243,24 @@ void calc_load_nohz_start(void)
}
}
void calc_load_nohz_start(void)
{
/*
* We're going into NO_HZ mode, if there's any pending delta, fold it
* into the pending NO_HZ delta.
*/
calc_load_nohz_fold(this_rq());
}
/*
* Keep track of the load for NOHZ_FULL, must be called between
* calc_load_nohz_{start,stop}().
*/
void calc_load_nohz_remote(struct rq *rq)
{
calc_load_nohz_fold(rq);
}
void calc_load_nohz_stop(void)
{
struct rq *this_rq = this_rq();
@@ -268,7 +281,7 @@ void calc_load_nohz_stop(void)
this_rq->calc_load_update += LOAD_FREQ;
}
static long calc_load_nohz_fold(void)
static long calc_load_nohz_read(void)
{
int idx = calc_load_read_idx();
long delta = 0;
@@ -323,7 +336,7 @@ static void calc_global_nohz(void)
}
#else /* !CONFIG_NO_HZ_COMMON */
static inline long calc_load_nohz_fold(void) { return 0; }
static inline long calc_load_nohz_read(void) { return 0; }
static inline void calc_global_nohz(void) { }
#endif /* CONFIG_NO_HZ_COMMON */
@@ -346,7 +359,7 @@ void calc_global_load(unsigned long ticks)
/*
* Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
*/
delta = calc_load_nohz_fold();
delta = calc_load_nohz_read();
if (delta)
atomic_long_add(delta, &calc_load_tasks);


@@ -1199,6 +1199,9 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
if (static_branch_likely(&psi_disabled))
return -EOPNOTSUPP;
if (!nbytes)
return -EINVAL;
buf_size = min(nbytes, sizeof(buf));
if (copy_from_user(buf, user_buf, buf_size))
return -EFAULT;


@@ -896,7 +896,7 @@ struct rq {
*/
unsigned long nr_uninterruptible;
struct task_struct *curr;
struct task_struct __rcu *curr;
struct task_struct *idle;
struct task_struct *stop;
unsigned long next_balance;
@@ -2479,3 +2479,16 @@ static inline void membarrier_switch_mm(struct rq *rq,
{
}
#endif
#ifdef CONFIG_SMP
static inline bool is_per_cpu_kthread(struct task_struct *p)
{
if (!(p->flags & PF_KTHREAD))
return false;
if (p->nr_cpus_allowed != 1)
return false;
return true;
}
#endif


@@ -413,27 +413,32 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
{
struct sigqueue *q = NULL;
struct user_struct *user;
int sigpending;
/*
* Protect access to @t credentials. This can go away when all
* callers hold rcu read lock.
*
* NOTE! A pending signal will hold on to the user refcount,
* and we get/put the refcount only when the sigpending count
* changes from/to zero.
*/
rcu_read_lock();
user = get_uid(__task_cred(t)->user);
atomic_inc(&user->sigpending);
user = __task_cred(t)->user;
sigpending = atomic_inc_return(&user->sigpending);
if (sigpending == 1)
get_uid(user);
rcu_read_unlock();
if (override_rlimit ||
atomic_read(&user->sigpending) <=
task_rlimit(t, RLIMIT_SIGPENDING)) {
if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
q = kmem_cache_alloc(sigqueue_cachep, flags);
} else {
print_dropped_signal(sig);
}
if (unlikely(q == NULL)) {
atomic_dec(&user->sigpending);
free_uid(user);
if (atomic_dec_and_test(&user->sigpending))
free_uid(user);
} else {
INIT_LIST_HEAD(&q->list);
q->flags = 0;
@@ -447,8 +452,8 @@ static void __sigqueue_free(struct sigqueue *q)
{
if (q->flags & SIGQUEUE_PREALLOC)
return;
atomic_dec(&q->user->sigpending);
free_uid(q->user);
if (atomic_dec_and_test(&q->user->sigpending))
free_uid(q->user);
kmem_cache_free(sigqueue_cachep, q);
}

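Note on the signal.c hunk above: instead of one get_uid()/free_uid() pair per queued signal, a single user_struct reference now covers the whole window in which user->sigpending is non-zero: atomic_inc_return() == 1 takes it, atomic_dec_and_test() drops it. The pattern in miniature (types hypothetical, accessors real):

struct demo_tracked {
        atomic_t pending;
        struct user_struct *owner;
};

static void demo_pending_inc(struct demo_tracked *t)
{
        /* The first pending item pins the owner... */
        if (atomic_inc_return(&t->pending) == 1)
                get_uid(t->owner);
}

static void demo_pending_dec(struct demo_tracked *t)
{
        /* ...and only the last one unpins it. */
        if (atomic_dec_and_test(&t->pending))
                free_uid(t->owner);
}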

@@ -805,15 +805,6 @@ static struct ctl_table kern_table[] = {
.extra2 = &maxolduid,
},
#ifdef CONFIG_S390
#ifdef CONFIG_MATHEMU
{
.procname = "ieee_emulation_warnings",
.data = &sysctl_ieee_emulation_warnings,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
#endif
{
.procname = "userprocess_debug",
.data = &show_unhandled_signals,


@@ -449,49 +449,6 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0,
}
EXPORT_SYMBOL(mktime64);
/**
* ns_to_timespec - Convert nanoseconds to timespec
* @nsec: the nanoseconds value to be converted
*
* Returns the timespec representation of the nsec parameter.
*/
struct timespec ns_to_timespec(const s64 nsec)
{
struct timespec ts;
s32 rem;
if (!nsec)
return (struct timespec) {0, 0};
ts.tv_sec = div_s64_rem(nsec, NSEC_PER_SEC, &rem);
if (unlikely(rem < 0)) {
ts.tv_sec--;
rem += NSEC_PER_SEC;
}
ts.tv_nsec = rem;
return ts;
}
EXPORT_SYMBOL(ns_to_timespec);
/**
* ns_to_timeval - Convert nanoseconds to timeval
* @nsec: the nanoseconds value to be converted
*
* Returns the timeval representation of the nsec parameter.
*/
struct timeval ns_to_timeval(const s64 nsec)
{
struct timespec ts = ns_to_timespec(nsec);
struct timeval tv;
tv.tv_sec = ts.tv_sec;
tv.tv_usec = (suseconds_t) ts.tv_nsec / 1000;
return tv;
}
EXPORT_SYMBOL(ns_to_timeval);
struct __kernel_old_timeval ns_to_kernel_old_timeval(const s64 nsec)
{
struct timespec64 ts = ns_to_timespec64(nsec);


@@ -143,8 +143,8 @@ if FTRACE
config BOOTTIME_TRACING
bool "Boot-time Tracing support"
depends on BOOT_CONFIG && TRACING
default y
depends on TRACING
select BOOT_CONFIG
help
Enable developer to setup ftrace subsystem via supplemental
kernel cmdline at boot time for debugging (tracing) driver


@@ -335,6 +335,7 @@ static void put_probe_ref(void)
static void blk_trace_cleanup(struct blk_trace *bt)
{
synchronize_rcu();
blk_trace_free(bt);
put_probe_ref();
}
@@ -629,8 +630,10 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
static int __blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
struct blk_trace *bt = q->blk_trace;
struct blk_trace *bt;
bt = rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->blk_trace_mutex));
if (bt == NULL)
return -EINVAL;
@@ -740,8 +743,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
void blk_trace_shutdown(struct request_queue *q)
{
mutex_lock(&q->blk_trace_mutex);
if (q->blk_trace) {
if (rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->blk_trace_mutex))) {
__blk_trace_startstop(q, 0);
__blk_trace_remove(q);
}
@@ -752,8 +755,10 @@ void blk_trace_shutdown(struct request_queue *q)
#ifdef CONFIG_BLK_CGROUP
static u64 blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio)
{
struct blk_trace *bt = q->blk_trace;
struct blk_trace *bt;
/* We don't use the 'bt' value here except as an optimization... */
bt = rcu_dereference_protected(q->blk_trace, 1);
if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP))
return 0;
@@ -796,10 +801,14 @@ blk_trace_request_get_cgid(struct request_queue *q, struct request *rq)
static void blk_add_trace_rq(struct request *rq, int error,
unsigned int nr_bytes, u32 what, u64 cgid)
{
struct blk_trace *bt = rq->q->blk_trace;
struct blk_trace *bt;
if (likely(!bt))
rcu_read_lock();
bt = rcu_dereference(rq->q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
}
if (blk_rq_is_passthrough(rq))
what |= BLK_TC_ACT(BLK_TC_PC);
@@ -808,6 +817,7 @@ static void blk_add_trace_rq(struct request *rq, int error,
__blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq),
rq->cmd_flags, what, error, 0, NULL, cgid);
rcu_read_unlock();
}
static void blk_add_trace_rq_insert(void *ignore,
@@ -853,14 +863,19 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq,
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
u32 what, int error)
{
struct blk_trace *bt = q->blk_trace;
struct blk_trace *bt;
if (likely(!bt))
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
}
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio_op(bio), bio->bi_opf, what, error, 0, NULL,
blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
}
static void blk_add_trace_bio_bounce(void *ignore,
@@ -905,11 +920,14 @@ static void blk_add_trace_getrq(void *ignore,
if (bio)
blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
else {
struct blk_trace *bt = q->blk_trace;
struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0,
NULL, 0);
rcu_read_unlock();
}
}
@@ -921,27 +939,35 @@ static void blk_add_trace_sleeprq(void *ignore,
if (bio)
blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
else {
struct blk_trace *bt = q->blk_trace;
struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ,
0, 0, NULL, 0);
rcu_read_unlock();
}
}
static void blk_add_trace_plug(void *ignore, struct request_queue *q)
{
struct blk_trace *bt = q->blk_trace;
struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, 0);
rcu_read_unlock();
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
unsigned int depth, bool explicit)
{
struct blk_trace *bt = q->blk_trace;
struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(depth);
u32 what;
@@ -953,14 +979,17 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
__blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, 0);
}
rcu_read_unlock();
}
static void blk_add_trace_split(void *ignore,
struct request_queue *q, struct bio *bio,
unsigned int pdu)
{
struct blk_trace *bt = q->blk_trace;
struct blk_trace *bt;
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(pdu);
@@ -969,6 +998,7 @@ static void blk_add_trace_split(void *ignore,
BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu),
&rpdu, blk_trace_bio_get_cgid(q, bio));
}
rcu_read_unlock();
}
/**
@@ -988,11 +1018,15 @@ static void blk_add_trace_bio_remap(void *ignore,
struct request_queue *q, struct bio *bio,
dev_t dev, sector_t from)
{
struct blk_trace *bt = q->blk_trace;
struct blk_trace *bt;
struct blk_io_trace_remap r;
if (likely(!bt))
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
}
r.device_from = cpu_to_be32(dev);
r.device_to = cpu_to_be32(bio_dev(bio));
@@ -1001,6 +1035,7 @@ static void blk_add_trace_bio_remap(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status,
sizeof(r), &r, blk_trace_bio_get_cgid(q, bio));
rcu_read_unlock();
}
/**
@@ -1021,11 +1056,15 @@ static void blk_add_trace_rq_remap(void *ignore,
struct request *rq, dev_t dev,
sector_t from)
{
struct blk_trace *bt = q->blk_trace;
struct blk_trace *bt;
struct blk_io_trace_remap r;
if (likely(!bt))
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
}
r.device_from = cpu_to_be32(dev);
r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
@@ -1034,6 +1073,7 @@ static void blk_add_trace_rq_remap(void *ignore,
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), 0, BLK_TA_REMAP, 0,
sizeof(r), &r, blk_trace_request_get_cgid(q, rq));
rcu_read_unlock();
}
/**
@@ -1051,14 +1091,19 @@ void blk_add_driver_data(struct request_queue *q,
struct request *rq,
void *data, size_t len)
{
struct blk_trace *bt = q->blk_trace;
struct blk_trace *bt;
if (likely(!bt))
rcu_read_lock();
bt = rcu_dereference(q->blk_trace);
if (likely(!bt)) {
rcu_read_unlock();
return;
}
__blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0,
BLK_TA_DRV_DATA, 0, len, data,
blk_trace_request_get_cgid(q, rq));
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1597,6 +1642,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
return -EINVAL;
put_probe_ref();
synchronize_rcu();
blk_trace_free(bt);
return 0;
}
@@ -1758,6 +1804,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct hd_struct *p = dev_to_part(dev);
struct request_queue *q;
struct block_device *bdev;
struct blk_trace *bt;
ssize_t ret = -ENXIO;
bdev = bdget(part_devt(p));
@@ -1770,21 +1817,23 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
mutex_lock(&q->blk_trace_mutex);
bt = rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->blk_trace_mutex));
if (attr == &dev_attr_enable) {
ret = sprintf(buf, "%u\n", !!q->blk_trace);
ret = sprintf(buf, "%u\n", !!bt);
goto out_unlock_bdev;
}
if (q->blk_trace == NULL)
if (bt == NULL)
ret = sprintf(buf, "disabled\n");
else if (attr == &dev_attr_act_mask)
ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
ret = blk_trace_mask2str(buf, bt->act_mask);
else if (attr == &dev_attr_pid)
ret = sprintf(buf, "%u\n", q->blk_trace->pid);
ret = sprintf(buf, "%u\n", bt->pid);
else if (attr == &dev_attr_start_lba)
ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
ret = sprintf(buf, "%llu\n", bt->start_lba);
else if (attr == &dev_attr_end_lba)
ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
ret = sprintf(buf, "%llu\n", bt->end_lba);
out_unlock_bdev:
mutex_unlock(&q->blk_trace_mutex);
@@ -1801,6 +1850,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
struct block_device *bdev;
struct request_queue *q;
struct hd_struct *p;
struct blk_trace *bt;
u64 value;
ssize_t ret = -EINVAL;
@@ -1831,8 +1881,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
mutex_lock(&q->blk_trace_mutex);
bt = rcu_dereference_protected(q->blk_trace,
lockdep_is_held(&q->blk_trace_mutex));
if (attr == &dev_attr_enable) {
if (!!value == !!q->blk_trace) {
if (!!value == !!bt) {
ret = 0;
goto out_unlock_bdev;
}
@@ -1844,18 +1896,18 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
}
ret = 0;
if (q->blk_trace == NULL)
if (bt == NULL)
ret = blk_trace_setup_queue(q, bdev);
if (ret == 0) {
if (attr == &dev_attr_act_mask)
q->blk_trace->act_mask = value;
bt->act_mask = value;
else if (attr == &dev_attr_pid)
q->blk_trace->pid = value;
bt->pid = value;
else if (attr == &dev_attr_start_lba)
q->blk_trace->start_lba = value;
bt->start_lba = value;
else if (attr == &dev_attr_end_lba)
q->blk_trace->end_lba = value;
bt->end_lba = value;
}
out_unlock_bdev:

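Note on the blktrace conversion above: every tracepoint hot path now reads q->blk_trace under rcu_read_lock()/rcu_dereference(), the sysfs/ioctl paths use rcu_dereference_protected(..., lockdep_is_held(&q->blk_trace_mutex)) because the mutex serializes writers, and synchronize_rcu() is inserted before blk_trace_free() so no reader can still hold the pointer when it is freed. The overall shape, as a hedged sketch with hypothetical demo types:

struct demo_queue {
        struct demo_trace __rcu *trace;
        struct mutex trace_mutex;
};

void demo_add_trace(struct demo_trace *t);  /* hypothetical consumer */

/* Hot path: RCU reader, may run concurrently with teardown. */
static void demo_emit(struct demo_queue *q)
{
        struct demo_trace *t;

        rcu_read_lock();
        t = rcu_dereference(q->trace);
        if (t)
                demo_add_trace(t);
        rcu_read_unlock();
}

/* Teardown: writers serialize on the mutex, no read lock needed. */
static void demo_teardown(struct demo_queue *q)
{
        struct demo_trace *t;

        mutex_lock(&q->trace_mutex);
        t = rcu_dereference_protected(q->trace,
                                      lockdep_is_held(&q->trace_mutex));
        RCU_INIT_POINTER(q->trace, NULL);
        mutex_unlock(&q->trace_mutex);

        synchronize_rcu();      /* wait out readers before freeing */
        kfree(t);
}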

@@ -111,11 +111,11 @@ static int __init test_gen_synth_cmd(void)
/* Create some bogus values just for testing */
vals[0] = 777; /* next_pid_field */
vals[1] = (u64)"hula hoops"; /* next_comm_field */
vals[1] = (u64)(long)"hula hoops"; /* next_comm_field */
vals[2] = 1000000; /* ts_ns */
vals[3] = 1000; /* ts_ms */
vals[4] = smp_processor_id(); /* cpu */
vals[5] = (u64)"thneed"; /* my_string_field */
vals[4] = raw_smp_processor_id(); /* cpu */
vals[5] = (u64)(long)"thneed"; /* my_string_field */
vals[6] = 598; /* my_int_field */
/* Now generate a gen_synth_test event */
@@ -218,11 +218,11 @@ static int __init test_empty_synth_event(void)
/* Create some bogus values just for testing */
vals[0] = 777; /* next_pid_field */
vals[1] = (u64)"tiddlywinks"; /* next_comm_field */
vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */
vals[2] = 1000000; /* ts_ns */
vals[3] = 1000; /* ts_ms */
vals[4] = smp_processor_id(); /* cpu */
vals[5] = (u64)"thneed_2.0"; /* my_string_field */
vals[4] = raw_smp_processor_id(); /* cpu */
vals[5] = (u64)(long)"thneed_2.0"; /* my_string_field */
vals[6] = 399; /* my_int_field */
/* Now trace an empty_synth_test event */
@@ -290,11 +290,11 @@ static int __init test_create_synth_event(void)
/* Create some bogus values just for testing */
vals[0] = 777; /* next_pid_field */
vals[1] = (u64)"tiddlywinks"; /* next_comm_field */
vals[1] = (u64)(long)"tiddlywinks"; /* next_comm_field */
vals[2] = 1000000; /* ts_ns */
vals[3] = 1000; /* ts_ms */
vals[4] = smp_processor_id(); /* cpu */
vals[5] = (u64)"thneed"; /* my_string_field */
vals[4] = raw_smp_processor_id(); /* cpu */
vals[5] = (u64)(long)"thneed"; /* my_string_field */
vals[6] = 398; /* my_int_field */
/* Now generate a create_synth_test event */
@@ -330,7 +330,7 @@ static int __init test_add_next_synth_val(void)
goto out;
/* next_comm_field */
ret = synth_event_add_next_val((u64)"slinky", &trace_state);
ret = synth_event_add_next_val((u64)(long)"slinky", &trace_state);
if (ret)
goto out;
@@ -345,12 +345,12 @@ static int __init test_add_next_synth_val(void)
goto out;
/* cpu */
ret = synth_event_add_next_val(smp_processor_id(), &trace_state);
ret = synth_event_add_next_val(raw_smp_processor_id(), &trace_state);
if (ret)
goto out;
/* my_string_field */
ret = synth_event_add_next_val((u64)"thneed_2.01", &trace_state);
ret = synth_event_add_next_val((u64)(long)"thneed_2.01", &trace_state);
if (ret)
goto out;
@@ -388,7 +388,7 @@ static int __init test_add_synth_val(void)
if (ret)
goto out;
ret = synth_event_add_val("cpu", smp_processor_id(), &trace_state);
ret = synth_event_add_val("cpu", raw_smp_processor_id(), &trace_state);
if (ret)
goto out;
@@ -396,12 +396,12 @@ static int __init test_add_synth_val(void)
if (ret)
goto out;
ret = synth_event_add_val("next_comm_field", (u64)"silly putty",
ret = synth_event_add_val("next_comm_field", (u64)(long)"silly putty",
&trace_state);
if (ret)
goto out;
ret = synth_event_add_val("my_string_field", (u64)"thneed_9",
ret = synth_event_add_val("my_string_field", (u64)(long)"thneed_9",
&trace_state);
if (ret)
goto out;
@@ -423,13 +423,13 @@ static int __init test_trace_synth_event(void)
/* Trace some bogus values just for testing */
ret = synth_event_trace(create_synth_test, 7, /* number of values */
444, /* next_pid_field */
(u64)"clackers", /* next_comm_field */
1000000, /* ts_ns */
1000, /* ts_ms */
smp_processor_id(), /* cpu */
(u64)"Thneed", /* my_string_field */
999); /* my_int_field */
(u64)444, /* next_pid_field */
(u64)(long)"clackers", /* next_comm_field */
(u64)1000000, /* ts_ns */
(u64)1000, /* ts_ms */
(u64)raw_smp_processor_id(), /* cpu */
(u64)(long)"Thneed", /* my_string_field */
(u64)999); /* my_int_field */
return ret;
}

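Note on the test-file hunks above: two substitutions recur. (u64)(long)"string" replaces (u64)"string" because on 32-bit builds a direct pointer-to-u64 cast warns about casting to an integer of a different size, so the pointer is widened through the pointer-sized long first; and raw_smp_processor_id() replaces smp_processor_id(), which would trigger the CONFIG_DEBUG_PREEMPT check when called from preemptible module-init context. In miniature:

/* Portable pointer -> u64 event value: go through long first so
 * 32-bit builds do not warn about a differently sized cast. */
u64 comm_val = (u64)(long)"hula hoops";

/* CPU id from preemptible context without tripping the
 * smp_processor_id() preemption-safety check. */
u64 cpu_val = (u64)raw_smp_processor_id();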

@@ -1837,6 +1837,7 @@ static __init int init_trace_selftests(void)
pr_info("Running postponed tracer tests:\n");
tracing_selftest_running = true;
list_for_each_entry_safe(p, n, &postponed_selftests, list) {
/* This loop can take minutes when sanitizers are enabled, so
* let's make sure we allow RCU processing.
@@ -1859,6 +1860,7 @@ static __init int init_trace_selftests(void)
list_del(&p->list);
kfree(p);
}
tracing_selftest_running = false;
out:
mutex_unlock(&trace_types_lock);


@@ -821,6 +821,29 @@ static const char *synth_field_fmt(char *type)
return fmt;
}
static void print_synth_event_num_val(struct trace_seq *s,
char *print_fmt, char *name,
int size, u64 val, char *space)
{
switch (size) {
case 1:
trace_seq_printf(s, print_fmt, name, (u8)val, space);
break;
case 2:
trace_seq_printf(s, print_fmt, name, (u16)val, space);
break;
case 4:
trace_seq_printf(s, print_fmt, name, (u32)val, space);
break;
default:
trace_seq_printf(s, print_fmt, name, val, space);
break;
}
}
static enum print_line_t print_synth_event(struct trace_iterator *iter,
int flags,
struct trace_event *event)
@@ -859,10 +882,13 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter,
} else {
struct trace_print_flags __flags[] = {
__def_gfpflag_names, {-1, NULL} };
char *space = (i == se->n_fields - 1 ? "" : " ");
trace_seq_printf(s, print_fmt, se->fields[i]->name,
entry->fields[n_u64],
i == se->n_fields - 1 ? "" : " ");
print_synth_event_num_val(s, print_fmt,
se->fields[i]->name,
se->fields[i]->size,
entry->fields[n_u64],
space);
if (strcmp(se->fields[i]->type, "gfp_t") == 0) {
trace_seq_puts(s, " (");
@@ -1798,6 +1824,62 @@ void synth_event_cmd_init(struct dynevent_cmd *cmd, char *buf, int maxlen)
}
EXPORT_SYMBOL_GPL(synth_event_cmd_init);
static inline int
__synth_event_trace_start(struct trace_event_file *file,
struct synth_event_trace_state *trace_state)
{
int entry_size, fields_size = 0;
int ret = 0;
memset(trace_state, '\0', sizeof(*trace_state));
/*
* Normal event tracing doesn't get called at all unless the
* ENABLED bit is set (which attaches the probe thus allowing
* this code to be called, etc). Because this is called
* directly by the user, we don't have that but we still need
* to honor not logging when disabled. For the iterated
* trace case, we save the enabled state upon start and just
* ignore the following data calls.
*/
if (!(file->flags & EVENT_FILE_FL_ENABLED) ||
trace_trigger_soft_disabled(file)) {
trace_state->disabled = true;
ret = -ENOENT;
goto out;
}
trace_state->event = file->event_call->data;
fields_size = trace_state->event->n_u64 * sizeof(u64);
/*
* Avoid ring buffer recursion detection, as this event
* is being performed within another event.
*/
trace_state->buffer = file->tr->array_buffer.buffer;
ring_buffer_nest_start(trace_state->buffer);
entry_size = sizeof(*trace_state->entry) + fields_size;
trace_state->entry = trace_event_buffer_reserve(&trace_state->fbuffer,
file,
entry_size);
if (!trace_state->entry) {
ring_buffer_nest_end(trace_state->buffer);
ret = -EINVAL;
}
out:
return ret;
}
static inline void
__synth_event_trace_end(struct synth_event_trace_state *trace_state)
{
trace_event_buffer_commit(&trace_state->fbuffer);
ring_buffer_nest_end(trace_state->buffer);
}
/**
* synth_event_trace - Trace a synthetic event
* @file: The trace_event_file representing the synthetic event
@@ -1819,71 +1901,61 @@ EXPORT_SYMBOL_GPL(synth_event_cmd_init);
*/
int synth_event_trace(struct trace_event_file *file, unsigned int n_vals, ...)
{
struct trace_event_buffer fbuffer;
struct synth_trace_event *entry;
struct trace_buffer *buffer;
struct synth_event *event;
struct synth_event_trace_state state;
unsigned int i, n_u64;
int fields_size = 0;
va_list args;
int ret = 0;
int ret;
/*
* Normal event generation doesn't get called at all unless
* the ENABLED bit is set (which attaches the probe thus
* allowing this code to be called, etc). Because this is
* called directly by the user, we don't have that but we
* still need to honor not logging when disabled.
*/
if (!(file->flags & EVENT_FILE_FL_ENABLED))
return 0;
ret = __synth_event_trace_start(file, &state);
if (ret) {
if (ret == -ENOENT)
ret = 0; /* just disabled, not really an error */
return ret;
}
event = file->event_call->data;
if (n_vals != event->n_fields)
return -EINVAL;
if (trace_trigger_soft_disabled(file))
return -EINVAL;
fields_size = event->n_u64 * sizeof(u64);
/*
* Avoid ring buffer recursion detection, as this event
* is being performed within another event.
*/
buffer = file->tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
entry = trace_event_buffer_reserve(&fbuffer, file,
sizeof(*entry) + fields_size);
if (!entry) {
if (n_vals != state.event->n_fields) {
ret = -EINVAL;
goto out;
}
va_start(args, n_vals);
for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
u64 val;
val = va_arg(args, u64);
if (event->fields[i]->is_string) {
if (state.event->fields[i]->is_string) {
char *str_val = (char *)(long)val;
char *str_field = (char *)&entry->fields[n_u64];
char *str_field = (char *)&state.entry->fields[n_u64];
strscpy(str_field, str_val, STR_VAR_LEN_MAX);
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
} else {
entry->fields[n_u64] = val;
struct synth_field *field = state.event->fields[i];
switch (field->size) {
case 1:
*(u8 *)&state.entry->fields[n_u64] = (u8)val;
break;
case 2:
*(u16 *)&state.entry->fields[n_u64] = (u16)val;
break;
case 4:
*(u32 *)&state.entry->fields[n_u64] = (u32)val;
break;
default:
state.entry->fields[n_u64] = val;
break;
}
n_u64++;
}
}
va_end(args);
trace_event_buffer_commit(&fbuffer);
out:
ring_buffer_nest_end(buffer);
__synth_event_trace_end(&state);
return ret;
}
@@ -1910,64 +1982,55 @@ EXPORT_SYMBOL_GPL(synth_event_trace);
int synth_event_trace_array(struct trace_event_file *file, u64 *vals,
unsigned int n_vals)
{
struct trace_event_buffer fbuffer;
struct synth_trace_event *entry;
struct trace_buffer *buffer;
struct synth_event *event;
struct synth_event_trace_state state;
unsigned int i, n_u64;
int fields_size = 0;
int ret = 0;
int ret;
/*
* Normal event generation doesn't get called at all unless
* the ENABLED bit is set (which attaches the probe thus
* allowing this code to be called, etc). Because this is
* called directly by the user, we don't have that but we
* still need to honor not logging when disabled.
*/
if (!(file->flags & EVENT_FILE_FL_ENABLED))
return 0;
ret = __synth_event_trace_start(file, &state);
if (ret) {
if (ret == -ENOENT)
ret = 0; /* just disabled, not really an error */
return ret;
}
event = file->event_call->data;
if (n_vals != event->n_fields)
return -EINVAL;
if (trace_trigger_soft_disabled(file))
return -EINVAL;
fields_size = event->n_u64 * sizeof(u64);
/*
* Avoid ring buffer recursion detection, as this event
* is being performed within another event.
*/
buffer = file->tr->array_buffer.buffer;
ring_buffer_nest_start(buffer);
entry = trace_event_buffer_reserve(&fbuffer, file,
sizeof(*entry) + fields_size);
if (!entry) {
if (n_vals != state.event->n_fields) {
ret = -EINVAL;
goto out;
}
for (i = 0, n_u64 = 0; i < event->n_fields; i++) {
if (event->fields[i]->is_string) {
for (i = 0, n_u64 = 0; i < state.event->n_fields; i++) {
if (state.event->fields[i]->is_string) {
char *str_val = (char *)(long)vals[i];
char *str_field = (char *)&entry->fields[n_u64];
char *str_field = (char *)&state.entry->fields[n_u64];
strscpy(str_field, str_val, STR_VAR_LEN_MAX);
n_u64 += STR_VAR_LEN_MAX / sizeof(u64);
} else {
entry->fields[n_u64] = vals[i];
struct synth_field *field = state.event->fields[i];
u64 val = vals[i];
switch (field->size) {
case 1:
*(u8 *)&state.entry->fields[n_u64] = (u8)val;
break;
case 2:
*(u16 *)&state.entry->fields[n_u64] = (u16)val;
break;
case 4:
*(u32 *)&state.entry->fields[n_u64] = (u32)val;
break;
default:
state.entry->fields[n_u64] = val;
break;
}
n_u64++;
}
}
trace_event_buffer_commit(&fbuffer);
out:
ring_buffer_nest_end(buffer);
__synth_event_trace_end(&state);
return ret;
}
@@ -2004,58 +2067,15 @@ EXPORT_SYMBOL_GPL(synth_event_trace_array);
int synth_event_trace_start(struct trace_event_file *file,
struct synth_event_trace_state *trace_state)
{
struct synth_trace_event *entry;
int fields_size = 0;
int ret = 0;
int ret;
if (!trace_state) {
ret = -EINVAL;
goto out;
}
if (!trace_state)
return -EINVAL;
memset(trace_state, '\0', sizeof(*trace_state));
ret = __synth_event_trace_start(file, trace_state);
if (ret == -ENOENT)
ret = 0; /* just disabled, not really an error */
/*
* Normal event tracing doesn't get called at all unless the
* ENABLED bit is set (which attaches the probe thus allowing
* this code to be called, etc). Because this is called
* directly by the user, we don't have that but we still need
* to honor not logging when disabled. For the iterated
* trace case, we save the enabled state upon start and just
* ignore the following data calls.
*/
if (!(file->flags & EVENT_FILE_FL_ENABLED)) {
trace_state->enabled = false;
goto out;
}
trace_state->enabled = true;
trace_state->event = file->event_call->data;
if (trace_trigger_soft_disabled(file)) {
ret = -EINVAL;
goto out;
}
fields_size = trace_state->event->n_u64 * sizeof(u64);
/*
* Avoid ring buffer recursion detection, as this event
* is being performed within another event.
*/
trace_state->buffer = file->tr->array_buffer.buffer;
ring_buffer_nest_start(trace_state->buffer);
entry = trace_event_buffer_reserve(&trace_state->fbuffer, file,
sizeof(*entry) + fields_size);
if (!entry) {
ret = -EINVAL;
goto out;
}
trace_state->entry = entry;
out:
return ret;
}
EXPORT_SYMBOL_GPL(synth_event_trace_start);
@@ -2088,7 +2108,7 @@ static int __synth_event_add_val(const char *field_name, u64 val,
trace_state->add_next = true;
}
if (!trace_state->enabled)
if (trace_state->disabled)
goto out;
event = trace_state->event;
@@ -2122,8 +2142,25 @@ static int __synth_event_add_val(const char *field_name, u64 val,
str_field = (char *)&entry->fields[field->offset];
strscpy(str_field, str_val, STR_VAR_LEN_MAX);
} else
entry->fields[field->offset] = val;
} else {
switch (field->size) {
case 1:
*(u8 *)&trace_state->entry->fields[field->offset] = (u8)val;
break;
case 2:
*(u16 *)&trace_state->entry->fields[field->offset] = (u16)val;
break;
case 4:
*(u32 *)&trace_state->entry->fields[field->offset] = (u32)val;
break;
default:
trace_state->entry->fields[field->offset] = val;
break;
}
}
out:
return ret;
}
@@ -2223,9 +2260,7 @@ int synth_event_trace_end(struct synth_event_trace_state *trace_state)
if (!trace_state)
return -EINVAL;
trace_event_buffer_commit(&trace_state->fbuffer);
ring_buffer_nest_end(trace_state->buffer);
__synth_event_trace_end(trace_state);
return 0;
}

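Note on the trace_events_synth.c rework above: the open-coded setup in synth_event_trace(), synth_event_trace_array() and synth_event_trace_start() is factored into __synth_event_trace_start()/__synth_event_trace_end(), a disabled event is reported as -ENOENT internally and mapped back to 0 for callers, and numeric values are now narrowed to each field's declared size. A hedged usage sketch of the piecewise API (field names come from the test file earlier in this commit; obtaining the event file pointer is assumed to happen elsewhere):

static int demo_emit_synth(struct trace_event_file *file)
{
        struct synth_event_trace_state state;
        int ret;

        ret = synth_event_trace_start(file, &state);
        if (ret)
                return ret;

        /* Each value is narrowed to the field's size (u8/u16/u32/u64). */
        ret = synth_event_add_val("cpu", raw_smp_processor_id(), &state);
        if (!ret)
                ret = synth_event_add_val("my_int_field", 42, &state);

        synth_event_trace_end(&state);
        return ret;
}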

@@ -1012,7 +1012,7 @@ int __kprobe_event_add_fields(struct dynevent_cmd *cmd, ...)
{
struct dynevent_arg arg;
va_list args;
int ret;
int ret = 0;
if (cmd->type != DYNEVENT_TYPE_KPROBE)
return -EINVAL;