Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2019-05-31

The following pull-request contains BPF updates for your *net-next* tree.

Lots of exciting new features in the first PR of this developement cycle!
The main changes are:

1) misc verifier improvements, from Alexei.

2) bpftool can now convert btf to valid C, from Andrii.

3) verifier can insert explicit ZEXT insn when requested by 32-bit JITs.
   This feature greatly improves BPF speed on 32-bit architectures. From Jiong.

4) cgroups will now auto-detach bpf programs. This fixes issue of thousands
   bpf programs got stuck in dying cgroups. From Roman.

5) new bpf_send_signal() helper, from Yonghong.

6) cgroup inet skb programs can signal CN to the stack, from Lawrence.

7) miscellaneous cleanups, from many developers.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller
2019-05-31 21:21:18 -07:00
122 changed files with 6433 additions and 1016 deletions

View File

@@ -6,6 +6,7 @@
#include <linux/errno.h>
#include <linux/jump_label.h>
#include <linux/percpu.h>
#include <linux/percpu-refcount.h>
#include <linux/rbtree.h>
#include <uapi/linux/bpf.h>
@@ -71,11 +72,17 @@ struct cgroup_bpf {
u32 flags[MAX_BPF_ATTACH_TYPE];
/* temp storage for effective prog array used by prog_attach/detach */
struct bpf_prog_array __rcu *inactive;
struct bpf_prog_array *inactive;
/* reference counter used to detach bpf programs after cgroup removal */
struct percpu_ref refcnt;
/* cgroup_bpf is released using a work queue */
struct work_struct release_work;
};
void cgroup_bpf_put(struct cgroup *cgrp);
int cgroup_bpf_inherit(struct cgroup *cgrp);
void cgroup_bpf_offline(struct cgroup *cgrp);
int __cgroup_bpf_attach(struct cgroup *cgrp, struct bpf_prog *prog,
enum bpf_attach_type type, u32 flags);
@@ -283,8 +290,8 @@ int cgroup_bpf_prog_query(const union bpf_attr *attr,
struct bpf_prog;
struct cgroup_bpf {};
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
static inline int cgroup_bpf_inherit(struct cgroup *cgrp) { return 0; }
static inline void cgroup_bpf_offline(struct cgroup *cgrp) {}
static inline int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype,

View File

@@ -66,6 +66,11 @@ struct bpf_map_ops {
u64 imm, u32 *off);
};
struct bpf_map_memory {
u32 pages;
struct user_struct *user;
};
struct bpf_map {
/* The first two cachelines with read-mostly members of which some
* are also accessed in fast-path (e.g. ops, max_entries).
@@ -86,7 +91,7 @@ struct bpf_map {
u32 btf_key_type_id;
u32 btf_value_type_id;
struct btf *btf;
u32 pages;
struct bpf_map_memory memory;
bool unpriv_array;
bool frozen; /* write-once */
/* 48 bytes hole */
@@ -94,8 +99,7 @@ struct bpf_map {
/* The 3rd and 4th cacheline with misc members to avoid false sharing
* particularly with refcounting.
*/
struct user_struct *user ____cacheline_aligned;
atomic_t refcnt;
atomic_t refcnt ____cacheline_aligned;
atomic_t usercnt;
struct work_struct work;
char name[BPF_OBJ_NAME_LEN];
@@ -370,6 +374,7 @@ struct bpf_prog_aux {
u32 id;
u32 func_cnt; /* used by non-func prog as the number of func progs */
u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */
bool verifier_zext; /* Zero extensions has been inserted by verifier. */
bool offload_requested;
struct bpf_prog **func;
void *jit_data; /* JIT specific data. arch dependent */
@@ -513,17 +518,17 @@ struct bpf_prog_array {
};
struct bpf_prog_array *bpf_prog_array_alloc(u32 prog_cnt, gfp_t flags);
void bpf_prog_array_free(struct bpf_prog_array __rcu *progs);
int bpf_prog_array_length(struct bpf_prog_array __rcu *progs);
int bpf_prog_array_copy_to_user(struct bpf_prog_array __rcu *progs,
void bpf_prog_array_free(struct bpf_prog_array *progs);
int bpf_prog_array_length(struct bpf_prog_array *progs);
int bpf_prog_array_copy_to_user(struct bpf_prog_array *progs,
__u32 __user *prog_ids, u32 cnt);
void bpf_prog_array_delete_safe(struct bpf_prog_array __rcu *progs,
void bpf_prog_array_delete_safe(struct bpf_prog_array *progs,
struct bpf_prog *old_prog);
int bpf_prog_array_copy_info(struct bpf_prog_array __rcu *array,
int bpf_prog_array_copy_info(struct bpf_prog_array *array,
u32 *prog_ids, u32 request_cnt,
u32 *prog_cnt);
int bpf_prog_array_copy(struct bpf_prog_array __rcu *old_array,
int bpf_prog_array_copy(struct bpf_prog_array *old_array,
struct bpf_prog *exclude_prog,
struct bpf_prog *include_prog,
struct bpf_prog_array **new_array);
@@ -551,6 +556,56 @@ _out: \
_ret; \
})
/* To be used by __cgroup_bpf_run_filter_skb for EGRESS BPF progs
* so BPF programs can request cwr for TCP packets.
*
* Current cgroup skb programs can only return 0 or 1 (0 to drop the
* packet. This macro changes the behavior so the low order bit
* indicates whether the packet should be dropped (0) or not (1)
* and the next bit is a congestion notification bit. This could be
* used by TCP to call tcp_enter_cwr()
*
* Hence, new allowed return values of CGROUP EGRESS BPF programs are:
* 0: drop packet
* 1: keep packet
* 2: drop packet and cn
* 3: keep packet and cn
*
* This macro then converts it to one of the NET_XMIT or an error
* code that is then interpreted as drop packet (and no cn):
* 0: NET_XMIT_SUCCESS skb should be transmitted
* 1: NET_XMIT_DROP skb should be dropped and cn
* 2: NET_XMIT_CN skb should be transmitted and cn
* 3: -EPERM skb should be dropped
*/
#define BPF_PROG_CGROUP_INET_EGRESS_RUN_ARRAY(array, ctx, func) \
({ \
struct bpf_prog_array_item *_item; \
struct bpf_prog *_prog; \
struct bpf_prog_array *_array; \
u32 ret; \
u32 _ret = 1; \
u32 _cn = 0; \
preempt_disable(); \
rcu_read_lock(); \
_array = rcu_dereference(array); \
_item = &_array->items[0]; \
while ((_prog = READ_ONCE(_item->prog))) { \
bpf_cgroup_storage_set(_item->cgroup_storage); \
ret = func(_prog, ctx); \
_ret &= (ret & 1); \
_cn |= (ret & 2); \
_item++; \
} \
rcu_read_unlock(); \
preempt_enable(); \
if (_ret) \
_ret = (_cn ? NET_XMIT_CN : NET_XMIT_SUCCESS); \
else \
_ret = (_cn ? NET_XMIT_DROP : -EPERM); \
_ret; \
})
#define BPF_PROG_RUN_ARRAY(array, ctx, func) \
__BPF_PROG_RUN_ARRAY(array, ctx, func, false)
@@ -595,9 +650,12 @@ struct bpf_map *__bpf_map_get(struct fd f);
struct bpf_map * __must_check bpf_map_inc(struct bpf_map *map, bool uref);
void bpf_map_put_with_uref(struct bpf_map *map);
void bpf_map_put(struct bpf_map *map);
int bpf_map_precharge_memlock(u32 pages);
int bpf_map_charge_memlock(struct bpf_map *map, u32 pages);
void bpf_map_uncharge_memlock(struct bpf_map *map, u32 pages);
int bpf_map_charge_init(struct bpf_map_memory *mem, size_t size);
void bpf_map_charge_finish(struct bpf_map_memory *mem);
void bpf_map_charge_move(struct bpf_map_memory *dst,
struct bpf_map_memory *src);
void *bpf_map_area_alloc(size_t size, int numa_node);
void bpf_map_area_free(void *base);
void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr);

View File

@@ -36,9 +36,11 @@
*/
enum bpf_reg_liveness {
REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
REG_LIVE_DONE = 4, /* liveness won't be updating this register anymore */
REG_LIVE_READ32 = 0x1, /* reg was read, so we're sensitive to initial value */
REG_LIVE_READ64 = 0x2, /* likewise, but full 64-bit content matters */
REG_LIVE_READ = REG_LIVE_READ32 | REG_LIVE_READ64,
REG_LIVE_WRITTEN = 0x4, /* reg was written first, screening off later reads */
REG_LIVE_DONE = 0x8, /* liveness won't be updating this register anymore */
};
struct bpf_reg_state {
@@ -131,6 +133,11 @@ struct bpf_reg_state {
* pointing to bpf_func_state.
*/
u32 frameno;
/* Tracks subreg definition. The stored value is the insn_idx of the
* writing insn. This is safe because subreg_def is used before any insn
* patching which only happens after main verification finished.
*/
s32 subreg_def;
enum bpf_reg_liveness live;
};
@@ -187,6 +194,7 @@ struct bpf_func_state {
struct bpf_verifier_state {
/* call stack tracking */
struct bpf_func_state *frame[MAX_CALL_FRAMES];
u32 insn_idx;
u32 curframe;
u32 active_spin_lock;
bool speculative;
@@ -232,7 +240,9 @@ struct bpf_insn_aux_data {
int ctx_field_size; /* the ctx field size for load insn, maybe 0 */
int sanitize_stack_off; /* stack slot to be cleared */
bool seen; /* this insn was processed by the verifier */
bool zext_dst; /* this insn zero extends dst reg */
u8 alu_state; /* used in combination with alu_limit */
bool prune_point;
unsigned int orig_idx; /* original instruction index */
};

View File

@@ -924,4 +924,22 @@ static inline bool cgroup_task_frozen(struct task_struct *task)
#endif /* !CONFIG_CGROUPS */
#ifdef CONFIG_CGROUP_BPF
static inline void cgroup_bpf_get(struct cgroup *cgrp)
{
percpu_ref_get(&cgrp->bpf.refcnt);
}
static inline void cgroup_bpf_put(struct cgroup *cgrp)
{
percpu_ref_put(&cgrp->bpf.refcnt);
}
#else /* CONFIG_CGROUP_BPF */
static inline void cgroup_bpf_get(struct cgroup *cgrp) {}
static inline void cgroup_bpf_put(struct cgroup *cgrp) {}
#endif /* CONFIG_CGROUP_BPF */
#endif /* _LINUX_CGROUP_H */

View File

@@ -160,6 +160,20 @@ struct ctl_table_header;
.off = 0, \
.imm = IMM })
/* Special form of mov32, used for doing explicit zero extension on dst. */
#define BPF_ZEXT_REG(DST) \
((struct bpf_insn) { \
.code = BPF_ALU | BPF_MOV | BPF_X, \
.dst_reg = DST, \
.src_reg = DST, \
.off = 0, \
.imm = 1 })
static inline bool insn_is_zext(const struct bpf_insn *insn)
{
return insn->code == (BPF_ALU | BPF_MOV | BPF_X) && insn->imm == 1;
}
/* BPF_LD_IMM64 macro encodes single 'load 64-bit immediate' insn */
#define BPF_LD_IMM64(DST, IMM) \
BPF_LD_IMM64_RAW(DST, 0, IMM)
@@ -512,7 +526,8 @@ struct bpf_prog {
blinded:1, /* Was blinded */
is_func:1, /* program is a bpf function */
kprobe_override:1, /* Do we override a kprobe? */
has_callchain_buf:1; /* callchain buffer allocated? */
has_callchain_buf:1, /* callchain buffer allocated? */
enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
enum bpf_prog_type type; /* Type of BPF program */
enum bpf_attach_type expected_attach_type; /* For some prog types */
u32 len; /* Number of filter blocks */
@@ -811,6 +826,7 @@ u64 __bpf_call_base(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog);
void bpf_jit_compile(struct bpf_prog *prog);
bool bpf_jit_needs_zext(void);
bool bpf_helper_changes_pkt_data(void *func);
static inline bool bpf_dump_raw_ok(void)