bpf: introduce bpf_spin_lock
Introduce 'struct bpf_spin_lock' and bpf_spin_lock/unlock() helpers to let bpf program serialize access to other variables. Example: struct hash_elem { int cnt; struct bpf_spin_lock lock; }; struct hash_elem * val = bpf_map_lookup_elem(&hash_map, &key); if (val) { bpf_spin_lock(&val->lock); val->cnt++; bpf_spin_unlock(&val->lock); } Restrictions and safety checks: - bpf_spin_lock is only allowed inside HASH and ARRAY maps. - BTF description of the map is mandatory for safety analysis. - bpf program can take one bpf_spin_lock at a time, since two or more can cause dead locks. - only one 'struct bpf_spin_lock' is allowed per map element. It drastically simplifies implementation yet allows bpf program to use any number of bpf_spin_locks. - when bpf_spin_lock is taken the calls (either bpf2bpf or helpers) are not allowed. - bpf program must bpf_spin_unlock() before return. - bpf program can access 'struct bpf_spin_lock' only via bpf_spin_lock()/bpf_spin_unlock() helpers. - load/store into 'struct bpf_spin_lock lock;' field is not allowed. - to use bpf_spin_lock() helper the BTF description of map value must be a struct and have 'struct bpf_spin_lock anyname;' field at the top level. Nested lock inside another struct is not allowed. - syscall map_lookup doesn't copy bpf_spin_lock field to user space. - syscall map_update and program map_update do not update bpf_spin_lock field. - bpf_spin_lock cannot be on the stack or inside networking packet. bpf_spin_lock can only be inside HASH or ARRAY map value. - bpf_spin_lock is available to root only and to all program types. - bpf_spin_lock is not allowed in inner maps of map-in-map. - ld_abs is not allowed inside spin_lock-ed region. - tracing progs and socket filter progs cannot use bpf_spin_lock due to insufficient preemption checks Implementation details: - cgroup-bpf class of programs can nest with xdp/tc programs. Hence bpf_spin_lock is equivalent to spin_lock_irqsave. Other solutions to avoid nested bpf_spin_lock are possible. Like making sure that all networking progs run with softirq disabled. spin_lock_irqsave is the simplest and doesn't add overhead to the programs that don't use it. - arch_spinlock_t is used when its implemented as queued_spin_lock - archs can force their own arch_spinlock_t - on architectures where queued_spin_lock is not available and sizeof(arch_spinlock_t) != sizeof(__u32) trivial lock is used. - presence of bpf_spin_lock inside map value could have been indicated via extra flag during map_create, but specifying it via BTF is cleaner. It provides introspection for map key/value and reduces user mistakes. Next steps: - allow bpf_spin_lock in other map types (like cgroup local storage) - introduce BPF_F_LOCK flag for bpf_map_update() syscall and helper to request kernel to grab bpf_spin_lock before rewriting the value. That will serialize access to map elements. Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org> Signed-off-by: Alexei Starovoitov <ast@kernel.org> Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
This commit is contained in:

committed by
Daniel Borkmann

parent
1832f4ef58
commit
d83525ca62
@@ -72,14 +72,15 @@ struct bpf_map {
|
||||
u32 value_size;
|
||||
u32 max_entries;
|
||||
u32 map_flags;
|
||||
u32 pages;
|
||||
int spin_lock_off; /* >=0 valid offset, <0 error */
|
||||
u32 id;
|
||||
int numa_node;
|
||||
u32 btf_key_type_id;
|
||||
u32 btf_value_type_id;
|
||||
struct btf *btf;
|
||||
u32 pages;
|
||||
bool unpriv_array;
|
||||
/* 55 bytes hole */
|
||||
/* 51 bytes hole */
|
||||
|
||||
/* The 3rd and 4th cacheline with misc members to avoid false sharing
|
||||
* particularly with refcounting.
|
||||
@@ -91,6 +92,34 @@ struct bpf_map {
|
||||
char name[BPF_OBJ_NAME_LEN];
|
||||
};
|
||||
|
||||
static inline bool map_value_has_spin_lock(const struct bpf_map *map)
|
||||
{
|
||||
return map->spin_lock_off >= 0;
|
||||
}
|
||||
|
||||
static inline void check_and_init_map_lock(struct bpf_map *map, void *dst)
|
||||
{
|
||||
if (likely(!map_value_has_spin_lock(map)))
|
||||
return;
|
||||
*(struct bpf_spin_lock *)(dst + map->spin_lock_off) =
|
||||
(struct bpf_spin_lock){};
|
||||
}
|
||||
|
||||
/* copy everything but bpf_spin_lock */
|
||||
static inline void copy_map_value(struct bpf_map *map, void *dst, void *src)
|
||||
{
|
||||
if (unlikely(map_value_has_spin_lock(map))) {
|
||||
u32 off = map->spin_lock_off;
|
||||
|
||||
memcpy(dst, src, off);
|
||||
memcpy(dst + off + sizeof(struct bpf_spin_lock),
|
||||
src + off + sizeof(struct bpf_spin_lock),
|
||||
map->value_size - off - sizeof(struct bpf_spin_lock));
|
||||
} else {
|
||||
memcpy(dst, src, map->value_size);
|
||||
}
|
||||
}
|
||||
|
||||
struct bpf_offload_dev;
|
||||
struct bpf_offloaded_map;
|
||||
|
||||
@@ -162,6 +191,7 @@ enum bpf_arg_type {
|
||||
ARG_PTR_TO_CTX, /* pointer to context */
|
||||
ARG_ANYTHING, /* any (initialized) argument is ok */
|
||||
ARG_PTR_TO_SOCKET, /* pointer to bpf_sock */
|
||||
ARG_PTR_TO_SPIN_LOCK, /* pointer to bpf_spin_lock */
|
||||
};
|
||||
|
||||
/* type of values returned from helper functions */
|
||||
@@ -879,7 +909,8 @@ extern const struct bpf_func_proto bpf_msg_redirect_hash_proto;
|
||||
extern const struct bpf_func_proto bpf_msg_redirect_map_proto;
|
||||
extern const struct bpf_func_proto bpf_sk_redirect_hash_proto;
|
||||
extern const struct bpf_func_proto bpf_sk_redirect_map_proto;
|
||||
|
||||
extern const struct bpf_func_proto bpf_spin_lock_proto;
|
||||
extern const struct bpf_func_proto bpf_spin_unlock_proto;
|
||||
extern const struct bpf_func_proto bpf_get_local_storage_proto;
|
||||
|
||||
/* Shared helpers among cBPF and eBPF. */
|
||||
|
@@ -148,6 +148,7 @@ struct bpf_verifier_state {
|
||||
/* call stack tracking */
|
||||
struct bpf_func_state *frame[MAX_CALL_FRAMES];
|
||||
u32 curframe;
|
||||
u32 active_spin_lock;
|
||||
bool speculative;
|
||||
};
|
||||
|
||||
|
@@ -50,6 +50,7 @@ u32 btf_id(const struct btf *btf);
|
||||
bool btf_member_is_reg_int(const struct btf *btf, const struct btf_type *s,
|
||||
const struct btf_member *m,
|
||||
u32 expected_offset, u32 expected_size);
|
||||
int btf_find_spin_lock(const struct btf *btf, const struct btf_type *t);
|
||||
|
||||
#ifdef CONFIG_BPF_SYSCALL
|
||||
const struct btf_type *btf_type_by_id(const struct btf *btf, u32 type_id);
|
||||
|
Reference in New Issue
Block a user