soreuseport: setsockopt SO_ATTACH_REUSEPORT_[CE]BPF

Expose socket options for setting a classic or extended BPF program
for use when selecting sockets in an SO_REUSEPORT group.  These options
can be used on the first socket to belong to a group before bind or
on any socket in the group after bind.

This change includes refactoring of the existing sk_filter code to
allow reuse of the existing BPF filter validation checks.

Signed-off-by: Craig Gallek <kraig@google.com>
Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Craig Gallek
2016-01-04 17:41:47 -05:00
committed by David S. Miller
parent e32ea7e747
commit 538950a1b7
22 changed files with 300 additions and 62 deletions

View File

@@ -50,6 +50,7 @@
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/dst.h>
#include <net/sock_reuseport.h>
/**
* sk_filter - run a packet through a socket filter
@@ -1167,6 +1168,68 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
return 0;
}
static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
{
struct bpf_prog *old_prog;
int err;
if (bpf_prog_size(prog->len) > sysctl_optmem_max)
return -ENOMEM;
if (sk_unhashed(sk)) {
err = reuseport_alloc(sk);
if (err)
return err;
} else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
/* The socket wasn't bound with SO_REUSEPORT */
return -EINVAL;
}
old_prog = reuseport_attach_prog(sk, prog);
if (old_prog)
bpf_prog_destroy(old_prog);
return 0;
}
static
struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
unsigned int fsize = bpf_classic_proglen(fprog);
unsigned int bpf_fsize = bpf_prog_size(fprog->len);
struct bpf_prog *prog;
int err;
if (sock_flag(sk, SOCK_FILTER_LOCKED))
return ERR_PTR(-EPERM);
/* Make sure new filter is there and in the right amounts. */
if (fprog->filter == NULL)
return ERR_PTR(-EINVAL);
prog = bpf_prog_alloc(bpf_fsize, 0);
if (!prog)
return ERR_PTR(-ENOMEM);
if (copy_from_user(prog->insns, fprog->filter, fsize)) {
__bpf_prog_free(prog);
return ERR_PTR(-EFAULT);
}
prog->len = fprog->len;
err = bpf_prog_store_orig_filter(prog, fprog);
if (err) {
__bpf_prog_free(prog);
return ERR_PTR(-ENOMEM);
}
/* bpf_prepare_filter() already takes care of freeing
* memory in case something goes wrong.
*/
return bpf_prepare_filter(prog, NULL);
}
/**
* sk_attach_filter - attach a socket filter
* @fprog: the filter program
@@ -1179,39 +1242,9 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
*/
int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
unsigned int fsize = bpf_classic_proglen(fprog);
unsigned int bpf_fsize = bpf_prog_size(fprog->len);
struct bpf_prog *prog;
struct bpf_prog *prog = __get_filter(fprog, sk);
int err;
if (sock_flag(sk, SOCK_FILTER_LOCKED))
return -EPERM;
/* Make sure new filter is there and in the right amounts. */
if (fprog->filter == NULL)
return -EINVAL;
prog = bpf_prog_alloc(bpf_fsize, 0);
if (!prog)
return -ENOMEM;
if (copy_from_user(prog->insns, fprog->filter, fsize)) {
__bpf_prog_free(prog);
return -EFAULT;
}
prog->len = fprog->len;
err = bpf_prog_store_orig_filter(prog, fprog);
if (err) {
__bpf_prog_free(prog);
return -ENOMEM;
}
/* bpf_prepare_filter() already takes care of freeing
* memory in case something goes wrong.
*/
prog = bpf_prepare_filter(prog, NULL);
if (IS_ERR(prog))
return PTR_ERR(prog);
@@ -1225,23 +1258,50 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
}
EXPORT_SYMBOL_GPL(sk_attach_filter);
int sk_attach_bpf(u32 ufd, struct sock *sk)
int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
struct bpf_prog *prog;
struct bpf_prog *prog = __get_filter(fprog, sk);
int err;
if (sock_flag(sk, SOCK_FILTER_LOCKED))
return -EPERM;
prog = bpf_prog_get(ufd);
if (IS_ERR(prog))
return PTR_ERR(prog);
if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
bpf_prog_put(prog);
return -EINVAL;
err = __reuseport_attach_prog(prog, sk);
if (err < 0) {
__bpf_prog_release(prog);
return err;
}
return 0;
}
static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
{
struct bpf_prog *prog;
if (sock_flag(sk, SOCK_FILTER_LOCKED))
return ERR_PTR(-EPERM);
prog = bpf_prog_get(ufd);
if (IS_ERR(prog))
return prog;
if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
bpf_prog_put(prog);
return ERR_PTR(-EINVAL);
}
return prog;
}
int sk_attach_bpf(u32 ufd, struct sock *sk)
{
struct bpf_prog *prog = __get_bpf(ufd, sk);
int err;
if (IS_ERR(prog))
return PTR_ERR(prog);
err = __sk_attach_prog(prog, sk);
if (err < 0) {
bpf_prog_put(prog);
@@ -1251,6 +1311,23 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
return 0;
}
int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
{
struct bpf_prog *prog = __get_bpf(ufd, sk);
int err;
if (IS_ERR(prog))
return PTR_ERR(prog);
err = __reuseport_attach_prog(prog, sk);
if (err < 0) {
bpf_prog_put(prog);
return err;
}
return 0;
}
#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1)
#define BPF_LDST_LEN 16U

View File

@@ -134,6 +134,7 @@
#include <linux/sock_diag.h>
#include <linux/filter.h>
#include <net/sock_reuseport.h>
#include <trace/events/sock.h>
@@ -932,6 +933,32 @@ set_rcvbuf:
}
break;
case SO_ATTACH_REUSEPORT_CBPF:
ret = -EINVAL;
if (optlen == sizeof(struct sock_fprog)) {
struct sock_fprog fprog;
ret = -EFAULT;
if (copy_from_user(&fprog, optval, sizeof(fprog)))
break;
ret = sk_reuseport_attach_filter(&fprog, sk);
}
break;
case SO_ATTACH_REUSEPORT_EBPF:
ret = -EINVAL;
if (optlen == sizeof(u32)) {
u32 ufd;
ret = -EFAULT;
if (copy_from_user(&ufd, optval, sizeof(ufd)))
break;
ret = sk_reuseport_attach_bpf(ufd, sk);
}
break;
case SO_DETACH_FILTER:
ret = sk_detach_filter(sk);
break;
@@ -1443,6 +1470,8 @@ void sk_destruct(struct sock *sk)
sk_filter_uncharge(sk, filter);
RCU_INIT_POINTER(sk->sk_filter, NULL);
}
if (rcu_access_pointer(sk->sk_reuseport_cb))
reuseport_detach_sock(sk);
sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);

View File

@@ -1,10 +1,12 @@
/*
* To speed up listener socket lookup, create an array to store all sockets
* listening on the same port. This allows a decision to be made after finding
* the first socket.
* the first socket. An optional BPF program can also be configured for
* selecting the socket index from the array of available sockets.
*/
#include <net/sock_reuseport.h>
#include <linux/bpf.h>
#include <linux/rcupdate.h>
#define INIT_SOCKS 128
@@ -22,6 +24,7 @@ static struct sock_reuseport *__reuseport_alloc(u16 max_socks)
reuse->max_socks = max_socks;
RCU_INIT_POINTER(reuse->prog, NULL);
return reuse;
}
@@ -67,6 +70,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
more_reuse->max_socks = more_socks_size;
more_reuse->num_socks = reuse->num_socks;
more_reuse->prog = reuse->prog;
memcpy(more_reuse->socks, reuse->socks,
reuse->num_socks * sizeof(struct sock *));
@@ -75,6 +79,10 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
more_reuse);
/* Note: we use kfree_rcu here instead of reuseport_free_rcu so
* that reuse and more_reuse can temporarily share a reference
* to prog.
*/
kfree_rcu(reuse, rcu);
return more_reuse;
}
@@ -116,6 +124,16 @@ int reuseport_add_sock(struct sock *sk, const struct sock *sk2)
}
EXPORT_SYMBOL(reuseport_add_sock);
static void reuseport_free_rcu(struct rcu_head *head)
{
struct sock_reuseport *reuse;
reuse = container_of(head, struct sock_reuseport, rcu);
if (reuse->prog)
bpf_prog_destroy(reuse->prog);
kfree(reuse);
}
void reuseport_detach_sock(struct sock *sk)
{
struct sock_reuseport *reuse;
@@ -131,7 +149,7 @@ void reuseport_detach_sock(struct sock *sk)
reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
reuse->num_socks--;
if (reuse->num_socks == 0)
kfree_rcu(reuse, rcu);
call_rcu(&reuse->rcu, reuseport_free_rcu);
break;
}
}
@@ -139,15 +157,53 @@ void reuseport_detach_sock(struct sock *sk)
}
EXPORT_SYMBOL(reuseport_detach_sock);
static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
struct bpf_prog *prog, struct sk_buff *skb,
int hdr_len)
{
struct sk_buff *nskb = NULL;
u32 index;
if (skb_shared(skb)) {
nskb = skb_clone(skb, GFP_ATOMIC);
if (!nskb)
return NULL;
skb = nskb;
}
/* temporarily advance data past protocol header */
if (!pskb_pull(skb, hdr_len)) {
consume_skb(nskb);
return NULL;
}
index = bpf_prog_run_save_cb(prog, skb);
__skb_push(skb, hdr_len);
consume_skb(nskb);
if (index >= socks)
return NULL;
return reuse->socks[index];
}
/**
* reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
* @sk: First socket in the group.
* @hash: Use this hash to select.
* @hash: When no BPF filter is available, use this hash to select.
* @skb: skb to run through BPF filter.
* @hdr_len: BPF filter expects skb data pointer at payload data. If
* the skb does not yet point at the payload, this parameter represents
* how far the pointer needs to advance to reach the payload.
* Returns a socket that should receive the packet (or NULL on error).
*/
struct sock *reuseport_select_sock(struct sock *sk, u32 hash)
struct sock *reuseport_select_sock(struct sock *sk,
u32 hash,
struct sk_buff *skb,
int hdr_len)
{
struct sock_reuseport *reuse;
struct bpf_prog *prog;
struct sock *sk2 = NULL;
u16 socks;
@@ -158,12 +214,16 @@ struct sock *reuseport_select_sock(struct sock *sk, u32 hash)
if (!reuse)
goto out;
prog = rcu_dereference(reuse->prog);
socks = READ_ONCE(reuse->num_socks);
if (likely(socks)) {
/* paired with smp_wmb() in reuseport_add_sock() */
smp_rmb();
sk2 = reuse->socks[reciprocal_scale(hash, socks)];
if (prog && skb)
sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
else
sk2 = reuse->socks[reciprocal_scale(hash, socks)];
}
out:
@@ -171,3 +231,21 @@ out:
return sk2;
}
EXPORT_SYMBOL(reuseport_select_sock);
struct bpf_prog *
reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
{
struct sock_reuseport *reuse;
struct bpf_prog *old_prog;
spin_lock_bh(&reuseport_lock);
reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
lockdep_is_held(&reuseport_lock));
old_prog = rcu_dereference_protected(reuse->prog,
lockdep_is_held(&reuseport_lock));
rcu_assign_pointer(reuse->prog, prog);
spin_unlock_bh(&reuseport_lock);
return old_prog;
}
EXPORT_SYMBOL(reuseport_attach_prog);