Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2019-02-16

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) numerous libbpf API improvements, from Andrii, Andrey, Yonghong.

2) test all bpf progs in alu32 mode, from Jiong.

3) skb->sk access and bpf_sk_fullsock(), bpf_tcp_sock() helpers, from Martin.

4) support for IP encap in lwt bpf progs, from Peter.

5) remove XDP_QUERY_XSK_UMEM dead code, from Jan.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller
2019-02-16 22:56:34 -08:00
92 changed files with 2,574 additions and 483 deletions

파일 보기

@@ -403,7 +403,7 @@ config LWTUNNEL
config LWTUNNEL_BPF
bool "Execute BPF program as route nexthop action"
depends on LWTUNNEL
depends on LWTUNNEL && INET
default y if LWTUNNEL=y
---help---
Allows to run BPF programs as a nexthop action following a route

파일 보기

@@ -73,6 +73,7 @@
#include <linux/seg6_local.h>
#include <net/seg6.h>
#include <net/seg6_local.h>
#include <net/lwtunnel.h>
/**
* sk_filter_trim_cap - run a packet through a socket filter
@@ -1793,6 +1794,20 @@ static const struct bpf_func_proto bpf_skb_pull_data_proto = {
.arg2_type = ARG_ANYTHING,
};
/* bpf_sk_fullsock() - BPF helper: resolve a sock-common pointer to a full
 * socket when possible.
 *
 * sk_to_full_sk() steps from a request/timewait sock to its associated
 * full (e.g. listener) socket; sk_fullsock() then confirms the result.
 * Returns the socket pointer on success, NULL otherwise.  BPF helpers
 * return u64, hence the unsigned long casts.
 */
BPF_CALL_1(bpf_sk_fullsock, struct sock *, sk)
{
sk = sk_to_full_sk(sk);
return sk_fullsock(sk) ? (unsigned long)sk : (unsigned long)NULL;
}
/* Verifier contract: one sock-common argument, may return NULL. */
static const struct bpf_func_proto bpf_sk_fullsock_proto = {
.func = bpf_sk_fullsock,
.gpl_only = false,
.ret_type = RET_PTR_TO_SOCKET_OR_NULL,
.arg1_type = ARG_PTR_TO_SOCK_COMMON,
};
static inline int sk_skb_try_make_writable(struct sk_buff *skb,
unsigned int write_len)
{
@@ -4803,7 +4818,15 @@ static int bpf_push_seg6_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len
}
#endif /* CONFIG_IPV6_SEG6_BPF */
BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
/* Thin wrapper over the lwtunnel IP-encap implementation; @ingress selects
 * the LWT input vs. xmit handling (checksum/headroom differ — see
 * bpf_lwt_push_ip_encap()).
 */
static int bpf_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len,
bool ingress)
{
return bpf_lwt_push_ip_encap(skb, hdr, len, ingress);
}
#endif
BPF_CALL_4(bpf_lwt_in_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
u32, len)
{
switch (type) {
@@ -4811,14 +4834,41 @@ BPF_CALL_4(bpf_lwt_push_encap, struct sk_buff *, skb, u32, type, void *, hdr,
case BPF_LWT_ENCAP_SEG6:
case BPF_LWT_ENCAP_SEG6_INLINE:
return bpf_push_seg6_encap(skb, type, hdr, len);
#endif
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
case BPF_LWT_ENCAP_IP:
return bpf_push_ip_encap(skb, hdr, len, true /* ingress */);
#endif
default:
return -EINVAL;
}
}
static const struct bpf_func_proto bpf_lwt_push_encap_proto = {
.func = bpf_lwt_push_encap,
/* bpf_lwt_push_encap() flavour used from the LWT xmit hook: only
 * BPF_LWT_ENCAP_IP is accepted here (seg6 encap is handled by the
 * input-side helper); any other type is rejected with -EINVAL.
 */
BPF_CALL_4(bpf_lwt_xmit_push_encap, struct sk_buff *, skb, u32, type,
void *, hdr, u32, len)
{
switch (type) {
#if IS_ENABLED(CONFIG_LWTUNNEL_BPF)
case BPF_LWT_ENCAP_IP:
return bpf_push_ip_encap(skb, hdr, len, false /* egress */);
#endif
default:
return -EINVAL;
}
}
/* Verifier contract for the input-side encap helper: ctx + encap type +
 * a (pointer, const-size) pair describing the header to push.
 */
static const struct bpf_func_proto bpf_lwt_in_push_encap_proto = {
.func = bpf_lwt_in_push_encap,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_PTR_TO_MEM,
.arg4_type = ARG_CONST_SIZE
};
static const struct bpf_func_proto bpf_lwt_xmit_push_encap_proto = {
.func = bpf_lwt_xmit_push_encap,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
@@ -5018,6 +5068,54 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
};
#endif /* CONFIG_IPV6_SEG6_BPF */
/* Switch on si->off for the tcp_sock-backed fields shared by struct
 * bpf_tcp_sock and struct bpf_sock_ops (md_type), invoking CONVERT()
 * for each matching field.  Kept as a macro so one field list serves
 * both contexts (bpf_tcp_sock_convert_ctx_access() and
 * sock_ops_convert_ctx_access()); unmatched offsets fall through so the
 * caller can handle context-specific fields afterwards.
 */
#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT) \
do { \
switch (si->off) { \
case offsetof(md_type, snd_cwnd): \
CONVERT(snd_cwnd); break; \
case offsetof(md_type, srtt_us): \
CONVERT(srtt_us); break; \
case offsetof(md_type, snd_ssthresh): \
CONVERT(snd_ssthresh); break; \
case offsetof(md_type, rcv_nxt): \
CONVERT(rcv_nxt); break; \
case offsetof(md_type, snd_nxt): \
CONVERT(snd_nxt); break; \
case offsetof(md_type, snd_una): \
CONVERT(snd_una); break; \
case offsetof(md_type, mss_cache): \
CONVERT(mss_cache); break; \
case offsetof(md_type, ecn_flags): \
CONVERT(ecn_flags); break; \
case offsetof(md_type, rate_delivered): \
CONVERT(rate_delivered); break; \
case offsetof(md_type, rate_interval_us): \
CONVERT(rate_interval_us); break; \
case offsetof(md_type, packets_out): \
CONVERT(packets_out); break; \
case offsetof(md_type, retrans_out): \
CONVERT(retrans_out); break; \
case offsetof(md_type, total_retrans): \
CONVERT(total_retrans); break; \
case offsetof(md_type, segs_in): \
CONVERT(segs_in); break; \
case offsetof(md_type, data_segs_in): \
CONVERT(data_segs_in); break; \
case offsetof(md_type, segs_out): \
CONVERT(segs_out); break; \
case offsetof(md_type, data_segs_out): \
CONVERT(data_segs_out); break; \
case offsetof(md_type, lost_out): \
CONVERT(lost_out); break; \
case offsetof(md_type, sacked_out): \
CONVERT(sacked_out); break; \
case offsetof(md_type, bytes_received): \
CONVERT(bytes_received); break; \
case offsetof(md_type, bytes_acked): \
CONVERT(bytes_acked); break; \
} \
} while (0)
#ifdef CONFIG_INET
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
int dif, int sdif, u8 family, u8 proto)
@@ -5255,6 +5353,79 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
.arg5_type = ARG_ANYTHING,
};
/* Validate a BPF program's access to struct bpf_tcp_sock: the offset must
 * be within the struct and naturally aligned to the access size; the two
 * 64-bit counters must be read with 8-byte loads, every other field with
 * 4-byte loads.
 */
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked))
return false;
if (off % size != 0)
return false;
switch (off) {
case offsetof(struct bpf_tcp_sock, bytes_received):
case offsetof(struct bpf_tcp_sock, bytes_acked):
return size == sizeof(__u64);
default:
return size == sizeof(__u32);
}
}
/* Rewrite a bpf_tcp_sock field access into a direct load from the kernel's
 * struct tcp_sock.  Common fields go through the shared
 * CONVERT_COMMON_TCP_SOCK_FIELDS() table; rtt_min needs special handling
 * because it is a struct minmax in tcp_sock — the current sample's value
 * (minmax_sample.v) is loaded as a 32-bit word.  Returns the number of
 * instructions emitted into insn_buf.
 */
u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
const struct bpf_insn *si,
struct bpf_insn *insn_buf,
struct bpf_prog *prog, u32 *target_size)
{
struct bpf_insn *insn = insn_buf;
/* Emit one load from tcp_sock; the BUILD_BUG_ON guards that the UAPI
 * field is at least as wide as the kernel field it mirrors.
 */
#define BPF_TCP_SOCK_GET_COMMON(FIELD) \
do { \
BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, FIELD) > \
FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct tcp_sock, FIELD),\
si->dst_reg, si->src_reg, \
offsetof(struct tcp_sock, FIELD)); \
} while (0)
CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock,
BPF_TCP_SOCK_GET_COMMON);
/* A common field matched: done. */
if (insn > insn_buf)
return insn - insn_buf;
switch (si->off) {
case offsetof(struct bpf_tcp_sock, rtt_min):
BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) !=
sizeof(struct minmax));
BUILD_BUG_ON(sizeof(struct minmax) <
sizeof(struct minmax_sample));
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct tcp_sock, rtt_min) +
offsetof(struct minmax_sample, v));
break;
}
return insn - insn_buf;
}
/* bpf_tcp_sock() - BPF helper: return a bpf_tcp_sock-compatible pointer
 * for @sk, or NULL if @sk cannot be resolved to a full TCP socket.
 */
BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
{
sk = sk_to_full_sk(sk);
if (sk_fullsock(sk) && sk->sk_protocol == IPPROTO_TCP)
return (unsigned long)sk;
return (unsigned long)NULL;
}
/* Verifier contract: one sock-common argument, may return NULL. */
static const struct bpf_func_proto bpf_tcp_sock_proto = {
.func = bpf_tcp_sock,
.gpl_only = false,
.ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL,
.arg1_type = ARG_PTR_TO_SOCK_COMMON,
};
#endif /* CONFIG_INET */
bool bpf_helper_changes_pkt_data(void *func)
@@ -5284,7 +5455,8 @@ bool bpf_helper_changes_pkt_data(void *func)
func == bpf_lwt_seg6_adjust_srh ||
func == bpf_lwt_seg6_action ||
#endif
func == bpf_lwt_push_encap)
func == bpf_lwt_in_push_encap ||
func == bpf_lwt_xmit_push_encap)
return true;
return false;
@@ -5408,6 +5580,12 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
switch (func_id) {
case BPF_FUNC_get_local_storage:
return &bpf_get_local_storage_proto;
case BPF_FUNC_sk_fullsock:
return &bpf_sk_fullsock_proto;
#ifdef CONFIG_INET
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif
default:
return sk_filter_func_proto(func_id, prog);
}
@@ -5479,6 +5657,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_socket_uid_proto;
case BPF_FUNC_fib_lookup:
return &bpf_skb_fib_lookup_proto;
case BPF_FUNC_sk_fullsock:
return &bpf_sk_fullsock_proto;
#ifdef CONFIG_XFRM
case BPF_FUNC_skb_get_xfrm_state:
return &bpf_skb_get_xfrm_state_proto;
@@ -5496,6 +5676,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_lookup_udp_proto;
case BPF_FUNC_sk_release:
return &bpf_sk_release_proto;
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
#endif
default:
return bpf_base_func_proto(func_id);
@@ -5672,7 +5854,7 @@ lwt_in_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
switch (func_id) {
case BPF_FUNC_lwt_push_encap:
return &bpf_lwt_push_encap_proto;
return &bpf_lwt_in_push_encap_proto;
default:
return lwt_out_func_proto(func_id, prog);
}
@@ -5708,6 +5890,8 @@ lwt_xmit_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_l4_csum_replace_proto;
case BPF_FUNC_set_hash_invalid:
return &bpf_set_hash_invalid_proto;
case BPF_FUNC_lwt_push_encap:
return &bpf_lwt_xmit_push_encap_proto;
default:
return lwt_out_func_proto(func_id, prog);
}
@@ -5766,6 +5950,11 @@ static bool bpf_skb_is_valid_access(int off, int size, enum bpf_access_type type
if (size != sizeof(__u64))
return false;
break;
case offsetof(struct __sk_buff, sk):
if (type == BPF_WRITE || size != sizeof(__u64))
return false;
info->reg_type = PTR_TO_SOCK_COMMON_OR_NULL;
break;
default:
/* Only narrow read access allowed for now. */
if (type == BPF_WRITE) {
@@ -5937,31 +6126,44 @@ full_access:
return true;
}
static bool __sock_filter_check_size(int off, int size,
bool bpf_sock_common_is_valid_access(int off, int size,
enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
const int size_default = sizeof(__u32);
switch (off) {
case bpf_ctx_range(struct bpf_sock, src_ip4):
case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
case bpf_ctx_range_till(struct bpf_sock, type, priority):
return false;
default:
return bpf_sock_is_valid_access(off, size, type, info);
}
return size == size_default;
}
bool bpf_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
const int size_default = sizeof(__u32);
if (off < 0 || off >= sizeof(struct bpf_sock))
return false;
if (off % size != 0)
return false;
if (!__sock_filter_check_size(off, size, info))
return false;
return true;
switch (off) {
case offsetof(struct bpf_sock, state):
case offsetof(struct bpf_sock, family):
case offsetof(struct bpf_sock, type):
case offsetof(struct bpf_sock, protocol):
case offsetof(struct bpf_sock, dst_port):
case offsetof(struct bpf_sock, src_port):
case bpf_ctx_range(struct bpf_sock, src_ip4):
case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
case bpf_ctx_range(struct bpf_sock, dst_ip4):
case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
bpf_ctx_record_field_size(info, size_default);
return bpf_ctx_narrow_access_ok(off, size, size_default);
}
return size == size_default;
}
static bool sock_filter_is_valid_access(int off, int size,
@@ -6750,6 +6952,13 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type,
off += offsetof(struct qdisc_skb_cb, pkt_len);
*target_size = 4;
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, off);
break;
case offsetof(struct __sk_buff, sk):
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, sk),
si->dst_reg, si->src_reg,
offsetof(struct sk_buff, sk));
break;
}
return insn - insn_buf;
@@ -6798,24 +7007,32 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
break;
case offsetof(struct bpf_sock, family):
BUILD_BUG_ON(FIELD_SIZEOF(struct sock, sk_family) != 2);
*insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->src_reg,
offsetof(struct sock, sk_family));
*insn++ = BPF_LDX_MEM(
BPF_FIELD_SIZEOF(struct sock_common, skc_family),
si->dst_reg, si->src_reg,
bpf_target_off(struct sock_common,
skc_family,
FIELD_SIZEOF(struct sock_common,
skc_family),
target_size));
break;
case offsetof(struct bpf_sock, type):
BUILD_BUG_ON(HWEIGHT32(SK_FL_TYPE_MASK) != BITS_PER_BYTE * 2);
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sock, __sk_flags_offset));
*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_TYPE_MASK);
*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_TYPE_SHIFT);
*target_size = 2;
break;
case offsetof(struct bpf_sock, protocol):
BUILD_BUG_ON(HWEIGHT32(SK_FL_PROTO_MASK) != BITS_PER_BYTE);
*insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg,
offsetof(struct sock, __sk_flags_offset));
*insn++ = BPF_ALU32_IMM(BPF_AND, si->dst_reg, SK_FL_PROTO_MASK);
*insn++ = BPF_ALU32_IMM(BPF_RSH, si->dst_reg, SK_FL_PROTO_SHIFT);
*target_size = 1;
break;
case offsetof(struct bpf_sock, src_ip4):
@@ -6827,6 +7044,15 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
target_size));
break;
case offsetof(struct bpf_sock, dst_ip4):
*insn++ = BPF_LDX_MEM(
BPF_SIZE(si->code), si->dst_reg, si->src_reg,
bpf_target_off(struct sock_common, skc_daddr,
FIELD_SIZEOF(struct sock_common,
skc_daddr),
target_size));
break;
case bpf_ctx_range_till(struct bpf_sock, src_ip6[0], src_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
off = si->off;
@@ -6845,6 +7071,23 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
#endif
break;
case bpf_ctx_range_till(struct bpf_sock, dst_ip6[0], dst_ip6[3]):
#if IS_ENABLED(CONFIG_IPV6)
off = si->off;
off -= offsetof(struct bpf_sock, dst_ip6[0]);
*insn++ = BPF_LDX_MEM(
BPF_SIZE(si->code), si->dst_reg, si->src_reg,
bpf_target_off(struct sock_common,
skc_v6_daddr.s6_addr32[0],
FIELD_SIZEOF(struct sock_common,
skc_v6_daddr.s6_addr32[0]),
target_size) + off);
#else
*insn++ = BPF_MOV32_IMM(si->dst_reg, 0);
*target_size = 4;
#endif
break;
case offsetof(struct bpf_sock, src_port):
*insn++ = BPF_LDX_MEM(
BPF_FIELD_SIZEOF(struct sock_common, skc_num),
@@ -6854,6 +7097,26 @@ u32 bpf_sock_convert_ctx_access(enum bpf_access_type type,
skc_num),
target_size));
break;
case offsetof(struct bpf_sock, dst_port):
*insn++ = BPF_LDX_MEM(
BPF_FIELD_SIZEOF(struct sock_common, skc_dport),
si->dst_reg, si->src_reg,
bpf_target_off(struct sock_common, skc_dport,
FIELD_SIZEOF(struct sock_common,
skc_dport),
target_size));
break;
case offsetof(struct bpf_sock, state):
*insn++ = BPF_LDX_MEM(
BPF_FIELD_SIZEOF(struct sock_common, skc_state),
si->dst_reg, si->src_reg,
bpf_target_off(struct sock_common, skc_state,
FIELD_SIZEOF(struct sock_common,
skc_state),
target_size));
break;
}
return insn - insn_buf;
@@ -7101,6 +7364,85 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
struct bpf_insn *insn = insn_buf;
int off;
/* Helper macro for adding read access to tcp_sock or sock fields. */
#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
do { \
BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \
FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
is_fullsock), \
si->dst_reg, si->src_reg, \
offsetof(struct bpf_sock_ops_kern, \
is_fullsock)); \
*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, sk),\
si->dst_reg, si->src_reg, \
offsetof(struct bpf_sock_ops_kern, sk));\
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \
OBJ_FIELD), \
si->dst_reg, si->dst_reg, \
offsetof(OBJ, OBJ_FIELD)); \
} while (0)
#define SOCK_OPS_GET_TCP_SOCK_FIELD(FIELD) \
SOCK_OPS_GET_FIELD(FIELD, FIELD, struct tcp_sock)
/* Helper macro for adding write access to tcp_sock or sock fields.
* The macro is called with two registers, dst_reg which contains a pointer
* to ctx (context) and src_reg which contains the value that should be
* stored. However, we need an additional register since we cannot overwrite
* dst_reg because it may be used later in the program.
* Instead we "borrow" one of the other register. We first save its value
* into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
* it at the end of the macro.
*/
#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
do { \
int reg = BPF_REG_9; \
BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \
FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \
if (si->dst_reg == reg || si->src_reg == reg) \
reg--; \
if (si->dst_reg == reg || si->src_reg == reg) \
reg--; \
*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \
offsetof(struct bpf_sock_ops_kern, \
temp)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
is_fullsock), \
reg, si->dst_reg, \
offsetof(struct bpf_sock_ops_kern, \
is_fullsock)); \
*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, sk),\
reg, si->dst_reg, \
offsetof(struct bpf_sock_ops_kern, sk));\
*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \
reg, si->src_reg, \
offsetof(OBJ, OBJ_FIELD)); \
*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \
offsetof(struct bpf_sock_ops_kern, \
temp)); \
} while (0)
#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \
do { \
if (TYPE == BPF_WRITE) \
SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
else \
SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
} while (0)
CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_sock_ops,
SOCK_OPS_GET_TCP_SOCK_FIELD);
if (insn > insn_buf)
return insn - insn_buf;
switch (si->off) {
case offsetof(struct bpf_sock_ops, op) ...
offsetof(struct bpf_sock_ops, replylong[3]):
@@ -7258,175 +7600,15 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
FIELD_SIZEOF(struct minmax_sample, t));
break;
/* Helper macro for adding read access to tcp_sock or sock fields. */
#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
do { \
BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \
FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
is_fullsock), \
si->dst_reg, si->src_reg, \
offsetof(struct bpf_sock_ops_kern, \
is_fullsock)); \
*insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, sk),\
si->dst_reg, si->src_reg, \
offsetof(struct bpf_sock_ops_kern, sk));\
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \
OBJ_FIELD), \
si->dst_reg, si->dst_reg, \
offsetof(OBJ, OBJ_FIELD)); \
} while (0)
/* Helper macro for adding write access to tcp_sock or sock fields.
* The macro is called with two registers, dst_reg which contains a pointer
* to ctx (context) and src_reg which contains the value that should be
* stored. However, we need an additional register since we cannot overwrite
* dst_reg because it may be used later in the program.
* Instead we "borrow" one of the other register. We first save its value
* into a new (temp) field in bpf_sock_ops_kern, use it, and then restore
* it at the end of the macro.
*/
#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \
do { \
int reg = BPF_REG_9; \
BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \
FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \
if (si->dst_reg == reg || si->src_reg == reg) \
reg--; \
if (si->dst_reg == reg || si->src_reg == reg) \
reg--; \
*insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \
offsetof(struct bpf_sock_ops_kern, \
temp)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, \
is_fullsock), \
reg, si->dst_reg, \
offsetof(struct bpf_sock_ops_kern, \
is_fullsock)); \
*insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
struct bpf_sock_ops_kern, sk),\
reg, si->dst_reg, \
offsetof(struct bpf_sock_ops_kern, sk));\
*insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \
reg, si->src_reg, \
offsetof(OBJ, OBJ_FIELD)); \
*insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \
offsetof(struct bpf_sock_ops_kern, \
temp)); \
} while (0)
#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \
do { \
if (TYPE == BPF_WRITE) \
SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
else \
SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
} while (0)
case offsetof(struct bpf_sock_ops, snd_cwnd):
SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, srtt_us):
SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags):
SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags,
struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, snd_ssthresh):
SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, rcv_nxt):
SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, snd_nxt):
SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, snd_una):
SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, mss_cache):
SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, ecn_flags):
SOCK_OPS_GET_FIELD(ecn_flags, ecn_flags, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, rate_delivered):
SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered,
struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, rate_interval_us):
SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us,
struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, packets_out):
SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, retrans_out):
SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, total_retrans):
SOCK_OPS_GET_FIELD(total_retrans, total_retrans,
struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, segs_in):
SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, data_segs_in):
SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, segs_out):
SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, data_segs_out):
SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out,
struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, lost_out):
SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, sacked_out):
SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, sk_txhash):
SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
struct sock, type);
break;
case offsetof(struct bpf_sock_ops, bytes_received):
SOCK_OPS_GET_FIELD(bytes_received, bytes_received,
struct tcp_sock);
break;
case offsetof(struct bpf_sock_ops, bytes_acked):
SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock);
break;
}
return insn - insn_buf;
}

파일 보기

@@ -16,6 +16,8 @@
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>
#include <net/gre.h>
#include <net/ip6_route.h>
struct bpf_lwt_prog {
struct bpf_prog *prog;
@@ -55,6 +57,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
switch (ret) {
case BPF_OK:
case BPF_LWT_REROUTE:
break;
case BPF_REDIRECT:
@@ -87,6 +90,30 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
return ret;
}
/* Re-run the input route lookup after a BPF program returned
 * BPF_LWT_REROUTE (i.e. it may have rewritten the packet headers), then
 * hand the packet back to dst_input().  Frees the skb and returns a
 * negative errno on failure; non-IP protocols get -EAFNOSUPPORT.
 */
static int bpf_lwt_input_reroute(struct sk_buff *skb)
{
int err = -EINVAL;
if (skb->protocol == htons(ETH_P_IP)) {
struct iphdr *iph = ip_hdr(skb);
err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
iph->tos, skb_dst(skb)->dev);
} else if (skb->protocol == htons(ETH_P_IPV6)) {
/* Indirect via ipv6_stub so this works with IPv6 modular or off. */
err = ipv6_stub->ipv6_route_input(skb);
} else {
err = -EAFNOSUPPORT;
}
if (err)
goto err;
return dst_input(skb);
err:
kfree_skb(skb);
return err;
}
static int bpf_input(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
@@ -98,11 +125,11 @@ static int bpf_input(struct sk_buff *skb)
ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
if (ret < 0)
return ret;
if (ret == BPF_LWT_REROUTE)
return bpf_lwt_input_reroute(skb);
}
if (unlikely(!dst->lwtstate->orig_input)) {
pr_warn_once("orig_input not set on dst for prog %s\n",
bpf->out.name);
kfree_skb(skb);
return -EINVAL;
}
@@ -147,6 +174,102 @@ static int xmit_check_hhlen(struct sk_buff *skb)
return 0;
}
/* Re-route a packet on the xmit path after a BPF program returned
 * BPF_LWT_REROUTE: perform a fresh output route lookup keyed on the
 * (possibly rewritten) IPv4/IPv6 headers, attach the new dst and push the
 * packet through dst_output().  Returns LWTUNNEL_XMIT_DONE on success;
 * on any failure the skb is freed and a negative errno is returned.
 */
static int bpf_lwt_xmit_reroute(struct sk_buff *skb)
{
/* Honor an L3 master device (VRF) if the current dst has one. */
struct net_device *l3mdev = l3mdev_master_dev_rcu(skb_dst(skb)->dev);
int oif = l3mdev ? l3mdev->ifindex : 0;
struct dst_entry *dst = NULL;
int err = -EAFNOSUPPORT;
struct sock *sk;
struct net *net;
bool ipv4;
if (skb->protocol == htons(ETH_P_IP))
ipv4 = true;
else if (skb->protocol == htons(ETH_P_IPV6))
ipv4 = false;
else
goto err;
/* Prefer the socket's binding and netns when one is attached. */
sk = sk_to_full_sk(skb->sk);
if (sk) {
if (sk->sk_bound_dev_if)
oif = sk->sk_bound_dev_if;
net = sock_net(sk);
} else {
net = dev_net(skb_dst(skb)->dev);
}
if (ipv4) {
struct iphdr *iph = ip_hdr(skb);
struct flowi4 fl4 = {};
struct rtable *rt;
fl4.flowi4_oif = oif;
fl4.flowi4_mark = skb->mark;
fl4.flowi4_uid = sock_net_uid(net, sk);
fl4.flowi4_tos = RT_TOS(iph->tos);
/* Source address came from the pushed header, not a local one. */
fl4.flowi4_flags = FLOWI_FLAG_ANYSRC;
fl4.flowi4_proto = iph->protocol;
fl4.daddr = iph->daddr;
fl4.saddr = iph->saddr;
rt = ip_route_output_key(net, &fl4);
if (IS_ERR(rt)) {
err = PTR_ERR(rt);
goto err;
}
dst = &rt->dst;
} else {
struct ipv6hdr *iph6 = ipv6_hdr(skb);
struct flowi6 fl6 = {};
fl6.flowi6_oif = oif;
fl6.flowi6_mark = skb->mark;
fl6.flowi6_uid = sock_net_uid(net, sk);
fl6.flowlabel = ip6_flowinfo(iph6);
fl6.flowi6_proto = iph6->nexthdr;
fl6.daddr = iph6->daddr;
fl6.saddr = iph6->saddr;
err = ipv6_stub->ipv6_dst_lookup(net, skb->sk, &dst, &fl6);
if (unlikely(err))
goto err;
if (IS_ERR(dst)) {
err = PTR_ERR(dst);
goto err;
}
}
if (unlikely(dst->error)) {
err = dst->error;
dst_release(dst);
goto err;
}
/* Although skb header was reserved in bpf_lwt_push_ip_encap(), it
 * was done for the previous dst, so we are doing it here again, in
 * case the new dst needs much more space. The call below is a noop
 * if there is enough header space in skb.
 */
err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev));
if (unlikely(err))
goto err;
skb_dst_drop(skb);
skb_dst_set(skb, dst);
err = dst_output(dev_net(skb_dst(skb)->dev), skb->sk, skb);
if (unlikely(err))
goto err;
/* ip[6]_finish_output2 understand LWTUNNEL_XMIT_DONE */
return LWTUNNEL_XMIT_DONE;
err:
kfree_skb(skb);
return err;
}
static int bpf_xmit(struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
@@ -154,11 +277,20 @@ static int bpf_xmit(struct sk_buff *skb)
bpf = bpf_lwt_lwtunnel(dst->lwtstate);
if (bpf->xmit.prog) {
__be16 proto = skb->protocol;
int ret;
ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
switch (ret) {
case BPF_OK:
/* If the header changed, e.g. via bpf_lwt_push_encap,
* BPF_LWT_REROUTE below should have been used if the
* protocol was also changed.
*/
if (skb->protocol != proto) {
kfree_skb(skb);
return -EINVAL;
}
/* If the header was expanded, headroom might be too
* small for L2 header to come, expand as needed.
*/
@@ -169,6 +301,8 @@ static int bpf_xmit(struct sk_buff *skb)
return LWTUNNEL_XMIT_CONTINUE;
case BPF_REDIRECT:
return LWTUNNEL_XMIT_DONE;
case BPF_LWT_REROUTE:
return bpf_lwt_xmit_reroute(skb);
default:
return ret;
}
@@ -390,6 +524,133 @@ static const struct lwtunnel_encap_ops bpf_encap_ops = {
.owner = THIS_MODULE,
};
/* Mark a GSO skb after @encap_len bytes of encap were pushed: add the
 * tunnel @gso_type plus SKB_GSO_DODGY (forces revalidation by the stack),
 * shrink gso_size by the encap length and zero gso_segs so it is
 * recomputed.  Always returns 0.
 */
static int handle_gso_type(struct sk_buff *skb, unsigned int gso_type,
int encap_len)
{
struct skb_shared_info *shinfo = skb_shinfo(skb);
gso_type |= SKB_GSO_DODGY;
shinfo->gso_type |= gso_type;
skb_decrease_gso_size(shinfo, encap_len);
shinfo->gso_segs = 0;
return 0;
}
/* Fix up GSO metadata after an IP encap header of @encap_len bytes was
 * pushed in front of a GSO packet.  Only TCP GSO is supported; the inner
 * protocol right after the pushed outer IP header decides which tunnel
 * GSO flag to set (GRE, UDP tunnel, or IP-in-IP), with CSUM variants
 * chosen when the tunnel header carries a checksum.  Returns 0 on
 * success, -ENOTSUPP/-EPROTONOSUPPORT/-EINVAL otherwise.
 */
static int handle_gso_encap(struct sk_buff *skb, bool ipv4, int encap_len)
{
int next_hdr_offset;
void *next_hdr;
__u8 protocol;
/* SCTP and UDP_L4 gso need more nuanced handling than what
 * handle_gso_type() does above: skb_decrease_gso_size() is not enough.
 * So at the moment only TCP GSO packets are let through.
 */
if (!(skb_shinfo(skb)->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
return -ENOTSUPP;
if (ipv4) {
protocol = ip_hdr(skb)->protocol;
next_hdr_offset = sizeof(struct iphdr);
next_hdr = skb_network_header(skb) + next_hdr_offset;
} else {
protocol = ipv6_hdr(skb)->nexthdr;
next_hdr_offset = sizeof(struct ipv6hdr);
next_hdr = skb_network_header(skb) + next_hdr_offset;
}
switch (protocol) {
case IPPROTO_GRE:
/* The full tunnel header must fit inside the pushed encap. */
next_hdr_offset += sizeof(struct gre_base_hdr);
if (next_hdr_offset > encap_len)
return -EINVAL;
if (((struct gre_base_hdr *)next_hdr)->flags & GRE_CSUM)
return handle_gso_type(skb, SKB_GSO_GRE_CSUM,
encap_len);
return handle_gso_type(skb, SKB_GSO_GRE, encap_len);
case IPPROTO_UDP:
next_hdr_offset += sizeof(struct udphdr);
if (next_hdr_offset > encap_len)
return -EINVAL;
if (((struct udphdr *)next_hdr)->check)
return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL_CSUM,
encap_len);
return handle_gso_type(skb, SKB_GSO_UDP_TUNNEL, encap_len);
case IPPROTO_IP:
case IPPROTO_IPV6:
if (ipv4)
return handle_gso_type(skb, SKB_GSO_IPXIP4, encap_len);
else
return handle_gso_type(skb, SKB_GSO_IPXIP6, encap_len);
default:
return -EPROTONOSUPPORT;
}
}
/* Push an IPv4/IPv6 encap header (@hdr, @len bytes) in front of the
 * packet for the lwt-bpf BPF_LWT_ENCAP_IP case.  The header is validated
 * (version field, minimal length), headroom is grown as needed (ingress
 * reserves mac_len, egress reserves L2 space for the current dst), the
 * header is copied in, skb metadata (protocol, csum, hash, data pointers)
 * is fixed up, and GSO metadata is adjusted for GSO packets.  Returns 0
 * on success or a negative errno; the skb is NOT freed on error.
 */
int bpf_lwt_push_ip_encap(struct sk_buff *skb, void *hdr, u32 len, bool ingress)
{
struct iphdr *iph;
bool ipv4;
int err;
if (unlikely(len < sizeof(struct iphdr) || len > LWT_BPF_MAX_HEADROOM))
return -EINVAL;
/* validate protocol and length */
iph = (struct iphdr *)hdr;
if (iph->version == 4) {
ipv4 = true;
if (unlikely(len < iph->ihl * 4))
return -EINVAL;
} else if (iph->version == 6) {
ipv4 = false;
if (unlikely(len < sizeof(struct ipv6hdr)))
return -EINVAL;
} else {
return -EINVAL;
}
if (ingress)
err = skb_cow_head(skb, len + skb->mac_len);
else
err = skb_cow_head(skb,
len + LL_RESERVED_SPACE(skb_dst(skb)->dev));
if (unlikely(err))
return err;
/* push the encap headers and fix pointers */
skb_reset_inner_headers(skb);
skb->encapsulation = 1;
skb_push(skb, len);
if (ingress)
skb_postpush_rcsum(skb, iph, len);
skb_reset_network_header(skb);
memcpy(skb_network_header(skb), hdr, len);
bpf_compute_data_pointers(skb);
skb_clear_hash(skb);
if (ipv4) {
skb->protocol = htons(ETH_P_IP);
iph = ip_hdr(skb);
/* Fill in the IPv4 header checksum if the caller left it zero. */
if (!iph->check)
iph->check = ip_fast_csum((unsigned char *)iph,
iph->ihl);
} else {
skb->protocol = htons(ETH_P_IPV6);
}
if (skb_is_gso(skb))
return handle_gso_encap(skb, ipv4, len);
return 0;
}
static int __init bpf_lwt_init(void)
{
return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);

파일 보기

@@ -134,6 +134,11 @@ static int eafnosupport_ipv6_dst_lookup(struct net *net, struct sock *u1,
return -EAFNOSUPPORT;
}
/* ipv6_stub fallback used when IPv6 is not available: route-input always
 * fails with -EAFNOSUPPORT.
 */
static int eafnosupport_ipv6_route_input(struct sk_buff *skb)
{
return -EAFNOSUPPORT;
}
static struct fib6_table *eafnosupport_fib6_get_table(struct net *net, u32 id)
{
return NULL;
@@ -170,6 +175,7 @@ eafnosupport_ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
const struct ipv6_stub *ipv6_stub __read_mostly = &(struct ipv6_stub) {
.ipv6_dst_lookup = eafnosupport_ipv6_dst_lookup,
.ipv6_route_input = eafnosupport_ipv6_route_input,
.fib6_get_table = eafnosupport_fib6_get_table,
.fib6_table_lookup = eafnosupport_fib6_table_lookup,
.fib6_lookup = eafnosupport_fib6_lookup,

파일 보기

@@ -900,10 +900,17 @@ static struct pernet_operations inet6_net_ops = {
.exit = inet6_net_exit,
};
/* Real ipv6_stub implementation of route-input: run the IPv6 input route
 * lookup and report the error (if any) recorded on the resulting dst.
 */
static int ipv6_route_input(struct sk_buff *skb)
{
ip6_route_input(skb);
return skb_dst(skb)->error;
}
static const struct ipv6_stub ipv6_stub_impl = {
.ipv6_sock_mc_join = ipv6_sock_mc_join,
.ipv6_sock_mc_drop = ipv6_sock_mc_drop,
.ipv6_dst_lookup = ip6_dst_lookup,
.ipv6_route_input = ipv6_route_input,
.fib6_get_table = fib6_get_table,
.fib6_table_lookup = fib6_table_lookup,
.fib6_lookup = fib6_lookup,