bpf: Introduce SK_LOOKUP program type with a dedicated attach point
Add a new program type BPF_PROG_TYPE_SK_LOOKUP with a dedicated attach type BPF_SK_LOOKUP. The new program kind is to be invoked by the transport layer when looking up a listening socket for a new connection request for connection-oriented protocols, or when looking up an unconnected socket for a packet for connectionless protocols.

When called, an SK_LOOKUP BPF program can select a socket that will receive the packet. This serves as a mechanism to overcome the limits of what the bind() API allows one to express. Two use-cases driving this work are:

(1) steer packets destined to an IP range, on a fixed port, to a socket: 192.0.2.0/24, port 80 -> NGINX socket

(2) steer packets destined to an IP address, on any port, to a socket: 198.51.100.1, any port -> L7 proxy socket

In its run-time context, the program receives information about the packet that triggered the socket lookup, namely the IP version, L4 protocol identifier, and address 4-tuple. The context can be further extended to include an ingress interface identifier.

To select a socket, the BPF program fetches it from a map holding socket references, like SOCKMAP or SOCKHASH, and calls the bpf_sk_assign(ctx, sk, ...) helper to record the selection. The transport layer then uses the selected socket as the result of the socket lookup.

In its basic form, SK_LOOKUP acts as a filter and hence must return either SK_PASS or SK_DROP. If the program returns with SK_PASS, the transport layer should look for a socket to receive the packet, or use the one selected by the program if available, while SK_DROP informs the transport layer that the lookup should fail.

This patch only enables the user to attach an SK_LOOKUP program to a network namespace. Subsequent patches hook it up to run on the local delivery path in the IPv4 and IPv6 stacks.

Suggested-by: Marek Majkowski <marek@cloudflare.com>
Signed-off-by: Jakub Sitnicki <jakub@cloudflare.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20200717103536.397595-3-jakub@cloudflare.com
This commit is contained in:

committed by
Alexei Starovoitov

parent
ce3aa9cc51
commit
e9ddbb7707
@@ -189,6 +189,7 @@ enum bpf_prog_type {
|
||||
BPF_PROG_TYPE_STRUCT_OPS,
|
||||
BPF_PROG_TYPE_EXT,
|
||||
BPF_PROG_TYPE_LSM,
|
||||
BPF_PROG_TYPE_SK_LOOKUP,
|
||||
};
|
||||
|
||||
enum bpf_attach_type {
|
||||
@@ -228,6 +229,7 @@ enum bpf_attach_type {
|
||||
BPF_XDP_DEVMAP,
|
||||
BPF_CGROUP_INET_SOCK_RELEASE,
|
||||
BPF_XDP_CPUMAP,
|
||||
BPF_SK_LOOKUP,
|
||||
__MAX_BPF_ATTACH_TYPE
|
||||
};
|
||||
|
||||
@@ -3069,6 +3071,10 @@ union bpf_attr {
|
||||
*
|
||||
* long bpf_sk_assign(struct sk_buff *skb, struct bpf_sock *sk, u64 flags)
|
||||
* Description
|
||||
* Helper is overloaded depending on BPF program type. This
|
||||
* description applies to **BPF_PROG_TYPE_SCHED_CLS** and
|
||||
* **BPF_PROG_TYPE_SCHED_ACT** programs.
|
||||
*
|
||||
* Assign the *sk* to the *skb*. When combined with appropriate
|
||||
* routing configuration to receive the packet towards the socket,
|
||||
* will cause *skb* to be delivered to the specified socket.
|
||||
@@ -3094,6 +3100,56 @@ union bpf_attr {
|
||||
* **-ESOCKTNOSUPPORT** if the socket type is not supported
|
||||
* (reuseport).
|
||||
*
|
||||
* long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags)
|
||||
* Description
|
||||
* Helper is overloaded depending on BPF program type. This
|
||||
* description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs.
|
||||
*
|
||||
* Select the *sk* as a result of a socket lookup.
|
||||
*
|
||||
* For the operation to succeed passed socket must be compatible
|
||||
* with the packet description provided by the *ctx* object.
|
||||
*
|
||||
* L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must
|
||||
* be an exact match. While IP family (**AF_INET** or
|
||||
* **AF_INET6**) must be compatible, that is IPv6 sockets
|
||||
* that are not v6-only can be selected for IPv4 packets.
|
||||
*
|
||||
* Only TCP listeners and UDP unconnected sockets can be
|
||||
* selected. *sk* can also be NULL to reset any previous
|
||||
* selection.
|
||||
*
|
||||
* *flags* argument can be a combination of the following values:
|
||||
*
|
||||
* * **BPF_SK_LOOKUP_F_REPLACE** to override the previous
|
||||
* socket selection, potentially done by a BPF program
|
||||
* that ran before us.
|
||||
*
|
||||
* * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip
|
||||
* load-balancing within reuseport group for the socket
|
||||
* being selected.
|
||||
*
|
||||
* On success *ctx->sk* will point to the selected socket.
|
||||
*
|
||||
* Return
|
||||
* 0 on success, or a negative errno in case of failure.
|
||||
*
|
||||
* * **-EAFNOSUPPORT** if socket family (*sk->family*) is
|
||||
* not compatible with packet family (*ctx->family*).
|
||||
*
|
||||
* * **-EEXIST** if socket has been already selected,
|
||||
* potentially by another program, and
|
||||
* **BPF_SK_LOOKUP_F_REPLACE** flag was not specified.
|
||||
*
|
||||
* * **-EINVAL** if unsupported flags were specified.
|
||||
*
|
||||
* * **-EPROTOTYPE** if socket L4 protocol
|
||||
* (*sk->protocol*) doesn't match packet protocol
|
||||
* (*ctx->protocol*).
|
||||
*
|
||||
* * **-ESOCKTNOSUPPORT** if socket is not in allowed
|
||||
* state (TCP listening or UDP unconnected).
|
||||
*
|
||||
* u64 bpf_ktime_get_boot_ns(void)
|
||||
* Description
|
||||
* Return the time elapsed since system boot, in nanoseconds.
|
||||
@@ -3607,6 +3663,12 @@ enum {
|
||||
BPF_RINGBUF_HDR_SZ = 8,
|
||||
};
|
||||
|
||||
/* Flags for the BPF_FUNC_sk_assign helper when called with a struct bpf_sk_lookup context. Each flag is a distinct bit; flags can be combined. */
|
||||
enum {
|
||||
BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0), /* Override a previous socket selection, potentially made by a program that ran earlier */
|
||||
BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1), /* Skip load-balancing within the reuseport group for the selected socket */
|
||||
};
|
||||
|
||||
/* Mode for BPF_FUNC_skb_adjust_room helper. */
|
||||
enum bpf_adj_room_mode {
|
||||
BPF_ADJ_ROOM_NET,
|
||||
@@ -4349,4 +4411,19 @@ struct bpf_pidns_info {
|
||||
__u32 pid;
|
||||
__u32 tgid;
|
||||
};
|
||||
|
||||
/* User accessible data for SK_LOOKUP programs: describes the packet that triggered the socket lookup (family, L4 protocol, address 4-tuple) and holds the socket selected so far. Add new fields at the end. */
|
||||
struct bpf_sk_lookup {
|
||||
__bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket; points to the socket chosen by a successful bpf_sk_assign() call */
|
||||
|
||||
__u32 family; /* Protocol family (AF_INET, AF_INET6) */
|
||||
__u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */
|
||||
__u32 remote_ip4; /* Remote IPv4 address, network byte order */
|
||||
__u32 remote_ip6[4]; /* Remote IPv6 address, network byte order */
|
||||
__u32 remote_port; /* Remote port, network byte order */
|
||||
__u32 local_ip4; /* Local IPv4 address, network byte order */
|
||||
__u32 local_ip6[4]; /* Local IPv6 address, network byte order */
|
||||
__u32 local_port; /* Local port, host byte order (note: differs from remote_port, which is network byte order) */
|
||||
};
|
||||
|
||||
#endif /* _UAPI__LINUX_BPF_H__ */
|
||||
|
Reference in New Issue
Block a user