Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-05-14

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Merged tag 'perf-for-bpf-2020-05-06' from tip tree that includes CAP_PERFMON.

2) support for narrow loads in bpf_sock_addr progs and additional
   helpers in cg-skb progs, from Andrey.

3) bpf benchmark runner, from Andrii.

4) arm and riscv JIT optimizations, from Luke.

5) bpf iterator infrastructure, from Yonghong.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller
2020-05-14 20:31:21 -07:00
139 changed files with 5253 additions and 544 deletions

View File

@@ -116,6 +116,7 @@ enum bpf_cmd {
BPF_LINK_GET_FD_BY_ID,
BPF_LINK_GET_NEXT_ID,
BPF_ENABLE_STATS,
BPF_ITER_CREATE,
};
enum bpf_map_type {
@@ -218,6 +219,7 @@ enum bpf_attach_type {
BPF_TRACE_FEXIT,
BPF_MODIFY_RETURN,
BPF_LSM_MAC,
BPF_TRACE_ITER,
__MAX_BPF_ATTACH_TYPE
};
@@ -228,6 +230,7 @@ enum bpf_link_type {
BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
BPF_LINK_TYPE_TRACING = 2,
BPF_LINK_TYPE_CGROUP = 3,
BPF_LINK_TYPE_ITER = 4,
MAX_BPF_LINK_TYPE,
};
@@ -612,6 +615,11 @@ union bpf_attr {
__u32 type;
} enable_stats;
struct { /* struct used by BPF_ITER_CREATE command */
__u32 link_fd;
__u32 flags;
} iter_create;
} __attribute__((aligned(8)));
/* The description below is an attempt at providing documentation to eBPF
@@ -667,8 +675,8 @@ union bpf_attr {
* For tracing programs, safely attempt to read *size* bytes from
* kernel space address *unsafe_ptr* and store the data in *dst*.
*
* Generally, use bpf_probe_read_user() or bpf_probe_read_kernel()
* instead.
* Generally, use **bpf_probe_read_user**\ () or
* **bpf_probe_read_kernel**\ () instead.
* Return
* 0 on success, or a negative error in case of failure.
*
@@ -676,7 +684,7 @@ union bpf_attr {
* Description
* Return the time elapsed since system boot, in nanoseconds.
* Does not include time the system was suspended.
* See: clock_gettime(CLOCK_MONOTONIC)
* See: **clock_gettime**\ (**CLOCK_MONOTONIC**)
* Return
* Current *ktime*.
*
@@ -1535,11 +1543,11 @@ union bpf_attr {
* int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr)
* Description
* Copy a NUL terminated string from an unsafe kernel address
* *unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for
* *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for
* more details.
*
* Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str()
* instead.
* Generally, use **bpf_probe_read_user_str**\ () or
* **bpf_probe_read_kernel_str**\ () instead.
* Return
* On success, the strictly positive length of the string,
* including the trailing NUL character. On error, a negative
@@ -1567,7 +1575,7 @@ union bpf_attr {
*
* u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
* Description
* Equivalent to bpf_get_socket_cookie() helper that accepts
* Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
* *skb*, but gets socket from **struct bpf_sock_ops** context.
* Return
* A 8-byte long non-decreasing number.
@@ -1596,6 +1604,7 @@ union bpf_attr {
* The option value of length *optlen* is pointed by *optval*.
*
* *bpf_socket* should be one of the following:
*
* * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
* * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
* and **BPF_CGROUP_INET6_CONNECT**.
@@ -1664,12 +1673,12 @@ union bpf_attr {
*
* The lower two bits of *flags* are used as the return code if
* the map lookup fails. This is so that the return value can be
* one of the XDP program return codes up to XDP_TX, as chosen by
* the caller. Any higher bits in the *flags* argument must be
* one of the XDP program return codes up to **XDP_TX**, as chosen
* by the caller. Any higher bits in the *flags* argument must be
* unset.
*
* See also bpf_redirect(), which only supports redirecting to an
* ifindex, but doesn't require a map to do so.
* See also **bpf_redirect**\ (), which only supports redirecting
* to an ifindex, but doesn't require a map to do so.
* Return
* **XDP_REDIRECT** on success, or the value of the two lower bits
* of the *flags* argument on error.
@@ -1777,7 +1786,7 @@ union bpf_attr {
* the time running for event since last normalization. The
* enabled and running times are accumulated since the perf event
* open. To achieve scaling factor between two invocations of an
* eBPF program, users can can use CPU id as the key (which is
* eBPF program, users can use CPU id as the key (which is
* typical for perf array usage model) to remember the previous
* value and do the calculation inside the eBPF program.
* Return
@@ -1804,6 +1813,7 @@ union bpf_attr {
* *opval* and of length *optlen*.
*
* *bpf_socket* should be one of the following:
*
* * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
* * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
* and **BPF_CGROUP_INET6_CONNECT**.
@@ -1825,7 +1835,7 @@ union bpf_attr {
* The first argument is the context *regs* on which the kprobe
* works.
*
* This helper works by setting setting the PC (program counter)
* This helper works by setting the PC (program counter)
* to an override function which is run in place of the original
* probed function. This means the probed function is not run at
* all. The replacement function just returns with the required
@@ -1994,10 +2004,11 @@ union bpf_attr {
*
* This helper works for IPv4 and IPv6, TCP and UDP sockets. The
* domain (*addr*\ **->sa_family**) must be **AF_INET** (or
* **AF_INET6**). Looking for a free port to bind to can be
* expensive, therefore binding to port is not permitted by the
* helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
* must be set to zero.
* **AF_INET6**). It's advised to pass zero port (**sin_port**
* or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like
* behavior and lets the kernel efficiently pick up an unused
* port as long as 4-tuple is unique. Passing non-zero port might
* lead to degraded performance.
* Return
* 0 on success, or a negative error in case of failure.
*
@@ -2291,7 +2302,7 @@ union bpf_attr {
* **bpf_rc_keydown**\ () again with the same values, or calling
* **bpf_rc_repeat**\ ().
*
* Some protocols include a toggle bit, in case the button was
* Some protocols include a toggle bit, in case the button was
* released and pressed again between consecutive scancodes.
*
* The *ctx* should point to the lirc sample as passed into
@@ -2637,7 +2648,6 @@ union bpf_attr {
*
* *th* points to the start of the TCP header, while *th_len*
* contains **sizeof**\ (**struct tcphdr**).
*
* Return
* 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative
* error otherwise.
@@ -2820,7 +2830,6 @@ union bpf_attr {
*
* *th* points to the start of the TCP header, while *th_len*
* contains the length of the TCP header.
*
* Return
* On success, lower 32 bits hold the generated SYN cookie in
* followed by 16 bits which hold the MSS value for that cookie,
@@ -2903,7 +2912,7 @@ union bpf_attr {
* // size, after checking its boundaries.
* }
*
* In comparison, using **bpf_probe_read_user()** helper here
* In comparison, using **bpf_probe_read_user**\ () helper here
* instead to read the string would require to estimate the length
* at compile time, and would often result in copying more memory
* than necessary.
@@ -2921,14 +2930,14 @@ union bpf_attr {
* int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr)
* Description
* Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
* to *dst*. Same semantics as with bpf_probe_read_user_str() apply.
* to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply.
* Return
* On success, the strictly positive length of the string, including
* On success, the strictly positive length of the string, including
* the trailing NUL character. On error, a negative value.
*
* int bpf_tcp_send_ack(void *tp, u32 rcv_nxt)
* Description
* Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock.
* Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**.
* *rcv_nxt* is the ack_seq to be sent out.
* Return
* 0 on success, or a negative error in case of failure.
@@ -2956,19 +2965,19 @@ union bpf_attr {
* int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
* Description
* For an eBPF program attached to a perf event, retrieve the
* branch records (struct perf_branch_entry) associated to *ctx*
* and store it in the buffer pointed by *buf* up to size
* branch records (**struct perf_branch_entry**) associated to *ctx*
* and store it in the buffer pointed by *buf* up to size
* *size* bytes.
* Return
* On success, number of bytes written to *buf*. On error, a
* negative value.
*
* The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to
* instead return the number of bytes required to store all the
* instead return the number of bytes required to store all the
* branch entries. If this flag is set, *buf* may be NULL.
*
* **-EINVAL** if arguments invalid or **size** not a multiple
* of sizeof(struct perf_branch_entry).
* of **sizeof**\ (**struct perf_branch_entry**\ ).
*
* **-ENOENT** if architecture does not support branch records.
*
@@ -2976,8 +2985,8 @@ union bpf_attr {
* Description
* Returns 0 on success, values for *pid* and *tgid* as seen from the current
* *namespace* will be returned in *nsdata*.
*
* On failure, the returned value is one of the following:
* Return
* 0 on success, or one of the following in case of failure:
*
* **-EINVAL** if dev and inum supplied don't match dev_t and inode number
* with nsfs of current task, or if dev conversion to dev_t lost high bits.
@@ -3016,8 +3025,8 @@ union bpf_attr {
* a global identifier that can be assumed unique. If *ctx* is
* NULL, then the helper returns the cookie for the initial
* network namespace. The cookie itself is very similar to that
* of bpf_get_socket_cookie() helper, but for network namespaces
* instead of sockets.
* of **bpf_get_socket_cookie**\ () helper, but for network
* namespaces instead of sockets.
* Return
* A 8-byte long opaque number.
*
@@ -3052,22 +3061,98 @@ union bpf_attr {
*
* The *flags* argument must be zero.
* Return
* 0 on success, or a negative errno in case of failure.
* 0 on success, or a negative error in case of failure:
*
* * **-EINVAL** Unsupported flags specified.
* * **-ENOENT** Socket is unavailable for assignment.
* * **-ENETUNREACH** Socket is unreachable (wrong netns).
* * **-EOPNOTSUPP** Unsupported operation, for example a
* call from outside of TC ingress.
* * **-ESOCKTNOSUPPORT** Socket type not supported (reuseport).
* **-EINVAL** if specified *flags* are not supported.
*
* **-ENOENT** if the socket is unavailable for assignment.
*
* **-ENETUNREACH** if the socket is unreachable (wrong netns).
*
* **-EOPNOTSUPP** if the operation is not supported, for example
* a call from outside of TC ingress.
*
* **-ESOCKTNOSUPPORT** if the socket type is not supported
* (reuseport).
*
* u64 bpf_ktime_get_boot_ns(void)
* Description
* Return the time elapsed since system boot, in nanoseconds.
* Does include the time the system was suspended.
* See: clock_gettime(CLOCK_BOOTTIME)
* See: **clock_gettime**\ (**CLOCK_BOOTTIME**)
* Return
* Current *ktime*.
*
* int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
* Description
* **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
* out the format string.
* The *m* represents the seq_file. The *fmt* and *fmt_size* are for
* the format string itself. The *data* and *data_len* are format string
* arguments. The *data* are a **u64** array and corresponding format string
* values are stored in the array. For strings and pointers where pointees
* are accessed, only the pointer values are stored in the *data* array.
* The *data_len* is the size of *data* in bytes.
*
* Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory.
* Reading kernel memory may fail due to either invalid address or
* valid address but requiring a major memory fault. If reading kernel memory
* fails, the string for **%s** will be an empty string, and the ip
* address for **%p{i,I}{4,6}** will be 0. Not returning error to
* bpf program is consistent with what **bpf_trace_printk**\ () does for now.
* Return
* 0 on success, or a negative error in case of failure:
*
* **-EBUSY** if per-CPU memory copy buffer is busy, can try again
* by returning 1 from bpf program.
*
* **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported.
*
* **-E2BIG** if *fmt* contains too many format specifiers.
*
* **-EOVERFLOW** if an overflow happened: The same object will be tried again.
*
* int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
* Description
* **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data.
* The *m* represents the seq_file. The *data* and *len* represent the
* data to write in bytes.
* Return
* 0 on success, or a negative error in case of failure:
*
* **-EOVERFLOW** if an overflow happened: The same object will be tried again.
*
* u64 bpf_sk_cgroup_id(struct bpf_sock *sk)
* Description
* Return the cgroup v2 id of the socket *sk*.
*
* *sk* must be a non-**NULL** pointer to a full socket, e.g. one
* returned from **bpf_sk_lookup_xxx**\ (),
* **bpf_sk_fullsock**\ (), etc. The format of returned id is
* same as in **bpf_skb_cgroup_id**\ ().
*
* This helper is available only if the kernel was compiled with
* the **CONFIG_SOCK_CGROUP_DATA** configuration option.
* Return
* The id is returned or 0 in case the id could not be retrieved.
*
* u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level)
* Description
* Return id of cgroup v2 that is ancestor of cgroup associated
* with the *sk* at the *ancestor_level*. The root cgroup is at
* *ancestor_level* zero and each step down the hierarchy
* increments the level. If *ancestor_level* == level of cgroup
* associated with *sk*, then return value will be same as that
* of **bpf_sk_cgroup_id**\ ().
*
* The helper is useful to implement policies based on cgroups
* that are upper in hierarchy than immediate cgroup associated
* with *sk*.
*
* The format of returned id and helper limitations are same as in
* **bpf_sk_cgroup_id**\ ().
* Return
* The id is returned or 0 in case the id could not be retrieved.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
@@ -3195,7 +3280,11 @@ union bpf_attr {
FN(get_netns_cookie), \
FN(get_current_ancestor_cgroup_id), \
FN(sk_assign), \
FN(ktime_get_boot_ns),
FN(ktime_get_boot_ns), \
FN(seq_printf), \
FN(seq_write), \
FN(sk_cgroup_id), \
FN(sk_ancestor_cgroup_id),
/* integer value in 'imm' field of BPF_CALL instruction selects which helper
* function eBPF program intends to call
@@ -3673,7 +3762,7 @@ struct bpf_sock_addr {
__u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write.
* Stored in network byte order.
*/
__u32 user_port; /* Allows 4-byte read and write.
__u32 user_port; /* Allows 1,2,4-byte read and 4-byte write.
* Stored in network byte order
*/
__u32 family; /* Allows 4-byte read, but no write */