Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next
Alexei Starovoitov says:

====================
pull-request: bpf-next 2020-05-14

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Merged tag 'perf-for-bpf-2020-05-06' from tip tree that includes CAP_PERFMON.

2) support for narrow loads in bpf_sock_addr progs and additional
   helpers in cg-skb progs, from Andrey.

3) bpf benchmark runner, from Andrii.

4) arm and riscv JIT optimizations, from Luke.

5) bpf iterator infrastructure, from Yonghong.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
@@ -116,6 +116,7 @@ enum bpf_cmd {
 	BPF_LINK_GET_FD_BY_ID,
 	BPF_LINK_GET_NEXT_ID,
 	BPF_ENABLE_STATS,
+	BPF_ITER_CREATE,
 };
 
 enum bpf_map_type {
@@ -218,6 +219,7 @@ enum bpf_attach_type {
 	BPF_TRACE_FEXIT,
 	BPF_MODIFY_RETURN,
 	BPF_LSM_MAC,
+	BPF_TRACE_ITER,
 	__MAX_BPF_ATTACH_TYPE
 };
 
@@ -228,6 +230,7 @@ enum bpf_link_type {
 	BPF_LINK_TYPE_RAW_TRACEPOINT = 1,
 	BPF_LINK_TYPE_TRACING = 2,
 	BPF_LINK_TYPE_CGROUP = 3,
+	BPF_LINK_TYPE_ITER = 4,
 
 	MAX_BPF_LINK_TYPE,
 };
@@ -612,6 +615,11 @@ union bpf_attr {
 		__u32		type;
 	} enable_stats;
 
+	struct { /* struct used by BPF_ITER_CREATE command */
+		__u32		link_fd;
+		__u32		flags;
+	} iter_create;
+
 } __attribute__((aligned(8)));
 
 /* The description below is an attempt at providing documentation to eBPF
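
The iter_create attributes above are driven from user space with the new BPF_ITER_CREATE command; here is a minimal sketch of that call using only the raw bpf(2) syscall (the wrapper name and the lack of error handling are illustrative, not part of the patch):

#include <unistd.h>
#include <string.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

/* Turn a bpf_link fd (obtained via LINK_CREATE on a BPF_TRACE_ITER
 * program) into an iterator fd that can be read() like a seq_file.
 */
static int iter_create(int link_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.iter_create.link_fd = link_fd;
	attr.iter_create.flags = 0;

	return syscall(__NR_bpf, BPF_ITER_CREATE, &attr, sizeof(attr));
}
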
@@ -667,8 +675,8 @@ union bpf_attr {
 *		For tracing programs, safely attempt to read *size* bytes from
 *		kernel space address *unsafe_ptr* and store the data in *dst*.
 *
-*		Generally, use bpf_probe_read_user() or bpf_probe_read_kernel()
-*		instead.
+*		Generally, use **bpf_probe_read_user**\ () or
+*		**bpf_probe_read_kernel**\ () instead.
 *	Return
 *		0 on success, or a negative error in case of failure.
 *
@@ -676,7 +684,7 @@ union bpf_attr {
 *	Description
 *		Return the time elapsed since system boot, in nanoseconds.
 *		Does not include time the system was suspended.
-*		See: clock_gettime(CLOCK_MONOTONIC)
+*		See: **clock_gettime**\ (**CLOCK_MONOTONIC**)
 *	Return
 *		Current *ktime*.
 *
@@ -1535,11 +1543,11 @@ union bpf_attr {
 * int bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr)
 *	Description
 *		Copy a NUL terminated string from an unsafe kernel address
-*		*unsafe_ptr* to *dst*. See bpf_probe_read_kernel_str() for
+*		*unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for
 *		more details.
 *
-*		Generally, use bpf_probe_read_user_str() or bpf_probe_read_kernel_str()
-*		instead.
+*		Generally, use **bpf_probe_read_user_str**\ () or
+*		**bpf_probe_read_kernel_str**\ () instead.
 *	Return
 *		On success, the strictly positive length of the string,
 *		including the trailing NUL character. On error, a negative
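
The recommendation to use the address-space-specific string helpers looks like this in a probe; a hedged sketch (the do_sys_open attach point, counter and buffer size are only an example):

#include <linux/ptrace.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>	/* BPF_KPROBE; build with the usual -D__TARGET_ARCH_xxx */

long opens_seen = 0;

/* do_sys_open()'s filename argument is a user-space pointer, so the
 * _user variant is the right choice here; bpf_probe_read_kernel_str()
 * is the counterpart for kernel addresses.
 */
SEC("kprobe/do_sys_open")
int BPF_KPROBE(trace_open, int dfd, const char *filename)
{
	char buf[64];

	if (bpf_probe_read_user_str(buf, sizeof(buf), filename) > 0)
		__sync_fetch_and_add(&opens_seen, 1);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
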
@@ -1567,7 +1575,7 @@ union bpf_attr {
 *
 * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx)
 *	Description
-*		Equivalent to bpf_get_socket_cookie() helper that accepts
+*		Equivalent to **bpf_get_socket_cookie**\ () helper that accepts
 *		*skb*, but gets socket from **struct bpf_sock_ops** context.
 *	Return
 *		A 8-byte long non-decreasing number.
@@ -1596,6 +1604,7 @@ union bpf_attr {
 *		The option value of length *optlen* is pointed by *optval*.
 *
 *		*bpf_socket* should be one of the following:
+*
 *		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
 *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
 *		  and **BPF_CGROUP_INET6_CONNECT**.
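
With **struct bpf_sock_addr** listed as a valid *bpf_socket*, a cgroup connect hook can adjust the socket it is about to connect; a hedged sketch (the SO_MARK value and program name are made up):

#include <sys/socket.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Mark outgoing TCP sockets so that routing or tc policy can later
 * match on the mark.  SO_MARK is just one example of an option that
 * bpf_setsockopt() accepts.
 */
SEC("cgroup/connect4")
int set_mark_on_connect(struct bpf_sock_addr *ctx)
{
	int mark = 0x2a;

	if (ctx->type == SOCK_STREAM)
		bpf_setsockopt(ctx, SOL_SOCKET, SO_MARK, &mark, sizeof(mark));
	return 1;	/* 1 == let the connect() proceed */
}

char LICENSE[] SEC("license") = "GPL";
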
@@ -1664,12 +1673,12 @@ union bpf_attr {
 *
 *		The lower two bits of *flags* are used as the return code if
 *		the map lookup fails. This is so that the return value can be
-*		one of the XDP program return codes up to XDP_TX, as chosen by
-*		the caller. Any higher bits in the *flags* argument must be
+*		one of the XDP program return codes up to **XDP_TX**, as chosen
+*		by the caller. Any higher bits in the *flags* argument must be
 *		unset.
 *
-*		See also bpf_redirect(), which only supports redirecting to an
-*		ifindex, but doesn't require a map to do so.
+*		See also **bpf_redirect**\ (), which only supports redirecting
+*		to an ifindex, but doesn't require a map to do so.
 *	Return
 *		**XDP_REDIRECT** on success, or the value of the two lower bits
 *		of the *flags* argument on error.
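
The lower-two-bits fallback described above is typically used by passing an XDP return code as *flags*; a hedged sketch, with an assumed single-slot devmap:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Illustrative devmap: slot 0 holds the egress ifindex. */
struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(max_entries, 1);
	__type(key, __u32);
	__type(value, __u32);
} tx_port SEC(".maps");

SEC("xdp")
int xdp_redirect_prog(struct xdp_md *ctx)
{
	/* If the lookup at key 0 fails, the lower two bits of the flags
	 * argument (XDP_PASS here) are returned instead of XDP_REDIRECT,
	 * so the packet simply continues up the stack.
	 */
	return bpf_redirect_map(&tx_port, 0, XDP_PASS);
}

char LICENSE[] SEC("license") = "GPL";
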
@@ -1777,7 +1786,7 @@ union bpf_attr {
 *		the time running for event since last normalization. The
 *		enabled and running times are accumulated since the perf event
 *		open. To achieve scaling factor between two invocations of an
-*		eBPF program, users can can use CPU id as the key (which is
+*		eBPF program, users can use CPU id as the key (which is
 *		typical for perf array usage model) to remember the previous
 *		value and do the calculation inside the eBPF program.
 *	Return
@@ -1804,6 +1813,7 @@ union bpf_attr {
 *		*opval* and of length *optlen*.
 *
 *		*bpf_socket* should be one of the following:
+*
 *		* **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**.
 *		* **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**
 *		  and **BPF_CGROUP_INET6_CONNECT**.
@@ -1825,7 +1835,7 @@ union bpf_attr {
 *		The first argument is the context *regs* on which the kprobe
 *		works.
 *
-*		This helper works by setting setting the PC (program counter)
+*		This helper works by setting the PC (program counter)
 *		to an override function which is run in place of the original
 *		probed function. This means the probed function is not run at
 *		all. The replacement function just returns with the required
@@ -1994,10 +2004,11 @@ union bpf_attr {
 *
 *		This helper works for IPv4 and IPv6, TCP and UDP sockets. The
 *		domain (*addr*\ **->sa_family**) must be **AF_INET** (or
-*		**AF_INET6**). Looking for a free port to bind to can be
-*		expensive, therefore binding to port is not permitted by the
-*		helper: *addr*\ **->sin_port** (or **sin6_port**, respectively)
-*		must be set to zero.
+*		**AF_INET6**). It's advised to pass zero port (**sin_port**
+*		or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like
+*		behavior and lets the kernel efficiently pick up an unused
+*		port as long as 4-tuple is unique. Passing non-zero port might
+*		lead to degraded performance.
 *	Return
 *		0 on success, or a negative error in case of failure.
 *
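
Under the relaxed rule documented above, a connect hook can pin the source address and leave the port at zero so the kernel picks one; a hedged sketch (the source address and program name are arbitrary examples):

#include <sys/socket.h>
#include <linux/in.h>
#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

/* Force outgoing IPv4 connections to use 192.0.2.1 as source address
 * while letting the kernel pick the source port (sin_port == 0 gives
 * the IP_BIND_ADDRESS_NO_PORT-like behavior described above).
 */
SEC("cgroup/connect4")
int bind_src_addr(struct bpf_sock_addr *ctx)
{
	struct sockaddr_in addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = bpf_htonl(0xc0000201),	/* 192.0.2.1 */
		.sin_port = 0,					/* kernel picks the port */
	};

	bpf_bind(ctx, (struct sockaddr *)&addr, sizeof(addr));
	return 1;
}

char LICENSE[] SEC("license") = "GPL";
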
@@ -2291,7 +2302,7 @@ union bpf_attr {
 *		**bpf_rc_keydown**\ () again with the same values, or calling
 *		**bpf_rc_repeat**\ ().
 *
-*		Some protocols include a toggle bit, in case the button was
+*		Some protocols include a toggle bit, in case the button was
 *		released and pressed again between consecutive scancodes.
 *
 *		The *ctx* should point to the lirc sample as passed into
@@ -2637,7 +2648,6 @@ union bpf_attr {
 *
 *		*th* points to the start of the TCP header, while *th_len*
 *		contains **sizeof**\ (**struct tcphdr**).
-*
 *	Return
 *		0 if *iph* and *th* are a valid SYN cookie ACK, or a negative
 *		error otherwise.
@@ -2820,7 +2830,6 @@ union bpf_attr {
 *
 *		*th* points to the start of the TCP header, while *th_len*
 *		contains the length of the TCP header.
-*
 *	Return
 *		On success, lower 32 bits hold the generated SYN cookie in
 *		followed by 16 bits which hold the MSS value for that cookie,
@@ -2903,7 +2912,7 @@ union bpf_attr {
 *			// size, after checking its boundaries.
 *		}
 *
-*		In comparison, using **bpf_probe_read_user()** helper here
+*		In comparison, using **bpf_probe_read_user**\ () helper here
 *		instead to read the string would require to estimate the length
 *		at compile time, and would often result in copying more memory
 *		than necessary.
@@ -2921,14 +2930,14 @@ union bpf_attr {
 * int bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr)
 *	Description
 *		Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr*
-*		to *dst*. Same semantics as with bpf_probe_read_user_str() apply.
+*		to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply.
 *	Return
-*		On success, the strictly positive length of the string, including
+*		On success, the strictly positive length of the string, including
 *		the trailing NUL character. On error, a negative value.
 *
 * int bpf_tcp_send_ack(void *tp, u32 rcv_nxt)
 *	Description
-*		Send out a tcp-ack. *tp* is the in-kernel struct tcp_sock.
+*		Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**.
 *		*rcv_nxt* is the ack_seq to be sent out.
 *	Return
 *		0 on success, or a negative error in case of failure.
@@ -2956,19 +2965,19 @@ union bpf_attr {
 * int bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags)
 *	Description
 *		For an eBPF program attached to a perf event, retrieve the
-*		branch records (struct perf_branch_entry) associated to *ctx*
-*		and store it in the buffer pointed by *buf* up to size
+*		branch records (**struct perf_branch_entry**) associated to *ctx*
+*		and store it in the buffer pointed by *buf* up to size
 *		*size* bytes.
 *	Return
 *		On success, number of bytes written to *buf*. On error, a
 *		negative value.
 *
 *		The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to
-*		instead return the number of bytes required to store all the
+*		instead return the number of bytes required to store all the
 *		branch entries. If this flag is set, *buf* may be NULL.
 *
 *		**-EINVAL** if arguments invalid or **size** not a multiple
-*		of sizeof(struct perf_branch_entry).
+*		of **sizeof**\ (**struct perf_branch_entry**\ ).
 *
 *		**-ENOENT** if architecture does not support branch records.
 *
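
The BPF_F_GET_BRANCH_RECORDS_SIZE flag suggests a size-then-read pattern; a hedged perf_event sketch (the buffer size and bookkeeping are illustrative):

#include <linux/bpf.h>
#include <linux/bpf_perf_event.h>
#include <linux/perf_event.h>
#include <bpf/bpf_helpers.h>

/* Room for a handful of entries; the size passed to the helper must be
 * a multiple of sizeof(struct perf_branch_entry).
 */
struct perf_branch_entry entries[8];
long entries_bytes = 0;

SEC("perf_event")
int on_sample(struct bpf_perf_event_data *ctx)
{
	long needed, copied;

	/* First call: with BPF_F_GET_BRANCH_RECORDS_SIZE the helper only
	 * reports how many bytes all branch records would occupy.
	 */
	needed = bpf_read_branch_records(ctx, NULL, 0,
					 BPF_F_GET_BRANCH_RECORDS_SIZE);
	if (needed < 0)
		return 0;	/* e.g. -ENOENT: no branch records on this arch */

	/* Second call: copy as much as fits into the fixed buffer. */
	copied = bpf_read_branch_records(ctx, entries, sizeof(entries), 0);
	if (copied > 0)
		entries_bytes = copied;
	return 0;
}

char LICENSE[] SEC("license") = "GPL";
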
@@ -2976,8 +2985,8 @@ union bpf_attr {
 *	Description
 *		Returns 0 on success, values for *pid* and *tgid* as seen from the current
 *		*namespace* will be returned in *nsdata*.
-*
-*		On failure, the returned value is one of the following:
+*	Return
+*		0 on success, or one of the following in case of failure:
 *
 *		**-EINVAL** if dev and inum supplied don't match dev_t and inode number
 *		with nsfs of current task, or if dev conversion to dev_t lost high bits.
@@ -3016,8 +3025,8 @@ union bpf_attr {
 *		a global identifier that can be assumed unique. If *ctx* is
 *		NULL, then the helper returns the cookie for the initial
 *		network namespace. The cookie itself is very similar to that
-*		of bpf_get_socket_cookie() helper, but for network namespaces
-*		instead of sockets.
+*		of **bpf_get_socket_cookie**\ () helper, but for network
+*		namespaces instead of sockets.
 *	Return
 *		A 8-byte long opaque number.
 *
@@ -3052,22 +3061,98 @@ union bpf_attr {
 *
 *		The *flags* argument must be zero.
 *	Return
-*		0 on success, or a negative errno in case of failure.
+*		0 on success, or a negative error in case of failure:
 *
-*		* **-EINVAL**		Unsupported flags specified.
-*		* **-ENOENT**		Socket is unavailable for assignment.
-*		* **-ENETUNREACH**	Socket is unreachable (wrong netns).
-*		* **-EOPNOTSUPP**	Unsupported operation, for example a
-*					call from outside of TC ingress.
-*		* **-ESOCKTNOSUPPORT**	Socket type not supported (reuseport).
+*		**-EINVAL** if specified *flags* are not supported.
+*
+*		**-ENOENT** if the socket is unavailable for assignment.
+*
+*		**-ENETUNREACH** if the socket is unreachable (wrong netns).
+*
+*		**-EOPNOTSUPP** if the operation is not supported, for example
+*		a call from outside of TC ingress.
+*
+*		**-ESOCKTNOSUPPORT** if the socket type is not supported
+*		(reuseport).
 *
 * u64 bpf_ktime_get_boot_ns(void)
 *	Description
 *		Return the time elapsed since system boot, in nanoseconds.
 *		Does include the time the system was suspended.
-*		See: clock_gettime(CLOCK_BOOTTIME)
+*		See: **clock_gettime**\ (**CLOCK_BOOTTIME**)
 *	Return
 *		Current *ktime*.
+*
+* int bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len)
+*	Description
+*		**bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print
+*		out the format string.
+*		The *m* represents the seq_file. The *fmt* and *fmt_size* are for
+*		the format string itself. The *data* and *data_len* are format string
+*		arguments. The *data* are a **u64** array and corresponding format string
+*		values are stored in the array. For strings and pointers where pointees
+*		are accessed, only the pointer values are stored in the *data* array.
+*		The *data_len* is the size of *data* in bytes.
+*
+*		Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory.
+*		Reading kernel memory may fail due to either invalid address or
+*		valid address but requiring a major memory fault. If reading kernel memory
+*		fails, the string for **%s** will be an empty string, and the ip
+*		address for **%p{i,I}{4,6}** will be 0. Not returning error to
+*		bpf program is consistent with what **bpf_trace_printk**\ () does for now.
+*	Return
+*		0 on success, or a negative error in case of failure:
+*
+*		**-EBUSY** if per-CPU memory copy buffer is busy, can try again
+*		by returning 1 from bpf program.
+*
+*		**-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported.
+*
+*		**-E2BIG** if *fmt* contains too many format specifiers.
+*
+*		**-EOVERFLOW** if an overflow happened: The same object will be tried again.
+*
+* int bpf_seq_write(struct seq_file *m, const void *data, u32 len)
+*	Description
+*		**bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data.
+*		The *m* represents the seq_file. The *data* and *len* represent the
+*		data to write in bytes.
+*	Return
+*		0 on success, or a negative error in case of failure:
+*
+*		**-EOVERFLOW** if an overflow happened: The same object will be tried again.
+*
+* u64 bpf_sk_cgroup_id(struct bpf_sock *sk)
+*	Description
+*		Return the cgroup v2 id of the socket *sk*.
+*
+*		*sk* must be a non-**NULL** pointer to a full socket, e.g. one
+*		returned from **bpf_sk_lookup_xxx**\ (),
+*		**bpf_sk_fullsock**\ (), etc. The format of returned id is
+*		same as in **bpf_skb_cgroup_id**\ ().
+*
+*		This helper is available only if the kernel was compiled with
+*		the **CONFIG_SOCK_CGROUP_DATA** configuration option.
+*	Return
+*		The id is returned or 0 in case the id could not be retrieved.
+*
+* u64 bpf_sk_ancestor_cgroup_id(struct bpf_sock *sk, int ancestor_level)
+*	Description
+*		Return id of cgroup v2 that is ancestor of cgroup associated
+*		with the *sk* at the *ancestor_level*. The root cgroup is at
+*		*ancestor_level* zero and each step down the hierarchy
+*		increments the level. If *ancestor_level* == level of cgroup
+*		associated with *sk*, then return value will be same as that
+*		of **bpf_sk_cgroup_id**\ ().
+*
+*		The helper is useful to implement policies based on cgroups
+*		that are upper in hierarchy than immediate cgroup associated
+*		with *sk*.
+*
+*		The format of returned id and helper limitations are same as in
+*		**bpf_sk_cgroup_id**\ ().
+*	Return
+*		The id is returned or 0 in case the id could not be retrieved.
 */
 #define __BPF_FUNC_MAPPER(FN)		\
	FN(unspec),			\
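
To show how bpf_seq_printf() and the iterator plumbing fit together, here is a hedged sketch of a task iterator; the bpf_iter__task context and the "iter/task" section name follow the selftests added with this series and are assumed to be available via vmlinux.h and a matching libbpf:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

char LICENSE[] SEC("license") = "GPL";

/* Every read() of the iterator fd runs this program once per task,
 * plus a final call with task == NULL at the end of the walk.
 */
SEC("iter/task")
int dump_task(struct bpf_iter__task *ctx)
{
	static const char fmt[] = "pid=%d comm=%s\n";
	struct seq_file *seq = ctx->meta->seq;
	struct task_struct *task = ctx->task;
	__u64 args[2];

	if (!task)
		return 0;

	/* For %s only the pointer is placed in the data array; the helper
	 * reads the string from kernel memory itself.
	 */
	args[0] = task->tgid;
	args[1] = (__u64)(long)task->comm;
	bpf_seq_printf(seq, fmt, sizeof(fmt), args, sizeof(args));
	return 0;
}
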
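
And a hedged cgroup-skb sketch for the two new cgroup-id helpers; the ancestor level, the rodata knob and the drop policy are made up for illustration, and it assumes cgroup-skb programs on this kernel may read skb->sk and call bpf_sk_fullsock():

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Set from user space (e.g. via the skeleton) to the cgroup id whose
 * traffic is allowed to egress.
 */
volatile const __u64 allowed_cgroup_id = 0;

SEC("cgroup_skb/egress")
int egress_by_cgroup(struct __sk_buff *skb)
{
	struct bpf_sock *sk = skb->sk;
	__u64 id;

	if (!sk)
		return 1;		/* no socket, let it pass */
	sk = bpf_sk_fullsock(sk);	/* the helpers want a full socket */
	if (!sk)
		return 1;

	/* Id of the cgroup two levels below the root, as an example of
	 * bpf_sk_ancestor_cgroup_id(); bpf_sk_cgroup_id(sk) would return
	 * the immediate cgroup of the socket instead.
	 */
	id = bpf_sk_ancestor_cgroup_id(sk, 2);
	return id == allowed_cgroup_id;	/* 1 == allow, 0 == drop */
}

char LICENSE[] SEC("license") = "GPL";
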
@@ -3195,7 +3280,11 @@ union bpf_attr {
	FN(get_netns_cookie),		\
	FN(get_current_ancestor_cgroup_id),	\
	FN(sk_assign),			\
-	FN(ktime_get_boot_ns),
+	FN(ktime_get_boot_ns),		\
+	FN(seq_printf),			\
+	FN(seq_write),			\
+	FN(sk_cgroup_id),		\
+	FN(sk_ancestor_cgroup_id),
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
 * function eBPF program intends to call
@@ -3673,7 +3762,7 @@ struct bpf_sock_addr {
	__u32 user_ip6[4];	/* Allows 1,2,4,8-byte read and 4,8-byte write.
				 * Stored in network byte order.
				 */
-	__u32 user_port;	/* Allows 4-byte read and write.
+	__u32 user_port;	/* Allows 1,2,4-byte read and 4-byte write.
				 * Stored in network byte order
				 */
	__u32 family;		/* Allows 4-byte read, but no write */
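
The widened access list on user_port mostly matters because the compiler may lower an ordinary comparison into a 1- or 2-byte load of that field; a hedged sketch of such a check in a connect hook (the blocked port is arbitrary):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>

SEC("cgroup/connect4")
int block_port_25(struct bpf_sock_addr *ctx)
{
	/* user_port is stored in network byte order; clang is free to
	 * implement this comparison with a 1- or 2-byte load of the
	 * field, which is exactly the access the verifier now accepts.
	 */
	if (ctx->user_port == bpf_htons(25))
		return 0;	/* deny connect() to port 25 */
	return 1;
}

char LICENSE[] SEC("license") = "GPL";
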