Merge git://git.kernel.org/pub/scm/linux/kernel/git/bpf/bpf-next

Daniel Borkmann says:

====================
pull-request: bpf-next 2018-10-16

The following pull-request contains BPF updates for your *net-next* tree.

The main changes are:

1) Convert BPF sockmap and kTLS to both use a new sk_msg API and enable
   sk_msg BPF integration for the latter, from Daniel and John.

2) Enable BPF syscall side to indicate for maps that they do not support
   a map lookup operation as opposed to just missing key, from Prashant.

3) Add bpftool map create command which after map creation pins the
   map into bpf fs for further processing, from Jakub.

4) Add bpftool support for attaching programs to maps allowing sock_map
   and sock_hash to be used from bpftool, from John.

5) Improve syscall BPF map update/delete path for map-in-map types to
   wait a RCU grace period for pending references to complete, from Daniel.

6) Couple of follow-up fixes for the BPF socket lookup to get it
   enabled also when IPv6 is compiled as a module, from Joe.

7) Fix a generic-XDP bug to handle the case when the Ethernet header
   was mangled and thus update skb's protocol and data, from Jesper.

8) Add a missing BTF header length check between header copies from
   user space, from Wenwen.

9) Minor fixups in libbpf to use __u32 instead u32 types and include
   proper perf_event.h uapi header instead of perf internal one, from Yonghong.

10) Allow to pass user-defined flags through EXTRA_CFLAGS and EXTRA_LDFLAGS
    to bpftool's build, from Jiri.

11) BPF kselftest tweaks to add LWTUNNEL to config fragment and to install
    with_addr.sh script from flow dissector selftest, from Anders.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
David S. Miller
2018-10-15 23:21:07 -07:00
54 changed files with 4978 additions and 3675 deletions

View File

@@ -36,7 +36,8 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test
test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \
get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o test_sk_lookup_kern.o
test_skb_cgroup_id_kern.o bpf_flow.o netcnt_prog.o \
test_sk_lookup_kern.o test_xdp_vlan.o
# Order correspond to 'make run_tests' order
TEST_PROGS := test_kmod.sh \
@@ -49,7 +50,10 @@ TEST_PROGS := test_kmod.sh \
test_lwt_seg6local.sh \
test_lirc_mode2.sh \
test_skb_cgroup_id.sh \
test_flow_dissector.sh
test_flow_dissector.sh \
test_xdp_vlan.sh
TEST_PROGS_EXTENDED := with_addr.sh
# Compile but not part of 'make run_tests'
TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \

View File

@@ -155,6 +155,10 @@ static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx,
(void *) BPF_FUNC_sk_lookup_udp;
static int (*bpf_sk_release)(struct bpf_sock *sk) =
(void *) BPF_FUNC_sk_release;
static int (*bpf_skb_vlan_push)(void *ctx, __be16 vlan_proto, __u16 vlan_tci) =
(void *) BPF_FUNC_skb_vlan_push;
static int (*bpf_skb_vlan_pop)(void *ctx) =
(void *) BPF_FUNC_skb_vlan_pop;
/* llvm builtin functions that eBPF C program may use to
* emit BPF_LD_ABS and BPF_LD_IND instructions

View File

@@ -19,3 +19,4 @@ CONFIG_CRYPTO_SHA256=m
CONFIG_VXLAN=y
CONFIG_GENEVE=y
CONFIG_NET_CLS_FLOWER=m
CONFIG_LWTUNNEL=y

View File

@@ -71,6 +71,7 @@ int txmsg_start;
int txmsg_end;
int txmsg_ingress;
int txmsg_skb;
int ktls;
static const struct option long_options[] = {
{"help", no_argument, NULL, 'h' },
@@ -92,6 +93,7 @@ static const struct option long_options[] = {
{"txmsg_end", required_argument, NULL, 'e'},
{"txmsg_ingress", no_argument, &txmsg_ingress, 1 },
{"txmsg_skb", no_argument, &txmsg_skb, 1 },
{"ktls", no_argument, &ktls, 1 },
{0, 0, NULL, 0 }
};
@@ -112,6 +114,76 @@ static void usage(char *argv[])
printf("\n");
}
#define TCP_ULP 31
#define TLS_TX 1
#define TLS_RX 2
#include <linux/tls.h>
char *sock_to_string(int s)
{
if (s == c1)
return "client1";
else if (s == c2)
return "client2";
else if (s == s1)
return "server1";
else if (s == s2)
return "server2";
else if (s == p1)
return "peer1";
else if (s == p2)
return "peer2";
else
return "unknown";
}
static int sockmap_init_ktls(int verbose, int s)
{
struct tls12_crypto_info_aes_gcm_128 tls_tx = {
.info = {
.version = TLS_1_2_VERSION,
.cipher_type = TLS_CIPHER_AES_GCM_128,
},
};
struct tls12_crypto_info_aes_gcm_128 tls_rx = {
.info = {
.version = TLS_1_2_VERSION,
.cipher_type = TLS_CIPHER_AES_GCM_128,
},
};
int so_buf = 6553500;
int err;
err = setsockopt(s, 6, TCP_ULP, "tls", sizeof("tls"));
if (err) {
fprintf(stderr, "setsockopt: TCP_ULP(%s) failed with error %i\n", sock_to_string(s), err);
return -EINVAL;
}
err = setsockopt(s, SOL_TLS, TLS_TX, (void *)&tls_tx, sizeof(tls_tx));
if (err) {
fprintf(stderr, "setsockopt: TLS_TX(%s) failed with error %i\n", sock_to_string(s), err);
return -EINVAL;
}
err = setsockopt(s, SOL_TLS, TLS_RX, (void *)&tls_rx, sizeof(tls_rx));
if (err) {
fprintf(stderr, "setsockopt: TLS_RX(%s) failed with error %i\n", sock_to_string(s), err);
return -EINVAL;
}
err = setsockopt(s, SOL_SOCKET, SO_SNDBUF, &so_buf, sizeof(so_buf));
if (err) {
fprintf(stderr, "setsockopt: (%s) failed sndbuf with error %i\n", sock_to_string(s), err);
return -EINVAL;
}
err = setsockopt(s, SOL_SOCKET, SO_RCVBUF, &so_buf, sizeof(so_buf));
if (err) {
fprintf(stderr, "setsockopt: (%s) failed rcvbuf with error %i\n", sock_to_string(s), err);
return -EINVAL;
}
if (verbose)
fprintf(stdout, "socket(%s) kTLS enabled\n", sock_to_string(s));
return 0;
}
static int sockmap_init_sockets(int verbose)
{
int i, err, one = 1;
@@ -456,6 +528,21 @@ static int sendmsg_test(struct sockmap_options *opt)
else
rx_fd = p2;
if (ktls) {
/* Redirecting into non-TLS socket which sends into a TLS
* socket is not a valid test. So in this case lets not
* enable kTLS but still run the test.
*/
if (!txmsg_redir || (txmsg_redir && txmsg_ingress)) {
err = sockmap_init_ktls(opt->verbose, rx_fd);
if (err)
return err;
}
err = sockmap_init_ktls(opt->verbose, c1);
if (err)
return err;
}
rxpid = fork();
if (rxpid == 0) {
if (opt->drop_expected)
@@ -907,6 +994,8 @@ static void test_options(char *options)
strncat(options, "ingress,", OPTSTRING);
if (txmsg_skb)
strncat(options, "skb,", OPTSTRING);
if (ktls)
strncat(options, "ktls,", OPTSTRING);
}
static int __test_exec(int cgrp, int test, struct sockmap_options *opt)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,292 @@
/* SPDX-License-Identifier: GPL-2.0
* Copyright(c) 2018 Jesper Dangaard Brouer.
*
* XDP/TC VLAN manipulation example
*
* GOTCHA: Remember to disable NIC hardware offloading of VLANs,
* else the VLAN tags are NOT inlined in the packet payload:
*
* # ethtool -K ixgbe2 rxvlan off
*
* Verify setting:
* # ethtool -k ixgbe2 | grep rx-vlan-offload
* rx-vlan-offload: off
*
*/
#include <stddef.h>
#include <stdbool.h>
#include <string.h>
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/pkt_cls.h>
#include "bpf_helpers.h"
#include "bpf_endian.h"
/* linux/if_vlan.h have not exposed this as UAPI, thus mirror some here
*
* struct vlan_hdr - vlan header
* @h_vlan_TCI: priority and VLAN ID
* @h_vlan_encapsulated_proto: packet type ID or len
*/
struct _vlan_hdr {
__be16 h_vlan_TCI;
__be16 h_vlan_encapsulated_proto;
};
#define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */
#define VLAN_PRIO_SHIFT 13
#define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator */
#define VLAN_TAG_PRESENT VLAN_CFI_MASK
#define VLAN_VID_MASK 0x0fff /* VLAN Identifier */
#define VLAN_N_VID 4096
struct parse_pkt {
__u16 l3_proto;
__u16 l3_offset;
__u16 vlan_outer;
__u16 vlan_inner;
__u8 vlan_outer_offset;
__u8 vlan_inner_offset;
};
char _license[] SEC("license") = "GPL";
static __always_inline
bool parse_eth_frame(struct ethhdr *eth, void *data_end, struct parse_pkt *pkt)
{
__u16 eth_type;
__u8 offset;
offset = sizeof(*eth);
/* Make sure packet is large enough for parsing eth + 2 VLAN headers */
if ((void *)eth + offset + (2*sizeof(struct _vlan_hdr)) > data_end)
return false;
eth_type = eth->h_proto;
/* Handle outer VLAN tag */
if (eth_type == bpf_htons(ETH_P_8021Q)
|| eth_type == bpf_htons(ETH_P_8021AD)) {
struct _vlan_hdr *vlan_hdr;
vlan_hdr = (void *)eth + offset;
pkt->vlan_outer_offset = offset;
pkt->vlan_outer = bpf_ntohs(vlan_hdr->h_vlan_TCI)
& VLAN_VID_MASK;
eth_type = vlan_hdr->h_vlan_encapsulated_proto;
offset += sizeof(*vlan_hdr);
}
/* Handle inner (double) VLAN tag */
if (eth_type == bpf_htons(ETH_P_8021Q)
|| eth_type == bpf_htons(ETH_P_8021AD)) {
struct _vlan_hdr *vlan_hdr;
vlan_hdr = (void *)eth + offset;
pkt->vlan_inner_offset = offset;
pkt->vlan_inner = bpf_ntohs(vlan_hdr->h_vlan_TCI)
& VLAN_VID_MASK;
eth_type = vlan_hdr->h_vlan_encapsulated_proto;
offset += sizeof(*vlan_hdr);
}
pkt->l3_proto = bpf_ntohs(eth_type); /* Convert to host-byte-order */
pkt->l3_offset = offset;
return true;
}
/* Hint, VLANs are choosen to hit network-byte-order issues */
#define TESTVLAN 4011 /* 0xFAB */
// #define TO_VLAN 4000 /* 0xFA0 (hint 0xOA0 = 160) */
SEC("xdp_drop_vlan_4011")
int xdp_prognum0(struct xdp_md *ctx)
{
void *data_end = (void *)(long)ctx->data_end;
void *data = (void *)(long)ctx->data;
struct parse_pkt pkt = { 0 };
if (!parse_eth_frame(data, data_end, &pkt))
return XDP_ABORTED;
/* Drop specific VLAN ID example */
if (pkt.vlan_outer == TESTVLAN)
return XDP_ABORTED;
/*
* Using XDP_ABORTED makes it possible to record this event,
* via tracepoint xdp:xdp_exception like:
* # perf record -a -e xdp:xdp_exception
* # perf script
*/
return XDP_PASS;
}
/*
Commands to setup VLAN on Linux to test packets gets dropped:
export ROOTDEV=ixgbe2
export VLANID=4011
ip link add link $ROOTDEV name $ROOTDEV.$VLANID type vlan id $VLANID
ip link set dev $ROOTDEV.$VLANID up
ip link set dev $ROOTDEV mtu 1508
ip addr add 100.64.40.11/24 dev $ROOTDEV.$VLANID
Load prog with ip tool:
ip link set $ROOTDEV xdp off
ip link set $ROOTDEV xdp object xdp_vlan01_kern.o section xdp_drop_vlan_4011
*/
/* Changing VLAN to zero, have same practical effect as removing the VLAN. */
#define TO_VLAN 0
SEC("xdp_vlan_change")
int xdp_prognum1(struct xdp_md *ctx)
{
void *data_end = (void *)(long)ctx->data_end;
void *data = (void *)(long)ctx->data;
struct parse_pkt pkt = { 0 };
if (!parse_eth_frame(data, data_end, &pkt))
return XDP_ABORTED;
/* Change specific VLAN ID */
if (pkt.vlan_outer == TESTVLAN) {
struct _vlan_hdr *vlan_hdr = data + pkt.vlan_outer_offset;
/* Modifying VLAN, preserve top 4 bits */
vlan_hdr->h_vlan_TCI =
bpf_htons((bpf_ntohs(vlan_hdr->h_vlan_TCI) & 0xf000)
| TO_VLAN);
}
return XDP_PASS;
}
/*
* Show XDP+TC can cooperate, on creating a VLAN rewriter.
* 1. Create a XDP prog that can "pop"/remove a VLAN header.
* 2. Create a TC-bpf prog that egress can add a VLAN header.
*/
#ifndef ETH_ALEN /* Ethernet MAC address length */
#define ETH_ALEN 6 /* bytes */
#endif
#define VLAN_HDR_SZ 4 /* bytes */
SEC("xdp_vlan_remove_outer")
int xdp_prognum2(struct xdp_md *ctx)
{
void *data_end = (void *)(long)ctx->data_end;
void *data = (void *)(long)ctx->data;
struct parse_pkt pkt = { 0 };
char *dest;
if (!parse_eth_frame(data, data_end, &pkt))
return XDP_ABORTED;
/* Skip packet if no outer VLAN was detected */
if (pkt.vlan_outer_offset == 0)
return XDP_PASS;
/* Moving Ethernet header, dest overlap with src, memmove handle this */
dest = data;
dest+= VLAN_HDR_SZ;
/*
* Notice: Taking over vlan_hdr->h_vlan_encapsulated_proto, by
* only moving two MAC addrs (12 bytes), not overwriting last 2 bytes
*/
__builtin_memmove(dest, data, ETH_ALEN * 2);
/* Note: LLVM built-in memmove inlining require size to be constant */
/* Move start of packet header seen by Linux kernel stack */
bpf_xdp_adjust_head(ctx, VLAN_HDR_SZ);
return XDP_PASS;
}
static __always_inline
void shift_mac_4bytes_16bit(void *data)
{
__u16 *p = data;
p[7] = p[5]; /* delete p[7] was vlan_hdr->h_vlan_TCI */
p[6] = p[4]; /* delete p[6] was ethhdr->h_proto */
p[5] = p[3];
p[4] = p[2];
p[3] = p[1];
p[2] = p[0];
}
static __always_inline
void shift_mac_4bytes_32bit(void *data)
{
__u32 *p = data;
/* Assuming VLAN hdr present. The 4 bytes in p[3] that gets
* overwritten, is ethhdr->h_proto and vlan_hdr->h_vlan_TCI.
* The vlan_hdr->h_vlan_encapsulated_proto take over role as
* ethhdr->h_proto.
*/
p[3] = p[2];
p[2] = p[1];
p[1] = p[0];
}
SEC("xdp_vlan_remove_outer2")
int xdp_prognum3(struct xdp_md *ctx)
{
void *data_end = (void *)(long)ctx->data_end;
void *data = (void *)(long)ctx->data;
struct ethhdr *orig_eth = data;
struct parse_pkt pkt = { 0 };
if (!parse_eth_frame(orig_eth, data_end, &pkt))
return XDP_ABORTED;
/* Skip packet if no outer VLAN was detected */
if (pkt.vlan_outer_offset == 0)
return XDP_PASS;
/* Simply shift down MAC addrs 4 bytes, overwrite h_proto + TCI */
shift_mac_4bytes_32bit(data);
/* Move start of packet header seen by Linux kernel stack */
bpf_xdp_adjust_head(ctx, VLAN_HDR_SZ);
return XDP_PASS;
}
/*=====================================
* BELOW: TC-hook based ebpf programs
* ====================================
* The TC-clsact eBPF programs (currently) need to be attach via TC commands
*/
SEC("tc_vlan_push")
int _tc_progA(struct __sk_buff *ctx)
{
bpf_skb_vlan_push(ctx, bpf_htons(ETH_P_8021Q), TESTVLAN);
return TC_ACT_OK;
}
/*
Commands to setup TC to use above bpf prog:
export ROOTDEV=ixgbe2
export FILE=xdp_vlan01_kern.o
# Re-attach clsact to clear/flush existing role
tc qdisc del dev $ROOTDEV clsact 2> /dev/null ;\
tc qdisc add dev $ROOTDEV clsact
# Attach BPF prog EGRESS
tc filter add dev $ROOTDEV egress \
prio 1 handle 1 bpf da obj $FILE sec tc_vlan_push
tc filter show dev $ROOTDEV egress
*/

View File

@@ -0,0 +1,195 @@
#!/bin/bash
TESTNAME=xdp_vlan
usage() {
echo "Testing XDP + TC eBPF VLAN manipulations: $TESTNAME"
echo ""
echo "Usage: $0 [-vfh]"
echo " -v | --verbose : Verbose"
echo " --flush : Flush before starting (e.g. after --interactive)"
echo " --interactive : Keep netns setup running after test-run"
echo ""
}
cleanup()
{
local status=$?
if [ "$status" = "0" ]; then
echo "selftests: $TESTNAME [PASS]";
else
echo "selftests: $TESTNAME [FAILED]";
fi
if [ -n "$INTERACTIVE" ]; then
echo "Namespace setup still active explore with:"
echo " ip netns exec ns1 bash"
echo " ip netns exec ns2 bash"
exit $status
fi
set +e
ip link del veth1 2> /dev/null
ip netns del ns1 2> /dev/null
ip netns del ns2 2> /dev/null
}
# Using external program "getopt" to get --long-options
OPTIONS=$(getopt -o hvfi: \
--long verbose,flush,help,interactive,debug -- "$@")
if (( $? != 0 )); then
usage
echo "selftests: $TESTNAME [FAILED] Error calling getopt, unknown option?"
exit 2
fi
eval set -- "$OPTIONS"
## --- Parse command line arguments / parameters ---
while true; do
case "$1" in
-v | --verbose)
export VERBOSE=yes
shift
;;
-i | --interactive | --debug )
INTERACTIVE=yes
shift
;;
-f | --flush )
cleanup
shift
;;
-- )
shift
break
;;
-h | --help )
usage;
echo "selftests: $TESTNAME [SKIP] usage help info requested"
exit 0
;;
* )
shift
break
;;
esac
done
if [ "$EUID" -ne 0 ]; then
echo "selftests: $TESTNAME [FAILED] need root privileges"
exit 1
fi
ip link set dev lo xdp off 2>/dev/null > /dev/null
if [ $? -ne 0 ];then
echo "selftests: $TESTNAME [SKIP] need ip xdp support"
exit 0
fi
# Interactive mode likely require us to cleanup netns
if [ -n "$INTERACTIVE" ]; then
ip link del veth1 2> /dev/null
ip netns del ns1 2> /dev/null
ip netns del ns2 2> /dev/null
fi
# Exit on failure
set -e
# Some shell-tools dependencies
which ip > /dev/null
which tc > /dev/null
which ethtool > /dev/null
# Make rest of shell verbose, showing comments as doc/info
if [ -n "$VERBOSE" ]; then
set -v
fi
# Create two namespaces
ip netns add ns1
ip netns add ns2
# Run cleanup if failing or on kill
trap cleanup 0 2 3 6 9
# Create veth pair
ip link add veth1 type veth peer name veth2
# Move veth1 and veth2 into the respective namespaces
ip link set veth1 netns ns1
ip link set veth2 netns ns2
# NOTICE: XDP require VLAN header inside packet payload
# - Thus, disable VLAN offloading driver features
# - For veth REMEMBER TX side VLAN-offload
#
# Disable rx-vlan-offload (mostly needed on ns1)
ip netns exec ns1 ethtool -K veth1 rxvlan off
ip netns exec ns2 ethtool -K veth2 rxvlan off
#
# Disable tx-vlan-offload (mostly needed on ns2)
ip netns exec ns2 ethtool -K veth2 txvlan off
ip netns exec ns1 ethtool -K veth1 txvlan off
export IPADDR1=100.64.41.1
export IPADDR2=100.64.41.2
# In ns1/veth1 add IP-addr on plain net_device
ip netns exec ns1 ip addr add ${IPADDR1}/24 dev veth1
ip netns exec ns1 ip link set veth1 up
# In ns2/veth2 create VLAN device
export VLAN=4011
export DEVNS2=veth2
ip netns exec ns2 ip link add link $DEVNS2 name $DEVNS2.$VLAN type vlan id $VLAN
ip netns exec ns2 ip addr add ${IPADDR2}/24 dev $DEVNS2.$VLAN
ip netns exec ns2 ip link set $DEVNS2 up
ip netns exec ns2 ip link set $DEVNS2.$VLAN up
# Bringup lo in netns (to avoids confusing people using --interactive)
ip netns exec ns1 ip link set lo up
ip netns exec ns2 ip link set lo up
# At this point, the hosts cannot reach each-other,
# because ns2 are using VLAN tags on the packets.
ip netns exec ns2 sh -c 'ping -W 1 -c 1 100.64.41.1 || echo "Okay ping fails"'
# Now we can use the test_xdp_vlan.c program to pop/push these VLAN tags
# ----------------------------------------------------------------------
# In ns1: ingress use XDP to remove VLAN tags
export DEVNS1=veth1
export FILE=test_xdp_vlan.o
# First test: Remove VLAN by setting VLAN ID 0, using "xdp_vlan_change"
export XDP_PROG=xdp_vlan_change
ip netns exec ns1 ip link set $DEVNS1 xdp object $FILE section $XDP_PROG
# In ns1: egress use TC to add back VLAN tag 4011
# (del cmd)
# tc qdisc del dev $DEVNS1 clsact 2> /dev/null
#
ip netns exec ns1 tc qdisc add dev $DEVNS1 clsact
ip netns exec ns1 tc filter add dev $DEVNS1 egress \
prio 1 handle 1 bpf da obj $FILE sec tc_vlan_push
# Now the namespaces can reach each-other, test with ping:
ip netns exec ns2 ping -W 2 -c 3 $IPADDR1
ip netns exec ns1 ping -W 2 -c 3 $IPADDR2
# Second test: Replace xdp prog, that fully remove vlan header
#
# Catch kernel bug for generic-XDP, that does didn't allow us to
# remove a VLAN header, because skb->protocol still contain VLAN
# ETH_P_8021Q indication, and this cause overwriting of our changes.
#
export XDP_PROG=xdp_vlan_remove_outer2
ip netns exec ns1 ip link set $DEVNS1 xdp off
ip netns exec ns1 ip link set $DEVNS1 xdp object $FILE section $XDP_PROG
# Now the namespaces should still be able reach each-other, test with ping:
ip netns exec ns2 ping -W 2 -c 3 $IPADDR1
ip netns exec ns1 ping -W 2 -c 3 $IPADDR2