From f3c93a93b564d25cab715cd71a6c3417e77b19f3 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Tue, 21 Jul 2020 21:38:04 -0700
Subject: [PATCH 01/73] tools/bpftool: Strip BPF .o files before skeleton generation

Strip away DWARF info from .bpf.o files before generating BPF skeletons.
This reduces bpftool binary size from 3.43MB to 2.58MB.

Signed-off-by: Andrii Nakryiko
Signed-off-by: Alexei Starovoitov
Acked-by: Quentin Monnet
Link: https://lore.kernel.org/bpf/20200722043804.2373298-1-andriin@fb.com
---
 tools/bpf/bpftool/Makefile | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile
index 51bd520ed437..8462690a039b 100644
--- a/tools/bpf/bpftool/Makefile
+++ b/tools/bpf/bpftool/Makefile
@@ -59,6 +59,7 @@ endif
 INSTALL ?= install
 RM ?= rm -f
 CLANG ?= clang
+LLVM_STRIP ?= llvm-strip

 FEATURE_USER = .bpftool
 FEATURE_TESTS = libbfd disassembler-four-args reallocarray zlib libcap \
@@ -147,7 +148,7 @@ $(OUTPUT)%.bpf.o: skeleton/%.bpf.c $(OUTPUT)vmlinux.h $(LIBBPF)
 		-I$(srctree)/tools/include/uapi/ \
 		-I$(LIBBPF_PATH) \
 		-I$(srctree)/tools/lib \
-		-g -O2 -target bpf -c $< -o $@
+		-g -O2 -target bpf -c $< -o $@ && $(LLVM_STRIP) -g $@

 $(OUTPUT)%.skel.h: $(OUTPUT)%.bpf.o $(BPFTOOL_BOOTSTRAP)
 	$(QUIET_GEN)$(BPFTOOL_BOOTSTRAP) gen skeleton $< > $@

From c8a2983c4df06c4cd11bea6abfa7e2947bd3113b Mon Sep 17 00:00:00 2001
From: Jakub Sitnicki
Date: Wed, 22 Jul 2020 18:17:19 +0200
Subject: [PATCH 02/73] udp: Don't discard reuseport selection when group has connections

When BPF socket lookup prog selects a socket that belongs to a reuseport
group, and the reuseport group has connected sockets in it, the socket
selected by reuseport will be discarded, and the socket returned by BPF
socket lookup will be used instead.

Modify this behavior so that the socket selected by reuseport running
after BPF socket lookup always gets used. Ignore the fact that the
reuseport group might have connections, because that is only relevant
when scoring sockets during regular hashtable-based lookup.

Fixes: 72f7e9440e9b ("udp: Run SK_LOOKUP BPF program on socket lookup")
Fixes: 6d4201b1386b ("udp6: Run SK_LOOKUP BPF program on socket lookup")
Signed-off-by: Jakub Sitnicki
Signed-off-by: Alexei Starovoitov
Acked-by: Kuniyuki Iwashima
Link: https://lore.kernel.org/bpf/20200722161720.940831-2-jakub@cloudflare.com
---
 net/ipv6/udp.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 5530c9dcb61c..c394e674f486 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -155,9 +155,6 @@ static struct sock *lookup_reuseport(struct net *net, struct sock *sk,
 		hash = udp6_ehashfn(net, daddr, hnum, saddr, sport);
 		reuse_sk = reuseport_select_sock(sk, hash, skb,
 						 sizeof(struct udphdr));
-		/* Fall back to scoring if group has connections */
-		if (reuseport_has_conns(sk, false))
-			return NULL;
 	}
 	return reuse_sk;
 }

From 14fc6bd6b79c430f615500d0fe6cea4722110db8 Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Thu, 23 Jul 2020 11:41:09 -0700
Subject: [PATCH 03/73] bpf: Refactor bpf_iter_reg to have separate seq_info member

There is no functionality change for this patch. Struct bpf_iter_reg is
used to register a bpf_iter target, which includes information for
prog_load, link_create, and seq_file creation. This patch puts the
fields related to seq_file creation into a separate structure.
This will be useful for map elements iterator where one iterator covers different map types and different map types may have different seq_ops, init/fini private_data function and private_data size. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184109.590030-1-yhs@fb.com --- include/linux/bpf.h | 17 ++++++++++------- kernel/bpf/bpf_iter.c | 12 ++++++------ kernel/bpf/map_iter.c | 8 ++++++-- kernel/bpf/prog_iter.c | 8 ++++++-- kernel/bpf/task_iter.c | 16 ++++++++++++---- net/ipv4/tcp_ipv4.c | 8 ++++++-- net/ipv4/udp.c | 8 ++++++-- net/ipv6/route.c | 8 ++++++-- net/netlink/af_netlink.c | 8 ++++++-- 9 files changed, 64 insertions(+), 29 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 72221aea1c60..127067f71fd4 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -37,6 +37,15 @@ struct seq_operations; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; +typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); +typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); +struct bpf_iter_seq_info { + const struct seq_operations *seq_ops; + bpf_iter_init_seq_priv_t init_seq_private; + bpf_iter_fini_seq_priv_t fini_seq_private; + u32 seq_priv_size; +}; + /* map is generic key/value storage optionally accesible by eBPF programs */ struct bpf_map_ops { /* funcs callable from userspace (via syscall) */ @@ -1189,18 +1198,12 @@ int bpf_obj_get_user(const char __user *pathname, int flags); extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } -typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); -typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); - #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; - const struct seq_operations *seq_ops; - bpf_iter_init_seq_priv_t init_seq_private; - bpf_iter_fini_seq_priv_t fini_seq_private; - u32 seq_priv_size; u32 ctx_arg_info_size; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; + const struct bpf_iter_seq_info *seq_info; }; struct bpf_iter_meta { diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index dd612b80b9fe..5b2387d6aa1f 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -218,8 +218,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->reg_info->fini_seq_private) - iter_priv->tinfo->reg_info->fini_seq_private(seq->private); + if (iter_priv->tinfo->reg_info->seq_info->fini_seq_private) + iter_priv->tinfo->reg_info->seq_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -433,16 +433,16 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->reg_info->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->reg_info->seq_ops, + tinfo->reg_info->seq_info->seq_priv_size; + priv_data = __seq_open_private(file, tinfo->reg_info->seq_info->seq_ops, total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->reg_info->init_seq_private) { - err = tinfo->reg_info->init_seq_private(priv_data->target_private); + if (tinfo->reg_info->seq_info->init_seq_private) { + err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private); if (err) goto release_seq_file; } diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 
5926c76d854e..1a69241fb1e2 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -81,17 +81,21 @@ static const struct seq_operations bpf_map_seq_ops = { BTF_ID_LIST(btf_bpf_map_id) BTF_ID(struct, bpf_map) -static struct bpf_iter_reg bpf_map_reg_info = { - .target = "bpf_map", +static const struct bpf_iter_seq_info bpf_map_seq_info = { .seq_ops = &bpf_map_seq_ops, .init_seq_private = NULL, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct bpf_iter_seq_map_info), +}; + +static struct bpf_iter_reg bpf_map_reg_info = { + .target = "bpf_map", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_map, map), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &bpf_map_seq_info, }; static int __init bpf_map_iter_init(void) diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c index 6541b577d69f..53a73c841c13 100644 --- a/kernel/bpf/prog_iter.c +++ b/kernel/bpf/prog_iter.c @@ -81,17 +81,21 @@ static const struct seq_operations bpf_prog_seq_ops = { BTF_ID_LIST(btf_bpf_prog_id) BTF_ID(struct, bpf_prog) -static struct bpf_iter_reg bpf_prog_reg_info = { - .target = "bpf_prog", +static const struct bpf_iter_seq_info bpf_prog_seq_info = { .seq_ops = &bpf_prog_seq_ops, .init_seq_private = NULL, .fini_seq_private = NULL, .seq_priv_size = sizeof(struct bpf_iter_seq_prog_info), +}; + +static struct bpf_iter_reg bpf_prog_reg_info = { + .target = "bpf_prog", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__bpf_prog, prog), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &bpf_prog_seq_info, }; static int __init bpf_prog_iter_init(void) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 1039e52ebd8b..6d9cd23869bf 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -319,25 +319,32 @@ BTF_ID_LIST(btf_task_file_ids) BTF_ID(struct, task_struct) BTF_ID(struct, file) -static struct bpf_iter_reg task_reg_info = { - .target = "task", +static const struct bpf_iter_seq_info task_seq_info = { .seq_ops = &task_seq_ops, .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_info), +}; + +static struct bpf_iter_reg task_reg_info = { + .target = "task", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__task, task), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &task_seq_info, }; -static struct bpf_iter_reg task_file_reg_info = { - .target = "task_file", +static const struct bpf_iter_seq_info task_file_seq_info = { .seq_ops = &task_file_seq_ops, .init_seq_private = init_seq_pidns, .fini_seq_private = fini_seq_pidns, .seq_priv_size = sizeof(struct bpf_iter_seq_task_file_info), +}; + +static struct bpf_iter_reg task_file_reg_info = { + .target = "task_file", .ctx_arg_info_size = 2, .ctx_arg_info = { { offsetof(struct bpf_iter__task_file, task), @@ -345,6 +352,7 @@ static struct bpf_iter_reg task_file_reg_info = { { offsetof(struct bpf_iter__task_file, file), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &task_file_seq_info, }; static int __init task_iter_init(void) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index f8913923a6c0..cb288fdcf2ca 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2947,17 +2947,21 @@ static void bpf_iter_fini_tcp(void *priv_data) bpf_iter_fini_seq_net(priv_data); } -static struct bpf_iter_reg tcp_reg_info = { - .target = "tcp", +static const struct bpf_iter_seq_info tcp_seq_info = { .seq_ops = &bpf_iter_tcp_seq_ops, .init_seq_private = bpf_iter_init_tcp, .fini_seq_private = bpf_iter_fini_tcp, .seq_priv_size = sizeof(struct 
tcp_iter_state), +}; + +static struct bpf_iter_reg tcp_reg_info = { + .target = "tcp", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__tcp, sk_common), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &tcp_seq_info, }; static void __init bpf_iter_register(void) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 0fb5e4ea133f..1bc50ec2caef 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3208,17 +3208,21 @@ static void bpf_iter_fini_udp(void *priv_data) bpf_iter_fini_seq_net(priv_data); } -static struct bpf_iter_reg udp_reg_info = { - .target = "udp", +static const struct bpf_iter_seq_info udp_seq_info = { .seq_ops = &bpf_iter_udp_seq_ops, .init_seq_private = bpf_iter_init_udp, .fini_seq_private = bpf_iter_fini_udp, .seq_priv_size = sizeof(struct udp_iter_state), +}; + +static struct bpf_iter_reg udp_reg_info = { + .target = "udp", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__udp, udp_sk), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &udp_seq_info, }; static void __init bpf_iter_register(void) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 33f5efbad0a9..8bfc57b0802a 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -6427,17 +6427,21 @@ DEFINE_BPF_ITER_FUNC(ipv6_route, struct bpf_iter_meta *meta, struct fib6_info *r BTF_ID_LIST(btf_fib6_info_id) BTF_ID(struct, fib6_info) -static struct bpf_iter_reg ipv6_route_reg_info = { - .target = "ipv6_route", +static const struct bpf_iter_seq_info ipv6_route_seq_info = { .seq_ops = &ipv6_route_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct ipv6_route_iter), +}; + +static struct bpf_iter_reg ipv6_route_reg_info = { + .target = "ipv6_route", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__ipv6_route, rt), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &ipv6_route_seq_info, }; static int __init bpf_iter_register(void) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index d8921b833744..b5f30d7d30d0 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2807,17 +2807,21 @@ static const struct rhashtable_params netlink_rhashtable_params = { BTF_ID_LIST(btf_netlink_sock_id) BTF_ID(struct, netlink_sock) -static struct bpf_iter_reg netlink_reg_info = { - .target = "netlink", +static const struct bpf_iter_seq_info netlink_seq_info = { .seq_ops = &netlink_seq_ops, .init_seq_private = bpf_iter_init_seq_net, .fini_seq_private = bpf_iter_fini_seq_net, .seq_priv_size = sizeof(struct nl_seq_iter), +}; + +static struct bpf_iter_reg netlink_reg_info = { + .target = "netlink", .ctx_arg_info_size = 1, .ctx_arg_info = { { offsetof(struct bpf_iter__netlink, sk), PTR_TO_BTF_ID_OR_NULL }, }, + .seq_info = &netlink_seq_info, }; static int __init bpf_iter_register(void) From f9c792729581bd8b8473af163e8ab426c2c61d89 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:10 -0700 Subject: [PATCH 04/73] bpf: Refactor to provide aux info to bpf_iter_init_seq_priv_t This patch refactored target bpf_iter_init_seq_priv_t callback function to accept additional information. This will be needed in later patches for map element targets since a particular map should be passed to traverse elements for that particular map. In the future, other information may be passed to target as well, e.g., pid, cgroup id, etc. to customize the iterator. 
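For illustration only (this sketch is not part of the patch, and the
function and struct names are hypothetical): once a later patch in this
series adds a map pointer to struct bpf_iter_aux_info, a map-element
target could consume the new aux argument roughly as follows.

	struct my_map_seq_info {
		struct bpf_map *map;
	};

	static int my_map_init_seq_private(void *priv_data,
					   struct bpf_iter_aux_info *aux)
	{
		struct my_map_seq_info *info = priv_data;

		/* aux->map is the map chosen at link creation time
		 * (added to bpf_iter_aux_info by a later patch in this
		 * series). In this patch, aux is still passed as NULL
		 * from prepare_seq_file(), so existing targets such as
		 * bpf_iter_init_seq_net() simply ignore the argument.
		 */
		info->map = aux->map;
		return 0;
	}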
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184110.590156-1-yhs@fb.com --- fs/proc/proc_net.c | 2 +- include/linux/bpf.h | 7 ++++++- include/linux/proc_fs.h | 3 ++- kernel/bpf/bpf_iter.c | 2 +- kernel/bpf/task_iter.c | 2 +- net/ipv4/tcp_ipv4.c | 4 ++-- net/ipv4/udp.c | 4 ++-- 7 files changed, 15 insertions(+), 9 deletions(-) diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index dba63b2429f0..ed8a6306990c 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -98,7 +98,7 @@ static const struct proc_ops proc_net_seq_ops = { .proc_release = seq_release_net, }; -int bpf_iter_init_seq_net(void *priv_data) +int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux) { #ifdef CONFIG_NET_NS struct seq_net_private *p = priv_data; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 127067f71fd4..ef52717336cf 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -33,11 +33,13 @@ struct btf; struct btf_type; struct exception_table_entry; struct seq_operations; +struct bpf_iter_aux_info; extern struct idr btf_idr; extern spinlock_t btf_idr_lock; -typedef int (*bpf_iter_init_seq_priv_t)(void *private_data); +typedef int (*bpf_iter_init_seq_priv_t)(void *private_data, + struct bpf_iter_aux_info *aux); typedef void (*bpf_iter_fini_seq_priv_t)(void *private_data); struct bpf_iter_seq_info { const struct seq_operations *seq_ops; @@ -1198,6 +1200,9 @@ int bpf_obj_get_user(const char __user *pathname, int flags); extern int bpf_iter_ ## target(args); \ int __init bpf_iter_ ## target(args) { return 0; } +struct bpf_iter_aux_info { +}; + #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index d1eed1b43651..2df965cd0974 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -133,7 +133,8 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo void *data); extern struct pid *tgid_pidfd_to_pid(const struct file *file); -extern int bpf_iter_init_seq_net(void *priv_data); +struct bpf_iter_aux_info; +extern int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux); extern void bpf_iter_fini_seq_net(void *priv_data); #ifdef CONFIG_PROC_PID_ARCH_STATUS diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 5b2387d6aa1f..8fa94cb1b5a0 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -442,7 +442,7 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) } if (tinfo->reg_info->seq_info->init_seq_private) { - err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private); + err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private, NULL); if (err) goto release_seq_file; } diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 6d9cd23869bf..232df29793e9 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -293,7 +293,7 @@ static void task_file_seq_stop(struct seq_file *seq, void *v) } } -static int init_seq_pidns(void *priv_data) +static int init_seq_pidns(void *priv_data, struct bpf_iter_aux_info *aux) { struct bpf_iter_seq_task_common *common = priv_data; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index cb288fdcf2ca..5084333b5ab6 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2921,7 +2921,7 @@ static struct pernet_operations __net_initdata tcp_sk_ops = { DEFINE_BPF_ITER_FUNC(tcp, struct bpf_iter_meta *meta, struct sock_common *sk_common, uid_t uid) 
-static int bpf_iter_init_tcp(void *priv_data) +static int bpf_iter_init_tcp(void *priv_data, struct bpf_iter_aux_info *aux) { struct tcp_iter_state *st = priv_data; struct tcp_seq_afinfo *afinfo; @@ -2933,7 +2933,7 @@ static int bpf_iter_init_tcp(void *priv_data) afinfo->family = AF_UNSPEC; st->bpf_seq_afinfo = afinfo; - ret = bpf_iter_init_seq_net(priv_data); + ret = bpf_iter_init_seq_net(priv_data, aux); if (ret) kfree(afinfo); return ret; diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 1bc50ec2caef..7ce31beccfc2 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -3181,7 +3181,7 @@ static struct pernet_operations __net_initdata udp_sysctl_ops = { DEFINE_BPF_ITER_FUNC(udp, struct bpf_iter_meta *meta, struct udp_sock *udp_sk, uid_t uid, int bucket) -static int bpf_iter_init_udp(void *priv_data) +static int bpf_iter_init_udp(void *priv_data, struct bpf_iter_aux_info *aux) { struct udp_iter_state *st = priv_data; struct udp_seq_afinfo *afinfo; @@ -3194,7 +3194,7 @@ static int bpf_iter_init_udp(void *priv_data) afinfo->family = AF_UNSPEC; afinfo->udp_table = &udp_table; st->bpf_seq_afinfo = afinfo; - ret = bpf_iter_init_seq_net(priv_data); + ret = bpf_iter_init_seq_net(priv_data, aux); if (ret) kfree(afinfo); return ret; From 86176a1821a1237ad163d312ba0f2d7598757894 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Wed, 22 Jul 2020 18:17:20 +0200 Subject: [PATCH 05/73] selftests/bpf: Test BPF socket lookup and reuseport with connections Cover the case when BPF socket lookup returns a socket that belongs to a reuseport group, and the reuseport group contains connected UDP sockets. Ensure that the presence of connected UDP sockets in reuseport group does not affect the socket lookup result. Socket selected by reuseport should always be used as result in such case. Signed-off-by: Jakub Sitnicki Signed-off-by: Alexei Starovoitov Acked-by: Kuniyuki Iwashima Link: https://lore.kernel.org/bpf/20200722161720.940831-3-jakub@cloudflare.com --- .../selftests/bpf/prog_tests/sk_lookup.c | 54 ++++++++++++++++++- 1 file changed, 53 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index f1784ae4565a..9bbd2b2b7630 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -74,6 +74,7 @@ struct test { struct inet_addr connect_to; struct inet_addr listen_at; enum server accept_on; + bool reuseport_has_conns; /* Add a connected socket to reuseport group */ }; static __u32 duration; /* for CHECK macro */ @@ -559,7 +560,8 @@ close: static void run_lookup_prog(const struct test *t) { - int client_fd, server_fds[MAX_SERVERS] = { -1 }; + int server_fds[MAX_SERVERS] = { -1 }; + int client_fd, reuse_conn_fd = -1; struct bpf_link *lookup_link; int i, err; @@ -583,6 +585,32 @@ static void run_lookup_prog(const struct test *t) break; } + /* Regular UDP socket lookup with reuseport behaves + * differently when reuseport group contains connected + * sockets. Check that adding a connected UDP socket to the + * reuseport group does not affect how reuseport works with + * BPF socket lookup. 
+	 */
+	if (t->reuseport_has_conns) {
+		struct sockaddr_storage addr = {};
+		socklen_t len = sizeof(addr);
+
+		/* Add an extra socket to reuseport group */
+		reuse_conn_fd = make_server(t->sotype, t->listen_at.ip,
+					    t->listen_at.port,
+					    t->reuseport_prog);
+		if (reuse_conn_fd < 0)
+			goto close;
+
+		/* Connect the extra socket to itself */
+		err = getsockname(reuse_conn_fd, (void *)&addr, &len);
+		if (CHECK(err, "getsockname", "errno %d\n", errno))
+			goto close;
+		err = connect(reuse_conn_fd, (void *)&addr, len);
+		if (CHECK(err, "connect", "errno %d\n", errno))
+			goto close;
+	}
+
 	client_fd = make_client(t->sotype, t->connect_to.ip, t->connect_to.port);
 	if (client_fd < 0)
 		goto close;
@@ -594,6 +622,8 @@ static void run_lookup_prog(const struct test *t)
 	close(client_fd);
 close:
+	if (reuse_conn_fd != -1)
+		close(reuse_conn_fd);
 	for (i = 0; i < ARRAY_SIZE(server_fds); i++) {
 		if (server_fds[i] != -1)
 			close(server_fds[i]);
@@ -710,6 +740,17 @@ static void test_redirect_lookup(struct test_sk_lookup *skel)
 		.listen_at = { INT_IP4, INT_PORT },
 		.accept_on = SERVER_B,
 	},
+	{
+		.desc = "UDP IPv4 redir and reuseport with conns",
+		.lookup_prog = skel->progs.select_sock_a,
+		.reuseport_prog = skel->progs.select_sock_b,
+		.sock_map = skel->maps.redir_map,
+		.sotype = SOCK_DGRAM,
+		.connect_to = { EXT_IP4, EXT_PORT },
+		.listen_at = { INT_IP4, INT_PORT },
+		.accept_on = SERVER_B,
+		.reuseport_has_conns = true,
+	},
 	{
 		.desc = "UDP IPv4 redir skip reuseport",
 		.lookup_prog = skel->progs.select_sock_a_no_reuseport,
@@ -754,6 +795,17 @@ static void test_redirect_lookup(struct test_sk_lookup *skel)
 		.listen_at = { INT_IP6, INT_PORT },
 		.accept_on = SERVER_B,
 	},
+	{
+		.desc = "UDP IPv6 redir and reuseport with conns",
+		.lookup_prog = skel->progs.select_sock_a,
+		.reuseport_prog = skel->progs.select_sock_b,
+		.sock_map = skel->maps.redir_map,
+		.sotype = SOCK_DGRAM,
+		.connect_to = { EXT_IP6, EXT_PORT },
+		.listen_at = { INT_IP6, INT_PORT },
+		.accept_on = SERVER_B,
+		.reuseport_has_conns = true,
+	},
 	{
 		.desc = "UDP IPv6 redir skip reuseport",
 		.lookup_prog = skel->progs.select_sock_a_no_reuseport,

From afbf21dce668ef59482037596eaffbe5041e094c Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Thu, 23 Jul 2020 11:41:11 -0700
Subject: [PATCH 06/73] bpf: Support readonly/readwrite buffers in verifier

Readonly and readwrite buffer register states are introduced. In total,
four states, PTR_TO_RDONLY_BUF[_OR_NULL] and PTR_TO_RDWR_BUF[_OR_NULL],
are supported. As suggested by their respective names,
PTR_TO_RDONLY_BUF[_OR_NULL] are for readonly buffers and
PTR_TO_RDWR_BUF[_OR_NULL] for read/write buffers.

These new register states will be used by the later bpf map element
iterator.

The new register states share some similarity with PTR_TO_TP_BUFFER,
in that the verifier calculates the accessed buffer size during
verification time. The accessed buffer size will later be compared to
other metrics during attach/link_create time.

Similar to reg_state PTR_TO_BTF_ID_OR_NULL in bpf iterator programs,
PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL reg_types can be
set at prog->aux->bpf_ctx_arg_aux, and the bpf verifier will retrieve
the values during btf_ctx_access(). The later bpf map element iterator
implementation will show how such information is assigned during
target registration time.

The verifier is also enhanced such that PTR_TO_RDONLY_BUF can be passed
to an ARG_PTR_TO_MEM[_OR_NULL] helper argument, and PTR_TO_RDWR_BUF can
be passed to ARG_PTR_TO_MEM[_OR_NULL] or ARG_PTR_TO_UNINIT_MEM.
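To make the new states concrete, here is a sketch of a bpf_iter program
against the map-element iterator context introduced later in this series
(patch 08). The SEC name matches that target; the program name and the
element types are illustrative, and the usual vmlinux.h/bpf_helpers.h
selftest scaffolding is assumed.

	#include "vmlinux.h"
	#include <bpf/bpf_helpers.h>

	char LICENSE[] SEC("license") = "GPL";

	SEC("iter/bpf_map_elem")
	int sketch_buf_access(struct bpf_iter__bpf_map_elem *ctx)
	{
		__u32 *key = ctx->key;		/* PTR_TO_RDONLY_BUF_OR_NULL */
		__u64 *val = ctx->value;	/* PTR_TO_RDWR_BUF_OR_NULL */

		/* NULL checks narrow the *_OR_NULL states to
		 * PTR_TO_RDONLY_BUF and PTR_TO_RDWR_BUF.
		 */
		if (!key || !val)
			return 0;

		if (*key == 0)	/* read of a readonly buffer: accepted */
			*val = 0;	/* write to a read/write buffer: accepted;
					 * a write to *key instead would be rejected
					 * by the "cannot write into rdonly_buf"
					 * check added in this patch.
					 */
		return 0;
	}

Each accepted access also grows max_rdonly_access/max_rdwr_access in
prog->aux, which is what the later attach/link_create check compares
against the actual map's key/value sizes.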
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184111.590274-1-yhs@fb.com --- include/linux/bpf.h | 6 +++ kernel/bpf/btf.c | 13 +++++++ kernel/bpf/verifier.c | 91 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 104 insertions(+), 6 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index ef52717336cf..f9c4bb08f616 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -353,6 +353,10 @@ enum bpf_reg_type { PTR_TO_BTF_ID_OR_NULL, /* reg points to kernel struct or NULL */ PTR_TO_MEM, /* reg points to valid memory region */ PTR_TO_MEM_OR_NULL, /* reg points to valid memory region or NULL */ + PTR_TO_RDONLY_BUF, /* reg points to a readonly buffer */ + PTR_TO_RDONLY_BUF_OR_NULL, /* reg points to a readonly buffer or NULL */ + PTR_TO_RDWR_BUF, /* reg points to a read/write buffer */ + PTR_TO_RDWR_BUF_OR_NULL, /* reg points to a read/write buffer or NULL */ }; /* The information passed from prog-specific *_is_valid_access @@ -694,6 +698,8 @@ struct bpf_prog_aux { u32 func_idx; /* 0 for non-func prog, the index in func array for func prog */ u32 attach_btf_id; /* in-kernel BTF type id to attach to */ u32 ctx_arg_info_size; + u32 max_rdonly_access; + u32 max_rdwr_access; const struct bpf_ctx_arg_aux *ctx_arg_info; struct bpf_prog *linked_prog; bool verifier_zext; /* Zero extensions has been inserted by verifier. */ diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index ee36b7f60936..0fd6bb62be3a 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -3806,6 +3806,19 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type, btf_kind_str[BTF_INFO_KIND(t->info)]); return false; } + + /* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */ + for (i = 0; i < prog->aux->ctx_arg_info_size; i++) { + const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i]; + + if (ctx_arg_info->offset == off && + (ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL || + ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) { + info->reg_type = ctx_arg_info->reg_type; + return true; + } + } + if (t->type == 0) /* This is a pointer to void. * It is the same as scalar from the verifier safety pov. 
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 9a6703bc3f36..8d6979db48d8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -409,7 +409,9 @@ static bool reg_type_may_be_null(enum bpf_reg_type type) type == PTR_TO_SOCK_COMMON_OR_NULL || type == PTR_TO_TCP_SOCK_OR_NULL || type == PTR_TO_BTF_ID_OR_NULL || - type == PTR_TO_MEM_OR_NULL; + type == PTR_TO_MEM_OR_NULL || + type == PTR_TO_RDONLY_BUF_OR_NULL || + type == PTR_TO_RDWR_BUF_OR_NULL; } static bool reg_may_point_to_spin_lock(const struct bpf_reg_state *reg) @@ -503,6 +505,10 @@ static const char * const reg_type_str[] = { [PTR_TO_BTF_ID_OR_NULL] = "ptr_or_null_", [PTR_TO_MEM] = "mem", [PTR_TO_MEM_OR_NULL] = "mem_or_null", + [PTR_TO_RDONLY_BUF] = "rdonly_buf", + [PTR_TO_RDONLY_BUF_OR_NULL] = "rdonly_buf_or_null", + [PTR_TO_RDWR_BUF] = "rdwr_buf", + [PTR_TO_RDWR_BUF_OR_NULL] = "rdwr_buf_or_null", }; static char slot_type_char[] = { @@ -2173,6 +2179,10 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_XDP_SOCK: case PTR_TO_BTF_ID: case PTR_TO_BTF_ID_OR_NULL: + case PTR_TO_RDONLY_BUF: + case PTR_TO_RDONLY_BUF_OR_NULL: + case PTR_TO_RDWR_BUF: + case PTR_TO_RDWR_BUF_OR_NULL: return true; default: return false; @@ -3052,14 +3062,15 @@ int check_ctx_reg(struct bpf_verifier_env *env, return 0; } -static int check_tp_buffer_access(struct bpf_verifier_env *env, - const struct bpf_reg_state *reg, - int regno, int off, int size) +static int __check_buffer_access(struct bpf_verifier_env *env, + const char *buf_info, + const struct bpf_reg_state *reg, + int regno, int off, int size) { if (off < 0) { verbose(env, - "R%d invalid tracepoint buffer access: off=%d, size=%d", - regno, off, size); + "R%d invalid %s buffer access: off=%d, size=%d", + regno, buf_info, off, size); return -EACCES; } if (!tnum_is_const(reg->var_off) || reg->var_off.value) { @@ -3071,12 +3082,45 @@ static int check_tp_buffer_access(struct bpf_verifier_env *env, regno, off, tn_buf); return -EACCES; } + + return 0; +} + +static int check_tp_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size) +{ + int err; + + err = __check_buffer_access(env, "tracepoint", reg, regno, off, size); + if (err) + return err; + if (off + size > env->prog->aux->max_tp_access) env->prog->aux->max_tp_access = off + size; return 0; } +static int check_buffer_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int regno, int off, int size, + bool zero_size_allowed, + const char *buf_info, + u32 *max_access) +{ + int err; + + err = __check_buffer_access(env, buf_info, reg, regno, off, size); + if (err) + return err; + + if (off + size > *max_access) + *max_access = off + size; + + return 0; +} + /* BPF architecture zero extends alu32 ops into 64-bit registesr */ static void zext_32_to_64(struct bpf_reg_state *reg) { @@ -3427,6 +3471,23 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } else if (reg->type == CONST_PTR_TO_MAP) { err = check_ptr_to_map_access(env, regs, regno, off, size, t, value_regno); + } else if (reg->type == PTR_TO_RDONLY_BUF) { + if (t == BPF_WRITE) { + verbose(env, "R%d cannot write into %s\n", + regno, reg_type_str[reg->type]); + return -EACCES; + } + err = check_buffer_access(env, reg, regno, off, size, "rdonly", + false, + &env->prog->aux->max_rdonly_access); + if (!err && value_regno >= 0) + mark_reg_unknown(env, regs, value_regno); + } else if (reg->type == PTR_TO_RDWR_BUF) { + err = check_buffer_access(env, reg, 
regno, off, size, "rdwr",
+					  false,
+					  &env->prog->aux->max_rdwr_access);
+		if (!err && t == BPF_READ && value_regno >= 0)
+			mark_reg_unknown(env, regs, value_regno);
 	} else {
 		verbose(env, "R%d invalid mem access '%s'\n", regno,
 			reg_type_str[reg->type]);
@@ -3668,6 +3729,18 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
 		return check_mem_region_access(env, regno, reg->off,
 					       access_size, reg->mem_size,
 					       zero_size_allowed);
+	case PTR_TO_RDONLY_BUF:
+		if (meta && meta->raw_mode)
+			return -EACCES;
+		return check_buffer_access(env, reg, regno, reg->off,
+					   access_size, zero_size_allowed,
+					   "rdonly",
+					   &env->prog->aux->max_rdonly_access);
+	case PTR_TO_RDWR_BUF:
+		return check_buffer_access(env, reg, regno, reg->off,
+					   access_size, zero_size_allowed,
+					   "rdwr",
+					   &env->prog->aux->max_rdwr_access);
 	default: /* scalar_value|ptr_to_stack or invalid ptr */
 		return check_stack_boundary(env, regno, access_size,
 					    zero_size_allowed, meta);
@@ -3933,6 +4006,8 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 arg,
 		else if (!type_is_pkt_pointer(type) &&
 			 type != PTR_TO_MAP_VALUE &&
 			 type != PTR_TO_MEM &&
+			 type != PTR_TO_RDONLY_BUF &&
+			 type != PTR_TO_RDWR_BUF &&
 			 type != expected_type)
 			goto err_type;
 		meta->raw_mode = arg_type == ARG_PTR_TO_UNINIT_MEM;
@@ -6806,6 +6881,10 @@ static void mark_ptr_or_null_reg(struct bpf_func_state *state,
 			reg->type = PTR_TO_BTF_ID;
 		} else if (reg->type == PTR_TO_MEM_OR_NULL) {
 			reg->type = PTR_TO_MEM;
+		} else if (reg->type == PTR_TO_RDONLY_BUF_OR_NULL) {
+			reg->type = PTR_TO_RDONLY_BUF;
+		} else if (reg->type == PTR_TO_RDWR_BUF_OR_NULL) {
+			reg->type = PTR_TO_RDWR_BUF;
 		}
 		if (is_null) {
 			/* We don't need id and ref_obj_id from this point

From 3f9969f2c040ba2ba635b6b5a7051f404bcc634d Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Wed, 22 Jul 2020 12:51:56 -0700
Subject: [PATCH 07/73] bpf: Fix pos computation for bpf_iter seq_ops->start()

Currently, the pos pointer in the bpf iterator map/task/task_file
seq_ops->start() is always incremented. This is incorrect. It should be
incremented only if *pos is 0 (for SEQ_START_TOKEN), since these start()
functions actually return the first real object. If *pos is not 0,
start() merely re-finds the object based on the state in seq->private,
without really advancing *pos.

This patch fixes the issue by incrementing *pos only if it is 0. Note
that the old *pos calculation, although not correct, does not affect the
correctness of bpf_iter, as the bpf_iter seq_file->read() does not
support llseek.

This patch also renames "mid" in the bpf_map iterator seq_file private
data to "map_id" for better clarity.
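Schematically, the corrected contract between start() and next() looks
like this (a sketch; find_curr_or_next() and advance() stand in for the
target-specific state handling kept in seq->private):

	static void *seq_start(struct seq_file *seq, loff_t *pos)
	{
		/* Re-find the current object from saved state. On a
		 * restart after a partial read, this does not advance
		 * the iterator.
		 */
		void *obj = find_curr_or_next(seq->private);

		if (!obj)
			return NULL;
		/* Only the very first start() consumes position 0
		 * (SEQ_START_TOKEN); any later start() returns an
		 * object that *pos already accounts for.
		 */
		if (*pos == 0)
			++*pos;
		return obj;
	}

	static void *seq_next(struct seq_file *seq, void *v, loff_t *pos)
	{
		++*pos;		/* next() really advances, so it always bumps *pos */
		advance(seq->private);
		return find_curr_or_next(seq->private);
	}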
Fixes: 6086d29def80 ("bpf: Add bpf_map iterator")
Fixes: eaaacd23910f ("bpf: Add task and task/file iterator targets")
Signed-off-by: Yonghong Song
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20200722195156.4029817-1-yhs@fb.com
---
 kernel/bpf/map_iter.c  | 16 ++++++----------
 kernel/bpf/task_iter.c |  6 ++++--
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c
index 8a7af11b411f..5926c76d854e 100644
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -7,7 +7,7 @@
 #include <linux/btf_ids.h>

 struct bpf_iter_seq_map_info {
-	u32 mid;
+	u32 map_id;
 };

 static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
@@ -15,27 +15,23 @@ static void *bpf_map_seq_start(struct seq_file *seq, loff_t *pos)
 	struct bpf_iter_seq_map_info *info = seq->private;
 	struct bpf_map *map;

-	map = bpf_map_get_curr_or_next(&info->mid);
+	map = bpf_map_get_curr_or_next(&info->map_id);
 	if (!map)
 		return NULL;

-	++*pos;
+	if (*pos == 0)
+		++*pos;
 	return map;
 }

 static void *bpf_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	struct bpf_iter_seq_map_info *info = seq->private;
-	struct bpf_map *map;

 	++*pos;
-	++info->mid;
+	++info->map_id;
 	bpf_map_put((struct bpf_map *)v);
-	map = bpf_map_get_curr_or_next(&info->mid);
-	if (!map)
-		return NULL;
-
-	return map;
+	return bpf_map_get_curr_or_next(&info->map_id);
 }

 struct bpf_iter__bpf_map {
diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c
index 2feecf095609..1039e52ebd8b 100644
--- a/kernel/bpf/task_iter.c
+++ b/kernel/bpf/task_iter.c
@@ -51,7 +51,8 @@ static void *task_seq_start(struct seq_file *seq, loff_t *pos)
 	if (!task)
 		return NULL;

-	++*pos;
+	if (*pos == 0)
+		++*pos;
 	return task;
 }

@@ -210,7 +211,8 @@ static void *task_file_seq_start(struct seq_file *seq, loff_t *pos)
 		return NULL;
 	}

-	++*pos;
+	if (*pos == 0)
+		++*pos;
 	info->task = task;
 	info->files = files;

From a5cbe05a6673b85bed2a63ffcfea6a96c6410cff Mon Sep 17 00:00:00 2001
From: Yonghong Song
Date: Thu, 23 Jul 2020 11:41:12 -0700
Subject: [PATCH 08/73] bpf: Implement bpf iterator for map elements

The bpf iterator for map elements is implemented. The bpf program will
receive four parameters:
  bpf_iter_meta *meta: the meta data
  bpf_map *map: the bpf_map whose elements are traversed
  void *key: the key of one element
  void *value: the value of the same element

Here, meta and map pointers are always valid, key has register type
PTR_TO_RDONLY_BUF_OR_NULL, and value has register type
PTR_TO_RDWR_BUF_OR_NULL. The kernel will track the access range of key
and value during verification time. Later, these values will be compared
against the values in the actual map to ensure all accesses are within
range.

A new field iter_seq_info is added to bpf_map_ops which is used to add
map type specific information, i.e., seq_ops, init/fini seq_file func
and seq_file private data size. Subsequent patches will have actual
implementation for bpf_map_ops->iter_seq_info.

In user space, BPF_ITER_LINK_MAP_FD needs to be specified in prog
attr->link_create.flags, which indicates that attr->link_create.target_fd
is a map_fd. The reason for such an explicit flag is for possible future
cases where one bpf iterator may allow more than one possible
customization, e.g., pid and cgroup id for task_file.

The current kernel-internal implementation only allows the target to
register at most one required bpf_iter_link_info.
To support the above case, optional bpf_iter_link_info's are needed, the target can be extended to register such link infos, and user provided link_info needs to match one of target supported ones. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184112.590360-1-yhs@fb.com --- include/linux/bpf.h | 16 +++++++ include/uapi/linux/bpf.h | 7 +++ kernel/bpf/bpf_iter.c | 85 +++++++++++++++++++++++++++------- kernel/bpf/map_iter.c | 30 +++++++++++- tools/include/uapi/linux/bpf.h | 7 +++ 5 files changed, 128 insertions(+), 17 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index f9c4bb08f616..4175cf1f4665 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -107,6 +107,9 @@ struct bpf_map_ops { /* BTF name and id of struct allocated by map_alloc */ const char * const map_btf_name; int *map_btf_id; + + /* bpf_iter info used to open a seq_file */ + const struct bpf_iter_seq_info *iter_seq_info; }; struct bpf_map_memory { @@ -1207,12 +1210,18 @@ int bpf_obj_get_user(const char __user *pathname, int flags); int __init bpf_iter_ ## target(args) { return 0; } struct bpf_iter_aux_info { + struct bpf_map *map; }; +typedef int (*bpf_iter_check_target_t)(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux); + #define BPF_ITER_CTX_ARG_MAX 2 struct bpf_iter_reg { const char *target; + bpf_iter_check_target_t check_target; u32 ctx_arg_info_size; + enum bpf_iter_link_info req_linfo; struct bpf_ctx_arg_aux ctx_arg_info[BPF_ITER_CTX_ARG_MAX]; const struct bpf_iter_seq_info *seq_info; }; @@ -1223,6 +1232,13 @@ struct bpf_iter_meta { u64 seq_num; }; +struct bpf_iter__bpf_map_elem { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(void *, key); + __bpf_md_ptr(void *, value); +}; + int bpf_iter_reg_target(const struct bpf_iter_reg *reg_info); void bpf_iter_unreg_target(const struct bpf_iter_reg *reg_info); bool bpf_iter_prog_supported(struct bpf_prog *prog); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 54d0c886e3ba..828c2f6438f2 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -246,6 +246,13 @@ enum bpf_link_type { MAX_BPF_LINK_TYPE, }; +enum bpf_iter_link_info { + BPF_ITER_LINK_UNSPEC = 0, + BPF_ITER_LINK_MAP_FD = 1, + + MAX_BPF_ITER_LINK_INFO, +}; + /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command * * NONE(default): No further bpf programs allowed in the subtree. 
diff --git a/kernel/bpf/bpf_iter.c b/kernel/bpf/bpf_iter.c index 8fa94cb1b5a0..363b9cafc2d8 100644 --- a/kernel/bpf/bpf_iter.c +++ b/kernel/bpf/bpf_iter.c @@ -14,11 +14,13 @@ struct bpf_iter_target_info { struct bpf_iter_link { struct bpf_link link; + struct bpf_iter_aux_info aux; struct bpf_iter_target_info *tinfo; }; struct bpf_iter_priv_data { struct bpf_iter_target_info *tinfo; + const struct bpf_iter_seq_info *seq_info; struct bpf_prog *prog; u64 session_id; u64 seq_num; @@ -35,7 +37,8 @@ static DEFINE_MUTEX(link_mutex); /* incremented on every opened seq_file */ static atomic64_t session_id; -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link); +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info); static void bpf_iter_inc_seq_num(struct seq_file *seq) { @@ -199,11 +202,25 @@ done: return copied; } +static const struct bpf_iter_seq_info * +__get_seq_info(struct bpf_iter_link *link) +{ + const struct bpf_iter_seq_info *seq_info; + + if (link->aux.map) { + seq_info = link->aux.map->ops->iter_seq_info; + if (seq_info) + return seq_info; + } + + return link->tinfo->reg_info->seq_info; +} + static int iter_open(struct inode *inode, struct file *file) { struct bpf_iter_link *link = inode->i_private; - return prepare_seq_file(file, link); + return prepare_seq_file(file, link, __get_seq_info(link)); } static int iter_release(struct inode *inode, struct file *file) @@ -218,8 +235,8 @@ static int iter_release(struct inode *inode, struct file *file) iter_priv = container_of(seq->private, struct bpf_iter_priv_data, target_private); - if (iter_priv->tinfo->reg_info->seq_info->fini_seq_private) - iter_priv->tinfo->reg_info->seq_info->fini_seq_private(seq->private); + if (iter_priv->seq_info->fini_seq_private) + iter_priv->seq_info->fini_seq_private(seq->private); bpf_prog_put(iter_priv->prog); seq->private = iter_priv; @@ -318,6 +335,11 @@ bool bpf_iter_prog_supported(struct bpf_prog *prog) static void bpf_iter_link_release(struct bpf_link *link) { + struct bpf_iter_link *iter_link = + container_of(link, struct bpf_iter_link, link); + + if (iter_link->aux.map) + bpf_map_put_with_uref(iter_link->aux.map); } static void bpf_iter_link_dealloc(struct bpf_link *link) @@ -370,14 +392,13 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) { struct bpf_link_primer link_primer; struct bpf_iter_target_info *tinfo; + struct bpf_iter_aux_info aux = {}; struct bpf_iter_link *link; + u32 prog_btf_id, target_fd; bool existed = false; - u32 prog_btf_id; + struct bpf_map *map; int err; - if (attr->link_create.target_fd || attr->link_create.flags) - return -EINVAL; - prog_btf_id = prog->aux->attach_btf_id; mutex_lock(&targets_mutex); list_for_each_entry(tinfo, &targets, list) { @@ -390,6 +411,13 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) if (!existed) return -ENOENT; + /* Make sure user supplied flags are target expected. 
*/ + target_fd = attr->link_create.target_fd; + if (attr->link_create.flags != tinfo->reg_info->req_linfo) + return -EINVAL; + if (!attr->link_create.flags && target_fd) + return -EINVAL; + link = kzalloc(sizeof(*link), GFP_USER | __GFP_NOWARN); if (!link) return -ENOMEM; @@ -403,21 +431,45 @@ int bpf_iter_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) return err; } + if (tinfo->reg_info->req_linfo == BPF_ITER_LINK_MAP_FD) { + map = bpf_map_get_with_uref(target_fd); + if (IS_ERR(map)) { + err = PTR_ERR(map); + goto cleanup_link; + } + + aux.map = map; + err = tinfo->reg_info->check_target(prog, &aux); + if (err) { + bpf_map_put_with_uref(map); + goto cleanup_link; + } + + link->aux.map = map; + } + return bpf_link_settle(&link_primer); + +cleanup_link: + bpf_link_cleanup(&link_primer); + return err; } static void init_seq_meta(struct bpf_iter_priv_data *priv_data, struct bpf_iter_target_info *tinfo, + const struct bpf_iter_seq_info *seq_info, struct bpf_prog *prog) { priv_data->tinfo = tinfo; + priv_data->seq_info = seq_info; priv_data->prog = prog; priv_data->session_id = atomic64_inc_return(&session_id); priv_data->seq_num = 0; priv_data->done_stop = false; } -static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) +static int prepare_seq_file(struct file *file, struct bpf_iter_link *link, + const struct bpf_iter_seq_info *seq_info) { struct bpf_iter_priv_data *priv_data; struct bpf_iter_target_info *tinfo; @@ -433,21 +485,21 @@ static int prepare_seq_file(struct file *file, struct bpf_iter_link *link) tinfo = link->tinfo; total_priv_dsize = offsetof(struct bpf_iter_priv_data, target_private) + - tinfo->reg_info->seq_info->seq_priv_size; - priv_data = __seq_open_private(file, tinfo->reg_info->seq_info->seq_ops, + seq_info->seq_priv_size; + priv_data = __seq_open_private(file, seq_info->seq_ops, total_priv_dsize); if (!priv_data) { err = -ENOMEM; goto release_prog; } - if (tinfo->reg_info->seq_info->init_seq_private) { - err = tinfo->reg_info->seq_info->init_seq_private(priv_data->target_private, NULL); + if (seq_info->init_seq_private) { + err = seq_info->init_seq_private(priv_data->target_private, &link->aux); if (err) goto release_seq_file; } - init_seq_meta(priv_data, tinfo, prog); + init_seq_meta(priv_data, tinfo, seq_info, prog); seq = file->private_data; seq->private = priv_data->target_private; @@ -463,6 +515,7 @@ release_prog: int bpf_iter_new_fd(struct bpf_link *link) { + struct bpf_iter_link *iter_link; struct file *file; unsigned int flags; int err, fd; @@ -481,8 +534,8 @@ int bpf_iter_new_fd(struct bpf_link *link) goto free_fd; } - err = prepare_seq_file(file, - container_of(link, struct bpf_iter_link, link)); + iter_link = container_of(link, struct bpf_iter_link, link); + err = prepare_seq_file(file, iter_link, __get_seq_info(iter_link)); if (err) goto free_file; diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 1a69241fb1e2..8a1f9b3355d0 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -98,10 +98,38 @@ static struct bpf_iter_reg bpf_map_reg_info = { .seq_info = &bpf_map_seq_info, }; +static int bpf_iter_check_map(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux) +{ + return -EINVAL; +} + +DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, + struct bpf_map *map, void *key, void *value) + +static const struct bpf_iter_reg bpf_map_elem_reg_info = { + .target = "bpf_map_elem", + .check_target = bpf_iter_check_map, + .req_linfo = BPF_ITER_LINK_MAP_FD, + .ctx_arg_info_size = 2, + .ctx_arg_info = 
{
+		{ offsetof(struct bpf_iter__bpf_map_elem, key),
+		  PTR_TO_RDONLY_BUF_OR_NULL },
+		{ offsetof(struct bpf_iter__bpf_map_elem, value),
+		  PTR_TO_RDWR_BUF_OR_NULL },
+	},
+};

 static int __init bpf_map_iter_init(void)
 {
+	int ret;
+
 	bpf_map_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_map_id;
-	return bpf_iter_reg_target(&bpf_map_reg_info);
+	ret = bpf_iter_reg_target(&bpf_map_reg_info);
+	if (ret)
+		return ret;
+
+	return bpf_iter_reg_target(&bpf_map_elem_reg_info);
 }

 late_initcall(bpf_map_iter_init);
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h
index 54d0c886e3ba..828c2f6438f2 100644
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -246,6 +246,13 @@ enum bpf_link_type {
 	MAX_BPF_LINK_TYPE,
 };

+enum bpf_iter_link_info {
+	BPF_ITER_LINK_UNSPEC = 0,
+	BPF_ITER_LINK_MAP_FD = 1,
+
+	MAX_BPF_ITER_LINK_INFO,
+};
+
 /* cgroup-bpf attach flags used in BPF_PROG_ATTACH command
  *
  * NONE(default): No further bpf programs allowed in the subtree.

From a228a64fc1e4428e2b96dc68e9ad3c447095c9e7 Mon Sep 17 00:00:00 2001
From: Alexei Starovoitov
Date: Wed, 1 Jul 2020 18:10:18 -0700
Subject: [PATCH 09/73] bpf: Add bpf_prog iterator

It's mostly a copy-paste of commit 6086d29def80 ("bpf: Add bpf_map
iterator") that is used to implement bpf_seq_file operations to traverse
all bpf programs.

v1->v2: Tweak to use build-time btf_id

Signed-off-by: Alexei Starovoitov
Acked-by: Yonghong Song
Acked-by: Daniel Borkmann
---
 include/linux/bpf.h    |   1 +
 kernel/bpf/Makefile    |   2 +-
 kernel/bpf/prog_iter.c | 103 +++++++++++++++++++++++++++++++++++++++++
 kernel/bpf/syscall.c   |  19 ++++++++
 4 files changed, 124 insertions(+), 1 deletion(-)
 create mode 100644 kernel/bpf/prog_iter.c

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index bae557ff2da8..72221aea1c60 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -1117,6 +1117,7 @@ int generic_map_delete_batch(struct bpf_map *map,
 			     const union bpf_attr *attr,
 			     union bpf_attr __user *uattr);
 struct bpf_map *bpf_map_get_curr_or_next(u32 *id);
+struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id);

 extern int sysctl_unprivileged_bpf_disabled;

diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 1131a921e1a6..e6eb9c0402da 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -2,7 +2,7 @@
 obj-y := core.o
 CFLAGS_core.o += $(call cc-disable-warning, override-init)

-obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o
+obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
 obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
 obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
 obj-$(CONFIG_BPF_SYSCALL) += disasm.o
diff --git a/kernel/bpf/prog_iter.c b/kernel/bpf/prog_iter.c
new file mode 100644
index 000000000000..6541b577d69f
--- /dev/null
+++ b/kernel/bpf/prog_iter.c
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* Copyright (c) 2020 Facebook */
+#include <linux/bpf.h>
+#include <linux/fs.h>
+#include <linux/filter.h>
+#include <linux/kernel.h>
+#include <linux/btf_ids.h>
+
+struct bpf_iter_seq_prog_info {
+	u32 prog_id;
+};
+
+static void *bpf_prog_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	struct bpf_iter_seq_prog_info *info = seq->private;
+	struct bpf_prog *prog;
+
+	prog = bpf_prog_get_curr_or_next(&info->prog_id);
+	if (!prog)
+		return NULL;
+
+	if (*pos == 0)
+		++*pos;
+	return prog;
+}
+
+static void *bpf_prog_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{ + struct bpf_iter_seq_prog_info *info = seq->private; + + ++*pos; + ++info->prog_id; + bpf_prog_put((struct bpf_prog *)v); + return bpf_prog_get_curr_or_next(&info->prog_id); +} + +struct bpf_iter__bpf_prog { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_prog *, prog); +}; + +DEFINE_BPF_ITER_FUNC(bpf_prog, struct bpf_iter_meta *meta, struct bpf_prog *prog) + +static int __bpf_prog_seq_show(struct seq_file *seq, void *v, bool in_stop) +{ + struct bpf_iter__bpf_prog ctx; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + ctx.meta = &meta; + ctx.prog = v; + meta.seq = seq; + prog = bpf_iter_get_info(&meta, in_stop); + if (prog) + ret = bpf_iter_run_prog(prog, &ctx); + + return ret; +} + +static int bpf_prog_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_prog_seq_show(seq, v, false); +} + +static void bpf_prog_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_prog_seq_show(seq, v, true); + else + bpf_prog_put((struct bpf_prog *)v); +} + +static const struct seq_operations bpf_prog_seq_ops = { + .start = bpf_prog_seq_start, + .next = bpf_prog_seq_next, + .stop = bpf_prog_seq_stop, + .show = bpf_prog_seq_show, +}; + +BTF_ID_LIST(btf_bpf_prog_id) +BTF_ID(struct, bpf_prog) + +static struct bpf_iter_reg bpf_prog_reg_info = { + .target = "bpf_prog", + .seq_ops = &bpf_prog_seq_ops, + .init_seq_private = NULL, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_prog_info), + .ctx_arg_info_size = 1, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_prog, prog), + PTR_TO_BTF_ID_OR_NULL }, + }, +}; + +static int __init bpf_prog_iter_init(void) +{ + bpf_prog_reg_info.ctx_arg_info[0].btf_id = *btf_bpf_prog_id; + return bpf_iter_reg_target(&bpf_prog_reg_info); +} + +late_initcall(bpf_prog_iter_init); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index d07417d17712..ee290b1f2d9e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3044,6 +3044,25 @@ again: return map; } +struct bpf_prog *bpf_prog_get_curr_or_next(u32 *id) +{ + struct bpf_prog *prog; + + spin_lock_bh(&prog_idr_lock); +again: + prog = idr_get_next(&prog_idr, id); + if (prog) { + prog = bpf_prog_inc_not_zero(prog); + if (IS_ERR(prog)) { + (*id)++; + goto again; + } + } + spin_unlock_bh(&prog_idr_lock); + + return prog; +} + #define BPF_PROG_GET_FD_BY_ID_LAST_FIELD prog_id struct bpf_prog *bpf_prog_by_id(u32 id) From d6c4503cc29638f328e1a6e6fefbdbda401c28fc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:14 -0700 Subject: [PATCH 10/73] bpf: Implement bpf iterator for hash maps The bpf iterators for hash, percpu hash, lru hash and lru percpu hash are implemented. During link time, bpf_iter_reg->check_target() will check map type and ensure the program access key/value region is within the map defined key/value size limit. For percpu hash and lru hash maps, the bpf program will receive values for all cpus. The map element bpf iterator infrastructure will prepare value properly before passing the value pointer to the bpf program. This patch set supports readonly map keys and read/write map values. It does not support deleting map elements, e.g., from hash tables. If there is a user case for this, the following mechanism can be used to support map deletion for hashtab, etc. - permit a new bpf program return value, e.g., 2, to let bpf iterator know the map element should be removed. - since bucket lock is taken, the map element will be queued. 
- once bucket lock is released after all elements under this bucket are traversed, all to-be-deleted map elements can be deleted. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184114.590470-1-yhs@fb.com --- kernel/bpf/hashtab.c | 194 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/map_iter.c | 24 +++++- 2 files changed, 217 insertions(+), 1 deletion(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index d4378d7d442b..024276787055 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1612,6 +1612,196 @@ htab_lru_map_lookup_and_delete_batch(struct bpf_map *map, true, false); } +struct bpf_iter_seq_hash_map_info { + struct bpf_map *map; + struct bpf_htab *htab; + void *percpu_value_buf; // non-zero means percpu hash + unsigned long flags; + u32 bucket_id; + u32 skip_elems; +}; + +static struct htab_elem * +bpf_hash_map_seq_find_next(struct bpf_iter_seq_hash_map_info *info, + struct htab_elem *prev_elem) +{ + const struct bpf_htab *htab = info->htab; + unsigned long flags = info->flags; + u32 skip_elems = info->skip_elems; + u32 bucket_id = info->bucket_id; + struct hlist_nulls_head *head; + struct hlist_nulls_node *n; + struct htab_elem *elem; + struct bucket *b; + u32 i, count; + + if (bucket_id >= htab->n_buckets) + return NULL; + + /* try to find next elem in the same bucket */ + if (prev_elem) { + /* no update/deletion on this bucket, prev_elem should be still valid + * and we won't skip elements. + */ + n = rcu_dereference_raw(hlist_nulls_next_rcu(&prev_elem->hash_node)); + elem = hlist_nulls_entry_safe(n, struct htab_elem, hash_node); + if (elem) + return elem; + + /* not found, unlock and go to the next bucket */ + b = &htab->buckets[bucket_id++]; + htab_unlock_bucket(htab, b, flags); + skip_elems = 0; + } + + for (i = bucket_id; i < htab->n_buckets; i++) { + b = &htab->buckets[i]; + flags = htab_lock_bucket(htab, b); + + count = 0; + head = &b->head; + hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + if (count >= skip_elems) { + info->flags = flags; + info->bucket_id = i; + info->skip_elems = count; + return elem; + } + count++; + } + + htab_unlock_bucket(htab, b, flags); + skip_elems = 0; + } + + info->bucket_id = i; + info->skip_elems = 0; + return NULL; +} + +static void *bpf_hash_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + struct htab_elem *elem; + + elem = bpf_hash_map_seq_find_next(info, NULL); + if (!elem) + return NULL; + + if (*pos == 0) + ++*pos; + return elem; +} + +static void *bpf_hash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + + ++*pos; + ++info->skip_elems; + return bpf_hash_map_seq_find_next(info, v); +} + +static int __bpf_hash_map_seq_show(struct seq_file *seq, struct htab_elem *elem) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + u32 roundup_key_size, roundup_value_size; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_map *map = info->map; + struct bpf_iter_meta meta; + int ret = 0, off = 0, cpu; + struct bpf_prog *prog; + void __percpu *pptr; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, elem == NULL); + if (prog) { + ctx.meta = &meta; + ctx.map = info->map; + if (elem) { + roundup_key_size = round_up(map->key_size, 8); + ctx.key = elem->key; + if (!info->percpu_value_buf) { + ctx.value = elem->key + roundup_key_size; + } else { + roundup_value_size = round_up(map->value_size, 8); + pptr = 
htab_elem_get_ptr(elem, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu), + roundup_value_size); + off += roundup_value_size; + } + ctx.value = info->percpu_value_buf; + } + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; +} + +static int bpf_hash_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_hash_map_seq_show(seq, v); +} + +static void bpf_hash_map_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_hash_map_info *info = seq->private; + + if (!v) + (void)__bpf_hash_map_seq_show(seq, NULL); + else + htab_unlock_bucket(info->htab, + &info->htab->buckets[info->bucket_id], + info->flags); +} + +static int bpf_iter_init_hash_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_hash_map_info *seq_info = priv_data; + struct bpf_map *map = aux->map; + void *value_buf; + u32 buf_size; + + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) { + buf_size = round_up(map->value_size, 8) * num_possible_cpus(); + value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); + if (!value_buf) + return -ENOMEM; + + seq_info->percpu_value_buf = value_buf; + } + + seq_info->map = map; + seq_info->htab = container_of(map, struct bpf_htab, map); + return 0; +} + +static void bpf_iter_fini_hash_map(void *priv_data) +{ + struct bpf_iter_seq_hash_map_info *seq_info = priv_data; + + kfree(seq_info->percpu_value_buf); +} + +static const struct seq_operations bpf_hash_map_seq_ops = { + .start = bpf_hash_map_seq_start, + .next = bpf_hash_map_seq_next, + .stop = bpf_hash_map_seq_stop, + .show = bpf_hash_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_hash_map_seq_ops, + .init_seq_private = bpf_iter_init_hash_map, + .fini_seq_private = bpf_iter_fini_hash_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info), +}; + static int htab_map_btf_id; const struct bpf_map_ops htab_map_ops = { .map_alloc_check = htab_map_alloc_check, @@ -1626,6 +1816,7 @@ const struct bpf_map_ops htab_map_ops = { BATCH_OPS(htab), .map_btf_name = "bpf_htab", .map_btf_id = &htab_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int htab_lru_map_btf_id; @@ -1643,6 +1834,7 @@ const struct bpf_map_ops htab_lru_map_ops = { BATCH_OPS(htab_lru), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_map_btf_id, + .iter_seq_info = &iter_seq_info, }; /* Called from eBPF program */ @@ -1760,6 +1952,7 @@ const struct bpf_map_ops htab_percpu_map_ops = { BATCH_OPS(htab_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_percpu_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int htab_lru_percpu_map_btf_id; @@ -1775,6 +1968,7 @@ const struct bpf_map_ops htab_lru_percpu_map_ops = { BATCH_OPS(htab_lru_percpu), .map_btf_name = "bpf_htab", .map_btf_id = &htab_lru_percpu_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int fd_htab_map_alloc_check(union bpf_attr *attr) diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index 8a1f9b3355d0..bcb68b55bf65 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -101,7 +101,29 @@ static struct bpf_iter_reg bpf_map_reg_info = { static int bpf_iter_check_map(struct bpf_prog *prog, struct bpf_iter_aux_info *aux) { - return -EINVAL; + u32 key_acc_size, value_acc_size, key_size, value_size; + struct bpf_map *map = aux->map; + bool is_percpu = false; + + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) + 
is_percpu = true; + else if (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_LRU_HASH) + return -EINVAL; + + key_acc_size = prog->aux->max_rdonly_access; + value_acc_size = prog->aux->max_rdwr_access; + key_size = map->key_size; + if (!is_percpu) + value_size = map->value_size; + else + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + + if (key_acc_size > key_size || value_acc_size > value_size) + return -EACCES; + + return 0; } DEFINE_BPF_ITER_FUNC(bpf_map_elem, struct bpf_iter_meta *meta, From d3cc2ab546adc6e52b65f36f7c34820d2830d0c9 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:15 -0700 Subject: [PATCH 11/73] bpf: Implement bpf iterator for array maps The bpf iterators for array and percpu array are implemented. Similar to hash maps, for percpu array map, bpf program will receive values from all cpus. Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184115.590532-1-yhs@fb.com --- kernel/bpf/arraymap.c | 138 ++++++++++++++++++++++++++++++++++++++++++ kernel/bpf/map_iter.c | 6 +- 2 files changed, 142 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index c66e8273fccd..8ff419b632a6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -487,6 +487,142 @@ static int array_map_mmap(struct bpf_map *map, struct vm_area_struct *vma) vma->vm_pgoff + pgoff); } +struct bpf_iter_seq_array_map_info { + struct bpf_map *map; + void *percpu_value_buf; + u32 index; +}; + +static void *bpf_array_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_map *map = info->map; + struct bpf_array *array; + u32 index; + + if (info->index >= map->max_entries) + return NULL; + + if (*pos == 0) + ++*pos; + array = container_of(map, struct bpf_array, map); + index = info->index & array->index_mask; + if (info->percpu_value_buf) + return array->pptrs[index]; + return array->value + array->elem_size * index; +} + +static void *bpf_array_map_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_map *map = info->map; + struct bpf_array *array; + u32 index; + + ++*pos; + ++info->index; + if (info->index >= map->max_entries) + return NULL; + + array = container_of(map, struct bpf_array, map); + index = info->index & array->index_mask; + if (info->percpu_value_buf) + return array->pptrs[index]; + return array->value + array->elem_size * index; +} + +static int __bpf_array_map_seq_show(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_array_map_info *info = seq->private; + struct bpf_iter__bpf_map_elem ctx = {}; + struct bpf_map *map = info->map; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int off = 0, cpu = 0; + void __percpu **pptr; + u32 size; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, v == NULL); + if (!prog) + return 0; + + ctx.meta = &meta; + ctx.map = info->map; + if (v) { + ctx.key = &info->index; + + if (!info->percpu_value_buf) { + ctx.value = v; + } else { + pptr = v; + size = round_up(map->value_size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(info->percpu_value_buf + off, + per_cpu_ptr(pptr, cpu), + size); + off += size; + } + ctx.value = info->percpu_value_buf; + } + } + + return bpf_iter_run_prog(prog, &ctx); +} + +static int bpf_array_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_array_map_seq_show(seq, v); +} + +static void 
bpf_array_map_seq_stop(struct seq_file *seq, void *v) +{ + if (!v) + (void)__bpf_array_map_seq_show(seq, NULL); +} + +static int bpf_iter_init_array_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_array_map_info *seq_info = priv_data; + struct bpf_map *map = aux->map; + void *value_buf; + u32 buf_size; + + if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + buf_size = round_up(map->value_size, 8) * num_possible_cpus(); + value_buf = kmalloc(buf_size, GFP_USER | __GFP_NOWARN); + if (!value_buf) + return -ENOMEM; + + seq_info->percpu_value_buf = value_buf; + } + + seq_info->map = map; + return 0; +} + +static void bpf_iter_fini_array_map(void *priv_data) +{ + struct bpf_iter_seq_array_map_info *seq_info = priv_data; + + kfree(seq_info->percpu_value_buf); +} + +static const struct seq_operations bpf_array_map_seq_ops = { + .start = bpf_array_map_seq_start, + .next = bpf_array_map_seq_next, + .stop = bpf_array_map_seq_stop, + .show = bpf_array_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_array_map_seq_ops, + .init_seq_private = bpf_iter_init_array_map, + .fini_seq_private = bpf_iter_fini_array_map, + .seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info), +}; + static int array_map_btf_id; const struct bpf_map_ops array_map_ops = { .map_alloc_check = array_map_alloc_check, @@ -506,6 +642,7 @@ const struct bpf_map_ops array_map_ops = { .map_update_batch = generic_map_update_batch, .map_btf_name = "bpf_array", .map_btf_id = &array_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int percpu_array_map_btf_id; @@ -521,6 +658,7 @@ const struct bpf_map_ops percpu_array_map_ops = { .map_check_btf = array_map_check_btf, .map_btf_name = "bpf_array", .map_btf_id = &percpu_array_map_btf_id, + .iter_seq_info = &iter_seq_info, }; static int fd_array_map_alloc_check(union bpf_attr *attr) diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c index bcb68b55bf65..fbe1f557cb88 100644 --- a/kernel/bpf/map_iter.c +++ b/kernel/bpf/map_iter.c @@ -106,10 +106,12 @@ static int bpf_iter_check_map(struct bpf_prog *prog, bool is_percpu = false; if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || - map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH) + map->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) is_percpu = true; else if (map->map_type != BPF_MAP_TYPE_HASH && - map->map_type != BPF_MAP_TYPE_LRU_HASH) + map->map_type != BPF_MAP_TYPE_LRU_HASH && + map->map_type != BPF_MAP_TYPE_ARRAY) return -EINVAL; key_acc_size = prog->aux->max_rdonly_access; From 5ce6e77c7edf7310a0ff9532fd6b9693c082ab32 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:16 -0700 Subject: [PATCH 12/73] bpf: Implement bpf iterator for sock local storage map The bpf iterator for the bpf sock local storage map is implemented. User space interacts with a sock local storage map using a socket fd as the key and the storage as the value. In the kernel, passing an fd to the bpf program does not really make sense; instead, the sock itself is passed to the bpf program.
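As a minimal sketch (program and field names here are illustrative, not part of this patch), a bpf program on this iterator receives the owning sock and its storage value directly:

	SEC("iter/bpf_sk_storage_map")
	int dump_sk_storage(struct bpf_iter__bpf_sk_storage_map *ctx)
	{
		struct sock *sk = ctx->sk;	/* the socket itself, not an fd */
		int *val = ctx->value;		/* storage value for that socket */

		if (sk == (void *)0 || val == (void *)0)
			return 0;

		BPF_SEQ_PRINTF(ctx->meta->seq, "family %d val %d\n",
			       sk->sk_family, *val);
		return 0;
	}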
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184116.590602-1-yhs@fb.com --- net/core/bpf_sk_storage.c | 206 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 206 insertions(+) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 6f921c4ddc2c..eafcd15e7dfd 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -1217,3 +1218,208 @@ int bpf_sk_storage_diag_put(struct bpf_sk_storage_diag *diag, return err; } EXPORT_SYMBOL_GPL(bpf_sk_storage_diag_put); + +struct bpf_iter_seq_sk_storage_map_info { + struct bpf_map *map; + unsigned int bucket_id; + unsigned skip_elems; +}; + +static struct bpf_sk_storage_elem * +bpf_sk_storage_map_seq_find_next(struct bpf_iter_seq_sk_storage_map_info *info, + struct bpf_sk_storage_elem *prev_selem) +{ + struct bpf_sk_storage *sk_storage; + struct bpf_sk_storage_elem *selem; + u32 skip_elems = info->skip_elems; + struct bpf_sk_storage_map *smap; + u32 bucket_id = info->bucket_id; + u32 i, count, n_buckets; + struct bucket *b; + + smap = (struct bpf_sk_storage_map *)info->map; + n_buckets = 1U << smap->bucket_log; + if (bucket_id >= n_buckets) + return NULL; + + /* try to find next selem in the same bucket */ + selem = prev_selem; + count = 0; + while (selem) { + selem = hlist_entry_safe(selem->map_node.next, + struct bpf_sk_storage_elem, map_node); + if (!selem) { + /* not found, unlock and go to the next bucket */ + b = &smap->buckets[bucket_id++]; + raw_spin_unlock_bh(&b->lock); + skip_elems = 0; + break; + } + sk_storage = rcu_dereference_raw(selem->sk_storage); + if (sk_storage) { + info->skip_elems = skip_elems + count; + return selem; + } + count++; + } + + for (i = bucket_id; i < (1U << smap->bucket_log); i++) { + b = &smap->buckets[i]; + raw_spin_lock_bh(&b->lock); + count = 0; + hlist_for_each_entry(selem, &b->list, map_node) { + sk_storage = rcu_dereference_raw(selem->sk_storage); + if (sk_storage && count >= skip_elems) { + info->bucket_id = i; + info->skip_elems = count; + return selem; + } + count++; + } + raw_spin_unlock_bh(&b->lock); + skip_elems = 0; + } + + info->bucket_id = i; + info->skip_elems = 0; + return NULL; +} + +static void *bpf_sk_storage_map_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct bpf_sk_storage_elem *selem; + + selem = bpf_sk_storage_map_seq_find_next(seq->private, NULL); + if (!selem) + return NULL; + + if (*pos == 0) + ++*pos; + return selem; +} + +static void *bpf_sk_storage_map_seq_next(struct seq_file *seq, void *v, + loff_t *pos) +{ + struct bpf_iter_seq_sk_storage_map_info *info = seq->private; + + ++*pos; + ++info->skip_elems; + return bpf_sk_storage_map_seq_find_next(seq->private, v); +} + +struct bpf_iter__bpf_sk_storage_map { + __bpf_md_ptr(struct bpf_iter_meta *, meta); + __bpf_md_ptr(struct bpf_map *, map); + __bpf_md_ptr(struct sock *, sk); + __bpf_md_ptr(void *, value); +}; + +DEFINE_BPF_ITER_FUNC(bpf_sk_storage_map, struct bpf_iter_meta *meta, + struct bpf_map *map, struct sock *sk, + void *value) + +static int __bpf_sk_storage_map_seq_show(struct seq_file *seq, + struct bpf_sk_storage_elem *selem) +{ + struct bpf_iter_seq_sk_storage_map_info *info = seq->private; + struct bpf_iter__bpf_sk_storage_map ctx = {}; + struct bpf_sk_storage *sk_storage; + struct bpf_iter_meta meta; + struct bpf_prog *prog; + int ret = 0; + + meta.seq = seq; + prog = bpf_iter_get_info(&meta, selem == NULL); + if (prog) { + ctx.meta = &meta; + 
ctx.map = info->map; + if (selem) { + sk_storage = rcu_dereference_raw(selem->sk_storage); + ctx.sk = sk_storage->sk; + ctx.value = SDATA(selem)->data; + } + ret = bpf_iter_run_prog(prog, &ctx); + } + + return ret; +} + +static int bpf_sk_storage_map_seq_show(struct seq_file *seq, void *v) +{ + return __bpf_sk_storage_map_seq_show(seq, v); +} + +static void bpf_sk_storage_map_seq_stop(struct seq_file *seq, void *v) +{ + struct bpf_iter_seq_sk_storage_map_info *info = seq->private; + struct bpf_sk_storage_map *smap; + struct bucket *b; + + if (!v) { + (void)__bpf_sk_storage_map_seq_show(seq, v); + } else { + smap = (struct bpf_sk_storage_map *)info->map; + b = &smap->buckets[info->bucket_id]; + raw_spin_unlock_bh(&b->lock); + } +} + +static int bpf_iter_init_sk_storage_map(void *priv_data, + struct bpf_iter_aux_info *aux) +{ + struct bpf_iter_seq_sk_storage_map_info *seq_info = priv_data; + + seq_info->map = aux->map; + return 0; +} + +static int bpf_iter_check_map(struct bpf_prog *prog, + struct bpf_iter_aux_info *aux) +{ + struct bpf_map *map = aux->map; + + if (map->map_type != BPF_MAP_TYPE_SK_STORAGE) + return -EINVAL; + + if (prog->aux->max_rdonly_access > map->value_size) + return -EACCES; + + return 0; +} + +static const struct seq_operations bpf_sk_storage_map_seq_ops = { + .start = bpf_sk_storage_map_seq_start, + .next = bpf_sk_storage_map_seq_next, + .stop = bpf_sk_storage_map_seq_stop, + .show = bpf_sk_storage_map_seq_show, +}; + +static const struct bpf_iter_seq_info iter_seq_info = { + .seq_ops = &bpf_sk_storage_map_seq_ops, + .init_seq_private = bpf_iter_init_sk_storage_map, + .fini_seq_private = NULL, + .seq_priv_size = sizeof(struct bpf_iter_seq_sk_storage_map_info), +}; + +static struct bpf_iter_reg bpf_sk_storage_map_reg_info = { + .target = "bpf_sk_storage_map", + .check_target = bpf_iter_check_map, + .req_linfo = BPF_ITER_LINK_MAP_FD, + .ctx_arg_info_size = 2, + .ctx_arg_info = { + { offsetof(struct bpf_iter__bpf_sk_storage_map, sk), + PTR_TO_BTF_ID_OR_NULL }, + { offsetof(struct bpf_iter__bpf_sk_storage_map, value), + PTR_TO_RDWR_BUF_OR_NULL }, + }, + .seq_info = &iter_seq_info, +}; + +static int __init bpf_sk_storage_map_iter_init(void) +{ + bpf_sk_storage_map_reg_info.ctx_arg_info[0].btf_id = + btf_sock_ids[BTF_SOCK_TYPE_SOCK]; + return bpf_iter_reg_target(&bpf_sk_storage_map_reg_info); +} +late_initcall(bpf_sk_storage_map_iter_init); From cd31039a7347610863aa8b77a9162048999723d0 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:17 -0700 Subject: [PATCH 13/73] tools/libbpf: Add support for bpf map element iterator Add map_fd to bpf_iter_attach_opts and flags to bpf_link_create_opts. Later on, bpftool or selftest will be able to create a bpf map element iterator by passing map_fd to the kernel during link creation time. 
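A minimal usage sketch of the new option (assuming prog is a loaded iterator program and map_fd is a valid map fd):

	DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts);
	struct bpf_link *link;

	opts.map_fd = map_fd;	/* map whose elements will be iterated */
	link = bpf_program__attach_iter(prog, &opts);
	if (IS_ERR(link))
		return PTR_ERR(link);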
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184117.590673-1-yhs@fb.com --- tools/lib/bpf/bpf.c | 1 + tools/lib/bpf/bpf.h | 3 ++- tools/lib/bpf/libbpf.c | 10 +++++++++- tools/lib/bpf/libbpf.h | 3 ++- 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index a7329b671c41..e1bdf214f75f 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -598,6 +598,7 @@ int bpf_link_create(int prog_fd, int target_fd, attr.link_create.prog_fd = prog_fd; attr.link_create.target_fd = target_fd; attr.link_create.attach_type = attach_type; + attr.link_create.flags = OPTS_GET(opts, flags, 0); return sys_bpf(BPF_LINK_CREATE, &attr, sizeof(attr)); } diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index dbef24ebcfcb..6d367e01d05e 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -170,8 +170,9 @@ LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd, struct bpf_link_create_opts { size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; }; -#define bpf_link_create_opts__last_field sz +#define bpf_link_create_opts__last_field flags LIBBPF_API int bpf_link_create(int prog_fd, int target_fd, enum bpf_attach_type attach_type, diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 846164c79df1..a05aa7e2bab6 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -8282,13 +8282,20 @@ struct bpf_link * bpf_program__attach_iter(struct bpf_program *prog, const struct bpf_iter_attach_opts *opts) { + DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_create_opts); char errmsg[STRERR_BUFSIZE]; struct bpf_link *link; int prog_fd, link_fd; + __u32 target_fd = 0; if (!OPTS_VALID(opts, bpf_iter_attach_opts)) return ERR_PTR(-EINVAL); + if (OPTS_HAS(opts, map_fd)) { + target_fd = opts->map_fd; + link_create_opts.flags = BPF_ITER_LINK_MAP_FD; + } + prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { pr_warn("program '%s': can't attach before loaded\n", @@ -8301,7 +8308,8 @@ bpf_program__attach_iter(struct bpf_program *prog, return ERR_PTR(-ENOMEM); link->detach = &bpf_link__detach_fd; - link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_ITER, NULL); + link_fd = bpf_link_create(prog_fd, target_fd, BPF_TRACE_ITER, + &link_create_opts); if (link_fd < 0) { link_fd = -errno; free(link); diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index c2272132e929..c6813791fa7e 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -264,8 +264,9 @@ LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(struct bpf_map *map); struct bpf_iter_attach_opts { size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 map_fd; }; -#define bpf_iter_attach_opts__last_field sz +#define bpf_iter_attach_opts__last_field map_fd LIBBPF_API struct bpf_link * bpf_program__attach_iter(struct bpf_program *prog, From d8793aca708602c676372b03d6493972457524af Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:19 -0700 Subject: [PATCH 14/73] tools/bpftool: Add bpftool support for bpf map element iterator The optional parameter "map MAP" can be added to the "bpftool iter" command to create a bpf iterator for map elements. For example, bpftool iter pin ./prog.o /sys/fs/bpf/p1 map id 333 For a map element bpf iterator, the "map MAP" parameter is required; otherwise, bpf link creation will return an error. Quentin Monnet kindly provided the bash-completion implementation for the new "map MAP" option.
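Once pinned, reading the iterator output is a plain read of the pinned file, e.g. (object and pin paths are illustrative):

	bpftool iter pin ./bpf_iter_map_elem.o /sys/fs/bpf/my_iter map pinned /sys/fs/bpf/my_map
	cat /sys/fs/bpf/my_iter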
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184119.590799-1-yhs@fb.com --- .../bpftool/Documentation/bpftool-iter.rst | 18 ++++++++-- tools/bpf/bpftool/bash-completion/bpftool | 18 +++++++++- tools/bpf/bpftool/iter.c | 33 ++++++++++++++++--- 3 files changed, 62 insertions(+), 7 deletions(-) diff --git a/tools/bpf/bpftool/Documentation/bpftool-iter.rst b/tools/bpf/bpftool/Documentation/bpftool-iter.rst index 8dce698eab79..070ffacb42b5 100644 --- a/tools/bpf/bpftool/Documentation/bpftool-iter.rst +++ b/tools/bpf/bpftool/Documentation/bpftool-iter.rst @@ -17,14 +17,15 @@ SYNOPSIS ITER COMMANDS =================== -| **bpftool** **iter pin** *OBJ* *PATH* +| **bpftool** **iter pin** *OBJ* *PATH* [**map** *MAP*] | **bpftool** **iter help** | | *OBJ* := /a/file/of/bpf_iter_target.o +| *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } DESCRIPTION =========== - **bpftool iter pin** *OBJ* *PATH* + **bpftool iter pin** *OBJ* *PATH* [**map** *MAP*] A bpf iterator combines a kernel iterating of particular kernel data (e.g., tasks, bpf_maps, etc.) and a bpf program called for each kernel data object @@ -37,6 +38,12 @@ DESCRIPTION character ('.'), which is reserved for future extensions of *bpffs*. + Map element bpf iterator requires an additional parameter + *MAP* so bpf program can iterate over map elements for + that map. User can have a bpf program in kernel to run + with each map element, do checking, filtering, aggregation, + etc. without copying data to user space. + User can then *cat PATH* to see the bpf iterator output. **bpftool iter help** @@ -64,6 +71,13 @@ EXAMPLES Create a file-based bpf iterator from bpf_iter_netlink.o and pin it to /sys/fs/bpf/my_netlink +**# bpftool iter pin bpf_iter_hashmap.o /sys/fs/bpf/my_hashmap map id 20** + +:: + + Create a file-based bpf iterator from bpf_iter_hashmap.o and map with + id 20, and pin it to /sys/fs/bpf/my_hashmap + SEE ALSO ======== **bpf**\ (2), diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool index 7b137264ea3a..257fa310ea2b 100644 --- a/tools/bpf/bpftool/bash-completion/bpftool +++ b/tools/bpf/bpftool/bash-completion/bpftool @@ -615,7 +615,23 @@ _bpftool() iter) case $command in pin) - _filedir + case $prev in + $command) + _filedir + ;; + id) + _bpftool_get_map_ids + ;; + name) + _bpftool_get_map_names + ;; + pinned) + _filedir + ;; + *) + _bpftool_one_of_list $MAP_TYPE + ;; + esac return 0 ;; *) diff --git a/tools/bpf/bpftool/iter.c b/tools/bpf/bpftool/iter.c index 33240fcc6319..c9dba7543dba 100644 --- a/tools/bpf/bpftool/iter.c +++ b/tools/bpf/bpftool/iter.c @@ -2,6 +2,7 @@ // Copyright (C) 2020 Facebook #define _GNU_SOURCE +#include #include #include @@ -9,11 +10,12 @@ static int do_pin(int argc, char **argv) { + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, iter_opts); const char *objfile, *path; struct bpf_program *prog; struct bpf_object *obj; struct bpf_link *link; - int err; + int err = -1, map_fd = -1; if (!REQ_ARGS(2)) usage(); @@ -21,10 +23,26 @@ static int do_pin(int argc, char **argv) objfile = GET_ARG(); path = GET_ARG(); + /* optional arguments */ + if (argc) { + if (is_prefix(*argv, "map")) { + NEXT_ARG(); + + if (!REQ_ARGS(2)) { + p_err("incorrect map spec"); + return -1; + } + + map_fd = map_parse_fd(&argc, &argv); + if (map_fd < 0) + return -1; + } + } + obj = bpf_object__open(objfile); if (IS_ERR(obj)) { p_err("can't open objfile %s", objfile); - return -1; + goto close_map_fd; } err = bpf_object__load(obj); @@ -39,7 
+57,10 @@ static int do_pin(int argc, char **argv) goto close_obj; } - link = bpf_program__attach_iter(prog, NULL); + if (map_fd >= 0) + iter_opts.map_fd = map_fd; + + link = bpf_program__attach_iter(prog, &iter_opts); if (IS_ERR(link)) { err = PTR_ERR(link); p_err("attach_iter failed for program %s", @@ -62,14 +83,18 @@ close_link: bpf_link__destroy(link); close_obj: bpf_object__close(obj); +close_map_fd: + if (map_fd >= 0) + close(map_fd); return err; } static int do_help(int argc, char **argv) { fprintf(stderr, - "Usage: %1$s %2$s pin OBJ PATH\n" + "Usage: %1$s %2$s pin OBJ PATH [map MAP]\n" " %1$s %2$s help\n" + " " HELP_SPEC_MAP "\n" "", bin_name, "iter"); From 2a7c2fff7dd6e87634e47ddb2d2c7f272708dbbf Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:20 -0700 Subject: [PATCH 15/73] selftests/bpf: Add test for bpf hash map iterators Two subtests are added. $ ./test_progs -n 4 ... #4/18 bpf_hash_map:OK #4/19 bpf_percpu_hash_map:OK ... Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184120.590916-1-yhs@fb.com --- .../selftests/bpf/prog_tests/bpf_iter.c | 187 ++++++++++++++++++ .../bpf/progs/bpf_iter_bpf_hash_map.c | 100 ++++++++++ .../bpf/progs/bpf_iter_bpf_percpu_hash_map.c | 50 +++++ 3 files changed, 337 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index fed42755416d..72790b600c62 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -15,6 +15,8 @@ #include "bpf_iter_test_kern2.skel.h" #include "bpf_iter_test_kern3.skel.h" #include "bpf_iter_test_kern4.skel.h" +#include "bpf_iter_bpf_hash_map.skel.h" +#include "bpf_iter_bpf_percpu_hash_map.skel.h" static int duration; @@ -455,6 +457,187 @@ out: bpf_iter_test_kern4__destroy(skel); } +static void test_bpf_hash_map(void) +{ + __u32 expected_key_a = 0, expected_key_b = 0, expected_key_c = 0; + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + struct bpf_iter_bpf_hash_map *skel; + int err, i, len, map_fd, iter_fd; + __u64 val, expected_val = 0; + struct bpf_link *link; + struct key_t { + int a; + int b; + int c; + } key; + char buf[64]; + + skel = bpf_iter_bpf_hash_map__open(); + if (CHECK(!skel, "bpf_iter_bpf_hash_map__open", + "skeleton open failed\n")) + return; + + skel->bss->in_test_mode = true; + + err = bpf_iter_bpf_hash_map__load(skel); + if (CHECK(!skel, "bpf_iter_bpf_hash_map__load", + "skeleton load failed\n")) + goto out; + + /* iterator with hashmap2 and hashmap3 should fail */ + opts.map_fd = bpf_map__fd(skel->maps.hashmap2); + link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts); + if (CHECK(!IS_ERR(link), "attach_iter", + "attach_iter for hashmap2 unexpected succeeded\n")) + goto out; + + opts.map_fd = bpf_map__fd(skel->maps.hashmap3); + link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts); + if (CHECK(!IS_ERR(link), "attach_iter", + "attach_iter for hashmap3 unexpected succeeded\n")) + goto out; + + /* hashmap1 should be good, update map values here */ + map_fd = bpf_map__fd(skel->maps.hashmap1); + for (i = 0; i < bpf_map__max_entries(skel->maps.hashmap1); i++) { + key.a = i + 1; + key.b = i + 2; + key.c = i + 3; + val = i + 4; + expected_key_a += key.a; + expected_key_b += key.b; + expected_key_c += 
key.c; + expected_val += val; + + err = bpf_map_update_elem(map_fd, &key, &val, BPF_ANY); + if (CHECK(err, "map_update", "map_update failed\n")) + goto out; + } + + opts.map_fd = map_fd; + link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto out; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + /* do some tests */ + while ((len = read(iter_fd, buf, sizeof(buf))) > 0) + ; + if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + goto close_iter; + + /* test results */ + if (CHECK(skel->bss->key_sum_a != expected_key_a, + "key_sum_a", "got %u expected %u\n", + skel->bss->key_sum_a, expected_key_a)) + goto close_iter; + if (CHECK(skel->bss->key_sum_b != expected_key_b, + "key_sum_b", "got %u expected %u\n", + skel->bss->key_sum_b, expected_key_b)) + goto close_iter; + if (CHECK(skel->bss->val_sum != expected_val, + "val_sum", "got %llu expected %llu\n", + skel->bss->val_sum, expected_val)) + goto close_iter; + +close_iter: + close(iter_fd); +free_link: + bpf_link__destroy(link); +out: + bpf_iter_bpf_hash_map__destroy(skel); +} + +static void test_bpf_percpu_hash_map(void) +{ + __u32 expected_key_a = 0, expected_key_b = 0, expected_key_c = 0; + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + struct bpf_iter_bpf_percpu_hash_map *skel; + int err, i, j, len, map_fd, iter_fd; + __u32 expected_val = 0; + struct bpf_link *link; + struct key_t { + int a; + int b; + int c; + } key; + char buf[64]; + void *val; + + val = malloc(8 * bpf_num_possible_cpus()); + + skel = bpf_iter_bpf_percpu_hash_map__open(); + if (CHECK(!skel, "bpf_iter_bpf_percpu_hash_map__open", + "skeleton open failed\n")) + return; + + skel->rodata->num_cpus = bpf_num_possible_cpus(); + + err = bpf_iter_bpf_percpu_hash_map__load(skel); + if (CHECK(!skel, "bpf_iter_bpf_percpu_hash_map__load", + "skeleton load failed\n")) + goto out; + + /* update map values here */ + map_fd = bpf_map__fd(skel->maps.hashmap1); + for (i = 0; i < bpf_map__max_entries(skel->maps.hashmap1); i++) { + key.a = i + 1; + key.b = i + 2; + key.c = i + 3; + expected_key_a += key.a; + expected_key_b += key.b; + expected_key_c += key.c; + + for (j = 0; j < bpf_num_possible_cpus(); j++) { + *(__u32 *)(val + j * 8) = i + j; + expected_val += i + j; + } + + err = bpf_map_update_elem(map_fd, &key, val, BPF_ANY); + if (CHECK(err, "map_update", "map_update failed\n")) + goto out; + } + + opts.map_fd = map_fd; + link = bpf_program__attach_iter(skel->progs.dump_bpf_percpu_hash_map, &opts); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto out; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + /* do some tests */ + while ((len = read(iter_fd, buf, sizeof(buf))) > 0) + ; + if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + goto close_iter; + + /* test results */ + if (CHECK(skel->bss->key_sum_a != expected_key_a, + "key_sum_a", "got %u expected %u\n", + skel->bss->key_sum_a, expected_key_a)) + goto close_iter; + if (CHECK(skel->bss->key_sum_b != expected_key_b, + "key_sum_b", "got %u expected %u\n", + skel->bss->key_sum_b, expected_key_b)) + goto close_iter; + if (CHECK(skel->bss->val_sum != expected_val, + "val_sum", "got %u expected %u\n", + skel->bss->val_sum, expected_val)) + goto close_iter; + +close_iter: + close(iter_fd); +free_link: + 
bpf_link__destroy(link); +out: + bpf_iter_bpf_percpu_hash_map__destroy(skel); +} + void test_bpf_iter(void) { if (test__start_subtest("btf_id_or_null")) @@ -491,4 +674,8 @@ void test_bpf_iter(void) test_overflow(true, false); if (test__start_subtest("prog-ret-1")) test_overflow(false, true); + if (test__start_subtest("bpf_hash_map")) + test_bpf_hash_map(); + if (test__start_subtest("bpf_percpu_hash_map")) + test_bpf_percpu_hash_map(); } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c new file mode 100644 index 000000000000..07ddbfdbcab7 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_hash_map.c @@ -0,0 +1,100 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "bpf_iter.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct key_t { + int a; + int b; + int c; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 3); + __type(key, struct key_t); + __type(value, __u64); +} hashmap1 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 3); + __type(key, __u64); + __type(value, __u64); +} hashmap2 SEC(".maps"); + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 3); + __type(key, struct key_t); + __type(value, __u32); +} hashmap3 SEC(".maps"); + +/* will set before prog run */ +bool in_test_mode = 0; + +/* will collect results during prog run */ +__u32 key_sum_a = 0, key_sum_b = 0, key_sum_c = 0; +__u64 val_sum = 0; + +SEC("iter/bpf_map_elem") +int dump_bpf_hash_map(struct bpf_iter__bpf_map_elem *ctx) +{ + struct seq_file *seq = ctx->meta->seq; + __u32 seq_num = ctx->meta->seq_num; + struct bpf_map *map = ctx->map; + struct key_t *key = ctx->key; + __u64 *val = ctx->value; + + if (in_test_mode) { + /* test mode is used by selftests to + * test functionality of bpf_hash_map iter. + * + * the above hashmap1 will have correct size + * and will be accepted, hashmap2 and hashmap3 + * should be rejected due to smaller key/value + * size. 
+ */ + if (key == (void *)0 || val == (void *)0) + return 0; + + key_sum_a += key->a; + key_sum_b += key->b; + key_sum_c += key->c; + val_sum += *val; + return 0; + } + + /* non-test mode, the map is prepared with the + * below bpftool command sequence: + * bpftool map create /sys/fs/bpf/m1 type hash \ + * key 12 value 8 entries 3 name map1 + * bpftool map update id 77 key 0 0 0 1 0 0 0 0 0 0 0 1 \ + * value 0 0 0 1 0 0 0 1 + * bpftool map update id 77 key 0 0 0 1 0 0 0 0 0 0 0 2 \ + * value 0 0 0 1 0 0 0 2 + * The bpftool iter command line: + * bpftool iter pin ./bpf_iter_bpf_hash_map.o /sys/fs/bpf/p1 \ + * map id 77 + * The below output will be: + * map dump starts + * 77: (1000000 0 2000000) (200000001000000) + * 77: (1000000 0 1000000) (100000001000000) + * map dump ends + */ + if (seq_num == 0) + BPF_SEQ_PRINTF(seq, "map dump starts\n"); + + if (key == (void *)0 || val == (void *)0) { + BPF_SEQ_PRINTF(seq, "map dump ends\n"); + return 0; + } + + BPF_SEQ_PRINTF(seq, "%d: (%x %d %x) (%llx)\n", map->id, + key->a, key->b, key->c, *val); + + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c new file mode 100644 index 000000000000..feaaa2b89c57 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_hash_map.c @@ -0,0 +1,50 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "bpf_iter.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct key_t { + int a; + int b; + int c; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 3); + __type(key, struct key_t); + __type(value, __u32); +} hashmap1 SEC(".maps"); + +/* will set before prog run */ +volatile const __u32 num_cpus = 0; + +/* will collect results during prog run */ +__u32 key_sum_a = 0, key_sum_b = 0, key_sum_c = 0; +__u32 val_sum = 0; + +SEC("iter/bpf_map_elem") +int dump_bpf_percpu_hash_map(struct bpf_iter__bpf_map_elem *ctx) +{ + struct key_t *key = ctx->key; + void *pptr = ctx->value; + __u32 step; + int i; + + if (key == (void *)0 || pptr == (void *)0) + return 0; + + key_sum_a += key->a; + key_sum_b += key->b; + key_sum_c += key->c; + + step = 8; + for (i = 0; i < num_cpus; i++) { + val_sum += *(__u32 *)pptr; + pptr += step; + } + return 0; +} From 60dd49ea65390986a665c462da704927e861e67e Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:21 -0700 Subject: [PATCH 16/73] selftests/bpf: Add test for bpf array map iterators Two subtests are added. $ ./test_progs -n 4 ... #4/20 bpf_array_map:OK #4/21 bpf_percpu_array_map:OK ... The bpf_array_map subtest also tests a bpf program that changes array element values and sends key/value pairs to user space through the bpf_seq_write() interface.
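A sketch of how user space consumes the raw bytes emitted via bpf_seq_write() (assuming the layout used by this subtest, a __u32 key followed by a __u64 value):

	/* read() on the iterator fd returns the bytes in the order
	 * the bpf program wrote them
	 */
	len = read(iter_fd, buf, sizeof(buf));
	first_key = *(__u32 *)buf;
	first_val = *(__u64 *)(buf + sizeof(__u32));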
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184121.591367-1-yhs@fb.com --- .../selftests/bpf/prog_tests/bpf_iter.c | 161 ++++++++++++++++++ .../bpf/progs/bpf_iter_bpf_array_map.c | 40 +++++ .../bpf/progs/bpf_iter_bpf_percpu_array_map.c | 46 +++++ 3 files changed, 247 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 72790b600c62..4a02b2222a6d 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -17,6 +17,8 @@ #include "bpf_iter_test_kern4.skel.h" #include "bpf_iter_bpf_hash_map.skel.h" #include "bpf_iter_bpf_percpu_hash_map.skel.h" +#include "bpf_iter_bpf_array_map.skel.h" +#include "bpf_iter_bpf_percpu_array_map.skel.h" static int duration; @@ -638,6 +640,161 @@ out: bpf_iter_bpf_percpu_hash_map__destroy(skel); } +static void test_bpf_array_map(void) +{ + __u64 val, expected_val = 0, res_first_val, first_val = 0; + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + __u32 expected_key = 0, res_first_key; + struct bpf_iter_bpf_array_map *skel; + int err, i, map_fd, iter_fd; + struct bpf_link *link; + char buf[64] = {}; + int len, start; + + skel = bpf_iter_bpf_array_map__open_and_load(); + if (CHECK(!skel, "bpf_iter_bpf_array_map__open_and_load", + "skeleton open_and_load failed\n")) + return; + + map_fd = bpf_map__fd(skel->maps.arraymap1); + for (i = 0; i < bpf_map__max_entries(skel->maps.arraymap1); i++) { + val = i + 4; + expected_key += i; + expected_val += val; + + if (i == 0) + first_val = val; + + err = bpf_map_update_elem(map_fd, &i, &val, BPF_ANY); + if (CHECK(err, "map_update", "map_update failed\n")) + goto out; + } + + opts.map_fd = map_fd; + link = bpf_program__attach_iter(skel->progs.dump_bpf_array_map, &opts); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto out; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + /* do some tests */ + start = 0; + while ((len = read(iter_fd, buf + start, sizeof(buf) - start)) > 0) + start += len; + if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + goto close_iter; + + /* test results */ + res_first_key = *(__u32 *)buf; + res_first_val = *(__u64 *)(buf + sizeof(__u32)); + if (CHECK(res_first_key != 0 || res_first_val != first_val, + "bpf_seq_write", + "seq_write failure: first key %u vs expected 0, " + " first value %llu vs expected %llu\n", + res_first_key, res_first_val, first_val)) + goto close_iter; + + if (CHECK(skel->bss->key_sum != expected_key, + "key_sum", "got %u expected %u\n", + skel->bss->key_sum, expected_key)) + goto close_iter; + if (CHECK(skel->bss->val_sum != expected_val, + "val_sum", "got %llu expected %llu\n", + skel->bss->val_sum, expected_val)) + goto close_iter; + + for (i = 0; i < bpf_map__max_entries(skel->maps.arraymap1); i++) { + err = bpf_map_lookup_elem(map_fd, &i, &val); + if (CHECK(err, "map_lookup", "map_lookup failed\n")) + goto out; + if (CHECK(i != val, "invalid_val", + "got value %llu expected %u\n", val, i)) + goto out; + } + +close_iter: + close(iter_fd); +free_link: + bpf_link__destroy(link); +out: + bpf_iter_bpf_array_map__destroy(skel); +} + +static void test_bpf_percpu_array_map(void) +{ + 
DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + struct bpf_iter_bpf_percpu_array_map *skel; + __u32 expected_key = 0, expected_val = 0; + int err, i, j, map_fd, iter_fd; + struct bpf_link *link; + char buf[64]; + void *val; + int len; + + val = malloc(8 * bpf_num_possible_cpus()); + + skel = bpf_iter_bpf_percpu_array_map__open(); + if (CHECK(!skel, "bpf_iter_bpf_percpu_array_map__open", + "skeleton open failed\n")) + return; + + skel->rodata->num_cpus = bpf_num_possible_cpus(); + + err = bpf_iter_bpf_percpu_array_map__load(skel); + if (CHECK(!skel, "bpf_iter_bpf_percpu_array_map__load", + "skeleton load failed\n")) + goto out; + + /* update map values here */ + map_fd = bpf_map__fd(skel->maps.arraymap1); + for (i = 0; i < bpf_map__max_entries(skel->maps.arraymap1); i++) { + expected_key += i; + + for (j = 0; j < bpf_num_possible_cpus(); j++) { + *(__u32 *)(val + j * 8) = i + j; + expected_val += i + j; + } + + err = bpf_map_update_elem(map_fd, &i, val, BPF_ANY); + if (CHECK(err, "map_update", "map_update failed\n")) + goto out; + } + + opts.map_fd = map_fd; + link = bpf_program__attach_iter(skel->progs.dump_bpf_percpu_array_map, &opts); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto out; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + /* do some tests */ + while ((len = read(iter_fd, buf, sizeof(buf))) > 0) + ; + if (CHECK(len < 0, "read", "read failed: %s\n", strerror(errno))) + goto close_iter; + + /* test results */ + if (CHECK(skel->bss->key_sum != expected_key, + "key_sum", "got %u expected %u\n", + skel->bss->key_sum, expected_key)) + goto close_iter; + if (CHECK(skel->bss->val_sum != expected_val, + "val_sum", "got %u expected %u\n", + skel->bss->val_sum, expected_val)) + goto close_iter; + +close_iter: + close(iter_fd); +free_link: + bpf_link__destroy(link); +out: + bpf_iter_bpf_percpu_array_map__destroy(skel); +} + void test_bpf_iter(void) { if (test__start_subtest("btf_id_or_null")) @@ -678,4 +835,8 @@ void test_bpf_iter(void) test_bpf_hash_map(); if (test__start_subtest("bpf_percpu_hash_map")) test_bpf_percpu_hash_map(); + if (test__start_subtest("bpf_array_map")) + test_bpf_array_map(); + if (test__start_subtest("bpf_percpu_array_map")) + test_bpf_percpu_array_map(); } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c new file mode 100644 index 000000000000..6286023fd62b --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "bpf_iter.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct key_t { + int a; + int b; + int c; +}; + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u64); +} arraymap1 SEC(".maps"); + +__u32 key_sum = 0; +__u64 val_sum = 0; + +SEC("iter/bpf_map_elem") +int dump_bpf_array_map(struct bpf_iter__bpf_map_elem *ctx) +{ + __u32 *key = ctx->key; + __u64 *val = ctx->value; + + if (key == (void *)0 || val == (void *)0) + return 0; + + bpf_seq_write(ctx->meta->seq, key, sizeof(__u32)); + bpf_seq_write(ctx->meta->seq, val, sizeof(__u64)); + key_sum += *key; + val_sum += *val; + *val = *key; + return 0; +} diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c new file 
mode 100644 index 000000000000..85fa710fad90 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "bpf_iter.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct key_t { + int a; + int b; + int c; +}; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); + __uint(max_entries, 3); + __type(key, __u32); + __type(value, __u32); +} arraymap1 SEC(".maps"); + +/* will set before prog run */ +volatile const __u32 num_cpus = 0; + +__u32 key_sum = 0, val_sum = 0; + +SEC("iter/bpf_map_elem") +int dump_bpf_percpu_array_map(struct bpf_iter__bpf_map_elem *ctx) +{ + __u32 *key = ctx->key; + void *pptr = ctx->value; + __u32 step; + int i; + + if (key == (void *)0 || pptr == (void *)0) + return 0; + + key_sum += *key; + + step = 8; + for (i = 0; i < num_cpus; i++) { + val_sum += *(__u32 *)pptr; + pptr += step; + } + return 0; +} From 3b1c420bd882115eb7a3d2335cc00d7b9974eb0b Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:22 -0700 Subject: [PATCH 17/73] selftests/bpf: Add a test for bpf sk_storage_map iterator Added one test for bpf sk_storage_map_iterator. $ ./test_progs -n 4 ... #4/22 bpf_sk_storage_map:OK ... Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184122.591591-1-yhs@fb.com --- .../selftests/bpf/prog_tests/bpf_iter.c | 72 +++++++++++++++++++ .../bpf/progs/bpf_iter_bpf_sk_storage_map.c | 34 +++++++++ 2 files changed, 106 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_map.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index 4a02b2222a6d..ffbbeb9fa268 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -19,6 +19,7 @@ #include "bpf_iter_bpf_percpu_hash_map.skel.h" #include "bpf_iter_bpf_array_map.skel.h" #include "bpf_iter_bpf_percpu_array_map.skel.h" +#include "bpf_iter_bpf_sk_storage_map.skel.h" static int duration; @@ -795,6 +796,75 @@ out: bpf_iter_bpf_percpu_array_map__destroy(skel); } +static void test_bpf_sk_storage_map(void) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + int err, i, len, map_fd, iter_fd, num_sockets; + struct bpf_iter_bpf_sk_storage_map *skel; + int sock_fd[3] = {-1, -1, -1}; + __u32 val, expected_val = 0; + struct bpf_link *link; + char buf[64]; + + skel = bpf_iter_bpf_sk_storage_map__open_and_load(); + if (CHECK(!skel, "bpf_iter_bpf_sk_storage_map__open_and_load", + "skeleton open_and_load failed\n")) + return; + + map_fd = bpf_map__fd(skel->maps.sk_stg_map); + num_sockets = ARRAY_SIZE(sock_fd); + for (i = 0; i < num_sockets; i++) { + sock_fd[i] = socket(AF_INET6, SOCK_STREAM, 0); + if (CHECK(sock_fd[i] < 0, "socket", "errno: %d\n", errno)) + goto out; + + val = i + 1; + expected_val += val; + + err = bpf_map_update_elem(map_fd, &sock_fd[i], &val, + BPF_NOEXIST); + if (CHECK(err, "map_update", "map_update failed\n")) + goto out; + } + + opts.map_fd = map_fd; + link = bpf_program__attach_iter(skel->progs.dump_bpf_sk_storage_map, &opts); + if (CHECK(IS_ERR(link), "attach_iter", "attach_iter failed\n")) + goto out; + + iter_fd = bpf_iter_create(bpf_link__fd(link)); + if (CHECK(iter_fd < 0, "create_iter", "create_iter failed\n")) + goto free_link; + + /* do some tests */ + while ((len = read(iter_fd, buf, sizeof(buf))) > 0) + ; + if (CHECK(len < 0, 
"read", "read failed: %s\n", strerror(errno))) + goto close_iter; + + /* test results */ + if (CHECK(skel->bss->ipv6_sk_count != num_sockets, + "ipv6_sk_count", "got %u expected %u\n", + skel->bss->ipv6_sk_count, num_sockets)) + goto close_iter; + + if (CHECK(skel->bss->val_sum != expected_val, + "val_sum", "got %u expected %u\n", + skel->bss->val_sum, expected_val)) + goto close_iter; + +close_iter: + close(iter_fd); +free_link: + bpf_link__destroy(link); +out: + for (i = 0; i < num_sockets; i++) { + if (sock_fd[i] >= 0) + close(sock_fd[i]); + } + bpf_iter_bpf_sk_storage_map__destroy(skel); +} + void test_bpf_iter(void) { if (test__start_subtest("btf_id_or_null")) @@ -839,4 +909,6 @@ void test_bpf_iter(void) test_bpf_array_map(); if (test__start_subtest("bpf_percpu_array_map")) test_bpf_percpu_array_map(); + if (test__start_subtest("bpf_sk_storage_map")) + test_bpf_sk_storage_map(); } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_map.c new file mode 100644 index 000000000000..6b70ccaba301 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_sk_storage_map.c @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "bpf_iter.h" +#include "bpf_tracing_net.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} sk_stg_map SEC(".maps"); + +__u32 val_sum = 0; +__u32 ipv6_sk_count = 0; + +SEC("iter/bpf_sk_storage_map") +int dump_bpf_sk_storage_map(struct bpf_iter__bpf_sk_storage_map *ctx) +{ + struct sock *sk = ctx->sk; + __u32 *val = ctx->value; + + if (sk == (void *)0 || val == (void *)0) + return 0; + + if (sk->sk_family == AF_INET6) + ipv6_sk_count++; + + val_sum += *val; + return 0; +} From 7b04d6d60fcfb5b2200ffebb9cfb90927bdfeec7 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 23 Jul 2020 11:06:44 -0700 Subject: [PATCH 18/73] bpf: Separate bpf_get_[stack|stackid] for perf events BPF Calling get_perf_callchain() on perf_events from PEBS entries may cause unwinder errors. To fix this issue, the callchain is fetched early. Such perf_events are marked with __PERF_SAMPLE_CALLCHAIN_EARLY. Similarly, calling bpf_get_[stack|stackid] on perf_events from PEBS may also cause unwinder errors. To fix this, add separate version of these two helpers, bpf_get_[stack|stackid]_pe. These two hepers use callchain in bpf_perf_event_data_kern->data->callchain. 
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723180648.1429892-2-songliubraving@fb.com --- include/linux/bpf.h | 2 + kernel/bpf/stackmap.c | 184 +++++++++++++++++++++++++++++++++++---- kernel/trace/bpf_trace.c | 4 +- 3 files changed, 170 insertions(+), 20 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 4175cf1f4665..8357be349133 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -1675,6 +1675,8 @@ extern const struct bpf_func_proto bpf_get_current_comm_proto; extern const struct bpf_func_proto bpf_get_stackid_proto; extern const struct bpf_func_proto bpf_get_stack_proto; extern const struct bpf_func_proto bpf_get_task_stack_proto; +extern const struct bpf_func_proto bpf_get_stackid_proto_pe; +extern const struct bpf_func_proto bpf_get_stack_proto_pe; extern const struct bpf_func_proto bpf_sock_map_update_proto; extern const struct bpf_func_proto bpf_sock_hash_update_proto; extern const struct bpf_func_proto bpf_get_current_cgroup_id_proto; diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 48d8e739975f..5beb2f8c23da 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -387,11 +388,10 @@ get_callchain_entry_for_task(struct task_struct *task, u32 init_nr) #endif } -BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, - u64, flags) +static long __bpf_get_stackid(struct bpf_map *map, + struct perf_callchain_entry *trace, u64 flags) { struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); - struct perf_callchain_entry *trace; struct stack_map_bucket *bucket, *new_bucket, *old_bucket; u32 max_depth = map->value_size / stack_map_data_size(map); /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ @@ -399,21 +399,9 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, u32 skip = flags & BPF_F_SKIP_FIELD_MASK; u32 hash, id, trace_nr, trace_len; bool user = flags & BPF_F_USER_STACK; - bool kernel = !user; u64 *ips; bool hash_matches; - if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | - BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) - return -EINVAL; - - trace = get_perf_callchain(regs, init_nr, kernel, user, - sysctl_perf_event_max_stack, false, false); - - if (unlikely(!trace)) - /* couldn't fetch the stack trace */ - return -EFAULT; - /* get_perf_callchain() guarantees that trace->nr >= init_nr * and trace-nr <= sysctl_perf_event_max_stack, so trace_nr <= max_depth */ @@ -478,6 +466,30 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, return id; } +BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map, + u64, flags) +{ + u32 max_depth = map->value_size / stack_map_data_size(map); + /* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */ + u32 init_nr = sysctl_perf_event_max_stack - max_depth; + bool user = flags & BPF_F_USER_STACK; + struct perf_callchain_entry *trace; + bool kernel = !user; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + trace = get_perf_callchain(regs, init_nr, kernel, user, + sysctl_perf_event_max_stack, false, false); + + if (unlikely(!trace)) + /* couldn't fetch the stack trace */ + return -EFAULT; + + return __bpf_get_stackid(map, trace, flags); +} + const struct bpf_func_proto bpf_get_stackid_proto = { .func = 
bpf_get_stackid, .gpl_only = true, @@ -487,7 +499,77 @@ const struct bpf_func_proto bpf_get_stackid_proto = { .arg3_type = ARG_ANYTHING, }; +static __u64 count_kernel_ip(struct perf_callchain_entry *trace) +{ + __u64 nr_kernel = 0; + + while (nr_kernel < trace->nr) { + if (trace->ip[nr_kernel] == PERF_CONTEXT_USER) + break; + nr_kernel++; + } + return nr_kernel; +} + +BPF_CALL_3(bpf_get_stackid_pe, struct bpf_perf_event_data_kern *, ctx, + struct bpf_map *, map, u64, flags) +{ + struct perf_event *event = ctx->event; + struct perf_callchain_entry *trace; + bool kernel, user; + __u64 nr_kernel; + int ret; + + /* perf_sample_data doesn't have callchain, use bpf_get_stackid */ + if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + return bpf_get_stackid((unsigned long)(ctx->regs), + (unsigned long) map, flags, 0, 0); + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + user = flags & BPF_F_USER_STACK; + kernel = !user; + + trace = ctx->data->callchain; + if (unlikely(!trace)) + return -EFAULT; + + nr_kernel = count_kernel_ip(trace); + + if (kernel) { + __u64 nr = trace->nr; + + trace->nr = nr_kernel; + ret = __bpf_get_stackid(map, trace, flags); + + /* restore nr */ + trace->nr = nr; + } else { /* user */ + u64 skip = flags & BPF_F_SKIP_FIELD_MASK; + + skip += nr_kernel; + if (skip > BPF_F_SKIP_FIELD_MASK) + return -EFAULT; + + flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; + ret = __bpf_get_stackid(map, trace, flags); + } + return ret; +} + +const struct bpf_func_proto bpf_get_stackid_proto_pe = { + .func = bpf_get_stackid_pe, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, + struct perf_callchain_entry *trace_in, void *buf, u32 size, u64 flags) { u32 init_nr, trace_nr, copy_len, elem_size, num_elem; @@ -520,7 +602,9 @@ static long __bpf_get_stack(struct pt_regs *regs, struct task_struct *task, else init_nr = sysctl_perf_event_max_stack - num_elem; - if (kernel && task) + if (trace_in) + trace = trace_in; + else if (kernel && task) trace = get_callchain_entry_for_task(task, init_nr); else trace = get_perf_callchain(regs, init_nr, kernel, user, @@ -556,7 +640,7 @@ clear: BPF_CALL_4(bpf_get_stack, struct pt_regs *, regs, void *, buf, u32, size, u64, flags) { - return __bpf_get_stack(regs, NULL, buf, size, flags); + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); } const struct bpf_func_proto bpf_get_stack_proto = { @@ -574,7 +658,7 @@ BPF_CALL_4(bpf_get_task_stack, struct task_struct *, task, void *, buf, { struct pt_regs *regs = task_pt_regs(task); - return __bpf_get_stack(regs, task, buf, size, flags); + return __bpf_get_stack(regs, task, NULL, buf, size, flags); } BTF_ID_LIST(bpf_get_task_stack_btf_ids) @@ -591,6 +675,70 @@ const struct bpf_func_proto bpf_get_task_stack_proto = { .btf_id = bpf_get_task_stack_btf_ids, }; +BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, + void *, buf, u32, size, u64, flags) +{ + struct perf_event *event = ctx->event; + struct perf_callchain_entry *trace; + bool kernel, user; + int err = -EINVAL; + __u64 nr_kernel; + + if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) + return __bpf_get_stack(ctx->regs, NULL, NULL, buf, size, flags); + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_USER_BUILD_ID))) + goto clear; 
+ + user = flags & BPF_F_USER_STACK; + kernel = !user; + + err = -EFAULT; + trace = ctx->data->callchain; + if (unlikely(!trace)) + goto clear; + + nr_kernel = count_kernel_ip(trace); + + if (kernel) { + __u64 nr = trace->nr; + + trace->nr = nr_kernel; + err = __bpf_get_stack(ctx->regs, NULL, trace, buf, + size, flags); + + /* restore nr */ + trace->nr = nr; + } else { /* user */ + u64 skip = flags & BPF_F_SKIP_FIELD_MASK; + + skip += nr_kernel; + if (skip > BPF_F_SKIP_FIELD_MASK) + goto clear; + + flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; + err = __bpf_get_stack(ctx->regs, NULL, trace, buf, + size, flags); + } + return err; + +clear: + memset(buf, 0, size); + return err; + +} + +const struct bpf_func_proto bpf_get_stack_proto_pe = { + .func = bpf_get_stack_pe, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_PTR_TO_UNINIT_MEM, + .arg3_type = ARG_CONST_SIZE_OR_ZERO, + .arg4_type = ARG_ANYTHING, +}; + /* Called from eBPF program */ static void *stack_map_lookup_elem(struct bpf_map *map, void *key) { diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 3cc0dcb60ca2..cb91ef902cc4 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -1411,9 +1411,9 @@ pe_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto_tp; case BPF_FUNC_get_stackid: - return &bpf_get_stackid_proto_tp; + return &bpf_get_stackid_proto_pe; case BPF_FUNC_get_stack: - return &bpf_get_stack_proto_tp; + return &bpf_get_stack_proto_pe; case BPF_FUNC_perf_prog_read_value: return &bpf_perf_prog_read_value_proto; case BPF_FUNC_read_branch_records: From 9efcc4ad7a15ea50550c53fbf62457c309216051 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 23 Jul 2020 11:41:24 -0700 Subject: [PATCH 19/73] selftests/bpf: Add a test for out of bound rdonly buf access If the bpf program contains an out of bound access w.r.t. a particular map key/value size, verification will still succeed, i.e., the program will be accepted by the verifier, but it will be rejected at link_create time. A test is added here to ensure that link_create does fail when such an out of bound access happens. $ ./test_progs -n 4 ... #4/23 rdonly-buf-out-of-bound:OK ...
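The rejected pattern boils down to the following (sketch, taken from the new test program; key points at a map key of sizeof(struct key_t) bytes):

	/* accepted by the verifier, but link_create later fails with
	 * -EACCES because the read ends past the map's key_size
	 */
	key_sum += *(__u32 *)(key + sizeof(struct key_t));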
Signed-off-by: Yonghong Song Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723184124.591700-1-yhs@fb.com --- .../selftests/bpf/prog_tests/bpf_iter.c | 22 ++++++++++++ .../selftests/bpf/progs/bpf_iter_test_kern5.c | 35 +++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern5.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index ffbbeb9fa268..d95de80b1851 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -20,6 +20,7 @@ #include "bpf_iter_bpf_array_map.skel.h" #include "bpf_iter_bpf_percpu_array_map.skel.h" #include "bpf_iter_bpf_sk_storage_map.skel.h" +#include "bpf_iter_test_kern5.skel.h" static int duration; @@ -865,6 +866,25 @@ out: bpf_iter_bpf_sk_storage_map__destroy(skel); } +static void test_rdonly_buf_out_of_bound(void) +{ + DECLARE_LIBBPF_OPTS(bpf_iter_attach_opts, opts); + struct bpf_iter_test_kern5 *skel; + struct bpf_link *link; + + skel = bpf_iter_test_kern5__open_and_load(); + if (CHECK(!skel, "bpf_iter_test_kern5__open_and_load", + "skeleton open_and_load failed\n")) + return; + + opts.map_fd = bpf_map__fd(skel->maps.hashmap1); + link = bpf_program__attach_iter(skel->progs.dump_bpf_hash_map, &opts); + if (CHECK(!IS_ERR(link), "attach_iter", "unexpected success\n")) + bpf_link__destroy(link); + + bpf_iter_test_kern5__destroy(skel); +} + void test_bpf_iter(void) { if (test__start_subtest("btf_id_or_null")) @@ -911,4 +931,6 @@ void test_bpf_iter(void) test_bpf_percpu_array_map(); if (test__start_subtest("bpf_sk_storage_map")) test_bpf_sk_storage_map(); + if (test__start_subtest("rdonly-buf-out-of-bound")) + test_rdonly_buf_out_of_bound(); } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern5.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern5.c new file mode 100644 index 000000000000..e3a7575e81d2 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern5.c @@ -0,0 +1,35 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "bpf_iter.h" +#include +#include + +char _license[] SEC("license") = "GPL"; + +struct key_t { + int a; + int b; + int c; +}; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 3); + __type(key, struct key_t); + __type(value, __u64); +} hashmap1 SEC(".maps"); + +__u32 key_sum = 0; + +SEC("iter/bpf_map_elem") +int dump_bpf_hash_map(struct bpf_iter__bpf_map_elem *ctx) +{ + void *key = ctx->key; + + if (key == (void *)0) + return 0; + + /* out of bound access w.r.t. hashmap1 */ + key_sum += *(__u32 *)(key + sizeof(struct key_t)); + return 0; +} From 5d99cb2c86775b4780c02a339a9578bf9471ead9 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 23 Jul 2020 11:06:45 -0700 Subject: [PATCH 20/73] bpf: Fail PERF_EVENT_IOC_SET_BPF when bpf_get_[stack|stackid] cannot work bpf_get_[stack|stackid] on perf_events with precise_ip uses the callchain attached to perf_sample_data. If this callchain is not present, do not allow attaching a BPF program that calls bpf_get_[stack|stackid] to such an event. In the error case, -EPROTO is returned so that libbpf can identify this error and print a proper hint message.
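For example (a sketch of an attr configuration that stays attachable), a precise_ip event meant for use with bpf_get_[stack|stackid] should sample the full callchain:

	struct perf_event_attr attr = {
		.type = PERF_TYPE_HARDWARE,
		.config = PERF_COUNT_HW_CPU_CYCLES,
		.precise_ip = 2,
		.sample_type = PERF_SAMPLE_CALLCHAIN,
		/* exclude_callchain_kernel/user must be left unset */
	};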
Signed-off-by: Song Liu
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20200723180648.1429892-3-songliubraving@fb.com
---
 include/linux/filter.h |  3 ++-
 kernel/bpf/verifier.c  |  3 +++
 kernel/events/core.c   | 18 ++++++++++++++++++
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/include/linux/filter.h b/include/linux/filter.h
index d07a6e973a7d..0a355b005bf4 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -533,7 +533,8 @@ struct bpf_prog {
 				is_func:1, /* program is a bpf function */
 				kprobe_override:1, /* Do we override a kprobe? */
 				has_callchain_buf:1, /* callchain buffer allocated? */
-				enforce_expected_attach_type:1; /* Enforce expected_attach_type checking at attach time */
+				enforce_expected_attach_type:1, /* Enforce expected_attach_type checking at attach time */
+				call_get_stack:1; /* Do we call bpf_get_stack() or bpf_get_stackid() */
 	enum bpf_prog_type	type; /* Type of BPF program */
 	enum bpf_attach_type	expected_attach_type; /* For some prog types */
 	u32			len; /* Number of filter blocks */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8d6979db48d8..cd14e70f2d07 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -4962,6 +4962,9 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn
 		env->prog->has_callchain_buf = true;
 	}

+	if (func_id == BPF_FUNC_get_stackid || func_id == BPF_FUNC_get_stack)
+		env->prog->call_get_stack = true;
+
 	if (changes_data)
 		clear_all_pkt_pointers(env);
 	return 0;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 856d98c36f56..ddcfd2fb5cc5 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -9544,6 +9544,24 @@ static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
 	if (IS_ERR(prog))
 		return PTR_ERR(prog);

+	if (event->attr.precise_ip &&
+	    prog->call_get_stack &&
+	    (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY) ||
+	     event->attr.exclude_callchain_kernel ||
+	     event->attr.exclude_callchain_user)) {
+		/*
+		 * On perf_event with precise_ip, calling bpf_get_stack()
+		 * may trigger unwinder warnings and occasional crashes.
+		 * bpf_get_[stack|stackid] works around this issue by using
+		 * the callchain attached to perf_sample_data. If the
+		 * perf_event does not have a full (kernel and user)
+		 * callchain attached to perf_sample_data, do not allow
+		 * attaching a BPF program that calls bpf_get_[stack|stackid].
+		 */
+		bpf_prog_put(prog);
+		return -EPROTO;
+	}
+
 	event->prog = prog;
 	event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
 	WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);

From d4b4dd6ce7709c2d2fe56dcfc15074ee18505bcb Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Thu, 23 Jul 2020 11:06:46 -0700
Subject: [PATCH 21/73] libbpf: Print hint when PERF_EVENT_IOC_SET_BPF returns -EPROTO

The kernel prevents potential unwinder warnings and crashes by
blocking a BPF program with bpf_get_[stack|stackid] from attaching to
a perf_event without PERF_SAMPLE_CALLCHAIN, or with
exclude_callchain_[kernel|user] set. Print a hint message in libbpf to
help the user debug such issues.
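A caller-side sketch of how the hint surfaces (hypothetical skeleton
names; bpf_program__attach_perf_event returns an error pointer on
failure):

    struct bpf_link *link;

    link = bpf_program__attach_perf_event(skel->progs.oncpu, pmu_fd);
    if (IS_ERR(link)) {
        /* For -EPROTO, libbpf now also prints the
         * PERF_SAMPLE_CALLCHAIN / exclude_callchain_[kernel|user]
         * hint added by this patch, in addition to the generic
         * attach-failure warning.
         */
        return PTR_ERR(link);
    }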
Signed-off-by: Song Liu
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/20200723180648.1429892-4-songliubraving@fb.com
---
 tools/lib/bpf/libbpf.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c
index a05aa7e2bab6..e51479d60285 100644
--- a/tools/lib/bpf/libbpf.c
+++ b/tools/lib/bpf/libbpf.c
@@ -7833,6 +7833,9 @@ struct bpf_link *bpf_program__attach_perf_event(struct bpf_program *prog,
 		pr_warn("program '%s': failed to attach to pfd %d: %s\n",
 			bpf_program__title(prog, false), pfd,
 			libbpf_strerror_r(err, errmsg, sizeof(errmsg)));
+		if (err == -EPROTO)
+			pr_warn("program '%s': try add PERF_SAMPLE_CALLCHAIN to or remove exclude_callchain_[kernel|user] from pfd %d\n",
+				bpf_program__title(prog, false), pfd);
 		return ERR_PTR(err);
 	}
 	if (ioctl(pfd, PERF_EVENT_IOC_ENABLE, 0) < 0) {

From d4a89c1eb81431479664029bcdec593dbf23385f Mon Sep 17 00:00:00 2001
From: YiFei Zhu
Date: Thu, 23 Jul 2020 23:47:41 -0500
Subject: [PATCH 22/73] selftests/bpf: Add test for CGROUP_STORAGE map on multiple attaches

This test creates a parent cgroup, and a child of that cgroup. It
attaches a cgroup_skb/egress program that simply counts packets, into
both a global variable (an ARRAY map) and a CGROUP_STORAGE map. The
program is first attached to the parent cgroup only, then to parent
and child.

The test case sends a message within the child cgroup, and because the
program is inherited across parent / child cgroups, it will trigger
the egress program for both the parent and child, if they exist. The
program, when looking up a CGROUP_STORAGE map, uses the cgroup and
attach type of the attachment parameters; therefore, the two
attachments use different cgroup storages. We assert that all packet
counts return what we expect.

Signed-off-by: YiFei Zhu
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/5a20206afa4606144691c7caa0d1b997cd60dec0.1595565795.git.zhuyifei@google.com
---
 .../bpf/prog_tests/cg_storage_multi.c         | 161 ++++++++++++++++++
 .../bpf/progs/cg_storage_multi_egress_only.c  |  30 ++++
 2 files changed, 191 insertions(+)
 create mode 100644 tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c
 create mode 100644 tools/testing/selftests/bpf/progs/cg_storage_multi_egress_only.c

diff --git a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c
new file mode 100644
index 000000000000..e90e0547d759
--- /dev/null
+++ b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright 2020 Google LLC.
+ */
+
+#include
+#include
+#include
+
+#include "cg_storage_multi_egress_only.skel.h"
+
+#define PARENT_CGROUP "/cgroup_storage"
+#define CHILD_CGROUP "/cgroup_storage/child"
+
+static int duration;
+
+static bool assert_storage(struct bpf_map *map, const char *cgroup_path,
+			   __u32 expected)
+{
+	struct bpf_cgroup_storage_key key = {0};
+	__u32 value;
+	int map_fd;
+
+	map_fd = bpf_map__fd(map);
+
+	key.cgroup_inode_id = get_cgroup_id(cgroup_path);
+	key.attach_type = BPF_CGROUP_INET_EGRESS;
+	if (CHECK(bpf_map_lookup_elem(map_fd, &key, &value) < 0,
+		  "map-lookup", "errno %d", errno))
+		return true;
+	if (CHECK(value != expected,
+		  "assert-storage", "got %u expected %u", value, expected))
+		return true;
+
+	return false;
+}
+
+static bool assert_storage_noexist(struct bpf_map *map, const char *cgroup_path)
+{
+	struct bpf_cgroup_storage_key key = {0};
+	__u32 value;
+	int map_fd;
+
+	map_fd = bpf_map__fd(map);
+
+	key.cgroup_inode_id = get_cgroup_id(cgroup_path);
+	key.attach_type = BPF_CGROUP_INET_EGRESS;
+	if (CHECK(bpf_map_lookup_elem(map_fd, &key, &value) == 0,
+		  "map-lookup", "succeeded, expected ENOENT"))
+		return true;
+	if (CHECK(errno != ENOENT,
+		  "map-lookup", "errno %d, expected ENOENT", errno))
+		return true;
+
+	return false;
+}
+
+static bool connect_send(const char *cgroup_path)
+{
+	bool res = true;
+	int server_fd = -1, client_fd = -1;
+
+	if (join_cgroup(cgroup_path))
+		goto out_clean;
+
+	server_fd = start_server(AF_INET, SOCK_DGRAM, NULL, 0, 0);
+	if (server_fd < 0)
+		goto out_clean;
+
+	client_fd = connect_to_fd(server_fd, 0);
+	if (client_fd < 0)
+		goto out_clean;
+
+	if (send(client_fd, "message", strlen("message"), 0) < 0)
+		goto out_clean;
+
+	res = false;
+
+out_clean:
+	close(client_fd);
+	close(server_fd);
+	return res;
+}
+
+static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd)
+{
+	struct cg_storage_multi_egress_only *obj;
+	struct bpf_link *parent_link = NULL, *child_link = NULL;
+	bool err;
+
+	obj = cg_storage_multi_egress_only__open_and_load();
+	if (CHECK(!obj, "skel-load", "errno %d", errno))
+		return;
+
+	/* Attach to parent cgroup, trigger packet from child.
+	 * Assert that there is only one run and in that run the storage is
+	 * the parent cgroup's storage.
+	 * Also assert that child cgroup's storage does not exist
+	 */
+	parent_link = bpf_program__attach_cgroup(obj->progs.egress,
+						 parent_cgroup_fd);
+	if (CHECK(IS_ERR(parent_link), "parent-cg-attach",
+		  "err %ld", PTR_ERR(parent_link)))
+		goto close_bpf_object;
+	err = connect_send(CHILD_CGROUP);
+	if (CHECK(err, "first-connect-send", "errno %d", errno))
+		goto close_bpf_object;
+	if (CHECK(obj->bss->invocations != 1,
+		  "first-invoke", "invocations=%d", obj->bss->invocations))
+		goto close_bpf_object;
+	if (assert_storage(obj->maps.cgroup_storage, PARENT_CGROUP, 1))
+		goto close_bpf_object;
+	if (assert_storage_noexist(obj->maps.cgroup_storage, CHILD_CGROUP))
+		goto close_bpf_object;
+
+	/* Attach to parent and child cgroup, trigger packet from child.
+	 * Assert that there are two additional runs, one that runs with the
+	 * parent cgroup's storage and one with the child cgroup's storage.
+	 */
+	child_link = bpf_program__attach_cgroup(obj->progs.egress,
+						child_cgroup_fd);
+	if (CHECK(IS_ERR(child_link), "child-cg-attach",
+		  "err %ld", PTR_ERR(child_link)))
+		goto close_bpf_object;
+	err = connect_send(CHILD_CGROUP);
+	if (CHECK(err, "second-connect-send", "errno %d", errno))
+		goto close_bpf_object;
+	if (CHECK(obj->bss->invocations != 3,
+		  "second-invoke", "invocations=%d", obj->bss->invocations))
+		goto close_bpf_object;
+	if (assert_storage(obj->maps.cgroup_storage, PARENT_CGROUP, 2))
+		goto close_bpf_object;
+	if (assert_storage(obj->maps.cgroup_storage, CHILD_CGROUP, 1))
+		goto close_bpf_object;
+
+close_bpf_object:
+	bpf_link__destroy(parent_link);
+	bpf_link__destroy(child_link);
+
+	cg_storage_multi_egress_only__destroy(obj);
+}
+
+void test_cg_storage_multi(void)
+{
+	int parent_cgroup_fd = -1, child_cgroup_fd = -1;
+
+	parent_cgroup_fd = test__join_cgroup(PARENT_CGROUP);
+	if (CHECK(parent_cgroup_fd < 0, "cg-create-parent", "errno %d", errno))
+		goto close_cgroup_fd;
+	child_cgroup_fd = create_and_get_cgroup(CHILD_CGROUP);
+	if (CHECK(child_cgroup_fd < 0, "cg-create-child", "errno %d", errno))
+		goto close_cgroup_fd;
+
+	if (test__start_subtest("egress_only"))
+		test_egress_only(parent_cgroup_fd, child_cgroup_fd);
+
+close_cgroup_fd:
+	close(child_cgroup_fd);
+	close(parent_cgroup_fd);
+}
diff --git a/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_only.c b/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_only.c
new file mode 100644
index 000000000000..ec0165d07105
--- /dev/null
+++ b/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_only.c
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+/*
+ * Copyright 2020 Google LLC.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+struct {
+	__uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
+	__type(key, struct bpf_cgroup_storage_key);
+	__type(value, __u32);
+} cgroup_storage SEC(".maps");
+
+__u32 invocations = 0;
+
+SEC("cgroup_skb/egress")
+int egress(struct __sk_buff *skb)
+{
+	__u32 *ptr_cg_storage = bpf_get_local_storage(&cgroup_storage, 0);
+
+	__sync_fetch_and_add(ptr_cg_storage, 1);
+	__sync_fetch_and_add(&invocations, 1);
+
+	return 1;
+}

From 1da4864c2b20f81afbb18b2a0b914d0c776331fc Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Thu, 23 Jul 2020 11:06:47 -0700
Subject: [PATCH 23/73] selftests/bpf: Add callchain_stackid

This tests the new helper functions bpf_get_stackid_pe and
bpf_get_stack_pe. These two helpers have a different implementation
for perf_events with PEBS entries.
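For user stacks, the bpf_get_stack_pe implementation earlier in this
series folds the number of sampled kernel entries into the skip field
of the flags. A small worked illustration of that arithmetic (a sketch
based on that code; BPF_F_SKIP_FIELD_MASK is the low 8 bits of the
flags, and nr_kernel is the count of kernel IPs in the callchain):

    u64 flags = BPF_F_USER_STACK | 2;          /* caller asked to skip 2 */
    u64 skip = flags & BPF_F_SKIP_FIELD_MASK;  /* skip == 2 */

    skip += nr_kernel;                         /* e.g. nr_kernel == 5 -> 7 */
    if (skip > BPF_F_SKIP_FIELD_MASK)          /* combined skip must fit */
        goto clear;                            /* err stays -EFAULT */

    flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip;
    /* __bpf_get_stack() now skips the kernel entries plus the user's 2 */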
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200723180648.1429892-5-songliubraving@fb.com --- .../bpf/prog_tests/perf_event_stackmap.c | 116 ++++++++++++++++++ .../selftests/bpf/progs/perf_event_stackmap.c | 59 +++++++++ 2 files changed, 175 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c create mode 100644 tools/testing/selftests/bpf/progs/perf_event_stackmap.c diff --git a/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c b/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c new file mode 100644 index 000000000000..72c3690844fb --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/perf_event_stackmap.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook +#define _GNU_SOURCE +#include +#include +#include +#include "perf_event_stackmap.skel.h" + +#ifndef noinline +#define noinline __attribute__((noinline)) +#endif + +noinline int func_1(void) +{ + static int val = 1; + + val += 1; + + usleep(100); + return val; +} + +noinline int func_2(void) +{ + return func_1(); +} + +noinline int func_3(void) +{ + return func_2(); +} + +noinline int func_4(void) +{ + return func_3(); +} + +noinline int func_5(void) +{ + return func_4(); +} + +noinline int func_6(void) +{ + int i, val = 1; + + for (i = 0; i < 100; i++) + val += func_5(); + + return val; +} + +void test_perf_event_stackmap(void) +{ + struct perf_event_attr attr = { + /* .type = PERF_TYPE_SOFTWARE, */ + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .precise_ip = 2, + .sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK | + PERF_SAMPLE_CALLCHAIN, + .branch_sample_type = PERF_SAMPLE_BRANCH_USER | + PERF_SAMPLE_BRANCH_NO_FLAGS | + PERF_SAMPLE_BRANCH_NO_CYCLES | + PERF_SAMPLE_BRANCH_CALL_STACK, + .sample_period = 5000, + .size = sizeof(struct perf_event_attr), + }; + struct perf_event_stackmap *skel; + __u32 duration = 0; + cpu_set_t cpu_set; + int pmu_fd, err; + + skel = perf_event_stackmap__open(); + + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) + return; + + err = perf_event_stackmap__load(skel); + if (CHECK(err, "skel_load", "skeleton load failed: %d\n", err)) + goto cleanup; + + CPU_ZERO(&cpu_set); + CPU_SET(0, &cpu_set); + err = pthread_setaffinity_np(pthread_self(), sizeof(cpu_set), &cpu_set); + if (CHECK(err, "set_affinity", "err %d, errno %d\n", err, errno)) + goto cleanup; + + pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, + 0 /* cpu 0 */, -1 /* group id */, + 0 /* flags */); + if (pmu_fd < 0) { + printf("%s:SKIP:cpu doesn't support the event\n", __func__); + test__skip(); + goto cleanup; + } + + skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu, + pmu_fd); + if (CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event", + "err %ld\n", PTR_ERR(skel->links.oncpu))) { + close(pmu_fd); + goto cleanup; + } + + /* create kernel and user stack traces for testing */ + func_6(); + + CHECK(skel->data->stackid_kernel != 2, "get_stackid_kernel", "failed\n"); + CHECK(skel->data->stackid_user != 2, "get_stackid_user", "failed\n"); + CHECK(skel->data->stack_kernel != 2, "get_stack_kernel", "failed\n"); + CHECK(skel->data->stack_user != 2, "get_stack_user", "failed\n"); + +cleanup: + perf_event_stackmap__destroy(skel); +} diff --git a/tools/testing/selftests/bpf/progs/perf_event_stackmap.c b/tools/testing/selftests/bpf/progs/perf_event_stackmap.c new file mode 100644 index 000000000000..25467d13c356 --- 
/dev/null
+++ b/tools/testing/selftests/bpf/progs/perf_event_stackmap.c
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright (c) 2020 Facebook
+#include "vmlinux.h"
+#include <bpf/bpf_helpers.h>
+
+#ifndef PERF_MAX_STACK_DEPTH
+#define PERF_MAX_STACK_DEPTH 127
+#endif
+
+typedef __u64 stack_trace_t[PERF_MAX_STACK_DEPTH];
+struct {
+	__uint(type, BPF_MAP_TYPE_STACK_TRACE);
+	__uint(max_entries, 16384);
+	__uint(key_size, sizeof(__u32));
+	__uint(value_size, sizeof(stack_trace_t));
+} stackmap SEC(".maps");
+
+struct {
+	__uint(type, BPF_MAP_TYPE_PERCPU_ARRAY);
+	__uint(max_entries, 1);
+	__type(key, __u32);
+	__type(value, stack_trace_t);
+} stackdata_map SEC(".maps");
+
+long stackid_kernel = 1;
+long stackid_user = 1;
+long stack_kernel = 1;
+long stack_user = 1;
+
+SEC("perf_event")
+int oncpu(void *ctx)
+{
+	stack_trace_t *trace;
+	__u32 key = 0;
+	long val;
+
+	val = bpf_get_stackid(ctx, &stackmap, 0);
+	if (val > 0)
+		stackid_kernel = 2;
+	val = bpf_get_stackid(ctx, &stackmap, BPF_F_USER_STACK);
+	if (val > 0)
+		stackid_user = 2;
+
+	trace = bpf_map_lookup_elem(&stackdata_map, &key);
+	if (!trace)
+		return 0;
+
+	val = bpf_get_stack(ctx, trace, sizeof(stack_trace_t), 0);
+	if (val > 0)
+		stack_kernel = 2;
+
+	val = bpf_get_stack(ctx, trace, sizeof(stack_trace_t), BPF_F_USER_STACK);
+	if (val > 0)
+		stack_user = 2;
+
+	return 0;
+}
+
+char LICENSE[] SEC("license") = "GPL";

From 9e5bd1f7633bc1c3c8b25496eedfeced6d2675ff Mon Sep 17 00:00:00 2001
From: YiFei Zhu
Date: Thu, 23 Jul 2020 23:47:42 -0500
Subject: [PATCH 24/73] selftests/bpf: Test CGROUP_STORAGE map can't be used by multiple progs

The current assumption is that the lifetime of a cgroup storage is
tied to the program's attachment. The storage is created in
cgroup_bpf_attach, and released upon cgroup_bpf_detach and
cgroup_bpf_release.

Because the current semantics is that each attachment gets a
completely independent cgroup storage, and multiple programs can be
attached to the same (cgroup, attach type) pair, which is also the key
of the CGROUP_STORAGE map, looking up the map with this pair could
yield multiple storages, and that is not permitted. Therefore, the
kernel verifier checks that two programs cannot share the same
CGROUP_STORAGE map, even if they have different expected attach types,
considering that the actual attach type does not always have to be
equal to the expected attach type.

The test creates a CGROUP_STORAGE map and makes it shared across two
different programs, one cgroup_skb/egress and one /ingress. It asserts
that the two programs cannot both be loaded, due to the verifier
failure described above.
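Condensed, the expected behavior the new subtest checks (a sketch
using the skeleton names from the test below):

    struct cg_storage_multi_egress_ingress *obj;

    /* Both programs reference the same CGROUP_STORAGE map, so the
     * kernel rejects the second program at load time and
     * open_and_load() fails with EBUSY.
     */
    obj = cg_storage_multi_egress_ingress__open_and_load();
    /* expected: obj == NULL and errno == EBUSY */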
Signed-off-by: YiFei Zhu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/30a6b0da67ae6b0296c4d511bfb19c5f3d035916.1595565795.git.zhuyifei@google.com --- .../bpf/prog_tests/cg_storage_multi.c | 43 ++++++++++++++---- .../selftests/bpf/progs/cg_storage_multi.h | 13 ++++++ .../progs/cg_storage_multi_egress_ingress.c | 45 +++++++++++++++++++ .../bpf/progs/cg_storage_multi_egress_only.c | 9 ++-- 4 files changed, 99 insertions(+), 11 deletions(-) create mode 100644 tools/testing/selftests/bpf/progs/cg_storage_multi.h create mode 100644 tools/testing/selftests/bpf/progs/cg_storage_multi_egress_ingress.c diff --git a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c index e90e0547d759..1c7653423698 100644 --- a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c @@ -8,7 +8,10 @@ #include #include +#include "progs/cg_storage_multi.h" + #include "cg_storage_multi_egress_only.skel.h" +#include "cg_storage_multi_egress_ingress.skel.h" #define PARENT_CGROUP "/cgroup_storage" #define CHILD_CGROUP "/cgroup_storage/child" @@ -16,10 +19,10 @@ static int duration; static bool assert_storage(struct bpf_map *map, const char *cgroup_path, - __u32 expected) + struct cgroup_value *expected) { struct bpf_cgroup_storage_key key = {0}; - __u32 value; + struct cgroup_value value; int map_fd; map_fd = bpf_map__fd(map); @@ -29,8 +32,8 @@ static bool assert_storage(struct bpf_map *map, const char *cgroup_path, if (CHECK(bpf_map_lookup_elem(map_fd, &key, &value) < 0, "map-lookup", "errno %d", errno)) return true; - if (CHECK(value != expected, - "assert-storage", "got %u expected %u", value, expected)) + if (CHECK(memcmp(&value, expected, sizeof(struct cgroup_value)), + "assert-storage", "storages differ")) return true; return false; @@ -39,7 +42,7 @@ static bool assert_storage(struct bpf_map *map, const char *cgroup_path, static bool assert_storage_noexist(struct bpf_map *map, const char *cgroup_path) { struct bpf_cgroup_storage_key key = {0}; - __u32 value; + struct cgroup_value value; int map_fd; map_fd = bpf_map__fd(map); @@ -86,6 +89,7 @@ out_clean: static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd) { struct cg_storage_multi_egress_only *obj; + struct cgroup_value expected_cgroup_value; struct bpf_link *parent_link = NULL, *child_link = NULL; bool err; @@ -109,7 +113,9 @@ static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd) if (CHECK(obj->bss->invocations != 1, "first-invoke", "invocations=%d", obj->bss->invocations)) goto close_bpf_object; - if (assert_storage(obj->maps.cgroup_storage, PARENT_CGROUP, 1)) + expected_cgroup_value = (struct cgroup_value) { .egress_pkts = 1 }; + if (assert_storage(obj->maps.cgroup_storage, + PARENT_CGROUP, &expected_cgroup_value)) goto close_bpf_object; if (assert_storage_noexist(obj->maps.cgroup_storage, CHILD_CGROUP)) goto close_bpf_object; @@ -129,9 +135,13 @@ static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd) if (CHECK(obj->bss->invocations != 3, "second-invoke", "invocations=%d", obj->bss->invocations)) goto close_bpf_object; - if (assert_storage(obj->maps.cgroup_storage, PARENT_CGROUP, 2)) + expected_cgroup_value = (struct cgroup_value) { .egress_pkts = 2 }; + if (assert_storage(obj->maps.cgroup_storage, + PARENT_CGROUP, &expected_cgroup_value)) goto close_bpf_object; - if (assert_storage(obj->maps.cgroup_storage, CHILD_CGROUP, 1)) + expected_cgroup_value = (struct 
cgroup_value) { .egress_pkts = 1 }; + if (assert_storage(obj->maps.cgroup_storage, + CHILD_CGROUP, &expected_cgroup_value)) goto close_bpf_object; close_bpf_object: @@ -141,6 +151,20 @@ close_bpf_object: cg_storage_multi_egress_only__destroy(obj); } +static void test_egress_ingress(int parent_cgroup_fd, int child_cgroup_fd) +{ + struct cg_storage_multi_egress_ingress *obj; + + /* Cannot load both programs due to verifier failure: + * "only one cgroup storage of each type is allowed" + */ + obj = cg_storage_multi_egress_ingress__open_and_load(); + CHECK(obj || errno != EBUSY, + "skel-load", "errno %d, expected EBUSY", errno); + + cg_storage_multi_egress_ingress__destroy(obj); +} + void test_cg_storage_multi(void) { int parent_cgroup_fd = -1, child_cgroup_fd = -1; @@ -155,6 +179,9 @@ void test_cg_storage_multi(void) if (test__start_subtest("egress_only")) test_egress_only(parent_cgroup_fd, child_cgroup_fd); + if (test__start_subtest("egress_ingress")) + test_egress_ingress(parent_cgroup_fd, child_cgroup_fd); + close_cgroup_fd: close(child_cgroup_fd); close(parent_cgroup_fd); diff --git a/tools/testing/selftests/bpf/progs/cg_storage_multi.h b/tools/testing/selftests/bpf/progs/cg_storage_multi.h new file mode 100644 index 000000000000..a0778fe7857a --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cg_storage_multi.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ + +#ifndef __PROGS_CG_STORAGE_MULTI_H +#define __PROGS_CG_STORAGE_MULTI_H + +#include + +struct cgroup_value { + __u32 egress_pkts; + __u32 ingress_pkts; +}; + +#endif diff --git a/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_ingress.c b/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_ingress.c new file mode 100644 index 000000000000..9ce386899365 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_ingress.c @@ -0,0 +1,45 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright 2020 Google LLC. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+#include "progs/cg_storage_multi.h"
+
+struct {
+	__uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
+	__type(key, struct bpf_cgroup_storage_key);
+	__type(value, struct cgroup_value);
+} cgroup_storage SEC(".maps");
+
+__u32 invocations = 0;
+
+SEC("cgroup_skb/egress")
+int egress(struct __sk_buff *skb)
+{
+	struct cgroup_value *ptr_cg_storage =
+		bpf_get_local_storage(&cgroup_storage, 0);
+
+	__sync_fetch_and_add(&ptr_cg_storage->egress_pkts, 1);
+	__sync_fetch_and_add(&invocations, 1);
+
+	return 1;
+}
+
+SEC("cgroup_skb/ingress")
+int ingress(struct __sk_buff *skb)
+{
+	struct cgroup_value *ptr_cg_storage =
+		bpf_get_local_storage(&cgroup_storage, 0);
+
+	__sync_fetch_and_add(&ptr_cg_storage->ingress_pkts, 1);
+	__sync_fetch_and_add(&invocations, 1);
+
+	return 1;
+}
diff --git a/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_only.c b/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_only.c
index ec0165d07105..44ad46b33539 100644
--- a/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_only.c
+++ b/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_only.c
@@ -10,10 +10,12 @@
 #include
 #include

+#include "progs/cg_storage_multi.h"
+
 struct {
 	__uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
 	__type(key, struct bpf_cgroup_storage_key);
-	__type(value, __u32);
+	__type(value, struct cgroup_value);
 } cgroup_storage SEC(".maps");

 __u32 invocations = 0;

@@ -21,9 +23,10 @@ __u32 invocations = 0;
 SEC("cgroup_skb/egress")
 int egress(struct __sk_buff *skb)
 {
-	__u32 *ptr_cg_storage = bpf_get_local_storage(&cgroup_storage, 0);
+	struct cgroup_value *ptr_cg_storage =
+		bpf_get_local_storage(&cgroup_storage, 0);

-	__sync_fetch_and_add(ptr_cg_storage, 1);
+	__sync_fetch_and_add(&ptr_cg_storage->egress_pkts, 1);
 	__sync_fetch_and_add(&invocations, 1);

 	return 1;

From 346938e9380cc0b2ad8e2566389cdc570386fe22 Mon Sep 17 00:00:00 2001
From: Song Liu
Date: Thu, 23 Jul 2020 11:06:48 -0700
Subject: [PATCH 25/73] selftests/bpf: Add get_stackid_cannot_attach

This test confirms that a BPF program that calls bpf_get_stackid()
cannot attach to a perf_event with precise_ip > 0 but without
PERF_SAMPLE_CALLCHAIN; and cannot attach if the perf_event has
exclude_callchain_kernel set.
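In summary, the three attach attempts the test makes, all against a
precise_ip > 0 hardware-cycles event (a sketch of the outcomes, not
verbatim from the test):

    /* sample_type has CALLCHAIN | exclude_callchain_kernel | attach   */
    /* no                        | 0                         | fails    */
    /* yes                       | 0                         | succeeds */
    /* yes                       | 1                         | fails    */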
Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200723180648.1429892-6-songliubraving@fb.com --- .../prog_tests/get_stackid_cannot_attach.c | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c diff --git a/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c b/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c new file mode 100644 index 000000000000..d884b2ed5bc5 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/get_stackid_cannot_attach.c @@ -0,0 +1,91 @@ +// SPDX-License-Identifier: GPL-2.0 +// Copyright (c) 2020 Facebook +#include +#include "test_stacktrace_build_id.skel.h" + +void test_get_stackid_cannot_attach(void) +{ + struct perf_event_attr attr = { + /* .type = PERF_TYPE_SOFTWARE, */ + .type = PERF_TYPE_HARDWARE, + .config = PERF_COUNT_HW_CPU_CYCLES, + .precise_ip = 1, + .sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_BRANCH_STACK, + .branch_sample_type = PERF_SAMPLE_BRANCH_USER | + PERF_SAMPLE_BRANCH_NO_FLAGS | + PERF_SAMPLE_BRANCH_NO_CYCLES | + PERF_SAMPLE_BRANCH_CALL_STACK, + .sample_period = 5000, + .size = sizeof(struct perf_event_attr), + }; + struct test_stacktrace_build_id *skel; + __u32 duration = 0; + int pmu_fd, err; + + skel = test_stacktrace_build_id__open(); + if (CHECK(!skel, "skel_open", "skeleton open failed\n")) + return; + + /* override program type */ + bpf_program__set_perf_event(skel->progs.oncpu); + + err = test_stacktrace_build_id__load(skel); + if (CHECK(err, "skel_load", "skeleton load failed: %d\n", err)) + goto cleanup; + + pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, + 0 /* cpu 0 */, -1 /* group id */, + 0 /* flags */); + if (pmu_fd < 0 && (errno == ENOENT || errno == EOPNOTSUPP)) { + printf("%s:SKIP:cannot open PERF_COUNT_HW_CPU_CYCLES with precise_ip > 0\n", + __func__); + test__skip(); + goto cleanup; + } + if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", + pmu_fd, errno)) + goto cleanup; + + skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu, + pmu_fd); + CHECK(!IS_ERR(skel->links.oncpu), "attach_perf_event_no_callchain", + "should have failed\n"); + close(pmu_fd); + + /* add PERF_SAMPLE_CALLCHAIN, attach should succeed */ + attr.sample_type |= PERF_SAMPLE_CALLCHAIN; + + pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, + 0 /* cpu 0 */, -1 /* group id */, + 0 /* flags */); + + if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", + pmu_fd, errno)) + goto cleanup; + + skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu, + pmu_fd); + CHECK(IS_ERR(skel->links.oncpu), "attach_perf_event_callchain", + "err: %ld\n", PTR_ERR(skel->links.oncpu)); + close(pmu_fd); + + /* add exclude_callchain_kernel, attach should fail */ + attr.exclude_callchain_kernel = 1; + + pmu_fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, + 0 /* cpu 0 */, -1 /* group id */, + 0 /* flags */); + + if (CHECK(pmu_fd < 0, "perf_event_open", "err %d errno %d\n", + pmu_fd, errno)) + goto cleanup; + + skel->links.oncpu = bpf_program__attach_perf_event(skel->progs.oncpu, + pmu_fd); + CHECK(!IS_ERR(skel->links.oncpu), "attach_perf_event_exclude_callchain_kernel", + "should have failed\n"); + close(pmu_fd); + +cleanup: + test_stacktrace_build_id__destroy(skel); +} From 7d9c3427894fe70d1347b4820476bf37736d2ff0 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 23 Jul 2020 23:47:43 -0500 Subject: [PATCH 26/73] bpf: Make cgroup storages 
 shared between programs on the same cgroup

This change comes in several parts:

One, the restriction that the CGROUP_STORAGE map can only be used by
one program is removed. This results in the removal of the field 'aux'
in struct bpf_cgroup_storage_map, and removal of relevant code
associated with the field, and removal of now-noop functions
bpf_free_cgroup_storage and bpf_cgroup_storage_release.

Second, we permit a key of type u64 as the key to the map. Providing
such a key type indicates that the map should ignore attach type when
comparing map keys. However, for simplicity newly linked storage will
still have the attach type at link time in its key struct.
cgroup_storage_check_btf is adapted to accept u64 as the type of the
key.

Third, because the storages are now shared, the storages cannot be
unconditionally freed on program detach. There could be two ways to
solve this issue:
* A. Reference count the usage of the storages, and free when the
     last program is detached.
* B. Free only when the storage can no longer be referred to, i.e.
     when either the cgroup_bpf it is attached to, or the map itself,
     is freed.

Option A has the side effect that, when the user detaches and
reattaches a program, whether the program gets a fresh storage depends
on whether there is another program attached using that storage. This
could trigger races if the user is multi-threaded, and since
nondeterminism in data races is evil, go with option B.

Both the map and the cgroup_bpf now track their associated storages,
and the storage unlink and free are removed from cgroup_bpf_detach and
added to cgroup_bpf_release and cgroup_storage_map_free. The latter
also now holds the cgroup_mutex to prevent any races with the former.

Fourth, on attach, we reuse the old storage if the key already exists
in the map, via cgroup_storage_lookup. If the storage does not exist
yet, we create a new one, and publish it at the last step in the
attach process. This does not create a race condition because for the
whole attach the cgroup_mutex is held. We keep track of an array of
the newly allocated storages, and if the attach process fails, only
those new storages are freed.
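To illustrate the new key rule (a sketch; the selftests added later in
this series use the same pattern, and struct cgroup_value is a
selftest-local type): declaring the map key as __u64 opts into the
shared, attach-type-agnostic storage, while keeping
struct bpf_cgroup_storage_key preserves the old per-(cgroup, attach
type) isolation:

    /* shared across attach types within a cgroup; key is cgroup id only */
    struct {
        __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
        __type(key, __u64);
        __type(value, struct cgroup_value);
    } cgroup_storage SEC(".maps");

    /* isolated per (cgroup, attach type), as before */
    struct {
        __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
        __type(key, struct bpf_cgroup_storage_key);
        __type(value, struct cgroup_value);
    } cgroup_storage_isolated SEC(".maps");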
Signed-off-by: YiFei Zhu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/d5401c6106728a00890401190db40020a1f84ff1.1595565795.git.zhuyifei@google.com --- include/linux/bpf-cgroup.h | 12 ++- kernel/bpf/cgroup.c | 67 +++++++----- kernel/bpf/core.c | 12 --- kernel/bpf/local_storage.c | 214 ++++++++++++++++++++----------------- 4 files changed, 163 insertions(+), 142 deletions(-) diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h index 2c6f26670acc..64f367044e25 100644 --- a/include/linux/bpf-cgroup.h +++ b/include/linux/bpf-cgroup.h @@ -46,7 +46,8 @@ struct bpf_cgroup_storage { }; struct bpf_cgroup_storage_map *map; struct bpf_cgroup_storage_key key; - struct list_head list; + struct list_head list_map; + struct list_head list_cg; struct rb_node node; struct rcu_head rcu; }; @@ -78,6 +79,9 @@ struct cgroup_bpf { struct list_head progs[MAX_BPF_ATTACH_TYPE]; u32 flags[MAX_BPF_ATTACH_TYPE]; + /* list of cgroup shared storages */ + struct list_head storages; + /* temp storage for effective prog array used by prog_attach/detach */ struct bpf_prog_array *inactive; @@ -161,6 +165,9 @@ static inline void bpf_cgroup_storage_set(struct bpf_cgroup_storage this_cpu_write(bpf_cgroup_storage[stype], storage[stype]); } +struct bpf_cgroup_storage * +cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, + void *key, bool locked); struct bpf_cgroup_storage *bpf_cgroup_storage_alloc(struct bpf_prog *prog, enum bpf_cgroup_storage_type stype); void bpf_cgroup_storage_free(struct bpf_cgroup_storage *storage); @@ -169,7 +176,6 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, enum bpf_attach_type type); void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage); int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map); -void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *map); int bpf_percpu_cgroup_storage_copy(struct bpf_map *map, void *key, void *value); int bpf_percpu_cgroup_storage_update(struct bpf_map *map, void *key, @@ -383,8 +389,6 @@ static inline void bpf_cgroup_storage_set( struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE]) {} static inline int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *map) { return 0; } -static inline void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, - struct bpf_map *map) {} static inline struct bpf_cgroup_storage *bpf_cgroup_storage_alloc( struct bpf_prog *prog, enum bpf_cgroup_storage_type stype) { return NULL; } static inline void bpf_cgroup_storage_free( diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index ac53102e244a..957cce1d5168 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -37,17 +37,34 @@ static void bpf_cgroup_storages_free(struct bpf_cgroup_storage *storages[]) } static int bpf_cgroup_storages_alloc(struct bpf_cgroup_storage *storages[], - struct bpf_prog *prog) + struct bpf_cgroup_storage *new_storages[], + enum bpf_attach_type type, + struct bpf_prog *prog, + struct cgroup *cgrp) { enum bpf_cgroup_storage_type stype; + struct bpf_cgroup_storage_key key; + struct bpf_map *map; + + key.cgroup_inode_id = cgroup_id(cgrp); + key.attach_type = type; for_each_cgroup_storage_type(stype) { + map = prog->aux->cgroup_storage[stype]; + if (!map) + continue; + + storages[stype] = cgroup_storage_lookup((void *)map, &key, false); + if (storages[stype]) + continue; + storages[stype] = bpf_cgroup_storage_alloc(prog, stype); if (IS_ERR(storages[stype])) { - storages[stype] = NULL; - 
bpf_cgroup_storages_free(storages); + bpf_cgroup_storages_free(new_storages); return -ENOMEM; } + + new_storages[stype] = storages[stype]; } return 0; @@ -63,7 +80,7 @@ static void bpf_cgroup_storages_assign(struct bpf_cgroup_storage *dst[], } static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], - struct cgroup* cgrp, + struct cgroup *cgrp, enum bpf_attach_type attach_type) { enum bpf_cgroup_storage_type stype; @@ -72,14 +89,6 @@ static void bpf_cgroup_storages_link(struct bpf_cgroup_storage *storages[], bpf_cgroup_storage_link(storages[stype], cgrp, attach_type); } -static void bpf_cgroup_storages_unlink(struct bpf_cgroup_storage *storages[]) -{ - enum bpf_cgroup_storage_type stype; - - for_each_cgroup_storage_type(stype) - bpf_cgroup_storage_unlink(storages[stype]); -} - /* Called when bpf_cgroup_link is auto-detached from dying cgroup. * It drops cgroup and bpf_prog refcounts, and marks bpf_link as defunct. It * doesn't free link memory, which will eventually be done by bpf_link's @@ -101,22 +110,23 @@ static void cgroup_bpf_release(struct work_struct *work) struct cgroup *p, *cgrp = container_of(work, struct cgroup, bpf.release_work); struct bpf_prog_array *old_array; + struct list_head *storages = &cgrp->bpf.storages; + struct bpf_cgroup_storage *storage, *stmp; + unsigned int type; mutex_lock(&cgroup_mutex); for (type = 0; type < ARRAY_SIZE(cgrp->bpf.progs); type++) { struct list_head *progs = &cgrp->bpf.progs[type]; - struct bpf_prog_list *pl, *tmp; + struct bpf_prog_list *pl, *pltmp; - list_for_each_entry_safe(pl, tmp, progs, node) { + list_for_each_entry_safe(pl, pltmp, progs, node) { list_del(&pl->node); if (pl->prog) bpf_prog_put(pl->prog); if (pl->link) bpf_cgroup_link_auto_detach(pl->link); - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_free(pl->storage); kfree(pl); static_branch_dec(&cgroup_bpf_enabled_key); } @@ -126,6 +136,11 @@ static void cgroup_bpf_release(struct work_struct *work) bpf_prog_array_free(old_array); } + list_for_each_entry_safe(storage, stmp, storages, list_cg) { + bpf_cgroup_storage_unlink(storage); + bpf_cgroup_storage_free(storage); + } + mutex_unlock(&cgroup_mutex); for (p = cgroup_parent(cgrp); p; p = cgroup_parent(p)) @@ -290,6 +305,8 @@ int cgroup_bpf_inherit(struct cgroup *cgrp) for (i = 0; i < NR; i++) INIT_LIST_HEAD(&cgrp->bpf.progs[i]); + INIT_LIST_HEAD(&cgrp->bpf.storages); + for (i = 0; i < NR; i++) if (compute_effective_progs(cgrp, i, &arrays[i])) goto cleanup; @@ -422,7 +439,7 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, struct list_head *progs = &cgrp->bpf.progs[type]; struct bpf_prog *old_prog = NULL; struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; - struct bpf_cgroup_storage *old_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; + struct bpf_cgroup_storage *new_storage[MAX_BPF_CGROUP_STORAGE_TYPE] = {}; struct bpf_prog_list *pl; int err; @@ -455,17 +472,16 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (IS_ERR(pl)) return PTR_ERR(pl); - if (bpf_cgroup_storages_alloc(storage, prog ? : link->link.prog)) + if (bpf_cgroup_storages_alloc(storage, new_storage, type, + prog ? 
: link->link.prog, cgrp)) return -ENOMEM; if (pl) { old_prog = pl->prog; - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_assign(old_storage, pl->storage); } else { pl = kmalloc(sizeof(*pl), GFP_KERNEL); if (!pl) { - bpf_cgroup_storages_free(storage); + bpf_cgroup_storages_free(new_storage); return -ENOMEM; } list_add_tail(&pl->node, progs); @@ -480,12 +496,11 @@ int __cgroup_bpf_attach(struct cgroup *cgrp, if (err) goto cleanup; - bpf_cgroup_storages_free(old_storage); if (old_prog) bpf_prog_put(old_prog); else static_branch_inc(&cgroup_bpf_enabled_key); - bpf_cgroup_storages_link(pl->storage, cgrp, type); + bpf_cgroup_storages_link(new_storage, cgrp, type); return 0; cleanup: @@ -493,9 +508,7 @@ cleanup: pl->prog = old_prog; pl->link = NULL; } - bpf_cgroup_storages_free(pl->storage); - bpf_cgroup_storages_assign(pl->storage, old_storage); - bpf_cgroup_storages_link(pl->storage, cgrp, type); + bpf_cgroup_storages_free(new_storage); if (!old_prog) { list_del(&pl->node); kfree(pl); @@ -679,8 +692,6 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog, /* now can actually delete it from this cgroup list */ list_del(&pl->node); - bpf_cgroup_storages_unlink(pl->storage); - bpf_cgroup_storages_free(pl->storage); kfree(pl); if (list_empty(progs)) /* last program was detached, reset flags to zero */ diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7be02e555ab9..bde93344164d 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2097,24 +2097,12 @@ int bpf_prog_array_copy_info(struct bpf_prog_array *array, : 0; } -static void bpf_free_cgroup_storage(struct bpf_prog_aux *aux) -{ - enum bpf_cgroup_storage_type stype; - - for_each_cgroup_storage_type(stype) { - if (!aux->cgroup_storage[stype]) - continue; - bpf_cgroup_storage_release(aux, aux->cgroup_storage[stype]); - } -} - void __bpf_free_used_maps(struct bpf_prog_aux *aux, struct bpf_map **used_maps, u32 len) { struct bpf_map *map; u32 i; - bpf_free_cgroup_storage(aux); for (i = 0; i < len; i++) { map = used_maps[i]; if (map->ops->map_poke_untrack) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 51bd5a8cb01b..3b2c70197d78 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -9,6 +9,8 @@ #include #include +#include "../cgroup/cgroup-internal.h" + DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #ifdef CONFIG_CGROUP_BPF @@ -20,7 +22,6 @@ struct bpf_cgroup_storage_map { struct bpf_map map; spinlock_t lock; - struct bpf_prog_aux *aux; struct rb_root root; struct list_head list; }; @@ -30,24 +31,41 @@ static struct bpf_cgroup_storage_map *map_to_storage(struct bpf_map *map) return container_of(map, struct bpf_cgroup_storage_map, map); } -static int bpf_cgroup_storage_key_cmp( - const struct bpf_cgroup_storage_key *key1, - const struct bpf_cgroup_storage_key *key2) +static bool attach_type_isolated(const struct bpf_map *map) { - if (key1->cgroup_inode_id < key2->cgroup_inode_id) - return -1; - else if (key1->cgroup_inode_id > key2->cgroup_inode_id) - return 1; - else if (key1->attach_type < key2->attach_type) - return -1; - else if (key1->attach_type > key2->attach_type) - return 1; + return map->key_size == sizeof(struct bpf_cgroup_storage_key); +} + +static int bpf_cgroup_storage_key_cmp(const struct bpf_cgroup_storage_map *map, + const void *_key1, const void *_key2) +{ + if (attach_type_isolated(&map->map)) { + const struct bpf_cgroup_storage_key *key1 = _key1; + const struct bpf_cgroup_storage_key *key2 = 
_key2; + + if (key1->cgroup_inode_id < key2->cgroup_inode_id) + return -1; + else if (key1->cgroup_inode_id > key2->cgroup_inode_id) + return 1; + else if (key1->attach_type < key2->attach_type) + return -1; + else if (key1->attach_type > key2->attach_type) + return 1; + } else { + const __u64 *cgroup_inode_id1 = _key1; + const __u64 *cgroup_inode_id2 = _key2; + + if (*cgroup_inode_id1 < *cgroup_inode_id2) + return -1; + else if (*cgroup_inode_id1 > *cgroup_inode_id2) + return 1; + } return 0; } -static struct bpf_cgroup_storage *cgroup_storage_lookup( - struct bpf_cgroup_storage_map *map, struct bpf_cgroup_storage_key *key, - bool locked) +struct bpf_cgroup_storage * +cgroup_storage_lookup(struct bpf_cgroup_storage_map *map, + void *key, bool locked) { struct rb_root *root = &map->root; struct rb_node *node; @@ -61,7 +79,7 @@ static struct bpf_cgroup_storage *cgroup_storage_lookup( storage = container_of(node, struct bpf_cgroup_storage, node); - switch (bpf_cgroup_storage_key_cmp(key, &storage->key)) { + switch (bpf_cgroup_storage_key_cmp(map, key, &storage->key)) { case -1: node = node->rb_left; break; @@ -93,7 +111,7 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, this = container_of(*new, struct bpf_cgroup_storage, node); parent = *new; - switch (bpf_cgroup_storage_key_cmp(&storage->key, &this->key)) { + switch (bpf_cgroup_storage_key_cmp(map, &storage->key, &this->key)) { case -1: new = &((*new)->rb_left); break; @@ -111,10 +129,9 @@ static int cgroup_storage_insert(struct bpf_cgroup_storage_map *map, return 0; } -static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) +static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *key) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; storage = cgroup_storage_lookup(map, key, false); @@ -124,17 +141,13 @@ static void *cgroup_storage_lookup_elem(struct bpf_map *_map, void *_key) return &READ_ONCE(storage->buf)->data[0]; } -static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, +static int cgroup_storage_update_elem(struct bpf_map *map, void *key, void *value, u64 flags) { - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; struct bpf_storage_buffer *new; - if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST | BPF_NOEXIST))) - return -EINVAL; - - if (unlikely(flags & BPF_NOEXIST)) + if (unlikely(flags & ~(BPF_F_LOCK | BPF_EXIST))) return -EINVAL; if (unlikely((flags & BPF_F_LOCK) && @@ -167,11 +180,10 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *_key, return 0; } -int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, +int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *key, void *value) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu, off = 0; u32 size; @@ -197,11 +209,10 @@ int bpf_percpu_cgroup_storage_copy(struct bpf_map *_map, void *_key, return 0; } -int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, +int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *key, void *value, u64 map_flags) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu, off = 0; u32 size; @@ -232,12 +243,10 @@ int bpf_percpu_cgroup_storage_update(struct bpf_map *_map, void *_key, return 0; } -static int 
cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, +static int cgroup_storage_get_next_key(struct bpf_map *_map, void *key, void *_next_key) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); - struct bpf_cgroup_storage_key *key = _key; - struct bpf_cgroup_storage_key *next = _next_key; struct bpf_cgroup_storage *storage; spin_lock_bh(&map->lock); @@ -250,17 +259,23 @@ static int cgroup_storage_get_next_key(struct bpf_map *_map, void *_key, if (!storage) goto enoent; - storage = list_next_entry(storage, list); + storage = list_next_entry(storage, list_map); if (!storage) goto enoent; } else { storage = list_first_entry(&map->list, - struct bpf_cgroup_storage, list); + struct bpf_cgroup_storage, list_map); } spin_unlock_bh(&map->lock); - next->attach_type = storage->key.attach_type; - next->cgroup_inode_id = storage->key.cgroup_inode_id; + + if (attach_type_isolated(&map->map)) { + struct bpf_cgroup_storage_key *next = _next_key; + *next = storage->key; + } else { + __u64 *next = _next_key; + *next = storage->key.cgroup_inode_id; + } return 0; enoent: @@ -275,7 +290,8 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) struct bpf_map_memory mem; int ret; - if (attr->key_size != sizeof(struct bpf_cgroup_storage_key)) + if (attr->key_size != sizeof(struct bpf_cgroup_storage_key) && + attr->key_size != sizeof(__u64)) return ERR_PTR(-EINVAL); if (attr->value_size == 0) @@ -318,6 +334,17 @@ static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr) static void cgroup_storage_map_free(struct bpf_map *_map) { struct bpf_cgroup_storage_map *map = map_to_storage(_map); + struct list_head *storages = &map->list; + struct bpf_cgroup_storage *storage, *stmp; + + mutex_lock(&cgroup_mutex); + + list_for_each_entry_safe(storage, stmp, storages, list_map) { + bpf_cgroup_storage_unlink(storage); + bpf_cgroup_storage_free(storage); + } + + mutex_unlock(&cgroup_mutex); WARN_ON(!RB_EMPTY_ROOT(&map->root)); WARN_ON(!list_empty(&map->list)); @@ -335,49 +362,63 @@ static int cgroup_storage_check_btf(const struct bpf_map *map, const struct btf_type *key_type, const struct btf_type *value_type) { - struct btf_member *m; - u32 offset, size; + if (attach_type_isolated(map)) { + struct btf_member *m; + u32 offset, size; - /* Key is expected to be of struct bpf_cgroup_storage_key type, - * which is: - * struct bpf_cgroup_storage_key { - * __u64 cgroup_inode_id; - * __u32 attach_type; - * }; - */ + /* Key is expected to be of struct bpf_cgroup_storage_key type, + * which is: + * struct bpf_cgroup_storage_key { + * __u64 cgroup_inode_id; + * __u32 attach_type; + * }; + */ - /* - * Key_type must be a structure with two fields. - */ - if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || - BTF_INFO_VLEN(key_type->info) != 2) - return -EINVAL; + /* + * Key_type must be a structure with two fields. + */ + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_STRUCT || + BTF_INFO_VLEN(key_type->info) != 2) + return -EINVAL; - /* - * The first field must be a 64 bit integer at 0 offset. - */ - m = (struct btf_member *)(key_type + 1); - size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); - if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) - return -EINVAL; + /* + * The first field must be a 64 bit integer at 0 offset. 
+ */ + m = (struct btf_member *)(key_type + 1); + size = sizeof_field(struct bpf_cgroup_storage_key, cgroup_inode_id); + if (!btf_member_is_reg_int(btf, key_type, m, 0, size)) + return -EINVAL; - /* - * The second field must be a 32 bit integer at 64 bit offset. - */ - m++; - offset = offsetof(struct bpf_cgroup_storage_key, attach_type); - size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); - if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) - return -EINVAL; + /* + * The second field must be a 32 bit integer at 64 bit offset. + */ + m++; + offset = offsetof(struct bpf_cgroup_storage_key, attach_type); + size = sizeof_field(struct bpf_cgroup_storage_key, attach_type); + if (!btf_member_is_reg_int(btf, key_type, m, offset, size)) + return -EINVAL; + } else { + u32 int_data; + + /* + * Key is expected to be u64, which stores the cgroup_inode_id + */ + + if (BTF_INFO_KIND(key_type->info) != BTF_KIND_INT) + return -EINVAL; + + int_data = *(u32 *)(key_type + 1); + if (BTF_INT_BITS(int_data) != 64 || BTF_INT_OFFSET(int_data)) + return -EINVAL; + } return 0; } -static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *_key, +static void cgroup_storage_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(map); - struct bpf_cgroup_storage_key *key = _key; struct bpf_cgroup_storage *storage; int cpu; @@ -426,38 +467,13 @@ const struct bpf_map_ops cgroup_storage_map_ops = { int bpf_cgroup_storage_assign(struct bpf_prog_aux *aux, struct bpf_map *_map) { enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - int ret = -EBUSY; - spin_lock_bh(&map->lock); - - if (map->aux && map->aux != aux) - goto unlock; if (aux->cgroup_storage[stype] && aux->cgroup_storage[stype] != _map) - goto unlock; + return -EBUSY; - map->aux = aux; aux->cgroup_storage[stype] = _map; - ret = 0; -unlock: - spin_unlock_bh(&map->lock); - - return ret; -} - -void bpf_cgroup_storage_release(struct bpf_prog_aux *aux, struct bpf_map *_map) -{ - enum bpf_cgroup_storage_type stype = cgroup_storage_type(_map); - struct bpf_cgroup_storage_map *map = map_to_storage(_map); - - spin_lock_bh(&map->lock); - if (map->aux == aux) { - WARN_ON(aux->cgroup_storage[stype] != _map); - map->aux = NULL; - aux->cgroup_storage[stype] = NULL; - } - spin_unlock_bh(&map->lock); + return 0; } static size_t bpf_cgroup_storage_calculate_size(struct bpf_map *map, u32 *pages) @@ -578,7 +594,8 @@ void bpf_cgroup_storage_link(struct bpf_cgroup_storage *storage, spin_lock_bh(&map->lock); WARN_ON(cgroup_storage_insert(map, storage)); - list_add(&storage->list, &map->list); + list_add(&storage->list_map, &map->list); + list_add(&storage->list_cg, &cgroup->bpf.storages); spin_unlock_bh(&map->lock); } @@ -596,7 +613,8 @@ void bpf_cgroup_storage_unlink(struct bpf_cgroup_storage *storage) root = &map->root; rb_erase(&storage->node, root); - list_del(&storage->list); + list_del(&storage->list_map); + list_del(&storage->list_cg); spin_unlock_bh(&map->lock); } From 3573f384014f51fd5289df0e8369b63ae7fdc244 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 23 Jul 2020 23:47:44 -0500 Subject: [PATCH 27/73] selftests/bpf: Test CGROUP_STORAGE behavior on shared egress + ingress This mirrors the original egress-only test. The cgroup_storage is now extended to have two packet counters, one for egress and one for ingress. 
We also extend the test to have two egress programs, to check that an
egress program will always share storage with the other egress
programs in the same cgroup. The behavior of the counters is exactly
the same as in the original egress-only test.

The test is split into two: an "isolated" test verifying that, when
the key type is struct bpf_cgroup_storage_key (which contains the
attach type), programs of different attach types will see different
storages; and a "shared" test verifying that, when the key type is
u64, programs of different attach types will see the same storage if
they are attached to the same cgroup.

Signed-off-by: YiFei Zhu
Signed-off-by: Alexei Starovoitov
Link: https://lore.kernel.org/bpf/c756f5f1521227b8e6e90a453299dda722d7324d.1595565795.git.zhuyifei@google.com
---
 .../bpf/prog_tests/cg_storage_multi.c         | 265 ++++++++++++++++--
 ..._ingress.c => cg_storage_multi_isolated.c} |  16 +-
 .../bpf/progs/cg_storage_multi_shared.c       |  57 ++++
 3 files changed, 311 insertions(+), 27 deletions(-)
 rename tools/testing/selftests/bpf/progs/{cg_storage_multi_egress_ingress.c => cg_storage_multi_isolated.c} (73%)
 create mode 100644 tools/testing/selftests/bpf/progs/cg_storage_multi_shared.c

diff --git a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c
index 1c7653423698..c67d8c076a34 100644
--- a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c
+++ b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c
@@ -11,25 +11,23 @@
 #include "progs/cg_storage_multi.h"

 #include "cg_storage_multi_egress_only.skel.h"
-#include "cg_storage_multi_egress_ingress.skel.h"
+#include "cg_storage_multi_isolated.skel.h"
+#include "cg_storage_multi_shared.skel.h"

 #define PARENT_CGROUP "/cgroup_storage"
 #define CHILD_CGROUP "/cgroup_storage/child"

 static int duration;

-static bool assert_storage(struct bpf_map *map, const char *cgroup_path,
+static bool assert_storage(struct bpf_map *map, const void *key,
 			   struct cgroup_value *expected)
 {
-	struct bpf_cgroup_storage_key key = {0};
 	struct cgroup_value value;
 	int map_fd;

 	map_fd = bpf_map__fd(map);

-	key.cgroup_inode_id = get_cgroup_id(cgroup_path);
-	key.attach_type = BPF_CGROUP_INET_EGRESS;
-	if (CHECK(bpf_map_lookup_elem(map_fd, &key, &value) < 0,
+	if (CHECK(bpf_map_lookup_elem(map_fd, key, &value) < 0,
 		  "map-lookup", "errno %d", errno))
 		return true;
 	if (CHECK(memcmp(&value, expected, sizeof(struct cgroup_value)),
@@ -39,17 +37,14 @@ static bool assert_storage(struct bpf_map *map, const char *cgroup_path,
 	return false;
 }

-static bool assert_storage_noexist(struct bpf_map *map, const char *cgroup_path)
+static bool assert_storage_noexist(struct bpf_map *map, const void *key)
 {
-	struct bpf_cgroup_storage_key key = {0};
 	struct cgroup_value value;
 	int map_fd;

 	map_fd = bpf_map__fd(map);

-	key.cgroup_inode_id = get_cgroup_id(cgroup_path);
-	key.attach_type = BPF_CGROUP_INET_EGRESS;
-	if (CHECK(bpf_map_lookup_elem(map_fd, &key, &value) == 0,
+	if (CHECK(bpf_map_lookup_elem(map_fd, key, &value) == 0,
 		  "map-lookup", "succeeded, expected ENOENT"))
 		return true;
 	if (CHECK(errno != ENOENT,
@@ -90,9 +85,12 @@ static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd)
 {
 	struct cg_storage_multi_egress_only *obj;
 	struct cgroup_value expected_cgroup_value;
+	struct bpf_cgroup_storage_key key;
 	struct bpf_link *parent_link = NULL, *child_link = NULL;
 	bool err;

+	key.attach_type = BPF_CGROUP_INET_EGRESS;
+
 	obj = cg_storage_multi_egress_only__open_and_load();
 	if (CHECK(!obj, "skel-load", "errno %d", errno))
 		return;
@@ -113,11 +111,13 @@ static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd)
 	if (CHECK(obj->bss->invocations != 1,
 		  "first-invoke", "invocations=%d", obj->bss->invocations))
 		goto close_bpf_object;
+	key.cgroup_inode_id = get_cgroup_id(PARENT_CGROUP);
 	expected_cgroup_value = (struct cgroup_value) { .egress_pkts = 1 };
 	if (assert_storage(obj->maps.cgroup_storage,
-			   PARENT_CGROUP, &expected_cgroup_value))
+			   &key, &expected_cgroup_value))
 		goto close_bpf_object;
-	if (assert_storage_noexist(obj->maps.cgroup_storage, CHILD_CGROUP))
+	key.cgroup_inode_id = get_cgroup_id(CHILD_CGROUP);
+	if (assert_storage_noexist(obj->maps.cgroup_storage, &key))
 		goto close_bpf_object;

 	/* Attach to parent and child cgroup, trigger packet from child.
@@ -135,13 +135,15 @@ static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd)
 	if (CHECK(obj->bss->invocations != 3,
 		  "second-invoke", "invocations=%d", obj->bss->invocations))
 		goto close_bpf_object;
+	key.cgroup_inode_id = get_cgroup_id(PARENT_CGROUP);
 	expected_cgroup_value = (struct cgroup_value) { .egress_pkts = 2 };
 	if (assert_storage(obj->maps.cgroup_storage,
-			   PARENT_CGROUP, &expected_cgroup_value))
+			   &key, &expected_cgroup_value))
 		goto close_bpf_object;
+	key.cgroup_inode_id = get_cgroup_id(CHILD_CGROUP);
 	expected_cgroup_value = (struct cgroup_value) { .egress_pkts = 1 };
 	if (assert_storage(obj->maps.cgroup_storage,
-			   CHILD_CGROUP, &expected_cgroup_value))
+			   &key, &expected_cgroup_value))
 		goto close_bpf_object;

 close_bpf_object:
@@ -151,18 +153,228 @@ close_bpf_object:
 	cg_storage_multi_egress_only__destroy(obj);
 }

-static void test_egress_ingress(int parent_cgroup_fd, int child_cgroup_fd)
+static void test_isolated(int parent_cgroup_fd, int child_cgroup_fd)
 {
-	struct cg_storage_multi_egress_ingress *obj;
+	struct cg_storage_multi_isolated *obj;
+	struct cgroup_value expected_cgroup_value;
+	struct bpf_cgroup_storage_key key;
+	struct bpf_link *parent_egress1_link = NULL, *parent_egress2_link = NULL;
+	struct bpf_link *child_egress1_link = NULL, *child_egress2_link = NULL;
+	struct bpf_link *parent_ingress_link = NULL, *child_ingress_link = NULL;
+	bool err;

-	/* Cannot load both programs due to verifier failure:
-	 * "only one cgroup storage of each type is allowed"
+	obj = cg_storage_multi_isolated__open_and_load();
+	if (CHECK(!obj, "skel-load", "errno %d", errno))
+		return;
+
+	/* Attach to parent cgroup, trigger packet from child.
+	 * Assert that there are three runs, two with parent cgroup egress and
+	 * one with parent cgroup ingress, stored in separate parent storages.
+ * Also assert that the child cgroup's storages do not exist */ - obj = cg_storage_multi_egress_ingress__open_and_load(); - CHECK(obj || errno != EBUSY, - "skel-load", "errno %d, expected EBUSY", errno); + parent_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1, + parent_cgroup_fd); + if (CHECK(IS_ERR(parent_egress1_link), "parent-egress1-cg-attach", + "err %ld", PTR_ERR(parent_egress1_link))) + goto close_bpf_object; + parent_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2, + parent_cgroup_fd); + if (CHECK(IS_ERR(parent_egress2_link), "parent-egress2-cg-attach", + "err %ld", PTR_ERR(parent_egress2_link))) + goto close_bpf_object; + parent_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress, + parent_cgroup_fd); + if (CHECK(IS_ERR(parent_ingress_link), "parent-ingress-cg-attach", + "err %ld", PTR_ERR(parent_ingress_link))) + goto close_bpf_object; + err = connect_send(CHILD_CGROUP); + if (CHECK(err, "first-connect-send", "errno %d", errno)) + goto close_bpf_object; + if (CHECK(obj->bss->invocations != 3, + "first-invoke", "invocations=%d", obj->bss->invocations)) + goto close_bpf_object; + key.cgroup_inode_id = get_cgroup_id(PARENT_CGROUP); + key.attach_type = BPF_CGROUP_INET_EGRESS; + expected_cgroup_value = (struct cgroup_value) { .egress_pkts = 2 }; + if (assert_storage(obj->maps.cgroup_storage, + &key, &expected_cgroup_value)) + goto close_bpf_object; + key.attach_type = BPF_CGROUP_INET_INGRESS; + expected_cgroup_value = (struct cgroup_value) { .ingress_pkts = 1 }; + if (assert_storage(obj->maps.cgroup_storage, + &key, &expected_cgroup_value)) + goto close_bpf_object; + key.cgroup_inode_id = get_cgroup_id(CHILD_CGROUP); + key.attach_type = BPF_CGROUP_INET_EGRESS; + if (assert_storage_noexist(obj->maps.cgroup_storage, &key)) + goto close_bpf_object; + key.attach_type = BPF_CGROUP_INET_INGRESS; + if (assert_storage_noexist(obj->maps.cgroup_storage, &key)) + goto close_bpf_object; - cg_storage_multi_egress_ingress__destroy(obj); + /* Attach to parent and child cgroup, trigger packet from child. + * Assert that there are six additional runs, parent cgroup egresses and + * ingress, child cgroup egresses and ingress. + * Assert that egress and ingress storages are separate.
+ */ + child_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1, + child_cgroup_fd); + if (CHECK(IS_ERR(child_egress1_link), "child-egress1-cg-attach", + "err %ld", PTR_ERR(child_egress1_link))) + goto close_bpf_object; + child_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2, + child_cgroup_fd); + if (CHECK(IS_ERR(child_egress2_link), "child-egress2-cg-attach", + "err %ld", PTR_ERR(child_egress2_link))) + goto close_bpf_object; + child_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress, + child_cgroup_fd); + if (CHECK(IS_ERR(child_ingress_link), "child-ingress-cg-attach", + "err %ld", PTR_ERR(child_ingress_link))) + goto close_bpf_object; + err = connect_send(CHILD_CGROUP); + if (CHECK(err, "second-connect-send", "errno %d", errno)) + goto close_bpf_object; + if (CHECK(obj->bss->invocations != 9, + "second-invoke", "invocations=%d", obj->bss->invocations)) + goto close_bpf_object; + key.cgroup_inode_id = get_cgroup_id(PARENT_CGROUP); + key.attach_type = BPF_CGROUP_INET_EGRESS; + expected_cgroup_value = (struct cgroup_value) { .egress_pkts = 4 }; + if (assert_storage(obj->maps.cgroup_storage, + &key, &expected_cgroup_value)) + goto close_bpf_object; + key.attach_type = BPF_CGROUP_INET_INGRESS; + expected_cgroup_value = (struct cgroup_value) { .ingress_pkts = 2 }; + if (assert_storage(obj->maps.cgroup_storage, + &key, &expected_cgroup_value)) + goto close_bpf_object; + key.cgroup_inode_id = get_cgroup_id(CHILD_CGROUP); + key.attach_type = BPF_CGROUP_INET_EGRESS; + expected_cgroup_value = (struct cgroup_value) { .egress_pkts = 2 }; + if (assert_storage(obj->maps.cgroup_storage, + &key, &expected_cgroup_value)) + goto close_bpf_object; + key.attach_type = BPF_CGROUP_INET_INGRESS; + expected_cgroup_value = (struct cgroup_value) { .ingress_pkts = 1 }; + if (assert_storage(obj->maps.cgroup_storage, + &key, &expected_cgroup_value)) + goto close_bpf_object; + +close_bpf_object: + bpf_link__destroy(parent_egress1_link); + bpf_link__destroy(parent_egress2_link); + bpf_link__destroy(parent_ingress_link); + bpf_link__destroy(child_egress1_link); + bpf_link__destroy(child_egress2_link); + bpf_link__destroy(child_ingress_link); + + cg_storage_multi_isolated__destroy(obj); +} + +static void test_shared(int parent_cgroup_fd, int child_cgroup_fd) +{ + struct cg_storage_multi_shared *obj; + struct cgroup_value expected_cgroup_value; + __u64 key; + struct bpf_link *parent_egress1_link = NULL, *parent_egress2_link = NULL; + struct bpf_link *child_egress1_link = NULL, *child_egress2_link = NULL; + struct bpf_link *parent_ingress_link = NULL, *child_ingress_link = NULL; + bool err; + + obj = cg_storage_multi_shared__open_and_load(); + if (CHECK(!obj, "skel-load", "errno %d", errno)) + return; + + /* Attach to parent cgroup, trigger packet from child. + * Assert that there are three runs, two with parent cgroup egress and + * one with parent cgroup ingress.
+ * Also assert that the child cgroup's storage does not exist + */ + parent_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1, + parent_cgroup_fd); + if (CHECK(IS_ERR(parent_egress1_link), "parent-egress1-cg-attach", + "err %ld", PTR_ERR(parent_egress1_link))) + goto close_bpf_object; + parent_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2, + parent_cgroup_fd); + if (CHECK(IS_ERR(parent_egress2_link), "parent-egress2-cg-attach", + "err %ld", PTR_ERR(parent_egress2_link))) + goto close_bpf_object; + parent_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress, + parent_cgroup_fd); + if (CHECK(IS_ERR(parent_ingress_link), "parent-ingress-cg-attach", + "err %ld", PTR_ERR(parent_ingress_link))) + goto close_bpf_object; + err = connect_send(CHILD_CGROUP); + if (CHECK(err, "first-connect-send", "errno %d", errno)) + goto close_bpf_object; + if (CHECK(obj->bss->invocations != 3, + "first-invoke", "invocations=%d", obj->bss->invocations)) + goto close_bpf_object; + key = get_cgroup_id(PARENT_CGROUP); + expected_cgroup_value = (struct cgroup_value) { + .egress_pkts = 2, + .ingress_pkts = 1, + }; + if (assert_storage(obj->maps.cgroup_storage, + &key, &expected_cgroup_value)) + goto close_bpf_object; + key = get_cgroup_id(CHILD_CGROUP); + if (assert_storage_noexist(obj->maps.cgroup_storage, &key)) + goto close_bpf_object; + + /* Attach to parent and child cgroup, trigger packet from child. + * Assert that there are six additional runs, parent cgroup egresses and + * ingress, child cgroup egresses and ingress. + */ + child_egress1_link = bpf_program__attach_cgroup(obj->progs.egress1, + child_cgroup_fd); + if (CHECK(IS_ERR(child_egress1_link), "child-egress1-cg-attach", + "err %ld", PTR_ERR(child_egress1_link))) + goto close_bpf_object; + child_egress2_link = bpf_program__attach_cgroup(obj->progs.egress2, + child_cgroup_fd); + if (CHECK(IS_ERR(child_egress2_link), "child-egress2-cg-attach", + "err %ld", PTR_ERR(child_egress2_link))) + goto close_bpf_object; + child_ingress_link = bpf_program__attach_cgroup(obj->progs.ingress, + child_cgroup_fd); + if (CHECK(IS_ERR(child_ingress_link), "child-ingress-cg-attach", + "err %ld", PTR_ERR(child_ingress_link))) + goto close_bpf_object; + err = connect_send(CHILD_CGROUP); + if (CHECK(err, "second-connect-send", "errno %d", errno)) + goto close_bpf_object; + if (CHECK(obj->bss->invocations != 9, + "second-invoke", "invocations=%d", obj->bss->invocations)) + goto close_bpf_object; + key = get_cgroup_id(PARENT_CGROUP); + expected_cgroup_value = (struct cgroup_value) { + .egress_pkts = 4, + .ingress_pkts = 2, + }; + if (assert_storage(obj->maps.cgroup_storage, + &key, &expected_cgroup_value)) + goto close_bpf_object; + key = get_cgroup_id(CHILD_CGROUP); + expected_cgroup_value = (struct cgroup_value) { + .egress_pkts = 2, + .ingress_pkts = 1, + }; + if (assert_storage(obj->maps.cgroup_storage, + &key, &expected_cgroup_value)) + goto close_bpf_object; + +close_bpf_object: + bpf_link__destroy(parent_egress1_link); + bpf_link__destroy(parent_egress2_link); + bpf_link__destroy(parent_ingress_link); + bpf_link__destroy(child_egress1_link); + bpf_link__destroy(child_egress2_link); + bpf_link__destroy(child_ingress_link); + + cg_storage_multi_shared__destroy(obj); } void test_cg_storage_multi(void) @@ -179,8 +391,11 @@ void test_cg_storage_multi(void) if (test__start_subtest("egress_only")) test_egress_only(parent_cgroup_fd, child_cgroup_fd); - if (test__start_subtest("egress_ingress")) - test_egress_ingress(parent_cgroup_fd, child_cgroup_fd)
+ if (test__start_subtest("isolated")) + test_isolated(parent_cgroup_fd, child_cgroup_fd); + + if (test__start_subtest("shared")) + test_shared(parent_cgroup_fd, child_cgroup_fd); close_cgroup_fd: close(child_cgroup_fd); diff --git a/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_ingress.c b/tools/testing/selftests/bpf/progs/cg_storage_multi_isolated.c similarity index 73% rename from tools/testing/selftests/bpf/progs/cg_storage_multi_egress_ingress.c rename to tools/testing/selftests/bpf/progs/cg_storage_multi_isolated.c index 9ce386899365..a25373002055 100644 --- a/tools/testing/selftests/bpf/progs/cg_storage_multi_egress_ingress.c +++ b/tools/testing/selftests/bpf/progs/cg_storage_multi_isolated.c @@ -20,8 +20,20 @@ struct { __u32 invocations = 0; -SEC("cgroup_skb/egress") -int egress(struct __sk_buff *skb) +SEC("cgroup_skb/egress/1") +int egress1(struct __sk_buff *skb) +{ + struct cgroup_value *ptr_cg_storage = + bpf_get_local_storage(&cgroup_storage, 0); + + __sync_fetch_and_add(&ptr_cg_storage->egress_pkts, 1); + __sync_fetch_and_add(&invocations, 1); + + return 1; +} + +SEC("cgroup_skb/egress/2") +int egress2(struct __sk_buff *skb) { struct cgroup_value *ptr_cg_storage = bpf_get_local_storage(&cgroup_storage, 0); diff --git a/tools/testing/selftests/bpf/progs/cg_storage_multi_shared.c b/tools/testing/selftests/bpf/progs/cg_storage_multi_shared.c new file mode 100644 index 000000000000..a149f33bc533 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/cg_storage_multi_shared.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0-only + +/* + * Copyright 2020 Google LLC. + */ + +#include <errno.h> +#include <linux/bpf.h> +#include <linux/ip.h> +#include <linux/udp.h> +#include <bpf/bpf_helpers.h> + +#include "progs/cg_storage_multi.h" + +struct { + __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); + __type(key, __u64); + __type(value, struct cgroup_value); +} cgroup_storage SEC(".maps"); + +__u32 invocations = 0; + +SEC("cgroup_skb/egress/1") +int egress1(struct __sk_buff *skb) +{ + struct cgroup_value *ptr_cg_storage = + bpf_get_local_storage(&cgroup_storage, 0); + + __sync_fetch_and_add(&ptr_cg_storage->egress_pkts, 1); + __sync_fetch_and_add(&invocations, 1); + + return 1; +} + +SEC("cgroup_skb/egress/2") +int egress2(struct __sk_buff *skb) +{ + struct cgroup_value *ptr_cg_storage = + bpf_get_local_storage(&cgroup_storage, 0); + + __sync_fetch_and_add(&ptr_cg_storage->egress_pkts, 1); + __sync_fetch_and_add(&invocations, 1); + + return 1; +} + +SEC("cgroup_skb/ingress") +int ingress(struct __sk_buff *skb) +{ + struct cgroup_value *ptr_cg_storage = + bpf_get_local_storage(&cgroup_storage, 0); + + __sync_fetch_and_add(&ptr_cg_storage->ingress_pkts, 1); + __sync_fetch_and_add(&invocations, 1); + + return 1; +} From 4e15f460be6d14c3fe80ef3221bde759f6b94d9d Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Thu, 23 Jul 2020 23:47:45 -0500 Subject: [PATCH 28/73] Documentation/bpf: Document CGROUP_STORAGE map type The mechanics and usage are not very straightforward. Given the changes, it's better to document how it works and how to use it, rather than having to rely on the examples and implementation to infer what is going on.
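As a taste of what the new document covers, here is a minimal sketch of serializing access to a shared cgroup storage with bpf_spin_lock; the type and program names are illustrative, not taken from the patch, and the usual linux/bpf.h and bpf/bpf_helpers.h includes are assumed:

    struct sync_storage {
    	struct bpf_spin_lock lock;
    	__u64 egress_pkts;
    };

    struct {
    	__uint(type, BPF_MAP_TYPE_CGROUP_STORAGE);
    	__type(key, struct bpf_cgroup_storage_key);
    	__type(value, struct sync_storage);
    } cgroup_storage SEC(".maps");

    SEC("cgroup_skb/egress")
    int egress(struct __sk_buff *skb)
    {
    	/* the storage may be touched concurrently on other CPUs */
    	struct sync_storage *storage =
    		bpf_get_local_storage(&cgroup_storage, 0);

    	bpf_spin_lock(&storage->lock);
    	storage->egress_pkts++;
    	bpf_spin_unlock(&storage->lock);
    	return 1;
    }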
Signed-off-by: YiFei Zhu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/b412edfbb05cb1077c9e2a36a981a54ee23fa8b3.1595565795.git.zhuyifei@google.com --- Documentation/bpf/index.rst | 9 ++ Documentation/bpf/map_cgroup_storage.rst | 169 +++++++++++++++++++++++ 2 files changed, 178 insertions(+) create mode 100644 Documentation/bpf/map_cgroup_storage.rst diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index 38b4db8be7a2..26f4bb3107fc 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -48,6 +48,15 @@ Program types bpf_lsm +Map types +========= + +.. toctree:: + :maxdepth: 1 + + map_cgroup_storage + + Testing and debugging BPF ========================= diff --git a/Documentation/bpf/map_cgroup_storage.rst b/Documentation/bpf/map_cgroup_storage.rst new file mode 100644 index 000000000000..cab9543017bf --- /dev/null +++ b/Documentation/bpf/map_cgroup_storage.rst @@ -0,0 +1,169 @@ +.. SPDX-License-Identifier: GPL-2.0-only +.. Copyright (C) 2020 Google LLC. + +=========================== +BPF_MAP_TYPE_CGROUP_STORAGE +=========================== + +The ``BPF_MAP_TYPE_CGROUP_STORAGE`` map type represents a local fixed-size +storage. It is only available with ``CONFIG_CGROUP_BPF``, and to programs that +attach to cgroups; the programs are made available by the same Kconfig. The +storage is identified by the cgroup the program is attached to. + +The map provides local storage at the cgroup that the BPF program is attached +to. It provides faster and simpler access than a general purpose hash +table, which would require a hash table lookup and require the user to track +live cgroups on their own. + +This document describes the usage and semantics of the +``BPF_MAP_TYPE_CGROUP_STORAGE`` map type. Some of its behaviors were changed in +Linux 5.9, and this document describes the differences. + +Usage +===== + +The map uses a key of either type ``__u64 cgroup_inode_id`` or +``struct bpf_cgroup_storage_key``, declared in ``linux/bpf.h``:: + + struct bpf_cgroup_storage_key { + __u64 cgroup_inode_id; + __u32 attach_type; + }; + +``cgroup_inode_id`` is the inode id of the cgroup directory. +``attach_type`` is the program's attach type. + +Linux 5.9 added support for type ``__u64 cgroup_inode_id`` as the key type. +When this key type is used, all attach types of the particular cgroup and +map will share the same storage. Otherwise, if the type is +``struct bpf_cgroup_storage_key``, then programs of different attach types +will be isolated and see different storages. + +To access the storage in a program, use ``bpf_get_local_storage``:: + + void *bpf_get_local_storage(void *map, u64 flags) + +``flags`` is reserved for future use and must be 0. + +There is no implicit synchronization. Storages of ``BPF_MAP_TYPE_CGROUP_STORAGE`` +can be accessed by multiple programs across different CPUs, and users must +take care of synchronization themselves. The bpf infrastructure provides +``struct bpf_spin_lock`` to synchronize the storage. See +``tools/testing/selftests/bpf/progs/test_spin_lock.c``.
+ +Examples +======== + +Usage with key type as ``struct bpf_cgroup_storage_key``:: + + #include <bpf/bpf_helpers.h> + + struct { + __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); + __type(key, struct bpf_cgroup_storage_key); + __type(value, __u32); + } cgroup_storage SEC(".maps"); + + int program(struct __sk_buff *skb) + { + __u32 *ptr = bpf_get_local_storage(&cgroup_storage, 0); + __sync_fetch_and_add(ptr, 1); + + return 0; + } + +Userspace accessing map declared above:: + + #include <linux/bpf.h> + #include <bpf/libbpf.h> + + __u32 map_lookup(struct bpf_map *map, __u64 cgrp, enum bpf_attach_type type) + { + struct bpf_cgroup_storage_key key = { + .cgroup_inode_id = cgrp, + .attach_type = type, + }; + __u32 value; + bpf_map_lookup_elem(bpf_map__fd(map), &key, &value); + // error checking omitted + return value; + } + +Alternatively, using just ``__u64 cgroup_inode_id`` as key type:: + + #include <bpf/bpf_helpers.h> + + struct { + __uint(type, BPF_MAP_TYPE_CGROUP_STORAGE); + __type(key, __u64); + __type(value, __u32); + } cgroup_storage SEC(".maps"); + + int program(struct __sk_buff *skb) + { + __u32 *ptr = bpf_get_local_storage(&cgroup_storage, 0); + __sync_fetch_and_add(ptr, 1); + + return 0; + } + +And userspace:: + + #include <linux/bpf.h> + #include <bpf/libbpf.h> + + __u32 map_lookup(struct bpf_map *map, __u64 cgrp, enum bpf_attach_type type) + { + __u32 value; + bpf_map_lookup_elem(bpf_map__fd(map), &cgrp, &value); + // error checking omitted + return value; + } + +Semantics +========= + +``BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE`` is a variant of this map type. This +per-CPU variant will have different memory regions for each CPU for each +storage. The non-per-CPU variant will have the same memory region for each +storage. + +Prior to Linux 5.9, the lifetime of a storage was precisely per-attachment, and +for a single ``CGROUP_STORAGE`` map, there could be at most one program loaded +that used the map. A program could be attached to multiple cgroups or have +multiple attach types, and each attach created a fresh zeroed storage. The +storage was freed upon detach. + +There was a one-to-one association between the map of each type (per-CPU and +non-per-CPU) and the BPF program during load verification time. As a result, +each map could only be used by one BPF program and each BPF program could only +use one storage map of each type. Because a map could only be used by one BPF +program, sharing of a cgroup's storage with other BPF programs was +impossible. + +Since Linux 5.9, storage can be shared by multiple programs. When a program is +attached to a cgroup, the kernel creates a new storage only if the map +does not already contain an entry for the cgroup and attach type pair; +otherwise the old storage is reused for the new attachment. If the map is +attach type shared, then the attach type is simply ignored during comparison. +Storage is freed only when either the map or the cgroup it is attached to is +being freed. Detaching will not directly free the storage, but it may cause +the reference to the map to reach zero and indirectly free all storage in the +map. + +The map is not associated with any BPF program, thus making sharing possible. +However, the BPF program can still only associate with one map of each type +(per-CPU and non-per-CPU). A BPF program cannot use more than one +``BPF_MAP_TYPE_CGROUP_STORAGE`` or more than one +``BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE``. + +In all versions, userspace may use the attach parameters (the cgroup and +attach type pair) in ``struct bpf_cgroup_storage_key`` as the key to the BPF map +APIs to read or update the storage for a given attachment.
For Linux 5.9 +attach type shared storages, only the first value in the struct, the cgroup +inode id, is used during comparison, so userspace may just specify a ``__u64`` +directly. + +The storage is bound at attach time. Even if the program is attached to parent +and triggers in child, the storage still belongs to the parent. + +Userspace cannot create a new entry in the map or delete an existing entry. +Program test runs always use a temporary storage. From dfcdf0e9ad2e006196986f363c99b2097aec5ef0 Mon Sep 17 00:00:00 2001 From: YiFei Zhu Date: Fri, 24 Jul 2020 16:17:53 -0500 Subject: [PATCH 29/73] bpf/local_storage: Fix build without CONFIG_CGROUP local_storage.o has its compile guard as CONFIG_BPF_SYSCALL, which does not imply that CONFIG_CGROUP is on. Including cgroup-internal.h when CONFIG_CGROUP is off causes a compilation failure. Fixes: f67cfc233706 ("bpf: Make cgroup storages shared between programs on the same cgroup") Reported-by: kernel test robot Signed-off-by: YiFei Zhu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200724211753.902969-1-zhuyifei1999@gmail.com --- kernel/bpf/local_storage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/local_storage.c b/kernel/bpf/local_storage.c index 3b2c70197d78..571bb351ed3b 100644 --- a/kernel/bpf/local_storage.c +++ b/kernel/bpf/local_storage.c @@ -9,12 +9,12 @@ #include #include -#include "../cgroup/cgroup-internal.h" - DEFINE_PER_CPU(struct bpf_cgroup_storage*, bpf_cgroup_storage[MAX_BPF_CGROUP_STORAGE_TYPE]); #ifdef CONFIG_CGROUP_BPF +#include "../cgroup/cgroup-internal.h" + #define LOCAL_STORAGE_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_ACCESS_MASK) From 2b9b305fcdda1810bdffeb599361174eb2cd0b7c Mon Sep 17 00:00:00 2001 From: Song Liu Date: Fri, 24 Jul 2020 13:05:02 -0700 Subject: [PATCH 30/73] bpf: Fix build on architectures with special bpf_user_pt_regs_t Architectures like s390, powerpc, arm64, riscv have a special definition of bpf_user_pt_regs_t, so we need to cast the pointer before passing it to bpf_get_stack(). This is similar to bpf_get_stack_tp().
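For context, ctx->regs is not necessarily a struct pt_regs pointer; it is declared as bpf_user_pt_regs_t *, which each architecture defines for itself. A sketch of the relevant definitions, quoted from memory with arm64 as one example:

    /* include/linux/perf_event.h */
    struct bpf_perf_event_data_kern {
    	bpf_user_pt_regs_t *regs;
    	struct perf_sample_data *data;
    	struct perf_event *event;
    };

    /* arch/arm64/include/uapi/asm/bpf_perf_event.h */
    typedef struct user_pt_regs bpf_user_pt_regs_t;

On x86 bpf_user_pt_regs_t is plain struct pt_regs, so the cast is a no-op there; on the architectures listed above it is what makes the code compile.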
Fixes: 03d42fd2d83f ("bpf: Separate bpf_get_[stack|stackid] for perf events BPF") Reported-by: kernel test robot Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200724200503.3629591-1-songliubraving@fb.com --- kernel/bpf/stackmap.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 5beb2f8c23da..4fd830a62be2 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -678,6 +678,7 @@ const struct bpf_func_proto bpf_get_task_stack_proto = { BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, void *, buf, u32, size, u64, flags) { + struct pt_regs *regs = (struct pt_regs *)(ctx->regs); struct perf_event *event = ctx->event; struct perf_callchain_entry *trace; bool kernel, user; @@ -685,7 +686,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, __u64 nr_kernel; if (!(event->attr.sample_type & __PERF_SAMPLE_CALLCHAIN_EARLY)) - return __bpf_get_stack(ctx->regs, NULL, NULL, buf, size, flags); + return __bpf_get_stack(regs, NULL, NULL, buf, size, flags); if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | BPF_F_USER_BUILD_ID))) @@ -705,8 +706,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, __u64 nr = trace->nr; trace->nr = nr_kernel; - err = __bpf_get_stack(ctx->regs, NULL, trace, buf, - size, flags); + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); /* restore nr */ trace->nr = nr; @@ -718,8 +718,7 @@ BPF_CALL_4(bpf_get_stack_pe, struct bpf_perf_event_data_kern *, ctx, goto clear; flags = (flags & ~BPF_F_SKIP_FIELD_MASK) | skip; - err = __bpf_get_stack(ctx->regs, NULL, trace, buf, - size, flags); + err = __bpf_get_stack(regs, NULL, trace, buf, size, flags); } return err; From 6cc7d1e8e9e06d45f9d1a39a5f465288d7cd8f9a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:54 -0700 Subject: [PATCH 31/73] bpf: Make bpf_link API available independently of CONFIG_BPF_SYSCALL Similarly to bpf_prog, make bpf_link and related generic API available unconditionally to make it easier to have bpf_link support in various parts of the kernel. Stub out init/prime/settle/cleanup and inc/put APIs.
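For reference, the stubbed-out calls cover the usual link creation sequence; a hypothetical sketch (my_link and my_link_lops are made-up names, error paths abbreviated):

    static int my_link_create(struct bpf_prog *prog)
    {
    	struct bpf_link_primer primer;
    	struct my_link *l;
    	int err;

    	l = kzalloc(sizeof(*l), GFP_USER);
    	if (!l)
    		return -ENOMEM;
    	bpf_link_init(&l->link, BPF_LINK_TYPE_UNSPEC, &my_link_lops, prog);

    	err = bpf_link_prime(&l->link, &primer);
    	if (err) {
    		kfree(l);
    		return err;
    	}
    	/* attach to the underlying object here; on failure call
    	 * bpf_link_cleanup(&primer) rather than freeing directly
    	 */
    	return bpf_link_settle(&primer); /* returns a link FD */
    }

With the stubs in place, code shaped like this can live in paths that are built even when CONFIG_BPF_SYSCALL is off.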
Reported-by: kernel test robot Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-2-andriin@fb.com --- include/linux/bpf.h | 81 ++++++++++++++++++++++++++++++--------------- 1 file changed, 55 insertions(+), 26 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 8357be349133..40c5e206ecf2 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -781,6 +781,32 @@ struct bpf_array_aux { struct work_struct work; }; +struct bpf_link { + atomic64_t refcnt; + u32 id; + enum bpf_link_type type; + const struct bpf_link_ops *ops; + struct bpf_prog *prog; + struct work_struct work; +}; + +struct bpf_link_ops { + void (*release)(struct bpf_link *link); + void (*dealloc)(struct bpf_link *link); + int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog); + void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); + int (*fill_link_info)(const struct bpf_link *link, + struct bpf_link_info *info); +}; + +struct bpf_link_primer { + struct bpf_link *link; + struct file *file; + int fd; + u32 id; +}; + struct bpf_struct_ops_value; struct btf_type; struct btf_member; @@ -1164,32 +1190,6 @@ static inline bool bpf_bypass_spec_v4(void) int bpf_map_new_fd(struct bpf_map *map, int flags); int bpf_prog_new_fd(struct bpf_prog *prog); -struct bpf_link { - atomic64_t refcnt; - u32 id; - enum bpf_link_type type; - const struct bpf_link_ops *ops; - struct bpf_prog *prog; - struct work_struct work; -}; - -struct bpf_link_primer { - struct bpf_link *link; - struct file *file; - int fd; - u32 id; -}; - -struct bpf_link_ops { - void (*release)(struct bpf_link *link); - void (*dealloc)(struct bpf_link *link); - int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, - struct bpf_prog *old_prog); - void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); - int (*fill_link_info)(const struct bpf_link *link, - struct bpf_link_info *info); -}; - void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, const struct bpf_link_ops *ops, struct bpf_prog *prog); int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); @@ -1401,6 +1401,35 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages) { } +static inline void bpf_link_init(struct bpf_link *link, enum bpf_link_type type, + const struct bpf_link_ops *ops, + struct bpf_prog *prog) +{ +} + +static inline int bpf_link_prime(struct bpf_link *link, + struct bpf_link_primer *primer) +{ + return -EOPNOTSUPP; +} + +static inline int bpf_link_settle(struct bpf_link_primer *primer) +{ + return -EOPNOTSUPP; +} + +static inline void bpf_link_cleanup(struct bpf_link_primer *primer) +{ +} + +static inline void bpf_link_inc(struct bpf_link *link) +{ +} + +static inline void bpf_link_put(struct bpf_link *link) +{ +} + static inline int bpf_obj_get_user(const char __user *pathname, int flags) { return -EOPNOTSUPP; From 7f0a838254bdd9114b978ef2541a6ce330307e9e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:55 -0700 Subject: [PATCH 32/73] bpf, xdp: Maintain info on attached XDP BPF programs in net_device Instead of delegating to drivers, maintain information about which BPF programs are attached in which XDP modes (generic/skb, driver, or hardware) locally in net_device. This effectively obsoletes XDP_QUERY_PROG command. Such re-organization simplifies existing code already. 
But it also makes it possible to add bpf_link-based XDP attachments without drivers having to know about any of this at all, which seems like a good setup. XDP_SETUP_PROG/XDP_SETUP_PROG_HW are just low-level commands to the driver to install/uninstall the active BPF program. All the higher-level concerns about prog/link interaction will be contained within generic driver-agnostic logic. All the XDP_QUERY_PROG calls to the driver in dev_xdp_uninstall() were removed. It's not clear to me why dev_xdp_uninstall() was passing the previous prog_flags when resetting installed programs. That seems unnecessary, plus most drivers don't populate prog_flags anyway. Having XDP_SETUP_PROG vs XDP_SETUP_PROG_HW should be enough of an indicator of what is required of the driver to correctly reset the active BPF program. dev_xdp_uninstall() is also generalized as an iteration over all three supported modes. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-3-andriin@fb.com --- include/linux/netdevice.h | 17 +++- net/core/dev.c | 166 +++++++++++++++++++++----------------- net/core/rtnetlink.c | 5 +- 3 files changed, 109 insertions(+), 79 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ac2cd3f49aba..cad44b40c776 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -889,6 +889,17 @@ struct netlink_ext_ack; struct xdp_umem; struct xdp_dev_bulk_queue; +enum bpf_xdp_mode { + XDP_MODE_SKB = 0, + XDP_MODE_DRV = 1, + XDP_MODE_HW = 2, + __MAX_XDP_MODE +}; + +struct bpf_xdp_entity { + struct bpf_prog *prog; +}; + struct netdev_bpf { enum bpf_netdev_command command; union { @@ -2142,6 +2153,9 @@ struct net_device { #endif const struct udp_tunnel_nic_info *udp_tunnel_nic_info; struct udp_tunnel_nic *udp_tunnel_nic; + + /* protected by rtnl_lock */ + struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; }; #define to_net_dev(d) container_of(d, struct net_device, dev) @@ -3817,8 +3831,7 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags); -u32 __dev_xdp_query(struct net_device *dev, bpf_op_t xdp_op, - enum bpf_netdev_command cmd); +u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/net/core/dev.c b/net/core/dev.c index fe2e387eed29..bf38fde667e9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8716,84 +8716,103 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down_generic); -u32 __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, - enum bpf_netdev_command cmd) +static enum bpf_xdp_mode dev_xdp_mode(u32 flags) { - struct netdev_bpf xdp; - - if (!bpf_op) - return 0; - - memset(&xdp, 0, sizeof(xdp)); - xdp.command = cmd; - - /* Query must always succeed.
*/ - WARN_ON(bpf_op(dev, &xdp) < 0 && cmd == XDP_QUERY_PROG); - - return xdp.prog_id; + if (flags & XDP_FLAGS_HW_MODE) + return XDP_MODE_HW; + if (flags & XDP_FLAGS_DRV_MODE) + return XDP_MODE_DRV; + return XDP_MODE_SKB; } -static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, - struct netlink_ext_ack *extack, u32 flags, - struct bpf_prog *prog) +static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) +{ + switch (mode) { + case XDP_MODE_SKB: + return generic_xdp_install; + case XDP_MODE_DRV: + case XDP_MODE_HW: + return dev->netdev_ops->ndo_bpf; + default: + return NULL; + }; +} + +static struct bpf_prog *dev_xdp_prog(struct net_device *dev, + enum bpf_xdp_mode mode) +{ + return dev->xdp_state[mode].prog; +} + +u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) +{ + struct bpf_prog *prog = dev_xdp_prog(dev, mode); + + return prog ? prog->aux->id : 0; +} + +static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_prog *prog) +{ + dev->xdp_state[mode].prog = prog; +} + +static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, + bpf_op_t bpf_op, struct netlink_ext_ack *extack, + u32 flags, struct bpf_prog *prog) { - bool non_hw = !(flags & XDP_FLAGS_HW_MODE); - struct bpf_prog *prev_prog = NULL; struct netdev_bpf xdp; int err; - if (non_hw) { - prev_prog = bpf_prog_by_id(__dev_xdp_query(dev, bpf_op, - XDP_QUERY_PROG)); - if (IS_ERR(prev_prog)) - prev_prog = NULL; - } - memset(&xdp, 0, sizeof(xdp)); - if (flags & XDP_FLAGS_HW_MODE) - xdp.command = XDP_SETUP_PROG_HW; - else - xdp.command = XDP_SETUP_PROG; + xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG; xdp.extack = extack; xdp.flags = flags; xdp.prog = prog; + /* Drivers assume refcnt is already incremented (i.e, prog pointer is + * "moved" into driver), so they don't increment it on their own, but + * they do decrement refcnt when program is detached or replaced. + * Given net_device also owns link/prog, we need to bump refcnt here + * to prevent drivers from underflowing it. 
+ */ + if (prog) + bpf_prog_inc(prog); err = bpf_op(dev, &xdp); - if (!err && non_hw) - bpf_prog_change_xdp(prev_prog, prog); + if (err) { + if (prog) + bpf_prog_put(prog); + return err; + } - if (prev_prog) - bpf_prog_put(prev_prog); + if (mode != XDP_MODE_HW) + bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog); - return err; + return 0; } static void dev_xdp_uninstall(struct net_device *dev) { - struct netdev_bpf xdp; - bpf_op_t ndo_bpf; + struct bpf_prog *prog; + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; - /* Remove generic XDP */ - WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL)); + ASSERT_RTNL(); - /* Remove from the driver */ - ndo_bpf = dev->netdev_ops->ndo_bpf; - if (!ndo_bpf) - return; + for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) { + prog = dev_xdp_prog(dev, mode); + if (!prog) + continue; - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG; - WARN_ON(ndo_bpf(dev, &xdp)); - if (xdp.prog_id) - WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, - NULL)); + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) + continue; - /* Remove HW offload */ - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG_HW; - if (!ndo_bpf(dev, &xdp) && xdp.prog_id) - WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, - NULL)); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + + bpf_prog_put(prog); + dev_xdp_set_prog(dev, mode, NULL); + } } /** @@ -8810,29 +8829,22 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags) { const struct net_device_ops *ops = dev->netdev_ops; - enum bpf_netdev_command query; + enum bpf_xdp_mode mode = dev_xdp_mode(flags); + bool offload = mode == XDP_MODE_HW; u32 prog_id, expected_id = 0; - bpf_op_t bpf_op, bpf_chk; struct bpf_prog *prog; - bool offload; + bpf_op_t bpf_op; int err; ASSERT_RTNL(); - offload = flags & XDP_FLAGS_HW_MODE; - query = offload ? XDP_QUERY_PROG_HW : XDP_QUERY_PROG; - - bpf_op = bpf_chk = ops->ndo_bpf; - if (!bpf_op && (flags & (XDP_FLAGS_DRV_MODE | XDP_FLAGS_HW_MODE))) { + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) { NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode"); return -EOPNOTSUPP; } - if (!bpf_op || (flags & XDP_FLAGS_SKB_MODE)) - bpf_op = generic_xdp_install; - if (bpf_op == bpf_chk) - bpf_chk = generic_xdp_install; - prog_id = __dev_xdp_query(dev, bpf_op, query); + prog_id = dev_xdp_prog_id(dev, mode); if (flags & XDP_FLAGS_REPLACE) { if (expected_fd >= 0) { prog = bpf_prog_get_type_dev(expected_fd, @@ -8850,8 +8862,11 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, } } if (fd >= 0) { - if (!offload && __dev_xdp_query(dev, bpf_chk, XDP_QUERY_PROG)) { - NL_SET_ERR_MSG(extack, "native and generic XDP can't be active at the same time"); + enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB + ? 
XDP_MODE_DRV : XDP_MODE_SKB; + + if (!offload && dev_xdp_prog_id(dev, other_mode)) { + NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); return -EEXIST; } @@ -8866,7 +8881,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, return PTR_ERR(prog); if (!offload && bpf_prog_is_dev_bound(prog->aux)) { - NL_SET_ERR_MSG(extack, "using device-bound program without HW_MODE flag is not supported"); + NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); bpf_prog_put(prog); return -EINVAL; } @@ -8895,11 +8910,14 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, prog = NULL; } - err = dev_xdp_install(dev, bpf_op, extack, flags, prog); - if (err < 0 && prog) + err = dev_xdp_install(dev, mode, bpf_op, extack, flags, prog); + if (err < 0 && prog) { bpf_prog_put(prog); + return err; + } + dev_xdp_set_prog(dev, mode, prog); - return err; + return 0; } /** diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 85a4b0101f76..58c484a28395 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1416,13 +1416,12 @@ static u32 rtnl_xdp_prog_skb(struct net_device *dev) static u32 rtnl_xdp_prog_drv(struct net_device *dev) { - return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, XDP_QUERY_PROG); + return dev_xdp_prog_id(dev, XDP_MODE_DRV); } static u32 rtnl_xdp_prog_hw(struct net_device *dev) { - return __dev_xdp_query(dev, dev->netdev_ops->ndo_bpf, - XDP_QUERY_PROG_HW); + return dev_xdp_prog_id(dev, XDP_MODE_HW); } static int rtnl_xdp_report_one(struct sk_buff *skb, struct net_device *dev, From d4baa9368a5e4d694e787e0442ddd6ab95d6fd96 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:56 -0700 Subject: [PATCH 33/73] bpf, xdp: Extract common XDP program attachment logic Further refactor XDP attachment code. dev_change_xdp_fd() is split into two parts: getting bpf_progs from FDs and attachment logic, working with bpf_progs. This makes attachment logic a bit more straightforward and prepares code for bpf_xdp_link inclusion, which will share the common logic. 
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-4-andriin@fb.com --- net/core/dev.c | 177 +++++++++++++++++++++++++++---------------------- 1 file changed, 97 insertions(+), 80 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index bf38fde667e9..521ce031ee35 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8815,6 +8815,82 @@ static void dev_xdp_uninstall(struct net_device *dev) } } +static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, + struct bpf_prog *new_prog, struct bpf_prog *old_prog, + u32 flags) +{ + struct bpf_prog *cur_prog; + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + int err; + + ASSERT_RTNL(); + + /* just one XDP mode bit should be set, zero defaults to SKB mode */ + if (hweight32(flags & XDP_FLAGS_MODES) > 1) { + NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); + return -EINVAL; + } + /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */ + if (old_prog && !(flags & XDP_FLAGS_REPLACE)) { + NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified"); + return -EINVAL; + } + + mode = dev_xdp_mode(flags); + cur_prog = dev_xdp_prog(dev, mode); + if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { + NL_SET_ERR_MSG(extack, "Active program does not match expected"); + return -EEXIST; + } + if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) { + NL_SET_ERR_MSG(extack, "XDP program already attached"); + return -EBUSY; + } + + if (new_prog) { + bool offload = mode == XDP_MODE_HW; + enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB + ? XDP_MODE_DRV : XDP_MODE_SKB; + + if (!offload && dev_xdp_prog(dev, other_mode)) { + NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); + return -EEXIST; + } + if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) { + NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); + return -EINVAL; + } + if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); + return -EINVAL; + } + if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) { + NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device"); + return -EINVAL; + } + } + + /* don't call drivers if the effective program didn't change */ + if (new_prog != cur_prog) { + bpf_op = dev_xdp_bpf_op(dev, mode); + if (!bpf_op) { + NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode"); + return -EOPNOTSUPP; + } + + err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog); + if (err) + return err; + } + + dev_xdp_set_prog(dev, mode, new_prog); + if (cur_prog) + bpf_prog_put(cur_prog); + + return 0; +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -8828,96 +8904,37 @@ static void dev_xdp_uninstall(struct net_device *dev) int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags) { - const struct net_device_ops *ops = dev->netdev_ops; enum bpf_xdp_mode mode = dev_xdp_mode(flags); - bool offload = mode == XDP_MODE_HW; - u32 prog_id, expected_id = 0; - struct bpf_prog *prog; - bpf_op_t bpf_op; + struct bpf_prog *new_prog = NULL, *old_prog = NULL; int err; ASSERT_RTNL(); - bpf_op = dev_xdp_bpf_op(dev, mode); - if (!bpf_op) { - NL_SET_ERR_MSG(extack, "underlying driver does not support XDP in native mode"); - return -EOPNOTSUPP; - } - - prog_id = dev_xdp_prog_id(dev, mode); - if 
(flags & XDP_FLAGS_REPLACE) { - if (expected_fd >= 0) { - prog = bpf_prog_get_type_dev(expected_fd, - BPF_PROG_TYPE_XDP, - bpf_op == ops->ndo_bpf); - if (IS_ERR(prog)) - return PTR_ERR(prog); - expected_id = prog->aux->id; - bpf_prog_put(prog); - } - - if (prog_id != expected_id) { - NL_SET_ERR_MSG(extack, "Active program does not match expected"); - return -EEXIST; - } - } if (fd >= 0) { - enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB - ? XDP_MODE_DRV : XDP_MODE_SKB; - - if (!offload && dev_xdp_prog_id(dev, other_mode)) { - NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time"); - return -EEXIST; - } - - if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && prog_id) { - NL_SET_ERR_MSG(extack, "XDP program already attached"); - return -EBUSY; - } - - prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, - bpf_op == ops->ndo_bpf); - if (IS_ERR(prog)) - return PTR_ERR(prog); - - if (!offload && bpf_prog_is_dev_bound(prog->aux)) { - NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported"); - bpf_prog_put(prog); - return -EINVAL; - } - - if (prog->expected_attach_type == BPF_XDP_DEVMAP) { - NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device"); - bpf_prog_put(prog); - return -EINVAL; - } - - if (prog->expected_attach_type == BPF_XDP_CPUMAP) { - NL_SET_ERR_MSG(extack, - "BPF_XDP_CPUMAP programs can not be attached to a device"); - bpf_prog_put(prog); - return -EINVAL; - } - - /* prog->aux->id may be 0 for orphaned device-bound progs */ - if (prog->aux->id && prog->aux->id == prog_id) { - bpf_prog_put(prog); - return 0; - } - } else { - if (!prog_id) - return 0; - prog = NULL; + new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, + mode != XDP_MODE_SKB); + if (IS_ERR(new_prog)) + return PTR_ERR(new_prog); } - err = dev_xdp_install(dev, mode, bpf_op, extack, flags, prog); - if (err < 0 && prog) { - bpf_prog_put(prog); - return err; + if (expected_fd >= 0) { + old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP, + mode != XDP_MODE_SKB); + if (IS_ERR(old_prog)) { + err = PTR_ERR(old_prog); + old_prog = NULL; + goto err_out; + } } - dev_xdp_set_prog(dev, mode, prog); - return 0; + err = dev_xdp_attach(dev, extack, new_prog, old_prog, flags); + +err_out: + if (err && new_prog) + bpf_prog_put(new_prog); + if (old_prog) + bpf_prog_put(old_prog); + return err; } /** From aa8d3a716b59db6c1ad6c68fb8aa05e31980da60 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:57 -0700 Subject: [PATCH 34/73] bpf, xdp: Add bpf_link-based XDP attachment API Add bpf_link-based API (bpf_xdp_link) to attach a BPF XDP program through the BPF_LINK_CREATE command. bpf_xdp_link is mutually exclusive with direct BPF program attachment; a previously attached BPF program should be detached prior to attempting to create a new bpf_xdp_link attachment (for a given XDP mode). Once a BPF link is attached, it can't be replaced by another BPF program attachment or link attachment. It will be detached only when the last BPF link FD is closed. bpf_xdp_link will be auto-detached when the net_device is shut down, similarly to how other BPF links behave (cgroup, flow_dissector). At that point the bpf_link will become defunct, but won't be destroyed until the last FD is closed.
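From userspace, creating such a link boils down to a BPF_LINK_CREATE command; a sketch against the uapi below, assuming prog_fd refers to a loaded BPF_PROG_TYPE_XDP program and with error handling omitted:

    union bpf_attr attr = {};
    int link_fd;

    attr.link_create.prog_fd = prog_fd;
    attr.link_create.target_ifindex = ifindex;
    attr.link_create.attach_type = BPF_XDP;

    link_fd = syscall(__NR_bpf, BPF_LINK_CREATE, &attr, sizeof(attr));

Closing the last FD for the link is what detaches the program again.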
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-5-andriin@fb.com --- include/linux/netdevice.h | 4 + include/uapi/linux/bpf.h | 7 +- kernel/bpf/syscall.c | 5 ++ net/core/dev.c | 169 ++++++++++++++++++++++++++++++++++++-- 4 files changed, 178 insertions(+), 7 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index cad44b40c776..7d3c412fcfe5 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -888,6 +888,7 @@ struct bpf_prog_offload_ops; struct netlink_ext_ack; struct xdp_umem; struct xdp_dev_bulk_queue; +struct bpf_xdp_link; enum bpf_xdp_mode { XDP_MODE_SKB = 0, @@ -898,6 +899,7 @@ enum bpf_xdp_mode { struct bpf_xdp_entity { struct bpf_prog *prog; + struct bpf_xdp_link *link; }; struct netdev_bpf { @@ -3831,7 +3833,9 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, typedef int (*bpf_op_t)(struct net_device *dev, struct netdev_bpf *bpf); int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, int fd, int expected_fd, u32 flags); +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog); u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode); + int xdp_umem_query(struct net_device *dev, u16 queue_id); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 828c2f6438f2..87823fb9c123 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -230,6 +230,7 @@ enum bpf_attach_type { BPF_CGROUP_INET_SOCK_RELEASE, BPF_XDP_CPUMAP, BPF_SK_LOOKUP, + BPF_XDP, __MAX_BPF_ATTACH_TYPE }; @@ -242,6 +243,7 @@ enum bpf_link_type { BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, + BPF_LINK_TYPE_XDP = 6, MAX_BPF_LINK_TYPE, }; @@ -614,7 +616,10 @@ union bpf_attr { struct { /* struct used by BPF_LINK_CREATE command */ __u32 prog_fd; /* eBPF program to attach */ - __u32 target_fd; /* object to attach to */ + union { + __u32 target_fd; /* object to attach to */ + __u32 target_ifindex; /* target ifindex */ + }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ } link_create; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index ee290b1f2d9e..0e8c88db7e7a 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2824,6 +2824,8 @@ attach_type_to_prog_type(enum bpf_attach_type attach_type) return BPF_PROG_TYPE_TRACING; case BPF_SK_LOOKUP: return BPF_PROG_TYPE_SK_LOOKUP; + case BPF_XDP: + return BPF_PROG_TYPE_XDP; default: return BPF_PROG_TYPE_UNSPEC; } @@ -3921,6 +3923,9 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; + case BPF_PROG_TYPE_XDP: + ret = bpf_xdp_link_attach(attr, prog); + break; default: ret = -EINVAL; } diff --git a/net/core/dev.c b/net/core/dev.c index 521ce031ee35..e24248f3d675 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8716,6 +8716,12 @@ int dev_change_proto_down_generic(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down_generic); +struct bpf_xdp_link { + struct bpf_link link; + struct net_device *dev; /* protected by rtnl_lock, no refcnt held */ + int flags; +}; + static enum bpf_xdp_mode dev_xdp_mode(u32 flags) { if (flags & XDP_FLAGS_HW_MODE) @@ -8738,9 +8744,19 @@ static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode) }; } +static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev, + enum 
bpf_xdp_mode mode) +{ + return dev->xdp_state[mode].link; +} + static struct bpf_prog *dev_xdp_prog(struct net_device *dev, enum bpf_xdp_mode mode) { + struct bpf_xdp_link *link = dev_xdp_link(dev, mode); + + if (link) + return link->link.prog; return dev->xdp_state[mode].prog; } @@ -8751,9 +8767,17 @@ u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode) return prog ? prog->aux->id : 0; } +static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode, + struct bpf_xdp_link *link) +{ + dev->xdp_state[mode].link = link; + dev->xdp_state[mode].prog = NULL; +} + static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode, struct bpf_prog *prog) { + dev->xdp_state[mode].link = NULL; dev->xdp_state[mode].prog = prog; } @@ -8793,6 +8817,7 @@ static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode, static void dev_xdp_uninstall(struct net_device *dev) { + struct bpf_xdp_link *link; struct bpf_prog *prog; enum bpf_xdp_mode mode; bpf_op_t bpf_op; @@ -8810,14 +8835,20 @@ static void dev_xdp_uninstall(struct net_device *dev) WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); - bpf_prog_put(prog); - dev_xdp_set_prog(dev, mode, NULL); + /* auto-detach link from net device */ + link = dev_xdp_link(dev, mode); + if (link) + link->dev = NULL; + else + bpf_prog_put(prog); + + dev_xdp_set_link(dev, mode, NULL); } } static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack, - struct bpf_prog *new_prog, struct bpf_prog *old_prog, - u32 flags) + struct bpf_xdp_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog, u32 flags) { struct bpf_prog *cur_prog; enum bpf_xdp_mode mode; @@ -8826,6 +8857,14 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack ASSERT_RTNL(); + /* either link or prog attachment, never both */ + if (link && (new_prog || old_prog)) + return -EINVAL; + /* link supports only XDP mode flags */ + if (link && (flags & ~XDP_FLAGS_MODES)) { + NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment"); + return -EINVAL; + } /* just one XDP mode bit should be set, zero defaults to SKB mode */ if (hweight32(flags & XDP_FLAGS_MODES) > 1) { NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set"); @@ -8838,7 +8877,18 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack } mode = dev_xdp_mode(flags); + /* can't replace attached link */ + if (dev_xdp_link(dev, mode)) { + NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link"); + return -EBUSY; + } + cur_prog = dev_xdp_prog(dev, mode); + /* can't replace attached prog with link */ + if (link && cur_prog) { + NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link"); + return -EBUSY; + } if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) { NL_SET_ERR_MSG(extack, "Active program does not match expected"); return -EEXIST; @@ -8848,6 +8898,10 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return -EBUSY; } + /* put effective new program into new_prog */ + if (link) + new_prog = link->link.prog; + if (new_prog) { bool offload = mode == XDP_MODE_HW; enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB @@ -8884,13 +8938,116 @@ static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack return err; } - dev_xdp_set_prog(dev, mode, new_prog); + if (link) + dev_xdp_set_link(dev, mode, link); + else + dev_xdp_set_prog(dev, mode, new_prog); if (cur_prog) bpf_prog_put(cur_prog); return 0; } +static int 
dev_xdp_attach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags); +} + +static int dev_xdp_detach_link(struct net_device *dev, + struct netlink_ext_ack *extack, + struct bpf_xdp_link *link) +{ + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + + ASSERT_RTNL(); + + mode = dev_xdp_mode(link->flags); + if (dev_xdp_link(dev, mode) != link) + return -EINVAL; + + bpf_op = dev_xdp_bpf_op(dev, mode); + WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL)); + dev_xdp_set_link(dev, mode, NULL); + return 0; +} + +static void bpf_xdp_link_release(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + rtnl_lock(); + + /* if racing with net_device's tear down, xdp_link->dev might be + * already NULL, in which case link was already auto-detached + */ + if (xdp_link->dev) + WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + + rtnl_unlock(); +} + +static void bpf_xdp_link_dealloc(struct bpf_link *link) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + + kfree(xdp_link); +} + +static const struct bpf_link_ops bpf_xdp_link_lops = { + .release = bpf_xdp_link_release, + .dealloc = bpf_xdp_link_dealloc, +}; + +int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) +{ + struct net *net = current->nsproxy->net_ns; + struct bpf_link_primer link_primer; + struct bpf_xdp_link *link; + struct net_device *dev; + int err, fd; + + dev = dev_get_by_index(net, attr->link_create.target_ifindex); + if (!dev) + return -EINVAL; + + link = kzalloc(sizeof(*link), GFP_USER); + if (!link) { + err = -ENOMEM; + goto out_put_dev; + } + + bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog); + link->dev = dev; + link->flags = attr->link_create.flags; + + err = bpf_link_prime(&link->link, &link_primer); + if (err) { + kfree(link); + goto out_put_dev; + } + + rtnl_lock(); + err = dev_xdp_attach_link(dev, NULL, link); + rtnl_unlock(); + + if (err) { + bpf_link_cleanup(&link_primer); + goto out_put_dev; + } + + fd = bpf_link_settle(&link_primer); + /* link itself doesn't hold dev's refcnt to not complicate shutdown */ + dev_put(dev); + return fd; + +out_put_dev: + dev_put(dev); + return err; +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -8927,7 +9084,7 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, } } - err = dev_xdp_attach(dev, extack, new_prog, old_prog, flags); + err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags); err_out: if (err && new_prog) From 026a4c28e1db3b0cb99cd9a3e495d4a8b632fa74 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:58 -0700 Subject: [PATCH 35/73] bpf, xdp: Implement LINK_UPDATE for BPF XDP link Add support for LINK_UPDATE command for BPF XDP link to enable reliable replacement of underlying BPF program. 
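With this, userspace can atomically swap the program behind an existing link; a sketch using libbpf's bpf_link_update(), where the old_prog_fd check is optional and enabled by BPF_F_REPLACE:

    #include <linux/bpf.h>
    #include <bpf/bpf.h>

    static int replace_xdp_prog(int link_fd, int old_prog_fd, int new_prog_fd)
    {
    	DECLARE_LIBBPF_OPTS(bpf_link_update_opts, opts,
    			    .old_prog_fd = old_prog_fd,
    			    .flags = BPF_F_REPLACE);

    	return bpf_link_update(link_fd, new_prog_fd, &opts);
    }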
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-6-andriin@fb.com --- net/core/dev.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index e24248f3d675..49f284f51a22 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8996,9 +8996,52 @@ static void bpf_xdp_link_dealloc(struct bpf_link *link) kfree(xdp_link); } +static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, + struct bpf_prog *old_prog) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + enum bpf_xdp_mode mode; + bpf_op_t bpf_op; + int err = 0; + + rtnl_lock(); + + /* link might have been auto-released already, so fail */ + if (!xdp_link->dev) { + err = -ENOLINK; + goto out_unlock; + } + + if (old_prog && link->prog != old_prog) { + err = -EPERM; + goto out_unlock; + } + old_prog = link->prog; + if (old_prog == new_prog) { + /* no-op, don't disturb drivers */ + bpf_prog_put(new_prog); + goto out_unlock; + } + + mode = dev_xdp_mode(xdp_link->flags); + bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode); + err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL, + xdp_link->flags, new_prog); + if (err) + goto out_unlock; + + old_prog = xchg(&link->prog, new_prog); + bpf_prog_put(old_prog); + +out_unlock: + rtnl_unlock(); + return err; +} + static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, + .update_prog = bpf_xdp_link_update, }; int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog) From c1931c9784ebb5787c0784c112fb8baa5e8455b3 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:45:59 -0700 Subject: [PATCH 36/73] bpf: Implement BPF XDP link-specific introspection APIs Implement XDP link-specific show_fdinfo and link_info to emit ifindex. 
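Reading that back from userspace could look like this (a sketch, assuming link_fd refers to an XDP link):

    struct bpf_link_info info = {};
    __u32 len = sizeof(info);
    int err;

    err = bpf_obj_get_info_by_fd(link_fd, &info, &len);
    if (!err)
    	printf("XDP link attached to ifindex %u\n", info.xdp.ifindex);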
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-7-andriin@fb.com --- include/uapi/linux/bpf.h | 3 +++ net/core/dev.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 87823fb9c123..e1ba4ae6a916 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -4069,6 +4069,9 @@ struct bpf_link_info { __u32 netns_ino; __u32 attach_type; } netns; + struct { + __u32 ifindex; + } xdp; }; } __attribute__((aligned(8))); diff --git a/net/core/dev.c b/net/core/dev.c index 49f284f51a22..82ce0920b172 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8996,6 +8996,35 @@ static void bpf_xdp_link_dealloc(struct bpf_link *link) kfree(xdp_link); } +static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link, + struct seq_file *seq) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + seq_printf(seq, "ifindex:\t%u\n", ifindex); +} + +static int bpf_xdp_link_fill_link_info(const struct bpf_link *link, + struct bpf_link_info *info) +{ + struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); + u32 ifindex = 0; + + rtnl_lock(); + if (xdp_link->dev) + ifindex = xdp_link->dev->ifindex; + rtnl_unlock(); + + info->xdp.ifindex = ifindex; + return 0; +} + static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog) { @@ -9041,6 +9070,8 @@ out_unlock: static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, + .show_fdinfo = bpf_xdp_link_show_fdinfo, + .fill_link_info = bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, }; From dc8698cac7aada9b61a612cb819341d84591163e Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:46:00 -0700 Subject: [PATCH 37/73] libbpf: Add support for BPF XDP link Sync UAPI header and add support for using bpf_link-based XDP attachment. Make xdp/ prog type set expected attach type. The kernel didn't enforce attach_type for XDP programs before, so there are no backwards compatibility issues there. Also fix section_names selftest to recognize that xdp prog types now have expected attach type.
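Typical usage of the new API, sketched with a hypothetical skeleton and program name:

    struct bpf_link *link;

    link = bpf_program__attach_xdp(skel->progs.xdp_handler, ifindex);
    if (libbpf_get_error(link))
    	return -1; /* e.g. another program or link is already attached */

    /* ... later, drop the last reference to detach: */
    bpf_link__destroy(link);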
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-8-andriin@fb.com --- tools/include/uapi/linux/bpf.h | 10 +++++++++- tools/lib/bpf/libbpf.c | 9 ++++++++- tools/lib/bpf/libbpf.h | 2 ++ tools/lib/bpf/libbpf.map | 1 + tools/testing/selftests/bpf/prog_tests/section_names.c | 2 +- 5 files changed, 21 insertions(+), 3 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index 828c2f6438f2..e1ba4ae6a916 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -230,6 +230,7 @@ enum bpf_attach_type { BPF_CGROUP_INET_SOCK_RELEASE, BPF_XDP_CPUMAP, BPF_SK_LOOKUP, + BPF_XDP, __MAX_BPF_ATTACH_TYPE }; @@ -242,6 +243,7 @@ enum bpf_link_type { BPF_LINK_TYPE_CGROUP = 3, BPF_LINK_TYPE_ITER = 4, BPF_LINK_TYPE_NETNS = 5, + BPF_LINK_TYPE_XDP = 6, MAX_BPF_LINK_TYPE, }; @@ -614,7 +616,10 @@ union bpf_attr { struct { /* struct used by BPF_LINK_CREATE command */ __u32 prog_fd; /* eBPF program to attach */ - __u32 target_fd; /* object to attach to */ + union { + __u32 target_fd; /* object to attach to */ + __u32 target_ifindex; /* target ifindex */ + }; __u32 attach_type; /* attach type */ __u32 flags; /* extra flags */ } link_create; @@ -4064,6 +4069,9 @@ struct bpf_link_info { __u32 netns_ino; __u32 attach_type; } netns; + struct { + __u32 ifindex; + } xdp; }; } __attribute__((aligned(8))); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index e51479d60285..54830d603fee 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6915,7 +6915,8 @@ static const struct bpf_sec_def section_defs[] = { BPF_XDP_DEVMAP), BPF_EAPROG_SEC("xdp_cpumap/", BPF_PROG_TYPE_XDP, BPF_XDP_CPUMAP), - BPF_PROG_SEC("xdp", BPF_PROG_TYPE_XDP), + BPF_EAPROG_SEC("xdp", BPF_PROG_TYPE_XDP, + BPF_XDP), BPF_PROG_SEC("perf_event", BPF_PROG_TYPE_PERF_EVENT), BPF_PROG_SEC("lwt_in", BPF_PROG_TYPE_LWT_IN), BPF_PROG_SEC("lwt_out", BPF_PROG_TYPE_LWT_OUT), @@ -8281,6 +8282,12 @@ bpf_program__attach_netns(struct bpf_program *prog, int netns_fd) return bpf_program__attach_fd(prog, netns_fd, "netns"); } +struct bpf_link *bpf_program__attach_xdp(struct bpf_program *prog, int ifindex) +{ + /* target_fd/target_ifindex use the same field in LINK_CREATE */ + return bpf_program__attach_fd(prog, ifindex, "xdp"); +} + struct bpf_link * bpf_program__attach_iter(struct bpf_program *prog, const struct bpf_iter_attach_opts *opts) diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index c6813791fa7e..9924385462ab 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -257,6 +257,8 @@ LIBBPF_API struct bpf_link * bpf_program__attach_cgroup(struct bpf_program *prog, int cgroup_fd); LIBBPF_API struct bpf_link * bpf_program__attach_netns(struct bpf_program *prog, int netns_fd); +LIBBPF_API struct bpf_link * +bpf_program__attach_xdp(struct bpf_program *prog, int ifindex); struct bpf_map; diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index 6f0856abe299..ca49a6a7e5b2 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -286,6 +286,7 @@ LIBBPF_0.1.0 { bpf_map__set_value_size; bpf_map__type; bpf_map__value_size; + bpf_program__attach_xdp; bpf_program__autoload; bpf_program__is_sk_lookup; bpf_program__set_autoload; diff --git a/tools/testing/selftests/bpf/prog_tests/section_names.c b/tools/testing/selftests/bpf/prog_tests/section_names.c index 713167449c98..8b571890c57e 100644 --- a/tools/testing/selftests/bpf/prog_tests/section_names.c +++ 
b/tools/testing/selftests/bpf/prog_tests/section_names.c @@ -35,7 +35,7 @@ static struct sec_name_test tests[] = { {-EINVAL, 0}, }, {"raw_tp/", {0, BPF_PROG_TYPE_RAW_TRACEPOINT, 0}, {-EINVAL, 0} }, - {"xdp", {0, BPF_PROG_TYPE_XDP, 0}, {-EINVAL, 0} }, + {"xdp", {0, BPF_PROG_TYPE_XDP, BPF_XDP}, {0, BPF_XDP} }, {"perf_event", {0, BPF_PROG_TYPE_PERF_EVENT, 0}, {-EINVAL, 0} }, {"lwt_in", {0, BPF_PROG_TYPE_LWT_IN, 0}, {-EINVAL, 0} }, {"lwt_out", {0, BPF_PROG_TYPE_LWT_OUT, 0}, {-EINVAL, 0} }, From fe48230cf2ae27c9e3b96d29908e22e2926fd1ab Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:46:01 -0700 Subject: [PATCH 38/73] selftests/bpf: Add BPF XDP link selftests Add selftest validating all the attachment logic around BPF XDP link. Also test link updates and get_obj_info() APIs. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-9-andriin@fb.com --- .../selftests/bpf/prog_tests/xdp_link.c | 137 ++++++++++++++++++ .../selftests/bpf/progs/test_xdp_link.c | 12 ++ 2 files changed, 149 insertions(+) create mode 100644 tools/testing/selftests/bpf/prog_tests/xdp_link.c create mode 100644 tools/testing/selftests/bpf/progs/test_xdp_link.c diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_link.c b/tools/testing/selftests/bpf/prog_tests/xdp_link.c new file mode 100644 index 000000000000..52cba6795d40 --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/xdp_link.c @@ -0,0 +1,137 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <uapi/linux/if_link.h> +#include <test_progs.h> +#include "test_xdp_link.skel.h" + +#define IFINDEX_LO 1 + +void test_xdp_link(void) +{ + __u32 duration = 0, id1, id2, id0 = 0, prog_fd1, prog_fd2, err; + DECLARE_LIBBPF_OPTS(bpf_xdp_set_link_opts, opts, .old_fd = -1); + struct test_xdp_link *skel1 = NULL, *skel2 = NULL; + struct bpf_link_info link_info; + struct bpf_prog_info prog_info; + struct bpf_link *link; + __u32 link_info_len = sizeof(link_info); + __u32 prog_info_len = sizeof(prog_info); + + skel1 = test_xdp_link__open_and_load(); + if (CHECK(!skel1, "skel_load", "skeleton open and load failed\n")) + goto cleanup; + prog_fd1 = bpf_program__fd(skel1->progs.xdp_handler); + + skel2 = test_xdp_link__open_and_load(); + if (CHECK(!skel2, "skel_load", "skeleton open and load failed\n")) + goto cleanup; + prog_fd2 = bpf_program__fd(skel2->progs.xdp_handler); + + memset(&prog_info, 0, sizeof(prog_info)); + err = bpf_obj_get_info_by_fd(prog_fd1, &prog_info, &prog_info_len); + if (CHECK(err, "fd_info1", "failed %d\n", -errno)) + goto cleanup; + id1 = prog_info.id; + + memset(&prog_info, 0, sizeof(prog_info)); + err = bpf_obj_get_info_by_fd(prog_fd2, &prog_info, &prog_info_len); + if (CHECK(err, "fd_info2", "failed %d\n", -errno)) + goto cleanup; + id2 = prog_info.id; + + /* set initial prog attachment */ + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, prog_fd1, XDP_FLAGS_REPLACE, &opts); + if (CHECK(err, "fd_attach", "initial prog attach failed: %d\n", err)) + goto cleanup; + + /* validate prog ID */ + err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0); + CHECK(err || id0 != id1, "id1_check", + "loaded prog id %u != id1 %u, err %d", id0, id1, err); + + /* BPF link is not allowed to replace prog attachment */ + link = bpf_program__attach_xdp(skel1->progs.xdp_handler, IFINDEX_LO); + if (CHECK(!IS_ERR(link), "link_attach_fail", "unexpected success\n")) { + bpf_link__destroy(link); + /* best-effort detach prog */ + opts.old_fd = prog_fd1; + bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, XDP_FLAGS_REPLACE,
&opts); + goto cleanup; + } + + /* detach BPF program */ + opts.old_fd = prog_fd1; + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, -1, XDP_FLAGS_REPLACE, &opts); + if (CHECK(err, "prog_detach", "failed %d\n", err)) + goto cleanup; + + /* now BPF link should attach successfully */ + link = bpf_program__attach_xdp(skel1->progs.xdp_handler, IFINDEX_LO); + if (CHECK(IS_ERR(link), "link_attach", "failed: %ld\n", PTR_ERR(link))) + goto cleanup; + skel1->links.xdp_handler = link; + + /* validate prog ID */ + err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0); + if (CHECK(err || id0 != id1, "id1_check", + "loaded prog id %u != id1 %u, err %d", id0, id1, err)) + goto cleanup; + + /* BPF prog attach is not allowed to replace BPF link */ + opts.old_fd = prog_fd1; + err = bpf_set_link_xdp_fd_opts(IFINDEX_LO, prog_fd2, XDP_FLAGS_REPLACE, &opts); + if (CHECK(!err, "prog_attach_fail", "unexpected success\n")) + goto cleanup; + + /* Can't force-update when BPF link is active */ + err = bpf_set_link_xdp_fd(IFINDEX_LO, prog_fd2, 0); + if (CHECK(!err, "prog_update_fail", "unexpected success\n")) + goto cleanup; + + /* Can't force-detach when BPF link is active */ + err = bpf_set_link_xdp_fd(IFINDEX_LO, -1, 0); + if (CHECK(!err, "prog_detach_fail", "unexpected success\n")) + goto cleanup; + + /* BPF link is not allowed to replace another BPF link */ + link = bpf_program__attach_xdp(skel2->progs.xdp_handler, IFINDEX_LO); + if (CHECK(!IS_ERR(link), "link_attach_fail", "unexpected success\n")) { + bpf_link__destroy(link); + goto cleanup; + } + + bpf_link__destroy(skel1->links.xdp_handler); + skel1->links.xdp_handler = NULL; + + /* new link attach should succeed */ + link = bpf_program__attach_xdp(skel2->progs.xdp_handler, IFINDEX_LO); + if (CHECK(IS_ERR(link), "link_attach", "failed: %ld\n", PTR_ERR(link))) + goto cleanup; + skel2->links.xdp_handler = link; + + err = bpf_get_link_xdp_id(IFINDEX_LO, &id0, 0); + if (CHECK(err || id0 != id2, "id2_check", + "loaded prog id %u != id2 %u, err %d", id0, id1, err)) + goto cleanup; + + /* updating program under active BPF link works as expected */ + err = bpf_link__update_program(link, skel1->progs.xdp_handler); + if (CHECK(err, "link_upd", "failed: %d\n", err)) + goto cleanup; + + memset(&link_info, 0, sizeof(link_info)); + err = bpf_obj_get_info_by_fd(bpf_link__fd(link), &link_info, &link_info_len); + if (CHECK(err, "link_info", "failed: %d\n", err)) + goto cleanup; + + CHECK(link_info.type != BPF_LINK_TYPE_XDP, "link_type", + "got %u != exp %u\n", link_info.type, BPF_LINK_TYPE_XDP); + CHECK(link_info.prog_id != id1, "link_prog_id", + "got %u != exp %u\n", link_info.prog_id, id1); + CHECK(link_info.xdp.ifindex != IFINDEX_LO, "link_ifindex", + "got %u != exp %u\n", link_info.xdp.ifindex, IFINDEX_LO); + +cleanup: + test_xdp_link__destroy(skel1); + test_xdp_link__destroy(skel2); +} diff --git a/tools/testing/selftests/bpf/progs/test_xdp_link.c b/tools/testing/selftests/bpf/progs/test_xdp_link.c new file mode 100644 index 000000000000..eb93ea95d1d8 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_xdp_link.c @@ -0,0 +1,12 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include <linux/bpf.h> +#include <bpf/bpf_helpers.h> + +char LICENSE[] SEC("license") = "GPL"; + +SEC("xdp/handler") +int xdp_handler(struct xdp_md *xdp) +{ + return 0; +} From e8407fdeb9a6866784e249881f6c786a0835faba Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 21 Jul 2020 23:46:02 -0700 Subject: [PATCH 39/73] bpf, xdp: Remove XDP_QUERY_PROG and XDP_QUERY_PROG_HW XDP commands Now that BPF
program/link management is centralized in generic net_device code, kernel code never queries program id from drivers, so XDP_QUERY_PROG/XDP_QUERY_PROG_HW commands are unnecessary. This patch removes all the implementations of those commands in the kernel, along with the xdp_attachment_query() helper. This patch was compile-tested on allyesconfig. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20200722064603.3350758-10-andriin@fb.com --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 6 ----- drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c | 4 ---- .../net/ethernet/cavium/thunder/nicvf_main.c | 3 --- .../net/ethernet/freescale/dpaa2/dpaa2-eth.c | 5 ---- drivers/net/ethernet/intel/i40e/i40e_main.c | 3 --- drivers/net/ethernet/intel/ice/ice_main.c | 3 --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 4 ---- .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 6 ----- drivers/net/ethernet/marvell/mvneta.c | 5 ---- .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 3 --- .../net/ethernet/mellanox/mlx4/en_netdev.c | 24 ------------------- .../net/ethernet/mellanox/mlx5/core/en_main.c | 18 -------------- .../ethernet/netronome/nfp/nfp_net_common.c | 4 ---- .../net/ethernet/qlogic/qede/qede_filter.c | 3 --- drivers/net/ethernet/sfc/efx.c | 5 ---- drivers/net/ethernet/socionext/netsec.c | 3 --- drivers/net/ethernet/ti/cpsw_priv.c | 3 --- drivers/net/hyperv/netvsc_bpf.c | 21 +--------------- drivers/net/netdevsim/bpf.c | 4 ---- drivers/net/netdevsim/netdevsim.h | 2 +- drivers/net/tun.c | 15 ------------ drivers/net/veth.c | 15 ------------ drivers/net/virtio_net.c | 17 ------------- drivers/net/xen-netfront.c | 21 ---------------- include/linux/netdevice.h | 8 ------- include/net/xdp.h | 2 -- net/core/dev.c | 4 ---- net/core/xdp.c | 9 ------- 28 files changed, 2 insertions(+), 218 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 6478c1e0d137..2a6c9725e092 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -576,15 +576,9 @@ static int ena_xdp_set(struct net_device *netdev, struct netdev_bpf *bpf) */ static int ena_xdp(struct net_device *netdev, struct netdev_bpf *bpf) { - struct ena_adapter *adapter = netdev_priv(netdev); - switch (bpf->command) { case XDP_SETUP_PROG: return ena_xdp_set(netdev, bpf); - case XDP_QUERY_PROG: - bpf->prog_id = adapter->xdp_bpf_prog ? - adapter->xdp_bpf_prog->aux->id : 0; - break; default: return -EINVAL; } diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c index 5e3b4a3b69ea..2704a4709bc7 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_xdp.c @@ -330,10 +330,6 @@ int bnxt_xdp(struct net_device *dev, struct netdev_bpf *xdp) case XDP_SETUP_PROG: rc = bnxt_xdp_set(bp, xdp->prog); break; - case XDP_QUERY_PROG: - xdp->prog_id = bp->xdp_prog ? bp->xdp_prog->aux->id : 0; - rc = 0; - break; default: rc = -EINVAL; break; diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 2ba0ce115e63..1c6163934e20 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1906,9 +1906,6 @@ static int nicvf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return nicvf_xdp_setup(nic, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = nic->xdp_prog ?
nic->xdp_prog->aux->id : 0; - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c index 9b4028c0e34c..17f6bcafc944 100644 --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-eth.c @@ -2077,14 +2077,9 @@ out_err: static int dpaa2_eth_xdp(struct net_device *dev, struct netdev_bpf *xdp) { - struct dpaa2_eth_priv *priv = netdev_priv(dev); - switch (xdp->command) { case XDP_SETUP_PROG: return setup_xdp(dev, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = priv->xdp_prog ? priv->xdp_prog->aux->id : 0; - break; default: return -EINVAL; } diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index dadbfb3d2a2b..d8315811cbdf 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -12923,9 +12923,6 @@ static int i40e_xdp(struct net_device *dev, switch (xdp->command) { case XDP_SETUP_PROG: return i40e_xdp_setup(vsi, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0; - return 0; case XDP_SETUP_XSK_UMEM: return i40e_xsk_umem_setup(vsi, xdp->xsk.umem, xdp->xsk.queue_id); diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 16a4096bb780..231f4b6e93d0 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -2292,9 +2292,6 @@ static int ice_xdp(struct net_device *dev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return ice_xdp_setup_prog(vsi, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = vsi->xdp_prog ? vsi->xdp_prog->aux->id : 0; - return 0; case XDP_SETUP_XSK_UMEM: return ice_xsk_umem_setup(vsi, xdp->xsk.umem, xdp->xsk.queue_id); diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index 4d898ff21a46..6f32b1706ab9 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -10190,10 +10190,6 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return ixgbe_xdp_setup(dev, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = adapter->xdp_prog ? - adapter->xdp_prog->aux->id : 0; - return 0; case XDP_SETUP_XSK_UMEM: return ixgbe_xsk_umem_setup(adapter, xdp->xsk.umem, xdp->xsk.queue_id); diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 6e9a397db583..a6267569bfa9 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -4502,15 +4502,9 @@ static int ixgbevf_xdp_setup(struct net_device *dev, struct bpf_prog *prog) static int ixgbevf_xdp(struct net_device *dev, struct netdev_bpf *xdp) { - struct ixgbevf_adapter *adapter = netdev_priv(dev); - switch (xdp->command) { case XDP_SETUP_PROG: return ixgbevf_xdp_setup(dev, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = adapter->xdp_prog ? 
- adapter->xdp_prog->aux->id : 0; - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/marvell/mvneta.c b/drivers/net/ethernet/marvell/mvneta.c index 2c9277e73cef..6e3f9e2f883b 100644 --- a/drivers/net/ethernet/marvell/mvneta.c +++ b/drivers/net/ethernet/marvell/mvneta.c @@ -4442,14 +4442,9 @@ static int mvneta_xdp_setup(struct net_device *dev, struct bpf_prog *prog, static int mvneta_xdp(struct net_device *dev, struct netdev_bpf *xdp) { - struct mvneta_port *pp = netdev_priv(dev); - switch (xdp->command) { case XDP_SETUP_PROG: return mvneta_xdp_setup(dev, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = pp->xdp_prog ? pp->xdp_prog->aux->id : 0; - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c index 6a3f356640a0..cd5e9d60307e 100644 --- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c @@ -4656,9 +4656,6 @@ static int mvpp2_xdp(struct net_device *dev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return mvpp2_xdp_setup(port, xdp); - case XDP_QUERY_PROG: - xdp->prog_id = port->xdp_prog ? port->xdp_prog->aux->id : 0; - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 2b8608f8f0a9..106513f772c3 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2802,35 +2802,11 @@ unlock_out: return err; } -static u32 mlx4_xdp_query(struct net_device *dev) -{ - struct mlx4_en_priv *priv = netdev_priv(dev); - struct mlx4_en_dev *mdev = priv->mdev; - const struct bpf_prog *xdp_prog; - u32 prog_id = 0; - - if (!priv->tx_ring_num[TX_XDP]) - return prog_id; - - mutex_lock(&mdev->state_lock); - xdp_prog = rcu_dereference_protected( - priv->rx_ring[0]->xdp_prog, - lockdep_is_held(&mdev->state_lock)); - if (xdp_prog) - prog_id = xdp_prog->aux->id; - mutex_unlock(&mdev->state_lock); - - return prog_id; -} - static int mlx4_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return mlx4_xdp_set(dev, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = mlx4_xdp_query(dev); - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index 9d5d8b28bcd8..aa4fb503dac3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4518,29 +4518,11 @@ unlock: return err; } -static u32 mlx5e_xdp_query(struct net_device *dev) -{ - struct mlx5e_priv *priv = netdev_priv(dev); - const struct bpf_prog *xdp_prog; - u32 prog_id = 0; - - mutex_lock(&priv->state_lock); - xdp_prog = priv->channels.params.xdp_prog; - if (xdp_prog) - prog_id = xdp_prog->aux->id; - mutex_unlock(&priv->state_lock); - - return prog_id; -} - static int mlx5e_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return mlx5e_xdp_set(dev, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = mlx5e_xdp_query(dev); - return 0; case XDP_SETUP_XSK_UMEM: return mlx5e_xsk_setup_umem(dev, xdp->xsk.umem, xdp->xsk.queue_id); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index 44608873d3d9..39ee23e8c0bf 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ 
b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -3614,10 +3614,6 @@ static int nfp_net_xdp(struct net_device *netdev, struct netdev_bpf *xdp) return nfp_net_xdp_setup_drv(nn, xdp); case XDP_SETUP_PROG_HW: return nfp_net_xdp_setup_hw(nn, xdp); - case XDP_QUERY_PROG: - return xdp_attachment_query(&nn->xdp, xdp); - case XDP_QUERY_PROG_HW: - return xdp_attachment_query(&nn->xdp_hw, xdp); default: return nfp_app_bpf(nn->app, nn, xdp); } diff --git a/drivers/net/ethernet/qlogic/qede/qede_filter.c b/drivers/net/ethernet/qlogic/qede/qede_filter.c index b7d0b6ccebd3..f961f65d9372 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_filter.c +++ b/drivers/net/ethernet/qlogic/qede/qede_filter.c @@ -1045,9 +1045,6 @@ int qede_xdp(struct net_device *dev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return qede_xdp_set(edev, xdp->prog); - case XDP_QUERY_PROG: - xdp->prog_id = edev->xdp_prog ? edev->xdp_prog->aux->id : 0; - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index f16b4f236031..d60acaa3879d 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -653,15 +653,10 @@ static int efx_xdp_setup_prog(struct efx_nic *efx, struct bpf_prog *prog) static int efx_xdp(struct net_device *dev, struct netdev_bpf *xdp) { struct efx_nic *efx = netdev_priv(dev); - struct bpf_prog *xdp_prog; switch (xdp->command) { case XDP_SETUP_PROG: return efx_xdp_setup_prog(efx, xdp->prog); - case XDP_QUERY_PROG: - xdp_prog = rtnl_dereference(efx->xdp_prog); - xdp->prog_id = xdp_prog ? xdp_prog->aux->id : 0; - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/socionext/netsec.c b/drivers/net/ethernet/socionext/netsec.c index 0f366cc50b74..25db667fa879 100644 --- a/drivers/net/ethernet/socionext/netsec.c +++ b/drivers/net/ethernet/socionext/netsec.c @@ -1811,9 +1811,6 @@ static int netsec_xdp(struct net_device *ndev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return netsec_xdp_setup(priv, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = priv->xdp_prog ? 
priv->xdp_prog->aux->id : 0; - return 0; default: return -EINVAL; } diff --git a/drivers/net/ethernet/ti/cpsw_priv.c b/drivers/net/ethernet/ti/cpsw_priv.c index a399f3659346..d6d7a7d9c7ad 100644 --- a/drivers/net/ethernet/ti/cpsw_priv.c +++ b/drivers/net/ethernet/ti/cpsw_priv.c @@ -1286,9 +1286,6 @@ int cpsw_ndo_bpf(struct net_device *ndev, struct netdev_bpf *bpf) case XDP_SETUP_PROG: return cpsw_xdp_prog_setup(priv, bpf); - case XDP_QUERY_PROG: - return xdp_attachment_query(&priv->xdpi, bpf); - default: return -EINVAL; } diff --git a/drivers/net/hyperv/netvsc_bpf.c b/drivers/net/hyperv/netvsc_bpf.c index 8e4141552423..440486d9c999 100644 --- a/drivers/net/hyperv/netvsc_bpf.c +++ b/drivers/net/hyperv/netvsc_bpf.c @@ -163,16 +163,6 @@ int netvsc_vf_setxdp(struct net_device *vf_netdev, struct bpf_prog *prog) return ret; } -static u32 netvsc_xdp_query(struct netvsc_device *nvdev) -{ - struct bpf_prog *prog = netvsc_xdp_get(nvdev); - - if (prog) - return prog->aux->id; - - return 0; -} - int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf) { struct net_device_context *ndevctx = netdev_priv(dev); @@ -182,12 +172,7 @@ int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf) int ret; if (!nvdev || nvdev->destroy) { - if (bpf->command == XDP_QUERY_PROG) { - bpf->prog_id = 0; - return 0; /* Query must always succeed */ - } else { - return -ENODEV; - } + return -ENODEV; } switch (bpf->command) { @@ -208,10 +193,6 @@ int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf) return ret; - case XDP_QUERY_PROG: - bpf->prog_id = netvsc_xdp_query(nvdev); - return 0; - default: return -EINVAL; } diff --git a/drivers/net/netdevsim/bpf.c b/drivers/net/netdevsim/bpf.c index 0b362b8dac17..2e90512f3bbe 100644 --- a/drivers/net/netdevsim/bpf.c +++ b/drivers/net/netdevsim/bpf.c @@ -551,10 +551,6 @@ int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) ASSERT_RTNL(); switch (bpf->command) { - case XDP_QUERY_PROG: - return xdp_attachment_query(&ns->xdp, bpf); - case XDP_QUERY_PROG_HW: - return xdp_attachment_query(&ns->xdp_hw, bpf); case XDP_SETUP_PROG: err = nsim_setup_prog_checks(ns, bpf); if (err) diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index d164052e0393..284f7092241d 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -121,7 +121,7 @@ static inline void nsim_bpf_uninit(struct netdevsim *ns) static inline int nsim_bpf(struct net_device *dev, struct netdev_bpf *bpf) { - return bpf->command == XDP_QUERY_PROG ? 
0 : -EOPNOTSUPP; + return -EOPNOTSUPP; } static inline int nsim_bpf_disable_tc(struct netdevsim *ns) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 7adeb91bd368..061bebe25cb1 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1184,26 +1184,11 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog, return 0; } -static u32 tun_xdp_query(struct net_device *dev) -{ - struct tun_struct *tun = netdev_priv(dev); - const struct bpf_prog *xdp_prog; - - xdp_prog = rtnl_dereference(tun->xdp_prog); - if (xdp_prog) - return xdp_prog->aux->id; - - return 0; -} - static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return tun_xdp_set(dev, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = tun_xdp_query(dev); - return 0; default: return -EINVAL; } diff --git a/drivers/net/veth.c b/drivers/net/veth.c index b594f03eeddb..e56cd562a664 100644 --- a/drivers/net/veth.c +++ b/drivers/net/veth.c @@ -1198,26 +1198,11 @@ err: return err; } -static u32 veth_xdp_query(struct net_device *dev) -{ - struct veth_priv *priv = netdev_priv(dev); - const struct bpf_prog *xdp_prog; - - xdp_prog = priv->_xdp_prog; - if (xdp_prog) - return xdp_prog->aux->id; - - return 0; -} - static int veth_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return veth_xdp_set(dev, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = veth_xdp_query(dev); - return 0; default: return -EINVAL; } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index ba38765dc490..6fa8fe5ef160 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2490,28 +2490,11 @@ err: return err; } -static u32 virtnet_xdp_query(struct net_device *dev) -{ - struct virtnet_info *vi = netdev_priv(dev); - const struct bpf_prog *xdp_prog; - int i; - - for (i = 0; i < vi->max_queue_pairs; i++) { - xdp_prog = rtnl_dereference(vi->rq[i].xdp_prog); - if (xdp_prog) - return xdp_prog->aux->id; - } - return 0; -} - static int virtnet_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return virtnet_xdp_set(dev, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = virtnet_xdp_query(dev); - return 0; default: return -EINVAL; } diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index a63e550c370e..458be6882b98 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -1480,32 +1480,11 @@ static int xennet_xdp_set(struct net_device *dev, struct bpf_prog *prog, return 0; } -static u32 xennet_xdp_query(struct net_device *dev) -{ - unsigned int num_queues = dev->real_num_tx_queues; - struct netfront_info *np = netdev_priv(dev); - const struct bpf_prog *xdp_prog; - struct netfront_queue *queue; - unsigned int i; - - for (i = 0; i < num_queues; ++i) { - queue = &np->queues[i]; - xdp_prog = rtnl_dereference(queue->xdp_prog); - if (xdp_prog) - return xdp_prog->aux->id; - } - - return 0; -} - static int xennet_xdp(struct net_device *dev, struct netdev_bpf *xdp) { switch (xdp->command) { case XDP_SETUP_PROG: return xennet_xdp_set(dev, xdp->prog, xdp->extack); - case XDP_QUERY_PROG: - xdp->prog_id = xennet_xdp_query(dev); - return 0; default: return -EINVAL; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7d3c412fcfe5..1046763cd0dc 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -876,8 +876,6 @@ enum bpf_netdev_command { */ XDP_SETUP_PROG, 
XDP_SETUP_PROG_HW, - XDP_QUERY_PROG, - XDP_QUERY_PROG_HW, /* BPF program for offload callbacks, invoked at program load time. */ BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE, @@ -911,12 +909,6 @@ struct netdev_bpf { struct bpf_prog *prog; struct netlink_ext_ack *extack; }; - /* XDP_QUERY_PROG, XDP_QUERY_PROG_HW */ - struct { - u32 prog_id; - /* flags with which program was installed */ - u32 prog_flags; - }; /* BPF_OFFLOAD_MAP_ALLOC, BPF_OFFLOAD_MAP_FREE */ struct { struct bpf_offloaded_map *offmap; diff --git a/include/net/xdp.h b/include/net/xdp.h index dbe9c60797e1..3814fb631d52 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -240,8 +240,6 @@ struct xdp_attachment_info { }; struct netdev_bpf; -int xdp_attachment_query(struct xdp_attachment_info *info, - struct netdev_bpf *bpf); bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, struct netdev_bpf *bpf); void xdp_attachment_setup(struct xdp_attachment_info *info, diff --git a/net/core/dev.c b/net/core/dev.c index 82ce0920b172..a2a57988880a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5468,10 +5468,6 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) } break; - case XDP_QUERY_PROG: - xdp->prog_id = old ? old->aux->id : 0; - break; - default: ret = -EINVAL; break; diff --git a/net/core/xdp.c b/net/core/xdp.c index 3c45f99e26d5..48aba933a5a8 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -400,15 +400,6 @@ void __xdp_release_frame(void *data, struct xdp_mem_info *mem) } EXPORT_SYMBOL_GPL(__xdp_release_frame); -int xdp_attachment_query(struct xdp_attachment_info *info, - struct netdev_bpf *bpf) -{ - bpf->prog_id = info->prog ? info->prog->aux->id : 0; - bpf->prog_flags = info->prog ? info->flags : 0; - return 0; -} -EXPORT_SYMBOL_GPL(xdp_attachment_query); - bool xdp_attachment_flags_ok(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { From 70cfab1d871c771bd1963dd898f4f9b2731590c2 Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 24 Jul 2020 10:06:17 +0100 Subject: [PATCH 40/73] tools, bpftool: Skip type probe if name is not found For probing program and map types, bpftool loops on type values and uses the relevant type name in prog_type_name[] or map_type_name[]. To ensure the name exists, we exit from the loop if we go over the size of the array. However, this is not enough in the case where the arrays have "holes" in them, program or map types for which there is no name, but which are not at the end of the list. This is currently the case for BPF_PROG_TYPE_LSM, which is not known to bpftool and whose name is a null string. When probing for features, bpftool attempts to strlen() that name and segfaults. Let's fix it by skipping probes for "unknown" program and map types, with an informational message giving the numerical value in that case.
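The segfault is plain C behaviour: designated initializers leave unlisted indices zero-initialized, so a "hole" in prog_type_name[] or map_type_name[] is a NULL pointer, and strlen(NULL) faults. A standalone illustration with toy names (not bpftool code):

#include <stdio.h>

enum toy_type { TYPE_A, TYPE_B, TYPE_C };	/* stand-in for bpf_prog_type */

static const char *const toy_name[] = {
	[TYPE_A] = "a",
	[TYPE_C] = "c",	/* TYPE_B unlisted: toy_name[TYPE_B] == NULL */
};

int main(void)
{
	for (int i = TYPE_A; i <= TYPE_C; i++) {
		if (!toy_name[i]) {	/* the guard this patch adds */
			printf("type name not found (type %d)\n", i);
			continue;
		}
		printf("type %d: %s\n", i, toy_name[i]);
	}
	return 0;
}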
Fixes: 93a3545d812a ("tools/bpftool: Add name mappings for SK_LOOKUP prog and attach type") Reported-by: Paul Chaignon Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200724090618.16378-2-quentin@isovalent.com --- tools/bpf/bpftool/feature.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index 1cd75807673e..a43a6f10b564 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -504,6 +504,10 @@ probe_prog_type(enum bpf_prog_type prog_type, bool *supported_types, supported_types[prog_type] |= res; + if (!prog_type_name[prog_type]) { + p_info("program type name not found (type %d)", prog_type); + return; + } maxlen = sizeof(plain_desc) - strlen(plain_comment) - 1; if (strlen(prog_type_name[prog_type]) > maxlen) { p_info("program type name too long"); @@ -533,6 +537,10 @@ probe_map_type(enum bpf_map_type map_type, const char *define_prefix, * check required for unprivileged users */ + if (!map_type_name[map_type]) { + p_info("map type name not found (type %d)", map_type); + return; + } maxlen = sizeof(plain_desc) - strlen(plain_comment) - 1; if (strlen(map_type_name[map_type]) > maxlen) { p_info("map type name too long"); From 9a97c9d2af5ca798377342debf7f0f44281d050e Mon Sep 17 00:00:00 2001 From: Quentin Monnet Date: Fri, 24 Jul 2020 10:06:18 +0100 Subject: [PATCH 41/73] tools, bpftool: Add LSM type to array of prog names Assign "lsm" as a printed name for BPF_PROG_TYPE_LSM in bpftool, so that it can use it when listing programs loaded on the system or when probing features. Signed-off-by: Quentin Monnet Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200724090618.16378-3-quentin@isovalent.com --- tools/bpf/bpftool/prog.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index 3e6ecc6332e2..158995d853b0 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -59,6 +59,7 @@ const char * const prog_type_name[] = { [BPF_PROG_TYPE_TRACING] = "tracing", [BPF_PROG_TYPE_STRUCT_OPS] = "struct_ops", [BPF_PROG_TYPE_EXT] = "ext", + [BPF_PROG_TYPE_LSM] = "lsm", [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", }; From e1613b5714ee6c186c9628e9958edf65e9d9cddd Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jul 2020 15:47:15 -0700 Subject: [PATCH 42/73] bpf: Fix bpf_ringbuf_output() signature to return long Due to bpf tree fix merge, bpf_ringbuf_output() signature ended up with int as a return type, while all other helpers got converted to returning long. So fix it in bpf-next now. Fixes: b0659d8a950d ("bpf: Fix definition of bpf_ringbuf_output() helper in UAPI comments") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200727224715.652037-1-andriin@fb.com --- include/uapi/linux/bpf.h | 2 +- tools/include/uapi/linux/bpf.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index e1ba4ae6a916..eb5e0c38eb2c 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -3241,7 +3241,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. 
* If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e1ba4ae6a916..eb5e0c38eb2c 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -3241,7 +3241,7 @@ union bpf_attr { * Return * The id is returned or 0 in case the id could not be retrieved. * - * int bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) * Description * Copy *size* bytes from *data* into a ring buffer *ringbuf*. * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification From 363885d7c62e293fb093c7c355bf5f05fa0a25a9 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Mon, 27 Jul 2020 16:33:45 -0700 Subject: [PATCH 43/73] selftests/bpf: Add new bpf_iter context structs to fix build on old kernels Add bpf_iter__bpf_map_elem and bpf_iter__bpf_sk_storage_map to bpf_iter.h. Fixes: 3b1c420bd882 ("selftests/bpf: Add a test for bpf sk_storage_map iterator") Fixes: 2a7c2fff7dd6 ("selftests/bpf: Add test for bpf hash map iterators") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200727233345.1686358-1-andriin@fb.com --- tools/testing/selftests/bpf/progs/bpf_iter.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/bpf_iter.h b/tools/testing/selftests/bpf/progs/bpf_iter.h index 17db3bac518b..c196280df90d 100644 --- a/tools/testing/selftests/bpf/progs/bpf_iter.h +++ b/tools/testing/selftests/bpf/progs/bpf_iter.h @@ -11,6 +11,8 @@ #define tcp6_sock tcp6_sock___not_used #define bpf_iter__udp bpf_iter__udp___not_used #define udp6_sock udp6_sock___not_used +#define bpf_iter__bpf_map_elem bpf_iter__bpf_map_elem___not_used +#define bpf_iter__bpf_sk_storage_map bpf_iter__bpf_sk_storage_map___not_used #include "vmlinux.h" #undef bpf_iter_meta #undef bpf_iter__bpf_map @@ -22,6 +24,8 @@ #undef tcp6_sock #undef bpf_iter__udp #undef udp6_sock +#undef bpf_iter__bpf_map_elem +#undef bpf_iter__bpf_sk_storage_map struct bpf_iter_meta { struct seq_file *seq; @@ -78,3 +82,17 @@ struct udp6_sock { struct udp_sock udp; struct ipv6_pinfo inet6; } __attribute__((preserve_access_index)); + +struct bpf_iter__bpf_map_elem { + struct bpf_iter_meta *meta; + struct bpf_map *map; + void *key; + void *value; +}; + +struct bpf_iter__bpf_sk_storage_map { + struct bpf_iter_meta *meta; + struct bpf_map *map; + struct sock *sk; + void *value; +}; From f6dfbe31e8fa5cbd5bc89df9d7f0fa0af7e69981 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 27 Jul 2020 18:54:11 +0100 Subject: [PATCH 44/73] bpf: Fix swapped arguments in calls to check_buffer_access In a couple of calls to the function check_buffer_access, the boolean flag zero_size_allowed and the char pointer buf_info are swapped by mistake. Fix these by swapping them back into the correct argument order.
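The swap compiled cleanly because both implicit conversions are legal C: false is the integer constant 0, a valid null pointer constant for a const char * parameter, and any pointer converts to bool. A standalone illustration (not kernel code), which often builds without a warning:

#include <stdbool.h>
#include <stdio.h>

static void probe(bool zero_size_allowed, const char *buf_info)
{
	printf("zero_size_allowed=%d buf_info=%s\n",
	       zero_size_allowed, buf_info ? buf_info : "(null)");
}

int main(void)
{
	probe("rdonly", false);	/* swapped: buf_info silently becomes NULL */
	probe(false, "rdonly");	/* intended order */
	return 0;
}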
Fixes: afbf21dce668 ("bpf: Support readonly/readwrite buffers in verifier") Addresses-Coverity: ("Array compared to 0") Signed-off-by: Colin Ian King Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20200727175411.155179-1-colin.king@canonical.com --- kernel/bpf/verifier.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index cd14e70f2d07..88bb25d08bf8 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3477,14 +3477,14 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn regno, reg_type_str[reg->type]); return -EACCES; } - err = check_buffer_access(env, reg, regno, off, size, "rdonly", - false, + err = check_buffer_access(env, reg, regno, off, size, false, + "rdonly", &env->prog->aux->max_rdonly_access); if (!err && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); } else if (reg->type == PTR_TO_RDWR_BUF) { - err = check_buffer_access(env, reg, regno, off, size, "rdwr", - false, + err = check_buffer_access(env, reg, regno, off, size, false, + "rdwr", &env->prog->aux->max_rdwr_access); if (!err && t == BPF_READ && value_regno >= 0) mark_reg_unknown(env, regs, value_regno); From 3c4f850e8441ac8b3b6dbaa6107604c4199ef01f Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Tue, 28 Jul 2020 01:36:04 -0400 Subject: [PATCH 45/73] xdp: Prevent kernel-infoleak in xsk_getsockopt() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit xsk_getsockopt() is copying uninitialized stack memory to userspace when 'extra_stats' is 'false'. Fix it. Doing '= {};' is sufficient since currently 'struct xdp_statistics' is defined as follows: struct xdp_statistics { __u64 rx_dropped; __u64 rx_invalid_descs; __u64 tx_invalid_descs; __u64 rx_ring_full; __u64 rx_fill_ring_empty_descs; __u64 tx_ring_empty_descs; }; When being copied to the userspace, 'stats' will not contain any uninitialized 'holes' between struct fields. Fixes: 8aa5a33578e9 ("xsk: Add new statistics") Suggested-by: Dan Carpenter Signed-off-by: Peilin Ye Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Acked-by: Song Liu Acked-by: Arnd Bergmann Link: https://lore.kernel.org/bpf/20200728053604.404631-1-yepeilin.cs@gmail.com --- net/xdp/xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 2e94a7e94671..c3231620d210 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -840,7 +840,7 @@ static int xsk_getsockopt(struct socket *sock, int level, int optname, switch (optname) { case XDP_STATISTICS: { - struct xdp_statistics stats; + struct xdp_statistics stats = {}; bool extra_stats = true; size_t stats_size; From ca5cd355b7f0372da0d50fce5b12a3367e417290 Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Tue, 28 Jul 2020 07:39:02 -0700 Subject: [PATCH 46/73] bpf, selftests: use ::1 for localhost in tcp_server.py Using localhost requires the host to have a /etc/hosts file with that specific line in it. By default my dev box did not; it used ip6-localhost, so the test was failing. To fix this, remove the need for any /etc/hosts entry and use ::1 directly. I could just add the line, but this seems easier.
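The same point in C terms (an illustration, not part of the patch): a loopback address literal never consults the resolver, so no /etc/hosts entry is involved:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <unistd.h>

/* Sketch: connect to IPv6 loopback with zero name resolution. */
static int connect_loopback6(unsigned short port)
{
	struct sockaddr_in6 addr = {
		.sin6_family = AF_INET6,
		.sin6_port = htons(port),
		.sin6_addr = IN6ADDR_LOOPBACK_INIT,	/* ::1 */
	};
	int fd = socket(AF_INET6, SOCK_STREAM, 0);

	if (fd < 0)
		return -1;
	if (connect(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
		close(fd);
		return -1;
	}
	return fd;
}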
Signed-off-by: John Fastabend Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159594714197.21431.10113693935099326445.stgit@john-Precision-5820-Tower --- tools/testing/selftests/bpf/tcp_client.py | 2 +- tools/testing/selftests/bpf/tcp_server.py | 2 +- tools/testing/selftests/bpf/test_netcnt.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/bpf/tcp_client.py b/tools/testing/selftests/bpf/tcp_client.py index a53ed58528d6..bfff82be3fc1 100755 --- a/tools/testing/selftests/bpf/tcp_client.py +++ b/tools/testing/selftests/bpf/tcp_client.py @@ -34,7 +34,7 @@ serverPort = int(sys.argv[1]) # create active socket sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) try: - sock.connect(('localhost', serverPort)) + sock.connect(('::1', serverPort)) except socket.error as e: sys.exit(1) diff --git a/tools/testing/selftests/bpf/tcp_server.py b/tools/testing/selftests/bpf/tcp_server.py index 0ca60d193bed..42ab8882f00f 100755 --- a/tools/testing/selftests/bpf/tcp_server.py +++ b/tools/testing/selftests/bpf/tcp_server.py @@ -38,7 +38,7 @@ serverSocket = None # create passive socket serverSocket = socket.socket(socket.AF_INET6, socket.SOCK_STREAM) -try: serverSocket.bind(('localhost', 0)) +try: serverSocket.bind(('::1', 0)) except socket.error as msg: print('bind fails: ' + str(msg)) diff --git a/tools/testing/selftests/bpf/test_netcnt.c b/tools/testing/selftests/bpf/test_netcnt.c index c1da5404454a..7a68c9069639 100644 --- a/tools/testing/selftests/bpf/test_netcnt.c +++ b/tools/testing/selftests/bpf/test_netcnt.c @@ -82,9 +82,9 @@ int main(int argc, char **argv) } if (system("which ping6 &>/dev/null") == 0) - assert(!system("ping6 localhost -c 10000 -f -q > /dev/null")); + assert(!system("ping6 ::1 -c 10000 -f -q > /dev/null")); else - assert(!system("ping -6 localhost -c 10000 -f -q > /dev/null")); + assert(!system("ping -6 ::1 -c 10000 -f -q > /dev/null")); if (bpf_prog_query(cgroup_fd, BPF_CGROUP_INET_EGRESS, 0, NULL, NULL, &prog_cnt)) { From 310ad7970a0dec847563dc6dba9e7e587d545622 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Jul 2020 12:05:27 -0700 Subject: [PATCH 47/73] bpf: Fix build without CONFIG_NET when using BPF XDP link The entire net/core subsystem is not built without CONFIG_NET. linux/netdevice.h just assumes that it's always there, so the easiest way to fix this is to conditionally compile out bpf_xdp_link_attach() use in bpf/syscall.c.
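A common alternative, sketched here as a hypothetical (the patch deliberately uses the smaller call-site guard instead), is a header stub so callers need no #ifdef:

/* Hypothetical header-stub pattern for !CONFIG_NET builds. */
#ifdef CONFIG_NET
int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog);
#else
static inline int bpf_xdp_link_attach(const union bpf_attr *attr,
				      struct bpf_prog *prog)
{
	return -EOPNOTSUPP;
}
#endif /* CONFIG_NET */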
Fixes: aa8d3a716b59 ("bpf, xdp: Add bpf_link-based XDP attachment API") Reported-by: Randy Dunlap Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Randy Dunlap # build-tested Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200728190527.110830-1-andriin@fb.com --- kernel/bpf/syscall.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0e8c88db7e7a..cd3d599e9e90 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3923,9 +3923,11 @@ static int link_create(union bpf_attr *attr) case BPF_PROG_TYPE_SK_LOOKUP: ret = netns_bpf_link_create(attr, prog); break; +#ifdef CONFIG_NET case BPF_PROG_TYPE_XDP: ret = bpf_xdp_link_attach(attr, prog); break; +#endif default: ret = -EINVAL; } From 800834285361dcf8e98b018e891df876472a4fac Mon Sep 17 00:00:00 2001 From: Jean-Philippe Brucker Date: Tue, 28 Jul 2020 17:21:26 +0200 Subject: [PATCH 48/73] bpf, arm64: Add BPF exception tables When a tracing BPF program attempts to read memory without using the bpf_probe_read() helper, the verifier marks the load instruction with the BPF_PROBE_MEM flag. Since the arm64 JIT does not currently recognize this flag it falls back to the interpreter. Add support for BPF_PROBE_MEM, by appending an exception table to the BPF program. If the load instruction causes a data abort, the fixup infrastructure finds the exception table and fixes up the fault, by clearing the destination register and jumping over the faulting instruction. To keep the compact exception table entry format, inspect the pc in fixup_exception(). A more generic solution would add a "handler" field to the table entry, like on x86 and s390. Signed-off-by: Jean-Philippe Brucker Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200728152122.1292756-2-jean-philippe@linaro.org --- arch/arm64/include/asm/extable.h | 12 +++++ arch/arm64/mm/extable.c | 12 +++-- arch/arm64/net/bpf_jit_comp.c | 93 +++++++++++++++++++++++++++++--- 3 files changed, 108 insertions(+), 9 deletions(-) diff --git a/arch/arm64/include/asm/extable.h b/arch/arm64/include/asm/extable.h index 56a4f68b262e..840a35ed92ec 100644 --- a/arch/arm64/include/asm/extable.h +++ b/arch/arm64/include/asm/extable.h @@ -22,5 +22,17 @@ struct exception_table_entry #define ARCH_HAS_RELATIVE_EXTABLE +#ifdef CONFIG_BPF_JIT +int arm64_bpf_fixup_exception(const struct exception_table_entry *ex, + struct pt_regs *regs); +#else /* !CONFIG_BPF_JIT */ +static inline +int arm64_bpf_fixup_exception(const struct exception_table_entry *ex, + struct pt_regs *regs) +{ + return 0; +} +#endif /* !CONFIG_BPF_JIT */ + extern int fixup_exception(struct pt_regs *regs); #endif diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c index 81e694af5f8c..eee1732ab6cd 100644 --- a/arch/arm64/mm/extable.c +++ b/arch/arm64/mm/extable.c @@ -11,8 +11,14 @@ int fixup_exception(struct pt_regs *regs) const struct exception_table_entry *fixup; fixup = search_exception_tables(instruction_pointer(regs)); - if (fixup) - regs->pc = (unsigned long)&fixup->fixup + fixup->fixup; + if (!fixup) + return 0; - return fixup != NULL; + if (IS_ENABLED(CONFIG_BPF_JIT) && + regs->pc >= BPF_JIT_REGION_START && + regs->pc < BPF_JIT_REGION_END) + return arm64_bpf_fixup_exception(fixup, regs); + + regs->pc = (unsigned long)&fixup->fixup + fixup->fixup; + return 1; } diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 3cb25b43b368..f8912e45be7a 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ 
b/arch/arm64/net/bpf_jit_comp.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) "bpf_jit: " fmt +#include <linux/bitfield.h> #include <linux/bpf.h> #include <linux/filter.h> #include <linux/printk.h> @@ -56,6 +57,7 @@ struct jit_ctx { int idx; int epilogue_offset; int *offset; + int exentry_idx; __le32 *image; u32 stack_size; }; @@ -351,6 +353,67 @@ static void build_epilogue(struct jit_ctx *ctx) emit(A64_RET(A64_LR), ctx); } +#define BPF_FIXUP_OFFSET_MASK GENMASK(26, 0) +#define BPF_FIXUP_REG_MASK GENMASK(31, 27) + +int arm64_bpf_fixup_exception(const struct exception_table_entry *ex, + struct pt_regs *regs) +{ + off_t offset = FIELD_GET(BPF_FIXUP_OFFSET_MASK, ex->fixup); + int dst_reg = FIELD_GET(BPF_FIXUP_REG_MASK, ex->fixup); + + regs->regs[dst_reg] = 0; + regs->pc = (unsigned long)&ex->fixup - offset; + return 1; +} + +/* For accesses to BTF pointers, add an entry to the exception table */ +static int add_exception_handler(const struct bpf_insn *insn, + struct jit_ctx *ctx, + int dst_reg) +{ + off_t offset; + unsigned long pc; + struct exception_table_entry *ex; + + if (!ctx->image) + /* First pass */ + return 0; + + if (BPF_MODE(insn->code) != BPF_PROBE_MEM) + return 0; + + if (!ctx->prog->aux->extable || + WARN_ON_ONCE(ctx->exentry_idx >= ctx->prog->aux->num_exentries)) + return -EINVAL; + + ex = &ctx->prog->aux->extable[ctx->exentry_idx]; + pc = (unsigned long)&ctx->image[ctx->idx - 1]; + + offset = pc - (long)&ex->insn; + if (WARN_ON_ONCE(offset >= 0 || offset < INT_MIN)) + return -ERANGE; + ex->insn = offset; + + /* + * Since the extable follows the program, the fixup offset is always + * negative and limited to BPF_JIT_REGION_SIZE. Store a positive value + * to keep things simple, and put the destination register in the upper + * bits. We don't need to worry about buildtime or runtime sort + * modifying the upper bits because the table is already sorted, and + * isn't part of the main exception table. + */ + offset = (long)&ex->fixup - (pc + AARCH64_INSN_SIZE); + if (!FIELD_FIT(BPF_FIXUP_OFFSET_MASK, offset)) + return -ERANGE; + + ex->fixup = FIELD_PREP(BPF_FIXUP_OFFSET_MASK, offset) | + FIELD_PREP(BPF_FIXUP_REG_MASK, dst_reg); + + ctx->exentry_idx++; + return 0; +} + /* JITs an eBPF instruction. * Returns: * 0 - successfully JITed an 8-byte eBPF instruction.
@@ -375,6 +438,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, u8 jmp_cond, reg; s32 jmp_offset; u32 a64_insn; + int ret; #define check_imm(bits, imm) do { \ if ((((imm) > 0) && ((imm) >> (bits))) || \ @@ -694,7 +758,6 @@ emit_cond_jmp: const u8 r0 = bpf2a64[BPF_REG_0]; bool func_addr_fixed; u64 func_addr; - int ret; ret = bpf_jit_get_func_addr(ctx->prog, insn, extra_pass, &func_addr, &func_addr_fixed); @@ -738,6 +801,10 @@ emit_cond_jmp: case BPF_LDX | BPF_MEM | BPF_H: case BPF_LDX | BPF_MEM | BPF_B: case BPF_LDX | BPF_MEM | BPF_DW: + case BPF_LDX | BPF_PROBE_MEM | BPF_DW: + case BPF_LDX | BPF_PROBE_MEM | BPF_W: + case BPF_LDX | BPF_PROBE_MEM | BPF_H: + case BPF_LDX | BPF_PROBE_MEM | BPF_B: emit_a64_mov_i(1, tmp, off, ctx); switch (BPF_SIZE(code)) { case BPF_W: @@ -753,6 +820,10 @@ emit_cond_jmp: emit(A64_LDR64(dst, src, tmp), ctx); break; } + + ret = add_exception_handler(insn, ctx, dst); + if (ret) + return ret; break; /* ST: *(size *)(dst + off) = imm */ @@ -868,6 +939,9 @@ static int validate_code(struct jit_ctx *ctx) return -1; } + if (WARN_ON_ONCE(ctx->exentry_idx != ctx->prog->aux->num_exentries)) + return -1; + return 0; } @@ -884,6 +958,7 @@ struct arm64_jit_data { struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) { + int image_size, prog_size, extable_size; struct bpf_prog *tmp, *orig_prog = prog; struct bpf_binary_header *header; struct arm64_jit_data *jit_data; @@ -891,7 +966,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) bool tmp_blinded = false; bool extra_pass = false; struct jit_ctx ctx; - int image_size; u8 *image_ptr; if (!prog->jit_requested) @@ -922,7 +996,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) image_ptr = jit_data->image; header = jit_data->header; extra_pass = true; - image_size = sizeof(u32) * ctx.idx; + prog_size = sizeof(u32) * ctx.idx; goto skip_init_ctx; } memset(&ctx, 0, sizeof(ctx)); @@ -950,8 +1024,12 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) ctx.epilogue_offset = ctx.idx; build_epilogue(&ctx); + extable_size = prog->aux->num_exentries * + sizeof(struct exception_table_entry); + /* Now we know the actual image size. */ - image_size = sizeof(u32) * ctx.idx; + prog_size = sizeof(u32) * ctx.idx; + image_size = prog_size + extable_size; header = bpf_jit_binary_alloc(image_size, &image_ptr, sizeof(u32), jit_fill_hole); if (header == NULL) { @@ -962,8 +1040,11 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) /* 2. Now, the actual pass. */ ctx.image = (__le32 *)image_ptr; + if (extable_size) + prog->aux->extable = (void *)image_ptr + prog_size; skip_init_ctx: ctx.idx = 0; + ctx.exentry_idx = 0; build_prologue(&ctx, was_classic); @@ -984,7 +1065,7 @@ skip_init_ctx: /* And we're done. */ if (bpf_jit_enable > 1) - bpf_jit_dump(prog->len, image_size, 2, ctx.image); + bpf_jit_dump(prog->len, prog_size, 2, ctx.image); bpf_flush_icache(header, ctx.image + ctx.idx); @@ -1005,7 +1086,7 @@ skip_init_ctx: } prog->bpf_func = (void *)ctx.image; prog->jited = 1; - prog->jited_len = image_size; + prog->jited_len = prog_size; if (!prog->is_func || extra_pass) { bpf_prog_fill_jited_linfo(prog, ctx.offset); From 4fc00b79b85d4c34bef06ad49f109ad7cd9e5d83 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 28 Jul 2020 15:18:01 -0700 Subject: [PATCH 49/73] bpf: Add missing newline characters in verifier error messages Newline characters are added in two verifier error messages, refactored in Commit afbf21dce668 ("bpf: Support readonly/readwrite buffers in verifier"). 
This way, they do not mix with messages afterwards. Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200728221801.1090349-1-yhs@fb.com --- kernel/bpf/verifier.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 88bb25d08bf8..b6ccfce3bf4c 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -3069,7 +3069,7 @@ static int __check_buffer_access(struct bpf_verifier_env *env, { if (off < 0) { verbose(env, - "R%d invalid %s buffer access: off=%d, size=%d", + "R%d invalid %s buffer access: off=%d, size=%d\n", regno, buf_info, off, size); return -EACCES; } @@ -3078,7 +3078,7 @@ static int __check_buffer_access(struct bpf_verifier_env *env, tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); verbose(env, - "R%d invalid variable buffer offset: off=%d, var_off=%s", + "R%d invalid variable buffer offset: off=%d, var_off=%s\n", regno, off, tn_buf); return -EACCES; } From 12e6196fb15953605be54ac9320ac54371aecab7 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Tue, 28 Jul 2020 15:18:01 -0700 Subject: [PATCH 50/73] selftests/bpf: Test bpf_iter buffer access with negative offset Commit afbf21dce668 ("bpf: Support readonly/readwrite buffers in verifier") added readonly/readwrite buffer support which is currently used by bpf_iter tracing programs. It had a bug with incorrect parameter ordering, which was later fixed by commit f6dfbe31e8fa ("bpf: Fix swapped arguments in calls to check_buffer_access"). This patch adds a test case with a negative offset access that triggers the error path. Without commit f6dfbe31e8fa, running the test case in the patch, the error message looks like: R1_w=rdwr_buf(id=0,off=0,imm=0) R10=fp0 ; value_sum += *(__u32 *)(value - 4); 2: (61) r1 = *(u32 *)(r1 -4) R1 invalid (null) buffer access: off=-4, size=4 With the above commit, the error message looks like: R1_w=rdwr_buf(id=0,off=0,imm=0) R10=fp0 ; value_sum += *(__u32 *)(value - 4); 2: (61) r1 = *(u32 *)(r1 -4) R1 invalid rdwr buffer access: off=-4, size=4 Signed-off-by: Yonghong Song Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200728221801.1090406-1-yhs@fb.com --- .../selftests/bpf/prog_tests/bpf_iter.c | 13 ++++++++++++ .../selftests/bpf/progs/bpf_iter_test_kern6.c | 21 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/bpf_iter_test_kern6.c diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c index d95de80b1851..4ffefdc1130f 100644 --- a/tools/testing/selftests/bpf/prog_tests/bpf_iter.c +++ b/tools/testing/selftests/bpf/prog_tests/bpf_iter.c @@ -21,6 +21,7 @@ #include "bpf_iter_bpf_percpu_array_map.skel.h" #include "bpf_iter_bpf_sk_storage_map.skel.h" #include "bpf_iter_test_kern5.skel.h" +#include "bpf_iter_test_kern6.skel.h" static int duration; @@ -885,6 +886,16 @@ static void test_rdonly_buf_out_of_bound(void) bpf_iter_test_kern5__destroy(skel); } +static void test_buf_neg_offset(void) +{ + struct bpf_iter_test_kern6 *skel; + + skel = bpf_iter_test_kern6__open_and_load(); + if (CHECK(skel, "bpf_iter_test_kern6__open_and_load", + "skeleton open_and_load unexpected success\n")) + bpf_iter_test_kern6__destroy(skel); +} + void test_bpf_iter(void) { if (test__start_subtest("btf_id_or_null")) @@ -933,4 +944,6 @@ void test_bpf_iter(void) test_bpf_sk_storage_map(); if (test__start_subtest("rdonly-buf-out-of-bound"))
test_rdonly_buf_out_of_bound(); + if (test__start_subtest("buf-neg-offset")) + test_buf_neg_offset(); } diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_test_kern6.c b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern6.c new file mode 100644 index 000000000000..1c7304f56b1e --- /dev/null +++ b/tools/testing/selftests/bpf/progs/bpf_iter_test_kern6.c @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Facebook */ +#include "bpf_iter.h" +#include <bpf/bpf_helpers.h> + +char _license[] SEC("license") = "GPL"; + +__u32 value_sum = 0; + +SEC("iter/bpf_map_elem") +int dump_bpf_hash_map(struct bpf_iter__bpf_map_elem *ctx) +{ + void *value = ctx->value; + + if (value == (void *)0) + return 0; + + /* negative offset, verifier failure. */ + value_sum += *(__u32 *)(value - 4); + return 0; +} From f7c6cb1d9728dea9d9f131ef57303d6821afb0f8 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jul 2020 17:31:03 -0700 Subject: [PATCH 51/73] bpf: Expose socket storage to BPF_PROG_TYPE_CGROUP_SOCK This lets us use socket storage from the following hooks: * BPF_CGROUP_INET_SOCK_CREATE * BPF_CGROUP_INET_SOCK_RELEASE * BPF_CGROUP_INET4_POST_BIND * BPF_CGROUP_INET6_POST_BIND Using the existing 'bpf_sk_storage_get_proto' doesn't work because its second argument is ARG_PTR_TO_SOCKET. Even though BPF_PROG_TYPE_CGROUP_SOCK hooks operate on 'struct bpf_sock', the verifier still considers it a PTR_TO_CTX. That's why I'm adding another definition, 'bpf_sk_storage_get_cg_sock_proto', strictly for BPF_PROG_TYPE_CGROUP_SOCK; it accepts ARG_PTR_TO_CTX, which is really a 'struct sock' for this program type. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200729003104.1280813-1-sdf@google.com --- net/core/bpf_sk_storage.c | 10 ++++++++++ net/core/filter.c | 3 +++ 2 files changed, 13 insertions(+) diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index eafcd15e7dfd..d3377c90a291 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -944,6 +944,16 @@ const struct bpf_func_proto bpf_sk_storage_get_proto = { .arg4_type = ARG_ANYTHING, }; +const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto = { + .func = bpf_sk_storage_get, + .gpl_only = false, + .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL, + .arg1_type = ARG_CONST_MAP_PTR, + .arg2_type = ARG_PTR_TO_CTX, /* context is 'struct sock' */ + .arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL, + .arg4_type = ARG_ANYTHING, +}; + const struct bpf_func_proto bpf_sk_storage_delete_proto = { .func = bpf_sk_storage_delete, .gpl_only = false, diff --git a/net/core/filter.c b/net/core/filter.c index 29e3455122f7..7124f0fe6974 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -6187,6 +6187,7 @@ bool bpf_helper_changes_pkt_data(void *func) } const struct bpf_func_proto bpf_event_output_data_proto __weak; +const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) @@ -6219,6 +6220,8 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) case BPF_FUNC_get_cgroup_classid: return &bpf_get_cgroup_classid_curr_proto; #endif + case BPF_FUNC_sk_storage_get: + return &bpf_sk_storage_get_cg_sock_proto; default: return bpf_base_func_proto(func_id); } From 4fb5f94911405b6d2645d4384c2ae1215bfc6a76 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 28 Jul 2020 17:31:04 -0700 Subject: [PATCH 52/73] selftests/bpf: Verify
socket storage in cgroup/sock_{create, release} Augment the udp_limit test to set and verify a socket storage value. That should be enough to exercise the changes from the previous patch. Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200729003104.1280813-2-sdf@google.com --- tools/testing/selftests/bpf/progs/udp_limit.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/udp_limit.c b/tools/testing/selftests/bpf/progs/udp_limit.c index 8429b22525a7..165e3c2dd9a3 100644 --- a/tools/testing/selftests/bpf/progs/udp_limit.c +++ b/tools/testing/selftests/bpf/progs/udp_limit.c @@ -6,14 +6,28 @@ int invocations = 0, in_use = 0; +struct { + __uint(type, BPF_MAP_TYPE_SK_STORAGE); + __uint(map_flags, BPF_F_NO_PREALLOC); + __type(key, int); + __type(value, int); +} sk_map SEC(".maps"); + SEC("cgroup/sock_create") int sock(struct bpf_sock *ctx) { + int *sk_storage; __u32 key; if (ctx->type != SOCK_DGRAM) return 1; + sk_storage = bpf_sk_storage_get(&sk_map, ctx, 0, + BPF_SK_STORAGE_GET_F_CREATE); + if (!sk_storage) + return 0; + *sk_storage = 0xdeadbeef; + __sync_fetch_and_add(&invocations, 1); if (in_use > 0) { @@ -31,11 +45,16 @@ int sock(struct bpf_sock *ctx) SEC("cgroup/sock_release") int sock_release(struct bpf_sock *ctx) { + int *sk_storage; __u32 key; if (ctx->type != SOCK_DGRAM) return 1; + sk_storage = bpf_sk_storage_get(&sk_map, ctx, 0, 0); + if (!sk_storage || *sk_storage != 0xdeadbeef) + return 0; + __sync_fetch_and_add(&invocations, 1); __sync_fetch_and_add(&in_use, -1); return 1; From dfdb0d93e5bc351af5b286ae9c630d3cf869b810 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Wed, 29 Jul 2020 16:56:58 +0800 Subject: [PATCH 53/73] selftests/bpf: Add xdpdrv mode for test_xdp_redirect This patch adds xdpdrv mode to test_xdp_redirect.sh, since veth supports native XDP mode. After the update, here is the test result: # ./test_xdp_redirect.sh selftests: test_xdp_redirect xdpgeneric [PASS] selftests: test_xdp_redirect xdpdrv [PASS] Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Acked-by: Song Liu Acked-by: William Tu Link: https://lore.kernel.org/bpf/20200729085658.403794-1-liuhangbin@gmail.com --- .../selftests/bpf/test_xdp_redirect.sh | 96 +++++++++++-------- 1 file changed, 58 insertions(+), 38 deletions(-) diff --git a/tools/testing/selftests/bpf/test_xdp_redirect.sh b/tools/testing/selftests/bpf/test_xdp_redirect.sh index c4b17e08d431..dd80f0c84afb 100755 --- a/tools/testing/selftests/bpf/test_xdp_redirect.sh +++ b/tools/testing/selftests/bpf/test_xdp_redirect.sh @@ -10,52 +10,72 @@ # | xdp forwarding | # ------------------ +ret=0 + +setup() +{ + + local xdpmode=$1 + + ip netns add ns1 + ip netns add ns2 + + ip link add veth1 index 111 type veth peer name veth11 netns ns1 + ip link add veth2 index 222 type veth peer name veth22 netns ns2 + + ip link set veth1 up + ip link set veth2 up + ip -n ns1 link set dev veth11 up + ip -n ns2 link set dev veth22 up + + ip -n ns1 addr add 10.1.1.11/24 dev veth11 + ip -n ns2 addr add 10.1.1.22/24 dev veth22 +} + cleanup() { - if [ "$?" = "0" ]; then - echo "selftests: test_xdp_redirect [PASS]"; - else - echo "selftests: test_xdp_redirect [FAILED]"; - fi - - set +e ip link del veth1 2> /dev/null ip link del veth2 2> /dev/null ip netns del ns1 2> /dev/null ip netns del ns2 2> /dev/null } -ip link set dev lo xdpgeneric off 2>/dev/null > /dev/null -if [ $?
-ne 0 ];then - echo "selftests: [SKIP] Could not run test without the ip xdpgeneric support" - exit 0 -fi +test_xdp_redirect() +{ + local xdpmode=$1 + + setup + + ip link set dev veth1 $xdpmode off &> /dev/null + if [ $? -ne 0 ];then + echo "selftests: test_xdp_redirect $xdpmode [SKIP]" + return 0 + fi + + ip -n ns1 link set veth11 $xdpmode obj xdp_dummy.o sec xdp_dummy &> /dev/null + ip -n ns2 link set veth22 $xdpmode obj xdp_dummy.o sec xdp_dummy &> /dev/null + ip link set dev veth1 $xdpmode obj test_xdp_redirect.o sec redirect_to_222 &> /dev/null + ip link set dev veth2 $xdpmode obj test_xdp_redirect.o sec redirect_to_111 &> /dev/null + + ip netns exec ns1 ping -c 1 10.1.1.22 &> /dev/null + local ret1=$? + ip netns exec ns2 ping -c 1 10.1.1.11 &> /dev/null + local ret2=$? + + if [ $ret1 -eq 0 -a $ret2 -eq 0 ]; then + echo "selftests: test_xdp_redirect $xdpmode [PASS]"; + else + ret=1 + echo "selftests: test_xdp_redirect $xdpmode [FAILED]"; + fi + + cleanup +} + set -e +trap cleanup 2 3 6 9 -ip netns add ns1 -ip netns add ns2 +test_xdp_redirect xdpgeneric +test_xdp_redirect xdpdrv -trap cleanup 0 2 3 6 9 - -ip link add veth1 index 111 type veth peer name veth11 -ip link add veth2 index 222 type veth peer name veth22 - -ip link set veth11 netns ns1 -ip link set veth22 netns ns2 - -ip link set veth1 up -ip link set veth2 up - -ip netns exec ns1 ip addr add 10.1.1.11/24 dev veth11 -ip netns exec ns2 ip addr add 10.1.1.22/24 dev veth22 - -ip netns exec ns1 ip link set dev veth11 up -ip netns exec ns2 ip link set dev veth22 up - -ip link set dev veth1 xdpgeneric obj test_xdp_redirect.o sec redirect_to_222 -ip link set dev veth2 xdpgeneric obj test_xdp_redirect.o sec redirect_to_111 - -ip netns exec ns1 ping -c 1 10.1.1.22 -ip netns exec ns2 ping -c 1 10.1.1.11 - -exit 0 +exit $ret From 80546ac4586c0bd326aa7ce80f076646db02bcd0 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Tue, 28 Jul 2020 21:50:56 -0700 Subject: [PATCH 54/73] selftests/bpf: Don't destroy failed link Check that a link is NULL or a proper pointer before invoking bpf_link__destroy(). Not doing this causes a crash in test_progs when the cg_storage_multi selftest fails.
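For illustration, the crash pattern looks roughly like this (a hypothetical sketch, not code from this patch; on attach failure libbpf returns an ERR_PTR-encoded error, which is non-NULL):

	struct bpf_link *link = bpf_program__attach_cgroup(prog, cg_fd);
	/* on failure, link == ERR_PTR(-err): non-NULL, yet not a valid object */
	if (!IS_ERR(link))
		bpf_link__destroy(link); /* guard needed, or this dereferences garbage */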
Fixes: 3573f384014f ("selftests/bpf: Test CGROUP_STORAGE behavior on shared egress + ingress") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200729045056.3363921-1-andriin@fb.com --- .../bpf/prog_tests/cg_storage_multi.c | 42 ++++++++++++------- 1 file changed, 28 insertions(+), 14 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c index c67d8c076a34..643dfa35419c 100644 --- a/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c +++ b/tools/testing/selftests/bpf/prog_tests/cg_storage_multi.c @@ -147,8 +147,10 @@ static void test_egress_only(int parent_cgroup_fd, int child_cgroup_fd) goto close_bpf_object; close_bpf_object: - bpf_link__destroy(parent_link); - bpf_link__destroy(child_link); + if (!IS_ERR(parent_link)) + bpf_link__destroy(parent_link); + if (!IS_ERR(child_link)) + bpf_link__destroy(child_link); cg_storage_multi_egress_only__destroy(obj); } @@ -262,12 +264,18 @@ static void test_isolated(int parent_cgroup_fd, int child_cgroup_fd) goto close_bpf_object; close_bpf_object: - bpf_link__destroy(parent_egress1_link); - bpf_link__destroy(parent_egress2_link); - bpf_link__destroy(parent_ingress_link); - bpf_link__destroy(child_egress1_link); - bpf_link__destroy(child_egress2_link); - bpf_link__destroy(child_ingress_link); + if (!IS_ERR(parent_egress1_link)) + bpf_link__destroy(parent_egress1_link); + if (!IS_ERR(parent_egress2_link)) + bpf_link__destroy(parent_egress2_link); + if (!IS_ERR(parent_ingress_link)) + bpf_link__destroy(parent_ingress_link); + if (!IS_ERR(child_egress1_link)) + bpf_link__destroy(child_egress1_link); + if (!IS_ERR(child_egress2_link)) + bpf_link__destroy(child_egress2_link); + if (!IS_ERR(child_ingress_link)) + bpf_link__destroy(child_ingress_link); cg_storage_multi_isolated__destroy(obj); } @@ -367,12 +375,18 @@ static void test_shared(int parent_cgroup_fd, int child_cgroup_fd) goto close_bpf_object; close_bpf_object: - bpf_link__destroy(parent_egress1_link); - bpf_link__destroy(parent_egress2_link); - bpf_link__destroy(parent_ingress_link); - bpf_link__destroy(child_egress1_link); - bpf_link__destroy(child_egress2_link); - bpf_link__destroy(child_ingress_link); + if (!IS_ERR(parent_egress1_link)) + bpf_link__destroy(parent_egress1_link); + if (!IS_ERR(parent_egress2_link)) + bpf_link__destroy(parent_egress2_link); + if (!IS_ERR(parent_ingress_link)) + bpf_link__destroy(parent_ingress_link); + if (!IS_ERR(child_egress1_link)) + bpf_link__destroy(child_egress1_link); + if (!IS_ERR(child_egress2_link)) + bpf_link__destroy(child_egress2_link); + if (!IS_ERR(child_ingress_link)) + bpf_link__destroy(child_ingress_link); cg_storage_multi_shared__destroy(obj); } From a6599abdeac30063baf89df166068b20758e0e86 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Thu, 30 Jul 2020 14:53:25 +0200 Subject: [PATCH 55/73] selftests/bpf: Omit nodad flag when adding addresses to loopback Setting the IFA_F_NODAD flag for IPv6 addresses added to loopback is unnecessary; Duplicate Address Detection does not happen on the loopback device. Also, passing the 'nodad' flag to 'ip address' breaks libbpf CI, which runs in an environment with a BusyBox implementation of the 'ip' command that doesn't understand this flag.
Fixes: 0ab5539f8584 ("selftests/bpf: Tests for BPF_SK_LOOKUP attach point") Reported-by: Andrii Nakryiko Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Tested-by: Andrii Nakryiko Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20200730125325.1869363-1-jakub@cloudflare.com --- tools/testing/selftests/bpf/prog_tests/sk_lookup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 9bbd2b2b7630..379da6f10ee9 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -1290,8 +1290,8 @@ static void run_tests(struct test_sk_lookup *skel) static int switch_netns(void) { static const char * const setup_script[] = { - "ip -6 addr add dev lo " EXT_IP6 "/128 nodad", - "ip -6 addr add dev lo " INT_IP6 "/128 nodad", + "ip -6 addr add dev lo " EXT_IP6 "/128", + "ip -6 addr add dev lo " INT_IP6 "/128", "ip link set dev lo up", NULL, }; From 50450fc716c1a570ee8d8bfe198ef5d3cfca36e4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 29 Jul 2020 16:21:48 -0700 Subject: [PATCH 56/73] libbpf: Make destructors more robust by handling ERR_PTR(err) cases Most of libbpf's "constructors" return an ERR_PTR(err) result encoded as a pointer on failure. It's a common mistake to eventually pass such malformed pointers into xxx__destroy()/xxx__free() "destructors". So instead of fixing up cleanup code in selftests and user programs, handle such error pointers in destructors themselves. This already works beautifully for NULL pointers passed to destructors, so it might as well work for error pointers too. Suggested-by: Song Liu Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20200729232148.896125-1-andriin@fb.com --- tools/lib/bpf/btf.c | 4 ++-- tools/lib/bpf/btf_dump.c | 2 +- tools/lib/bpf/libbpf.c | 9 ++++----- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index c9e760e120dc..ded5b29965f9 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -386,7 +386,7 @@ __s32 btf__find_by_name_kind(const struct btf *btf, const char *type_name, void btf__free(struct btf *btf) { - if (!btf) + if (IS_ERR_OR_NULL(btf)) return; if (btf->fd >= 0) @@ -1025,7 +1025,7 @@ static int btf_ext_parse_hdr(__u8 *data, __u32 data_size) void btf_ext__free(struct btf_ext *btf_ext) { - if (!btf_ext) + if (IS_ERR_OR_NULL(btf_ext)) return; free(btf_ext->data); free(btf_ext); diff --git a/tools/lib/bpf/btf_dump.c b/tools/lib/bpf/btf_dump.c index e1c344504cae..cf711168d34a 100644 --- a/tools/lib/bpf/btf_dump.c +++ b/tools/lib/bpf/btf_dump.c @@ -183,7 +183,7 @@ void btf_dump__free(struct btf_dump *d) { int i, cnt; - if (!d) + if (IS_ERR_OR_NULL(d)) return; free(d->type_states); diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index 54830d603fee..b9f11f854985 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -6504,7 +6504,7 @@ void bpf_object__close(struct bpf_object *obj) { size_t i; - if (!obj) + if (IS_ERR_OR_NULL(obj)) return; if (obj->clear_priv) @@ -7690,7 +7690,7 @@ int bpf_link__destroy(struct bpf_link *link) { int err = 0; - if (!link) + if (IS_ERR_OR_NULL(link)) return 0; if (!link->disconnected && link->detach) @@ -8502,7 +8502,7 @@ void perf_buffer__free(struct perf_buffer *pb) { int i; - if (!pb) + if (IS_ERR_OR_NULL(pb)) return; if (pb->cpu_bufs) { for (i = 0; i < pb->cpu_cnt;
i++) { @@ -9379,8 +9379,7 @@ void bpf_object__detach_skeleton(struct bpf_object_skeleton *s) for (i = 0; i < s->prog_cnt; i++) { struct bpf_link **link = s->progs[i].link; - if (!IS_ERR_OR_NULL(*link)) - bpf_link__destroy(*link); + bpf_link__destroy(*link); *link = NULL; } } From c64c9c282a9a7ec0515b725d5aaed68c32e403a4 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Sun, 26 Jul 2020 14:02:28 +0200 Subject: [PATCH 57/73] udp, bpf: Ignore connections in reuseport group after BPF sk lookup When BPF sk lookup invokes reuseport handling for the selected socket, it should ignore the fact that the reuseport group can contain connected UDP sockets. With BPF sk lookup this is not relevant, as we are not scoring sockets to find the best match, which might be a connected UDP socket. Fix it by unconditionally accepting the socket selected by reuseport. This fixes the following two failures reported by test_progs. # ./test_progs -t sk_lookup ... #73/14 UDP IPv4 redir and reuseport with conns:FAIL ... #73/20 UDP IPv6 redir and reuseport with conns:FAIL ... Fixes: a57066b1a019 ("Merge git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net") Reported-by: Alexei Starovoitov Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20200726120228.1414348-1-jakub@cloudflare.com --- net/ipv4/udp.c | 2 +- net/ipv6/udp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7ce31beccfc2..e88efba07551 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -473,7 +473,7 @@ static struct sock *udp4_lookup_run_bpf(struct net *net, return sk; reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); - if (reuse_sk && !reuseport_has_conns(sk, false)) + if (reuse_sk) sk = reuse_sk; return sk; } diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index c394e674f486..29d9691359b9 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -208,7 +208,7 @@ static inline struct sock *udp6_lookup_run_bpf(struct net *net, return sk; reuse_sk = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); - if (reuse_sk && !reuseport_has_conns(sk, false)) + if (reuse_sk) sk = reuse_sk; return sk; } From 1acf8f90ea7ee59006d0474275922145ac291331 Mon Sep 17 00:00:00 2001 From: Jerry Crunchtime Date: Fri, 31 Jul 2020 17:08:01 +0200 Subject: [PATCH 58/73] libbpf: Fix register in PT_REGS MIPS macros The o32, n32 and n64 calling conventions require the return value to be stored in $v0, which maps to the $2 register, i.e., register 2.
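For context, PT_REGS_RC() is what return-probe style BPF programs use to read a traced function's return value; a minimal, hypothetical sketch (the probed function is chosen only for illustration):

	SEC("kretprobe/vfs_read")
	int trace_vfs_read_ret(struct pt_regs *ctx)
	{
		long ret = PT_REGS_RC(ctx); /* on MIPS this must read $v0, i.e. regs[2] */

		bpf_printk("vfs_read returned %ld", ret);
		return 0;
	}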
Fixes: c1932cd ("bpf: Add MIPS support to samples/bpf.") Signed-off-by: Jerry Crunchtime Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/43707d31-0210-e8f0-9226-1af140907641@web.de --- tools/lib/bpf/bpf_tracing.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/bpf_tracing.h b/tools/lib/bpf/bpf_tracing.h index 58eceb884df3..eebf020cbe3e 100644 --- a/tools/lib/bpf/bpf_tracing.h +++ b/tools/lib/bpf/bpf_tracing.h @@ -215,7 +215,7 @@ struct pt_regs; #define PT_REGS_PARM5(x) ((x)->regs[8]) #define PT_REGS_RET(x) ((x)->regs[31]) #define PT_REGS_FP(x) ((x)->regs[30]) /* Works only with CONFIG_FRAME_POINTER */ -#define PT_REGS_RC(x) ((x)->regs[1]) +#define PT_REGS_RC(x) ((x)->regs[2]) #define PT_REGS_SP(x) ((x)->regs[29]) #define PT_REGS_IP(x) ((x)->cp0_epc) @@ -226,7 +226,7 @@ struct pt_regs; #define PT_REGS_PARM5_CORE(x) BPF_CORE_READ((x), regs[8]) #define PT_REGS_RET_CORE(x) BPF_CORE_READ((x), regs[31]) #define PT_REGS_FP_CORE(x) BPF_CORE_READ((x), regs[30]) -#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), regs[1]) +#define PT_REGS_RC_CORE(x) BPF_CORE_READ((x), regs[2]) #define PT_REGS_SP_CORE(x) BPF_CORE_READ((x), regs[29]) #define PT_REGS_IP_CORE(x) BPF_CORE_READ((x), cp0_epc) From ffba964e4d1126bef5d636e8af70f052e50342fc Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Fri, 31 Jul 2020 16:29:02 +0800 Subject: [PATCH 59/73] Documentation/bpf: Use valid and new links in index.rst Clicking the HTML link for "Documentation/networking/filter.rst" in the BPF documentation [1] results in a "404 Not Found" error; fix it. Additionally, use the new links for the "BPF and XDP Reference Guide" and "bpf(2)" to avoid redirects. [1] https://www.kernel.org/doc/html/latest/bpf/ Fixes: d9b9170a2653 ("docs: bpf: Rename README.rst to index.rst") Fixes: cb3f0d56e153 ("docs: networking: convert filter.txt to ReST") Signed-off-by: Tiezhu Yang Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/1596184142-18476-1-git-send-email-yangtiezhu@loongson.cn --- Documentation/bpf/index.rst | 12 ++++++------ Documentation/networking/filter.rst | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/Documentation/bpf/index.rst b/Documentation/bpf/index.rst index 26f4bb3107fc..44ca8ea5a723 100644 --- a/Documentation/bpf/index.rst +++ b/Documentation/bpf/index.rst @@ -5,10 +5,10 @@ BPF Documentation This directory contains documentation for the BPF (Berkeley Packet Filter) facility, with a focus on the extended BPF version (eBPF). -This kernel side documentation is still work in progress. The main +This kernel side documentation is still work in progress. The main textual documentation is (for historical reasons) described in -`Documentation/networking/filter.rst`_, which describe both classical -and extended BPF instruction-set. +:ref:`networking-filter`, which describe both classical and extended +BPF instruction-set. The Cilium project also maintains a `BPF and XDP Reference Guide`_ that goes into great technical depth about the BPF Architecture. @@ -68,7 +68,7 @@ Testing and debugging BPF .. Links: -.. _Documentation/networking/filter.rst: ../networking/filter.txt +.. _networking-filter: ../networking/filter.rst .. _man-pages: https://www.kernel.org/doc/man-pages/ -.. _bpf(2): http://man7.org/linux/man-pages/man2/bpf.2.html -.. _BPF and XDP Reference Guide: http://cilium.readthedocs.io/en/latest/bpf/ +.. _bpf(2): https://man7.org/linux/man-pages/man2/bpf.2.html +..
_BPF and XDP Reference Guide: https://docs.cilium.io/en/latest/bpf/ diff --git a/Documentation/networking/filter.rst b/Documentation/networking/filter.rst index a1d3e192b9fa..debb59e374de 100644 --- a/Documentation/networking/filter.rst +++ b/Documentation/networking/filter.rst @@ -1,5 +1,7 @@ .. SPDX-License-Identifier: GPL-2.0 +.. _networking-filter: + ======================================================= Linux Socket Filtering aka Berkeley Packet Filter (BPF) ======================================================= From 4939b2847d26c025e2e2118744226967f239a1ac Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Fri, 31 Jul 2020 15:09:14 -0700 Subject: [PATCH 60/73] bpf, selftests: Use single cgroup helpers for both test_sockmap/progs Nearly every user of cgroup helpers does the same sequence of API calls. So push these into a single helper cgroup_setup_and_join. The cases that do a bit of extra logic are test_progs, which currently uses an env variable to decide if it needs to set up the cgroup environment or can use an existing environment, and tests that are doing cgroup tests themselves. We skip these cases for now. Signed-off-by: John Fastabend Signed-off-by: Alexei Starovoitov Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/159623335418.30208.15807461815525100199.stgit@john-XPS-13-9370 --- tools/testing/selftests/bpf/cgroup_helpers.c | 23 +++++++++++++++++++ tools/testing/selftests/bpf/cgroup_helpers.h | 1 + .../selftests/bpf/get_cgroup_id_user.c | 14 ++--------- .../selftests/bpf/test_cgroup_storage.c | 17 +------------- tools/testing/selftests/bpf/test_dev_cgroup.c | 15 ++---------- tools/testing/selftests/bpf/test_netcnt.c | 17 ++------------ .../selftests/bpf/test_skb_cgroup_id_user.c | 8 +------ tools/testing/selftests/bpf/test_sock.c | 8 +------ tools/testing/selftests/bpf/test_sock_addr.c | 8 +------ .../testing/selftests/bpf/test_sock_fields.c | 14 +++-------- .../selftests/bpf/test_socket_cookie.c | 8 +------ tools/testing/selftests/bpf/test_sockmap.c | 18 ++------------- tools/testing/selftests/bpf/test_sysctl.c | 8 +------ .../testing/selftests/bpf/test_tcpbpf_user.c | 8 +------ .../selftests/bpf/test_tcpnotify_user.c | 8 +------ 15 files changed, 43 insertions(+), 132 deletions(-) diff --git a/tools/testing/selftests/bpf/cgroup_helpers.c b/tools/testing/selftests/bpf/cgroup_helpers.c index 0fb910df5387..033051717ba5 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.c +++ b/tools/testing/selftests/bpf/cgroup_helpers.c @@ -290,3 +290,26 @@ free_mem: free(fhp); return ret; } + +int cgroup_setup_and_join(const char *path) { + int cg_fd; + + if (setup_cgroup_environment()) { + fprintf(stderr, "Failed to setup cgroup environment\n"); + return -EINVAL; + } + + cg_fd = create_and_get_cgroup(path); + if (cg_fd < 0) { + fprintf(stderr, "Failed to create test cgroup\n"); + cleanup_cgroup_environment(); + return cg_fd; + } + + if (join_cgroup(path)) { + fprintf(stderr, "Failed to join cgroup\n"); + cleanup_cgroup_environment(); + return -EINVAL; + } + return cg_fd; +} diff --git a/tools/testing/selftests/bpf/cgroup_helpers.h b/tools/testing/selftests/bpf/cgroup_helpers.h index d64bb8957090..5fe3d88e4f0d 100644 --- a/tools/testing/selftests/bpf/cgroup_helpers.h +++ b/tools/testing/selftests/bpf/cgroup_helpers.h @@ -9,6 +9,7 @@ __FILE__, __LINE__, clean_errno(), ##__VA_ARGS__) +int cgroup_setup_and_join(const char *path); int create_and_get_cgroup(const char *path); int join_cgroup(const char *path); int setup_cgroup_environment(void); diff --git
a/tools/testing/selftests/bpf/get_cgroup_id_user.c b/tools/testing/selftests/bpf/get_cgroup_id_user.c index e8da7b39158d..b8d6aef99db4 100644 --- a/tools/testing/selftests/bpf/get_cgroup_id_user.c +++ b/tools/testing/selftests/bpf/get_cgroup_id_user.c @@ -58,20 +58,10 @@ int main(int argc, char **argv) int exit_code = 1; char buf[256]; - err = setup_cgroup_environment(); - if (CHECK(err, "setup_cgroup_environment", "err %d errno %d\n", err, - errno)) + cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); + if (CHECK(cgroup_fd < 0, "cgroup_setup_and_join", "err %d errno %d\n", cgroup_fd, errno)) return 1; - cgroup_fd = create_and_get_cgroup(TEST_CGROUP); - if (CHECK(cgroup_fd < 0, "create_and_get_cgroup", "err %d errno %d\n", - cgroup_fd, errno)) - goto cleanup_cgroup_env; - - err = join_cgroup(TEST_CGROUP); - if (CHECK(err, "join_cgroup", "err %d errno %d\n", err, errno)) - goto cleanup_cgroup_env; - err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); if (CHECK(err, "bpf_prog_load", "err %d errno %d\n", err, errno)) goto cleanup_cgroup_env; diff --git a/tools/testing/selftests/bpf/test_cgroup_storage.c b/tools/testing/selftests/bpf/test_cgroup_storage.c index 655729004391..d946252a25bb 100644 --- a/tools/testing/selftests/bpf/test_cgroup_storage.c +++ b/tools/testing/selftests/bpf/test_cgroup_storage.c @@ -74,22 +74,7 @@ int main(int argc, char **argv) goto out; } - if (setup_cgroup_environment()) { - printf("Failed to setup cgroup environment\n"); - goto err; - } - - /* Create a cgroup, get fd, and join it */ - cgroup_fd = create_and_get_cgroup(TEST_CGROUP); - if (cgroup_fd < 0) { - printf("Failed to create test cgroup\n"); - goto err; - } - - if (join_cgroup(TEST_CGROUP)) { - printf("Failed to join cgroup\n"); - goto err; - } + cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); /* Attach the bpf program */ if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, 0)) { diff --git a/tools/testing/selftests/bpf/test_dev_cgroup.c b/tools/testing/selftests/bpf/test_dev_cgroup.c index d850fb9076b5..804dddd97d4c 100644 --- a/tools/testing/selftests/bpf/test_dev_cgroup.c +++ b/tools/testing/selftests/bpf/test_dev_cgroup.c @@ -33,21 +33,10 @@ int main(int argc, char **argv) goto out; } - if (setup_cgroup_environment()) { - printf("Failed to load DEV_CGROUP program\n"); - goto err; - } - - /* Create a cgroup, get fd, and join it */ - cgroup_fd = create_and_get_cgroup(TEST_CGROUP); + cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); if (cgroup_fd < 0) { printf("Failed to create test cgroup\n"); - goto err; - } - - if (join_cgroup(TEST_CGROUP)) { - printf("Failed to join cgroup\n"); - goto err; + goto out; } /* Attach bpf program */ diff --git a/tools/testing/selftests/bpf/test_netcnt.c b/tools/testing/selftests/bpf/test_netcnt.c index 7a68c9069639..a7b9a69f4fd5 100644 --- a/tools/testing/selftests/bpf/test_netcnt.c +++ b/tools/testing/selftests/bpf/test_netcnt.c @@ -58,22 +58,9 @@ int main(int argc, char **argv) goto out; } - if (setup_cgroup_environment()) { - printf("Failed to load bpf program\n"); + cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); + if (cgroup_fd < 0) goto err; - } - - /* Create a cgroup, get fd, and join it */ - cgroup_fd = create_and_get_cgroup(TEST_CGROUP); - if (cgroup_fd < 0) { - printf("Failed to create test cgroup\n"); - goto err; - } - - if (join_cgroup(TEST_CGROUP)) { - printf("Failed to join cgroup\n"); - goto err; - } /* Attach bpf program */ if (bpf_prog_attach(prog_fd, cgroup_fd, BPF_CGROUP_INET_EGRESS, 0)) { diff --git 
a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c index 356351c0ac28..4a64306728ab 100644 --- a/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c +++ b/tools/testing/selftests/bpf/test_skb_cgroup_id_user.c @@ -160,16 +160,10 @@ int main(int argc, char **argv) exit(EXIT_FAILURE); } - if (setup_cgroup_environment()) - goto err; - - cgfd = create_and_get_cgroup(CGROUP_PATH); + cgfd = cgroup_setup_and_join(CGROUP_PATH); if (cgfd < 0) goto err; - if (join_cgroup(CGROUP_PATH)) - goto err; - if (send_packet(argv[1])) goto err; diff --git a/tools/testing/selftests/bpf/test_sock.c b/tools/testing/selftests/bpf/test_sock.c index 52bf14955797..9613f7538840 100644 --- a/tools/testing/selftests/bpf/test_sock.c +++ b/tools/testing/selftests/bpf/test_sock.c @@ -464,16 +464,10 @@ int main(int argc, char **argv) int cgfd = -1; int err = 0; - if (setup_cgroup_environment()) - goto err; - - cgfd = create_and_get_cgroup(CG_PATH); + cgfd = cgroup_setup_and_join(CG_PATH); if (cgfd < 0) goto err; - if (join_cgroup(CG_PATH)) - goto err; - if (run_tests(cgfd)) goto err; diff --git a/tools/testing/selftests/bpf/test_sock_addr.c b/tools/testing/selftests/bpf/test_sock_addr.c index 0358814c67dc..b8c72c1d9cf7 100644 --- a/tools/testing/selftests/bpf/test_sock_addr.c +++ b/tools/testing/selftests/bpf/test_sock_addr.c @@ -1638,16 +1638,10 @@ int main(int argc, char **argv) exit(err); } - if (setup_cgroup_environment()) - goto err; - - cgfd = create_and_get_cgroup(CG_PATH); + cgfd = cgroup_setup_and_join(CG_PATH); if (cgfd < 0) goto err; - if (join_cgroup(CG_PATH)) - goto err; - if (run_tests(cgfd)) goto err; diff --git a/tools/testing/selftests/bpf/test_sock_fields.c b/tools/testing/selftests/bpf/test_sock_fields.c index f0fc103261a4..6c9f269c396d 100644 --- a/tools/testing/selftests/bpf/test_sock_fields.c +++ b/tools/testing/selftests/bpf/test_sock_fields.c @@ -421,19 +421,11 @@ int main(int argc, char **argv) struct bpf_object *obj; struct bpf_map *map; - err = setup_cgroup_environment(); - CHECK(err, "setup_cgroup_environment()", "err:%d errno:%d", - err, errno); - - atexit(cleanup_cgroup_environment); - /* Create a cgroup, get fd, and join it */ - cgroup_fd = create_and_get_cgroup(TEST_CGROUP); - CHECK(cgroup_fd == -1, "create_and_get_cgroup()", + cgroup_fd = cgroup_setup_and_join(TEST_CGROUP); + CHECK(cgroup_fd < 0, "cgroup_setup_and_join()", "cgroup_fd:%d errno:%d", cgroup_fd, errno); - - err = join_cgroup(TEST_CGROUP); - CHECK(err, "join_cgroup", "err:%d errno:%d", err, errno); + atexit(cleanup_cgroup_environment); err = bpf_prog_load_xattr(&attr, &obj, &egress_fd); CHECK(err, "bpf_prog_load_xattr()", "err:%d", err); diff --git a/tools/testing/selftests/bpf/test_socket_cookie.c b/tools/testing/selftests/bpf/test_socket_cookie.c index 15653b0e26eb..154a8fd2a48d 100644 --- a/tools/testing/selftests/bpf/test_socket_cookie.c +++ b/tools/testing/selftests/bpf/test_socket_cookie.c @@ -191,16 +191,10 @@ int main(int argc, char **argv) int cgfd = -1; int err = 0; - if (setup_cgroup_environment()) - goto err; - - cgfd = create_and_get_cgroup(CG_PATH); + cgfd = cgroup_setup_and_join(CG_PATH); if (cgfd < 0) goto err; - if (join_cgroup(CG_PATH)) - goto err; - if (run_test(cgfd)) goto err; diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c index 78789b27e573..9b6fb00dc7a0 100644 --- a/tools/testing/selftests/bpf/test_sockmap.c +++ b/tools/testing/selftests/bpf/test_sockmap.c @@ -1963,23 +1963,9 @@ int main(int 
argc, char **argv) } if (!cg_fd) { - if (setup_cgroup_environment()) { - fprintf(stderr, "ERROR: cgroup env failed\n"); - return -EINVAL; - } - - cg_fd = create_and_get_cgroup(CG_PATH); - if (cg_fd < 0) { - fprintf(stderr, - "ERROR: (%i) open cg path failed: %s\n", - cg_fd, strerror(errno)); + cg_fd = cgroup_setup_and_join(CG_PATH); + if (cg_fd < 0) return cg_fd; - } - - if (join_cgroup(CG_PATH)) { - fprintf(stderr, "ERROR: failed to join cgroup\n"); - return -EINVAL; - } cg_created = 1; } diff --git a/tools/testing/selftests/bpf/test_sysctl.c b/tools/testing/selftests/bpf/test_sysctl.c index d196e2a4a6e0..a20a919244c0 100644 --- a/tools/testing/selftests/bpf/test_sysctl.c +++ b/tools/testing/selftests/bpf/test_sysctl.c @@ -1619,16 +1619,10 @@ int main(int argc, char **argv) int cgfd = -1; int err = 0; - if (setup_cgroup_environment()) - goto err; - - cgfd = create_and_get_cgroup(CG_PATH); + cgfd = cgroup_setup_and_join(CG_PATH); if (cgfd < 0) goto err; - if (join_cgroup(CG_PATH)) - goto err; - if (run_tests(cgfd)) goto err; diff --git a/tools/testing/selftests/bpf/test_tcpbpf_user.c b/tools/testing/selftests/bpf/test_tcpbpf_user.c index 3ae127620463..74a9e49988b6 100644 --- a/tools/testing/selftests/bpf/test_tcpbpf_user.c +++ b/tools/testing/selftests/bpf/test_tcpbpf_user.c @@ -102,16 +102,10 @@ int main(int argc, char **argv) __u32 key = 0; int rv; - if (setup_cgroup_environment()) - goto err; - - cg_fd = create_and_get_cgroup(cg_path); + cg_fd = cgroup_setup_and_join(cg_path); if (cg_fd < 0) goto err; - if (join_cgroup(cg_path)) - goto err; - if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) { printf("FAILED: load_bpf_file failed for: %s\n", file); goto err; } diff --git a/tools/testing/selftests/bpf/test_tcpnotify_user.c b/tools/testing/selftests/bpf/test_tcpnotify_user.c index f9765ddf0761..8549b31716ab 100644 --- a/tools/testing/selftests/bpf/test_tcpnotify_user.c +++ b/tools/testing/selftests/bpf/test_tcpnotify_user.c @@ -86,16 +86,10 @@ int main(int argc, char **argv) CPU_SET(0, &cpuset); pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &cpuset); - if (setup_cgroup_environment()) - goto err; - - cg_fd = create_and_get_cgroup(cg_path); + cg_fd = cgroup_setup_and_join(cg_path); if (cg_fd < 0) goto err; - if (join_cgroup(cg_path)) - goto err; - if (bpf_prog_load(file, BPF_PROG_TYPE_SOCK_OPS, &obj, &prog_fd)) { printf("FAILED: load_bpf_file failed for: %s\n", file); goto err; From 73b11c2ab072d5b0599d1e12cc126f55ee306daf Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 31 Jul 2020 11:28:26 -0700 Subject: [PATCH 61/73] bpf: Add support for forced LINK_DETACH command Add LINK_DETACH command to force-detach a bpf_link without destroying it. It has the same behavior as auto-detaching of a bpf_link due to the cgroup dying for a bpf_cgroup_link or the net_device being destroyed for a bpf_xdp_link. In such a case, the bpf_link is still a valid kernel object, but is defunct and doesn't hold a BPF program attached to the corresponding BPF hook. This functionality allows users with enough access rights to manually force-detach an attached bpf_link without killing the respective owner process. This patch implements LINK_DETACH for cgroup, xdp, and netns links, mostly re-using existing link release handling code.
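For illustration, userspace can issue the new command directly through the bpf() syscall; a minimal sketch (error handling elided, link_fd stands for an FD of an already-created bpf_link):

	union bpf_attr attr = {};

	attr.link_detach.link_fd = link_fd;
	err = syscall(__NR_bpf, BPF_LINK_DETACH, &attr, sizeof(attr));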
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200731182830.286260-2-andriin@fb.com --- include/linux/bpf.h | 1 + include/uapi/linux/bpf.h | 5 +++++ kernel/bpf/cgroup.c | 15 ++++++++++++++- kernel/bpf/net_namespace.c | 8 ++++++++ kernel/bpf/syscall.c | 26 ++++++++++++++++++++++++++ net/core/dev.c | 11 ++++++++++- 6 files changed, 64 insertions(+), 2 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 40c5e206ecf2..cef4ef0d2b4e 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -793,6 +793,7 @@ struct bpf_link { struct bpf_link_ops { void (*release)(struct bpf_link *link); void (*dealloc)(struct bpf_link *link); + int (*detach)(struct bpf_link *link); int (*update_prog)(struct bpf_link *link, struct bpf_prog *new_prog, struct bpf_prog *old_prog); void (*show_fdinfo)(const struct bpf_link *link, struct seq_file *seq); diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index eb5e0c38eb2c..b134e679e9db 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -117,6 +117,7 @@ enum bpf_cmd { BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, BPF_ITER_CREATE, + BPF_LINK_DETACH, }; enum bpf_map_type { @@ -634,6 +635,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { + __u32 link_fd; + } link_detach; + struct { /* struct used by BPF_ENABLE_STATS command */ __u32 type; } enable_stats; diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 957cce1d5168..83ff127ef7ae 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -814,6 +814,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link) { struct bpf_cgroup_link *cg_link = container_of(link, struct bpf_cgroup_link, link); + struct cgroup *cg; /* link might have been auto-detached by dying cgroup already, * in that case our work is done here @@ -832,8 +833,12 @@ static void bpf_cgroup_link_release(struct bpf_link *link) WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link, cg_link->type)); + cg = cg_link->cgroup; + cg_link->cgroup = NULL; + mutex_unlock(&cgroup_mutex); - cgroup_put(cg_link->cgroup); + + cgroup_put(cg); } static void bpf_cgroup_link_dealloc(struct bpf_link *link) @@ -844,6 +849,13 @@ static void bpf_cgroup_link_dealloc(struct bpf_link *link) kfree(cg_link); } +static int bpf_cgroup_link_detach(struct bpf_link *link) +{ + bpf_cgroup_link_release(link); + + return 0; +} + static void bpf_cgroup_link_show_fdinfo(const struct bpf_link *link, struct seq_file *seq) { @@ -883,6 +895,7 @@ static int bpf_cgroup_link_fill_link_info(const struct bpf_link *link, static const struct bpf_link_ops bpf_cgroup_link_lops = { .release = bpf_cgroup_link_release, .dealloc = bpf_cgroup_link_dealloc, + .detach = bpf_cgroup_link_detach, .update_prog = cgroup_bpf_replace, .show_fdinfo = bpf_cgroup_link_show_fdinfo, .fill_link_info = bpf_cgroup_link_fill_link_info, diff --git a/kernel/bpf/net_namespace.c b/kernel/bpf/net_namespace.c index 71405edd667c..542f275bf252 100644 --- a/kernel/bpf/net_namespace.c +++ b/kernel/bpf/net_namespace.c @@ -142,9 +142,16 @@ static void bpf_netns_link_release(struct bpf_link *link) bpf_prog_array_free(old_array); out_unlock: + net_link->net = NULL; mutex_unlock(&netns_bpf_mutex); } +static int bpf_netns_link_detach(struct bpf_link *link) +{ + bpf_netns_link_release(link); + return 0; +} + static void bpf_netns_link_dealloc(struct bpf_link *link) { struct bpf_netns_link *net_link = @@ -228,6 +235,7 @@ static void 
bpf_netns_link_show_fdinfo(const struct bpf_link *link, static const struct bpf_link_ops bpf_netns_link_ops = { .release = bpf_netns_link_release, .dealloc = bpf_netns_link_dealloc, + .detach = bpf_netns_link_detach, .update_prog = bpf_netns_link_update_prog, .fill_link_info = bpf_netns_link_fill_info, .show_fdinfo = bpf_netns_link_show_fdinfo, diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index cd3d599e9e90..2f343ce15747 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -3991,6 +3991,29 @@ out_put_link: return ret; } +#define BPF_LINK_DETACH_LAST_FIELD link_detach.link_fd + +static int link_detach(union bpf_attr *attr) +{ + struct bpf_link *link; + int ret; + + if (CHECK_ATTR(BPF_LINK_DETACH)) + return -EINVAL; + + link = bpf_link_get_from_fd(attr->link_detach.link_fd); + if (IS_ERR(link)) + return PTR_ERR(link); + + if (link->ops->detach) + ret = link->ops->detach(link); + else + ret = -EOPNOTSUPP; + + bpf_link_put(link); + return ret; +} + static int bpf_link_inc_not_zero(struct bpf_link *link) { return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? 0 : -ENOENT; @@ -4240,6 +4263,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_ITER_CREATE: err = bpf_iter_create(&attr); break; + case BPF_LINK_DETACH: + err = link_detach(&attr); + break; default: err = -EINVAL; break; diff --git a/net/core/dev.c b/net/core/dev.c index a2a57988880a..c8b911b10187 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -8979,12 +8979,20 @@ static void bpf_xdp_link_release(struct bpf_link *link) /* if racing with net_device's tear down, xdp_link->dev might be * already NULL, in which case link was already auto-detached */ - if (xdp_link->dev) + if (xdp_link->dev) { WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link)); + xdp_link->dev = NULL; + } rtnl_unlock(); } +static int bpf_xdp_link_detach(struct bpf_link *link) +{ + bpf_xdp_link_release(link); + return 0; +} + static void bpf_xdp_link_dealloc(struct bpf_link *link) { struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link); @@ -9066,6 +9074,7 @@ out_unlock: static const struct bpf_link_ops bpf_xdp_link_lops = { .release = bpf_xdp_link_release, .dealloc = bpf_xdp_link_dealloc, + .detach = bpf_xdp_link_detach, .show_fdinfo = bpf_xdp_link_show_fdinfo, .fill_link_info = bpf_xdp_link_fill_link_info, .update_prog = bpf_xdp_link_update, From 2e49527e52486dac910460b1b3f6fce6e21c6b48 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 31 Jul 2020 11:28:27 -0700 Subject: [PATCH 62/73] libbpf: Add bpf_link detach APIs Add low-level bpf_link_detach() API. Also add higher-level bpf_link__detach() one. 
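A minimal usage sketch (hypothetical; note that the link object remains valid after a detach and still needs bpf_link__destroy() to be freed):

	struct bpf_link *link = bpf_program__attach_cgroup(prog, cg_fd);
	...
	err = bpf_link__detach(link); /* BPF hook released, link FD stays open */
	...
	bpf_link__destroy(link); /* frees the now-defunct link */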
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200731182830.286260-3-andriin@fb.com --- tools/include/uapi/linux/bpf.h | 5 +++++ tools/lib/bpf/bpf.c | 10 ++++++++++ tools/lib/bpf/bpf.h | 2 ++ tools/lib/bpf/libbpf.c | 5 +++++ tools/lib/bpf/libbpf.h | 1 + tools/lib/bpf/libbpf.map | 2 ++ 6 files changed, 25 insertions(+) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index eb5e0c38eb2c..b134e679e9db 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -117,6 +117,7 @@ enum bpf_cmd { BPF_LINK_GET_NEXT_ID, BPF_ENABLE_STATS, BPF_ITER_CREATE, + BPF_LINK_DETACH, }; enum bpf_map_type { @@ -634,6 +635,10 @@ union bpf_attr { __u32 old_prog_fd; } link_update; + struct { + __u32 link_fd; + } link_detach; + struct { /* struct used by BPF_ENABLE_STATS command */ __u32 type; } enable_stats; diff --git a/tools/lib/bpf/bpf.c b/tools/lib/bpf/bpf.c index e1bdf214f75f..eab14c97c15d 100644 --- a/tools/lib/bpf/bpf.c +++ b/tools/lib/bpf/bpf.c @@ -603,6 +603,16 @@ int bpf_link_create(int prog_fd, int target_fd, return sys_bpf(BPF_LINK_CREATE, &attr, sizeof(attr)); } +int bpf_link_detach(int link_fd) +{ + union bpf_attr attr; + + memset(&attr, 0, sizeof(attr)); + attr.link_detach.link_fd = link_fd; + + return sys_bpf(BPF_LINK_DETACH, &attr, sizeof(attr)); +} + int bpf_link_update(int link_fd, int new_prog_fd, const struct bpf_link_update_opts *opts) { diff --git a/tools/lib/bpf/bpf.h b/tools/lib/bpf/bpf.h index 6d367e01d05e..28855fd5b5f4 100644 --- a/tools/lib/bpf/bpf.h +++ b/tools/lib/bpf/bpf.h @@ -178,6 +178,8 @@ LIBBPF_API int bpf_link_create(int prog_fd, int target_fd, enum bpf_attach_type attach_type, const struct bpf_link_create_opts *opts); +LIBBPF_API int bpf_link_detach(int link_fd); + struct bpf_link_update_opts { size_t sz; /* size of this struct for forward/backward compatibility */ __u32 flags; /* extra flags */ diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index b9f11f854985..7be04e45d29c 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -7748,6 +7748,11 @@ struct bpf_link *bpf_link__open(const char *path) return link; } +int bpf_link__detach(struct bpf_link *link) +{ + return bpf_link_detach(link->fd) ? 
-errno : 0; +} + int bpf_link__pin(struct bpf_link *link, const char *path) { int err; diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h index 9924385462ab..3ed1399bfbbc 100644 --- a/tools/lib/bpf/libbpf.h +++ b/tools/lib/bpf/libbpf.h @@ -229,6 +229,7 @@ LIBBPF_API int bpf_link__unpin(struct bpf_link *link); LIBBPF_API int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog); LIBBPF_API void bpf_link__disconnect(struct bpf_link *link); +LIBBPF_API int bpf_link__detach(struct bpf_link *link); LIBBPF_API int bpf_link__destroy(struct bpf_link *link); LIBBPF_API struct bpf_link * diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map index ca49a6a7e5b2..099863411f7d 100644 --- a/tools/lib/bpf/libbpf.map +++ b/tools/lib/bpf/libbpf.map @@ -273,6 +273,8 @@ LIBBPF_0.0.9 { LIBBPF_0.1.0 { global: + bpf_link__detach; + bpf_link_detach; bpf_map__ifindex; bpf_map__key_size; bpf_map__map_flags; From 90806ccc90bbd0150267a97ae4003269597a6a6c Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 31 Jul 2020 11:28:28 -0700 Subject: [PATCH 63/73] selftests/bpf: Add link detach tests for cgroup, netns, and xdp bpf_links Add bpf_link__detach() testing to selftests for cgroup, netns, and xdp bpf_links. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200731182830.286260-4-andriin@fb.com --- .../selftests/bpf/prog_tests/cgroup_link.c | 20 +++++++- .../selftests/bpf/prog_tests/sk_lookup.c | 51 +++++++++---------- .../selftests/bpf/prog_tests/xdp_link.c | 14 +++++ tools/testing/selftests/bpf/testing_helpers.c | 14 +++++ tools/testing/selftests/bpf/testing_helpers.h | 3 ++ 5 files changed, 73 insertions(+), 29 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c index 6e04f8d1d15b..4d9b514b3fd9 100644 --- a/tools/testing/selftests/bpf/prog_tests/cgroup_link.c +++ b/tools/testing/selftests/bpf/prog_tests/cgroup_link.c @@ -2,6 +2,7 @@ #include #include "cgroup_helpers.h" +#include "testing_helpers.h" #include "test_cgroup_link.skel.h" static __u32 duration = 0; @@ -37,7 +38,8 @@ void test_cgroup_link(void) int last_cg = ARRAY_SIZE(cgs) - 1, cg_nr = ARRAY_SIZE(cgs); DECLARE_LIBBPF_OPTS(bpf_link_update_opts, link_upd_opts); struct bpf_link *links[ARRAY_SIZE(cgs)] = {}, *tmp_link; - __u32 prog_ids[ARRAY_SIZE(cgs)], prog_cnt = 0, attach_flags; + __u32 prog_ids[ARRAY_SIZE(cgs)], prog_cnt = 0, attach_flags, prog_id; + struct bpf_link_info info; int i = 0, err, prog_fd; bool detach_legacy = false; @@ -219,6 +221,22 @@ void test_cgroup_link(void) /* BPF programs should still get called */ ping_and_check(0, cg_nr); + prog_id = link_info_prog_id(links[0], &info); + CHECK(prog_id == 0, "link_info", "failed\n"); + CHECK(info.cgroup.cgroup_id == 0, "cgroup_id", "unexpected %llu\n", info.cgroup.cgroup_id); + + err = bpf_link__detach(links[0]); + if (CHECK(err, "link_detach", "failed %d\n", err)) + goto cleanup; + + /* cgroup_id should be zero in link_info */ + prog_id = link_info_prog_id(links[0], &info); + CHECK(prog_id == 0, "link_info", "failed\n"); + CHECK(info.cgroup.cgroup_id != 0, "cgroup_id", "unexpected %llu\n", info.cgroup.cgroup_id); + + /* First BPF program shouldn't be called anymore */ + ping_and_check(0, cg_nr - 1); + /* leave cgroup and remove them, don't detach programs */ cleanup_cgroup_environment(); diff --git a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c 
b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c index 379da6f10ee9..c571584c00f5 100644 --- a/tools/testing/selftests/bpf/prog_tests/sk_lookup.c +++ b/tools/testing/selftests/bpf/prog_tests/sk_lookup.c @@ -34,6 +34,7 @@ #include "bpf_util.h" #include "cgroup_helpers.h" #include "network_helpers.h" +#include "testing_helpers.h" #include "test_sk_lookup.skel.h" /* External (address, port) pairs the client sends packets to. */ @@ -469,34 +470,10 @@ static int update_lookup_map(struct bpf_map *map, int index, int sock_fd) return 0; } -static __u32 link_info_prog_id(struct bpf_link *link) -{ - struct bpf_link_info info = {}; - __u32 info_len = sizeof(info); - int link_fd, err; - - link_fd = bpf_link__fd(link); - if (CHECK(link_fd < 0, "bpf_link__fd", "failed\n")) { - errno = -link_fd; - log_err("bpf_link__fd failed"); - return 0; - } - - err = bpf_obj_get_info_by_fd(link_fd, &info, &info_len); - if (CHECK(err, "bpf_obj_get_info_by_fd", "failed\n")) { - log_err("bpf_obj_get_info_by_fd"); - return 0; - } - if (CHECK(info_len != sizeof(info), "bpf_obj_get_info_by_fd", - "unexpected info len %u\n", info_len)) - return 0; - - return info.prog_id; -} - static void query_lookup_prog(struct test_sk_lookup *skel) { struct bpf_link *link[3] = {}; + struct bpf_link_info info; __u32 attach_flags = 0; __u32 prog_ids[3] = {}; __u32 prog_cnt = 3; @@ -534,18 +511,36 @@ static void query_lookup_prog(struct test_sk_lookup *skel) if (CHECK(prog_cnt != 3, "bpf_prog_query", "wrong program count on query: %u", prog_cnt)) goto detach; - prog_id = link_info_prog_id(link[0]); + prog_id = link_info_prog_id(link[0], &info); CHECK(prog_ids[0] != prog_id, "bpf_prog_query", "invalid program #0 id on query: %u != %u\n", prog_ids[0], prog_id); - prog_id = link_info_prog_id(link[1]); + CHECK(info.netns.netns_ino == 0, "netns_ino", + "unexpected netns_ino: %u\n", info.netns.netns_ino); + prog_id = link_info_prog_id(link[1], &info); CHECK(prog_ids[1] != prog_id, "bpf_prog_query", "invalid program #1 id on query: %u != %u\n", prog_ids[1], prog_id); - prog_id = link_info_prog_id(link[2]); + CHECK(info.netns.netns_ino == 0, "netns_ino", + "unexpected netns_ino: %u\n", info.netns.netns_ino); + prog_id = link_info_prog_id(link[2], &info); CHECK(prog_ids[2] != prog_id, "bpf_prog_query", "invalid program #2 id on query: %u != %u\n", prog_ids[2], prog_id); + CHECK(info.netns.netns_ino == 0, "netns_ino", + "unexpected netns_ino: %u\n", info.netns.netns_ino); + + err = bpf_link__detach(link[0]); + if (CHECK(err, "link_detach", "failed %d\n", err)) + goto detach; + + /* prog id is still there, but netns_ino is zeroed out */ + prog_id = link_info_prog_id(link[0], &info); + CHECK(prog_ids[0] != prog_id, "bpf_prog_query", + "invalid program #0 id on query: %u != %u\n", + prog_ids[0], prog_id); + CHECK(info.netns.netns_ino != 0, "netns_ino", + "unexpected netns_ino: %u\n", info.netns.netns_ino); detach: if (link[2]) diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_link.c b/tools/testing/selftests/bpf/prog_tests/xdp_link.c index 52cba6795d40..6f814999b395 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_link.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_link.c @@ -131,6 +131,20 @@ void test_xdp_link(void) CHECK(link_info.xdp.ifindex != IFINDEX_LO, "link_ifindex", "got %u != exp %u\n", link_info.xdp.ifindex, IFINDEX_LO); + err = bpf_link__detach(link); + if (CHECK(err, "link_detach", "failed %d\n", err)) + goto cleanup; + + memset(&link_info, 0, sizeof(link_info)); + err = bpf_obj_get_info_by_fd(bpf_link__fd(link), 
&link_info, &link_info_len); + if (CHECK(err, "link_info", "failed: %d\n", err)) + goto cleanup; + CHECK(link_info.prog_id != id1, "link_prog_id", + "got %u != exp %u\n", link_info.prog_id, id1); + /* ifindex should be zeroed out */ + CHECK(link_info.xdp.ifindex != 0, "link_ifindex", + "got %u != exp %u\n", link_info.xdp.ifindex, 0); + cleanup: test_xdp_link__destroy(skel1); test_xdp_link__destroy(skel2); diff --git a/tools/testing/selftests/bpf/testing_helpers.c b/tools/testing/selftests/bpf/testing_helpers.c index 0af6337a8962..800d503e5cb4 100644 --- a/tools/testing/selftests/bpf/testing_helpers.c +++ b/tools/testing/selftests/bpf/testing_helpers.c @@ -64,3 +64,17 @@ int parse_num_list(const char *s, bool **num_set, int *num_set_len) return 0; } + +__u32 link_info_prog_id(const struct bpf_link *link, struct bpf_link_info *info) +{ + __u32 info_len = sizeof(*info); + int err; + + memset(info, 0, sizeof(*info)); + err = bpf_obj_get_info_by_fd(bpf_link__fd(link), info, &info_len); + if (err) { + printf("failed to get link info: %d\n", -errno); + return 0; + } + return info->prog_id; +} diff --git a/tools/testing/selftests/bpf/testing_helpers.h b/tools/testing/selftests/bpf/testing_helpers.h index 923b51762759..d4f8e749611b 100644 --- a/tools/testing/selftests/bpf/testing_helpers.h +++ b/tools/testing/selftests/bpf/testing_helpers.h @@ -1,5 +1,8 @@ /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ /* Copyright (C) 2020 Facebook, Inc. */ #include <stdbool.h> +#include <bpf/bpf.h> +#include <bpf/libbpf.h> int parse_num_list(const char *s, bool **set, int *set_len); +__u32 link_info_prog_id(const struct bpf_link *link, struct bpf_link_info *info); From 0e8c7c07f090668566db2030a027a360ffd00938 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 31 Jul 2020 11:28:29 -0700 Subject: [PATCH 64/73] tools/bpftool: Add `link detach` subcommand Add the ability to force-detach a BPF link. Also add a missing error message for when the specified link ID is wrong.
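Hypothetical usage (the link ID and pin path below are made up for illustration; both specifiers accepted by link_parse_fd() work):

	# bpftool link detach id 42
	# bpftool link detach pinned /sys/fs/bpf/my_link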
Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200731182830.286260-5-andriin@fb.com --- tools/bpf/bpftool/link.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/link.c b/tools/bpf/bpftool/link.c index 326b8fdf0243..1b793759170e 100644 --- a/tools/bpf/bpftool/link.c +++ b/tools/bpf/bpftool/link.c @@ -22,6 +22,8 @@ static const char * const link_type_name[] = { static int link_parse_fd(int *argc, char ***argv) { + int fd; + if (is_prefix(**argv, "id")) { unsigned int id; char *endptr; @@ -35,7 +37,10 @@ static int link_parse_fd(int *argc, char ***argv) } NEXT_ARGP(); - return bpf_link_get_fd_by_id(id); + fd = bpf_link_get_fd_by_id(id); + if (fd < 0) + p_err("failed to get link with ID %d: %s", id, strerror(errno)); + return fd; } else if (is_prefix(**argv, "pinned")) { char *path; @@ -316,6 +321,34 @@ static int do_pin(int argc, char **argv) return err; } +static int do_detach(int argc, char **argv) +{ + int err, fd; + + if (argc != 2) { + p_err("link specifier is invalid or missing\n"); + return 1; + } + + fd = link_parse_fd(&argc, &argv); + if (fd < 0) + return 1; + + err = bpf_link_detach(fd); + if (err) + err = -errno; + close(fd); + if (err) { + p_err("failed link detach: %s", strerror(-err)); + return 1; + } + + if (json_output) + jsonw_null(json_wtr); + + return 0; +} + static int do_help(int argc, char **argv) { if (json_output) { @@ -326,6 +359,7 @@ static int do_help(int argc, char **argv) fprintf(stderr, "Usage: %1$s %2$s { show | list } [LINK]\n" " %1$s %2$s pin LINK FILE\n" + " %1$s %2$s detach LINK\n" " %1$s %2$s help\n" "\n" " " HELP_SPEC_LINK "\n" @@ -341,6 +375,7 @@ static const struct cmd cmds[] = { { "list", do_show }, { "help", do_help }, { "pin", do_pin }, + { "detach", do_detach }, { 0 } }; From e85f99aa7760e74bb5a7e8515948f99c264a275f Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Fri, 31 Jul 2020 11:28:30 -0700 Subject: [PATCH 65/73] tools/bpftool: Add documentation and bash-completion for `link detach` Add info on the link detach sub-command to the man page. Add detach to bash-completion as well. Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Acked-by: John Fastabend Date: Fri, 31 Jul 2020 13:49:57 -0700 Subject: [PATCH 66/73] selftests/bpf: Fix spurious test failures in core_retro selftest The core_retro selftest uses a BPF program that's triggered on sys_enter system-wide, but has no protection against some unrelated process doing a syscall while the selftest is running. This leads to occasional test failures with unexpected PIDs being returned. Fix that by filtering out all processes that are not the test_progs process.
Fixes: fcda189a5133 ("selftests/bpf: Add test relying only on CO-RE and no recent kernel features") Signed-off-by: Andrii Nakryiko Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20200731204957.2047119-1-andriin@fb.com --- tools/testing/selftests/bpf/prog_tests/core_retro.c | 8 ++++++-- tools/testing/selftests/bpf/progs/test_core_retro.c | 13 +++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/core_retro.c b/tools/testing/selftests/bpf/prog_tests/core_retro.c index 78e30d3a23d5..6acb0e94d4d7 100644 --- a/tools/testing/selftests/bpf/prog_tests/core_retro.c +++ b/tools/testing/selftests/bpf/prog_tests/core_retro.c @@ -6,7 +6,7 @@ void test_core_retro(void) { - int err, zero = 0, res, duration = 0; + int err, zero = 0, res, duration = 0, my_pid = getpid(); struct test_core_retro *skel; /* load program */ @@ -14,6 +14,10 @@ void test_core_retro(void) if (CHECK(!skel, "skel_load", "skeleton open/load failed\n")) goto out_close; + err = bpf_map_update_elem(bpf_map__fd(skel->maps.exp_tgid_map), &zero, &my_pid, 0); + if (CHECK(err, "map_update", "failed to set expected PID: %d\n", errno)) + goto out_close; + /* attach probe */ err = test_core_retro__attach(skel); if (CHECK(err, "attach_kprobe", "err %d\n", err)) @@ -26,7 +30,7 @@ void test_core_retro(void) if (CHECK(err, "map_lookup", "failed to lookup result: %d\n", errno)) goto out_close; - CHECK(res != getpid(), "pid_check", "got %d != exp %d\n", res, getpid()); + CHECK(res != my_pid, "pid_check", "got %d != exp %d\n", res, my_pid); out_close: test_core_retro__destroy(skel); diff --git a/tools/testing/selftests/bpf/progs/test_core_retro.c b/tools/testing/selftests/bpf/progs/test_core_retro.c index 75c60c3c29cf..20861ec2f674 100644 --- a/tools/testing/selftests/bpf/progs/test_core_retro.c +++ b/tools/testing/selftests/bpf/progs/test_core_retro.c @@ -8,6 +8,13 @@ struct task_struct { int tgid; } __attribute__((preserve_access_index)); +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, int); + __type(value, int); +} exp_tgid_map SEC(".maps"); + struct { __uint(type, BPF_MAP_TYPE_ARRAY); __uint(max_entries, 1); @@ -21,6 +28,12 @@ int handle_sys_enter(void *ctx) struct task_struct *task = (void *)bpf_get_current_task(); int tgid = BPF_CORE_READ(task, tgid); int zero = 0; + int real_tgid = bpf_get_current_pid_tgid() >> 32; + int *exp_tgid = bpf_map_lookup_elem(&exp_tgid_map, &zero); + + /* only pass through sys_enters from test process */ + if (!exp_tgid || *exp_tgid != real_tgid) + return 0; bpf_map_update_elem(&results, &zero, &tgid, 0); From a278f3d8191228212c553a5d4303fa603214b717 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 30 Jul 2020 19:42:44 -0700 Subject: [PATCH 67/73] tools, build: Propagate build failures from tools/build/Makefile.build The '&&' operator has a bad effect when $(cmd_$(1)) exits with a non-zero code: the command failure is masked (despite `set -e`) and all but the first command of $(dep-cmd) is executed (successfully, as they are mostly printfs), thus overall returning 0 in the end. This means in practice that despite compilation errors, the tools build Makefile will return success. We see this very reliably with libbpf's Makefile, which doesn't get compilation errors propagated properly. This in turn causes issues with the selftests build, as well as bpftool and other projects that rely on building libbpf. The fix is simple: don't use &&.
Given `set -e`, we don't need to chain commands with &&. The shell will
exit on the first failure, giving the desired behavior and propagating
the error properly.

Fixes: 275e2d95591e ("tools build: Move dependency copy into function")
Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Acked-by: Jiri Olsa
Link: https://lore.kernel.org/bpf/20200731024244.872574-1-andriin@fb.com
---
 tools/build/Build.include | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tools/build/Build.include b/tools/build/Build.include
index 9ec01f4454f9..585486e40995 100644
--- a/tools/build/Build.include
+++ b/tools/build/Build.include
@@ -74,7 +74,8 @@ dep-cmd = $(if $(wildcard $(fixdep)),
 # dependencies in the cmd file
 if_changed_dep = $(if $(strip $(any-prereq) $(arg-check)),	\
 	@set -e;						\
-	$(echo-cmd) $(cmd_$(1)) && $(dep-cmd))
+	$(echo-cmd) $(cmd_$(1));				\
+	$(dep-cmd))
 
 # if_changed - execute command if any prerequisite is newer than
 # target, or command line has changed

From 041549b7b2c7811ec40e705c439211f00ade2dda Mon Sep 17 00:00:00 2001
From: Tianjia Zhang
Date: Sun, 2 Aug 2020 19:15:40 +0800
Subject: [PATCH 68/73] tools, bpftool: Fix wrong return value in do_dump()

In case the btf_id does not exist, a negative error code (-ENOENT)
should be returned.

Fixes: c93cc69004df3 ("bpftool: add ability to dump BTF types")
Signed-off-by: Tianjia Zhang
Signed-off-by: Daniel Borkmann
Reviewed-by: Tobias Klauser
Acked-by: Andrii Nakryiko
Acked-by: John Fastabend
Link: https://lore.kernel.org/bpf/20200802111540.5384-1-tianjia.zhang@linux.alibaba.com
---
 tools/bpf/bpftool/btf.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index fc9bc7a23db6..37c091308714 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -596,7 +596,7 @@ static int do_dump(int argc, char **argv)
 			goto done;
 		}
 		if (!btf) {
-			err = ENOENT;
+			err = -ENOENT;
 			p_err("can't find btf with ID (%u)", btf_id);
 			goto done;
 		}

From 94a1fedd63edb672933bef44ca9213937e377c05 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Sat, 1 Aug 2020 18:32:17 -0700
Subject: [PATCH 69/73] libbpf: Add btf__parse_raw() and generic btf__parse() APIs

Add public APIs to parse BTF from a raw data file (e.g.,
/sys/kernel/btf/vmlinux), as well as a generic btf__parse(), which will
try to determine the correct format, currently either raw or ELF.
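A minimal usage sketch of the new API, assuming a kernel that exposes
/sys/kernel/btf/vmlinux; the error handling follows the ERR_PTR
convention visible in the diff below, unwrapped via libbpf_get_error():

    #include <stdio.h>
    #include <bpf/btf.h>
    #include <bpf/libbpf.h>

    int main(void)
    {
    	struct btf *btf;
    	long err;

    	/* btf__parse() auto-detects raw vs. ELF format */
    	btf = btf__parse("/sys/kernel/btf/vmlinux", NULL);
    	err = libbpf_get_error(btf);
    	if (err) {
    		fprintf(stderr, "failed to parse BTF: %ld\n", err);
    		return 1;
    	}

    	printf("loaded %u BTF types\n", btf__get_nr_types(btf));
    	btf__free(btf);
    	return 0;
    }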
Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20200802013219.864880-2-andriin@fb.com
---
 tools/lib/bpf/btf.c      | 114 ++++++++++++++++++++++++++-------------
 tools/lib/bpf/btf.h      |   5 +-
 tools/lib/bpf/libbpf.map |   2 +
 3 files changed, 83 insertions(+), 38 deletions(-)

diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c
index ded5b29965f9..856b09a04563 100644
--- a/tools/lib/bpf/btf.c
+++ b/tools/lib/bpf/btf.c
@@ -562,6 +562,83 @@ done:
 	return btf;
 }
 
+struct btf *btf__parse_raw(const char *path)
+{
+	void *data = NULL;
+	struct btf *btf;
+	FILE *f = NULL;
+	__u16 magic;
+	int err = 0;
+	long sz;
+
+	f = fopen(path, "rb");
+	if (!f) {
+		err = -errno;
+		goto err_out;
+	}
+
+	/* check BTF magic */
+	if (fread(&magic, 1, sizeof(magic), f) < sizeof(magic)) {
+		err = -EIO;
+		goto err_out;
+	}
+	if (magic != BTF_MAGIC) {
+		/* definitely not a raw BTF */
+		err = -EPROTO;
+		goto err_out;
+	}
+
+	/* get file size */
+	if (fseek(f, 0, SEEK_END)) {
+		err = -errno;
+		goto err_out;
+	}
+	sz = ftell(f);
+	if (sz < 0) {
+		err = -errno;
+		goto err_out;
+	}
+	/* rewind to the start */
+	if (fseek(f, 0, SEEK_SET)) {
+		err = -errno;
+		goto err_out;
+	}
+
+	/* pre-alloc memory and read all of BTF data */
+	data = malloc(sz);
+	if (!data) {
+		err = -ENOMEM;
+		goto err_out;
+	}
+	if (fread(data, 1, sz, f) < sz) {
+		err = -EIO;
+		goto err_out;
+	}
+
+	/* finally parse BTF data */
+	btf = btf__new(data, sz);
+
+err_out:
+	free(data);
+	if (f)
+		fclose(f);
+	return err ? ERR_PTR(err) : btf;
+}
+
+struct btf *btf__parse(const char *path, struct btf_ext **btf_ext)
+{
+	struct btf *btf;
+
+	if (btf_ext)
+		*btf_ext = NULL;
+
+	btf = btf__parse_raw(path);
+	if (!IS_ERR(btf) || PTR_ERR(btf) != -EPROTO)
+		return btf;
+
+	return btf__parse_elf(path, btf_ext);
+}
+
 static int compare_vsi_off(const void *_a, const void *_b)
 {
 	const struct btf_var_secinfo *a = _a;
@@ -2951,41 +3028,6 @@ static int btf_dedup_remap_types(struct btf_dedup *d)
 	return 0;
 }
 
-static struct btf *btf_load_raw(const char *path)
-{
-	struct btf *btf;
-	size_t read_cnt;
-	struct stat st;
-	void *data;
-	FILE *f;
-
-	if (stat(path, &st))
-		return ERR_PTR(-errno);
-
-	data = malloc(st.st_size);
-	if (!data)
-		return ERR_PTR(-ENOMEM);
-
-	f = fopen(path, "rb");
-	if (!f) {
-		btf = ERR_PTR(-errno);
-		goto cleanup;
-	}
-
-	read_cnt = fread(data, 1, st.st_size, f);
-	fclose(f);
-	if (read_cnt < st.st_size) {
-		btf = ERR_PTR(-EBADF);
-		goto cleanup;
-	}
-
-	btf = btf__new(data, read_cnt);
-
-cleanup:
-	free(data);
-	return btf;
-}
-
 /*
  * Probe few well-known locations for vmlinux kernel image and try to load BTF
  * data out of it to use for target BTF.
@@ -3021,7 +3063,7 @@ struct btf *libbpf_find_kernel_btf(void)
 			continue;
 
 		if (locations[i].raw_btf)
-			btf = btf_load_raw(path);
+			btf = btf__parse_raw(path);
 		else
 			btf = btf__parse_elf(path, NULL);
 
diff --git a/tools/lib/bpf/btf.h b/tools/lib/bpf/btf.h
index 491c7b41ffdc..f4a1a1d2b9a3 100644
--- a/tools/lib/bpf/btf.h
+++ b/tools/lib/bpf/btf.h
@@ -64,8 +64,9 @@ struct btf_ext_header {
 
 LIBBPF_API void btf__free(struct btf *btf);
 LIBBPF_API struct btf *btf__new(const void *data, __u32 size);
-LIBBPF_API struct btf *btf__parse_elf(const char *path,
-				      struct btf_ext **btf_ext);
+LIBBPF_API struct btf *btf__parse(const char *path, struct btf_ext **btf_ext);
+LIBBPF_API struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext);
+LIBBPF_API struct btf *btf__parse_raw(const char *path);
 LIBBPF_API int btf__finalize_data(struct bpf_object *obj, struct btf *btf);
 LIBBPF_API int btf__load(struct btf *btf);
 LIBBPF_API __s32 btf__find_by_name(const struct btf *btf,
diff --git a/tools/lib/bpf/libbpf.map b/tools/lib/bpf/libbpf.map
index 099863411f7d..0c4722bfdd0a 100644
--- a/tools/lib/bpf/libbpf.map
+++ b/tools/lib/bpf/libbpf.map
@@ -293,5 +293,7 @@ LIBBPF_0.1.0 {
 		bpf_program__is_sk_lookup;
 		bpf_program__set_autoload;
 		bpf_program__set_sk_lookup;
+		btf__parse;
+		btf__parse_raw;
 		btf__set_fd;
 } LIBBPF_0.0.9;

From 8526df04570f5698c97ac661ad1f2f35293557a7 Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Sat, 1 Aug 2020 18:32:18 -0700
Subject: [PATCH 70/73] tools/bpftool: Use libbpf's btf__parse() API for parsing BTF from file

Use the generic libbpf API to parse BTF data from a file, instead of
re-implementing it in bpftool.

Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20200802013219.864880-3-andriin@fb.com
---
 tools/bpf/bpftool/btf.c | 54 +----------------------------------------
 1 file changed, 1 insertion(+), 53 deletions(-)

diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c
index 37c091308714..8ab142ff5eac 100644
--- a/tools/bpf/bpftool/btf.c
+++ b/tools/bpf/bpftool/btf.c
@@ -422,54 +422,6 @@ done:
 	return err;
 }
 
-static struct btf *btf__parse_raw(const char *file)
-{
-	struct btf *btf;
-	struct stat st;
-	__u8 *buf;
-	FILE *f;
-
-	if (stat(file, &st))
-		return NULL;
-
-	f = fopen(file, "rb");
-	if (!f)
-		return NULL;
-
-	buf = malloc(st.st_size);
-	if (!buf) {
-		btf = ERR_PTR(-ENOMEM);
-		goto exit_close;
-	}
-
-	if ((size_t) st.st_size != fread(buf, 1, st.st_size, f)) {
-		btf = ERR_PTR(-EINVAL);
-		goto exit_free;
-	}
-
-	btf = btf__new(buf, st.st_size);
-
-exit_free:
-	free(buf);
-exit_close:
-	fclose(f);
-	return btf;
-}
-
-static bool is_btf_raw(const char *file)
-{
-	__u16 magic = 0;
-	int fd, nb_read;
-
-	fd = open(file, O_RDONLY);
-	if (fd < 0)
-		return false;
-
-	nb_read = read(fd, &magic, sizeof(magic));
-	close(fd);
-	return nb_read == sizeof(magic) && magic == BTF_MAGIC;
-}
-
 static int do_dump(int argc, char **argv)
 {
 	struct btf *btf = NULL;
@@ -547,11 +499,7 @@ static int do_dump(int argc, char **argv)
 		}
 		NEXT_ARG();
 	} else if (is_prefix(src, "file")) {
-		if (is_btf_raw(*argv))
-			btf = btf__parse_raw(*argv);
-		else
-			btf = btf__parse_elf(*argv, NULL);
-
+		btf = btf__parse(*argv, NULL);
 		if (IS_ERR(btf)) {
 			err = -PTR_ERR(btf);
 			btf = NULL;

From f86ca3cffef153555a3f4755b3a44881d962754f Mon Sep 17 00:00:00 2001
From: Andrii Nakryiko
Date: Sat, 1 Aug 2020 18:32:19 -0700
Subject: [PATCH 71/73] tools/resolve_btfids: Use libbpf's btf__parse() API

Instead of re-implementing the generic BTF parsing logic, use libbpf's
API.
Also add a .gitignore for resolve_btfids' build artifacts.

Signed-off-by: Andrii Nakryiko
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20200802013219.864880-4-andriin@fb.com
---
 tools/bpf/resolve_btfids/.gitignore |  4 ++
 tools/bpf/resolve_btfids/main.c     | 58 +----------------------------
 2 files changed, 5 insertions(+), 57 deletions(-)
 create mode 100644 tools/bpf/resolve_btfids/.gitignore

diff --git a/tools/bpf/resolve_btfids/.gitignore b/tools/bpf/resolve_btfids/.gitignore
new file mode 100644
index 000000000000..a026df7dc280
--- /dev/null
+++ b/tools/bpf/resolve_btfids/.gitignore
@@ -0,0 +1,4 @@
+/FEATURE-DUMP.libbpf
+/bpf_helper_defs.h
+/fixdep
+/resolve_btfids
diff --git a/tools/bpf/resolve_btfids/main.c b/tools/bpf/resolve_btfids/main.c
index 6956b6350cad..52d883325a23 100644
--- a/tools/bpf/resolve_btfids/main.c
+++ b/tools/bpf/resolve_btfids/main.c
@@ -403,62 +403,6 @@ static int symbols_collect(struct object *obj)
 	return 0;
 }
 
-static struct btf *btf__parse_raw(const char *file)
-{
-	struct btf *btf;
-	struct stat st;
-	__u8 *buf;
-	FILE *f;
-
-	if (stat(file, &st))
-		return NULL;
-
-	f = fopen(file, "rb");
-	if (!f)
-		return NULL;
-
-	buf = malloc(st.st_size);
-	if (!buf) {
-		btf = ERR_PTR(-ENOMEM);
-		goto exit_close;
-	}
-
-	if ((size_t) st.st_size != fread(buf, 1, st.st_size, f)) {
-		btf = ERR_PTR(-EINVAL);
-		goto exit_free;
-	}
-
-	btf = btf__new(buf, st.st_size);
-
-exit_free:
-	free(buf);
-exit_close:
-	fclose(f);
-	return btf;
-}
-
-static bool is_btf_raw(const char *file)
-{
-	__u16 magic = 0;
-	int fd, nb_read;
-
-	fd = open(file, O_RDONLY);
-	if (fd < 0)
-		return false;
-
-	nb_read = read(fd, &magic, sizeof(magic));
-	close(fd);
-	return nb_read == sizeof(magic) && magic == BTF_MAGIC;
-}
-
-static struct btf *btf_open(const char *path)
-{
-	if (is_btf_raw(path))
-		return btf__parse_raw(path);
-	else
-		return btf__parse_elf(path, NULL);
-}
-
 static int symbols_resolve(struct object *obj)
 {
 	int nr_typedefs = obj->nr_typedefs;
@@ -469,7 +413,7 @@ static int symbols_resolve(struct object *obj)
 	struct btf *btf;
 	__u32 nr;
 
-	btf = btf_open(obj->btf ?: obj->path);
+	btf = btf__parse(obj->btf ?: obj->path, NULL);
 	err = libbpf_get_error(btf);
 	if (err) {
 		pr_err("FAILED: load BTF from %s: %s",

From fa5cb548ced61b9d3095f32f8a7e427a248c65ee Mon Sep 17 00:00:00 2001
From: Dmitry Yakunin
Date: Mon, 3 Aug 2020 12:05:44 +0300
Subject: [PATCH 72/73] bpf: Setup socket family and addresses in bpf_prog_test_run_skb

Currently it is impossible to test all branches of a cgroup_skb BPF
program which accesses the skb->family and skb->{local,remote}_ip{4,6}
fields, because they are zeroed during socket allocation. This commit
fills in the socket family and addresses from the related fields in the
constructed skb.
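To see why this matters, consider a cgroup_skb program that branches on
these fields; before this change such branches were unreachable under
BPF_PROG_TEST_RUN since the fields always read as zero. A minimal
sketch follows; the loopback-allow policy is purely illustrative, and
the AF_* constants are defined locally since the kernel socket headers
are not usable from BPF program context:

    #include <linux/bpf.h>
    #include <bpf/bpf_helpers.h>
    #include <bpf/bpf_endian.h>

    #define AF_INET		2	/* values from <linux/socket.h> */
    #define AF_INET6	10

    SEC("cgroup_skb/egress")
    int filter_loopback(struct __sk_buff *skb)
    {
    	/* local_ip4 is in network byte order; 0x7f000001 is 127.0.0.1 */
    	if (skb->family == AF_INET &&
    	    skb->local_ip4 == bpf_htonl(0x7f000001))
    		return 1;	/* allow IPv4 loopback traffic */

    	if (skb->family == AF_INET6)
    		return 1;	/* allow all IPv6 traffic */

    	return 0;	/* drop the rest */
    }

    char LICENSE[] SEC("license") = "GPL";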
Signed-off-by: Dmitry Yakunin
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20200803090545.82046-2-zeil@yandex-team.ru
---
 net/bpf/test_run.c | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index b03c469cd01f..736a5964fa95 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -449,6 +449,27 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 	skb->protocol = eth_type_trans(skb, current->nsproxy->net_ns->loopback_dev);
 	skb_reset_network_header(skb);
 
+	switch (skb->protocol) {
+	case htons(ETH_P_IP):
+		sk->sk_family = AF_INET;
+		if (sizeof(struct iphdr) <= skb_headlen(skb)) {
+			sk->sk_rcv_saddr = ip_hdr(skb)->saddr;
+			sk->sk_daddr = ip_hdr(skb)->daddr;
+		}
+		break;
+#if IS_ENABLED(CONFIG_IPV6)
+	case htons(ETH_P_IPV6):
+		sk->sk_family = AF_INET6;
+		if (sizeof(struct ipv6hdr) <= skb_headlen(skb)) {
+			sk->sk_v6_rcv_saddr = ipv6_hdr(skb)->saddr;
+			sk->sk_v6_daddr = ipv6_hdr(skb)->daddr;
+		}
+		break;
+#endif
+	default:
+		break;
+	}
+
 	if (is_l2)
 		__skb_push(skb, hh_len);
 	if (is_direct_pkt_access)

From 21594c44083c375697d418729c4b2e4522cf9f70 Mon Sep 17 00:00:00 2001
From: Dmitry Yakunin
Date: Mon, 3 Aug 2020 12:05:45 +0300
Subject: [PATCH 73/73] bpf: Allow to specify ifindex for skb in bpf_prog_test_run_skb

Currently skb->dev is unconditionally set to the loopback device in the
current net namespace. But if we want to test a BPF program which
contains a code branch based on an ifindex condition (e.g., one that
filters out localhost packets), it is useful to allow specifying the
ifindex from userspace. This patch adds such an option through the
ctx_in (__sk_buff) parameter.

Signed-off-by: Dmitry Yakunin
Signed-off-by: Daniel Borkmann
Link: https://lore.kernel.org/bpf/20200803090545.82046-3-zeil@yandex-team.ru
---
 net/bpf/test_run.c                             | 22 ++++++++++++++++++--
 .../selftests/bpf/prog_tests/skb_ctx.c         |  5 +++++
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index 736a5964fa95..99eb8c6c0fbc 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -327,6 +327,12 @@ static int convert___skb_to_skb(struct sk_buff *skb, struct __sk_buff *__skb)
 	/* priority is allowed */
 
 	if (!range_is_zero(__skb, offsetofend(struct __sk_buff, priority),
+			   offsetof(struct __sk_buff, ifindex)))
+		return -EINVAL;
+
+	/* ifindex is allowed */
+
+	if (!range_is_zero(__skb, offsetofend(struct __sk_buff, ifindex),
 			   offsetof(struct __sk_buff, cb)))
 		return -EINVAL;
 
@@ -381,6 +387,7 @@ static void convert_skb_to___skb(struct sk_buff *skb, struct __sk_buff *__skb)
 
 	__skb->mark = skb->mark;
 	__skb->priority = skb->priority;
+	__skb->ifindex = skb->dev->ifindex;
 	__skb->tstamp = skb->tstamp;
 	memcpy(__skb->cb, &cb->data, QDISC_CB_PRIV_LEN);
 	__skb->wire_len = cb->pkt_len;
@@ -391,6 +398,8 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 			  union bpf_attr __user *uattr)
 {
 	bool is_l2 = false, is_direct_pkt_access = false;
+	struct net *net = current->nsproxy->net_ns;
+	struct net_device *dev = net->loopback_dev;
 	u32 size = kattr->test.data_size_in;
 	u32 repeat = kattr->test.repeat;
 	struct __sk_buff *ctx = NULL;
@@ -432,7 +441,7 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 		kfree(ctx);
 		return -ENOMEM;
 	}
-	sock_net_set(sk, current->nsproxy->net_ns);
+	sock_net_set(sk, net);
 	sock_init_data(NULL, sk);
 
 	skb = build_skb(data, 0);
@@ -446,7 +455,14 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 
 	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN);
 	__skb_put(skb, size);
-	skb->protocol = eth_type_trans(skb, current->nsproxy->net_ns->loopback_dev);
+	if (ctx && ctx->ifindex > 1) {
+		dev = dev_get_by_index(net, ctx->ifindex);
+		if (!dev) {
+			ret = -ENODEV;
+			goto out;
+		}
+	}
+	skb->protocol = eth_type_trans(skb, dev);
 	skb_reset_network_header(skb);
 
 	switch (skb->protocol) {
@@ -502,6 +518,8 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 	ret = bpf_ctx_finish(kattr, uattr, ctx,
 			     sizeof(struct __sk_buff));
 out:
+	if (dev && dev != net->loopback_dev)
+		dev_put(dev);
 	kfree_skb(skb);
 	bpf_sk_storage_free(sk);
 	kfree(sk);
diff --git a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c
index 7021b92af313..25de86af2d03 100644
--- a/tools/testing/selftests/bpf/prog_tests/skb_ctx.c
+++ b/tools/testing/selftests/bpf/prog_tests/skb_ctx.c
@@ -11,6 +11,7 @@ void test_skb_ctx(void)
 		.cb[3] = 4,
 		.cb[4] = 5,
 		.priority = 6,
+		.ifindex = 1,
 		.tstamp = 7,
 		.wire_len = 100,
 		.gso_segs = 8,
@@ -92,6 +93,10 @@ void test_skb_ctx(void)
 		   "ctx_out_priority",
 		   "skb->priority == %d, expected %d\n",
 		   skb.priority, 7);
+	CHECK_ATTR(skb.ifindex != 1,
+		   "ctx_out_ifindex",
+		   "skb->ifindex == %d, expected %d\n",
+		   skb.ifindex, 1);
 	CHECK_ATTR(skb.tstamp != 8,
 		   "ctx_out_tstamp",
 		   "skb->tstamp == %lld, expected %d\n",