xdp: change ndo_xdp_xmit API to support bulking

This patch change the API for ndo_xdp_xmit to support bulking
xdp_frames.

When kernel is compiled with CONFIG_RETPOLINE, XDP sees a huge slowdown.
Most of the slowdown is caused by DMA API indirect function calls, but
also the net_device->ndo_xdp_xmit() call.

Benchmarked patch with CONFIG_RETPOLINE, using xdp_redirect_map with
single flow/core test (CPU E5-1650 v4 @ 3.60GHz), showed
performance improved:
 for driver ixgbe: 6,042,682 pps -> 6,853,768 pps = +811,086 pps
 for driver i40e : 6,187,169 pps -> 6,724,519 pps = +537,350 pps

With frames avail as a bulk inside the driver ndo_xdp_xmit call,
further optimizations are possible, like bulk DMA-mapping for TX.

Testing without CONFIG_RETPOLINE show the same performance for
physical NIC drivers.

The virtual NIC driver tun sees a huge performance boost, as it can
avoid doing per frame producer locking, but instead amortize the
locking cost over the bulk.

V2: Fix compile errors reported by kbuild test robot <lkp@intel.com>
V4: Isolated ndo, driver changes and callers.

Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
This commit is contained in:
Jesper Dangaard Brouer
2018-05-24 16:46:12 +02:00
committed by Alexei Starovoitov
parent 389ab7f01a
commit 735fc4054b
8 changed files with 140 additions and 65 deletions

View File

@@ -70,6 +70,7 @@
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
#include <net/sock.h>
#include <net/xdp.h>
#include <linux/seq_file.h>
#include <linux/uio.h>
#include <linux/skb_array.h>
@@ -1290,34 +1291,44 @@ static const struct net_device_ops tun_netdev_ops = {
.ndo_get_stats64 = tun_net_get_stats64,
};
static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
static int tun_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames)
{
struct tun_struct *tun = netdev_priv(dev);
struct tun_file *tfile;
u32 numqueues;
int ret = 0;
int drops = 0;
int cnt = n;
int i;
rcu_read_lock();
numqueues = READ_ONCE(tun->numqueues);
if (!numqueues) {
ret = -ENOSPC;
goto out;
rcu_read_unlock();
return -ENXIO; /* Caller will free/return all frames */
}
tfile = rcu_dereference(tun->tfiles[smp_processor_id() %
numqueues]);
/* Encode the XDP flag into lowest bit for consumer to differ
* XDP buffer from sk_buff.
*/
if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
this_cpu_inc(tun->pcpu_stats->tx_dropped);
ret = -ENOSPC;
}
out:
spin_lock(&tfile->tx_ring.producer_lock);
for (i = 0; i < n; i++) {
struct xdp_frame *xdp = frames[i];
/* Encode the XDP flag into lowest bit for consumer to differ
* XDP buffer from sk_buff.
*/
void *frame = tun_xdp_to_ptr(xdp);
if (__ptr_ring_produce(&tfile->tx_ring, frame)) {
this_cpu_inc(tun->pcpu_stats->tx_dropped);
xdp_return_frame_rx_napi(xdp);
drops++;
}
}
spin_unlock(&tfile->tx_ring.producer_lock);
rcu_read_unlock();
return ret;
return cnt - drops;
}
static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
@@ -1327,7 +1338,7 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
if (unlikely(!frame))
return -EOVERFLOW;
return tun_xdp_xmit(dev, frame);
return tun_xdp_xmit(dev, 1, &frame);
}
static void tun_xdp_flush(struct net_device *dev)