tcp/dccp: install syn_recv requests into ehash table

In this patch, we insert request sockets into TCP/DCCP
regular ehash table (where ESTABLISHED and TIMEWAIT sockets
are) instead of using the per listener hash table.

ACK packets find SYN_RECV pseudo sockets without having
to find and lock the listener.

In nominal conditions, this halves pressure on listener lock.

Note that this will allow for SO_REUSEPORT refinements,
so that we can select a listener using cpu/numa affinities instead
of the prior 'consistent hash', since only SYN packets will
apply this selection logic.

We will shrink listen_sock in the following patch to ease
code review.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ying Cai <ycai@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:
Eric Dumazet
2015-10-02 11:43:32 -07:00
committed by David S. Miller
parent 2feda34192
commit 079096f103
15 changed files with 163 additions and 504 deletions

View File

@@ -727,7 +727,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
.route_req = tcp_v6_route_req,
.init_seq = tcp_v6_init_sequence,
.send_synack = tcp_v6_send_synack,
.queue_hash_add = inet6_csk_reqsk_queue_hash_add,
};
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
@@ -938,37 +937,11 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
}
static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb)
static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
{
const struct tcphdr *th = tcp_hdr(skb);
struct request_sock *req;
struct sock *nsk;
/* Find possible connection requests. */
req = inet6_csk_search_req(sk, th->source,
&ipv6_hdr(skb)->saddr,
&ipv6_hdr(skb)->daddr, tcp_v6_iif(skb));
if (req) {
nsk = tcp_check_req(sk, skb, req, false);
if (!nsk || nsk == sk)
reqsk_put(req);
return nsk;
}
nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo,
&ipv6_hdr(skb)->saddr, th->source,
&ipv6_hdr(skb)->daddr, ntohs(th->dest),
tcp_v6_iif(skb));
if (nsk) {
if (nsk->sk_state != TCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
inet_twsk_put(inet_twsk(nsk));
return NULL;
}
#ifdef CONFIG_SYN_COOKIES
const struct tcphdr *th = tcp_hdr(skb);
if (!th->syn)
sk = cookie_v6_check(sk, skb);
#endif
@@ -1258,15 +1231,11 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
goto csum_err;
if (sk->sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v6_hnd_req(sk, skb);
struct sock *nsk = tcp_v6_cookie_check(sk, skb);
if (!nsk)
goto discard;
/*
* Queue it on the new socket if the new socket is active,
* otherwise we just shortcircuit this and continue with
* the new socket..
*/
if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb);
sk_mark_napi_id(nsk, skb);
@@ -1402,6 +1371,33 @@ process:
if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait;
if (sk->sk_state == TCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
struct sock *nsk = NULL;
sk = req->rsk_listener;
tcp_v6_fill_cb(skb, hdr, th);
if (tcp_v6_inbound_md5_hash(sk, skb)) {
reqsk_put(req);
goto discard_it;
}
if (sk->sk_state == TCP_LISTEN)
nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
goto discard_it;
}
if (nsk == sk) {
sock_hold(sk);
reqsk_put(req);
tcp_v6_restore_cb(skb);
} else if (tcp_child_process(sk, nsk, skb)) {
tcp_v6_send_reset(nsk, skb);
goto discard_it;
} else {
return 0;
}
}
if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse;
@@ -1765,18 +1761,12 @@ static int tcp6_seq_show(struct seq_file *seq, void *v)
}
st = seq->private;
switch (st->state) {
case TCP_SEQ_STATE_LISTENING:
case TCP_SEQ_STATE_ESTABLISHED:
if (sk->sk_state == TCP_TIME_WAIT)
get_timewait6_sock(seq, v, st->num);
else
get_tcp6_sock(seq, v, st->num);
break;
case TCP_SEQ_STATE_OPENREQ:
if (sk->sk_state == TCP_TIME_WAIT)
get_timewait6_sock(seq, v, st->num);
else if (sk->sk_state == TCP_NEW_SYN_RECV)
get_openreq6(seq, v, st->num);
break;
}
else
get_tcp6_sock(seq, v, st->num);
out:
return 0;
}