tcp/dccp: install syn_recv requests into ehash table
In this patch, we insert request sockets into the regular TCP/DCCP ehash table (where ESTABLISHED and TIMEWAIT sockets are) instead of using the per-listener hash table.

ACK packets find SYN_RECV pseudo sockets without having to find and lock the listener.

In nominal conditions, this halves the pressure on the listener lock.

Note that this will allow for SO_REUSEPORT refinements, so that we can select a listener using cpu/numa affinities instead of the prior 'consistent hash', since only SYN packets will apply this selection logic.

We will shrink listen_sock in the following patch to ease code review.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ying Cai <ycai@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:

committed by
David S. Miller

parent
2feda34192
commit
079096f103
@@ -1224,7 +1224,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
|
||||
.route_req = tcp_v4_route_req,
|
||||
.init_seq = tcp_v4_init_sequence,
|
||||
.send_synack = tcp_v4_send_synack,
|
||||
.queue_hash_add = inet_csk_reqsk_queue_hash_add,
|
||||
};
|
||||
|
||||
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
|
||||
@@ -1343,34 +1342,11 @@ put_and_exit:
|
||||
}
|
||||
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
|
||||
|
||||
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
|
||||
static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
|
||||
{
|
||||
const struct tcphdr *th = tcp_hdr(skb);
|
||||
const struct iphdr *iph = ip_hdr(skb);
|
||||
struct request_sock *req;
|
||||
struct sock *nsk;
|
||||
|
||||
req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
|
||||
if (req) {
|
||||
nsk = tcp_check_req(sk, skb, req, false);
|
||||
if (!nsk || nsk == sk)
|
||||
reqsk_put(req);
|
||||
return nsk;
|
||||
}
|
||||
|
||||
nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
|
||||
th->source, iph->daddr, th->dest, inet_iif(skb));
|
||||
|
||||
if (nsk) {
|
||||
if (nsk->sk_state != TCP_TIME_WAIT) {
|
||||
bh_lock_sock(nsk);
|
||||
return nsk;
|
||||
}
|
||||
inet_twsk_put(inet_twsk(nsk));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SYN_COOKIES
|
||||
const struct tcphdr *th = tcp_hdr(skb);
|
||||
|
||||
if (!th->syn)
|
||||
sk = cookie_v4_check(sk, skb);
|
||||
#endif
|
||||
@@ -1409,10 +1385,10 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
|
||||
goto csum_err;
|
||||
|
||||
if (sk->sk_state == TCP_LISTEN) {
|
||||
struct sock *nsk = tcp_v4_hnd_req(sk, skb);
|
||||
struct sock *nsk = tcp_v4_cookie_check(sk, skb);
|
||||
|
||||
if (!nsk)
|
||||
goto discard;
|
||||
|
||||
if (nsk != sk) {
|
||||
sock_rps_save_rxhash(nsk, skb);
|
||||
sk_mark_napi_id(nsk, skb);
|
||||
@@ -1603,6 +1579,29 @@ process:
|
||||
if (sk->sk_state == TCP_TIME_WAIT)
|
||||
goto do_time_wait;
|
||||
|
||||
if (sk->sk_state == TCP_NEW_SYN_RECV) {
|
||||
struct request_sock *req = inet_reqsk(sk);
|
||||
struct sock *nsk = NULL;
|
||||
|
||||
sk = req->rsk_listener;
|
||||
if (tcp_v4_inbound_md5_hash(sk, skb))
|
||||
goto discard_and_relse;
|
||||
if (sk->sk_state == TCP_LISTEN)
|
||||
nsk = tcp_check_req(sk, skb, req, false);
|
||||
if (!nsk) {
|
||||
reqsk_put(req);
|
||||
goto discard_it;
|
||||
}
|
||||
if (nsk == sk) {
|
||||
sock_hold(sk);
|
||||
reqsk_put(req);
|
||||
} else if (tcp_child_process(sk, nsk, skb)) {
|
||||
tcp_v4_send_reset(nsk, skb);
|
||||
goto discard_it;
|
||||
} else {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
|
||||
NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
|
||||
goto discard_and_relse;
|
||||
@@ -1830,35 +1829,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
|
||||
++st->num;
|
||||
++st->offset;
|
||||
|
||||
if (st->state == TCP_SEQ_STATE_OPENREQ) {
|
||||
struct request_sock *req = cur;
|
||||
|
||||
icsk = inet_csk(st->syn_wait_sk);
|
||||
req = req->dl_next;
|
||||
while (1) {
|
||||
while (req) {
|
||||
if (req->rsk_ops->family == st->family) {
|
||||
cur = req;
|
||||
goto out;
|
||||
}
|
||||
req = req->dl_next;
|
||||
}
|
||||
if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
|
||||
break;
|
||||
get_req:
|
||||
req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
|
||||
}
|
||||
sk = sk_nulls_next(st->syn_wait_sk);
|
||||
st->state = TCP_SEQ_STATE_LISTENING;
|
||||
spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
|
||||
} else {
|
||||
icsk = inet_csk(sk);
|
||||
spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
|
||||
if (reqsk_queue_len(&icsk->icsk_accept_queue))
|
||||
goto start_req;
|
||||
spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
|
||||
sk = sk_nulls_next(sk);
|
||||
}
|
||||
sk = sk_nulls_next(sk);
|
||||
get_sk:
|
||||
sk_nulls_for_each_from(sk, node) {
|
||||
if (!net_eq(sock_net(sk), net))
|
||||
@@ -1868,15 +1839,6 @@ get_sk:
|
||||
goto out;
|
||||
}
|
||||
icsk = inet_csk(sk);
|
||||
spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
|
||||
if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
|
||||
start_req:
|
||||
st->syn_wait_sk = sk;
|
||||
st->state = TCP_SEQ_STATE_OPENREQ;
|
||||
st->sbucket = 0;
|
||||
goto get_req;
|
||||
}
|
||||
spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
|
||||
}
|
||||
spin_unlock_bh(&ilb->lock);
|
||||
st->offset = 0;
|
||||
@@ -2008,7 +1970,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
|
||||
void *rc = NULL;
|
||||
|
||||
switch (st->state) {
|
||||
case TCP_SEQ_STATE_OPENREQ:
|
||||
case TCP_SEQ_STATE_LISTENING:
|
||||
if (st->bucket >= INET_LHTABLE_SIZE)
|
||||
break;
|
||||
@@ -2067,7 +2028,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
|
||||
}
|
||||
|
||||
switch (st->state) {
|
||||
case TCP_SEQ_STATE_OPENREQ:
|
||||
case TCP_SEQ_STATE_LISTENING:
|
||||
rc = listening_get_next(seq, v);
|
||||
if (!rc) {
|
||||
@@ -2092,11 +2052,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
|
||||
struct tcp_iter_state *st = seq->private;
|
||||
|
||||
switch (st->state) {
|
||||
case TCP_SEQ_STATE_OPENREQ:
|
||||
if (v) {
|
||||
struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
|
||||
spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
|
||||
}
|
||||
case TCP_SEQ_STATE_LISTENING:
|
||||
if (v != SEQ_START_TOKEN)
|
||||
spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
|
||||
@@ -2269,18 +2224,12 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
|
||||
}
|
||||
st = seq->private;
|
||||
|
||||
switch (st->state) {
|
||||
case TCP_SEQ_STATE_LISTENING:
|
||||
case TCP_SEQ_STATE_ESTABLISHED:
|
||||
if (sk->sk_state == TCP_TIME_WAIT)
|
||||
get_timewait4_sock(v, seq, st->num);
|
||||
else
|
||||
get_tcp4_sock(v, seq, st->num);
|
||||
break;
|
||||
case TCP_SEQ_STATE_OPENREQ:
|
||||
if (sk->sk_state == TCP_TIME_WAIT)
|
||||
get_timewait4_sock(v, seq, st->num);
|
||||
else if (sk->sk_state == TCP_NEW_SYN_RECV)
|
||||
get_openreq4(v, seq, st->num);
|
||||
break;
|
||||
}
|
||||
else
|
||||
get_tcp4_sock(v, seq, st->num);
|
||||
out:
|
||||
seq_pad(seq, '\n');
|
||||
return 0;
|
||||
|
Reference in New Issue
Block a user