tcp: switch rtt estimations to usec resolution
Upcoming congestion controls for TCP require usec resolution for RTT estimations. Millisecond resolution is simply not enough these days.

FQ/pacing in DC environments also requires this change, for finer control and for removal of the bimodal behavior caused by the current hack in tcp_update_pacing_rate() for 'small rtt'.

TCP_CONG_RTT_STAMP is no longer needed.

As Julian Anastasov pointed out, we need to keep user compatibility: tcp_metrics used to export RTT and RTTVAR in msec resolution, so we added RTT_US and RTTVAR_US. An iproute2 patch is needed to use the new attributes if provided by the kernel.

In this example, the ss command displays an srtt of 32 usecs (10Gbit link):

lpk51:~# ./ss -i dst lpk52
Netid State Recv-Q Send-Q Local Address:Port Peer Address:Port
tcp ESTAB 0 1 10.246.11.51:42959 10.246.11.52:64614
cubic wscale:6,6 rto:201 rtt:0.032/0.001 ato:40 mss:1448 cwnd:10 send 3620.0Mbps pacing_rate 7240.0Mbps unacked:1 rcv_rtt:993 rcv_space:29559

The updated iproute2 ip command displays:

lpk51:~# ./ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 274us rttvar 213us source 10.246.11.51

The old binary displays:

lpk51:~# ip tcp_metrics | grep 10.246.11.52
10.246.11.52 age 561.914sec cwnd 10 rtt 250us rttvar 125us source 10.246.11.51

With help from Julian Anastasov, Stephen Hemminger and Yuchung Cheng.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Larry Brakmo <brakmo@google.com>
Cc: Julian Anastasov <ja@ssi.bg>
Signed-off-by: David S. Miller <davem@davemloft.net>
This commit is contained in:

committed by
David S. Miller

parent
363ec39235
commit
740b0f1841
@@ -866,11 +866,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
|
||||
if (clone_it) {
|
||||
const struct sk_buff *fclone = skb + 1;
|
||||
|
||||
/* If congestion control is doing timestamping, we must
|
||||
* take such a timestamp before we potentially clone/copy.
|
||||
*/
|
||||
if (icsk->icsk_ca_ops->flags & TCP_CONG_RTT_STAMP)
|
||||
__net_timestamp(skb);
|
||||
skb_mstamp_get(&skb->skb_mstamp);
|
||||
|
||||
if (unlikely(skb->fclone == SKB_FCLONE_ORIG &&
|
||||
fclone->fclone == SKB_FCLONE_CLONE))
|
||||
@@ -1974,7 +1970,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
|
||||
struct inet_connection_sock *icsk = inet_csk(sk);
|
||||
struct tcp_sock *tp = tcp_sk(sk);
|
||||
u32 timeout, tlp_time_stamp, rto_time_stamp;
|
||||
u32 rtt = tp->srtt >> 3;
|
||||
u32 rtt = usecs_to_jiffies(tp->srtt_us >> 3);
|
||||
|
||||
if (WARN_ON(icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS))
|
||||
return false;
|
||||
@@ -1996,7 +1992,7 @@ bool tcp_schedule_loss_probe(struct sock *sk)
|
||||
/* Schedule a loss probe in 2*RTT for SACK capable connections
|
||||
* in Open state, that are either limited by cwnd or application.
|
||||
*/
|
||||
if (sysctl_tcp_early_retrans < 3 || !tp->srtt || !tp->packets_out ||
|
||||
if (sysctl_tcp_early_retrans < 3 || !tp->srtt_us || !tp->packets_out ||
|
||||
!tcp_is_sack(tp) || inet_csk(sk)->icsk_ca_state != TCP_CA_Open)
|
||||
return false;
|
||||
|
||||
@@ -3050,8 +3046,9 @@ void tcp_send_delayed_ack(struct sock *sk)
|
||||
* Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
|
||||
* directly.
|
||||
*/
|
||||
if (tp->srtt) {
|
||||
int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN);
|
||||
if (tp->srtt_us) {
|
||||
int rtt = max_t(int, usecs_to_jiffies(tp->srtt_us >> 3),
|
||||
TCP_DELACK_MIN);
|
||||
|
||||
if (rtt < max_ato)
|
||||
max_ato = rtt;
|
||||
|
Reference in New Issue
Block a user