tcp: switch orphan_count to bare per-cpu counters
[ Upstream commit 19757cebf0c5016a1f36f7fe9810a9f0b33c0832 ] Use of percpu_counter structure to track count of orphaned sockets is causing problems on modern hosts with 256 cpus or more. Stefan Bach reported a serious spinlock contention in real workloads, that I was able to reproduce with a netfilter rule dropping incoming FIN packets. 53.56% server [kernel.kallsyms] [k] queued_spin_lock_slowpath | ---queued_spin_lock_slowpath | --53.51%--_raw_spin_lock_irqsave | --53.51%--__percpu_counter_sum tcp_check_oom | |--39.03%--__tcp_close | tcp_close | inet_release | inet6_release | sock_close | __fput | ____fput | task_work_run | exit_to_usermode_loop | do_syscall_64 | entry_SYSCALL_64_after_hwframe | __GI___libc_close | --14.48%--tcp_out_of_resources tcp_write_timeout tcp_retransmit_timer tcp_write_timer_handler tcp_write_timer call_timer_fn expire_timers __run_timers run_timer_softirq __softirqentry_text_start As explained in commit cf86a086a1
("net/dst: use a smaller percpu_counter batch for dst entries accounting"), default batch size is too big for the default value of tcp_max_orphans (262144). But even if we reduce batch sizes, there would still be cases where the estimated count of orphans is beyond the limit, and where tcp_too_many_orphans() has to call the expensive percpu_counter_sum_positive(). One solution is to use plain per-cpu counters, and have a timer to periodically refresh this cache. Updating this cache every 100ms seems about right, tcp pressure state is not radically changing over shorter periods. percpu_counter was nice 15 years ago while hosts had less than 16 cpus, not anymore by current standards. v2: Fix the build issue for CONFIG_CRYPTO_DEV_CHELSIO_TLS=m, reported by kernel test robot <lkp@intel.com> Remove unused socket argument from tcp_too_many_orphans() Fixes: dd24c00191
("net: Use a percpu_counter for orphan_count") Signed-off-by: Eric Dumazet <edumazet@google.com> Reported-by: Stefan Bach <sfb@google.com> Cc: Neal Cardwell <ncardwell@google.com> Acked-by: Neal Cardwell <ncardwell@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> Signed-off-by: Sasha Levin <sashal@kernel.org>
This commit is contained in:

committed by
Greg Kroah-Hartman

parent
c85c6fadbe
commit
a342cb4772
@@ -280,8 +280,8 @@
|
||||
#include <asm/ioctls.h>
|
||||
#include <net/busy_poll.h>
|
||||
|
||||
struct percpu_counter tcp_orphan_count;
|
||||
EXPORT_SYMBOL_GPL(tcp_orphan_count);
|
||||
DEFINE_PER_CPU(unsigned int, tcp_orphan_count);
|
||||
EXPORT_PER_CPU_SYMBOL_GPL(tcp_orphan_count);
|
||||
|
||||
long sysctl_tcp_mem[3] __read_mostly;
|
||||
EXPORT_SYMBOL(sysctl_tcp_mem);
|
||||
@@ -2394,11 +2394,36 @@ void tcp_shutdown(struct sock *sk, int how)
|
||||
}
|
||||
EXPORT_SYMBOL(tcp_shutdown);
|
||||
|
||||
/*
 * tcp_orphan_count_sum - add up the per-cpu tcp_orphan_count values.
 *
 * Walks every possible CPU, accumulates that CPU's tcp_orphan_count slot
 * into a signed int, and returns the total clamped so that it is never
 * negative.
 */
int tcp_orphan_count_sum(void)
|
||||
{
|
||||
int i, total = 0;
|
||||
|
||||
for_each_possible_cpu(i)
|
||||
total += per_cpu(tcp_orphan_count, i);
|
||||
|
||||
/*
 * NOTE(review): the clamp implies the signed total can come out negative,
 * presumably because an increment and its matching decrement may land on
 * different CPUs -- confirm against the decrement sites (not visible here).
 */
return max(total, 0);
|
||||
}
|
||||
|
||||
/* Last orphan total stored by tcp_orphan_update(); read by tcp_too_many_orphans(). */
static int tcp_orphan_cache;
|
||||
/* Timer whose callback, tcp_orphan_update(), refreshes tcp_orphan_cache. */
static struct timer_list tcp_orphan_timer;
|
||||
/* Delay between two refreshes of the cache: 100 ms converted to jiffies. */
#define TCP_ORPHAN_TIMER_PERIOD msecs_to_jiffies(100)
|
||||
|
||||
/*
 * tcp_orphan_update - timer callback that refreshes the cached orphan count.
 *
 * Stores the result of tcp_orphan_count_sum() into tcp_orphan_cache and then
 * re-arms tcp_orphan_timer to fire again TCP_ORPHAN_TIMER_PERIOD jiffies from
 * now. The timer_list argument is not used.
 */
static void tcp_orphan_update(struct timer_list *unused)
|
||||
{
|
||||
/* WRITE_ONCE pairs with the READ_ONCE in tcp_too_many_orphans(). */
WRITE_ONCE(tcp_orphan_cache, tcp_orphan_count_sum());
|
||||
mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
|
||||
}
|
||||
|
||||
/*
 * tcp_too_many_orphans - compare the cached orphan count with the limit.
 * @shift: left shift applied to the cached count before the comparison.
 *
 * Reads tcp_orphan_cache (the value last stored by tcp_orphan_update(), not a
 * fresh per-cpu sum) and returns true when that value, shifted left by @shift,
 * is strictly greater than sysctl_tcp_max_orphans.
 */
static bool tcp_too_many_orphans(int shift)
|
||||
{
|
||||
return READ_ONCE(tcp_orphan_cache) << shift > sysctl_tcp_max_orphans;
|
||||
}
|
||||
|
||||
bool tcp_check_oom(struct sock *sk, int shift)
|
||||
{
|
||||
bool too_many_orphans, out_of_socket_memory;
|
||||
|
||||
too_many_orphans = tcp_too_many_orphans(sk, shift);
|
||||
too_many_orphans = tcp_too_many_orphans(shift);
|
||||
out_of_socket_memory = tcp_out_of_memory(sk);
|
||||
|
||||
if (too_many_orphans)
|
||||
@@ -2508,7 +2533,7 @@ adjudge_to_death:
|
||||
/* remove backlog if any, without releasing ownership. */
|
||||
__release_sock(sk);
|
||||
|
||||
percpu_counter_inc(sk->sk_prot->orphan_count);
|
||||
this_cpu_inc(tcp_orphan_count);
|
||||
|
||||
/* Have we already been destroyed by a softirq or backlog? */
|
||||
if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
|
||||
@@ -4145,7 +4170,10 @@ void __init tcp_init(void)
|
||||
sizeof_field(struct sk_buff, cb));
|
||||
|
||||
percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
|
||||
percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
|
||||
|
||||
timer_setup(&tcp_orphan_timer, tcp_orphan_update, TIMER_DEFERRABLE);
|
||||
mod_timer(&tcp_orphan_timer, jiffies + TCP_ORPHAN_TIMER_PERIOD);
|
||||
|
||||
inet_hashinfo_init(&tcp_hashinfo);
|
||||
inet_hashinfo2_init(&tcp_hashinfo, "tcp_listen_portaddr_hash",
|
||||
thash_entries, 21, /* one slot per 2 MB*/
|
||||
|
Reference in New Issue
Block a user