  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * INET An implementation of the TCP/IP protocol suite for the LINUX
  4. * operating system. INET is implemented using the BSD Socket
  5. * interface as the means of communication with the user level.
  6. *
  7. * Generic TIME_WAIT sockets functions
  8. *
 * From code originally in TCP
  10. */
  11. #include <linux/kernel.h>
  12. #include <linux/slab.h>
  13. #include <linux/module.h>
  14. #include <net/inet_hashtables.h>
  15. #include <net/inet_timewait_sock.h>
  16. #include <net/ip.h>
  17. /**
  18. * inet_twsk_bind_unhash - unhash a timewait socket from bind hash
  19. * @tw: timewait socket
  20. * @hashinfo: hashinfo pointer
  21. *
  22. * unhash a timewait socket from bind hash, if hashed.
  23. * bind hash lock must be held by caller.
 * Note: the bind-hash reference on @tw is dropped here via __sock_put().
  25. */
void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
			   struct inet_hashinfo *hashinfo)
{
	struct inet_bind2_bucket *tb2 = tw->tw_tb2;
	struct inet_bind_bucket *tb = tw->tw_tb;

	/* tw_tb is only set once the socket went through
	 * inet_twsk_hashdance(); if it is NULL there is nothing to undo.
	 */
	if (!tb)
		return;

	/* Unlink from the port bind bucket; the destroy helper releases the
	 * bucket itself if no owner remains.
	 */
	__hlist_del(&tw->tw_bind_node);
	tw->tw_tb = NULL;
	inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);

	/* Same for the (address, port) bind2 bucket. */
	__hlist_del(&tw->tw_bind2_node);
	tw->tw_tb2 = NULL;
	inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);

	/* Drop the bhash-chain reference taken in inet_twsk_hashdance()
	 * (one of the three set by refcount_set(&tw->tw_refcnt, 3)).
	 */
	__sock_put((struct sock *)tw);
}
/* Must be called with locally disabled BHs. */
static void inet_twsk_kill(struct inet_timewait_sock *tw)
{
	struct inet_hashinfo *hashinfo = tw->tw_dr->hashinfo;
	spinlock_t *lock = inet_ehash_lockp(hashinfo, tw->tw_hash);
	struct inet_bind_hashbucket *bhead, *bhead2;

	/* Unlink the timewait socket from its ehash chain. */
	spin_lock(lock);
	sk_nulls_del_node_init_rcu((struct sock *)tw);
	spin_unlock(lock);

	/* Disassociate with bind bucket. */
	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), tw->tw_num,
			hashinfo->bhash_size)];
	bhead2 = inet_bhashfn_portaddr(hashinfo, (struct sock *)tw,
				       twsk_net(tw), tw->tw_num);

	/* Lock order (bhash bucket, then bhash2 bucket) matches
	 * inet_twsk_hashdance().
	 */
	spin_lock(&bhead->lock);
	spin_lock(&bhead2->lock);
	inet_twsk_bind_unhash(tw, hashinfo);
	spin_unlock(&bhead2->lock);
	spin_unlock(&bhead->lock);

	/* Undo the accounting done in __inet_twsk_schedule(). */
	refcount_dec(&tw->tw_dr->tw_refcount);
	/* Drop the timer's reference on tw (see refcnt comment in
	 * inet_twsk_hashdance()).
	 */
	inet_twsk_put(tw);
}
  63. void inet_twsk_free(struct inet_timewait_sock *tw)
  64. {
  65. struct module *owner = tw->tw_prot->owner;
  66. twsk_destructor((struct sock *)tw);
  67. #ifdef SOCK_REFCNT_DEBUG
  68. pr_debug("%s timewait_sock %p released\n", tw->tw_prot->name, tw);
  69. #endif
  70. kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw);
  71. module_put(owner);
  72. }
  73. void inet_twsk_put(struct inet_timewait_sock *tw)
  74. {
  75. if (refcount_dec_and_test(&tw->tw_refcnt))
  76. inet_twsk_free(tw);
  77. }
  78. EXPORT_SYMBOL_GPL(inet_twsk_put);
  79. static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw,
  80. struct hlist_nulls_head *list)
  81. {
  82. hlist_nulls_add_head_rcu(&tw->tw_node, list);
  83. }
  84. static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw,
  85. struct hlist_head *list)
  86. {
  87. hlist_add_head(&tw->tw_bind_node, list);
  88. }
  89. static void inet_twsk_add_bind2_node(struct inet_timewait_sock *tw,
  90. struct hlist_head *list)
  91. {
  92. hlist_add_head(&tw->tw_bind2_node, list);
  93. }
/*
 * Enter the time wait state. This is called with locally disabled BH.
 * Essentially we whip up a timewait bucket, copy the relevant info into it
 * from the SK, and mess with hash chains and list linkage.
 */
void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk,
			 struct inet_hashinfo *hashinfo)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_ehash_bucket *ehead = inet_ehash_bucket(hashinfo, sk->sk_hash);
	spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
	struct inet_bind_hashbucket *bhead, *bhead2;

	/* Step 1: Put TW into bind hash. Original socket stays there too.
	   Note, that any socket with inet->num != 0 MUST be bound in
	   binding cache, even if it is closed.
	 */
	bhead = &hashinfo->bhash[inet_bhashfn(twsk_net(tw), inet->inet_num,
			hashinfo->bhash_size)];
	bhead2 = inet_bhashfn_portaddr(hashinfo, sk, twsk_net(tw), inet->inet_num);

	/* Lock order (bhash bucket, then bhash2 bucket) matches
	 * inet_twsk_kill().
	 */
	spin_lock(&bhead->lock);
	spin_lock(&bhead2->lock);

	/* Share the bind buckets already owned by the original socket. */
	tw->tw_tb = icsk->icsk_bind_hash;
	WARN_ON(!icsk->icsk_bind_hash);
	inet_twsk_add_bind_node(tw, &tw->tw_tb->owners);

	tw->tw_tb2 = icsk->icsk_bind2_hash;
	WARN_ON(!icsk->icsk_bind2_hash);
	inet_twsk_add_bind2_node(tw, &tw->tw_tb2->deathrow);

	spin_unlock(&bhead2->lock);
	spin_unlock(&bhead->lock);

	spin_lock(lock);

	/* Step 2: Hash TW into the ehash chain. */
	inet_twsk_add_node_rcu(tw, &ehead->chain);

	/* Step 3: Remove SK from hash chain */
	if (__sk_nulls_del_node_init_rcu(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);

	spin_unlock(lock);

	/* tw_refcnt is set to 3 because we have :
	 * - one reference for bhash chain.
	 * - one reference for ehash chain.
	 * - one reference for timer.
	 * We can use atomic_set() because prior spin_lock()/spin_unlock()
	 * committed into memory all tw fields.
	 * Also note that after this point, we lost our implicit reference
	 * so we are not allowed to use tw anymore.
	 */
	refcount_set(&tw->tw_refcnt, 3);
}
EXPORT_SYMBOL_GPL(inet_twsk_hashdance);
  142. static void tw_timer_handler(struct timer_list *t)
  143. {
  144. struct inet_timewait_sock *tw = from_timer(tw, t, tw_timer);
  145. inet_twsk_kill(tw);
  146. }
/* Allocate and partially initialize a timewait socket mirroring @sk.
 * Returns NULL if the tw bucket limit is hit or allocation fails.
 * The returned tw has tw_refcnt == 0; it becomes visible to lookups only
 * after inet_twsk_hashdance() publishes it and sets the refcount.
 */
struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk,
					   struct inet_timewait_death_row *dr,
					   const int state)
{
	struct inet_timewait_sock *tw;

	/* Enforce sysctl_max_tw_buckets.  The "- 1" compensates for the
	 * base reference held on tw_refcount itself — NOTE(review): confirm
	 * against the death-row initialization.
	 */
	if (refcount_read(&dr->tw_refcount) - 1 >=
	    READ_ONCE(dr->sysctl_max_tw_buckets))
		return NULL;

	/* GFP_ATOMIC: callers run in BH context (see tcp_input.c). */
	tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab,
			      GFP_ATOMIC);
	if (tw) {
		const struct inet_sock *inet = inet_sk(sk);

		tw->tw_dr = dr;

		/* Give us an identity. */
		tw->tw_daddr = inet->inet_daddr;
		tw->tw_rcv_saddr = inet->inet_rcv_saddr;
		tw->tw_bound_dev_if = sk->sk_bound_dev_if;
		tw->tw_tos = inet->tos;
		tw->tw_num = inet->inet_num;
		tw->tw_state = TCP_TIME_WAIT;
		tw->tw_substate = state;
		tw->tw_sport = inet->inet_sport;
		tw->tw_dport = inet->inet_dport;
		tw->tw_family = sk->sk_family;
		tw->tw_reuse = sk->sk_reuse;
		tw->tw_reuseport = sk->sk_reuseport;
		tw->tw_hash = sk->sk_hash;
		tw->tw_ipv6only = 0;
		tw->tw_transparent = inet->transparent;
		tw->tw_prot = sk->sk_prot_creator;
		atomic64_set(&tw->tw_cookie, atomic64_read(&sk->sk_cookie));
		twsk_net_set(tw, sock_net(sk));
		/* TIMER_PINNED: the timer fires on the CPU that armed it. */
		timer_setup(&tw->tw_timer, tw_timer_handler, TIMER_PINNED);
		/*
		 * Because we use RCU lookups, we should not set tw_refcnt
		 * to a non null value before everything is setup for this
		 * timewait socket.
		 */
		refcount_set(&tw->tw_refcnt, 0);

		/* Pin the protocol module until inet_twsk_free(). */
		__module_get(tw->tw_prot->owner);
	}

	return tw;
}
EXPORT_SYMBOL_GPL(inet_twsk_alloc);
  191. /* These are always called from BH context. See callers in
  192. * tcp_input.c to verify this.
  193. */
  194. /* This is for handling early-kills of TIME_WAIT sockets.
  195. * Warning : consume reference.
  196. * Caller should not access tw anymore.
  197. */
void inet_twsk_deschedule_put(struct inet_timewait_sock *tw)
{
	/* If we cancel the timer before it fires, the handler will never
	 * run, so we must do the teardown it would have done.
	 */
	if (del_timer_sync(&tw->tw_timer))
		inet_twsk_kill(tw);
	/* Release the reference consumed from the caller. */
	inet_twsk_put(tw);
}
EXPORT_SYMBOL(inet_twsk_deschedule_put);
/* Arm (or on @rearm, extend) the timewait timer for @tw. */
void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
{
	/* timeout := RTO * 3.5
	 *
	 * 3.5 = 1+2+0.5 to wait for two retransmits.
	 *
	 * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
	 * our ACK acking that FIN can be lost. If N subsequent retransmitted
	 * FINs (or previous segments) are lost (probability of such event
	 * is p^(N+1), where p is probability to lose single packet and
	 * time to detect the loss is about RTO*(2^N - 1) with exponential
	 * backoff). Normal timewait length is calculated so, that we
	 * waited at least for one retransmitted FIN (maximal RTO is 120sec).
	 * [ BTW Linux, following BSD, violates this requirement waiting
	 * only for 60sec, we should wait at least for 240 secs.
	 * Well, 240 consumes too much of resources 8)
	 * ]
	 * This interval is not reduced to catch old duplicate and
	 * responses to our wandering segments living for two MSLs.
	 * However, if we use PAWS to detect
	 * old duplicates, we can reduce the interval to bounds required
	 * by RTO, rather than MSL. So, if peer understands PAWS, we
	 * kill tw bucket after 3.5*RTO (it is important that this number
	 * is greater than TS tick!) and detect old duplicates with help
	 * of PAWS.
	 */
	if (!rearm) {
		/* Short timeouts count as early kills for SNMP purposes. */
		bool kill = timeo <= 4*HZ;

		__NET_INC_STATS(twsk_net(tw), kill ? LINUX_MIB_TIMEWAITKILLED :
						     LINUX_MIB_TIMEWAITED);
		/* First schedule: the timer must not already be pending. */
		BUG_ON(mod_timer(&tw->tw_timer, jiffies + timeo));
		/* Account this bucket toward sysctl_max_tw_buckets
		 * (undone in inet_twsk_kill()).
		 */
		refcount_inc(&tw->tw_dr->tw_refcount);
	} else {
		/* Only extend a still-pending timer; if it already fired,
		 * teardown is in progress and we must not re-arm it.
		 */
		mod_timer_pending(&tw->tw_timer, jiffies + timeo);
	}
}
EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
/* Walk the whole ehash table and kill every TIME_WAIT socket of @family
 * whose netns refcount has dropped to zero (netns dismantle path).
 */
void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
{
	struct inet_timewait_sock *tw;
	struct sock *sk;
	struct hlist_nulls_node *node;
	unsigned int slot;

	for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
		struct inet_ehash_bucket *head = &hashinfo->ehash[slot];

restart_rcu:
		cond_resched();
		rcu_read_lock();
restart:
		sk_nulls_for_each_rcu(sk, node, &head->chain) {
			if (sk->sk_state != TCP_TIME_WAIT) {
				/* A kernel listener socket might not hold refcnt for net,
				 * so reqsk_timer_handler() could be fired after net is
				 * freed. Userspace listener and reqsk never exist here.
				 */
				if (unlikely(sk->sk_state == TCP_NEW_SYN_RECV &&
					     hashinfo->pernet)) {
					struct request_sock *req = inet_reqsk(sk);

					inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
									  req);
				}
				continue;
			}

			tw = inet_twsk(sk);

			/* Cheap pre-filter without taking a reference: skip
			 * other families and netns still in use.
			 */
			if ((tw->tw_family != family) ||
			    refcount_read(&twsk_net(tw)->ns.count))
				continue;

			/* Socket may be concurrently freed; only proceed if
			 * we can still get a reference.
			 */
			if (unlikely(!refcount_inc_not_zero(&tw->tw_refcnt)))
				continue;

			/* Re-check under the reference: we may have raced with
			 * the slot being reused for another socket.
			 */
			if (unlikely((tw->tw_family != family) ||
				     refcount_read(&twsk_net(tw)->ns.count))) {
				inet_twsk_put(tw);
				goto restart;
			}

			/* Drop RCU before sleeping-capable teardown;
			 * inet_twsk_deschedule_put() needs BHs disabled.
			 */
			rcu_read_unlock();
			local_bh_disable();
			inet_twsk_deschedule_put(tw);
			local_bh_enable();
			goto restart_rcu;
		}
		/* If the nulls value we got at the end of this lookup is
		 * not the expected one, we must restart lookup.
		 * We probably met an item that was moved to another chain.
		 */
		if (get_nulls_value(node) != slot)
			goto restart;
		rcu_read_unlock();
	}
}
EXPORT_SYMBOL_GPL(inet_twsk_purge);