inet_hashtables.c

  1. // SPDX-License-Identifier: GPL-2.0-or-later
  2. /*
  3. * INET An implementation of the TCP/IP protocol suite for the LINUX
  4. * operating system. INET is implemented using the BSD Socket
  5. * interface as the means of communication with the user level.
  6. *
  7. * Generic INET transport hashtables
  8. *
  9. * Authors: Lotsa people, from code originally in tcp
  10. */
  11. #include <linux/module.h>
  12. #include <linux/random.h>
  13. #include <linux/sched.h>
  14. #include <linux/slab.h>
  15. #include <linux/wait.h>
  16. #include <linux/vmalloc.h>
  17. #include <linux/memblock.h>
  18. #include <net/addrconf.h>
  19. #include <net/inet_connection_sock.h>
  20. #include <net/inet_hashtables.h>
  21. #if IS_ENABLED(CONFIG_IPV6)
  22. #include <net/inet6_hashtables.h>
  23. #endif
  24. #include <net/secure_seq.h>
  25. #include <net/ip.h>
  26. #include <net/tcp.h>
  27. #include <net/sock_reuseport.h>
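/* Hash function for the established table ("ehash"): mixes the 4-tuple
 * (laddr, lport, faddr, fport) with a lazily initialized random secret
 * (net_get_random_once()) plus a per-netns salt from net_hash_mix(), so
 * bucket placement cannot be predicted from addresses and ports alone and
 * differs between network namespaces.  __inet_lookup_established() and
 * __inet_check_established() below derive their bucket from this hash.
 */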
  28. static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
  29. const __u16 lport, const __be32 faddr,
  30. const __be16 fport)
  31. {
  32. static u32 inet_ehash_secret __read_mostly;
  33. net_get_random_once(&inet_ehash_secret, sizeof(inet_ehash_secret));
  34. return __inet_ehashfn(laddr, lport, faddr, fport,
  35. inet_ehash_secret + net_hash_mix(net));
  36. }
  37. /* This function handles inet_sock, but also timewait and request sockets
  38. * for IPv4/IPv6.
  39. */
  40. static u32 sk_ehashfn(const struct sock *sk)
  41. {
  42. #if IS_ENABLED(CONFIG_IPV6)
  43. if (sk->sk_family == AF_INET6 &&
  44. !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
  45. return inet6_ehashfn(sock_net(sk),
  46. &sk->sk_v6_rcv_saddr, sk->sk_num,
  47. &sk->sk_v6_daddr, sk->sk_dport);
  48. #endif
  49. return inet_ehashfn(sock_net(sk),
  50. sk->sk_rcv_saddr, sk->sk_num,
  51. sk->sk_daddr, sk->sk_dport);
  52. }
  53. /*
  54. * Allocate and initialize a new local port bind bucket.
  55. * The bindhash mutex for snum's hash chain must be held here.
  56. */
  57. struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
  58. struct net *net,
  59. struct inet_bind_hashbucket *head,
  60. const unsigned short snum,
  61. int l3mdev)
  62. {
  63. struct inet_bind_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
  64. if (tb) {
  65. write_pnet(&tb->ib_net, net);
  66. tb->l3mdev = l3mdev;
  67. tb->port = snum;
  68. tb->fastreuse = 0;
  69. tb->fastreuseport = 0;
  70. INIT_HLIST_HEAD(&tb->owners);
  71. hlist_add_head(&tb->node, &head->chain);
  72. }
  73. return tb;
  74. }
  75. /*
  76. * Caller must hold hashbucket lock for this tb with local BH disabled
  77. */
  78. void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
  79. {
  80. if (hlist_empty(&tb->owners)) {
  81. __hlist_del(&tb->node);
  82. kmem_cache_free(cachep, tb);
  83. }
  84. }
  85. bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net,
  86. unsigned short port, int l3mdev)
  87. {
  88. return net_eq(ib_net(tb), net) && tb->port == port &&
  89. tb->l3mdev == l3mdev;
  90. }
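/* bhash2 ("bind2") buckets complement the plain bind buckets above: a bind
 * bucket is keyed by (net, port, l3mdev) only, while a bind2 bucket also
 * records the local address the socket is bound to, so bind conflicts can
 * be resolved per (port, address) pair.  It additionally keeps timewait
 * sockets on a separate deathrow list.
 */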
  91. static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb,
  92. struct net *net,
  93. struct inet_bind_hashbucket *head,
  94. unsigned short port, int l3mdev,
  95. const struct sock *sk)
  96. {
  97. write_pnet(&tb->ib_net, net);
  98. tb->l3mdev = l3mdev;
  99. tb->port = port;
  100. #if IS_ENABLED(CONFIG_IPV6)
  101. tb->family = sk->sk_family;
  102. if (sk->sk_family == AF_INET6)
  103. tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr;
  104. else
  105. #endif
  106. tb->rcv_saddr = sk->sk_rcv_saddr;
  107. INIT_HLIST_HEAD(&tb->owners);
  108. INIT_HLIST_HEAD(&tb->deathrow);
  109. hlist_add_head(&tb->node, &head->chain);
  110. }
  111. struct inet_bind2_bucket *inet_bind2_bucket_create(struct kmem_cache *cachep,
  112. struct net *net,
  113. struct inet_bind_hashbucket *head,
  114. unsigned short port,
  115. int l3mdev,
  116. const struct sock *sk)
  117. {
  118. struct inet_bind2_bucket *tb = kmem_cache_alloc(cachep, GFP_ATOMIC);
  119. if (tb)
  120. inet_bind2_bucket_init(tb, net, head, port, l3mdev, sk);
  121. return tb;
  122. }
  123. /* Caller must hold hashbucket lock for this tb with local BH disabled */
  124. void inet_bind2_bucket_destroy(struct kmem_cache *cachep, struct inet_bind2_bucket *tb)
  125. {
  126. if (hlist_empty(&tb->owners) && hlist_empty(&tb->deathrow)) {
  127. __hlist_del(&tb->node);
  128. kmem_cache_free(cachep, tb);
  129. }
  130. }
  131. static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2,
  132. const struct sock *sk)
  133. {
  134. #if IS_ENABLED(CONFIG_IPV6)
  135. if (sk->sk_family != tb2->family) {
  136. if (sk->sk_family == AF_INET)
  137. return ipv6_addr_v4mapped(&tb2->v6_rcv_saddr) &&
  138. tb2->v6_rcv_saddr.s6_addr32[3] == sk->sk_rcv_saddr;
  139. return ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr) &&
  140. sk->sk_v6_rcv_saddr.s6_addr32[3] == tb2->rcv_saddr;
  141. }
  142. if (sk->sk_family == AF_INET6)
  143. return ipv6_addr_equal(&tb2->v6_rcv_saddr,
  144. &sk->sk_v6_rcv_saddr);
  145. #endif
  146. return tb2->rcv_saddr == sk->sk_rcv_saddr;
  147. }
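/* Link sk into both bind tables: the bhash bucket (keyed by port) and the
 * bhash2 bucket (keyed by port + local address), and remember both buckets
 * in icsk so __inet_put_port() can drop the references later.
 */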
  148. void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,
  149. struct inet_bind2_bucket *tb2, unsigned short port)
  150. {
  151. inet_sk(sk)->inet_num = port;
  152. sk_add_bind_node(sk, &tb->owners);
  153. inet_csk(sk)->icsk_bind_hash = tb;
  154. sk_add_bind2_node(sk, &tb2->owners);
  155. inet_csk(sk)->icsk_bind2_hash = tb2;
  156. }
  157. /*
  158. * Get rid of any references to a local port held by the given sock.
  159. */
  160. static void __inet_put_port(struct sock *sk)
  161. {
  162. struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
  163. struct inet_bind_hashbucket *head, *head2;
  164. struct net *net = sock_net(sk);
  165. struct inet_bind_bucket *tb;
  166. int bhash;
  167. bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size);
  168. head = &hashinfo->bhash[bhash];
  169. head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num);
  170. spin_lock(&head->lock);
  171. tb = inet_csk(sk)->icsk_bind_hash;
  172. __sk_del_bind_node(sk);
  173. inet_csk(sk)->icsk_bind_hash = NULL;
  174. inet_sk(sk)->inet_num = 0;
  175. inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
  176. spin_lock(&head2->lock);
  177. if (inet_csk(sk)->icsk_bind2_hash) {
  178. struct inet_bind2_bucket *tb2 = inet_csk(sk)->icsk_bind2_hash;
  179. __sk_del_bind2_node(sk);
  180. inet_csk(sk)->icsk_bind2_hash = NULL;
  181. inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
  182. }
  183. spin_unlock(&head2->lock);
  184. spin_unlock(&head->lock);
  185. }
  186. void inet_put_port(struct sock *sk)
  187. {
  188. local_bh_disable();
  189. __inet_put_port(sk);
  190. local_bh_enable();
  191. }
  192. EXPORT_SYMBOL(inet_put_port);
  193. int __inet_inherit_port(const struct sock *sk, struct sock *child)
  194. {
  195. struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk);
  196. unsigned short port = inet_sk(child)->inet_num;
  197. struct inet_bind_hashbucket *head, *head2;
  198. bool created_inet_bind_bucket = false;
  199. struct net *net = sock_net(sk);
  200. bool update_fastreuse = false;
  201. struct inet_bind2_bucket *tb2;
  202. struct inet_bind_bucket *tb;
  203. int bhash, l3mdev;
  204. bhash = inet_bhashfn(net, port, table->bhash_size);
  205. head = &table->bhash[bhash];
  206. head2 = inet_bhashfn_portaddr(table, child, net, port);
  207. spin_lock(&head->lock);
  208. spin_lock(&head2->lock);
  209. tb = inet_csk(sk)->icsk_bind_hash;
  210. tb2 = inet_csk(sk)->icsk_bind2_hash;
  211. if (unlikely(!tb || !tb2)) {
  212. spin_unlock(&head2->lock);
  213. spin_unlock(&head->lock);
  214. return -ENOENT;
  215. }
  216. if (tb->port != port) {
  217. l3mdev = inet_sk_bound_l3mdev(sk);
  218. /* NOTE: using tproxy and redirecting skbs to a proxy
  219. * on a different listener port breaks the assumption
  220. * that the listener socket's icsk_bind_hash is the same
  221. * as that of the child socket. We have to look up or
  222. * create a new bind bucket for the child here. */
  223. inet_bind_bucket_for_each(tb, &head->chain) {
  224. if (inet_bind_bucket_match(tb, net, port, l3mdev))
  225. break;
  226. }
  227. if (!tb) {
  228. tb = inet_bind_bucket_create(table->bind_bucket_cachep,
  229. net, head, port, l3mdev);
  230. if (!tb) {
  231. spin_unlock(&head2->lock);
  232. spin_unlock(&head->lock);
  233. return -ENOMEM;
  234. }
  235. created_inet_bind_bucket = true;
  236. }
  237. update_fastreuse = true;
  238. goto bhash2_find;
  239. } else if (!inet_bind2_bucket_addr_match(tb2, child)) {
  240. l3mdev = inet_sk_bound_l3mdev(sk);
  241. bhash2_find:
  242. tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, child);
  243. if (!tb2) {
  244. tb2 = inet_bind2_bucket_create(table->bind2_bucket_cachep,
  245. net, head2, port,
  246. l3mdev, child);
  247. if (!tb2)
  248. goto error;
  249. }
  250. }
  251. if (update_fastreuse)
  252. inet_csk_update_fastreuse(tb, child);
  253. inet_bind_hash(child, tb, tb2, port);
  254. spin_unlock(&head2->lock);
  255. spin_unlock(&head->lock);
  256. return 0;
  257. error:
  258. if (created_inet_bind_bucket)
  259. inet_bind_bucket_destroy(table->bind_bucket_cachep, tb);
  260. spin_unlock(&head2->lock);
  261. spin_unlock(&head->lock);
  262. return -ENOMEM;
  263. }
  264. EXPORT_SYMBOL_GPL(__inet_inherit_port);
  265. static struct inet_listen_hashbucket *
  266. inet_lhash2_bucket_sk(struct inet_hashinfo *h, struct sock *sk)
  267. {
  268. u32 hash;
  269. #if IS_ENABLED(CONFIG_IPV6)
  270. if (sk->sk_family == AF_INET6)
  271. hash = ipv6_portaddr_hash(sock_net(sk),
  272. &sk->sk_v6_rcv_saddr,
  273. inet_sk(sk)->inet_num);
  274. else
  275. #endif
  276. hash = ipv4_portaddr_hash(sock_net(sk),
  277. inet_sk(sk)->inet_rcv_saddr,
  278. inet_sk(sk)->inet_num);
  279. return inet_lhash2_bucket(h, hash);
  280. }
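/* Score a candidate listener for an incoming IPv4 packet: -1 means "does
 * not match" (wrong netns, port, address or bound device); otherwise higher
 * is better.  The base score is 1, a socket bound to a specific device
 * scores 2, and one extra point each is added for a true AF_INET socket
 * (as opposed to a dual-stack AF_INET6 listener) and for a socket whose
 * sk_incoming_cpu matches the CPU handling this packet.  For example, an
 * AF_INET listener bound to the ingress device and last served on this CPU
 * scores 4 and beats an unbound dual-stack listener scoring 1.
 */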
  281. static inline int compute_score(struct sock *sk, struct net *net,
  282. const unsigned short hnum, const __be32 daddr,
  283. const int dif, const int sdif)
  284. {
  285. int score = -1;
  286. if (net_eq(sock_net(sk), net) && sk->sk_num == hnum &&
  287. !ipv6_only_sock(sk)) {
  288. if (sk->sk_rcv_saddr != daddr)
  289. return -1;
  290. if (!inet_sk_bound_dev_eq(net, sk->sk_bound_dev_if, dif, sdif))
  291. return -1;
  292. score = sk->sk_bound_dev_if ? 2 : 1;
  293. if (sk->sk_family == PF_INET)
  294. score++;
  295. if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id())
  296. score++;
  297. }
  298. return score;
  299. }
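/* If the selected socket has SO_REUSEPORT enabled, hash the packet's
 * 4-tuple and let reuseport_select_sock() pick one member of the group;
 * a NULL return means no group selection was made and the caller keeps
 * its own choice.
 */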
  300. static inline struct sock *lookup_reuseport(struct net *net, struct sock *sk,
  301. struct sk_buff *skb, int doff,
  302. __be32 saddr, __be16 sport,
  303. __be32 daddr, unsigned short hnum)
  304. {
  305. struct sock *reuse_sk = NULL;
  306. u32 phash;
  307. if (sk->sk_reuseport) {
  308. phash = inet_ehashfn(net, daddr, hnum, saddr, sport);
  309. reuse_sk = reuseport_select_sock(sk, phash, skb, doff);
  310. }
  311. return reuse_sk;
  312. }
  313. /*
314. * There are some nice properties to exploit here. The BSD API
  315. * does not allow a listening sock to specify the remote port nor the
  316. * remote address for the connection. So always assume those are both
  317. * wildcarded during the search since they can never be otherwise.
  318. */
  319. /* called with rcu_read_lock() : No refcount taken on the socket */
  320. static struct sock *inet_lhash2_lookup(struct net *net,
  321. struct inet_listen_hashbucket *ilb2,
  322. struct sk_buff *skb, int doff,
  323. const __be32 saddr, __be16 sport,
  324. const __be32 daddr, const unsigned short hnum,
  325. const int dif, const int sdif)
  326. {
  327. struct sock *sk, *result = NULL;
  328. struct hlist_nulls_node *node;
  329. int score, hiscore = 0;
  330. sk_nulls_for_each_rcu(sk, node, &ilb2->nulls_head) {
  331. score = compute_score(sk, net, hnum, daddr, dif, sdif);
  332. if (score > hiscore) {
  333. result = lookup_reuseport(net, sk, skb, doff,
  334. saddr, sport, daddr, hnum);
  335. if (result)
  336. return result;
  337. result = sk;
  338. hiscore = score;
  339. }
  340. }
  341. return result;
  342. }
  343. static inline struct sock *inet_lookup_run_bpf(struct net *net,
  344. struct inet_hashinfo *hashinfo,
  345. struct sk_buff *skb, int doff,
  346. __be32 saddr, __be16 sport,
  347. __be32 daddr, u16 hnum, const int dif)
  348. {
  349. struct sock *sk, *reuse_sk;
  350. bool no_reuseport;
  351. if (hashinfo != net->ipv4.tcp_death_row.hashinfo)
  352. return NULL; /* only TCP is supported */
  353. no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport,
  354. daddr, hnum, dif, &sk);
  355. if (no_reuseport || IS_ERR_OR_NULL(sk))
  356. return sk;
  357. reuse_sk = lookup_reuseport(net, sk, skb, doff, saddr, sport, daddr, hnum);
  358. if (reuse_sk)
  359. sk = reuse_sk;
  360. return sk;
  361. }
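/* Listener lookup order: give an attached BPF sk_lookup program a chance
 * to redirect first, then search the lhash2 bucket for the exact
 * destination address, and finally fall back to the INADDR_ANY bucket.
 * Example: a SYN to 192.0.2.1:80 first probes listeners bound to
 * 192.0.2.1:80 and only then listeners bound to 0.0.0.0:80.
 */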
  362. struct sock *__inet_lookup_listener(struct net *net,
  363. struct inet_hashinfo *hashinfo,
  364. struct sk_buff *skb, int doff,
  365. const __be32 saddr, __be16 sport,
  366. const __be32 daddr, const unsigned short hnum,
  367. const int dif, const int sdif)
  368. {
  369. struct inet_listen_hashbucket *ilb2;
  370. struct sock *result = NULL;
  371. unsigned int hash2;
  372. /* Lookup redirect from BPF */
  373. if (static_branch_unlikely(&bpf_sk_lookup_enabled)) {
  374. result = inet_lookup_run_bpf(net, hashinfo, skb, doff,
  375. saddr, sport, daddr, hnum, dif);
  376. if (result)
  377. goto done;
  378. }
  379. hash2 = ipv4_portaddr_hash(net, daddr, hnum);
  380. ilb2 = inet_lhash2_bucket(hashinfo, hash2);
  381. result = inet_lhash2_lookup(net, ilb2, skb, doff,
  382. saddr, sport, daddr, hnum,
  383. dif, sdif);
  384. if (result)
  385. goto done;
  386. /* Lookup lhash2 with INADDR_ANY */
  387. hash2 = ipv4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
  388. ilb2 = inet_lhash2_bucket(hashinfo, hash2);
  389. result = inet_lhash2_lookup(net, ilb2, skb, doff,
  390. saddr, sport, htonl(INADDR_ANY), hnum,
  391. dif, sdif);
  392. done:
  393. if (IS_ERR(result))
  394. return NULL;
  395. return result;
  396. }
  397. EXPORT_SYMBOL_GPL(__inet_lookup_listener);
398. /* All sockets share a common refcount, but have different destructors */
  399. void sock_gen_put(struct sock *sk)
  400. {
  401. if (!refcount_dec_and_test(&sk->sk_refcnt))
  402. return;
  403. if (sk->sk_state == TCP_TIME_WAIT)
  404. inet_twsk_free(inet_twsk(sk));
  405. else if (sk->sk_state == TCP_NEW_SYN_RECV)
  406. reqsk_free(inet_reqsk(sk));
  407. else
  408. sk_free(sk);
  409. }
  410. EXPORT_SYMBOL_GPL(sock_gen_put);
  411. void sock_edemux(struct sk_buff *skb)
  412. {
  413. sock_gen_put(skb->sk);
  414. }
  415. EXPORT_SYMBOL(sock_edemux);
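/* Lockless (RCU) lookup in the established table.  A match is confirmed
 * again after taking a reference, because the socket may have been recycled
 * for a different flow in between; in that case the reference is dropped
 * and the walk restarts.  If the terminating nulls value does not belong to
 * this slot, an entry moved to another bucket during the walk and the walk
 * restarts as well.
 */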
  416. struct sock *__inet_lookup_established(struct net *net,
  417. struct inet_hashinfo *hashinfo,
  418. const __be32 saddr, const __be16 sport,
  419. const __be32 daddr, const u16 hnum,
  420. const int dif, const int sdif)
  421. {
  422. INET_ADDR_COOKIE(acookie, saddr, daddr);
  423. const __portpair ports = INET_COMBINED_PORTS(sport, hnum);
  424. struct sock *sk;
  425. const struct hlist_nulls_node *node;
426. /* Optimize here for direct hit; only listening connections can
427. * have wildcards anyway.
  428. */
  429. unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
  430. unsigned int slot = hash & hashinfo->ehash_mask;
  431. struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
  432. begin:
  433. sk_nulls_for_each_rcu(sk, node, &head->chain) {
  434. if (sk->sk_hash != hash)
  435. continue;
  436. if (likely(inet_match(net, sk, acookie, ports, dif, sdif))) {
  437. if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
  438. goto out;
  439. if (unlikely(!inet_match(net, sk, acookie,
  440. ports, dif, sdif))) {
  441. sock_gen_put(sk);
  442. goto begin;
  443. }
  444. goto found;
  445. }
  446. }
  447. /*
448. * If the nulls value we got at the end of this lookup is
449. * not the expected one, we must restart the lookup.
450. * We probably raced with an item that was moved to another chain.
  451. */
  452. if (get_nulls_value(node) != slot)
  453. goto begin;
  454. out:
  455. sk = NULL;
  456. found:
  457. return sk;
  458. }
  459. EXPORT_SYMBOL_GPL(__inet_lookup_established);
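/* Used on connect() once a candidate source port is chosen: verifies that
 * the resulting (saddr, sport, daddr, dport) tuple is not already in use by
 * an established socket.  A TIME_WAIT occupant may have its tuple reused
 * when twsk_unique() allows it; the timewait socket is then handed back via
 * *twp, or descheduled here if the caller did not ask for it.
 */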
  460. /* called with local bh disabled */
  461. static int __inet_check_established(struct inet_timewait_death_row *death_row,
  462. struct sock *sk, __u16 lport,
  463. struct inet_timewait_sock **twp)
  464. {
  465. struct inet_hashinfo *hinfo = death_row->hashinfo;
  466. struct inet_sock *inet = inet_sk(sk);
  467. __be32 daddr = inet->inet_rcv_saddr;
  468. __be32 saddr = inet->inet_daddr;
  469. int dif = sk->sk_bound_dev_if;
  470. struct net *net = sock_net(sk);
  471. int sdif = l3mdev_master_ifindex_by_index(net, dif);
  472. INET_ADDR_COOKIE(acookie, saddr, daddr);
  473. const __portpair ports = INET_COMBINED_PORTS(inet->inet_dport, lport);
  474. unsigned int hash = inet_ehashfn(net, daddr, lport,
  475. saddr, inet->inet_dport);
  476. struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
  477. spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
  478. struct sock *sk2;
  479. const struct hlist_nulls_node *node;
  480. struct inet_timewait_sock *tw = NULL;
  481. spin_lock(lock);
  482. sk_nulls_for_each(sk2, node, &head->chain) {
  483. if (sk2->sk_hash != hash)
  484. continue;
  485. if (likely(inet_match(net, sk2, acookie, ports, dif, sdif))) {
  486. if (sk2->sk_state == TCP_TIME_WAIT) {
  487. tw = inet_twsk(sk2);
  488. if (twsk_unique(sk, sk2, twp))
  489. break;
  490. }
  491. goto not_unique;
  492. }
  493. }
494. /* Must record num and sport now. Otherwise we will see a socket
495. * with a funny identity in the hash table.
  496. */
  497. inet->inet_num = lport;
  498. inet->inet_sport = htons(lport);
  499. sk->sk_hash = hash;
  500. WARN_ON(!sk_unhashed(sk));
  501. __sk_nulls_add_node_rcu(sk, &head->chain);
  502. if (tw) {
  503. sk_nulls_del_node_init_rcu((struct sock *)tw);
  504. __NET_INC_STATS(net, LINUX_MIB_TIMEWAITRECYCLED);
  505. }
  506. spin_unlock(lock);
  507. sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
  508. if (twp) {
  509. *twp = tw;
  510. } else if (tw) {
  511. /* Silly. Should hash-dance instead... */
  512. inet_twsk_deschedule_put(tw);
  513. }
  514. return 0;
  515. not_unique:
  516. spin_unlock(lock);
  517. return -EADDRNOTAVAIL;
  518. }
  519. static u64 inet_sk_port_offset(const struct sock *sk)
  520. {
  521. const struct inet_sock *inet = inet_sk(sk);
  522. return secure_ipv4_port_ephemeral(inet->inet_rcv_saddr,
  523. inet->inet_daddr,
  524. inet->inet_dport);
  525. }
526. /* Searches for an existing socket in the ehash bucket list.
  527. * Returns true if found, false otherwise.
  528. */
  529. static bool inet_ehash_lookup_by_sk(struct sock *sk,
  530. struct hlist_nulls_head *list)
  531. {
  532. const __portpair ports = INET_COMBINED_PORTS(sk->sk_dport, sk->sk_num);
  533. const int sdif = sk->sk_bound_dev_if;
  534. const int dif = sk->sk_bound_dev_if;
  535. const struct hlist_nulls_node *node;
  536. struct net *net = sock_net(sk);
  537. struct sock *esk;
  538. INET_ADDR_COOKIE(acookie, sk->sk_daddr, sk->sk_rcv_saddr);
  539. sk_nulls_for_each_rcu(esk, node, list) {
  540. if (esk->sk_hash != sk->sk_hash)
  541. continue;
  542. if (sk->sk_family == AF_INET) {
  543. if (unlikely(inet_match(net, esk, acookie,
  544. ports, dif, sdif))) {
  545. return true;
  546. }
  547. }
  548. #if IS_ENABLED(CONFIG_IPV6)
  549. else if (sk->sk_family == AF_INET6) {
  550. if (unlikely(inet6_match(net, esk,
  551. &sk->sk_v6_daddr,
  552. &sk->sk_v6_rcv_saddr,
  553. ports, dif, sdif))) {
  554. return true;
  555. }
  556. }
  557. #endif
  558. }
  559. return false;
  560. }
561. /* Insert a socket into ehash, and possibly remove another one
562. * (the other one can be a SYN_RECV or TIMEWAIT socket).
563. * If a matching socket already exists, sk is not inserted
564. * and the found_dup_sk parameter is set to true.
  565. */
  566. bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk)
  567. {
  568. struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
  569. struct inet_ehash_bucket *head;
  570. struct hlist_nulls_head *list;
  571. spinlock_t *lock;
  572. bool ret = true;
  573. WARN_ON_ONCE(!sk_unhashed(sk));
  574. sk->sk_hash = sk_ehashfn(sk);
  575. head = inet_ehash_bucket(hashinfo, sk->sk_hash);
  576. list = &head->chain;
  577. lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
  578. spin_lock(lock);
  579. if (osk) {
  580. WARN_ON_ONCE(sk->sk_hash != osk->sk_hash);
  581. ret = sk_nulls_del_node_init_rcu(osk);
  582. } else if (found_dup_sk) {
  583. *found_dup_sk = inet_ehash_lookup_by_sk(sk, list);
  584. if (*found_dup_sk)
  585. ret = false;
  586. }
  587. if (ret)
  588. __sk_nulls_add_node_rcu(sk, list);
  589. spin_unlock(lock);
  590. return ret;
  591. }
  592. bool inet_ehash_nolisten(struct sock *sk, struct sock *osk, bool *found_dup_sk)
  593. {
  594. bool ok = inet_ehash_insert(sk, osk, found_dup_sk);
  595. if (ok) {
  596. sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
  597. } else {
  598. this_cpu_inc(*sk->sk_prot->orphan_count);
  599. inet_sk_set_state(sk, TCP_CLOSE);
  600. sock_set_flag(sk, SOCK_DEAD);
  601. inet_csk_destroy_sock(sk);
  602. }
  603. return ok;
  604. }
  605. EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
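/* Attach a listening socket to a SO_REUSEPORT group: walk the listen bucket
 * for a compatible socket (same family, device, bind bucket, owner uid and
 * local address) and join its group, otherwise allocate a new group with
 * this socket as the first member.
 */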
  606. static int inet_reuseport_add_sock(struct sock *sk,
  607. struct inet_listen_hashbucket *ilb)
  608. {
  609. struct inet_bind_bucket *tb = inet_csk(sk)->icsk_bind_hash;
  610. const struct hlist_nulls_node *node;
  611. struct sock *sk2;
  612. kuid_t uid = sock_i_uid(sk);
  613. sk_nulls_for_each_rcu(sk2, node, &ilb->nulls_head) {
  614. if (sk2 != sk &&
  615. sk2->sk_family == sk->sk_family &&
  616. ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
  617. sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
  618. inet_csk(sk2)->icsk_bind_hash == tb &&
  619. sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
  620. inet_rcv_saddr_equal(sk, sk2, false))
  621. return reuseport_add_sock(sk, sk2,
  622. inet_rcv_saddr_any(sk));
  623. }
  624. return reuseport_alloc(sk, inet_rcv_saddr_any(sk));
  625. }
  626. int __inet_hash(struct sock *sk, struct sock *osk)
  627. {
  628. struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
  629. struct inet_listen_hashbucket *ilb2;
  630. int err = 0;
  631. if (sk->sk_state != TCP_LISTEN) {
  632. local_bh_disable();
  633. inet_ehash_nolisten(sk, osk, NULL);
  634. local_bh_enable();
  635. return 0;
  636. }
  637. WARN_ON(!sk_unhashed(sk));
  638. ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
  639. spin_lock(&ilb2->lock);
  640. if (sk->sk_reuseport) {
  641. err = inet_reuseport_add_sock(sk, ilb2);
  642. if (err)
  643. goto unlock;
  644. }
  645. sock_set_flag(sk, SOCK_RCU_FREE);
  646. if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
  647. sk->sk_family == AF_INET6)
  648. __sk_nulls_add_node_tail_rcu(sk, &ilb2->nulls_head);
  649. else
  650. __sk_nulls_add_node_rcu(sk, &ilb2->nulls_head);
  651. sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
  652. unlock:
  653. spin_unlock(&ilb2->lock);
  654. return err;
  655. }
  656. EXPORT_SYMBOL(__inet_hash);
  657. int inet_hash(struct sock *sk)
  658. {
  659. int err = 0;
  660. if (sk->sk_state != TCP_CLOSE)
  661. err = __inet_hash(sk, NULL);
  662. return err;
  663. }
  664. EXPORT_SYMBOL_GPL(inet_hash);
  665. void inet_unhash(struct sock *sk)
  666. {
  667. struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk);
  668. if (sk_unhashed(sk))
  669. return;
  670. if (sk->sk_state == TCP_LISTEN) {
  671. struct inet_listen_hashbucket *ilb2;
  672. ilb2 = inet_lhash2_bucket_sk(hashinfo, sk);
  673. /* Don't disable bottom halves while acquiring the lock to
  674. * avoid circular locking dependency on PREEMPT_RT.
  675. */
  676. spin_lock(&ilb2->lock);
  677. if (sk_unhashed(sk)) {
  678. spin_unlock(&ilb2->lock);
  679. return;
  680. }
  681. if (rcu_access_pointer(sk->sk_reuseport_cb))
  682. reuseport_stop_listen_sock(sk);
  683. __sk_nulls_del_node_init_rcu(sk);
  684. sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
  685. spin_unlock(&ilb2->lock);
  686. } else {
  687. spinlock_t *lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
  688. spin_lock_bh(lock);
  689. if (sk_unhashed(sk)) {
  690. spin_unlock_bh(lock);
  691. return;
  692. }
  693. __sk_nulls_del_node_init_rcu(sk);
  694. sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
  695. spin_unlock_bh(lock);
  696. }
  697. }
  698. EXPORT_SYMBOL_GPL(inet_unhash);
  699. static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb,
  700. const struct net *net, unsigned short port,
  701. int l3mdev, const struct sock *sk)
  702. {
  703. if (!net_eq(ib2_net(tb), net) || tb->port != port ||
  704. tb->l3mdev != l3mdev)
  705. return false;
  706. return inet_bind2_bucket_addr_match(tb, sk);
  707. }
  708. bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const struct net *net,
  709. unsigned short port, int l3mdev, const struct sock *sk)
  710. {
  711. if (!net_eq(ib2_net(tb), net) || tb->port != port ||
  712. tb->l3mdev != l3mdev)
  713. return false;
  714. #if IS_ENABLED(CONFIG_IPV6)
  715. if (sk->sk_family != tb->family) {
  716. if (sk->sk_family == AF_INET)
  717. return ipv6_addr_any(&tb->v6_rcv_saddr) ||
  718. ipv6_addr_v4mapped_any(&tb->v6_rcv_saddr);
  719. return false;
  720. }
  721. if (sk->sk_family == AF_INET6)
  722. return ipv6_addr_any(&tb->v6_rcv_saddr);
  723. #endif
  724. return tb->rcv_saddr == 0;
  725. }
  726. /* The socket's bhash2 hashbucket spinlock must be held when this is called */
  727. struct inet_bind2_bucket *
  728. inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net *net,
  729. unsigned short port, int l3mdev, const struct sock *sk)
  730. {
  731. struct inet_bind2_bucket *bhash2 = NULL;
  732. inet_bind_bucket_for_each(bhash2, &head->chain)
  733. if (inet_bind2_bucket_match(bhash2, net, port, l3mdev, sk))
  734. break;
  735. return bhash2;
  736. }
  737. struct inet_bind_hashbucket *
  738. inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port)
  739. {
  740. struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
  741. u32 hash;
  742. #if IS_ENABLED(CONFIG_IPV6)
  743. if (sk->sk_family == AF_INET6)
  744. hash = ipv6_portaddr_hash(net, &in6addr_any, port);
  745. else
  746. #endif
  747. hash = ipv4_portaddr_hash(net, 0, port);
  748. return &hinfo->bhash2[hash & (hinfo->bhash_size - 1)];
  749. }
  750. static void inet_update_saddr(struct sock *sk, void *saddr, int family)
  751. {
  752. if (family == AF_INET) {
  753. inet_sk(sk)->inet_saddr = *(__be32 *)saddr;
  754. sk_rcv_saddr_set(sk, inet_sk(sk)->inet_saddr);
  755. }
  756. #if IS_ENABLED(CONFIG_IPV6)
  757. else {
  758. sk->sk_v6_rcv_saddr = *(struct in6_addr *)saddr;
  759. }
  760. #endif
  761. }
  762. static int __inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family, bool reset)
  763. {
  764. struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk);
  765. struct inet_bind_hashbucket *head, *head2;
  766. struct inet_bind2_bucket *tb2, *new_tb2;
  767. int l3mdev = inet_sk_bound_l3mdev(sk);
  768. int port = inet_sk(sk)->inet_num;
  769. struct net *net = sock_net(sk);
  770. int bhash;
  771. if (!inet_csk(sk)->icsk_bind2_hash) {
  772. /* Not bind()ed before. */
  773. if (reset)
  774. inet_reset_saddr(sk);
  775. else
  776. inet_update_saddr(sk, saddr, family);
  777. return 0;
  778. }
  779. /* Allocate a bind2 bucket ahead of time to avoid permanently putting
  780. * the bhash2 table in an inconsistent state if a new tb2 bucket
  781. * allocation fails.
  782. */
  783. new_tb2 = kmem_cache_alloc(hinfo->bind2_bucket_cachep, GFP_ATOMIC);
  784. if (!new_tb2) {
  785. if (reset) {
786. /* The (INADDR_ANY, port) bucket might have already
787. * been freed, in which case we cannot fix up icsk_bind2_hash,
788. * so we give up and unlink sk from bhash/bhash2 to avoid
789. * leaving bhash2 in an inconsistent state.
  790. */
  791. inet_put_port(sk);
  792. inet_reset_saddr(sk);
  793. }
  794. return -ENOMEM;
  795. }
  796. bhash = inet_bhashfn(net, port, hinfo->bhash_size);
  797. head = &hinfo->bhash[bhash];
  798. head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
  799. /* If we change saddr locklessly, another thread
800. * iterating over bhash might see a corrupted address.
  801. */
  802. spin_lock_bh(&head->lock);
  803. spin_lock(&head2->lock);
  804. __sk_del_bind2_node(sk);
  805. inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, inet_csk(sk)->icsk_bind2_hash);
  806. spin_unlock(&head2->lock);
  807. if (reset)
  808. inet_reset_saddr(sk);
  809. else
  810. inet_update_saddr(sk, saddr, family);
  811. head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
  812. spin_lock(&head2->lock);
  813. tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
  814. if (!tb2) {
  815. tb2 = new_tb2;
  816. inet_bind2_bucket_init(tb2, net, head2, port, l3mdev, sk);
  817. }
  818. sk_add_bind2_node(sk, &tb2->owners);
  819. inet_csk(sk)->icsk_bind2_hash = tb2;
  820. spin_unlock(&head2->lock);
  821. spin_unlock_bh(&head->lock);
  822. if (tb2 != new_tb2)
  823. kmem_cache_free(hinfo->bind2_bucket_cachep, new_tb2);
  824. return 0;
  825. }
  826. int inet_bhash2_update_saddr(struct sock *sk, void *saddr, int family)
  827. {
  828. return __inet_bhash2_update_saddr(sk, saddr, family, false);
  829. }
  830. EXPORT_SYMBOL_GPL(inet_bhash2_update_saddr);
  831. void inet_bhash2_reset_saddr(struct sock *sk)
  832. {
  833. if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK))
  834. __inet_bhash2_update_saddr(sk, NULL, 0, true);
  835. }
  836. EXPORT_SYMBOL_GPL(inet_bhash2_reset_saddr);
837. /* RFC 6056 3.3.4. Algorithm 4: Double-Hash Port Selection Algorithm
838. * Note that we use 32-bit integers (vs the RFC's 'short integers')
839. * because 2^16 is not a multiple of num_ephemeral, a property
840. * a clever attacker might exploit.
841. *
842. * The RFC claims that TABLE_LENGTH=10 buckets gives an improvement, but
843. * attacks have since been demonstrated, so we use 65536 buckets by default
844. * instead to give more isolation and privacy, at the expense of 256 KB
845. * of kernel memory.
  846. */
  847. #define INET_TABLE_PERTURB_SIZE (1 << CONFIG_INET_TABLE_PERTURB_ORDER)
  848. static u32 *table_perturb;
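/* Overview of the double-hash port selection implemented below, using the
 * default local port range [32768, 60999] mentioned in the code as an
 * illustration.  port_offset is a secure hash of (local address,
 * destination address, destination port), and the starting point is
 *
 *   index  = port_offset & (INET_TABLE_PERTURB_SIZE - 1);
 *   offset = (table_perturb[index] + (port_offset >> 32)) % remaining;
 *
 * The first scan forces offset even and tries ports of one parity, stepping
 * by 2 and wrapping inside [low, high); the second scan covers the other
 * parity.  inet_csk_get_port() starts from the opposite parity (see the
 * comment in the loop), so bind() and connect() tend not to fight over the
 * same ports.  On success table_perturb[index] is advanced so the next
 * connect with the same (local address, destination) starts further along.
 */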
  849. int __inet_hash_connect(struct inet_timewait_death_row *death_row,
  850. struct sock *sk, u64 port_offset,
  851. int (*check_established)(struct inet_timewait_death_row *,
  852. struct sock *, __u16, struct inet_timewait_sock **))
  853. {
  854. struct inet_hashinfo *hinfo = death_row->hashinfo;
  855. struct inet_bind_hashbucket *head, *head2;
  856. struct inet_timewait_sock *tw = NULL;
  857. int port = inet_sk(sk)->inet_num;
  858. struct net *net = sock_net(sk);
  859. struct inet_bind2_bucket *tb2;
  860. struct inet_bind_bucket *tb;
  861. bool tb_created = false;
  862. u32 remaining, offset;
  863. int ret, i, low, high;
  864. int l3mdev;
  865. u32 index;
  866. if (port) {
  867. local_bh_disable();
  868. ret = check_established(death_row, sk, port, NULL);
  869. local_bh_enable();
  870. return ret;
  871. }
  872. l3mdev = inet_sk_bound_l3mdev(sk);
  873. inet_sk_get_local_port_range(sk, &low, &high);
  874. high++; /* [32768, 60999] -> [32768, 61000[ */
  875. remaining = high - low;
  876. if (likely(remaining > 1))
  877. remaining &= ~1U;
  878. get_random_sleepable_once(table_perturb,
  879. INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
  880. index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);
  881. offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
  882. offset %= remaining;
883. /* In the first pass we try ports of the same parity as @low.
884. * inet_csk_get_port() makes the opposite choice.
  885. */
  886. offset &= ~1U;
  887. other_parity_scan:
  888. port = low + offset;
  889. for (i = 0; i < remaining; i += 2, port += 2) {
  890. if (unlikely(port >= high))
  891. port -= remaining;
  892. if (inet_is_local_reserved_port(net, port))
  893. continue;
  894. head = &hinfo->bhash[inet_bhashfn(net, port,
  895. hinfo->bhash_size)];
  896. spin_lock_bh(&head->lock);
  897. /* Does not bother with rcv_saddr checks, because
  898. * the established check is already unique enough.
  899. */
  900. inet_bind_bucket_for_each(tb, &head->chain) {
  901. if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
  902. if (tb->fastreuse >= 0 ||
  903. tb->fastreuseport >= 0)
  904. goto next_port;
  905. WARN_ON(hlist_empty(&tb->owners));
  906. if (!check_established(death_row, sk,
  907. port, &tw))
  908. goto ok;
  909. goto next_port;
  910. }
  911. }
  912. tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
  913. net, head, port, l3mdev);
  914. if (!tb) {
  915. spin_unlock_bh(&head->lock);
  916. return -ENOMEM;
  917. }
  918. tb_created = true;
  919. tb->fastreuse = -1;
  920. tb->fastreuseport = -1;
  921. goto ok;
  922. next_port:
  923. spin_unlock_bh(&head->lock);
  924. cond_resched();
  925. }
  926. offset++;
  927. if ((offset & 1) && remaining > 1)
  928. goto other_parity_scan;
  929. return -EADDRNOTAVAIL;
  930. ok:
  931. /* Find the corresponding tb2 bucket since we need to
932. * add the socket to the bhash2 table as well.
  933. */
  934. head2 = inet_bhashfn_portaddr(hinfo, sk, net, port);
  935. spin_lock(&head2->lock);
  936. tb2 = inet_bind2_bucket_find(head2, net, port, l3mdev, sk);
  937. if (!tb2) {
  938. tb2 = inet_bind2_bucket_create(hinfo->bind2_bucket_cachep, net,
  939. head2, port, l3mdev, sk);
  940. if (!tb2)
  941. goto error;
  942. }
  943. /* Here we want to add a little bit of randomness to the next source
944. * port that will be chosen. We use a max() with a random value here so that
945. * under low contention the randomness is maximal and under high contention
946. * it may be nonexistent.
  947. */
  948. i = max_t(int, i, prandom_u32_max(8) * 2);
  949. WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);
  950. /* Head lock still held and bh's disabled */
  951. inet_bind_hash(sk, tb, tb2, port);
  952. if (sk_unhashed(sk)) {
  953. inet_sk(sk)->inet_sport = htons(port);
  954. inet_ehash_nolisten(sk, (struct sock *)tw, NULL);
  955. }
  956. if (tw)
  957. inet_twsk_bind_unhash(tw, hinfo);
  958. spin_unlock(&head2->lock);
  959. spin_unlock(&head->lock);
  960. if (tw)
  961. inet_twsk_deschedule_put(tw);
  962. local_bh_enable();
  963. return 0;
  964. error:
  965. spin_unlock(&head2->lock);
  966. if (tb_created)
  967. inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
  968. spin_unlock_bh(&head->lock);
  969. return -ENOMEM;
  970. }
  971. /*
  972. * Bind a port for a connect operation and hash it.
  973. */
  974. int inet_hash_connect(struct inet_timewait_death_row *death_row,
  975. struct sock *sk)
  976. {
  977. u64 port_offset = 0;
  978. if (!inet_sk(sk)->inet_num)
  979. port_offset = inet_sk_port_offset(sk);
  980. return __inet_hash_connect(death_row, sk, port_offset,
  981. __inet_check_established);
  982. }
  983. EXPORT_SYMBOL_GPL(inet_hash_connect);
  984. static void init_hashinfo_lhash2(struct inet_hashinfo *h)
  985. {
  986. int i;
  987. for (i = 0; i <= h->lhash2_mask; i++) {
  988. spin_lock_init(&h->lhash2[i].lock);
  989. INIT_HLIST_NULLS_HEAD(&h->lhash2[i].nulls_head,
  990. i + LISTENING_NULLS_BASE);
  991. }
  992. }
  993. void __init inet_hashinfo2_init(struct inet_hashinfo *h, const char *name,
  994. unsigned long numentries, int scale,
  995. unsigned long low_limit,
  996. unsigned long high_limit)
  997. {
  998. h->lhash2 = alloc_large_system_hash(name,
  999. sizeof(*h->lhash2),
  1000. numentries,
  1001. scale,
  1002. 0,
  1003. NULL,
  1004. &h->lhash2_mask,
  1005. low_limit,
  1006. high_limit);
  1007. init_hashinfo_lhash2(h);
  1008. /* this one is used for source ports of outgoing connections */
  1009. table_perturb = alloc_large_system_hash("Table-perturb",
  1010. sizeof(*table_perturb),
  1011. INET_TABLE_PERTURB_SIZE,
  1012. 0, 0, NULL, NULL,
  1013. INET_TABLE_PERTURB_SIZE,
  1014. INET_TABLE_PERTURB_SIZE);
  1015. }
  1016. int inet_hashinfo2_init_mod(struct inet_hashinfo *h)
  1017. {
  1018. h->lhash2 = kmalloc_array(INET_LHTABLE_SIZE, sizeof(*h->lhash2), GFP_KERNEL);
  1019. if (!h->lhash2)
  1020. return -ENOMEM;
  1021. h->lhash2_mask = INET_LHTABLE_SIZE - 1;
  1022. /* INET_LHTABLE_SIZE must be a power of 2 */
  1023. BUG_ON(INET_LHTABLE_SIZE & h->lhash2_mask);
  1024. init_hashinfo_lhash2(h);
  1025. return 0;
  1026. }
  1027. EXPORT_SYMBOL_GPL(inet_hashinfo2_init_mod);
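/* Size the ehash lock array: roughly two cache lines worth of spinlocks per
 * possible CPU, rounded up to a power of two and capped at the number of
 * hash buckets.  With 64-byte cache lines and 4-byte spinlocks (a common
 * configuration) that is 32 locks per CPU before rounding.
 */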
  1028. int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo)
  1029. {
  1030. unsigned int locksz = sizeof(spinlock_t);
  1031. unsigned int i, nblocks = 1;
  1032. if (locksz != 0) {
  1033. /* allocate 2 cache lines or at least one spinlock per cpu */
  1034. nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U);
  1035. nblocks = roundup_pow_of_two(nblocks * num_possible_cpus());
  1036. /* no more locks than number of hash buckets */
  1037. nblocks = min(nblocks, hashinfo->ehash_mask + 1);
  1038. hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL);
  1039. if (!hashinfo->ehash_locks)
  1040. return -ENOMEM;
  1041. for (i = 0; i < nblocks; i++)
  1042. spin_lock_init(&hashinfo->ehash_locks[i]);
  1043. }
  1044. hashinfo->ehash_locks_mask = nblocks - 1;
  1045. return 0;
  1046. }
  1047. EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc);
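/* Build a per-netns copy of the template hashinfo with its own, separately
 * sized established table: duplicate the template, allocate ehash_entries
 * buckets (ehash_entries must be a power of two, since it becomes a mask),
 * allocate the lock array, and mark the result as pernet so
 * inet_pernet_hashinfo_free() knows it owns the memory.
 */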
  1048. struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo,
  1049. unsigned int ehash_entries)
  1050. {
  1051. struct inet_hashinfo *new_hashinfo;
  1052. int i;
  1053. new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL);
  1054. if (!new_hashinfo)
  1055. goto err;
  1056. new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket),
  1057. GFP_KERNEL_ACCOUNT);
  1058. if (!new_hashinfo->ehash)
  1059. goto free_hashinfo;
  1060. new_hashinfo->ehash_mask = ehash_entries - 1;
  1061. if (inet_ehash_locks_alloc(new_hashinfo))
  1062. goto free_ehash;
  1063. for (i = 0; i < ehash_entries; i++)
  1064. INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i);
  1065. new_hashinfo->pernet = true;
  1066. return new_hashinfo;
  1067. free_ehash:
  1068. vfree(new_hashinfo->ehash);
  1069. free_hashinfo:
  1070. kfree(new_hashinfo);
  1071. err:
  1072. return NULL;
  1073. }
  1074. EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc);
  1075. void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo)
  1076. {
  1077. if (!hashinfo->pernet)
  1078. return;
  1079. inet_ehash_locks_free(hashinfo);
  1080. vfree(hashinfo->ehash);
  1081. kfree(hashinfo);
  1082. }
  1083. EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free);