// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2019, Intel Corporation.
 */
#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <net/tcp.h>
#include <net/mptcp.h>
#include "protocol.h"
#include "mib.h"

/* path manager command handlers */

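/* Queue an ADD_ADDR (or its echo) for transmission by setting the relevant
 * bit in pm.addr_signal; the option itself is emitted later, when the next
 * outgoing packet is built.
 */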
int mptcp_pm_announce_addr(struct mptcp_sock *msk,
			   const struct mptcp_addr_info *addr,
			   bool echo)
{
	u8 add_addr = READ_ONCE(msk->pm.addr_signal);

	pr_debug("msk=%p, local_id=%d, echo=%d", msk, addr->id, echo);

	lockdep_assert_held(&msk->pm.lock);

	if (add_addr &
	    (echo ? BIT(MPTCP_ADD_ADDR_ECHO) : BIT(MPTCP_ADD_ADDR_SIGNAL))) {
		pr_warn("addr_signal error, add_addr=%d, echo=%d", add_addr, echo);
		return -EINVAL;
	}

	if (echo) {
		msk->pm.remote = *addr;
		add_addr |= BIT(MPTCP_ADD_ADDR_ECHO);
	} else {
		msk->pm.local = *addr;
		add_addr |= BIT(MPTCP_ADD_ADDR_SIGNAL);
	}
	WRITE_ONCE(msk->pm.addr_signal, add_addr);
	return 0;
}

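/* Queue an RM_ADDR for the ids in @rm_list and kick an ack to carry it;
 * fails if any other address signal is still pending.
 */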
int mptcp_pm_remove_addr(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
{
	u8 rm_addr = READ_ONCE(msk->pm.addr_signal);

	pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);

	if (rm_addr) {
		pr_warn("addr_signal error, rm_addr=%d", rm_addr);
		return -EINVAL;
	}

	msk->pm.rm_list_tx = *rm_list;
	rm_addr |= BIT(MPTCP_RM_ADDR_SIGNAL);
	WRITE_ONCE(msk->pm.addr_signal, rm_addr);
	mptcp_pm_nl_addr_send_ack(msk);
	return 0;
}

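/* Close the local subflows matching the ids in @rm_list, under the PM lock. */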
int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list *rm_list)
{
	pr_debug("msk=%p, rm_list_nr=%d", msk, rm_list->nr);

	spin_lock_bh(&msk->pm.lock);
	mptcp_pm_nl_rm_subflow_received(msk, rm_list);
	spin_unlock_bh(&msk->pm.lock);
	return 0;
}

/* path manager event handlers */

void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side)
{
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p, token=%u side=%d", msk, msk->token, server_side);

	WRITE_ONCE(pm->server_side, server_side);
	mptcp_event(MPTCP_EVENT_CREATED, msk, ssk, GFP_ATOMIC);
}

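/* Decide whether an incoming MP_JOIN may create a new subflow: the userspace
 * PM only tracks the count, while the in-kernel PM enforces subflows_max and
 * clears accept_subflow once the limit is reached.
 */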
bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;
	unsigned int subflows_max;
	int ret = 0;

	if (mptcp_pm_is_userspace(msk)) {
		if (mptcp_userspace_pm_active(msk)) {
			spin_lock_bh(&pm->lock);
			pm->subflows++;
			spin_unlock_bh(&pm->lock);
			return true;
		}
		return false;
	}

	subflows_max = mptcp_pm_get_subflows_max(msk);

	pr_debug("msk=%p subflows=%d max=%d allow=%d", msk, pm->subflows,
		 subflows_max, READ_ONCE(pm->accept_subflow));

	/* try to avoid acquiring the lock below */
	if (!READ_ONCE(pm->accept_subflow))
		return false;

	spin_lock_bh(&pm->lock);
	if (READ_ONCE(pm->accept_subflow)) {
		ret = pm->subflows < subflows_max;
		if (ret && ++pm->subflows == subflows_max)
			WRITE_ONCE(pm->accept_subflow, false);
	}
	spin_unlock_bh(&pm->lock);
	return ret;
}

/* return true if the new status bit is currently cleared, that is, this event
 * can be served, possibly by an already scheduled work
 */
static bool mptcp_pm_schedule_work(struct mptcp_sock *msk,
				   enum mptcp_pm_status new_status)
{
	pr_debug("msk=%p status=%x new=%lx", msk, msk->pm.status,
		 BIT(new_status));
	if (msk->pm.status & BIT(new_status))
		return false;

	msk->pm.status |= BIT(new_status);
	mptcp_schedule_work((struct sock *)msk);
	return true;
}

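/* Mark the connection fully established, schedule the PM worker if needed
 * and emit the MPTCP_EVENT_ESTABLISHED netlink event exactly once.
 */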
void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp)
{
	struct mptcp_pm_data *pm = &msk->pm;
	bool announce = false;

	pr_debug("msk=%p", msk);

	spin_lock_bh(&pm->lock);

	/* mptcp_pm_fully_established() can be invoked by multiple
	 * racing paths - accept() and check_fully_established();
	 * be sure to serve this event only once.
	 */
	if (READ_ONCE(pm->work_pending) &&
	    !(msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)))
		mptcp_pm_schedule_work(msk, MPTCP_PM_ESTABLISHED);

	if ((msk->pm.status & BIT(MPTCP_PM_ALREADY_ESTABLISHED)) == 0)
		announce = true;

	msk->pm.status |= BIT(MPTCP_PM_ALREADY_ESTABLISHED);
	spin_unlock_bh(&pm->lock);

	if (announce)
		mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, gfp);
}

void mptcp_pm_connection_closed(struct mptcp_sock *msk)
{
	pr_debug("msk=%p", msk);
}

void mptcp_pm_subflow_established(struct mptcp_sock *msk)
{
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p", msk);

	if (!READ_ONCE(pm->work_pending))
		return;

	spin_lock_bh(&pm->lock);

	if (READ_ONCE(pm->work_pending))
		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);

	spin_unlock_bh(&pm->lock);
}

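/* Called when a subflow is closed: drop it from the accounting and, for the
 * in-kernel PM, let the worker try to pick a replacement.
 */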
void mptcp_pm_subflow_check_next(struct mptcp_sock *msk, const struct sock *ssk,
				 const struct mptcp_subflow_context *subflow)
{
	struct mptcp_pm_data *pm = &msk->pm;
	bool update_subflows;

	update_subflows = subflow->request_join || subflow->mp_join;
	if (mptcp_pm_is_userspace(msk)) {
		if (update_subflows) {
			spin_lock_bh(&pm->lock);
			pm->subflows--;
			spin_unlock_bh(&pm->lock);
		}
		return;
	}

	if (!READ_ONCE(pm->work_pending) && !update_subflows)
		return;

	spin_lock_bh(&pm->lock);
	if (update_subflows)
		__mptcp_pm_close_subflow(msk);

	/* Even if this subflow is not really established, tell the PM to try
	 * to pick the next ones, if possible.
	 */
	if (mptcp_pm_nl_check_work_pending(msk))
		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);

	spin_unlock_bh(&pm->lock);
}

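/* Handle an incoming ADD_ADDR: either echo it right away, hand the address
 * to the PM worker, or drop it and bump the ADDADDRDROP counter.
 */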
void mptcp_pm_add_addr_received(const struct sock *ssk,
				const struct mptcp_addr_info *addr)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p remote_id=%d accept=%d", msk, addr->id,
		 READ_ONCE(pm->accept_addr));

	mptcp_event_addr_announced(ssk, addr);

	spin_lock_bh(&pm->lock);

	if (mptcp_pm_is_userspace(msk)) {
		if (mptcp_userspace_pm_active(msk)) {
			mptcp_pm_announce_addr(msk, addr, true);
			mptcp_pm_add_addr_send_ack(msk);
		} else {
			__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
		}
	} else if (!READ_ONCE(pm->accept_addr)) {
		mptcp_pm_announce_addr(msk, addr, true);
		mptcp_pm_add_addr_send_ack(msk);
	} else if (mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_RECEIVED)) {
		pm->remote = *addr;
	} else {
		__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_ADDADDRDROP);
	}

	spin_unlock_bh(&pm->lock);
}

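/* An echo for one of our announced addresses was received: let the worker
 * try to establish new subflows, if the limits still allow it.
 */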
void mptcp_pm_add_addr_echoed(struct mptcp_sock *msk,
			      const struct mptcp_addr_info *addr)
{
	struct mptcp_pm_data *pm = &msk->pm;

	pr_debug("msk=%p", msk);

	spin_lock_bh(&pm->lock);

	if (mptcp_lookup_anno_list_by_saddr(msk, addr) && READ_ONCE(pm->work_pending))
		mptcp_pm_schedule_work(msk, MPTCP_PM_SUBFLOW_ESTABLISHED);

	spin_unlock_bh(&pm->lock);
}

void mptcp_pm_add_addr_send_ack(struct mptcp_sock *msk)
{
	if (!mptcp_pm_should_add_signal(msk))
		return;

	mptcp_pm_schedule_work(msk, MPTCP_PM_ADD_ADDR_SEND_ACK);
}

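/* Handle an incoming RM_ADDR: notify userspace and queue the list for the
 * worker, or account the drop if a previous list is still pending.
 */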
void mptcp_pm_rm_addr_received(struct mptcp_sock *msk,
			       const struct mptcp_rm_list *rm_list)
{
	struct mptcp_pm_data *pm = &msk->pm;
	u8 i;

	pr_debug("msk=%p remote_ids_nr=%d", msk, rm_list->nr);

	for (i = 0; i < rm_list->nr; i++)
		mptcp_event_addr_removed(msk, rm_list->ids[i]);

	spin_lock_bh(&pm->lock);
	if (mptcp_pm_schedule_work(msk, MPTCP_PM_RM_ADDR_RECEIVED))
		pm->rm_list_rx = *rm_list;
	else
		__MPTCP_INC_STATS(sock_net((struct sock *)msk), MPTCP_MIB_RMADDRDROP);
	spin_unlock_bh(&pm->lock);
}

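/* Apply a received MP_PRIO change: update the subflow backup flag and make
 * the packet scheduler forget its last choice.
 */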
void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = subflow->conn;
	struct mptcp_sock *msk;

	pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
	msk = mptcp_sk(sk);
	if (subflow->backup != bkup) {
		subflow->backup = bkup;
		mptcp_data_lock(sk);
		if (!sock_owned_by_user(sk))
			msk->last_snd = NULL;
		else
			__set_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags);
		mptcp_data_unlock(sk);
	}

	mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
}

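/* React to a peer MP_FAIL: either reply with our own MP_FAIL plus an
 * infinite mapping, or treat it as the response to our MP_FAIL and clear
 * the pending fail timeout.
 */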
void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	pr_debug("fail_seq=%llu", fail_seq);

	if (!READ_ONCE(msk->allow_infinite_fallback))
		return;

	if (!subflow->fail_tout) {
		pr_debug("send MP_FAIL response and infinite map");

		subflow->send_mp_fail = 1;
		subflow->send_infinite_map = 1;
		tcp_send_ack(sk);
	} else {
		pr_debug("MP_FAIL response received");
		WRITE_ONCE(subflow->fail_tout, 0);
	}
}

/* path manager helpers */

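/* Fill @addr with the next ADD_ADDR (or echo) to transmit, if one fits in
 * the @remaining option space, and clear the corresponding signal bit.
 */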
bool mptcp_pm_add_addr_signal(struct mptcp_sock *msk, const struct sk_buff *skb,
			      unsigned int opt_size, unsigned int remaining,
			      struct mptcp_addr_info *addr, bool *echo,
			      bool *drop_other_suboptions)
{
	bool ret = false;
	u8 add_addr;
	u8 family;
	bool port;

	spin_lock_bh(&msk->pm.lock);

	/* double check after the lock is acquired */
	if (!mptcp_pm_should_add_signal(msk))
		goto out_unlock;

	/* always drop every other option for pure ack ADD_ADDR; this is a
	 * plain dup-ack from the TCP perspective. The other MPTCP-relevant
	 * info, if any, will be carried by the 'original' TCP ack
	 */
	if (skb && skb_is_tcp_pure_ack(skb)) {
		remaining += opt_size;
		*drop_other_suboptions = true;
	}

	*echo = mptcp_pm_should_add_signal_echo(msk);
	port = !!(*echo ? msk->pm.remote.port : msk->pm.local.port);
	family = *echo ? msk->pm.remote.family : msk->pm.local.family;
	if (remaining < mptcp_add_addr_len(family, *echo, port))
		goto out_unlock;

	if (*echo) {
		*addr = msk->pm.remote;
		add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_ECHO);
	} else {
		*addr = msk->pm.local;
		add_addr = msk->pm.addr_signal & ~BIT(MPTCP_ADD_ADDR_SIGNAL);
	}
	WRITE_ONCE(msk->pm.addr_signal, add_addr);
	ret = true;

out_unlock:
	spin_unlock_bh(&msk->pm.lock);
	return ret;
}

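/* Same pattern for the pending RM_ADDR list: copy it to @rm_list if it fits
 * in the remaining option space, and clear the signal bit.
 */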
bool mptcp_pm_rm_addr_signal(struct mptcp_sock *msk, unsigned int remaining,
			     struct mptcp_rm_list *rm_list)
{
	bool ret = false;
	u8 rm_addr;
	int len;

	spin_lock_bh(&msk->pm.lock);

	/* double check after the lock is acquired */
	if (!mptcp_pm_should_rm_signal(msk))
		goto out_unlock;

	rm_addr = msk->pm.addr_signal & ~BIT(MPTCP_RM_ADDR_SIGNAL);
	len = mptcp_rm_addr_len(&msk->pm.rm_list_tx);
	if (len < 0) {
		WRITE_ONCE(msk->pm.addr_signal, rm_addr);
		goto out_unlock;
	}
	if (remaining < len)
		goto out_unlock;

	*rm_list = msk->pm.rm_list_tx;
	WRITE_ONCE(msk->pm.addr_signal, rm_addr);
	ret = true;

out_unlock:
	spin_unlock_bh(&msk->pm.lock);
	return ret;
}

int mptcp_pm_get_local_id(struct mptcp_sock *msk, struct sock_common *skc)
{
	return mptcp_pm_nl_get_local_id(msk, skc);
}

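/* Called at every retransmit timeout: a subflow whose rcv_tstamp has not
 * moved since the previous check is a candidate for being marked stale.
 */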
void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 rcv_tstamp = READ_ONCE(tcp_sk(ssk)->rcv_tstamp);

	/* keep track of rtx periods with no progress */
	if (!subflow->stale_count) {
		subflow->stale_rcv_tstamp = rcv_tstamp;
		subflow->stale_count++;
	} else if (subflow->stale_rcv_tstamp == rcv_tstamp) {
		if (subflow->stale_count < U8_MAX)
			subflow->stale_count++;
		mptcp_pm_nl_subflow_chk_stale(msk, ssk);
	} else {
		subflow->stale_count = 0;
		mptcp_subflow_set_active(subflow);
	}
}

/* If sk is an ipv4 or ipv6_only socket, allow only same-family local and
 * remote addresses; otherwise allow any matching local/remote pair.
 */
bool mptcp_pm_addr_families_match(const struct sock *sk,
				  const struct mptcp_addr_info *loc,
				  const struct mptcp_addr_info *rem)
{
	bool mptcp_is_v4 = sk->sk_family == AF_INET;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	bool loc_is_v4 = loc->family == AF_INET || ipv6_addr_v4mapped(&loc->addr6);
	bool rem_is_v4 = rem->family == AF_INET || ipv6_addr_v4mapped(&rem->addr6);

	if (mptcp_is_v4)
		return loc_is_v4 && rem_is_v4;

	if (ipv6_only_sock(sk))
		return !loc_is_v4 && !rem_is_v4;

	return loc_is_v4 == rem_is_v4;
#else
	return mptcp_is_v4 && loc->family == AF_INET && rem->family == AF_INET;
#endif
}

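/* Reset all PM state for a (re)starting connection; for the in-kernel PM,
 * derive the work_pending/accept_* hints from the netns limits.
 */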
void mptcp_pm_data_reset(struct mptcp_sock *msk)
{
	u8 pm_type = mptcp_get_pm_type(sock_net((struct sock *)msk));
	struct mptcp_pm_data *pm = &msk->pm;

	pm->add_addr_signaled = 0;
	pm->add_addr_accepted = 0;
	pm->local_addr_used = 0;
	pm->subflows = 0;
	pm->rm_list_tx.nr = 0;
	pm->rm_list_rx.nr = 0;
	WRITE_ONCE(pm->pm_type, pm_type);

	if (pm_type == MPTCP_PM_TYPE_KERNEL) {
		bool subflows_allowed = !!mptcp_pm_get_subflows_max(msk);

		/* pm->work_pending must only be set to 'true' when
		 * pm->pm_type is set to MPTCP_PM_TYPE_KERNEL
		 */
		WRITE_ONCE(pm->work_pending,
			   (!!mptcp_pm_get_local_addr_max(msk) &&
			    subflows_allowed) ||
			   !!mptcp_pm_get_add_addr_signal_max(msk));
		WRITE_ONCE(pm->accept_addr,
			   !!mptcp_pm_get_add_addr_accept_max(msk) &&
			   subflows_allowed);
		WRITE_ONCE(pm->accept_subflow, subflows_allowed);
	} else {
		WRITE_ONCE(pm->work_pending, 0);
		WRITE_ONCE(pm->accept_addr, 0);
		WRITE_ONCE(pm->accept_subflow, 0);
	}

	WRITE_ONCE(pm->addr_signal, 0);
	WRITE_ONCE(pm->remote_deny_join_id0, false);
	pm->status = 0;
	bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1);
}

void mptcp_pm_data_init(struct mptcp_sock *msk)
{
	spin_lock_init(&msk->pm.lock);
	INIT_LIST_HEAD(&msk->pm.anno_list);
	INIT_LIST_HEAD(&msk->pm.userspace_pm_local_addr_list);
	mptcp_pm_data_reset(msk);
}

void __init mptcp_pm_init(void)
{
	mptcp_pm_nl_init();
}