// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2017 - 2019, Intel Corporation.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/netdevice.h>
#include <crypto/algapi.h>
#include <crypto/sha2.h>
#include <net/sock.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
#include <net/protocol.h>
#include <net/tcp.h>
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
#include <net/ip6_route.h>
#include <net/transp_v6.h>
#endif
#include <net/mptcp.h>
#include <uapi/linux/mptcp.h>
#include "protocol.h"
#include "mib.h"

#include <trace/events/mptcp.h>

static void mptcp_subflow_ops_undo_override(struct sock *ssk);

static void SUBFLOW_REQ_INC_STATS(struct request_sock *req,
				  enum linux_mptcp_mib_field field)
{
	MPTCP_INC_STATS(sock_net(req_to_sk(req)), field);
}

static void subflow_req_destructor(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	pr_debug("subflow_req=%p", subflow_req);

	if (subflow_req->msk)
		sock_put((struct sock *)subflow_req->msk);

	mptcp_token_destroy_request(req);
}
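
/* Per RFC 8684, the join HMAC is HMAC-SHA256 keyed with key1 followed by
 * key2, computed over the two 32-bit nonces. Callers truncate the result:
 * 64 bits for the SYN-ACK thmac, 160 bits for the third ACK.
 */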
static void subflow_generate_hmac(u64 key1, u64 key2, u32 nonce1, u32 nonce2,
				  void *hmac)
{
	u8 msg[8];

	put_unaligned_be32(nonce1, &msg[0]);
	put_unaligned_be32(nonce2, &msg[4]);

	mptcp_crypto_hmac_sha(key1, key2, msg, 8, hmac);
}

static bool mptcp_can_accept_new_subflow(const struct mptcp_sock *msk)
{
	return mptcp_is_fully_established((void *)msk) &&
	       ((mptcp_pm_is_userspace(msk) &&
		 mptcp_userspace_pm_active(msk)) ||
		READ_ONCE(msk->pm.accept_subflow));
}

/* validate received token and create truncated hmac and nonce for SYN-ACK */
static void subflow_req_create_thmac(struct mptcp_subflow_request_sock *subflow_req)
{
	struct mptcp_sock *msk = subflow_req->msk;
	u8 hmac[SHA256_DIGEST_SIZE];

	get_random_bytes(&subflow_req->local_nonce, sizeof(u32));

	subflow_generate_hmac(msk->local_key, msk->remote_key,
			      subflow_req->local_nonce,
			      subflow_req->remote_nonce, hmac);

	subflow_req->thmac = get_unaligned_be64(hmac);
}
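
/* Look up the msk owning the token carried by an MP_JOIN SYN and ask the
 * path manager for a local address id; returns NULL if either step fails.
 */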
static struct mptcp_sock *subflow_token_join_request(struct request_sock *req)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_sock *msk;
	int local_id;

	msk = mptcp_token_get_sock(sock_net(req_to_sk(req)), subflow_req->token);
	if (!msk) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINNOTOKEN);
		return NULL;
	}

	local_id = mptcp_pm_get_local_id(msk, (struct sock_common *)req);
	if (local_id < 0) {
		sock_put((struct sock *)msk);
		return NULL;
	}
	subflow_req->local_id = local_id;

	return msk;
}

static void subflow_init_req(struct request_sock *req, const struct sock *sk_listener)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);

	subflow_req->mp_capable = 0;
	subflow_req->mp_join = 0;
	subflow_req->csum_reqd = mptcp_is_checksum_enabled(sock_net(sk_listener));
	subflow_req->allow_join_id0 = mptcp_allow_join_id0(sock_net(sk_listener));
	subflow_req->msk = NULL;
	mptcp_token_init_request(req);
}

static bool subflow_use_different_sport(struct mptcp_sock *msk, const struct sock *sk)
{
	return inet_sk(sk)->inet_sport != inet_sk((struct sock *)msk)->inet_sport;
}

static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason)
{
	struct mptcp_ext *mpext = skb_ext_add(skb, SKB_EXT_MPTCP);

	if (mpext) {
		memset(mpext, 0, sizeof(*mpext));
		mpext->reset_reason = reason;
	}
}

/* Init mptcp request socket.
 *
 * Returns an error code if a JOIN has failed and a TCP reset
 * should be sent.
 */
static int subflow_check_req(struct request_sock *req,
			     const struct sock *sk_listener,
			     struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	bool opt_mp_capable, opt_mp_join;

	pr_debug("subflow_req=%p, listener=%p", subflow_req, listener);

#ifdef CONFIG_TCP_MD5SIG
	/* no MPTCP if MD5SIG is enabled on this socket or we may run out of
	 * TCP option space.
	 */
	if (rcu_access_pointer(tcp_sk(sk_listener)->md5sig_info))
		return -EINVAL;
#endif

	mptcp_get_options(skb, &mp_opt);

	opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC);
	opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ);
	if (opt_mp_capable) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);

		if (opt_mp_join)
			return 0;
	} else if (opt_mp_join) {
		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNRX);
	}

	if (opt_mp_capable && listener->request_mptcp) {
		int err, retries = MPTCP_TOKEN_MAX_RETRIES;

		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
again:
		do {
			get_random_bytes(&subflow_req->local_key, sizeof(subflow_req->local_key));
		} while (subflow_req->local_key == 0);

		if (unlikely(req->syncookie)) {
			mptcp_crypto_key_sha(subflow_req->local_key,
					     &subflow_req->token,
					     &subflow_req->idsn);
			if (mptcp_token_exists(subflow_req->token)) {
				if (retries-- > 0)
					goto again;
				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);
			} else {
				subflow_req->mp_capable = 1;
			}
			return 0;
		}

		err = mptcp_token_new_request(req);
		if (err == 0)
			subflow_req->mp_capable = 1;
		else if (retries-- > 0)
			goto again;
		else
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_TOKENFALLBACKINIT);

	} else if (opt_mp_join && listener->request_mptcp) {
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq;
		subflow_req->mp_join = 1;
		subflow_req->backup = mp_opt.backup;
		subflow_req->remote_id = mp_opt.join_id;
		subflow_req->token = mp_opt.token;
		subflow_req->remote_nonce = mp_opt.nonce;
		subflow_req->msk = subflow_token_join_request(req);

		/* Can't fall back to TCP in this case. */
		if (!subflow_req->msk) {
			subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
			return -EPERM;
		}

		if (subflow_use_different_sport(subflow_req->msk, sk_listener)) {
			pr_debug("syn inet_sport=%d %d",
				 ntohs(inet_sk(sk_listener)->inet_sport),
				 ntohs(inet_sk((struct sock *)subflow_req->msk)->inet_sport));
			if (!mptcp_pm_sport_in_anno_list(subflow_req->msk, sk_listener)) {
				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTSYNRX);
				return -EPERM;
			}
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTSYNRX);
		}

		subflow_req_create_thmac(subflow_req);

		if (unlikely(req->syncookie)) {
			if (mptcp_can_accept_new_subflow(subflow_req->msk))
				subflow_init_req_cookie_join_save(subflow_req, skb);
			else
				return -EPERM;
		}

		pr_debug("token=%u, remote_nonce=%u msk=%p", subflow_req->token,
			 subflow_req->remote_nonce, subflow_req->msk);
	}

	return 0;
}

int mptcp_subflow_init_cookie_req(struct request_sock *req,
				  const struct sock *sk_listener,
				  struct sk_buff *skb)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk_listener);
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_options_received mp_opt;
	bool opt_mp_capable, opt_mp_join;
	int err;

	subflow_init_req(req, sk_listener);
	mptcp_get_options(skb, &mp_opt);

	opt_mp_capable = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPC);
	opt_mp_join = !!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ);
	if (opt_mp_capable && opt_mp_join)
		return -EINVAL;

	if (opt_mp_capable && listener->request_mptcp) {
		if (mp_opt.sndr_key == 0)
			return -EINVAL;

		subflow_req->local_key = mp_opt.rcvr_key;
		err = mptcp_token_new_request(req);
		if (err)
			return err;

		subflow_req->mp_capable = 1;
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	} else if (opt_mp_join && listener->request_mptcp) {
		if (!mptcp_token_join_cookie_init_state(subflow_req, skb))
			return -EINVAL;

		subflow_req->mp_join = 1;
		subflow_req->ssn_offset = TCP_SKB_CB(skb)->seq - 1;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(mptcp_subflow_init_cookie_req);

static struct dst_entry *subflow_v4_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;
	subflow_init_req(req, sk);

	dst = tcp_request_sock_ipv4_ops.route_req(sk, skb, fl, req);
	if (!dst)
		return NULL;

	err = subflow_check_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp_request_sock_ops.send_reset(sk, skb);
	return NULL;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct dst_entry *subflow_v6_route_req(const struct sock *sk,
					      struct sk_buff *skb,
					      struct flowi *fl,
					      struct request_sock *req)
{
	struct dst_entry *dst;
	int err;

	tcp_rsk(req)->is_mptcp = 1;
	subflow_init_req(req, sk);

	dst = tcp_request_sock_ipv6_ops.route_req(sk, skb, fl, req);
	if (!dst)
		return NULL;

	err = subflow_check_req(req, sk, skb);
	if (err == 0)
		return dst;

	dst_release(dst);
	if (!req->syncookie)
		tcp6_request_sock_ops.send_reset(sk, skb);
	return NULL;
}
#endif

/* validate received truncated hmac and create hmac for third ACK */
static bool subflow_thmac_valid(struct mptcp_subflow_context *subflow)
{
	u8 hmac[SHA256_DIGEST_SIZE];
	u64 thmac;

	subflow_generate_hmac(subflow->remote_key, subflow->local_key,
			      subflow->remote_nonce, subflow->local_nonce,
			      hmac);

	thmac = get_unaligned_be64(hmac);
	pr_debug("subflow=%p, token=%u, thmac=%llu, subflow->thmac=%llu\n",
		 subflow, subflow->token, thmac, subflow->thmac);

	return thmac == subflow->thmac;
}

void mptcp_subflow_reset(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	struct sock *sk = subflow->conn;

	/* mptcp_mp_fail_no_response() can reach here on an already closed
	 * socket
	 */
	if (ssk->sk_state == TCP_CLOSE)
		return;

	/* must hold: tcp_done() could drop last reference on parent */
	sock_hold(sk);

	tcp_send_active_reset(ssk, GFP_ATOMIC);
	tcp_done(ssk);
	if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags))
		mptcp_schedule_work(sk);

	sock_put(sk);
}

static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct sock *sk)
{
	return inet_sk(sk)->inet_dport != inet_sk((struct sock *)msk)->inet_dport;
}

void __mptcp_set_connected(struct sock *sk)
{
	if (sk->sk_state == TCP_SYN_SENT) {
		inet_sk_state_store(sk, TCP_ESTABLISHED);
		sk->sk_state_change(sk);
	}
}

static void mptcp_set_connected(struct sock *sk)
{
	mptcp_data_lock(sk);
	if (!sock_owned_by_user(sk))
		__mptcp_set_connected(sk);
	else
		__set_bit(MPTCP_CONNECTED, &mptcp_sk(sk)->cb_flags);
	mptcp_data_unlock(sk);
}
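
/* Active-side completion of the MPTCP handshake, invoked when the SYN-ACK is
 * processed: records the peer key (MP_CAPABLE) or validates the peer thmac
 * and generates our HMAC (MP_JOIN), falling back or resetting the subflow
 * on failure.
 */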
static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_options_received mp_opt;
	struct sock *parent = subflow->conn;

	subflow->icsk_af_ops->sk_rx_dst_set(sk, skb);

	/* be sure no special action on any packet other than syn-ack */
	if (subflow->conn_finished)
		return;

	mptcp_propagate_sndbuf(parent, sk);
	subflow->rel_write_seq = 1;
	subflow->conn_finished = 1;
	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
	pr_debug("subflow=%p synack seq=%x", subflow, subflow->ssn_offset);

	mptcp_get_options(skb, &mp_opt);
	if (subflow->request_mptcp) {
		if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) {
			MPTCP_INC_STATS(sock_net(sk),
					MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
			mptcp_do_fallback(sk);
			pr_fallback(mptcp_sk(subflow->conn));
			goto fallback;
		}

		if (mp_opt.suboptions & OPTION_MPTCP_CSUMREQD)
			WRITE_ONCE(mptcp_sk(parent)->csum_enabled, true);
		if (mp_opt.deny_join_id0)
			WRITE_ONCE(mptcp_sk(parent)->pm.remote_deny_join_id0, true);
		subflow->mp_capable = 1;
		subflow->can_ack = 1;
		subflow->remote_key = mp_opt.sndr_key;
		pr_debug("subflow=%p, remote_key=%llu", subflow,
			 subflow->remote_key);
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVEACK);
		mptcp_finish_connect(sk);
		mptcp_set_connected(parent);
	} else if (subflow->request_join) {
		u8 hmac[SHA256_DIGEST_SIZE];

		if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ)) {
			subflow->reset_reason = MPTCP_RST_EMPTCP;
			goto do_reset;
		}

		subflow->backup = mp_opt.backup;
		subflow->thmac = mp_opt.thmac;
		subflow->remote_nonce = mp_opt.nonce;
		subflow->remote_id = mp_opt.join_id;
		pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u backup=%d",
			 subflow, subflow->thmac, subflow->remote_nonce,
			 subflow->backup);

		if (!subflow_thmac_valid(subflow)) {
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINACKMAC);
			subflow->reset_reason = MPTCP_RST_EMPTCP;
			goto do_reset;
		}

		if (!mptcp_finish_join(sk))
			goto do_reset;

		subflow_generate_hmac(subflow->local_key, subflow->remote_key,
				      subflow->local_nonce,
				      subflow->remote_nonce,
				      hmac);
		memcpy(subflow->hmac, hmac, MPTCPOPT_HMAC_LEN);

		subflow->mp_join = 1;
		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINSYNACKRX);

		if (subflow_use_different_dport(mptcp_sk(parent), sk)) {
			pr_debug("synack inet_dport=%d %d",
				 ntohs(inet_sk(sk)->inet_dport),
				 ntohs(inet_sk(parent)->inet_dport));
			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_JOINPORTSYNACKRX);
		}
	} else if (mptcp_check_fallback(sk)) {
fallback:
		mptcp_rcv_space_init(mptcp_sk(parent), sk);
		mptcp_set_connected(parent);
	}
	return;

do_reset:
	subflow->reset_transient = 0;
	mptcp_subflow_reset(sk);
}

static void subflow_set_local_id(struct mptcp_subflow_context *subflow, int local_id)
{
	subflow->local_id = local_id;
	subflow->local_id_valid = 1;
}

static int subflow_chk_local_id(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
	int err;

	if (likely(subflow->local_id_valid))
		return 0;

	err = mptcp_pm_get_local_id(msk, (struct sock_common *)sk);
	if (err < 0)
		return err;

	subflow_set_local_id(subflow, err);
	return 0;
}

static int subflow_rebuild_header(struct sock *sk)
{
	int err = subflow_chk_local_id(sk);

	if (unlikely(err < 0))
		return err;

	return inet_sk_rebuild_header(sk);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static int subflow_v6_rebuild_header(struct sock *sk)
{
	int err = subflow_chk_local_id(sk);

	if (unlikely(err < 0))
		return err;

	return inet6_sk_rebuild_header(sk);
}
#endif

static struct request_sock_ops mptcp_subflow_v4_request_sock_ops __ro_after_init;
static struct tcp_request_sock_ops subflow_request_sock_ipv4_ops __ro_after_init;

static int subflow_v4_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	/* Never answer to SYNs sent to broadcast or multicast */
	if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
		goto drop;

	return tcp_conn_request(&mptcp_subflow_v4_request_sock_ops,
				&subflow_request_sock_ipv4_ops,
				sk, skb);
drop:
	tcp_listendrop(sk);
	return 0;
}

static void subflow_v4_req_destructor(struct request_sock *req)
{
	subflow_req_destructor(req);
	tcp_request_sock_ops.destructor(req);
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
static struct request_sock_ops mptcp_subflow_v6_request_sock_ops __ro_after_init;
static struct tcp_request_sock_ops subflow_request_sock_ipv6_ops __ro_after_init;
static struct inet_connection_sock_af_ops subflow_v6_specific __ro_after_init;
static struct inet_connection_sock_af_ops subflow_v6m_specific __ro_after_init;
static struct proto tcpv6_prot_override __ro_after_init;

static int subflow_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	pr_debug("subflow=%p", subflow);

	if (skb->protocol == htons(ETH_P_IP))
		return subflow_v4_conn_request(sk, skb);

	if (!ipv6_unicast_destination(skb))
		goto drop;

	if (ipv6_addr_v4mapped(&ipv6_hdr(skb)->saddr)) {
		__IP6_INC_STATS(sock_net(sk), NULL, IPSTATS_MIB_INHDRERRORS);
		return 0;
	}

	return tcp_conn_request(&mptcp_subflow_v6_request_sock_ops,
				&subflow_request_sock_ipv6_ops, sk, skb);
drop:
	tcp_listendrop(sk);
	return 0; /* don't send reset */
}

static void subflow_v6_req_destructor(struct request_sock *req)
{
	subflow_req_destructor(req);
	tcp6_request_sock_ops.destructor(req);
}
#endif

struct request_sock *mptcp_subflow_reqsk_alloc(const struct request_sock_ops *ops,
					       struct sock *sk_listener,
					       bool attach_listener)
{
	if (ops->family == AF_INET)
		ops = &mptcp_subflow_v4_request_sock_ops;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (ops->family == AF_INET6)
		ops = &mptcp_subflow_v6_request_sock_ops;
#endif

	return inet_reqsk_alloc(ops, sk_listener, attach_listener);
}
EXPORT_SYMBOL(mptcp_subflow_reqsk_alloc);

/* validate hmac received in third ACK */
static bool subflow_hmac_valid(const struct request_sock *req,
			       const struct mptcp_options_received *mp_opt)
{
	const struct mptcp_subflow_request_sock *subflow_req;
	u8 hmac[SHA256_DIGEST_SIZE];
	struct mptcp_sock *msk;

	subflow_req = mptcp_subflow_rsk(req);
	msk = subflow_req->msk;
	if (!msk)
		return false;

	subflow_generate_hmac(msk->remote_key, msk->local_key,
			      subflow_req->remote_nonce,
			      subflow_req->local_nonce, hmac);

	return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN);
}

static void subflow_ulp_fallback(struct sock *sk,
				 struct mptcp_subflow_context *old_ctx)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	mptcp_subflow_tcp_fallback(sk, old_ctx);
	icsk->icsk_ulp_ops = NULL;
	rcu_assign_pointer(icsk->icsk_ulp_data, NULL);
	tcp_sk(sk)->is_mptcp = 0;

	mptcp_subflow_ops_undo_override(sk);
}

void mptcp_subflow_drop_ctx(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);

	if (!ctx)
		return;

	list_del(&mptcp_subflow_ctx(ssk)->node);
	if (inet_csk(ssk)->icsk_ulp_ops) {
		subflow_ulp_fallback(ssk, ctx);
		if (ctx->conn)
			sock_put(ctx->conn);
	}

	kfree_rcu(ctx, rcu);
}

void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
				     struct mptcp_options_received *mp_opt)
{
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	subflow->remote_key = mp_opt->sndr_key;
	subflow->fully_established = 1;
	subflow->can_ack = 1;
	WRITE_ONCE(msk->fully_established, true);
}
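
/* Passive-side child creation (.syn_recv_sock hook): builds the child
 * subflow, clones the MPTCP master socket for MP_CAPABLE requests or attaches
 * the child to the existing msk for MP_JOIN, falling back to plain TCP (or
 * resetting the join) when the required options or HMAC are missing.
 */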
static struct sock *subflow_syn_recv_sock(const struct sock *sk,
					  struct sk_buff *skb,
					  struct request_sock *req,
					  struct dst_entry *dst,
					  struct request_sock *req_unhash,
					  bool *own_req)
{
	struct mptcp_subflow_context *listener = mptcp_subflow_ctx(sk);
	struct mptcp_subflow_request_sock *subflow_req;
	struct mptcp_options_received mp_opt;
	bool fallback, fallback_is_fatal;
	struct mptcp_sock *owner;
	struct sock *child;

	pr_debug("listener=%p, req=%p, conn=%p", listener, req, listener->conn);

	/* After child creation we must look for MPC even when options
	 * are not parsed
	 */
	mp_opt.suboptions = 0;

	/* hopefully temporary handling for MP_JOIN+syncookie */
	subflow_req = mptcp_subflow_rsk(req);
	fallback_is_fatal = tcp_rsk(req)->is_mptcp && subflow_req->mp_join;
	fallback = !tcp_rsk(req)->is_mptcp;
	if (fallback)
		goto create_child;

	/* if the sk is MP_CAPABLE, we try to fetch the client key */
	if (subflow_req->mp_capable) {
		/* we can receive and accept an in-window, out-of-order pkt,
		 * which may not carry the MP_CAPABLE opt even on mptcp enabled
		 * paths: always try to extract the peer key, and fallback
		 * for packets missing it.
		 * Even OoO DSS packets coming legitimately after dropped or
		 * reordered MPC will cause fallback, but we don't have other
		 * options.
		 */
		mptcp_get_options(skb, &mp_opt);
		if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC))
			fallback = true;

	} else if (subflow_req->mp_join) {
		mptcp_get_options(skb, &mp_opt);
		if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) ||
		    !subflow_hmac_valid(req, &mp_opt) ||
		    !mptcp_can_accept_new_subflow(subflow_req->msk)) {
			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKMAC);
			fallback = true;
		}
	}

create_child:
	child = listener->icsk_af_ops->syn_recv_sock(sk, skb, req, dst,
						     req_unhash, own_req);

	if (child && *own_req) {
		struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(child);

		tcp_rsk(req)->drop_req = false;

		/* we need to fallback on ctx allocation failure and on pre-reqs
		 * checking above. In the latter scenario we additionally need
		 * to reset the context to non MPTCP status.
		 */
		if (!ctx || fallback) {
			if (fallback_is_fatal) {
				subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP);
				goto dispose_child;
			}
			goto fallback;
		}

		/* ssk inherits options of listener sk */
		ctx->setsockopt_seq = listener->setsockopt_seq;

		if (ctx->mp_capable) {
			ctx->conn = mptcp_sk_clone_init(listener->conn, &mp_opt, child, req);
			if (!ctx->conn)
				goto fallback;

			owner = mptcp_sk(ctx->conn);
			mptcp_pm_new_connection(owner, child, 1);

			/* with OoO packets we can reach here without ingress
			 * mpc option
			 */
			if (mp_opt.suboptions & OPTIONS_MPTCP_MPC) {
				mptcp_subflow_fully_established(ctx, &mp_opt);
				mptcp_pm_fully_established(owner, child, GFP_ATOMIC);
				ctx->pm_notified = 1;
			}
		} else if (ctx->mp_join) {
			owner = subflow_req->msk;
			if (!owner) {
				subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
				goto dispose_child;
			}

			/* move the msk reference ownership to the subflow */
			subflow_req->msk = NULL;
			ctx->conn = (struct sock *)owner;

			if (subflow_use_different_sport(owner, sk)) {
				pr_debug("ack inet_sport=%d %d",
					 ntohs(inet_sk(sk)->inet_sport),
					 ntohs(inet_sk((struct sock *)owner)->inet_sport));
				if (!mptcp_pm_sport_in_anno_list(owner, sk)) {
					SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MISMATCHPORTACKRX);
					goto dispose_child;
				}
				SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINPORTACKRX);
			}

			if (!mptcp_finish_join(child))
				goto dispose_child;

			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINACKRX);
			tcp_rsk(req)->drop_req = true;
		}
	}

	/* check for expected invariant - should never trigger, just help
	 * catching earlier subtle bugs
	 */
	WARN_ON_ONCE(child && *own_req && tcp_sk(child)->is_mptcp &&
		     (!mptcp_subflow_ctx(child) ||
		      !mptcp_subflow_ctx(child)->conn));
	return child;

dispose_child:
	mptcp_subflow_drop_ctx(child);
	tcp_rsk(req)->drop_req = true;
	inet_csk_prepare_for_destroy_sock(child);
	tcp_done(child);
	req->rsk_ops->send_reset(sk, skb);

	/* The last child reference will be released by the caller */
	return child;

fallback:
	mptcp_subflow_drop_ctx(child);
	return child;
}

static struct inet_connection_sock_af_ops subflow_specific __ro_after_init;
static struct proto tcp_prot_override __ro_after_init;
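
/* Outcome of DSS mapping validation for the skb at the head of the subflow
 * receive queue:
 *   MAPPING_OK       - a valid mapping covering the skb is in place
 *   MAPPING_INVALID  - the mapping violates the protocol, reset or fallback
 *   MAPPING_EMPTY    - no data (or not enough data) to process yet
 *   MAPPING_DATA_FIN - DATA_FIN carrying no payload
 *   MAPPING_DUMMY    - subflow has fallen back, a dummy mapping is synthesized
 *   MAPPING_BAD_CSUM - the DSS checksum does not match the carried data
 */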
enum mapping_status {
	MAPPING_OK,
	MAPPING_INVALID,
	MAPPING_EMPTY,
	MAPPING_DATA_FIN,
	MAPPING_DUMMY,
	MAPPING_BAD_CSUM
};

static void dbg_bad_map(struct mptcp_subflow_context *subflow, u32 ssn)
{
	pr_debug("Bad mapping: ssn=%d map_seq=%d map_data_len=%d",
		 ssn, subflow->map_subflow_seq, subflow->map_data_len);
}

static bool skb_is_fully_mapped(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned int skb_consumed;

	skb_consumed = tcp_sk(ssk)->copied_seq - TCP_SKB_CB(skb)->seq;
	if (WARN_ON_ONCE(skb_consumed >= skb->len))
		return true;

	return skb->len - skb_consumed <= subflow->map_data_len -
					  mptcp_subflow_get_map_offset(subflow);
}

static bool validate_mapping(struct sock *ssk, struct sk_buff *skb)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 ssn = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;

	if (unlikely(before(ssn, subflow->map_subflow_seq))) {
		/* Mapping covers data later in the subflow stream,
		 * currently unsupported.
		 */
		dbg_bad_map(subflow, ssn);
		return false;
	}
	if (unlikely(!before(ssn, subflow->map_subflow_seq +
				  subflow->map_data_len))) {
		/* Mapping only covers past subflow data, invalid */
		dbg_bad_map(subflow, ssn);
		return false;
	}
	return true;
}
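
/* When DSS checksums are in use, walk the receive queue accumulating the
 * checksum over the whole mapping (which may span several skbs) and verify it
 * against the value carried by the DSS option, using the MPTCP pseudo-header.
 * Returns MAPPING_EMPTY while the mapping is not fully queued yet.
 */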
static enum mapping_status validate_data_csum(struct sock *ssk, struct sk_buff *skb,
					      bool csum_reqd)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	u32 offset, seq, delta;
	__sum16 csum;
	int len;

	if (!csum_reqd)
		return MAPPING_OK;

	/* mapping already validated on previous traversal */
	if (subflow->map_csum_len == subflow->map_data_len)
		return MAPPING_OK;

	/* traverse the receive queue, ensuring it contains a full
	 * DSS mapping and accumulating the related csum.
	 * Preserve the accumulated csum across multiple calls, to compute
	 * the csum only once
	 */
	delta = subflow->map_data_len - subflow->map_csum_len;
	for (;;) {
		seq = tcp_sk(ssk)->copied_seq + subflow->map_csum_len;
		offset = seq - TCP_SKB_CB(skb)->seq;

		/* if the current skb has not been accounted yet, csum its contents
		 * up to the amount covered by the current DSS
		 */
		if (offset < skb->len) {
			__wsum csum;

			len = min(skb->len - offset, delta);
			csum = skb_checksum(skb, offset, len, 0);
			subflow->map_data_csum = csum_block_add(subflow->map_data_csum, csum,
								subflow->map_csum_len);

			delta -= len;
			subflow->map_csum_len += len;
		}
		if (delta == 0)
			break;

		if (skb_queue_is_last(&ssk->sk_receive_queue, skb)) {
			/* if this subflow is closed, the partial mapping
			 * will never be completed; flush the pending skbs, so
			 * that subflow_sched_work_if_closed() can kick in
			 */
			if (unlikely(ssk->sk_state == TCP_CLOSE))
				while ((skb = skb_peek(&ssk->sk_receive_queue)))
					sk_eat_skb(ssk, skb);

			/* not enough data to validate the csum */
			return MAPPING_EMPTY;
		}

		/* the DSS mapping for next skbs will be validated later,
		 * when a get_mapping_status call will process such skb
		 */
		skb = skb->next;
	}

	/* note that 'map_data_len' accounts only for the carried data, does
	 * not include the eventual seq increment due to the data fin,
	 * while the pseudo header requires the original DSS data len,
	 * including that
	 */
	csum = __mptcp_make_csum(subflow->map_seq,
				 subflow->map_subflow_seq,
				 subflow->map_data_len + subflow->map_data_fin,
				 subflow->map_data_csum);
	if (unlikely(csum)) {
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DATACSUMERR);
		return MAPPING_BAD_CSUM;
	}

	subflow->valid_csum_seen = 1;
	return MAPPING_OK;
}
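
/* Parse the DSS option attached to the skb at the head of the receive queue
 * and install or re-validate the current mapping, handling DATA_FIN,
 * infinite mappings and checksum validation along the way.
 */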
static enum mapping_status get_mapping_status(struct sock *ssk,
					      struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	bool csum_reqd = READ_ONCE(msk->csum_enabled);
	struct mptcp_ext *mpext;
	struct sk_buff *skb;
	u16 data_len;
	u64 map_seq;

	skb = skb_peek(&ssk->sk_receive_queue);
	if (!skb)
		return MAPPING_EMPTY;

	if (mptcp_check_fallback(ssk))
		return MAPPING_DUMMY;

	mpext = mptcp_get_ext(skb);
	if (!mpext || !mpext->use_map) {
		if (!subflow->map_valid && !skb->len) {
			/* the TCP stack delivers 0-len FIN packets to the
			 * receive queue, those are the only 0-len packets ever
			 * expected here, and we can admit no mapping only for
			 * 0-len packets
			 */
			if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN))
				WARN_ONCE(1, "0len seq %d:%d flags %x",
					  TCP_SKB_CB(skb)->seq,
					  TCP_SKB_CB(skb)->end_seq,
					  TCP_SKB_CB(skb)->tcp_flags);

			sk_eat_skb(ssk, skb);
			return MAPPING_EMPTY;
		}

		if (!subflow->map_valid)
			return MAPPING_INVALID;

		goto validate_seq;
	}

	trace_get_mapping_status(mpext);

	data_len = mpext->data_len;
	if (data_len == 0) {
		pr_debug("infinite mapping received");
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPRX);
		subflow->map_data_len = 0;
		return MAPPING_INVALID;
	}

	if (mpext->data_fin == 1) {
		if (data_len == 1) {
			bool updated = mptcp_update_rcv_data_fin(msk, mpext->data_seq,
								 mpext->dsn64);
			pr_debug("DATA_FIN with no payload seq=%llu", mpext->data_seq);
			if (subflow->map_valid) {
				/* A DATA_FIN might arrive in a DSS
				 * option before the previous mapping
				 * has been fully consumed. Continue
				 * handling the existing mapping.
				 */
				skb_ext_del(skb, SKB_EXT_MPTCP);
				return MAPPING_OK;
			} else {
				if (updated)
					mptcp_schedule_work((struct sock *)msk);

				return MAPPING_DATA_FIN;
			}
		} else {
			u64 data_fin_seq = mpext->data_seq + data_len - 1;

			/* If mpext->data_seq is a 32-bit value, data_fin_seq
			 * must also be limited to 32 bits.
			 */
			if (!mpext->dsn64)
				data_fin_seq &= GENMASK_ULL(31, 0);

			mptcp_update_rcv_data_fin(msk, data_fin_seq, mpext->dsn64);
			pr_debug("DATA_FIN with mapping seq=%llu dsn64=%d",
				 data_fin_seq, mpext->dsn64);
		}

		/* Adjust for DATA_FIN using 1 byte of sequence space */
		data_len--;
	}

	map_seq = mptcp_expand_seq(READ_ONCE(msk->ack_seq), mpext->data_seq, mpext->dsn64);
	WRITE_ONCE(mptcp_sk(subflow->conn)->use_64bit_ack, !!mpext->dsn64);

	if (subflow->map_valid) {
		/* Allow replacing only with an identical map */
		if (subflow->map_seq == map_seq &&
		    subflow->map_subflow_seq == mpext->subflow_seq &&
		    subflow->map_data_len == data_len &&
		    subflow->map_csum_reqd == mpext->csum_reqd) {
			skb_ext_del(skb, SKB_EXT_MPTCP);
			goto validate_csum;
		}

		/* If this skb data are fully covered by the current mapping,
		 * the new map would need caching, which is not supported
		 */
		if (skb_is_fully_mapped(ssk, skb)) {
			MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSNOMATCH);
			return MAPPING_INVALID;
		}

		/* will validate the next map after consuming the current one */
		goto validate_csum;
	}

	subflow->map_seq = map_seq;
	subflow->map_subflow_seq = mpext->subflow_seq;
	subflow->map_data_len = data_len;
	subflow->map_valid = 1;
	subflow->map_data_fin = mpext->data_fin;
	subflow->mpc_map = mpext->mpc_map;
	subflow->map_csum_reqd = mpext->csum_reqd;
	subflow->map_csum_len = 0;
	subflow->map_data_csum = csum_unfold(mpext->csum);

	/* Cfr RFC 8684 Section 3.3.0 */
	if (unlikely(subflow->map_csum_reqd != csum_reqd))
		return MAPPING_INVALID;

	pr_debug("new map seq=%llu subflow_seq=%u data_len=%u csum=%d:%u",
		 subflow->map_seq, subflow->map_subflow_seq,
		 subflow->map_data_len, subflow->map_csum_reqd,
		 subflow->map_data_csum);

validate_seq:
	/* we revalidate valid mapping on new skb, because we must ensure
	 * the current skb is completely covered by the available mapping
	 */
	if (!validate_mapping(ssk, skb)) {
		MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSTCPMISMATCH);
		return MAPPING_INVALID;
	}

	skb_ext_del(skb, SKB_EXT_MPTCP);

validate_csum:
	return validate_data_csum(ssk, skb, csum_reqd);
}
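
/* Drop up to 'limit' bytes of data that is already covered by the MPTCP-level
 * acknowledgment: advance copied_seq, free fully consumed skbs and invalidate
 * the mapping once it has been completely consumed.
 */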
static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb,
				       u64 limit)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	bool fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN;
	u32 incr;

	incr = limit >= skb->len ? skb->len + fin : limit;

	pr_debug("discarding=%d len=%d seq=%d", incr, skb->len,
		 subflow->map_subflow_seq);
	MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DUPDATA);
	tcp_sk(ssk)->copied_seq += incr;
	if (!before(tcp_sk(ssk)->copied_seq, TCP_SKB_CB(skb)->end_seq))
		sk_eat_skb(ssk, skb);
	if (mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len)
		subflow->map_valid = 0;
}

/* sched mptcp worker to remove the subflow if no more data is pending */
static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk)
{
	if (likely(ssk->sk_state != TCP_CLOSE))
		return;

	if (skb_queue_empty(&ssk->sk_receive_queue) &&
	    !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
		mptcp_schedule_work((struct sock *)msk);
}

static bool subflow_can_fallback(struct mptcp_subflow_context *subflow)
{
	struct mptcp_sock *msk = mptcp_sk(subflow->conn);

	if (subflow->mp_join)
		return false;
	else if (READ_ONCE(msk->csum_enabled))
		return !subflow->valid_csum_seen;
	else
		return !subflow->fully_established;
}

static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	unsigned long fail_tout;

	/* graceful failure can happen only on the MPC subflow */
	if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
		return;

	/* since the close timeout takes precedence over the fail one,
	 * no need to start the latter when the first is already set
	 */
	if (sock_flag((struct sock *)msk, SOCK_DEAD))
		return;

	/* we don't need extreme accuracy here, use a zero fail_tout as special
	 * value meaning no fail timeout at all
	 */
	fail_tout = jiffies + TCP_RTO_MAX;
	if (!fail_tout)
		fail_tout = 1;
	WRITE_ONCE(subflow->fail_tout, fail_tout);
	tcp_send_ack(ssk);

	mptcp_reset_tout_timer(msk, subflow->fail_tout);
}
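
/* Core of the subflow receive path: peek the receive queue, validate or
 * refresh the DSS mapping, discard data already acknowledged at the MPTCP
 * level and decide between data-available, no-data, reset or fallback.
 */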
static bool subflow_check_data_avail(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	enum mapping_status status;
	struct mptcp_sock *msk;
	struct sk_buff *skb;

	if (!skb_peek(&ssk->sk_receive_queue))
		WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
	if (subflow->data_avail)
		return true;

	msk = mptcp_sk(subflow->conn);
	for (;;) {
		u64 ack_seq;
		u64 old_ack;

		status = get_mapping_status(ssk, msk);
		trace_subflow_check_data_avail(status, skb_peek(&ssk->sk_receive_queue));
		if (unlikely(status == MAPPING_INVALID || status == MAPPING_DUMMY ||
			     status == MAPPING_BAD_CSUM))
			goto fallback;

		if (status != MAPPING_OK)
			goto no_data;

		skb = skb_peek(&ssk->sk_receive_queue);
		if (WARN_ON_ONCE(!skb))
			goto no_data;

		/* if msk lacks the remote key, this subflow must provide an
		 * MP_CAPABLE-based mapping
		 */
		if (unlikely(!READ_ONCE(msk->can_ack))) {
			if (!subflow->mpc_map)
				goto fallback;
			WRITE_ONCE(msk->remote_key, subflow->remote_key);
			WRITE_ONCE(msk->ack_seq, subflow->map_seq);
			WRITE_ONCE(msk->can_ack, true);
		}

		old_ack = READ_ONCE(msk->ack_seq);
		ack_seq = mptcp_subflow_get_mapped_dsn(subflow);
		pr_debug("msk ack_seq=%llx subflow ack_seq=%llx", old_ack,
			 ack_seq);
		if (unlikely(before64(ack_seq, old_ack))) {
			mptcp_subflow_discard_data(ssk, skb, old_ack - ack_seq);
			continue;
		}

		WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
		break;
	}
	return true;

no_data:
	subflow_sched_work_if_closed(msk, ssk);
	return false;

fallback:
	if (!__mptcp_check_fallback(msk)) {
		/* RFC 8684 section 3.7. */
		if (status == MAPPING_BAD_CSUM &&
		    (subflow->mp_join || subflow->valid_csum_seen)) {
			subflow->send_mp_fail = 1;

			if (!READ_ONCE(msk->allow_infinite_fallback)) {
				subflow->reset_transient = 0;
				subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
				goto reset;
			}
			mptcp_subflow_fail(msk, ssk);
			WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
			return true;
		}

		if (!subflow_can_fallback(subflow) && subflow->map_data_len) {
			/* fatal protocol error, close the socket.
			 * subflow_error_report() will introduce the appropriate barriers
			 */
			subflow->reset_transient = 0;
			subflow->reset_reason = MPTCP_RST_EMPTCP;

reset:
			WRITE_ONCE(ssk->sk_err, EBADMSG);
			tcp_set_state(ssk, TCP_CLOSE);
			while ((skb = skb_peek(&ssk->sk_receive_queue)))
				sk_eat_skb(ssk, skb);
			tcp_send_active_reset(ssk, GFP_ATOMIC);
			WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);
			return false;
		}

		mptcp_do_fallback(ssk);
	}

	skb = skb_peek(&ssk->sk_receive_queue);
	subflow->map_valid = 1;
	subflow->map_seq = READ_ONCE(msk->ack_seq);
	subflow->map_data_len = skb->len;
	subflow->map_subflow_seq = tcp_sk(ssk)->copied_seq - subflow->ssn_offset;
	WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_DATA_AVAIL);
	return true;
}

bool mptcp_subflow_data_available(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);

	/* check if current mapping is still valid */
	if (subflow->map_valid &&
	    mptcp_subflow_get_map_offset(subflow) >= subflow->map_data_len) {
		subflow->map_valid = 0;
		WRITE_ONCE(subflow->data_avail, MPTCP_SUBFLOW_NODATA);

		pr_debug("Done with mapping: seq=%u data_len=%u",
			 subflow->map_subflow_seq,
			 subflow->map_data_len);
	}

	return subflow_check_data_avail(sk);
}

/* If ssk has an mptcp parent socket, use the mptcp rcvbuf occupancy,
 * not the ssk one.
 *
 * In mptcp, rwin is about the mptcp-level connection data.
 *
 * Data that is still on the ssk rx queue can thus be ignored,
 * as far as the mptcp peer is concerned that data is still in flight.
 * DSS ACK is updated when skb is moved to the mptcp rx queue.
 */
void mptcp_space(const struct sock *ssk, int *space, int *full_space)
{
	const struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	const struct sock *sk = subflow->conn;

	*space = __mptcp_space(sk);
	*full_space = tcp_full_space(sk);
}

static void subflow_error_report(struct sock *ssk)
{
	struct sock *sk = mptcp_subflow_ctx(ssk)->conn;

	/* bail early if this is a no-op, so that we avoid introducing a
	 * problematic lockdep dependency between TCP accept queue lock
	 * and msk socket spinlock
	 */
	if (!sk->sk_socket)
		return;

	mptcp_data_lock(sk);
	if (!sock_owned_by_user(sk))
		__mptcp_error_report(sk);
	else
		__set_bit(MPTCP_ERROR_REPORT, &mptcp_sk(sk)->cb_flags);
	mptcp_data_unlock(sk);
}

static void subflow_data_ready(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	u16 state = 1 << inet_sk_state_load(sk);
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	msk = mptcp_sk(parent);
	if (state & TCPF_LISTEN) {
		/* MPJ subflows are removed from the accept queue before
		 * reaching here, avoid stray wakeups
		 */
		if (reqsk_queue_empty(&inet_csk(sk)->icsk_accept_queue))
			return;

		parent->sk_data_ready(parent);
		return;
	}

	WARN_ON_ONCE(!__mptcp_check_fallback(msk) && !subflow->mp_capable &&
		     !subflow->mp_join && !(state & TCPF_CLOSE));

	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
	else if (unlikely(sk->sk_err))
		subflow_error_report(sk);
}

static void subflow_write_space(struct sock *ssk)
{
	struct sock *sk = mptcp_subflow_ctx(ssk)->conn;

	mptcp_propagate_sndbuf(sk, ssk);
	mptcp_write_space(sk);
}

static const struct inet_connection_sock_af_ops *
subflow_default_af_ops(struct sock *sk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (sk->sk_family == AF_INET6)
		return &subflow_v6_specific;
#endif
	return &subflow_specific;
}

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
void mptcpv6_handle_mapped(struct sock *sk, bool mapped)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct inet_connection_sock_af_ops *target;

	target = mapped ? &subflow_v6m_specific : subflow_default_af_ops(sk);

	pr_debug("subflow=%p family=%d ops=%p target=%p mapped=%d",
		 subflow, sk->sk_family, icsk->icsk_af_ops, target, mapped);

	if (likely(icsk->icsk_af_ops == target))
		return;

	subflow->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = target;
}
#endif

void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
			 struct sockaddr_storage *addr,
			 unsigned short family)
{
	memset(addr, 0, sizeof(*addr));
	addr->ss_family = family;
	if (addr->ss_family == AF_INET) {
		struct sockaddr_in *in_addr = (struct sockaddr_in *)addr;

		if (info->family == AF_INET)
			in_addr->sin_addr = info->addr;
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
		else if (ipv6_addr_v4mapped(&info->addr6))
			in_addr->sin_addr.s_addr = info->addr6.s6_addr32[3];
#endif
		in_addr->sin_port = info->port;
	}
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	else if (addr->ss_family == AF_INET6) {
		struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)addr;

		if (info->family == AF_INET)
			ipv6_addr_set_v4mapped(info->addr.s_addr,
					       &in6_addr->sin6_addr);
		else
			in6_addr->sin6_addr = info->addr6;
		in6_addr->sin6_port = info->port;
	}
#endif
}
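
/* Create and connect an additional subflow from local address 'loc' towards
 * 'remote': allocates the kernel TCP socket, seeds the MP_JOIN material
 * (keys, token, nonce, address ids) and starts a non-blocking connect.
 */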
int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_addr_info *loc,
			    const struct mptcp_addr_info *remote)
{
	struct mptcp_sock *msk = mptcp_sk(sk);
	struct mptcp_subflow_context *subflow;
	struct sockaddr_storage addr;
	int remote_id = remote->id;
	int local_id = loc->id;
	int err = -ENOTCONN;
	struct socket *sf;
	struct sock *ssk;
	u32 remote_token;
	int addrlen;
	int ifindex;
	u8 flags;

	if (!mptcp_is_fully_established(sk))
		goto err_out;

	err = mptcp_subflow_create_socket(sk, loc->family, &sf);
	if (err)
		goto err_out;

	ssk = sf->sk;
	subflow = mptcp_subflow_ctx(ssk);
	do {
		get_random_bytes(&subflow->local_nonce, sizeof(u32));
	} while (!subflow->local_nonce);

	if (local_id)
		subflow_set_local_id(subflow, local_id);

	mptcp_pm_get_flags_and_ifindex_by_id(msk, local_id,
					     &flags, &ifindex);
	subflow->remote_key = msk->remote_key;
	subflow->local_key = msk->local_key;
	subflow->token = msk->token;
	mptcp_info2sockaddr(loc, &addr, ssk->sk_family);

	addrlen = sizeof(struct sockaddr_in);
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (addr.ss_family == AF_INET6)
		addrlen = sizeof(struct sockaddr_in6);
#endif
	mptcp_sockopt_sync(msk, ssk);

	ssk->sk_bound_dev_if = ifindex;
	err = kernel_bind(sf, (struct sockaddr *)&addr, addrlen);
	if (err)
		goto failed;

	mptcp_crypto_key_sha(subflow->remote_key, &remote_token, NULL);
	pr_debug("msk=%p remote_token=%u local_id=%d remote_id=%d", msk,
		 remote_token, local_id, remote_id);
	subflow->remote_token = remote_token;
	subflow->remote_id = remote_id;
	subflow->request_join = 1;
	subflow->request_bkup = !!(flags & MPTCP_PM_ADDR_FLAG_BACKUP);
	mptcp_info2sockaddr(remote, &addr, ssk->sk_family);

	sock_hold(ssk);
	list_add_tail(&subflow->node, &msk->conn_list);
	err = kernel_connect(sf, (struct sockaddr *)&addr, addrlen, O_NONBLOCK);
	if (err && err != -EINPROGRESS)
		goto failed_unlink;

	/* discard the subflow socket */
	mptcp_sock_graft(ssk, sk->sk_socket);
	iput(SOCK_INODE(sf));
	WRITE_ONCE(msk->allow_infinite_fallback, false);
	mptcp_stop_tout_timer(sk);
	return 0;

failed_unlink:
	list_del(&subflow->node);
	sock_put(mptcp_subflow_tcp_sock(subflow));

failed:
	subflow->disposable = 1;
	sock_release(sf);

err_out:
	/* we account subflows before the creation, and these failures will not
	 * be caught by sk_state_change()
	 */
	mptcp_pm_close_subflow(msk);
	return err;
}

static void mptcp_attach_cgroup(struct sock *parent, struct sock *child)
{
#ifdef CONFIG_SOCK_CGROUP_DATA
	struct sock_cgroup_data *parent_skcd = &parent->sk_cgrp_data,
				*child_skcd = &child->sk_cgrp_data;

	/* only the additional subflows created by kworkers have to be modified */
	if (cgroup_id(sock_cgroup_ptr(parent_skcd)) !=
	    cgroup_id(sock_cgroup_ptr(child_skcd))) {
#ifdef CONFIG_MEMCG
		struct mem_cgroup *memcg = parent->sk_memcg;

		mem_cgroup_sk_free(child);
		if (memcg && css_tryget(&memcg->css))
			child->sk_memcg = memcg;
#endif /* CONFIG_MEMCG */

		cgroup_sk_free(child_skcd);
		*child_skcd = *parent_skcd;
		cgroup_sk_clone(child_skcd);
	}
#endif /* CONFIG_SOCK_CGROUP_DATA */
}

static void mptcp_subflow_ops_override(struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (ssk->sk_prot == &tcpv6_prot)
		ssk->sk_prot = &tcpv6_prot_override;
	else
#endif
		ssk->sk_prot = &tcp_prot_override;
}

static void mptcp_subflow_ops_undo_override(struct sock *ssk)
{
#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	if (ssk->sk_prot == &tcpv6_prot_override)
		ssk->sk_prot = &tcpv6_prot;
	else
#endif
		ssk->sk_prot = &tcp_prot;
}
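
/* Allocate the kernel TCP socket backing a new subflow, attach the "mptcp"
 * ULP and tie its cgroup, net refcount and inode ownership to the parent
 * MPTCP socket.
 */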
int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
				struct socket **new_sock)
{
	struct mptcp_subflow_context *subflow;
	struct net *net = sock_net(sk);
	struct socket *sf;
	int err;

	/* un-accepted server sockets can reach here - on bad configuration
	 * bail early to avoid greater trouble later
	 */
	if (unlikely(!sk->sk_socket))
		return -EINVAL;

	err = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP, &sf);
	if (err)
		return err;

	lock_sock_nested(sf->sk, SINGLE_DEPTH_NESTING);

	/* the newly created socket has to be in the same cgroup as its parent */
	mptcp_attach_cgroup(sk, sf->sk);

	/* kernel sockets do not by default acquire net ref, but TCP timer
	 * needs it.
	 */
	sf->sk->sk_net_refcnt = 1;
	get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL);
	sock_inuse_add(net, 1);
	err = tcp_set_ulp(sf->sk, "mptcp");
	release_sock(sf->sk);

	if (err) {
		sock_release(sf);
		return err;
	}

	/* the newly created socket really belongs to the owning MPTCP master
	 * socket, even if for additional subflows the allocation is performed
	 * by a kernel workqueue. Adjust inode references, so that the
	 * procfs/diag interfaces really show this one belonging to the correct
	 * user.
	 */
	SOCK_INODE(sf)->i_ino = SOCK_INODE(sk->sk_socket)->i_ino;
	SOCK_INODE(sf)->i_uid = SOCK_INODE(sk->sk_socket)->i_uid;
	SOCK_INODE(sf)->i_gid = SOCK_INODE(sk->sk_socket)->i_gid;

	subflow = mptcp_subflow_ctx(sf->sk);
	pr_debug("subflow=%p", subflow);

	*new_sock = sf;
	sock_hold(sk);
	subflow->conn = sk;
	mptcp_subflow_ops_override(sf->sk);

	return 0;
}
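
/* Allocate the per-subflow MPTCP context, publish it via the ICSK ULP data
 * pointer and record the backing TCP socket. @priority is chosen by the
 * caller: GFP_KERNEL from subflow_ulp_init(), while the request-sock clone
 * path passes through its own gfp flags.
 */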
static struct mptcp_subflow_context *subflow_create_ctx(struct sock *sk,
							gfp_t priority)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;

	ctx = kzalloc(sizeof(*ctx), priority);
	if (!ctx)
		return NULL;

	rcu_assign_pointer(icsk->icsk_ulp_data, ctx);
	INIT_LIST_HEAD(&ctx->node);
	INIT_LIST_HEAD(&ctx->delegated_node);

	pr_debug("subflow=%p", ctx);

	ctx->tcp_sock = sk;

	return ctx;
}
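
/* Wake up any process sleeping on the subflow's wait queue. This provides
 * the wakeup normally done by the default sk_state_change() callback, which
 * subflow_state_change() below has replaced.
 */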
static void __subflow_state_change(struct sock *sk)
{
	struct socket_wq *wq;

	rcu_read_lock();
	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static bool subflow_is_done(const struct sock *sk)
{
	return sk->sk_shutdown & RCV_SHUTDOWN || sk->sk_state == TCP_CLOSE;
}
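
/* sk_state_change() callback installed on every subflow: propagate the
 * state transition to the owning MPTCP socket, handling fallback on
 * simultaneous connect and re-checking for pending data or errors that the
 * MPTCP-level machinery could otherwise miss.
 */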
static void subflow_state_change(struct sock *sk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
	struct sock *parent = subflow->conn;
	struct mptcp_sock *msk;

	__subflow_state_change(sk);

	msk = mptcp_sk(parent);
	if (subflow_simultaneous_connect(sk)) {
		mptcp_propagate_sndbuf(parent, sk);
		mptcp_do_fallback(sk);
		mptcp_rcv_space_init(msk, sk);
		pr_fallback(msk);
		subflow->conn_finished = 1;
		mptcp_set_connected(parent);
	}

	/* as recvmsg() does not acquire the subflow socket for ssk selection,
	 * a FIN packet carrying a DSS can go unnoticed if we don't trigger
	 * the data-available machinery here.
	 */
	if (mptcp_subflow_data_available(sk))
		mptcp_data_ready(parent, sk);
	else if (unlikely(sk->sk_err))
		subflow_error_report(sk);

	subflow_sched_work_if_closed(mptcp_sk(parent), sk);

	/* when the fallback subflow closes the rx side, trigger a 'dummy'
	 * ingress data fin, so that the msk state will follow along
	 */
	if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk &&
	    mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true))
		mptcp_schedule_work(parent);
}
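
/* Invoked while closing an MPTCP listener: force-close the MPTCP sockets
 * still sitting, unaccepted, in the TCP-level accept queue of
 * @listener_ssk. The queue is temporarily spliced out so the listener lock
 * can be dropped without accept() racing against the cleanup.
 */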
void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
{
	struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue;
	struct request_sock *req, *head, *tail;
	struct mptcp_subflow_context *subflow;
	struct sock *sk, *ssk;

	/* Due to lock dependencies no relevant lock can be acquired under rskq_lock.
	 * Splice the req list, so that accept() can not reach the pending ssk after
	 * the listener socket is released below.
	 */
	spin_lock_bh(&queue->rskq_lock);
	head = queue->rskq_accept_head;
	tail = queue->rskq_accept_tail;
	queue->rskq_accept_head = NULL;
	queue->rskq_accept_tail = NULL;
	spin_unlock_bh(&queue->rskq_lock);

	if (!head)
		return;

	/* can't acquire the msk socket lock under the subflow one,
	 * or will cause ABBA deadlock
	 */
	release_sock(listener_ssk);

	for (req = head; req; req = req->dl_next) {
		ssk = req->sk;
		if (!sk_is_mptcp(ssk))
			continue;

		subflow = mptcp_subflow_ctx(ssk);
		if (!subflow || !subflow->conn)
			continue;

		sk = subflow->conn;
		sock_hold(sk);

		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
		__mptcp_unaccepted_force_close(sk);
		release_sock(sk);

		/* lockdep will report a false positive ABBA deadlock
		 * between cancel_work_sync and the listener socket.
		 * The involved locks belong to different sockets WRT
		 * the existing AB chain.
		 * Using a per socket key is problematic as key
		 * deregistration requires process context and must be
		 * performed at socket disposal time, in atomic
		 * context.
		 * Just tell lockdep to consider the listener socket
		 * released here.
		 */
		mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_);
		mptcp_cancel_work(sk);
		mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_);

		sock_put(sk);
	}

	/* we are still under the listener msk socket lock */
	lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING);

	/* restore the listener queue, to let the TCP code clean it up */
	spin_lock_bh(&queue->rskq_lock);
	WARN_ON_ONCE(queue->rskq_accept_head);
	queue->rskq_accept_head = head;
	queue->rskq_accept_tail = tail;
	spin_unlock_bh(&queue->rskq_lock);
}
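
/* ULP ->init() hook: attach the MPTCP subflow context to a freshly created
 * kernel TCP socket and divert its icsk_af_ops and sk callbacks to the
 * subflow variants. Only kernel sockets (sk_kern_sock) are accepted, so
 * userspace cannot attach this ULP directly via setsockopt().
 */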
static int subflow_ulp_init(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct mptcp_subflow_context *ctx;
	struct tcp_sock *tp = tcp_sk(sk);
	int err = 0;

	/* disallow attaching ULP to a socket unless it has been
	 * created with sock_create_kern()
	 */
	if (!sk->sk_kern_sock) {
		err = -EOPNOTSUPP;
		goto out;
	}

	ctx = subflow_create_ctx(sk, GFP_KERNEL);
	if (!ctx) {
		err = -ENOMEM;
		goto out;
	}

	pr_debug("subflow=%p, family=%d", ctx, sk->sk_family);

	tp->is_mptcp = 1;
	ctx->icsk_af_ops = icsk->icsk_af_ops;
	icsk->icsk_af_ops = subflow_default_af_ops(sk);
	ctx->tcp_state_change = sk->sk_state_change;
	ctx->tcp_error_report = sk->sk_error_report;

	WARN_ON_ONCE(sk->sk_data_ready != sock_def_readable);
	WARN_ON_ONCE(sk->sk_write_space != sk_stream_write_space);

	sk->sk_data_ready = subflow_data_ready;
	sk->sk_write_space = subflow_write_space;
	sk->sk_state_change = subflow_state_change;
	sk->sk_error_report = subflow_error_report;
out:
	return err;
}
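
/* ULP ->release() hook: drop the reference on the owning MPTCP socket,
 * restore the original proto ops and free the context, unless the subflow
 * is still linked into the msk conn_list, in which case __mptcp_close_ssk()
 * will dispose of it later.
 */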
static void subflow_ulp_release(struct sock *ssk)
{
	struct mptcp_subflow_context *ctx = mptcp_subflow_ctx(ssk);
	bool release = true;
	struct sock *sk;

	if (!ctx)
		return;

	sk = ctx->conn;
	if (sk) {
		/* if the msk has been orphaned, keep the ctx
		 * alive, will be freed by __mptcp_close_ssk(),
		 * when the subflow is still unaccepted
		 */
		release = ctx->disposable || list_empty(&ctx->node);

		/* inet_child_forget() does not call sk_state_change(),
		 * explicitly trigger the socket close machinery
		 */
		if (!release && !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW,
						  &mptcp_sk(sk)->flags))
			mptcp_schedule_work(sk);
		sock_put(sk);
	}

	mptcp_subflow_ops_undo_override(ssk);
	if (release)
		kfree_rcu(ctx, rcu);
}
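
/* ULP ->clone() hook, invoked when a request sock graduates to a full
 * socket: build a new subflow context for the child and seed it with the
 * MP_CAPABLE or MP_JOIN state gathered during the handshake, falling back
 * to plain TCP when neither option was negotiated or on allocation failure.
 */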
static void subflow_ulp_clone(const struct request_sock *req,
			      struct sock *newsk,
			      const gfp_t priority)
{
	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
	struct mptcp_subflow_context *old_ctx = mptcp_subflow_ctx(newsk);
	struct mptcp_subflow_context *new_ctx;

	if (!tcp_rsk(req)->is_mptcp ||
	    (!subflow_req->mp_capable && !subflow_req->mp_join)) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx = subflow_create_ctx(newsk, priority);
	if (!new_ctx) {
		subflow_ulp_fallback(newsk, old_ctx);
		return;
	}

	new_ctx->conn_finished = 1;
	new_ctx->icsk_af_ops = old_ctx->icsk_af_ops;
	new_ctx->tcp_state_change = old_ctx->tcp_state_change;
	new_ctx->tcp_error_report = old_ctx->tcp_error_report;
	new_ctx->rel_write_seq = 1;
	new_ctx->tcp_sock = newsk;

	if (subflow_req->mp_capable) {
		/* see comments in subflow_syn_recv_sock(), MPTCP connection
		 * is fully established only after we receive the remote key
		 */
		new_ctx->mp_capable = 1;
		new_ctx->local_key = subflow_req->local_key;
		new_ctx->token = subflow_req->token;
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->idsn = subflow_req->idsn;

		/* this is the first subflow, id is always 0 */
		new_ctx->local_id_valid = 1;
	} else if (subflow_req->mp_join) {
		new_ctx->ssn_offset = subflow_req->ssn_offset;
		new_ctx->mp_join = 1;
		new_ctx->fully_established = 1;
		new_ctx->backup = subflow_req->backup;
		new_ctx->remote_id = subflow_req->remote_id;
		new_ctx->token = subflow_req->token;
		new_ctx->thmac = subflow_req->thmac;

		/* the subflow req id is valid, fetched via subflow_check_req()
		 * and subflow_token_join_request()
		 */
		subflow_set_local_id(new_ctx, subflow_req->local_id);
	}
}
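
/* release_cb for the overridden tcp{,v6}_prot: atomically fetch-and-clear
 * the delegated action bits and process them from the socket-lock release
 * path, then hand over to the regular tcp_release_cb().
 */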
static void tcp_release_cb_override(struct sock *ssk)
{
	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
	long status;

	/* process and clear all the pending actions, but leave the subflow into
	 * the napi queue. To respect locking, only the same CPU that originated
	 * the action can touch the list. mptcp_napi_poll will take care of it.
	 */
	status = set_mask_bits(&subflow->delegated_status, MPTCP_DELEGATE_ACTIONS_MASK, 0);
	if (status)
		mptcp_subflow_process_delegated(ssk, status);

	tcp_release_cb(ssk);
}

static struct tcp_ulp_ops subflow_ulp_ops __read_mostly = {
	.name		= "mptcp",
	.owner		= THIS_MODULE,
	.init		= subflow_ulp_init,
	.release	= subflow_ulp_release,
	.clone		= subflow_ulp_clone,
};
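
/* Create the slab cache backing the MPTCP request socks; the caller passes
 * a request_sock_ops copy with slab_name already set for the v4 or v6
 * variant.
 */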
static int subflow_ops_init(struct request_sock_ops *subflow_ops)
{
	subflow_ops->obj_size = sizeof(struct mptcp_subflow_request_sock);

	subflow_ops->slab = kmem_cache_create(subflow_ops->slab_name,
					      subflow_ops->obj_size, 0,
					      SLAB_ACCOUNT |
					      SLAB_TYPESAFE_BY_RCU,
					      NULL);
	if (!subflow_ops->slab)
		return -ENOMEM;

	return 0;
}
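
/* Boot-time setup: clone the stock TCP request_sock and af-specific ops,
 * patch in the MPTCP handlers (conn_request, syn_recv_sock, rebuild_header,
 * release_cb, ...) and register the "mptcp" ULP. The v6m variant reuses the
 * IPv4 transmit helpers, presumably for subflows over v4-mapped-v6
 * addresses.
 */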
void __init mptcp_subflow_init(void)
{
	mptcp_subflow_v4_request_sock_ops = tcp_request_sock_ops;
	mptcp_subflow_v4_request_sock_ops.slab_name = "request_sock_subflow_v4";
	mptcp_subflow_v4_request_sock_ops.destructor = subflow_v4_req_destructor;

	if (subflow_ops_init(&mptcp_subflow_v4_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow v4 request sock ops\n");

	subflow_request_sock_ipv4_ops = tcp_request_sock_ipv4_ops;
	subflow_request_sock_ipv4_ops.route_req = subflow_v4_route_req;

	subflow_specific = ipv4_specific;
	subflow_specific.conn_request = subflow_v4_conn_request;
	subflow_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_specific.rebuild_header = subflow_rebuild_header;

	tcp_prot_override = tcp_prot;
	tcp_prot_override.release_cb = tcp_release_cb_override;

#if IS_ENABLED(CONFIG_MPTCP_IPV6)
	/* In struct mptcp_subflow_request_sock, we assume the TCP request sock
	 * structures for v4 and v6 have the same size. This should not change
	 * in the future, but better to make sure we are warned if it is no
	 * longer the case.
	 */
	BUILD_BUG_ON(sizeof(struct tcp_request_sock) != sizeof(struct tcp6_request_sock));

	mptcp_subflow_v6_request_sock_ops = tcp6_request_sock_ops;
	mptcp_subflow_v6_request_sock_ops.slab_name = "request_sock_subflow_v6";
	mptcp_subflow_v6_request_sock_ops.destructor = subflow_v6_req_destructor;

	if (subflow_ops_init(&mptcp_subflow_v6_request_sock_ops) != 0)
		panic("MPTCP: failed to init subflow v6 request sock ops\n");

	subflow_request_sock_ipv6_ops = tcp_request_sock_ipv6_ops;
	subflow_request_sock_ipv6_ops.route_req = subflow_v6_route_req;

	subflow_v6_specific = ipv6_specific;
	subflow_v6_specific.conn_request = subflow_v6_conn_request;
	subflow_v6_specific.syn_recv_sock = subflow_syn_recv_sock;
	subflow_v6_specific.sk_rx_dst_set = subflow_finish_connect;
	subflow_v6_specific.rebuild_header = subflow_v6_rebuild_header;

	subflow_v6m_specific = subflow_v6_specific;
	subflow_v6m_specific.queue_xmit = ipv4_specific.queue_xmit;
	subflow_v6m_specific.send_check = ipv4_specific.send_check;
	subflow_v6m_specific.net_header_len = ipv4_specific.net_header_len;
	subflow_v6m_specific.mtu_reduced = ipv4_specific.mtu_reduced;
	subflow_v6m_specific.net_frag_header_len = 0;
	subflow_v6m_specific.rebuild_header = subflow_rebuild_header;

	tcpv6_prot_override = tcpv6_prot;
	tcpv6_prot_override.release_cb = tcp_release_cb_override;
#endif

	mptcp_diag_subflow_init(&subflow_ulp_ops);

	if (tcp_register_ulp(&subflow_ulp_ops) != 0)
		panic("MPTCP: failed to register subflows to ULP\n");
}