// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
/* -
 * net/sched/act_ct.c  Connection Tracking action
 *
 * Authors:   Paul Blakey <[email protected]>
 *            Yossi Kuperman <[email protected]>
 *            Marcelo Ricardo Leitner <[email protected]>
 */

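/* Example usage (illustrative only, via the iproute2 "tc" frontend; device
 * names and zone/chain numbers below are arbitrary):
 *
 *   tc filter add dev eth0 ingress prio 1 chain 0 proto ip flower \
 *           ct_state -trk \
 *           action ct zone 1 pipe \
 *           action goto chain 1
 *   tc filter add dev eth0 ingress prio 1 chain 1 proto ip flower \
 *           ct_state +trk+new \
 *           action ct zone 1 commit pipe \
 *           action mirred egress redirect dev eth1
 */
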
#include <linux/module.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/pkt_cls.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/rhashtable.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/act_api.h>
#include <net/ip.h>
#include <net/ipv6_frag.h>
#include <uapi/linux/tc_act/tc_ct.h>
#include <net/tc_act/tc_ct.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
#include <net/netfilter/nf_conntrack_act_ct.h>
#include <uapi/linux/netfilter/nf_nat.h>

static struct workqueue_struct *act_ct_wq;
static struct rhashtable zones_ht;
static DEFINE_MUTEX(zones_mutex);

struct tcf_ct_flow_table {
	struct rhash_head node; /* In zones tables */

	struct rcu_work rwork;
	struct nf_flowtable nf_ft;
	refcount_t ref;
	u16 zone;

	bool dying;
};

static const struct rhashtable_params zones_params = {
	.head_offset = offsetof(struct tcf_ct_flow_table, node),
	.key_offset = offsetof(struct tcf_ct_flow_table, zone),
	.key_len = sizeof_field(struct tcf_ct_flow_table, zone),
	.automatic_shrinking = true,
};

static struct flow_action_entry *
tcf_ct_flow_table_flow_action_get_next(struct flow_action *flow_action)
{
	int i = flow_action->num_entries++;

	return &flow_action->entries[i];
}

static void tcf_ct_add_mangle_action(struct flow_action *action,
				     enum flow_action_mangle_base htype,
				     u32 offset,
				     u32 mask,
				     u32 val)
{
	struct flow_action_entry *entry;

	entry = tcf_ct_flow_table_flow_action_get_next(action);
	entry->id = FLOW_ACTION_MANGLE;
	entry->mangle.htype = htype;
	entry->mangle.mask = ~mask;
	entry->mangle.offset = offset;
	entry->mangle.val = val;
}

/* The following nat helper functions check if the inverted reverse tuple
 * (target) is different from the current dir tuple - meaning nat for ports
 * and/or ip is needed, and add the relevant mangle actions.
 */
static void
tcf_ct_flow_table_add_action_nat_ipv4(const struct nf_conntrack_tuple *tuple,
				      struct nf_conntrack_tuple target,
				      struct flow_action *action)
{
	if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
					 offsetof(struct iphdr, saddr),
					 0xFFFFFFFF,
					 be32_to_cpu(target.src.u3.ip));
	if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP4,
					 offsetof(struct iphdr, daddr),
					 0xFFFFFFFF,
					 be32_to_cpu(target.dst.u3.ip));
}

static void
tcf_ct_add_ipv6_addr_mangle_action(struct flow_action *action,
				   union nf_inet_addr *addr,
				   u32 offset)
{
	int i;

	for (i = 0; i < sizeof(struct in6_addr) / sizeof(u32); i++)
		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_IP6,
					 i * sizeof(u32) + offset,
					 0xFFFFFFFF, be32_to_cpu(addr->ip6[i]));
}

static void
tcf_ct_flow_table_add_action_nat_ipv6(const struct nf_conntrack_tuple *tuple,
				      struct nf_conntrack_tuple target,
				      struct flow_action *action)
{
	if (memcmp(&target.src.u3, &tuple->src.u3, sizeof(target.src.u3)))
		tcf_ct_add_ipv6_addr_mangle_action(action, &target.src.u3,
						   offsetof(struct ipv6hdr,
							    saddr));
	if (memcmp(&target.dst.u3, &tuple->dst.u3, sizeof(target.dst.u3)))
		tcf_ct_add_ipv6_addr_mangle_action(action, &target.dst.u3,
						   offsetof(struct ipv6hdr,
							    daddr));
}

static void
tcf_ct_flow_table_add_action_nat_tcp(const struct nf_conntrack_tuple *tuple,
				     struct nf_conntrack_tuple target,
				     struct flow_action *action)
{
	__be16 target_src = target.src.u.tcp.port;
	__be16 target_dst = target.dst.u.tcp.port;

	if (target_src != tuple->src.u.tcp.port)
		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
					 offsetof(struct tcphdr, source),
					 0xFFFF, be16_to_cpu(target_src));
	if (target_dst != tuple->dst.u.tcp.port)
		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_TCP,
					 offsetof(struct tcphdr, dest),
					 0xFFFF, be16_to_cpu(target_dst));
}

static void
tcf_ct_flow_table_add_action_nat_udp(const struct nf_conntrack_tuple *tuple,
				     struct nf_conntrack_tuple target,
				     struct flow_action *action)
{
	__be16 target_src = target.src.u.udp.port;
	__be16 target_dst = target.dst.u.udp.port;

	if (target_src != tuple->src.u.udp.port)
		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
					 offsetof(struct udphdr, source),
					 0xFFFF, be16_to_cpu(target_src));
	if (target_dst != tuple->dst.u.udp.port)
		tcf_ct_add_mangle_action(action, FLOW_ACT_MANGLE_HDR_TYPE_UDP,
					 offsetof(struct udphdr, dest),
					 0xFFFF, be16_to_cpu(target_dst));
}

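/* Fill the FLOW_ACTION_CT_METADATA entry (mark, labels, direction and the
 * ct/ctinfo cookie) describing the conntrack state of an offloaded flow.
 */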
static void tcf_ct_flow_table_add_action_meta(struct nf_conn *ct,
					      enum ip_conntrack_dir dir,
					      struct flow_action *action)
{
	struct nf_conn_labels *ct_labels;
	struct flow_action_entry *entry;
	enum ip_conntrack_info ctinfo;
	u32 *act_ct_labels;

	entry = tcf_ct_flow_table_flow_action_get_next(action);
	entry->id = FLOW_ACTION_CT_METADATA;
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
	entry->ct_metadata.mark = READ_ONCE(ct->mark);
#endif
	ctinfo = dir == IP_CT_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
					     IP_CT_ESTABLISHED_REPLY;
	/* aligns with the CT reference on the SKB nf_ct_set */
	entry->ct_metadata.cookie = (unsigned long)ct | ctinfo;
	entry->ct_metadata.orig_dir = dir == IP_CT_DIR_ORIGINAL;

	act_ct_labels = entry->ct_metadata.labels;
	ct_labels = nf_ct_labels_find(ct);
	if (ct_labels)
		memcpy(act_ct_labels, ct_labels->bits, NF_CT_LABELS_MAX_SIZE);
	else
		memset(act_ct_labels, 0, NF_CT_LABELS_MAX_SIZE);
}

static int tcf_ct_flow_table_add_action_nat(struct net *net,
					    struct nf_conn *ct,
					    enum ip_conntrack_dir dir,
					    struct flow_action *action)
{
	const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
	struct nf_conntrack_tuple target;

	if (!(ct->status & IPS_NAT_MASK))
		return 0;

	nf_ct_invert_tuple(&target, &ct->tuplehash[!dir].tuple);

	switch (tuple->src.l3num) {
	case NFPROTO_IPV4:
		tcf_ct_flow_table_add_action_nat_ipv4(tuple, target,
						      action);
		break;
	case NFPROTO_IPV6:
		tcf_ct_flow_table_add_action_nat_ipv6(tuple, target,
						      action);
		break;
	default:
		return -EOPNOTSUPP;
	}

	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		tcf_ct_flow_table_add_action_nat_tcp(tuple, target, action);
		break;
	case IPPROTO_UDP:
		tcf_ct_flow_table_add_action_nat_udp(tuple, target, action);
		break;
	default:
		return -EOPNOTSUPP;
	}

	return 0;
}

static int tcf_ct_flow_table_fill_actions(struct net *net,
					  const struct flow_offload *flow,
					  enum flow_offload_tuple_dir tdir,
					  struct nf_flow_rule *flow_rule)
{
	struct flow_action *action = &flow_rule->rule->action;
	int num_entries = action->num_entries;
	struct nf_conn *ct = flow->ct;
	enum ip_conntrack_dir dir;
	int i, err;

	switch (tdir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		dir = IP_CT_DIR_ORIGINAL;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		dir = IP_CT_DIR_REPLY;
		break;
	default:
		return -EOPNOTSUPP;
	}

	err = tcf_ct_flow_table_add_action_nat(net, ct, dir, action);
	if (err)
		goto err_nat;

	tcf_ct_flow_table_add_action_meta(ct, dir, action);
	return 0;

err_nat:
	/* Clear filled actions */
	for (i = num_entries; i < action->num_entries; i++)
		memset(&action->entries[i], 0, sizeof(action->entries[i]));
	action->num_entries = num_entries;

	return err;
}

static struct nf_flowtable_type flowtable_ct = {
	.action		= tcf_ct_flow_table_fill_actions,
	.owner		= THIS_MODULE,
};

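/* Get (or create) the nf_flowtable for the given conntrack zone. Flow tables
 * are shared by all ct actions of the same zone via the zones_ht rhashtable
 * and are reference counted.
 */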
static int tcf_ct_flow_table_get(struct net *net, struct tcf_ct_params *params)
{
	struct tcf_ct_flow_table *ct_ft;
	int err = -ENOMEM;

	mutex_lock(&zones_mutex);
	ct_ft = rhashtable_lookup_fast(&zones_ht, &params->zone, zones_params);
	if (ct_ft && refcount_inc_not_zero(&ct_ft->ref))
		goto out_unlock;

	ct_ft = kzalloc(sizeof(*ct_ft), GFP_KERNEL);
	if (!ct_ft)
		goto err_alloc;
	refcount_set(&ct_ft->ref, 1);

	ct_ft->zone = params->zone;
	err = rhashtable_insert_fast(&zones_ht, &ct_ft->node, zones_params);
	if (err)
		goto err_insert;

	ct_ft->nf_ft.type = &flowtable_ct;
	ct_ft->nf_ft.flags |= NF_FLOWTABLE_HW_OFFLOAD |
			      NF_FLOWTABLE_COUNTER;
	err = nf_flow_table_init(&ct_ft->nf_ft);
	if (err)
		goto err_init;
	write_pnet(&ct_ft->nf_ft.net, net);

	__module_get(THIS_MODULE);
out_unlock:
	params->ct_ft = ct_ft;
	params->nf_ft = &ct_ft->nf_ft;
	mutex_unlock(&zones_mutex);

	return 0;

err_init:
	rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
err_insert:
	kfree(ct_ft);
err_alloc:
	mutex_unlock(&zones_mutex);
	return err;
}

static void tcf_ct_flow_table_cleanup_work(struct work_struct *work)
{
	struct flow_block_cb *block_cb, *tmp_cb;
	struct tcf_ct_flow_table *ct_ft;
	struct flow_block *block;

	ct_ft = container_of(to_rcu_work(work), struct tcf_ct_flow_table,
			     rwork);
	nf_flow_table_free(&ct_ft->nf_ft);

	/* Remove any remaining callbacks before cleanup */
	block = &ct_ft->nf_ft.flow_block;
	down_write(&ct_ft->nf_ft.flow_block_lock);
	list_for_each_entry_safe(block_cb, tmp_cb, &block->cb_list, list) {
		list_del(&block_cb->list);
		flow_block_cb_free(block_cb);
	}
	up_write(&ct_ft->nf_ft.flow_block_lock);
	kfree(ct_ft);

	module_put(THIS_MODULE);
}

static void tcf_ct_flow_table_put(struct tcf_ct_params *params)
{
	struct tcf_ct_flow_table *ct_ft = params->ct_ft;

	if (refcount_dec_and_test(&params->ct_ft->ref)) {
		rhashtable_remove_fast(&zones_ht, &ct_ft->node, zones_params);
		INIT_RCU_WORK(&ct_ft->rwork, tcf_ct_flow_table_cleanup_work);
		queue_rcu_work(act_ct_wq, &ct_ft->rwork);
	}
}

static void tcf_ct_flow_tc_ifidx(struct flow_offload *entry,
				 struct nf_conn_act_ct_ext *act_ct_ext, u8 dir)
{
	entry->tuplehash[dir].tuple.xmit_type = FLOW_OFFLOAD_XMIT_TC;
	entry->tuplehash[dir].tuple.tc.iifidx = act_ct_ext->ifindex[dir];
}

static void tcf_ct_flow_table_add(struct tcf_ct_flow_table *ct_ft,
				  struct nf_conn *ct,
				  bool tcp)
{
	struct nf_conn_act_ct_ext *act_ct_ext;
	struct flow_offload *entry;
	int err;

	if (test_and_set_bit(IPS_OFFLOAD_BIT, &ct->status))
		return;

	entry = flow_offload_alloc(ct);
	if (!entry) {
		WARN_ON_ONCE(1);
		goto err_alloc;
	}

	if (tcp) {
		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
		ct->proto.tcp.seen[1].flags |= IP_CT_TCP_FLAG_BE_LIBERAL;
	}

	act_ct_ext = nf_conn_act_ct_ext_find(ct);
	if (act_ct_ext) {
		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_ORIGINAL);
		tcf_ct_flow_tc_ifidx(entry, act_ct_ext, FLOW_OFFLOAD_DIR_REPLY);
	}

	err = flow_offload_add(&ct_ft->nf_ft, entry);
	if (err)
		goto err_add;

	return;

err_add:
	flow_offload_free(entry);
err_alloc:
	clear_bit(IPS_OFFLOAD_BIT, &ct->status);
}

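/* Offload a connection to the zone's flow table once it is established and
 * assured. Only TCP, UDP and (without NAT) GRE are considered; connections
 * using helpers or sequence adjustment are skipped.
 */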
static void tcf_ct_flow_table_process_conn(struct tcf_ct_flow_table *ct_ft,
					   struct nf_conn *ct,
					   enum ip_conntrack_info ctinfo)
{
	bool tcp = false;

	if ((ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) ||
	    !test_bit(IPS_ASSURED_BIT, &ct->status))
		return;

	switch (nf_ct_protonum(ct)) {
	case IPPROTO_TCP:
		tcp = true;

		if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED)
			return;
		break;
	case IPPROTO_UDP:
		break;
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE: {
		struct nf_conntrack_tuple *tuple;

		if (ct->status & IPS_NAT_MASK)
			return;

		tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
		/* No support for GRE v1 */
		if (tuple->src.u.gre.key || tuple->dst.u.gre.key)
			return;
		break;
	}
#endif
	default:
		return;
	}

	if (nf_ct_ext_exist(ct, NF_CT_EXT_HELPER) ||
	    ct->status & IPS_SEQ_ADJUST)
		return;

	tcf_ct_flow_table_add(ct_ft, ct, tcp);
}

static bool
tcf_ct_flow_table_fill_tuple_ipv4(struct sk_buff *skb,
				  struct flow_offload_tuple *tuple,
				  struct tcphdr **tcph)
{
	struct flow_ports *ports;
	unsigned int thoff;
	struct iphdr *iph;
	size_t hdrsize;
	u8 ipproto;

	if (!pskb_network_may_pull(skb, sizeof(*iph)))
		return false;

	iph = ip_hdr(skb);
	thoff = iph->ihl * 4;

	if (ip_is_fragment(iph) ||
	    unlikely(thoff != sizeof(struct iphdr)))
		return false;

	ipproto = iph->protocol;
	switch (ipproto) {
	case IPPROTO_TCP:
		hdrsize = sizeof(struct tcphdr);
		break;
	case IPPROTO_UDP:
		hdrsize = sizeof(*ports);
		break;
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		hdrsize = sizeof(struct gre_base_hdr);
		break;
#endif
	default:
		return false;
	}

	if (iph->ttl <= 1)
		return false;

	if (!pskb_network_may_pull(skb, thoff + hdrsize))
		return false;

	switch (ipproto) {
	case IPPROTO_TCP:
		*tcph = (void *)(skb_network_header(skb) + thoff);
		fallthrough;
	case IPPROTO_UDP:
		ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
		tuple->src_port = ports->source;
		tuple->dst_port = ports->dest;
		break;
	case IPPROTO_GRE: {
		struct gre_base_hdr *greh;

		greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
		if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
			return false;
		break;
	}
	}

	iph = ip_hdr(skb);

	tuple->src_v4.s_addr = iph->saddr;
	tuple->dst_v4.s_addr = iph->daddr;
	tuple->l3proto = AF_INET;
	tuple->l4proto = ipproto;

	return true;
}

static bool
tcf_ct_flow_table_fill_tuple_ipv6(struct sk_buff *skb,
				  struct flow_offload_tuple *tuple,
				  struct tcphdr **tcph)
{
	struct flow_ports *ports;
	struct ipv6hdr *ip6h;
	unsigned int thoff;
	size_t hdrsize;
	u8 nexthdr;

	if (!pskb_network_may_pull(skb, sizeof(*ip6h)))
		return false;

	ip6h = ipv6_hdr(skb);
	thoff = sizeof(*ip6h);

	nexthdr = ip6h->nexthdr;
	switch (nexthdr) {
	case IPPROTO_TCP:
		hdrsize = sizeof(struct tcphdr);
		break;
	case IPPROTO_UDP:
		hdrsize = sizeof(*ports);
		break;
#ifdef CONFIG_NF_CT_PROTO_GRE
	case IPPROTO_GRE:
		hdrsize = sizeof(struct gre_base_hdr);
		break;
#endif
	default:
		return false;
	}

	if (ip6h->hop_limit <= 1)
		return false;

	if (!pskb_network_may_pull(skb, thoff + hdrsize))
		return false;

	switch (nexthdr) {
	case IPPROTO_TCP:
		*tcph = (void *)(skb_network_header(skb) + thoff);
		fallthrough;
	case IPPROTO_UDP:
		ports = (struct flow_ports *)(skb_network_header(skb) + thoff);
		tuple->src_port = ports->source;
		tuple->dst_port = ports->dest;
		break;
	case IPPROTO_GRE: {
		struct gre_base_hdr *greh;

		greh = (struct gre_base_hdr *)(skb_network_header(skb) + thoff);
		if ((greh->flags & GRE_VERSION) != GRE_VERSION_0)
			return false;
		break;
	}
	}

	ip6h = ipv6_hdr(skb);

	tuple->src_v6 = ip6h->saddr;
	tuple->dst_v6 = ip6h->daddr;
	tuple->l3proto = AF_INET6;
	tuple->l4proto = nexthdr;

	return true;
}

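/* Look the packet up in the zone's flow table. On a hit the flow is
 * refreshed and the conntrack entry is attached to the skb, so the regular
 * nf_conntrack_in() lookup can be skipped.
 */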
static bool tcf_ct_flow_table_lookup(struct tcf_ct_params *p,
				     struct sk_buff *skb,
				     u8 family)
{
	struct nf_flowtable *nf_ft = &p->ct_ft->nf_ft;
	struct flow_offload_tuple_rhash *tuplehash;
	struct flow_offload_tuple tuple = {};
	enum ip_conntrack_info ctinfo;
	struct tcphdr *tcph = NULL;
	struct flow_offload *flow;
	struct nf_conn *ct;
	u8 dir;

	switch (family) {
	case NFPROTO_IPV4:
		if (!tcf_ct_flow_table_fill_tuple_ipv4(skb, &tuple, &tcph))
			return false;
		break;
	case NFPROTO_IPV6:
		if (!tcf_ct_flow_table_fill_tuple_ipv6(skb, &tuple, &tcph))
			return false;
		break;
	default:
		return false;
	}

	tuplehash = flow_offload_lookup(nf_ft, &tuple);
	if (!tuplehash)
		return false;

	dir = tuplehash->tuple.dir;
	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
	ct = flow->ct;

	if (tcph && (unlikely(tcph->fin || tcph->rst))) {
		flow_offload_teardown(flow);
		return false;
	}

	ctinfo = dir == FLOW_OFFLOAD_DIR_ORIGINAL ? IP_CT_ESTABLISHED :
						    IP_CT_ESTABLISHED_REPLY;

	flow_offload_refresh(nf_ft, flow);
	nf_conntrack_get(&ct->ct_general);
	nf_ct_set(skb, ct, ctinfo);
	if (nf_ft->flags & NF_FLOWTABLE_COUNTER)
		nf_ct_acct_update(ct, dir, skb->len);

	return true;
}

static int tcf_ct_flow_tables_init(void)
{
	return rhashtable_init(&zones_ht, &zones_params);
}

static void tcf_ct_flow_tables_uninit(void)
{
	rhashtable_destroy(&zones_ht);
}

static struct tc_action_ops act_ct_ops;

struct tc_ct_action_net {
	struct tc_action_net tn; /* Must be first */
	bool labels;
};

/* Determine whether skb->_nfct is equal to the result of conntrack lookup. */
static bool tcf_ct_skb_nfct_cached(struct net *net, struct sk_buff *skb,
				   u16 zone_id, bool force)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;

	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		return false;
	if (!net_eq(net, read_pnet(&ct->ct_net)))
		goto drop_ct;
	if (nf_ct_zone(ct)->id != zone_id)
		goto drop_ct;

	/* Force conntrack entry direction. */
	if (force && CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) {
		if (nf_ct_is_confirmed(ct))
			nf_ct_kill(ct);

		goto drop_ct;
	}

	return true;

drop_ct:
	nf_ct_put(ct);
	nf_ct_set(skb, NULL, IP_CT_UNTRACKED);

	return false;
}

/* Trim the skb to the length specified by the IP/IPv6 header,
 * removing any trailing lower-layer padding. This prepares the skb
 * for higher-layer processing that assumes skb->len excludes padding
 * (such as nf_ip_checksum). The caller needs to pull the skb to the
 * network header, and ensure ip_hdr/ipv6_hdr points to valid data.
 */
static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family)
{
	unsigned int len;

	switch (family) {
	case NFPROTO_IPV4:
		len = ntohs(ip_hdr(skb)->tot_len);
		break;
	case NFPROTO_IPV6:
		len = sizeof(struct ipv6hdr)
			+ ntohs(ipv6_hdr(skb)->payload_len);
		break;
	default:
		len = skb->len;
	}

	return pskb_trim_rcsum(skb, len);
}

static u8 tcf_ct_skb_nf_family(struct sk_buff *skb)
{
	u8 family = NFPROTO_UNSPEC;

	switch (skb_protocol(skb, true)) {
	case htons(ETH_P_IP):
		family = NFPROTO_IPV4;
		break;
	case htons(ETH_P_IPV6):
		family = NFPROTO_IPV6;
		break;
	default:
		break;
	}

	return family;
}

static int tcf_ct_ipv4_is_fragment(struct sk_buff *skb, bool *frag)
{
	unsigned int len;

	len = skb_network_offset(skb) + sizeof(struct iphdr);
	if (unlikely(skb->len < len))
		return -EINVAL;
	if (unlikely(!pskb_may_pull(skb, len)))
		return -ENOMEM;

	*frag = ip_is_fragment(ip_hdr(skb));
	return 0;
}

static int tcf_ct_ipv6_is_fragment(struct sk_buff *skb, bool *frag)
{
	unsigned int flags = 0, len, payload_ofs = 0;
	unsigned short frag_off;
	int nexthdr;

	len = skb_network_offset(skb) + sizeof(struct ipv6hdr);
	if (unlikely(skb->len < len))
		return -EINVAL;
	if (unlikely(!pskb_may_pull(skb, len)))
		return -ENOMEM;

	nexthdr = ipv6_find_hdr(skb, &payload_ofs, -1, &frag_off, &flags);
	if (unlikely(nexthdr < 0))
		return -EPROTO;

	*frag = flags & IP6_FH_F_FRAG;
	return 0;
}

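/* Reassemble IPv4/IPv6 fragments before conntrack sees the packet. Returns
 * 0 when the skb is ready for processing, -EINPROGRESS when it was queued as
 * part of an incomplete fragment chain, or a negative error otherwise.
 */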
static int tcf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
				   u8 family, u16 zone, bool *defrag)
{
	enum ip_conntrack_info ctinfo;
	struct nf_conn *ct;
	int err = 0;
	bool frag;
	u16 mru;

	/* Previously seen (loopback)? Ignore. */
	ct = nf_ct_get(skb, &ctinfo);
	if ((ct && !nf_ct_is_template(ct)) || ctinfo == IP_CT_UNTRACKED)
		return 0;

	if (family == NFPROTO_IPV4)
		err = tcf_ct_ipv4_is_fragment(skb, &frag);
	else
		err = tcf_ct_ipv6_is_fragment(skb, &frag);
	if (err || !frag)
		return err;

	skb_get(skb);
	mru = tc_skb_cb(skb)->mru;

	if (family == NFPROTO_IPV4) {
		enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;

		memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
		local_bh_disable();
		err = ip_defrag(net, skb, user);
		local_bh_enable();
		if (err && err != -EINPROGRESS)
			return err;

		if (!err) {
			*defrag = true;
			mru = IPCB(skb)->frag_max_size;
		}
	} else { /* NFPROTO_IPV6 */
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
		enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;

		memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
		err = nf_ct_frag6_gather(net, skb, user);
		if (err && err != -EINPROGRESS)
			goto out_free;

		if (!err) {
			*defrag = true;
			mru = IP6CB(skb)->frag_max_size;
		}
#else
		err = -EOPNOTSUPP;
		goto out_free;
#endif
	}

	if (err != -EINPROGRESS)
		tc_skb_cb(skb)->mru = mru;
	skb_clear_hash(skb);
	skb->ignore_df = 1;
	return err;

out_free:
	kfree_skb(skb);
	return err;
}

static void tcf_ct_params_free(struct rcu_head *head)
{
	struct tcf_ct_params *params = container_of(head,
						    struct tcf_ct_params, rcu);

	tcf_ct_flow_table_put(params);

	if (params->tmpl)
		nf_ct_put(params->tmpl);
	kfree(params);
}

#if IS_ENABLED(CONFIG_NF_NAT)
/* Modelled after nf_nat_ipv[46]_fn().
 * range is only used for new, uninitialized NAT state.
 * Returns either NF_ACCEPT or NF_DROP.
 */
static int ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  const struct nf_nat_range2 *range,
			  enum nf_nat_manip_type maniptype)
{
	__be16 proto = skb_protocol(skb, true);
	int hooknum, err = NF_ACCEPT;

	/* See HOOK2MANIP(). */
	if (maniptype == NF_NAT_MANIP_SRC)
		hooknum = NF_INET_LOCAL_IN; /* Source NAT */
	else
		hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */

	switch (ctinfo) {
	case IP_CT_RELATED:
	case IP_CT_RELATED_REPLY:
		if (proto == htons(ETH_P_IP) &&
		    ip_hdr(skb)->protocol == IPPROTO_ICMP) {
			if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
							   hooknum))
				err = NF_DROP;
			goto out;
		} else if (IS_ENABLED(CONFIG_IPV6) && proto == htons(ETH_P_IPV6)) {
			__be16 frag_off;
			u8 nexthdr = ipv6_hdr(skb)->nexthdr;
			int hdrlen = ipv6_skip_exthdr(skb,
						      sizeof(struct ipv6hdr),
						      &nexthdr, &frag_off);

			if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
				if (!nf_nat_icmpv6_reply_translation(skb, ct,
								     ctinfo,
								     hooknum,
								     hdrlen))
					err = NF_DROP;
				goto out;
			}
		}
		/* Non-ICMP, fall thru to initialize if needed. */
		fallthrough;
	case IP_CT_NEW:
		/* Seen it before? This can happen for loopback, retrans,
		 * or local packets.
		 */
		if (!nf_nat_initialized(ct, maniptype)) {
			/* Initialize according to the NAT action. */
			err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
				/* Action is set up to establish a new
				 * mapping.
				 */
				? nf_nat_setup_info(ct, range, maniptype)
				: nf_nat_alloc_null_binding(ct, hooknum);
			if (err != NF_ACCEPT)
				goto out;
		}
		break;

	case IP_CT_ESTABLISHED:
	case IP_CT_ESTABLISHED_REPLY:
		break;

	default:
		err = NF_DROP;
		goto out;
	}

	err = nf_nat_packet(ct, ctinfo, hooknum, skb);
	if (err == NF_ACCEPT) {
		if (maniptype == NF_NAT_MANIP_SRC)
			tc_skb_cb(skb)->post_ct_snat = 1;
		if (maniptype == NF_NAT_MANIP_DST)
			tc_skb_cb(skb)->post_ct_dnat = 1;
	}
out:
	return err;
}
#endif /* CONFIG_NF_NAT */

static void tcf_ct_act_set_mark(struct nf_conn *ct, u32 mark, u32 mask)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
	u32 new_mark;

	if (!mask)
		return;

	new_mark = mark | (READ_ONCE(ct->mark) & ~(mask));
	if (READ_ONCE(ct->mark) != new_mark) {
		WRITE_ONCE(ct->mark, new_mark);
		if (nf_ct_is_confirmed(ct))
			nf_conntrack_event_cache(IPCT_MARK, ct);
	}
#endif
}

static void tcf_ct_act_set_labels(struct nf_conn *ct,
				  u32 *labels,
				  u32 *labels_m)
{
#if IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)
	size_t labels_sz = sizeof_field(struct tcf_ct_params, labels);

	if (!memchr_inv(labels_m, 0, labels_sz))
		return;

	nf_connlabels_replace(ct, labels, labels_m, 4);
#endif
}

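/* Apply the configured NAT to the packet: pick the manip type from the
 * connection state (or from the requested snat/dnat action for new
 * connections) and run ct_nat_execute() for each required direction.
 */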
static int tcf_ct_act_nat(struct sk_buff *skb,
			  struct nf_conn *ct,
			  enum ip_conntrack_info ctinfo,
			  int ct_action,
			  struct nf_nat_range2 *range,
			  bool commit)
{
#if IS_ENABLED(CONFIG_NF_NAT)
	int err;
	enum nf_nat_manip_type maniptype;

	if (!(ct_action & TCA_CT_ACT_NAT))
		return NF_ACCEPT;

	/* Add NAT extension if not confirmed yet. */
	if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
		return NF_DROP;   /* Can't NAT. */

	if (ctinfo != IP_CT_NEW && (ct->status & IPS_NAT_MASK) &&
	    (ctinfo != IP_CT_RELATED || commit)) {
		/* NAT an established or related connection like before. */
		if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
			/* This is the REPLY direction for a connection
			 * for which NAT was applied in the forward
			 * direction. Do the reverse NAT.
			 */
			maniptype = ct->status & IPS_SRC_NAT
				? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
		else
			maniptype = ct->status & IPS_SRC_NAT
				? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
	} else if (ct_action & TCA_CT_ACT_NAT_SRC) {
		maniptype = NF_NAT_MANIP_SRC;
	} else if (ct_action & TCA_CT_ACT_NAT_DST) {
		maniptype = NF_NAT_MANIP_DST;
	} else {
		return NF_ACCEPT;
	}

	err = ct_nat_execute(skb, ct, ctinfo, range, maniptype);
	if (err == NF_ACCEPT && ct->status & IPS_DST_NAT) {
		if (ct->status & IPS_SRC_NAT) {
			if (maniptype == NF_NAT_MANIP_SRC)
				maniptype = NF_NAT_MANIP_DST;
			else
				maniptype = NF_NAT_MANIP_SRC;

			err = ct_nat_execute(skb, ct, ctinfo, range,
					     maniptype);
		} else if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) {
			err = ct_nat_execute(skb, ct, ctinfo, NULL,
					     NF_NAT_MANIP_SRC);
		}
	}
	return err;
#else
	return NF_ACCEPT;
#endif
}

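/* Main action handler: defragment if needed, look the packet up in the
 * flow table or run it through conntrack, apply NAT/mark/labels, optionally
 * commit the connection, and consider it for flow table offload.
 */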
static int tcf_ct_act(struct sk_buff *skb, const struct tc_action *a,
		      struct tcf_result *res)
{
	struct net *net = dev_net(skb->dev);
	bool cached, commit, clear, force;
	enum ip_conntrack_info ctinfo;
	struct tcf_ct *c = to_ct(a);
	struct nf_conn *tmpl = NULL;
	struct nf_hook_state state;
	int nh_ofs, err, retval;
	struct tcf_ct_params *p;
	bool skip_add = false;
	bool defrag = false;
	struct nf_conn *ct;
	u8 family;

	p = rcu_dereference_bh(c->params);
	retval = READ_ONCE(c->tcf_action);
	commit = p->ct_action & TCA_CT_ACT_COMMIT;
	clear = p->ct_action & TCA_CT_ACT_CLEAR;
	force = p->ct_action & TCA_CT_ACT_FORCE;
	tmpl = p->tmpl;

	tcf_lastuse_update(&c->tcf_tm);
	tcf_action_update_bstats(&c->common, skb);

	if (clear) {
		tc_skb_cb(skb)->post_ct = false;
		ct = nf_ct_get(skb, &ctinfo);
		if (ct) {
			nf_ct_put(ct);
			nf_ct_set(skb, NULL, IP_CT_UNTRACKED);
		}

		goto out_clear;
	}

	family = tcf_ct_skb_nf_family(skb);
	if (family == NFPROTO_UNSPEC)
		goto drop;

	/* The conntrack module expects to be working at L3.
	 * We also try to pull the IPv4/6 header to linear area
	 */
	nh_ofs = skb_network_offset(skb);
	skb_pull_rcsum(skb, nh_ofs);
	err = tcf_ct_handle_fragments(net, skb, family, p->zone, &defrag);
	if (err == -EINPROGRESS) {
		retval = TC_ACT_STOLEN;
		goto out_clear;
	}
	if (err)
		goto drop;

	err = tcf_ct_skb_network_trim(skb, family);
	if (err)
		goto drop;

	/* If we are recirculating packets to match on ct fields and
	 * committing with a separate ct action, then we don't need to
	 * actually run the packet through conntrack twice unless it's for a
	 * different zone.
	 */
	cached = tcf_ct_skb_nfct_cached(net, skb, p->zone, force);
	if (!cached) {
		if (tcf_ct_flow_table_lookup(p, skb, family)) {
			skip_add = true;
			goto do_nat;
		}

		/* Associate skb with specified zone. */
		if (tmpl) {
			nf_conntrack_put(skb_nfct(skb));
			nf_conntrack_get(&tmpl->ct_general);
			nf_ct_set(skb, tmpl, IP_CT_NEW);
		}

		state.hook = NF_INET_PRE_ROUTING;
		state.net = net;
		state.pf = family;
		err = nf_conntrack_in(skb, &state);
		if (err != NF_ACCEPT)
			goto out_push;
	}

do_nat:
	ct = nf_ct_get(skb, &ctinfo);
	if (!ct)
		goto out_push;
	nf_ct_deliver_cached_events(ct);
	nf_conn_act_ct_ext_fill(skb, ct, ctinfo);

	err = tcf_ct_act_nat(skb, ct, ctinfo, p->ct_action, &p->range, commit);
	if (err != NF_ACCEPT)
		goto drop;

	if (commit) {
		tcf_ct_act_set_mark(ct, p->mark, p->mark_mask);
		tcf_ct_act_set_labels(ct, p->labels, p->labels_mask);

		if (!nf_ct_is_confirmed(ct))
			nf_conn_act_ct_ext_add(ct);

		/* This will take care of sending queued events
		 * even if the connection is already confirmed.
		 */
		if (nf_conntrack_confirm(skb) != NF_ACCEPT)
			goto drop;
	}

	if (!skip_add)
		tcf_ct_flow_table_process_conn(p->ct_ft, ct, ctinfo);

out_push:
	skb_push_rcsum(skb, nh_ofs);

	tc_skb_cb(skb)->post_ct = true;
	tc_skb_cb(skb)->zone = p->zone;
out_clear:
	if (defrag)
		qdisc_skb_cb(skb)->pkt_len = skb->len;
	return retval;

drop:
	tcf_action_inc_drop_qstats(&c->common);
	return TC_ACT_SHOT;
}

static const struct nla_policy ct_policy[TCA_CT_MAX + 1] = {
	[TCA_CT_ACTION] = { .type = NLA_U16 },
	[TCA_CT_PARMS] = NLA_POLICY_EXACT_LEN(sizeof(struct tc_ct)),
	[TCA_CT_ZONE] = { .type = NLA_U16 },
	[TCA_CT_MARK] = { .type = NLA_U32 },
	[TCA_CT_MARK_MASK] = { .type = NLA_U32 },
	[TCA_CT_LABELS] = { .type = NLA_BINARY,
			    .len = 128 / BITS_PER_BYTE },
	[TCA_CT_LABELS_MASK] = { .type = NLA_BINARY,
				 .len = 128 / BITS_PER_BYTE },
	[TCA_CT_NAT_IPV4_MIN] = { .type = NLA_U32 },
	[TCA_CT_NAT_IPV4_MAX] = { .type = NLA_U32 },
	[TCA_CT_NAT_IPV6_MIN] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
	[TCA_CT_NAT_IPV6_MAX] = NLA_POLICY_EXACT_LEN(sizeof(struct in6_addr)),
	[TCA_CT_NAT_PORT_MIN] = { .type = NLA_U16 },
	[TCA_CT_NAT_PORT_MAX] = { .type = NLA_U16 },
};

static int tcf_ct_fill_params_nat(struct tcf_ct_params *p,
				  struct tc_ct *parm,
				  struct nlattr **tb,
				  struct netlink_ext_ack *extack)
{
	struct nf_nat_range2 *range;

	if (!(p->ct_action & TCA_CT_ACT_NAT))
		return 0;

	if (!IS_ENABLED(CONFIG_NF_NAT)) {
		NL_SET_ERR_MSG_MOD(extack, "Netfilter nat isn't enabled in kernel");
		return -EOPNOTSUPP;
	}

	if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
		return 0;

	if ((p->ct_action & TCA_CT_ACT_NAT_SRC) &&
	    (p->ct_action & TCA_CT_ACT_NAT_DST)) {
		NL_SET_ERR_MSG_MOD(extack, "dnat and snat can't be enabled at the same time");
		return -EOPNOTSUPP;
	}

	range = &p->range;
	if (tb[TCA_CT_NAT_IPV4_MIN]) {
		struct nlattr *max_attr = tb[TCA_CT_NAT_IPV4_MAX];

		p->ipv4_range = true;
		range->flags |= NF_NAT_RANGE_MAP_IPS;
		range->min_addr.ip =
			nla_get_in_addr(tb[TCA_CT_NAT_IPV4_MIN]);

		range->max_addr.ip = max_attr ?
				     nla_get_in_addr(max_attr) :
				     range->min_addr.ip;
	} else if (tb[TCA_CT_NAT_IPV6_MIN]) {
		struct nlattr *max_attr = tb[TCA_CT_NAT_IPV6_MAX];

		p->ipv4_range = false;
		range->flags |= NF_NAT_RANGE_MAP_IPS;
		range->min_addr.in6 =
			nla_get_in6_addr(tb[TCA_CT_NAT_IPV6_MIN]);

		range->max_addr.in6 = max_attr ?
				      nla_get_in6_addr(max_attr) :
				      range->min_addr.in6;
	}

	if (tb[TCA_CT_NAT_PORT_MIN]) {
		range->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
		range->min_proto.all = nla_get_be16(tb[TCA_CT_NAT_PORT_MIN]);

		range->max_proto.all = tb[TCA_CT_NAT_PORT_MAX] ?
				       nla_get_be16(tb[TCA_CT_NAT_PORT_MAX]) :
				       range->min_proto.all;
	}

	return 0;
}

static void tcf_ct_set_key_val(struct nlattr **tb,
			       void *val, int val_type,
			       void *mask, int mask_type,
			       int len)
{
	if (!tb[val_type])
		return;
	nla_memcpy(val, tb[val_type], len);

	if (!mask)
		return;

	if (mask_type == TCA_CT_UNSPEC || !tb[mask_type])
		memset(mask, 0xff, len);
	else
		nla_memcpy(mask, tb[mask_type], len);
}

static int tcf_ct_fill_params(struct net *net,
			      struct tcf_ct_params *p,
			      struct tc_ct *parm,
			      struct nlattr **tb,
			      struct netlink_ext_ack *extack)
{
	struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id);
	struct nf_conntrack_zone zone;
	struct nf_conn *tmpl;
	int err;

	p->zone = NF_CT_DEFAULT_ZONE_ID;

	tcf_ct_set_key_val(tb,
			   &p->ct_action, TCA_CT_ACTION,
			   NULL, TCA_CT_UNSPEC,
			   sizeof(p->ct_action));

	if (p->ct_action & TCA_CT_ACT_CLEAR)
		return 0;

	err = tcf_ct_fill_params_nat(p, parm, tb, extack);
	if (err)
		return err;

	if (tb[TCA_CT_MARK]) {
		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)) {
			NL_SET_ERR_MSG_MOD(extack, "Conntrack mark isn't enabled.");
			return -EOPNOTSUPP;
		}
		tcf_ct_set_key_val(tb,
				   &p->mark, TCA_CT_MARK,
				   &p->mark_mask, TCA_CT_MARK_MASK,
				   sizeof(p->mark));
	}

	if (tb[TCA_CT_LABELS]) {
		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS)) {
			NL_SET_ERR_MSG_MOD(extack, "Conntrack labels isn't enabled.");
			return -EOPNOTSUPP;
		}

		if (!tn->labels) {
			NL_SET_ERR_MSG_MOD(extack, "Failed to set connlabel length");
			return -EOPNOTSUPP;
		}
		tcf_ct_set_key_val(tb,
				   p->labels, TCA_CT_LABELS,
				   p->labels_mask, TCA_CT_LABELS_MASK,
				   sizeof(p->labels));
	}

	if (tb[TCA_CT_ZONE]) {
		if (!IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)) {
			NL_SET_ERR_MSG_MOD(extack, "Conntrack zones isn't enabled.");
			return -EOPNOTSUPP;
		}

		tcf_ct_set_key_val(tb,
				   &p->zone, TCA_CT_ZONE,
				   NULL, TCA_CT_UNSPEC,
				   sizeof(p->zone));
	}

	nf_ct_zone_init(&zone, p->zone, NF_CT_DEFAULT_ZONE_DIR, 0);
	tmpl = nf_ct_tmpl_alloc(net, &zone, GFP_KERNEL);
	if (!tmpl) {
		NL_SET_ERR_MSG_MOD(extack, "Failed to allocate conntrack template");
		return -ENOMEM;
	}
	__set_bit(IPS_CONFIRMED_BIT, &tmpl->status);
	p->tmpl = tmpl;

	return 0;
}

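/* Netlink init/replace path: parse the TCA_CT_* attributes, build a new
 * tcf_ct_params (including the per-zone flow table reference) and swap it
 * into the action under tcf_lock, freeing the old params via RCU.
 */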
static int tcf_ct_init(struct net *net, struct nlattr *nla,
		       struct nlattr *est, struct tc_action **a,
		       struct tcf_proto *tp, u32 flags,
		       struct netlink_ext_ack *extack)
{
	struct tc_action_net *tn = net_generic(net, act_ct_ops.net_id);
	bool bind = flags & TCA_ACT_FLAGS_BIND;
	struct tcf_ct_params *params = NULL;
	struct nlattr *tb[TCA_CT_MAX + 1];
	struct tcf_chain *goto_ch = NULL;
	struct tc_ct *parm;
	struct tcf_ct *c;
	int err, res = 0;
	u32 index;

	if (!nla) {
		NL_SET_ERR_MSG_MOD(extack, "Ct requires attributes to be passed");
		return -EINVAL;
	}

	err = nla_parse_nested(tb, TCA_CT_MAX, nla, ct_policy, extack);
	if (err < 0)
		return err;

	if (!tb[TCA_CT_PARMS]) {
		NL_SET_ERR_MSG_MOD(extack, "Missing required ct parameters");
		return -EINVAL;
	}
	parm = nla_data(tb[TCA_CT_PARMS]);
	index = parm->index;
	err = tcf_idr_check_alloc(tn, &index, a, bind);
	if (err < 0)
		return err;

	if (!err) {
		err = tcf_idr_create_from_flags(tn, index, est, a,
						&act_ct_ops, bind, flags);
		if (err) {
			tcf_idr_cleanup(tn, index);
			return err;
		}
		res = ACT_P_CREATED;
	} else {
		if (bind)
			return 0;

		if (!(flags & TCA_ACT_FLAGS_REPLACE)) {
			tcf_idr_release(*a, bind);
			return -EEXIST;
		}
	}
	err = tcf_action_check_ctrlact(parm->action, tp, &goto_ch, extack);
	if (err < 0)
		goto cleanup;

	c = to_ct(*a);

	params = kzalloc(sizeof(*params), GFP_KERNEL);
	if (unlikely(!params)) {
		err = -ENOMEM;
		goto cleanup;
	}

	err = tcf_ct_fill_params(net, params, parm, tb, extack);
	if (err)
		goto cleanup;

	err = tcf_ct_flow_table_get(net, params);
	if (err)
		goto cleanup_params;

	spin_lock_bh(&c->tcf_lock);
	goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch);
	params = rcu_replace_pointer(c->params, params,
				     lockdep_is_held(&c->tcf_lock));
	spin_unlock_bh(&c->tcf_lock);

	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);
	if (params)
		call_rcu(&params->rcu, tcf_ct_params_free);

	return res;

cleanup_params:
	if (params->tmpl)
		nf_ct_put(params->tmpl);
cleanup:
	if (goto_ch)
		tcf_chain_put_by_act(goto_ch);
	kfree(params);
	tcf_idr_release(*a, bind);
	return err;
}

static void tcf_ct_cleanup(struct tc_action *a)
{
	struct tcf_ct_params *params;
	struct tcf_ct *c = to_ct(a);

	params = rcu_dereference_protected(c->params, 1);
	if (params)
		call_rcu(&params->rcu, tcf_ct_params_free);
}

static int tcf_ct_dump_key_val(struct sk_buff *skb,
			       void *val, int val_type,
			       void *mask, int mask_type,
			       int len)
{
	int err;

	if (mask && !memchr_inv(mask, 0, len))
		return 0;

	err = nla_put(skb, val_type, len, val);
	if (err)
		return err;

	if (mask_type != TCA_CT_UNSPEC) {
		err = nla_put(skb, mask_type, len, mask);
		if (err)
			return err;
	}

	return 0;
}

static int tcf_ct_dump_nat(struct sk_buff *skb, struct tcf_ct_params *p)
{
	struct nf_nat_range2 *range = &p->range;

	if (!(p->ct_action & TCA_CT_ACT_NAT))
		return 0;

	if (!(p->ct_action & (TCA_CT_ACT_NAT_SRC | TCA_CT_ACT_NAT_DST)))
		return 0;

	if (range->flags & NF_NAT_RANGE_MAP_IPS) {
		if (p->ipv4_range) {
			if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MIN,
					    range->min_addr.ip))
				return -1;
			if (nla_put_in_addr(skb, TCA_CT_NAT_IPV4_MAX,
					    range->max_addr.ip))
				return -1;
		} else {
			if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MIN,
					     &range->min_addr.in6))
				return -1;
			if (nla_put_in6_addr(skb, TCA_CT_NAT_IPV6_MAX,
					     &range->max_addr.in6))
				return -1;
		}
	}

	if (range->flags & NF_NAT_RANGE_PROTO_SPECIFIED) {
		if (nla_put_be16(skb, TCA_CT_NAT_PORT_MIN,
				 range->min_proto.all))
			return -1;
		if (nla_put_be16(skb, TCA_CT_NAT_PORT_MAX,
				 range->max_proto.all))
			return -1;
	}

	return 0;
}

static inline int tcf_ct_dump(struct sk_buff *skb, struct tc_action *a,
			      int bind, int ref)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tcf_ct *c = to_ct(a);
	struct tcf_ct_params *p;

	struct tc_ct opt = {
		.index   = c->tcf_index,
		.refcnt  = refcount_read(&c->tcf_refcnt) - ref,
		.bindcnt = atomic_read(&c->tcf_bindcnt) - bind,
	};
	struct tcf_t t;

	spin_lock_bh(&c->tcf_lock);
	p = rcu_dereference_protected(c->params,
				      lockdep_is_held(&c->tcf_lock));
	opt.action = c->tcf_action;

	if (tcf_ct_dump_key_val(skb,
				&p->ct_action, TCA_CT_ACTION,
				NULL, TCA_CT_UNSPEC,
				sizeof(p->ct_action)))
		goto nla_put_failure;

	if (p->ct_action & TCA_CT_ACT_CLEAR)
		goto skip_dump;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_MARK) &&
	    tcf_ct_dump_key_val(skb,
				&p->mark, TCA_CT_MARK,
				&p->mark_mask, TCA_CT_MARK_MASK,
				sizeof(p->mark)))
		goto nla_put_failure;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
	    tcf_ct_dump_key_val(skb,
				p->labels, TCA_CT_LABELS,
				p->labels_mask, TCA_CT_LABELS_MASK,
				sizeof(p->labels)))
		goto nla_put_failure;

	if (IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES) &&
	    tcf_ct_dump_key_val(skb,
				&p->zone, TCA_CT_ZONE,
				NULL, TCA_CT_UNSPEC,
				sizeof(p->zone)))
		goto nla_put_failure;

	if (tcf_ct_dump_nat(skb, p))
		goto nla_put_failure;

skip_dump:
	if (nla_put(skb, TCA_CT_PARMS, sizeof(opt), &opt))
		goto nla_put_failure;

	tcf_tm_dump(&t, &c->tcf_tm);
	if (nla_put_64bit(skb, TCA_CT_TM, sizeof(t), &t, TCA_CT_PAD))
		goto nla_put_failure;
	spin_unlock_bh(&c->tcf_lock);

	return skb->len;
nla_put_failure:
	spin_unlock_bh(&c->tcf_lock);
	nlmsg_trim(skb, b);
	return -1;
}

static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets,
			     u64 drops, u64 lastuse, bool hw)
{
	struct tcf_ct *c = to_ct(a);

	tcf_action_update_stats(a, bytes, packets, drops, hw);
	c->tcf_tm.lastuse = max_t(u64, c->tcf_tm.lastuse, lastuse);
}

static int tcf_ct_offload_act_setup(struct tc_action *act, void *entry_data,
				    u32 *index_inc, bool bind,
				    struct netlink_ext_ack *extack)
{
	if (bind) {
		struct flow_action_entry *entry = entry_data;

		entry->id = FLOW_ACTION_CT;
		entry->ct.action = tcf_ct_action(act);
		entry->ct.zone = tcf_ct_zone(act);
		entry->ct.flow_table = tcf_ct_ft(act);
		*index_inc = 1;
	} else {
		struct flow_offload_action *fl_action = entry_data;

		fl_action->id = FLOW_ACTION_CT;
	}

	return 0;
}

static struct tc_action_ops act_ct_ops = {
	.kind			= "ct",
	.id			= TCA_ID_CT,
	.owner			= THIS_MODULE,
	.act			= tcf_ct_act,
	.dump			= tcf_ct_dump,
	.init			= tcf_ct_init,
	.cleanup		= tcf_ct_cleanup,
	.stats_update		= tcf_stats_update,
	.offload_act_setup	= tcf_ct_offload_act_setup,
	.size			= sizeof(struct tcf_ct),
};

static __net_init int ct_init_net(struct net *net)
{
	unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8;
	struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id);

	if (nf_connlabels_get(net, n_bits - 1)) {
		tn->labels = false;
		pr_err("act_ct: Failed to set connlabels length");
	} else {
		tn->labels = true;
	}

	return tc_action_net_init(net, &tn->tn, &act_ct_ops);
}

static void __net_exit ct_exit_net(struct list_head *net_list)
{
	struct net *net;

	rtnl_lock();
	list_for_each_entry(net, net_list, exit_list) {
		struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id);

		if (tn->labels)
			nf_connlabels_put(net);
	}
	rtnl_unlock();

	tc_action_net_exit(net_list, act_ct_ops.net_id);
}

static struct pernet_operations ct_net_ops = {
	.init = ct_init_net,
	.exit_batch = ct_exit_net,
	.id   = &act_ct_ops.net_id,
	.size = sizeof(struct tc_ct_action_net),
};

static int __init ct_init_module(void)
{
	int err;

	act_ct_wq = alloc_ordered_workqueue("act_ct_workqueue", 0);
	if (!act_ct_wq)
		return -ENOMEM;

	err = tcf_ct_flow_tables_init();
	if (err)
		goto err_tbl_init;

	err = tcf_register_action(&act_ct_ops, &ct_net_ops);
	if (err)
		goto err_register;

	static_branch_inc(&tcf_frag_xmit_count);

	return 0;

err_register:
	tcf_ct_flow_tables_uninit();
err_tbl_init:
	destroy_workqueue(act_ct_wq);
	return err;
}

static void __exit ct_cleanup_module(void)
{
	static_branch_dec(&tcf_frag_xmit_count);
	tcf_unregister_action(&act_ct_ops, &ct_net_ops);
	tcf_ct_flow_tables_uninit();
	destroy_workqueue(act_ct_wq);
}

module_init(ct_init_module);
module_exit(ct_cleanup_module);
MODULE_AUTHOR("Paul Blakey <[email protected]>");
MODULE_AUTHOR("Yossi Kuperman <[email protected]>");
MODULE_AUTHOR("Marcelo Ricardo Leitner <[email protected]>");
MODULE_DESCRIPTION("Connection tracking action");
MODULE_LICENSE("GPL v2");