devmap.c

  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
  3. */
  4. /* Devmap's primary use is as a backend map for the XDP BPF helper call
  5. * bpf_redirect_map(). Because XDP is mostly concerned with performance, we
  6. * spent some effort to ensure the datapath with redirect maps does not use
  7. * any locking. This is a quick note on the details.
  8. *
  9. * We have three possible paths to get into the devmap control plane: bpf
  10. * syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
  11. * will invoke an update, delete, or lookup operation. To ensure updates and
  12. * deletes appear atomic from the datapath side, xchg() is used to modify the
  13. * netdev_map array. Then because the datapath does a lookup into the netdev_map
  14. * array (read-only) from an RCU critical section we use call_rcu() to wait for
  15. * an rcu grace period before freeing the old data structures. This ensures the
  16. * datapath always has a valid copy. However, the datapath does a "flush"
  17. * operation that pushes any pending packets in the driver outside the RCU
  18. * critical section. Each bpf_dtab_netdev tracks these pending operations using
  19. * a per-cpu flush list. The bpf_dtab_netdev object will not be destroyed until
  20. * this list is empty, indicating outstanding flush operations have completed.
  21. *
  22. * BPF syscalls may race with BPF program calls on any of the update, delete
  23. * or lookup operations. As noted above, the xchg() operation also keeps the
  24. * netdev_map consistent in this case. From the devmap side BPF programs
  25. * calling into these operations are the same as multiple user space threads
  26. * making system calls.
  27. *
  28. * Finally, any of the above may race with a netdev_unregister notifier. The
  29. * unregister notifier must search for net devices in the map structure that
  30. * contain a reference to the net device and remove them. This is a two-step
  31. * process: (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
  32. * check to see if the ifindex is the same as the net_device being removed.
  33. * When removing the dev, a cmpxchg() is used to ensure the correct dev is
  34. * removed; in the case of a concurrent update or delete operation it is
  35. * possible that the initially referenced dev is no longer in the map. As the
  36. * notifier hook walks the map, we know that new dev references cannot be
  37. * added by the user because core infrastructure ensures dev_get_by_index()
  38. * calls will fail at this point.
  39. *
  40. * The devmap_hash type is a map type which interprets keys as ifindexes and
  41. * indexes these using a hashmap. This allows maps that use ifindex as key to be
  42. * densely packed instead of having holes in the lookup array for unused
  43. * ifindexes. The setup and packet enqueue/send code is shared between the two
  44. * types of devmap; only the lookup and insertion is different.
  45. */
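/*
 * Illustrative sketch (not part of devmap.c): a minimal XDP program using a
 * DEVMAP as the backend for bpf_redirect_map(), as described above. The map
 * and function names are invented, and the program would live in a separate
 * BPF object built with libbpf, not in this file.
 *
 *    #include <linux/bpf.h>
 *    #include <bpf/bpf_helpers.h>
 *
 *    struct {
 *        __uint(type, BPF_MAP_TYPE_DEVMAP);
 *        __uint(key_size, sizeof(__u32));
 *        __uint(value_size, sizeof(__u32));   // 4-byte value: ifindex only
 *        __uint(max_entries, 64);
 *    } tx_ports SEC(".maps");
 *
 *    SEC("xdp")
 *    int xdp_redirect_example(struct xdp_md *ctx)
 *    {
 *        __u32 key = 0;   // slot chosen by whatever policy the program wants
 *
 *        // Returns XDP_REDIRECT on success; queued frames are pushed out of
 *        // the per-CPU bulk queues when the driver calls xdp_do_flush().
 *        return bpf_redirect_map(&tx_ports, key, 0);
 *    }
 *    char _license[] SEC("license") = "GPL";
 */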
  46. #include <linux/bpf.h>
  47. #include <net/xdp.h>
  48. #include <linux/filter.h>
  49. #include <trace/events/xdp.h>
  50. #include <linux/btf_ids.h>
  51. #define DEV_CREATE_FLAG_MASK \
  52. (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
  53. struct xdp_dev_bulk_queue {
  54. struct xdp_frame *q[DEV_MAP_BULK_SIZE];
  55. struct list_head flush_node;
  56. struct net_device *dev;
  57. struct net_device *dev_rx;
  58. struct bpf_prog *xdp_prog;
  59. unsigned int count;
  60. };
  61. struct bpf_dtab_netdev {
  62. struct net_device *dev; /* must be first member, due to tracepoint */
  63. struct hlist_node index_hlist;
  64. struct bpf_dtab *dtab;
  65. struct bpf_prog *xdp_prog;
  66. struct rcu_head rcu;
  67. unsigned int idx;
  68. struct bpf_devmap_val val;
  69. };
  70. struct bpf_dtab {
  71. struct bpf_map map;
  72. struct bpf_dtab_netdev __rcu **netdev_map; /* DEVMAP type only */
  73. struct list_head list;
  74. /* these are only used for DEVMAP_HASH type maps */
  75. struct hlist_head *dev_index_head;
  76. spinlock_t index_lock;
  77. unsigned int items;
  78. u32 n_buckets;
  79. };
  80. static DEFINE_PER_CPU(struct list_head, dev_flush_list);
  81. static DEFINE_SPINLOCK(dev_map_lock);
  82. static LIST_HEAD(dev_map_list);
  83. static struct hlist_head *dev_map_create_hash(unsigned int entries,
  84. int numa_node)
  85. {
  86. int i;
  87. struct hlist_head *hash;
  88. hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node);
  89. if (hash != NULL)
  90. for (i = 0; i < entries; i++)
  91. INIT_HLIST_HEAD(&hash[i]);
  92. return hash;
  93. }
  94. static inline struct hlist_head *dev_map_index_hash(struct bpf_dtab *dtab,
  95. int idx)
  96. {
  97. return &dtab->dev_index_head[idx & (dtab->n_buckets - 1)];
  98. }
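/*
 * Standalone sketch of the bucket-index trick used above: because n_buckets
 * is rounded up to a power of two in dev_map_init_map() just below,
 * "idx & (n_buckets - 1)" is equivalent to "idx % n_buckets" without the
 * division. Illustrative user-space program, not part of this file.
 *
 *    #include <assert.h>
 *    #include <stdint.h>
 *
 *    int main(void)
 *    {
 *        uint32_t n_buckets = 16;   // must be a power of two
 *
 *        for (uint32_t idx = 0; idx < 1000; idx++)
 *            assert((idx & (n_buckets - 1)) == (idx % n_buckets));
 *        return 0;
 *    }
 */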
  99. static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
  100. {
  101. u32 valsize = attr->value_size;
  102. /* check sanity of attributes. 2 value sizes supported:
  103. * 4 bytes: ifindex
  104. * 8 bytes: ifindex + prog fd
  105. */
  106. if (attr->max_entries == 0 || attr->key_size != 4 ||
  107. (valsize != offsetofend(struct bpf_devmap_val, ifindex) &&
  108. valsize != offsetofend(struct bpf_devmap_val, bpf_prog.fd)) ||
  109. attr->map_flags & ~DEV_CREATE_FLAG_MASK)
  110. return -EINVAL;
  111. /* Lookup returns a pointer straight to dev->ifindex, so make sure the
  112. * verifier prevents writes from the BPF side
  113. */
  114. attr->map_flags |= BPF_F_RDONLY_PROG;
  115. bpf_map_init_from_attr(&dtab->map, attr);
  116. if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
  117. dtab->n_buckets = roundup_pow_of_two(dtab->map.max_entries);
  118. if (!dtab->n_buckets) /* Overflow check */
  119. return -EINVAL;
  120. }
  121. if (attr->map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
  122. dtab->dev_index_head = dev_map_create_hash(dtab->n_buckets,
  123. dtab->map.numa_node);
  124. if (!dtab->dev_index_head)
  125. return -ENOMEM;
  126. spin_lock_init(&dtab->index_lock);
  127. } else {
  128. dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries *
  129. sizeof(struct bpf_dtab_netdev *),
  130. dtab->map.numa_node);
  131. if (!dtab->netdev_map)
  132. return -ENOMEM;
  133. }
  134. return 0;
  135. }
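/*
 * Illustrative user-space sketch (separate program, assuming a libbpf that
 * provides bpf_map_create()): the attribute checks above accept a 4-byte key
 * and either a 4-byte value (ifindex only) or an 8-byte value
 * (struct bpf_devmap_val: ifindex plus bpf_prog.fd). Map names are invented.
 *
 *    #include <bpf/bpf.h>
 *    #include <linux/bpf.h>
 *
 *    int create_devmaps(void)
 *    {
 *        int fd_plain, fd_with_prog;
 *
 *        // 4-byte value: a redirect target is just an ifindex
 *        fd_plain = bpf_map_create(BPF_MAP_TYPE_DEVMAP, "tx_ports",
 *                                  sizeof(__u32), sizeof(__u32), 64, NULL);
 *
 *        // 8-byte value: ifindex plus an XDP program fd run before xmit
 *        fd_with_prog = bpf_map_create(BPF_MAP_TYPE_DEVMAP, "tx_ports_prog",
 *                                      sizeof(__u32),
 *                                      sizeof(struct bpf_devmap_val),
 *                                      64, NULL);
 *
 *        return (fd_plain < 0 || fd_with_prog < 0) ? -1 : 0;
 *    }
 */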
  136. static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
  137. {
  138. struct bpf_dtab *dtab;
  139. int err;
  140. if (!capable(CAP_NET_ADMIN))
  141. return ERR_PTR(-EPERM);
  142. dtab = bpf_map_area_alloc(sizeof(*dtab), NUMA_NO_NODE);
  143. if (!dtab)
  144. return ERR_PTR(-ENOMEM);
  145. err = dev_map_init_map(dtab, attr);
  146. if (err) {
  147. bpf_map_area_free(dtab);
  148. return ERR_PTR(err);
  149. }
  150. spin_lock(&dev_map_lock);
  151. list_add_tail_rcu(&dtab->list, &dev_map_list);
  152. spin_unlock(&dev_map_lock);
  153. return &dtab->map;
  154. }
  155. static void dev_map_free(struct bpf_map *map)
  156. {
  157. struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
  158. int i;
  159. /* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
  160. * so the programs (can be more than one that used this map) were
  161. * disconnected from events. The following synchronize_rcu() guarantees
  162. * both rcu read critical sections complete and waits for
  163. * preempt-disable regions (NAPI being the relevant context here) so we
  164. * are certain there will be no further reads against the netdev_map and
  165. * all flush operations are complete. Flush operations can only be done
  166. * from NAPI context for this reason.
  167. */
  168. spin_lock(&dev_map_lock);
  169. list_del_rcu(&dtab->list);
  170. spin_unlock(&dev_map_lock);
  171. bpf_clear_redirect_map(map);
  172. synchronize_rcu();
  173. /* Make sure prior __dev_map_entry_free() have completed. */
  174. rcu_barrier();
  175. if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
  176. for (i = 0; i < dtab->n_buckets; i++) {
  177. struct bpf_dtab_netdev *dev;
  178. struct hlist_head *head;
  179. struct hlist_node *next;
  180. head = dev_map_index_hash(dtab, i);
  181. hlist_for_each_entry_safe(dev, next, head, index_hlist) {
  182. hlist_del_rcu(&dev->index_hlist);
  183. if (dev->xdp_prog)
  184. bpf_prog_put(dev->xdp_prog);
  185. dev_put(dev->dev);
  186. kfree(dev);
  187. }
  188. }
  189. bpf_map_area_free(dtab->dev_index_head);
  190. } else {
  191. for (i = 0; i < dtab->map.max_entries; i++) {
  192. struct bpf_dtab_netdev *dev;
  193. dev = rcu_dereference_raw(dtab->netdev_map[i]);
  194. if (!dev)
  195. continue;
  196. if (dev->xdp_prog)
  197. bpf_prog_put(dev->xdp_prog);
  198. dev_put(dev->dev);
  199. kfree(dev);
  200. }
  201. bpf_map_area_free(dtab->netdev_map);
  202. }
  203. bpf_map_area_free(dtab);
  204. }
  205. static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
  206. {
  207. struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
  208. u32 index = key ? *(u32 *)key : U32_MAX;
  209. u32 *next = next_key;
  210. if (index >= dtab->map.max_entries) {
  211. *next = 0;
  212. return 0;
  213. }
  214. if (index == dtab->map.max_entries - 1)
  215. return -ENOENT;
  216. *next = index + 1;
  217. return 0;
  218. }
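/*
 * The get_next_key semantics above (any out-of-range or NULL key restarts at
 * index 0, the last index returns -ENOENT) support the usual user-space
 * iteration pattern. A hedged sketch, assuming "map_fd" refers to a DEVMAP
 * created with 4-byte values; not part of this file.
 *
 *    #include <bpf/bpf.h>
 *    #include <stdio.h>
 *
 *    void dump_devmap(int map_fd)
 *    {
 *        __u32 key, next_key, ifindex;
 *        int err;
 *
 *        // A NULL current key yields the first index.
 *        for (err = bpf_map_get_next_key(map_fd, NULL, &next_key);
 *             !err;
 *             err = bpf_map_get_next_key(map_fd, &key, &next_key)) {
 *            key = next_key;
 *            if (!bpf_map_lookup_elem(map_fd, &key, &ifindex))
 *                printf("slot %u -> ifindex %u\n", key, ifindex);
 *        }
 *    }
 */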
  219. /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
  220. * by local_bh_disable() (from XDP calls inside NAPI). The
  221. * rcu_read_lock_bh_held() below makes lockdep accept both.
  222. */
  223. static void *__dev_map_hash_lookup_elem(struct bpf_map *map, u32 key)
  224. {
  225. struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
  226. struct hlist_head *head = dev_map_index_hash(dtab, key);
  227. struct bpf_dtab_netdev *dev;
  228. hlist_for_each_entry_rcu(dev, head, index_hlist,
  229. lockdep_is_held(&dtab->index_lock))
  230. if (dev->idx == key)
  231. return dev;
  232. return NULL;
  233. }
  234. static int dev_map_hash_get_next_key(struct bpf_map *map, void *key,
  235. void *next_key)
  236. {
  237. struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
  238. u32 idx, *next = next_key;
  239. struct bpf_dtab_netdev *dev, *next_dev;
  240. struct hlist_head *head;
  241. int i = 0;
  242. if (!key)
  243. goto find_first;
  244. idx = *(u32 *)key;
  245. dev = __dev_map_hash_lookup_elem(map, idx);
  246. if (!dev)
  247. goto find_first;
  248. next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&dev->index_hlist)),
  249. struct bpf_dtab_netdev, index_hlist);
  250. if (next_dev) {
  251. *next = next_dev->idx;
  252. return 0;
  253. }
  254. i = idx & (dtab->n_buckets - 1);
  255. i++;
  256. find_first:
  257. for (; i < dtab->n_buckets; i++) {
  258. head = dev_map_index_hash(dtab, i);
  259. next_dev = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
  260. struct bpf_dtab_netdev,
  261. index_hlist);
  262. if (next_dev) {
  263. *next = next_dev->idx;
  264. return 0;
  265. }
  266. }
  267. return -ENOENT;
  268. }
  269. static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
  270. struct xdp_frame **frames, int n,
  271. struct net_device *dev)
  272. {
  273. struct xdp_txq_info txq = { .dev = dev };
  274. struct xdp_buff xdp;
  275. int i, nframes = 0;
  276. for (i = 0; i < n; i++) {
  277. struct xdp_frame *xdpf = frames[i];
  278. u32 act;
  279. int err;
  280. xdp_convert_frame_to_buff(xdpf, &xdp);
  281. xdp.txq = &txq;
  282. act = bpf_prog_run_xdp(xdp_prog, &xdp);
  283. switch (act) {
  284. case XDP_PASS:
  285. err = xdp_update_frame_from_buff(&xdp, xdpf);
  286. if (unlikely(err < 0))
  287. xdp_return_frame_rx_napi(xdpf);
  288. else
  289. frames[nframes++] = xdpf;
  290. break;
  291. default:
  292. bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
  293. fallthrough;
  294. case XDP_ABORTED:
  295. trace_xdp_exception(dev, xdp_prog, act);
  296. fallthrough;
  297. case XDP_DROP:
  298. xdp_return_frame_rx_napi(xdpf);
  299. break;
  300. }
  301. }
  302. return nframes; /* sent frames count */
  303. }
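/*
 * The program run above is a second XDP program attached to a map entry
 * (expected_attach_type == BPF_XDP_DEVMAP); only XDP_PASS keeps a frame,
 * anything else drops it. A hedged BPF-side sketch of such a program, kept
 * in a separate BPF object; the "xdp/devmap" section name follows recent
 * libbpf conventions and may differ on older versions.
 *
 *    #include <linux/bpf.h>
 *    #include <bpf/bpf_helpers.h>
 *
 *    SEC("xdp/devmap")
 *    int xdp_devmap_filter(struct xdp_md *ctx)
 *    {
 *        // ctx->egress_ifindex is usable here because the txq info is set
 *        // up in dev_map_bpf_prog_run() before the program runs.
 *        if (ctx->egress_ifindex == 0)
 *            return XDP_DROP;
 *        return XDP_PASS;
 *    }
 *    char _license[] SEC("license") = "GPL";
 */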
  304. static void bq_xmit_all(struct xdp_dev_bulk_queue *bq, u32 flags)
  305. {
  306. struct net_device *dev = bq->dev;
  307. unsigned int cnt = bq->count;
  308. int sent = 0, err = 0;
  309. int to_send = cnt;
  310. int i;
  311. if (unlikely(!cnt))
  312. return;
  313. for (i = 0; i < cnt; i++) {
  314. struct xdp_frame *xdpf = bq->q[i];
  315. prefetch(xdpf);
  316. }
  317. if (bq->xdp_prog) {
  318. to_send = dev_map_bpf_prog_run(bq->xdp_prog, bq->q, cnt, dev);
  319. if (!to_send)
  320. goto out;
  321. }
  322. sent = dev->netdev_ops->ndo_xdp_xmit(dev, to_send, bq->q, flags);
  323. if (sent < 0) {
  324. /* If ndo_xdp_xmit fails with an errno, no frames have
  325. * been xmit'ed.
  326. */
  327. err = sent;
  328. sent = 0;
  329. }
  330. /* If not all frames have been transmitted, it is our
  331. * responsibility to free them
  332. */
  333. for (i = sent; unlikely(i < to_send); i++)
  334. xdp_return_frame_rx_napi(bq->q[i]);
  335. out:
  336. bq->count = 0;
  337. trace_xdp_devmap_xmit(bq->dev_rx, dev, sent, cnt - sent, err);
  338. }
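/*
 * ndo_xdp_xmit() either consumes the first "sent" frames and leaves the rest
 * to the caller, or fails with a negative errno having consumed none; the
 * cleanup loop above follows from that contract. Standalone sketch of the
 * same ownership rule with invented names, not part of this file.
 *
 *    #include <stdlib.h>
 *
 *    // Pretend driver hook: accepts at most n buffers, returns how many it
 *    // took, or a negative errno after taking none.
 *    static int fake_xmit(void **bufs, int n)
 *    {
 *        (void)bufs;
 *        return n / 2;   // pretend only half were accepted
 *    }
 *
 *    static void xmit_batch(void **bufs, int n)
 *    {
 *        int sent = fake_xmit(bufs, n);
 *
 *        if (sent < 0)
 *            sent = 0;   // error: nothing was consumed
 *        for (int i = sent; i < n; i++)
 *            free(bufs[i]);   // unsent buffers stay ours to free
 *    }
 */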
  339. /* __dev_flush is called from xdp_do_flush() which _must_ be signalled from the
  340. * driver before returning from its napi->poll() routine. See the comment above
  341. * xdp_do_flush() in filter.c.
  342. */
  343. void __dev_flush(void)
  344. {
  345. struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
  346. struct xdp_dev_bulk_queue *bq, *tmp;
  347. list_for_each_entry_safe(bq, tmp, flush_list, flush_node) {
  348. bq_xmit_all(bq, XDP_XMIT_FLUSH);
  349. bq->dev_rx = NULL;
  350. bq->xdp_prog = NULL;
  351. __list_del_clearprev(&bq->flush_node);
  352. }
  353. }
  354. /* Elements are kept alive by RCU; either by rcu_read_lock() (from syscall) or
  355. * by local_bh_disable() (from XDP calls inside NAPI). The
  356. * rcu_read_lock_bh_held() below makes lockdep accept both.
  357. */
  358. static void *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
  359. {
  360. struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
  361. struct bpf_dtab_netdev *obj;
  362. if (key >= map->max_entries)
  363. return NULL;
  364. obj = rcu_dereference_check(dtab->netdev_map[key],
  365. rcu_read_lock_bh_held());
  366. return obj;
  367. }
  368. /* Runs in NAPI, i.e., softirq under local_bh_disable(). Thus, safe percpu
  369. * variable access, and map elements stick around. See comment above
  370. * xdp_do_flush() in filter.c.
  371. */
  372. static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
  373. struct net_device *dev_rx, struct bpf_prog *xdp_prog)
  374. {
  375. struct list_head *flush_list = this_cpu_ptr(&dev_flush_list);
  376. struct xdp_dev_bulk_queue *bq = this_cpu_ptr(dev->xdp_bulkq);
  377. if (unlikely(bq->count == DEV_MAP_BULK_SIZE))
  378. bq_xmit_all(bq, 0);
  379. /* Ingress dev_rx will be the same for all xdp_frames in the
  380. * bulk_queue, because the bq is stored per-CPU and must be flushed
  381. * at the end of the net_device driver's NAPI function.
  382. *
  383. * Do the same with xdp_prog and flush_list since these fields
  384. * are only ever modified together.
  385. */
  386. if (!bq->dev_rx) {
  387. bq->dev_rx = dev_rx;
  388. bq->xdp_prog = xdp_prog;
  389. list_add(&bq->flush_node, flush_list);
  390. }
  391. bq->q[bq->count++] = xdpf;
  392. }
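/*
 * Sketch of the bulking idea behind bq_enqueue(): frames accumulate in a
 * per-CPU array and are only handed to the driver when the array fills up or
 * when __dev_flush() runs at the end of the NAPI poll. Standalone user-space
 * analogue with invented names (xmit_batch() as in the sketch further up);
 * not part of this file.
 *
 *    #define BULK_SIZE 16
 *
 *    struct bulk_queue {
 *        void *q[BULK_SIZE];
 *        int count;
 *    };
 *
 *    static void xmit_batch(void **bufs, int n);   // hand a batch to the driver
 *
 *    static void bq_flush(struct bulk_queue *bq)
 *    {
 *        xmit_batch(bq->q, bq->count);
 *        bq->count = 0;
 *    }
 *
 *    static void bq_push(struct bulk_queue *bq, void *frame)
 *    {
 *        if (bq->count == BULK_SIZE)
 *            bq_flush(bq);   // amortize the per-call transmit cost
 *        bq->q[bq->count++] = frame;
 *    }
 */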
  393. static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
  394. struct net_device *dev_rx,
  395. struct bpf_prog *xdp_prog)
  396. {
  397. int err;
  398. if (!dev->netdev_ops->ndo_xdp_xmit)
  399. return -EOPNOTSUPP;
  400. err = xdp_ok_fwd_dev(dev, xdp_get_frame_len(xdpf));
  401. if (unlikely(err))
  402. return err;
  403. bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
  404. return 0;
  405. }
  406. static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev *dst)
  407. {
  408. struct xdp_txq_info txq = { .dev = dst->dev };
  409. struct xdp_buff xdp;
  410. u32 act;
  411. if (!dst->xdp_prog)
  412. return XDP_PASS;
  413. __skb_pull(skb, skb->mac_len);
  414. xdp.txq = &txq;
  415. act = bpf_prog_run_generic_xdp(skb, &xdp, dst->xdp_prog);
  416. switch (act) {
  417. case XDP_PASS:
  418. __skb_push(skb, skb->mac_len);
  419. break;
  420. default:
  421. bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
  422. fallthrough;
  423. case XDP_ABORTED:
  424. trace_xdp_exception(dst->dev, dst->xdp_prog, act);
  425. fallthrough;
  426. case XDP_DROP:
  427. kfree_skb(skb);
  428. break;
  429. }
  430. return act;
  431. }
  432. int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
  433. struct net_device *dev_rx)
  434. {
  435. return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
  436. }
  437. int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
  438. struct net_device *dev_rx)
  439. {
  440. struct net_device *dev = dst->dev;
  441. return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog);
  442. }
  443. static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
  444. {
  445. if (!obj ||
  446. !obj->dev->netdev_ops->ndo_xdp_xmit)
  447. return false;
  448. if (xdp_ok_fwd_dev(obj->dev, xdp_get_frame_len(xdpf)))
  449. return false;
  450. return true;
  451. }
  452. static int dev_map_enqueue_clone(struct bpf_dtab_netdev *obj,
  453. struct net_device *dev_rx,
  454. struct xdp_frame *xdpf)
  455. {
  456. struct xdp_frame *nxdpf;
  457. nxdpf = xdpf_clone(xdpf);
  458. if (!nxdpf)
  459. return -ENOMEM;
  460. bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);
  461. return 0;
  462. }
  463. static inline bool is_ifindex_excluded(int *excluded, int num_excluded, int ifindex)
  464. {
  465. while (num_excluded--) {
  466. if (ifindex == excluded[num_excluded])
  467. return true;
  468. }
  469. return false;
  470. }
  471. /* Get ifindex of each upper device. 'indexes' must be able to hold at
  472. * least MAX_NEST_DEV elements.
  473. * Returns the number of ifindexes added.
  474. */
  475. static int get_upper_ifindexes(struct net_device *dev, int *indexes)
  476. {
  477. struct net_device *upper;
  478. struct list_head *iter;
  479. int n = 0;
  480. netdev_for_each_upper_dev_rcu(dev, upper, iter) {
  481. indexes[n++] = upper->ifindex;
  482. }
  483. return n;
  484. }
  485. int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
  486. struct bpf_map *map, bool exclude_ingress)
  487. {
  488. struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
  489. struct bpf_dtab_netdev *dst, *last_dst = NULL;
  490. int excluded_devices[1+MAX_NEST_DEV];
  491. struct hlist_head *head;
  492. int num_excluded = 0;
  493. unsigned int i;
  494. int err;
  495. if (exclude_ingress) {
  496. num_excluded = get_upper_ifindexes(dev_rx, excluded_devices);
  497. excluded_devices[num_excluded++] = dev_rx->ifindex;
  498. }
  499. if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
  500. for (i = 0; i < map->max_entries; i++) {
  501. dst = rcu_dereference_check(dtab->netdev_map[i],
  502. rcu_read_lock_bh_held());
  503. if (!is_valid_dst(dst, xdpf))
  504. continue;
  505. if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
  506. continue;
  507. /* we only need n-1 clones; last_dst enqueued below */
  508. if (!last_dst) {
  509. last_dst = dst;
  510. continue;
  511. }
  512. err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
  513. if (err)
  514. return err;
  515. last_dst = dst;
  516. }
  517. } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
  518. for (i = 0; i < dtab->n_buckets; i++) {
  519. head = dev_map_index_hash(dtab, i);
  520. hlist_for_each_entry_rcu(dst, head, index_hlist,
  521. lockdep_is_held(&dtab->index_lock)) {
  522. if (!is_valid_dst(dst, xdpf))
  523. continue;
  524. if (is_ifindex_excluded(excluded_devices, num_excluded,
  525. dst->dev->ifindex))
  526. continue;
  527. /* we only need n-1 clones; last_dst enqueued below */
  528. if (!last_dst) {
  529. last_dst = dst;
  530. continue;
  531. }
  532. err = dev_map_enqueue_clone(last_dst, dev_rx, xdpf);
  533. if (err)
  534. return err;
  535. last_dst = dst;
  536. }
  537. }
  538. }
  539. /* consume the last copy of the frame */
  540. if (last_dst)
  541. bq_enqueue(last_dst->dev, xdpf, dev_rx, last_dst->xdp_prog);
  542. else
  543. xdp_return_frame_rx_napi(xdpf); /* dtab is empty */
  544. return 0;
  545. }
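/*
 * dev_map_enqueue_multi() backs the broadcast mode of bpf_redirect_map():
 * every valid map entry gets a clone (minus the ingress device and its upper
 * devices when BPF_F_EXCLUDE_INGRESS is set), and the original frame is spent
 * on the last destination. Hedged BPF-side sketch in a separate object; the
 * map name is invented.
 *
 *    #include <linux/bpf.h>
 *    #include <bpf/bpf_helpers.h>
 *
 *    struct {
 *        __uint(type, BPF_MAP_TYPE_DEVMAP);
 *        __uint(key_size, sizeof(__u32));
 *        __uint(value_size, sizeof(__u32));
 *        __uint(max_entries, 64);
 *    } flood_ports SEC(".maps");
 *
 *    SEC("xdp")
 *    int xdp_flood(struct xdp_md *ctx)
 *    {
 *        // The key is ignored in broadcast mode; all entries are targets.
 *        return bpf_redirect_map(&flood_ports, 0,
 *                                BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS);
 *    }
 *    char _license[] SEC("license") = "GPL";
 */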
  546. int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
  547. struct bpf_prog *xdp_prog)
  548. {
  549. int err;
  550. err = xdp_ok_fwd_dev(dst->dev, skb->len);
  551. if (unlikely(err))
  552. return err;
  553. /* Redirect has already succeeded semantically at this point, so we just
  554. * return 0 even if the packet is dropped. The helper below takes care of
  555. * freeing the skb.
  556. */
  557. if (dev_map_bpf_prog_run_skb(skb, dst) != XDP_PASS)
  558. return 0;
  559. skb->dev = dst->dev;
  560. generic_xdp_tx(skb, xdp_prog);
  561. return 0;
  562. }
  563. static int dev_map_redirect_clone(struct bpf_dtab_netdev *dst,
  564. struct sk_buff *skb,
  565. struct bpf_prog *xdp_prog)
  566. {
  567. struct sk_buff *nskb;
  568. int err;
  569. nskb = skb_clone(skb, GFP_ATOMIC);
  570. if (!nskb)
  571. return -ENOMEM;
  572. err = dev_map_generic_redirect(dst, nskb, xdp_prog);
  573. if (unlikely(err)) {
  574. consume_skb(nskb);
  575. return err;
  576. }
  577. return 0;
  578. }
  579. int dev_map_redirect_multi(struct net_device *dev, struct sk_buff *skb,
  580. struct bpf_prog *xdp_prog, struct bpf_map *map,
  581. bool exclude_ingress)
  582. {
  583. struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
  584. struct bpf_dtab_netdev *dst, *last_dst = NULL;
  585. int excluded_devices[1+MAX_NEST_DEV];
  586. struct hlist_head *head;
  587. struct hlist_node *next;
  588. int num_excluded = 0;
  589. unsigned int i;
  590. int err;
  591. if (exclude_ingress) {
  592. num_excluded = get_upper_ifindexes(dev, excluded_devices);
  593. excluded_devices[num_excluded++] = dev->ifindex;
  594. }
  595. if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
  596. for (i = 0; i < map->max_entries; i++) {
  597. dst = rcu_dereference_check(dtab->netdev_map[i],
  598. rcu_read_lock_bh_held());
  599. if (!dst)
  600. continue;
  601. if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
  602. continue;
  603. /* we only need n-1 clones; last_dst enqueued below */
  604. if (!last_dst) {
  605. last_dst = dst;
  606. continue;
  607. }
  608. err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
  609. if (err)
  610. return err;
  611. last_dst = dst;
  612. }
  613. } else { /* BPF_MAP_TYPE_DEVMAP_HASH */
  614. for (i = 0; i < dtab->n_buckets; i++) {
  615. head = dev_map_index_hash(dtab, i);
  616. hlist_for_each_entry_safe(dst, next, head, index_hlist) {
  617. if (!dst)
  618. continue;
  619. if (is_ifindex_excluded(excluded_devices, num_excluded,
  620. dst->dev->ifindex))
  621. continue;
  622. /* we only need n-1 clones; last_dst enqueued below */
  623. if (!last_dst) {
  624. last_dst = dst;
  625. continue;
  626. }
  627. err = dev_map_redirect_clone(last_dst, skb, xdp_prog);
  628. if (err)
  629. return err;
  630. last_dst = dst;
  631. }
  632. }
  633. }
  634. /* consume the first skb and return */
  635. if (last_dst)
  636. return dev_map_generic_redirect(last_dst, skb, xdp_prog);
  637. /* dtab is empty */
  638. consume_skb(skb);
  639. return 0;
  640. }
  641. static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
  642. {
  643. struct bpf_dtab_netdev *obj = __dev_map_lookup_elem(map, *(u32 *)key);
  644. return obj ? &obj->val : NULL;
  645. }
  646. static void *dev_map_hash_lookup_elem(struct bpf_map *map, void *key)
  647. {
  648. struct bpf_dtab_netdev *obj = __dev_map_hash_lookup_elem(map,
  649. *(u32 *)key);
  650. return obj ? &obj->val : NULL;
  651. }
  652. static void __dev_map_entry_free(struct rcu_head *rcu)
  653. {
  654. struct bpf_dtab_netdev *dev;
  655. dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
  656. if (dev->xdp_prog)
  657. bpf_prog_put(dev->xdp_prog);
  658. dev_put(dev->dev);
  659. kfree(dev);
  660. }
  661. static int dev_map_delete_elem(struct bpf_map *map, void *key)
  662. {
  663. struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
  664. struct bpf_dtab_netdev *old_dev;
  665. int k = *(u32 *)key;
  666. if (k >= map->max_entries)
  667. return -EINVAL;
  668. old_dev = unrcu_pointer(xchg(&dtab->netdev_map[k], NULL));
  669. if (old_dev)
  670. call_rcu(&old_dev->rcu, __dev_map_entry_free);
  671. return 0;
  672. }
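/*
 * The delete path above is the usual "unpublish, then reclaim after a grace
 * period" RCU pattern: xchg() atomically replaces the published pointer so
 * the datapath sees either the old entry or NULL, and call_rcu() defers the
 * free until every reader that might still hold the old pointer has left its
 * RCU (or NAPI) critical section. Minimal kernel-style sketch with invented
 * names, not part of this file.
 *
 *    struct entry {
 *        struct rcu_head rcu;
 *        int payload;
 *    };
 *
 *    static struct entry __rcu *slot;
 *
 *    static void entry_free_rcu(struct rcu_head *rcu)
 *    {
 *        kfree(container_of(rcu, struct entry, rcu));
 *    }
 *
 *    static void slot_clear(void)
 *    {
 *        struct entry *old = unrcu_pointer(xchg(&slot, NULL));
 *
 *        if (old)
 *            call_rcu(&old->rcu, entry_free_rcu);
 *    }
 */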
  673. static int dev_map_hash_delete_elem(struct bpf_map *map, void *key)
  674. {
  675. struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
  676. struct bpf_dtab_netdev *old_dev;
  677. int k = *(u32 *)key;
  678. unsigned long flags;
  679. int ret = -ENOENT;
  680. spin_lock_irqsave(&dtab->index_lock, flags);
  681. old_dev = __dev_map_hash_lookup_elem(map, k);
  682. if (old_dev) {
  683. dtab->items--;
  684. hlist_del_init_rcu(&old_dev->index_hlist);
  685. call_rcu(&old_dev->rcu, __dev_map_entry_free);
  686. ret = 0;
  687. }
  688. spin_unlock_irqrestore(&dtab->index_lock, flags);
  689. return ret;
  690. }
  691. static struct bpf_dtab_netdev *__dev_map_alloc_node(struct net *net,
  692. struct bpf_dtab *dtab,
  693. struct bpf_devmap_val *val,
  694. unsigned int idx)
  695. {
  696. struct bpf_prog *prog = NULL;
  697. struct bpf_dtab_netdev *dev;
  698. dev = bpf_map_kmalloc_node(&dtab->map, sizeof(*dev),
  699. GFP_NOWAIT | __GFP_NOWARN,
  700. dtab->map.numa_node);
  701. if (!dev)
  702. return ERR_PTR(-ENOMEM);
  703. dev->dev = dev_get_by_index(net, val->ifindex);
  704. if (!dev->dev)
  705. goto err_out;
  706. if (val->bpf_prog.fd > 0) {
  707. prog = bpf_prog_get_type_dev(val->bpf_prog.fd,
  708. BPF_PROG_TYPE_XDP, false);
  709. if (IS_ERR(prog))
  710. goto err_put_dev;
  711. if (prog->expected_attach_type != BPF_XDP_DEVMAP ||
  712. !bpf_prog_map_compatible(&dtab->map, prog))
  713. goto err_put_prog;
  714. }
  715. dev->idx = idx;
  716. dev->dtab = dtab;
  717. if (prog) {
  718. dev->xdp_prog = prog;
  719. dev->val.bpf_prog.id = prog->aux->id;
  720. } else {
  721. dev->xdp_prog = NULL;
  722. dev->val.bpf_prog.id = 0;
  723. }
  724. dev->val.ifindex = val->ifindex;
  725. return dev;
  726. err_put_prog:
  727. bpf_prog_put(prog);
  728. err_put_dev:
  729. dev_put(dev->dev);
  730. err_out:
  731. kfree(dev);
  732. return ERR_PTR(-EINVAL);
  733. }
  734. static int __dev_map_update_elem(struct net *net, struct bpf_map *map,
  735. void *key, void *value, u64 map_flags)
  736. {
  737. struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
  738. struct bpf_dtab_netdev *dev, *old_dev;
  739. struct bpf_devmap_val val = {};
  740. u32 i = *(u32 *)key;
  741. if (unlikely(map_flags > BPF_EXIST))
  742. return -EINVAL;
  743. if (unlikely(i >= dtab->map.max_entries))
  744. return -E2BIG;
  745. if (unlikely(map_flags == BPF_NOEXIST))
  746. return -EEXIST;
  747. /* already verified value_size <= sizeof val */
  748. memcpy(&val, value, map->value_size);
  749. if (!val.ifindex) {
  750. dev = NULL;
  751. /* can not specify fd if ifindex is 0 */
  752. if (val.bpf_prog.fd > 0)
  753. return -EINVAL;
  754. } else {
  755. dev = __dev_map_alloc_node(net, dtab, &val, i);
  756. if (IS_ERR(dev))
  757. return PTR_ERR(dev);
  758. }
  759. /* Use call_rcu() here to ensure rcu critical sections have completed,
  760. * remembering that the driver side flush operation will happen before the
  761. * net device is removed.
  762. */
  763. old_dev = unrcu_pointer(xchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev)));
  764. if (old_dev)
  765. call_rcu(&old_dev->rcu, __dev_map_entry_free);
  766. return 0;
  767. }
  768. static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
  769. u64 map_flags)
  770. {
  771. return __dev_map_update_elem(current->nsproxy->net_ns,
  772. map, key, value, map_flags);
  773. }
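/*
 * User-space sketch of populating a DEVMAP slot with the 8-byte value layout
 * accepted by dev_map_init_map(), assuming a libbpf that provides
 * bpf_map_update_elem(); names and descriptors are illustrative, and this
 * belongs in a separate program, not in devmap.c.
 *
 *    #include <bpf/bpf.h>
 *    #include <linux/bpf.h>
 *    #include <net/if.h>
 *
 *    int set_tx_port(int map_fd, __u32 slot, const char *ifname, int prog_fd)
 *    {
 *        struct bpf_devmap_val val = {
 *            .ifindex = if_nametoindex(ifname),
 *            .bpf_prog.fd = prog_fd,   // fd <= 0 means "no devmap program"
 *        };
 *
 *        if (!val.ifindex)
 *            return -1;
 *        return bpf_map_update_elem(map_fd, &slot, &val, BPF_ANY);
 *    }
 */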
  774. static int __dev_map_hash_update_elem(struct net *net, struct bpf_map *map,
  775. void *key, void *value, u64 map_flags)
  776. {
  777. struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
  778. struct bpf_dtab_netdev *dev, *old_dev;
  779. struct bpf_devmap_val val = {};
  780. u32 idx = *(u32 *)key;
  781. unsigned long flags;
  782. int err = -EEXIST;
  783. /* already verified value_size <= sizeof val */
  784. memcpy(&val, value, map->value_size);
  785. if (unlikely(map_flags > BPF_EXIST || !val.ifindex))
  786. return -EINVAL;
  787. spin_lock_irqsave(&dtab->index_lock, flags);
  788. old_dev = __dev_map_hash_lookup_elem(map, idx);
  789. if (old_dev && (map_flags & BPF_NOEXIST))
  790. goto out_err;
  791. dev = __dev_map_alloc_node(net, dtab, &val, idx);
  792. if (IS_ERR(dev)) {
  793. err = PTR_ERR(dev);
  794. goto out_err;
  795. }
  796. if (old_dev) {
  797. hlist_del_rcu(&old_dev->index_hlist);
  798. } else {
  799. if (dtab->items >= dtab->map.max_entries) {
  800. spin_unlock_irqrestore(&dtab->index_lock, flags);
  801. call_rcu(&dev->rcu, __dev_map_entry_free);
  802. return -E2BIG;
  803. }
  804. dtab->items++;
  805. }
  806. hlist_add_head_rcu(&dev->index_hlist,
  807. dev_map_index_hash(dtab, idx));
  808. spin_unlock_irqrestore(&dtab->index_lock, flags);
  809. if (old_dev)
  810. call_rcu(&old_dev->rcu, __dev_map_entry_free);
  811. return 0;
  812. out_err:
  813. spin_unlock_irqrestore(&dtab->index_lock, flags);
  814. return err;
  815. }
  816. static int dev_map_hash_update_elem(struct bpf_map *map, void *key, void *value,
  817. u64 map_flags)
  818. {
  819. return __dev_map_hash_update_elem(current->nsproxy->net_ns,
  820. map, key, value, map_flags);
  821. }
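/*
 * For DEVMAP_HASH the key is an arbitrary u32 rather than an array index,
 * which (as the header comment notes) lets sparse ifindexes be used as keys
 * directly. Hedged user-space sketch with invented names, separate program.
 *
 *    #include <bpf/bpf.h>
 *    #include <net/if.h>
 *
 *    int hash_add_port(int map_fd, const char *ifname)
 *    {
 *        __u32 ifindex = if_nametoindex(ifname);
 *        __u32 val = ifindex;   // 4-byte value: redirect to that device
 *
 *        if (!ifindex)
 *            return -1;
 *        // Key and value are both the ifindex, so the XDP side can simply
 *        // call bpf_redirect_map(&map, ifindex, 0).
 *        return bpf_map_update_elem(map_fd, &ifindex, &val, BPF_NOEXIST);
 *    }
 */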
  822. static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
  823. {
  824. return __bpf_xdp_redirect_map(map, ifindex, flags,
  825. BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
  826. __dev_map_lookup_elem);
  827. }
  828. static int dev_hash_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
  829. {
  830. return __bpf_xdp_redirect_map(map, ifindex, flags,
  831. BPF_F_BROADCAST | BPF_F_EXCLUDE_INGRESS,
  832. __dev_map_hash_lookup_elem);
  833. }
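/*
 * Both redirect callbacks above delegate to __bpf_xdp_redirect_map(), which
 * (on kernels that support it) also lets the low bits of the flags argument
 * select the XDP action to return when the map lookup fails. Hedged BPF-side
 * sketch in a separate object; the map name is invented.
 *
 *    #include <linux/bpf.h>
 *    #include <bpf/bpf_helpers.h>
 *
 *    struct {
 *        __uint(type, BPF_MAP_TYPE_DEVMAP_HASH);
 *        __uint(key_size, sizeof(__u32));
 *        __uint(value_size, sizeof(__u32));
 *        __uint(max_entries, 64);
 *    } tx_ports_hash SEC(".maps");
 *
 *    SEC("xdp")
 *    int xdp_redirect_or_pass(struct xdp_md *ctx)
 *    {
 *        // If ingress_ifindex is not in the map, the helper returns the
 *        // fallback action in the low flag bits (XDP_PASS here) instead of
 *        // XDP_REDIRECT, so the packet continues up the normal stack.
 *        return bpf_redirect_map(&tx_ports_hash, ctx->ingress_ifindex,
 *                                XDP_PASS);
 *    }
 *    char _license[] SEC("license") = "GPL";
 */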
  834. BTF_ID_LIST_SINGLE(dev_map_btf_ids, struct, bpf_dtab)
  835. const struct bpf_map_ops dev_map_ops = {
  836. .map_meta_equal = bpf_map_meta_equal,
  837. .map_alloc = dev_map_alloc,
  838. .map_free = dev_map_free,
  839. .map_get_next_key = dev_map_get_next_key,
  840. .map_lookup_elem = dev_map_lookup_elem,
  841. .map_update_elem = dev_map_update_elem,
  842. .map_delete_elem = dev_map_delete_elem,
  843. .map_check_btf = map_check_no_btf,
  844. .map_btf_id = &dev_map_btf_ids[0],
  845. .map_redirect = dev_map_redirect,
  846. };
  847. const struct bpf_map_ops dev_map_hash_ops = {
  848. .map_meta_equal = bpf_map_meta_equal,
  849. .map_alloc = dev_map_alloc,
  850. .map_free = dev_map_free,
  851. .map_get_next_key = dev_map_hash_get_next_key,
  852. .map_lookup_elem = dev_map_hash_lookup_elem,
  853. .map_update_elem = dev_map_hash_update_elem,
  854. .map_delete_elem = dev_map_hash_delete_elem,
  855. .map_check_btf = map_check_no_btf,
  856. .map_btf_id = &dev_map_btf_ids[0],
  857. .map_redirect = dev_hash_map_redirect,
  858. };
  859. static void dev_map_hash_remove_netdev(struct bpf_dtab *dtab,
  860. struct net_device *netdev)
  861. {
  862. unsigned long flags;
  863. u32 i;
  864. spin_lock_irqsave(&dtab->index_lock, flags);
  865. for (i = 0; i < dtab->n_buckets; i++) {
  866. struct bpf_dtab_netdev *dev;
  867. struct hlist_head *head;
  868. struct hlist_node *next;
  869. head = dev_map_index_hash(dtab, i);
  870. hlist_for_each_entry_safe(dev, next, head, index_hlist) {
  871. if (netdev != dev->dev)
  872. continue;
  873. dtab->items--;
  874. hlist_del_rcu(&dev->index_hlist);
  875. call_rcu(&dev->rcu, __dev_map_entry_free);
  876. }
  877. }
  878. spin_unlock_irqrestore(&dtab->index_lock, flags);
  879. }
  880. static int dev_map_notification(struct notifier_block *notifier,
  881. ulong event, void *ptr)
  882. {
  883. struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
  884. struct bpf_dtab *dtab;
  885. int i, cpu;
  886. switch (event) {
  887. case NETDEV_REGISTER:
  888. if (!netdev->netdev_ops->ndo_xdp_xmit || netdev->xdp_bulkq)
  889. break;
  890. /* will be freed in free_netdev() */
  891. netdev->xdp_bulkq = alloc_percpu(struct xdp_dev_bulk_queue);
  892. if (!netdev->xdp_bulkq)
  893. return NOTIFY_BAD;
  894. for_each_possible_cpu(cpu)
  895. per_cpu_ptr(netdev->xdp_bulkq, cpu)->dev = netdev;
  896. break;
  897. case NETDEV_UNREGISTER:
  898. /* This rcu_read_lock/unlock pair is needed because
  899. * dev_map_list is an RCU list AND to ensure a delete
  900. * operation does not free a netdev_map entry while we
  901. * are comparing it against the netdev being unregistered.
  902. */
  903. rcu_read_lock();
  904. list_for_each_entry_rcu(dtab, &dev_map_list, list) {
  905. if (dtab->map.map_type == BPF_MAP_TYPE_DEVMAP_HASH) {
  906. dev_map_hash_remove_netdev(dtab, netdev);
  907. continue;
  908. }
  909. for (i = 0; i < dtab->map.max_entries; i++) {
  910. struct bpf_dtab_netdev *dev, *odev;
  911. dev = rcu_dereference(dtab->netdev_map[i]);
  912. if (!dev || netdev != dev->dev)
  913. continue;
  914. odev = unrcu_pointer(cmpxchg(&dtab->netdev_map[i], RCU_INITIALIZER(dev), NULL));
  915. if (dev == odev)
  916. call_rcu(&dev->rcu,
  917. __dev_map_entry_free);
  918. }
  919. }
  920. rcu_read_unlock();
  921. break;
  922. default:
  923. break;
  924. }
  925. return NOTIFY_OK;
  926. }
  927. static struct notifier_block dev_map_notifier = {
  928. .notifier_call = dev_map_notification,
  929. };
  930. static int __init dev_map_init(void)
  931. {
  932. int cpu;
  933. /* Assure tracepoint shadow struct _bpf_dtab_netdev is in sync */
  934. BUILD_BUG_ON(offsetof(struct bpf_dtab_netdev, dev) !=
  935. offsetof(struct _bpf_dtab_netdev, dev));
  936. register_netdevice_notifier(&dev_map_notifier);
  937. for_each_possible_cpu(cpu)
  938. INIT_LIST_HEAD(&per_cpu(dev_flush_list, cpu));
  939. return 0;
  940. }
  941. subsys_initcall(dev_map_init);