ringbuf.c

#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/err.h>
#include <linux/irq_work.h>
#include <linux/slab.h>
#include <linux/filter.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>
#include <linux/wait.h>
#include <linux/poll.h>
#include <linux/kmemleak.h>
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>

#define RINGBUF_CREATE_FLAG_MASK (BPF_F_NUMA_NODE)

/* non-mmap()'able part of bpf_ringbuf (everything up to consumer page) */
#define RINGBUF_PGOFF \
	(offsetof(struct bpf_ringbuf, consumer_pos) >> PAGE_SHIFT)
/* consumer page and producer page */
#define RINGBUF_POS_PAGES 2

#define RINGBUF_MAX_RECORD_SZ (UINT_MAX/4)

/* Maximum size of ring buffer area is limited by 32-bit page offset within
 * record header, counted in pages. Reserve 8 bits for extensibility, and take
 * into account few extra pages for consumer/producer pages and
 * non-mmap()'able parts. This gives 64GB limit, which seems plenty for single
 * ring buffer.
 */
#define RINGBUF_MAX_DATA_SZ \
	(((1ULL << 24) - RINGBUF_POS_PAGES - RINGBUF_PGOFF) * PAGE_SIZE)

struct bpf_ringbuf {
	wait_queue_head_t waitq;
	struct irq_work work;
	u64 mask;
	struct page **pages;
	int nr_pages;
	spinlock_t spinlock ____cacheline_aligned_in_smp;
	/* For user-space producer ring buffers, an atomic_t busy bit is used
	 * to synchronize access to the ring buffers in the kernel, rather than
	 * the spinlock that is used for kernel-producer ring buffers. This is
	 * done because the ring buffer must hold a lock across a BPF program's
	 * callback:
	 *
	 *    __bpf_user_ringbuf_peek() // lock acquired
	 * -> program callback_fn()
	 * -> __bpf_user_ringbuf_sample_release() // lock released
	 *
	 * It is unsafe and incorrect to hold an IRQ spinlock across what could
	 * be a long execution window, so we instead simply disallow concurrent
	 * access to the ring buffer by kernel consumers, and return -EBUSY from
	 * __bpf_user_ringbuf_peek() if the busy bit is held by another task.
	 */
	atomic_t busy ____cacheline_aligned_in_smp;
	/* Consumer and producer counters are put into separate pages to
	 * allow each position to be mapped with different permissions.
	 * This prevents a user-space application from modifying the
	 * position and ruining in-kernel tracking. The permissions of the
	 * pages depend on who is producing samples: user-space or the
	 * kernel.
	 *
	 * Kernel-producer
	 * ---------------
	 * The producer position and data pages are mapped as r/o in
	 * userspace. For this approach, bits in the header of samples are
	 * used to signal to user-space, and to other producers, whether a
	 * sample is currently being written.
	 *
	 * User-space producer
	 * -------------------
	 * Only the page containing the consumer position is mapped r/o in
	 * user-space. User-space producers also use bits of the header to
	 * communicate to the kernel, but the kernel must carefully check and
	 * validate each sample to ensure that they're correctly formatted, and
	 * fully contained within the ring buffer.
	 */
	unsigned long consumer_pos __aligned(PAGE_SIZE);
	unsigned long producer_pos __aligned(PAGE_SIZE);
	char data[] __aligned(PAGE_SIZE);
};

struct bpf_ringbuf_map {
	struct bpf_map map;
	struct bpf_ringbuf *rb;
};

/* 8-byte ring buffer record header structure */
struct bpf_ringbuf_hdr {
	u32 len;
	u32 pg_off;
};
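
/* Allocate the backing pages for a ring buffer of data_sz bytes and vmap()
 * them into one contiguous kernel address range: RINGBUF_PGOFF pages of
 * struct bpf_ringbuf metadata, one consumer and one producer position page,
 * then the data pages (each mapped twice, see the diagram below).
 */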
static struct bpf_ringbuf *bpf_ringbuf_area_alloc(size_t data_sz, int numa_node)
{
	const gfp_t flags = GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL |
			    __GFP_NOWARN | __GFP_ZERO;
	int nr_meta_pages = RINGBUF_PGOFF + RINGBUF_POS_PAGES;
	int nr_data_pages = data_sz >> PAGE_SHIFT;
	int nr_pages = nr_meta_pages + nr_data_pages;
	struct page **pages, *page;
	struct bpf_ringbuf *rb;
	size_t array_size;
	int i;

	/* Each data page is mapped twice to allow "virtual"
	 * continuous read of samples wrapping around the end of ring
	 * buffer area:
	 * ------------------------------------------------------
	 * | meta pages |  real data pages  |  same data pages  |
	 * ------------------------------------------------------
	 * |            | 1 2 3 4 5 6 7 8 9 | 1 2 3 4 5 6 7 8 9 |
	 * ------------------------------------------------------
	 * |            | TA             DA | TA             DA |
	 * ------------------------------------------------------
	 *                               ^^^^^^^
	 *                                  |
	 * Here, no need to worry about special handling of wrapped-around
	 * data due to double-mapped data pages. This works both in kernel and
	 * when mmap()'ed in user-space, simplifying both kernel and
	 * user-space implementations significantly.
	 */
	array_size = (nr_meta_pages + 2 * nr_data_pages) * sizeof(*pages);
	pages = bpf_map_area_alloc(array_size, numa_node);
	if (!pages)
		return NULL;

	for (i = 0; i < nr_pages; i++) {
		page = alloc_pages_node(numa_node, flags, 0);
		if (!page) {
			nr_pages = i;
			goto err_free_pages;
		}
		pages[i] = page;
		if (i >= nr_meta_pages)
			pages[nr_data_pages + i] = page;
	}

	rb = vmap(pages, nr_meta_pages + 2 * nr_data_pages,
		  VM_MAP | VM_USERMAP, PAGE_KERNEL);
	if (rb) {
		kmemleak_not_leak(pages);
		rb->pages = pages;
		rb->nr_pages = nr_pages;
		return rb;
	}

err_free_pages:
	for (i = 0; i < nr_pages; i++)
		__free_page(pages[i]);
	bpf_map_area_free(pages);
	return NULL;
}
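
/* irq_work callback; runs in a safe context after a producer or drainer
 * queued it, and wakes up everything sleeping in poll() on the ring buffer's
 * waitqueue.
 */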
static void bpf_ringbuf_notify(struct irq_work *work)
{
	struct bpf_ringbuf *rb = container_of(work, struct bpf_ringbuf, work);

	wake_up_all(&rb->waitq);
}

static struct bpf_ringbuf *bpf_ringbuf_alloc(size_t data_sz, int numa_node)
{
	struct bpf_ringbuf *rb;

	rb = bpf_ringbuf_area_alloc(data_sz, numa_node);
	if (!rb)
		return NULL;

	spin_lock_init(&rb->spinlock);
	atomic_set(&rb->busy, 0);
	init_waitqueue_head(&rb->waitq);
	init_irq_work(&rb->work, bpf_ringbuf_notify);

	rb->mask = data_sz - 1;
	rb->consumer_pos = 0;
	rb->producer_pos = 0;

	return rb;
}

static struct bpf_map *ringbuf_map_alloc(union bpf_attr *attr)
{
	struct bpf_ringbuf_map *rb_map;

	if (attr->map_flags & ~RINGBUF_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	if (attr->key_size || attr->value_size ||
	    !is_power_of_2(attr->max_entries) ||
	    !PAGE_ALIGNED(attr->max_entries))
		return ERR_PTR(-EINVAL);

#ifdef CONFIG_64BIT
	/* on 32-bit arch, it's impossible to overflow record's hdr->pgoff */
	if (attr->max_entries > RINGBUF_MAX_DATA_SZ)
		return ERR_PTR(-E2BIG);
#endif

	rb_map = bpf_map_area_alloc(sizeof(*rb_map), NUMA_NO_NODE);
	if (!rb_map)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&rb_map->map, attr);

	rb_map->rb = bpf_ringbuf_alloc(attr->max_entries, rb_map->map.numa_node);
	if (!rb_map->rb) {
		bpf_map_area_free(rb_map);
		return ERR_PTR(-ENOMEM);
	}

	return &rb_map->map;
}

static void bpf_ringbuf_free(struct bpf_ringbuf *rb)
{
	/* copy pages pointer and nr_pages to local variable, as we are going
	 * to unmap rb itself with vunmap() below
	 */
	struct page **pages = rb->pages;
	int i, nr_pages = rb->nr_pages;

	vunmap(rb);
	for (i = 0; i < nr_pages; i++)
		__free_page(pages[i]);
	bpf_map_area_free(pages);
}

static void ringbuf_map_free(struct bpf_map *map)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	bpf_ringbuf_free(rb_map->rb);
	bpf_map_area_free(rb_map);
}

static void *ringbuf_map_lookup_elem(struct bpf_map *map, void *key)
{
	return ERR_PTR(-ENOTSUPP);
}

static int ringbuf_map_update_elem(struct bpf_map *map, void *key, void *value,
				   u64 flags)
{
	return -ENOTSUPP;
}

static int ringbuf_map_delete_elem(struct bpf_map *map, void *key)
{
	return -ENOTSUPP;
}

static int ringbuf_map_get_next_key(struct bpf_map *map, void *key,
				    void *next_key)
{
	return -ENOTSUPP;
}
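
/* mmap handler for kernel-producer ring buffers (BPF_MAP_TYPE_RINGBUF): only
 * the consumer_pos page may be mapped writable by user-space; the
 * producer_pos page and the data pages are read-only from user-space.
 */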
static int ringbuf_map_mmap_kern(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	if (vma->vm_flags & VM_WRITE) {
		/* allow writable mapping for the consumer_pos only */
		if (vma->vm_pgoff != 0 || vma->vm_end - vma->vm_start != PAGE_SIZE)
			return -EPERM;
	} else {
		vm_flags_clear(vma, VM_MAYWRITE);
	}
	/* remap_vmalloc_range() checks size and offset constraints */
	return remap_vmalloc_range(vma, rb_map->rb,
				   vma->vm_pgoff + RINGBUF_PGOFF);
}

static int ringbuf_map_mmap_user(struct bpf_map *map, struct vm_area_struct *vma)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	if (vma->vm_flags & VM_WRITE) {
		if (vma->vm_pgoff == 0)
			/* Disallow writable mappings to the consumer pointer,
			 * and allow writable mappings to both the producer
			 * position, and the ring buffer data itself.
			 */
			return -EPERM;
	} else {
		vm_flags_clear(vma, VM_MAYWRITE);
	}
	/* remap_vmalloc_range() checks size and offset constraints */
	return remap_vmalloc_range(vma, rb_map->rb, vma->vm_pgoff + RINGBUF_PGOFF);
}

static unsigned long ringbuf_avail_data_sz(struct bpf_ringbuf *rb)
{
	unsigned long cons_pos, prod_pos;

	cons_pos = smp_load_acquire(&rb->consumer_pos);
	prod_pos = smp_load_acquire(&rb->producer_pos);
	return prod_pos - cons_pos;
}

static u32 ringbuf_total_data_sz(const struct bpf_ringbuf *rb)
{
	return rb->mask + 1;
}

static __poll_t ringbuf_map_poll_kern(struct bpf_map *map, struct file *filp,
				      struct poll_table_struct *pts)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	poll_wait(filp, &rb_map->rb->waitq, pts);

	if (ringbuf_avail_data_sz(rb_map->rb))
		return EPOLLIN | EPOLLRDNORM;
	return 0;
}

static __poll_t ringbuf_map_poll_user(struct bpf_map *map, struct file *filp,
				      struct poll_table_struct *pts)
{
	struct bpf_ringbuf_map *rb_map;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	poll_wait(filp, &rb_map->rb->waitq, pts);

	if (ringbuf_avail_data_sz(rb_map->rb) < ringbuf_total_data_sz(rb_map->rb))
		return EPOLLOUT | EPOLLWRNORM;
	return 0;
}

BTF_ID_LIST_SINGLE(ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops ringbuf_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = ringbuf_map_alloc,
	.map_free = ringbuf_map_free,
	.map_mmap = ringbuf_map_mmap_kern,
	.map_poll = ringbuf_map_poll_kern,
	.map_lookup_elem = ringbuf_map_lookup_elem,
	.map_update_elem = ringbuf_map_update_elem,
	.map_delete_elem = ringbuf_map_delete_elem,
	.map_get_next_key = ringbuf_map_get_next_key,
	.map_btf_id = &ringbuf_map_btf_ids[0],
};

BTF_ID_LIST_SINGLE(user_ringbuf_map_btf_ids, struct, bpf_ringbuf_map)
const struct bpf_map_ops user_ringbuf_map_ops = {
	.map_meta_equal = bpf_map_meta_equal,
	.map_alloc = ringbuf_map_alloc,
	.map_free = ringbuf_map_free,
	.map_mmap = ringbuf_map_mmap_user,
	.map_poll = ringbuf_map_poll_user,
	.map_lookup_elem = ringbuf_map_lookup_elem,
	.map_update_elem = ringbuf_map_update_elem,
	.map_delete_elem = ringbuf_map_delete_elem,
	.map_get_next_key = ringbuf_map_get_next_key,
	.map_btf_id = &user_ringbuf_map_btf_ids[0],
};

/* Given pointer to ring buffer record metadata and struct bpf_ringbuf itself,
 * calculate offset from record metadata to ring buffer in pages, rounded
 * down. This page offset is stored as part of record metadata and allows to
 * restore struct bpf_ringbuf * from record pointer. This page offset is
 * stored at offset 4 of record metadata header.
 */
static size_t bpf_ringbuf_rec_pg_off(struct bpf_ringbuf *rb,
				     struct bpf_ringbuf_hdr *hdr)
{
	return ((void *)hdr - (void *)rb) >> PAGE_SHIFT;
}

/* Given pointer to ring buffer record header, restore pointer to struct
 * bpf_ringbuf itself by using page offset stored at offset 4
 */
static struct bpf_ringbuf *
bpf_ringbuf_restore_from_rec(struct bpf_ringbuf_hdr *hdr)
{
	unsigned long addr = (unsigned long)(void *)hdr;
	unsigned long off = (unsigned long)hdr->pg_off << PAGE_SHIFT;

	return (void *)((addr & PAGE_MASK) - off);
}
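
/* Reserve @size bytes (plus the 8-byte header, rounded up to a multiple of 8)
 * in a kernel-producer ring buffer. Producers are serialized with the
 * spinlock (only trylock is attempted in NMI context), free space is checked
 * against rb->mask, and the record header is written with
 * BPF_RINGBUF_BUSY_BIT set so consumers don't read the sample before it is
 * committed. Returns a pointer to the sample payload, or NULL on failure.
 */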
static void *__bpf_ringbuf_reserve(struct bpf_ringbuf *rb, u64 size)
{
	unsigned long cons_pos, prod_pos, new_prod_pos, flags;
	u32 len, pg_off;
	struct bpf_ringbuf_hdr *hdr;

	if (unlikely(size > RINGBUF_MAX_RECORD_SZ))
		return NULL;

	len = round_up(size + BPF_RINGBUF_HDR_SZ, 8);
	if (len > ringbuf_total_data_sz(rb))
		return NULL;

	cons_pos = smp_load_acquire(&rb->consumer_pos);

	if (in_nmi()) {
		if (!spin_trylock_irqsave(&rb->spinlock, flags))
			return NULL;
	} else {
		spin_lock_irqsave(&rb->spinlock, flags);
	}

	prod_pos = rb->producer_pos;
	new_prod_pos = prod_pos + len;

	/* check for out of ringbuf space by ensuring producer position
	 * doesn't advance more than (ringbuf_size - 1) ahead
	 */
	if (new_prod_pos - cons_pos > rb->mask) {
		spin_unlock_irqrestore(&rb->spinlock, flags);
		return NULL;
	}

	hdr = (void *)rb->data + (prod_pos & rb->mask);
	pg_off = bpf_ringbuf_rec_pg_off(rb, hdr);
	hdr->len = size | BPF_RINGBUF_BUSY_BIT;
	hdr->pg_off = pg_off;

	/* pairs with consumer's smp_load_acquire() */
	smp_store_release(&rb->producer_pos, new_prod_pos);

	spin_unlock_irqrestore(&rb->spinlock, flags);

	return (void *)hdr + BPF_RINGBUF_HDR_SZ;
}

BPF_CALL_3(bpf_ringbuf_reserve, struct bpf_map *, map, u64, size, u64, flags)
{
	struct bpf_ringbuf_map *rb_map;

	if (unlikely(flags))
		return 0;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	return (unsigned long)__bpf_ringbuf_reserve(rb_map->rb, size);
}

const struct bpf_func_proto bpf_ringbuf_reserve_proto = {
	.func = bpf_ringbuf_reserve,
	.ret_type = RET_PTR_TO_ALLOC_MEM_OR_NULL,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_CONST_ALLOC_SIZE_OR_ZERO,
	.arg3_type = ARG_ANYTHING,
};
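
/* Finalize a previously reserved record: atomically rewrite the header length
 * with the BUSY bit cleared (and the DISCARD bit set when discarding), then
 * schedule a wakeup via irq_work if the consumer has caught up to this
 * record, subject to the BPF_RB_NO_WAKEUP/BPF_RB_FORCE_WAKEUP flags.
 */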
static void bpf_ringbuf_commit(void *sample, u64 flags, bool discard)
{
	unsigned long rec_pos, cons_pos;
	struct bpf_ringbuf_hdr *hdr;
	struct bpf_ringbuf *rb;
	u32 new_len;

	hdr = sample - BPF_RINGBUF_HDR_SZ;
	rb = bpf_ringbuf_restore_from_rec(hdr);
	new_len = hdr->len ^ BPF_RINGBUF_BUSY_BIT;
	if (discard)
		new_len |= BPF_RINGBUF_DISCARD_BIT;

	/* update record header with correct final size prefix */
	xchg(&hdr->len, new_len);

	/* if consumer caught up and is waiting for our record, notify about
	 * new data availability
	 */
	rec_pos = (void *)hdr - (void *)rb->data;
	cons_pos = smp_load_acquire(&rb->consumer_pos) & rb->mask;

	if (flags & BPF_RB_FORCE_WAKEUP)
		irq_work_queue(&rb->work);
	else if (cons_pos == rec_pos && !(flags & BPF_RB_NO_WAKEUP))
		irq_work_queue(&rb->work);
}

BPF_CALL_2(bpf_ringbuf_submit, void *, sample, u64, flags)
{
	bpf_ringbuf_commit(sample, flags, false /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_proto = {
	.func = bpf_ringbuf_submit,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

BPF_CALL_2(bpf_ringbuf_discard, void *, sample, u64, flags)
{
	bpf_ringbuf_commit(sample, flags, true /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_proto = {
	.func = bpf_ringbuf_discard,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_ALLOC_MEM | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};
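
/* Typical BPF-program-side use of the reserve/submit API (illustrative sketch
 * only, not part of this file; "events", "struct event" and "fill_failed" are
 * hypothetical names):
 *
 *	struct event *e = bpf_ringbuf_reserve(&events, sizeof(*e), 0);
 *
 *	if (!e)
 *		return 0;	// ring buffer full, sample dropped
 *	e->pid = bpf_get_current_pid_tgid() >> 32;
 *	if (fill_failed)
 *		bpf_ringbuf_discard(e, 0);	// hand the space back
 *	else
 *		bpf_ringbuf_submit(e, 0);	// publish to consumers
 */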
BPF_CALL_4(bpf_ringbuf_output, struct bpf_map *, map, void *, data, u64, size,
	   u64, flags)
{
	struct bpf_ringbuf_map *rb_map;
	void *rec;

	if (unlikely(flags & ~(BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP)))
		return -EINVAL;

	rb_map = container_of(map, struct bpf_ringbuf_map, map);
	rec = __bpf_ringbuf_reserve(rb_map->rb, size);
	if (!rec)
		return -EAGAIN;

	memcpy(rec, data, size);
	bpf_ringbuf_commit(rec, flags, false /* discard */);
	return 0;
}

const struct bpf_func_proto bpf_ringbuf_output_proto = {
	.func = bpf_ringbuf_output,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
	.arg3_type = ARG_CONST_SIZE_OR_ZERO,
	.arg4_type = ARG_ANYTHING,
};

BPF_CALL_2(bpf_ringbuf_query, struct bpf_map *, map, u64, flags)
{
	struct bpf_ringbuf *rb;

	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;

	switch (flags) {
	case BPF_RB_AVAIL_DATA:
		return ringbuf_avail_data_sz(rb);
	case BPF_RB_RING_SIZE:
		return ringbuf_total_data_sz(rb);
	case BPF_RB_CONS_POS:
		return smp_load_acquire(&rb->consumer_pos);
	case BPF_RB_PROD_POS:
		return smp_load_acquire(&rb->producer_pos);
	default:
		return 0;
	}
}

const struct bpf_func_proto bpf_ringbuf_query_proto = {
	.func = bpf_ringbuf_query,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_ANYTHING,
};
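
/* The *_dynptr variants below reuse the same reserve/commit paths, but hand
 * the reserved sample to the BPF program as a ringbuf-flavored bpf_dynptr
 * instead of a raw pointer. On failure the dynptr is set to NULL, and
 * submit/discard on a NULL dynptr are no-ops.
 */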
BPF_CALL_4(bpf_ringbuf_reserve_dynptr, struct bpf_map *, map, u32, size, u64, flags,
	   struct bpf_dynptr_kern *, ptr)
{
	struct bpf_ringbuf_map *rb_map;
	void *sample;
	int err;

	if (unlikely(flags)) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	err = bpf_dynptr_check_size(size);
	if (err) {
		bpf_dynptr_set_null(ptr);
		return err;
	}

	rb_map = container_of(map, struct bpf_ringbuf_map, map);

	sample = __bpf_ringbuf_reserve(rb_map->rb, size);
	if (!sample) {
		bpf_dynptr_set_null(ptr);
		return -EINVAL;
	}

	bpf_dynptr_init(ptr, sample, BPF_DYNPTR_TYPE_RINGBUF, 0, size);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_reserve_dynptr_proto = {
	.func = bpf_ringbuf_reserve_dynptr,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_ANYTHING,
	.arg3_type = ARG_ANYTHING,
	.arg4_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | MEM_UNINIT,
};

BPF_CALL_2(bpf_ringbuf_submit_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
	if (!ptr->data)
		return 0;

	bpf_ringbuf_commit(ptr->data, flags, false /* discard */);

	bpf_dynptr_set_null(ptr);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_submit_dynptr_proto = {
	.func = bpf_ringbuf_submit_dynptr,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};

BPF_CALL_2(bpf_ringbuf_discard_dynptr, struct bpf_dynptr_kern *, ptr, u64, flags)
{
	if (!ptr->data)
		return 0;

	bpf_ringbuf_commit(ptr->data, flags, true /* discard */);

	bpf_dynptr_set_null(ptr);

	return 0;
}

const struct bpf_func_proto bpf_ringbuf_discard_dynptr_proto = {
	.func = bpf_ringbuf_discard_dynptr,
	.ret_type = RET_VOID,
	.arg1_type = ARG_PTR_TO_DYNPTR | DYNPTR_TYPE_RINGBUF | OBJ_RELEASE,
	.arg2_type = ARG_ANYTHING,
};
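
/* Peek at the oldest unconsumed sample written by a user-space producer.
 * Because the record header is under user-space control, it is validated
 * here: the producer position must be 8-byte aligned, and the sample must fit
 * both within the producer-advertised region and within the ring buffer data
 * area. Returns 0 with *sample and *size set, -ENODATA if the buffer is empty
 * or the next sample is still busy, -EAGAIN if the sample was discarded (the
 * consumer position is advanced past it), or another negative error for
 * malformed input.
 */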
static int __bpf_user_ringbuf_peek(struct bpf_ringbuf *rb, void **sample, u32 *size)
{
	int err;
	u32 hdr_len, sample_len, total_len, flags, *hdr;
	u64 cons_pos, prod_pos;

	/* Synchronizes with smp_store_release() in user-space producer. */
	prod_pos = smp_load_acquire(&rb->producer_pos);
	if (prod_pos % 8)
		return -EINVAL;

	/* Synchronizes with smp_store_release() in __bpf_user_ringbuf_sample_release() */
	cons_pos = smp_load_acquire(&rb->consumer_pos);
	if (cons_pos >= prod_pos)
		return -ENODATA;

	hdr = (u32 *)((uintptr_t)rb->data + (uintptr_t)(cons_pos & rb->mask));
	/* Synchronizes with smp_store_release() in user-space producer. */
	hdr_len = smp_load_acquire(hdr);
	flags = hdr_len & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT);
	sample_len = hdr_len & ~flags;
	total_len = round_up(sample_len + BPF_RINGBUF_HDR_SZ, 8);

	/* The sample must fit within the region advertised by the producer position. */
	if (total_len > prod_pos - cons_pos)
		return -EINVAL;

	/* The sample must fit within the data region of the ring buffer. */
	if (total_len > ringbuf_total_data_sz(rb))
		return -E2BIG;

	/* The sample must fit into a struct bpf_dynptr. */
	err = bpf_dynptr_check_size(sample_len);
	if (err)
		return -E2BIG;

	if (flags & BPF_RINGBUF_DISCARD_BIT) {
		/* If the discard bit is set, the sample should be skipped.
		 *
		 * Update the consumer pos, and return -EAGAIN so the caller
		 * knows to skip this sample and try to read the next one.
		 */
		smp_store_release(&rb->consumer_pos, cons_pos + total_len);
		return -EAGAIN;
	}

	if (flags & BPF_RINGBUF_BUSY_BIT)
		return -ENODATA;

	*sample = (void *)((uintptr_t)rb->data +
			   (uintptr_t)((cons_pos + BPF_RINGBUF_HDR_SZ) & rb->mask));
	*size = sample_len;
	return 0;
}

static void __bpf_user_ringbuf_sample_release(struct bpf_ringbuf *rb, size_t size, u64 flags)
{
	u64 consumer_pos;
	u32 rounded_size = round_up(size + BPF_RINGBUF_HDR_SZ, 8);

	/* Using smp_load_acquire() is unnecessary here, as the busy-bit
	 * prevents another task from writing to consumer_pos after it was read
	 * by this task with smp_load_acquire() in __bpf_user_ringbuf_peek().
	 */
	consumer_pos = rb->consumer_pos;
	/* Synchronizes with smp_load_acquire() in user-space producer. */
	smp_store_release(&rb->consumer_pos, consumer_pos + rounded_size);
}
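
/* bpf_user_ringbuf_drain() helper: consume up to BPF_MAX_USER_RINGBUF_SAMPLES
 * samples from a BPF_MAP_TYPE_USER_RINGBUF map, invoking callback_fn on each
 * sample wrapped in a local dynptr. The busy bit ensures a single in-kernel
 * consumer at a time. Returns the number of samples drained (discarded ones
 * excluded) or a negative error, and schedules a wakeup of user-space
 * producers waiting for free space according to the wakeup flags.
 */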
BPF_CALL_4(bpf_user_ringbuf_drain, struct bpf_map *, map,
	   void *, callback_fn, void *, callback_ctx, u64, flags)
{
	struct bpf_ringbuf *rb;
	long samples, discarded_samples = 0, ret = 0;
	bpf_callback_t callback = (bpf_callback_t)callback_fn;
	u64 wakeup_flags = BPF_RB_NO_WAKEUP | BPF_RB_FORCE_WAKEUP;
	int busy = 0;

	if (unlikely(flags & ~wakeup_flags))
		return -EINVAL;

	rb = container_of(map, struct bpf_ringbuf_map, map)->rb;

	/* If another consumer is already consuming a sample, wait for them to finish. */
	if (!atomic_try_cmpxchg(&rb->busy, &busy, 1))
		return -EBUSY;

	for (samples = 0; samples < BPF_MAX_USER_RINGBUF_SAMPLES && ret == 0; samples++) {
		int err;
		u32 size;
		void *sample;
		struct bpf_dynptr_kern dynptr;

		err = __bpf_user_ringbuf_peek(rb, &sample, &size);
		if (err) {
			if (err == -ENODATA) {
				break;
			} else if (err == -EAGAIN) {
				discarded_samples++;
				continue;
			} else {
				ret = err;
				goto schedule_work_return;
			}
		}

		bpf_dynptr_init(&dynptr, sample, BPF_DYNPTR_TYPE_LOCAL, 0, size);
		ret = callback((uintptr_t)&dynptr, (uintptr_t)callback_ctx, 0, 0, 0);
		__bpf_user_ringbuf_sample_release(rb, size, flags);
	}
	ret = samples - discarded_samples;

schedule_work_return:
	/* Prevent the clearing of the busy-bit from being reordered before the
	 * storing of any rb consumer or producer positions.
	 */
	smp_mb__before_atomic();
	atomic_set(&rb->busy, 0);

	if (flags & BPF_RB_FORCE_WAKEUP)
		irq_work_queue(&rb->work);
	else if (!(flags & BPF_RB_NO_WAKEUP) && samples > 0)
		irq_work_queue(&rb->work);

	return ret;
}

const struct bpf_func_proto bpf_user_ringbuf_drain_proto = {
	.func = bpf_user_ringbuf_drain,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_CONST_MAP_PTR,
	.arg2_type = ARG_PTR_TO_FUNC,
	.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
	.arg4_type = ARG_ANYTHING,
};
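
/* Illustrative BPF-program-side sketch of draining a user ring buffer (not
 * part of this file; "user_rb" and "handle_sample" are hypothetical names):
 *
 *	static long handle_sample(struct bpf_dynptr *dynptr, void *ctx)
 *	{
 *		// access the payload with bpf_dynptr_read()/bpf_dynptr_data()
 *		return 0;	// 0 = keep draining, 1 = stop early
 *	}
 *
 *	...
 *	long n = bpf_user_ringbuf_drain(&user_rb, handle_sample, NULL, 0);
 */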