// SPDX-License-Identifier: GPL-2.0-only
/*
 *  fs/eventfd.c
 *
 *  Copyright (C) 2007  Davide Libenzi <[email protected]>
 *
 */

#include <linux/file.h>
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/sched/signal.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/anon_inodes.h>
#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/kref.h>
#include <linux/eventfd.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/idr.h>
#include <linux/uio.h>

static DEFINE_IDA(eventfd_ida);

struct eventfd_ctx {
        struct kref kref;
        wait_queue_head_t wqh;
        /*
         * Every time that a write(2) is performed on an eventfd, the
         * value of the __u64 being written is added to "count" and a
         * wakeup is performed on "wqh". A read(2) will return the "count"
         * value to userspace, and will reset "count" to zero. The kernel
         * side eventfd_signal() also adds to the "count" counter and
         * issues a wakeup.
         */
        __u64 count;
        unsigned int flags;
        int id;
};
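
/*
 * Illustrative only (not part of this file): a minimal userspace sketch
 * of the counter semantics described above, assuming a Linux host with
 * eventfd(2) available.
 *
 *      #include <sys/eventfd.h>
 *      #include <stdint.h>
 *      #include <unistd.h>
 *
 *      int efd = eventfd(0, 0);        // counter starts at 0
 *      uint64_t v = 3;
 *      write(efd, &v, sizeof(v));      // count = 3, EPOLLIN readers woken
 *      v = 4;
 *      write(efd, &v, sizeof(v));      // count = 7
 *      read(efd, &v, sizeof(v));       // v = 7, count reset to 0
 */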

__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask)
{
        unsigned long flags;

        /*
         * Deadlock or stack overflow issues can happen if we recurse here
         * through waitqueue wakeup handlers. If the caller uses potentially
         * nested waitqueues with custom wakeup handlers, then it should
         * check eventfd_signal_allowed() before calling this function. If
         * it returns false, the eventfd_signal() call should be deferred to a
         * safe context.
         */
        if (WARN_ON_ONCE(current->in_eventfd))
                return 0;

        spin_lock_irqsave(&ctx->wqh.lock, flags);
        current->in_eventfd = 1;
        if (ULLONG_MAX - ctx->count < n)
                n = ULLONG_MAX - ctx->count;
        ctx->count += n;
        if (waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask);
        current->in_eventfd = 0;
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);

        return n;
}

/**
 * eventfd_signal - Adds @n to the eventfd counter.
 * @ctx: [in] Pointer to the eventfd context.
 * @n: [in] Value of the counter to be added to the eventfd internal counter.
 *          The value cannot be negative.
 *
 * This function is supposed to be called by the kernel in paths that do not
 * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
 * value, and we signal this as an overflow condition by returning an EPOLLERR
 * to poll(2).
 *
 * Returns the amount by which the counter was incremented. This will be less
 * than @n if the counter has overflowed.
 */
__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
{
        return eventfd_signal_mask(ctx, n, 0);
}
EXPORT_SYMBOL_GPL(eventfd_signal);
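
/*
 * A minimal kernel-side sketch (hypothetical driver code, not part of
 * this file): a producer pins a context via eventfd_ctx_fdget() and
 * signals completions from a path that cannot sleep. "struct my_dev"
 * and its "trigger" member are invented for illustration.
 *
 *      static void my_driver_complete(struct my_dev *dev)
 *      {
 *              if (dev->trigger)                       // struct eventfd_ctx *
 *                      eventfd_signal(dev->trigger, 1);
 *      }
 *
 * Callers on potentially nested waitqueue paths should first check
 * eventfd_signal_allowed(), as noted in eventfd_signal_mask() above.
 */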

static void eventfd_free_ctx(struct eventfd_ctx *ctx)
{
        if (ctx->id >= 0)
                ida_simple_remove(&eventfd_ida, ctx->id);
        kfree(ctx);
}

static void eventfd_free(struct kref *kref)
{
        struct eventfd_ctx *ctx = container_of(kref, struct eventfd_ctx, kref);

        eventfd_free_ctx(ctx);
}

/**
 * eventfd_ctx_put - Releases a reference to the internal eventfd context.
 * @ctx: [in] Pointer to eventfd context.
 *
 * The eventfd context reference must have been previously acquired either
 * with eventfd_ctx_fdget() or eventfd_ctx_fileget().
 */
void eventfd_ctx_put(struct eventfd_ctx *ctx)
{
        kref_put(&ctx->kref, eventfd_free);
}
EXPORT_SYMBOL_GPL(eventfd_ctx_put);

static int eventfd_release(struct inode *inode, struct file *file)
{
        struct eventfd_ctx *ctx = file->private_data;

        wake_up_poll(&ctx->wqh, EPOLLHUP);
        eventfd_ctx_put(ctx);
        return 0;
}

static __poll_t eventfd_poll(struct file *file, poll_table *wait)
{
        struct eventfd_ctx *ctx = file->private_data;
        __poll_t events = 0;
        u64 count;

        poll_wait(file, &ctx->wqh, wait);

        /*
         * All writes to ctx->count occur within ctx->wqh.lock. This read
         * can be done outside ctx->wqh.lock because we know that poll_wait
         * takes that lock (through add_wait_queue) if our caller will sleep.
         *
         * The read _can_ therefore seep into add_wait_queue's critical
         * section, but cannot move above it! add_wait_queue's spin_lock acts
         * as an acquire barrier and ensures that the read be ordered properly
         * against the writes. The following CAN happen and is safe:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     lock ctx->wqh.lock (in poll_wait)
         *     count = ctx->count
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        if (waitqueue_active)
         *                                          wake_up_locked_poll
         *                                        unlock ctx->wqh.lock
         *     eventfd_poll returns 0
         *
         * but the following, which would miss a wakeup, cannot happen:
         *
         *     poll                               write
         *     -----------------                  ------------
         *     count = ctx->count (INVALID!)
         *                                        lock ctx->wqh.lock
         *                                        ctx->count += n
         *                                        **waitqueue_active is false**
         *                                        **no wake_up_locked_poll!**
         *                                        unlock ctx->wqh.lock
         *     lock ctx->wqh.lock (in poll_wait)
         *     __add_wait_queue
         *     unlock ctx->wqh.lock
         *     eventfd_poll returns 0
         */
        count = READ_ONCE(ctx->count);

        if (count > 0)
                events |= EPOLLIN;
        if (count == ULLONG_MAX)
                events |= EPOLLERR;
        if (ULLONG_MAX - 1 > count)
                events |= EPOLLOUT;

        return events;
}
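
/*
 * Illustrative only (not part of this file): how the events above
 * surface through epoll in userspace, given an eventfd descriptor
 * "efd". EPOLLIN means the counter is nonzero, EPOLLOUT means a write
 * of at least 1 would not block, and EPOLLERR reports the ULLONG_MAX
 * overflow state.
 *
 *      struct epoll_event ev = { .events = EPOLLIN | EPOLLOUT };
 *      int epfd = epoll_create1(0);
 *      epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev);
 *      epoll_wait(epfd, &ev, 1, -1);   // a fresh eventfd reports only
 *                                      // EPOLLOUT; once a write makes
 *                                      // count > 0, EPOLLIN is set too
 */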

void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt)
{
        lockdep_assert_held(&ctx->wqh.lock);

        *cnt = ((ctx->flags & EFD_SEMAPHORE) && ctx->count) ? 1 : ctx->count;
        ctx->count -= *cnt;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_do_read);

/**
 * eventfd_ctx_remove_wait_queue - Reads the current counter and removes the wait queue.
 * @ctx: [in] Pointer to eventfd context.
 * @wait: [in] Wait queue to be removed.
 * @cnt: [out] Pointer to the 64-bit counter value.
 *
 * Returns %0 if successful, or the following error code:
 *
 * -EAGAIN      : The operation would have blocked.
 *
 * This is used to atomically remove a wait queue entry from the eventfd wait
 * queue head, and read/reset the counter value.
 */
int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait,
                                  __u64 *cnt)
{
        unsigned long flags;

        spin_lock_irqsave(&ctx->wqh.lock, flags);
        eventfd_ctx_do_read(ctx, cnt);
        __remove_wait_queue(&ctx->wqh, wait);
        if (*cnt != 0 && waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);

        return *cnt != 0 ? 0 : -EAGAIN;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue);

static ssize_t eventfd_read(struct kiocb *iocb, struct iov_iter *to)
{
        struct file *file = iocb->ki_filp;
        struct eventfd_ctx *ctx = file->private_data;
        __u64 ucnt = 0;
        DECLARE_WAITQUEUE(wait, current);

        if (iov_iter_count(to) < sizeof(ucnt))
                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
        if (!ctx->count) {
                if ((file->f_flags & O_NONBLOCK) ||
                    (iocb->ki_flags & IOCB_NOWAIT)) {
                        spin_unlock_irq(&ctx->wqh.lock);
                        return -EAGAIN;
                }
                __add_wait_queue(&ctx->wqh, &wait);
                for (;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ctx->count)
                                break;
                        if (signal_pending(current)) {
                                __remove_wait_queue(&ctx->wqh, &wait);
                                __set_current_state(TASK_RUNNING);
                                spin_unlock_irq(&ctx->wqh.lock);
                                return -ERESTARTSYS;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        eventfd_ctx_do_read(ctx, &ucnt);
        current->in_eventfd = 1;
        if (waitqueue_active(&ctx->wqh))
                wake_up_locked_poll(&ctx->wqh, EPOLLOUT);
        current->in_eventfd = 0;
        spin_unlock_irq(&ctx->wqh.lock);
        if (unlikely(copy_to_iter(&ucnt, sizeof(ucnt), to) != sizeof(ucnt)))
                return -EFAULT;

        return sizeof(ucnt);
}

static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count,
                             loff_t *ppos)
{
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
        __u64 ucnt;
        DECLARE_WAITQUEUE(wait, current);

        if (count < sizeof(ucnt))
                return -EINVAL;
        if (copy_from_user(&ucnt, buf, sizeof(ucnt)))
                return -EFAULT;
        if (ucnt == ULLONG_MAX)
                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
        res = -EAGAIN;
        if (ULLONG_MAX - ctx->count > ucnt)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {
                __add_wait_queue(&ctx->wqh, &wait);
                for (res = 0;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ULLONG_MAX - ctx->count > ucnt) {
                                res = sizeof(ucnt);
                                break;
                        }
                        if (signal_pending(current)) {
                                res = -ERESTARTSYS;
                                break;
                        }
                        spin_unlock_irq(&ctx->wqh.lock);
                        schedule();
                        spin_lock_irq(&ctx->wqh.lock);
                }
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
        if (likely(res > 0)) {
                ctx->count += ucnt;
                current->in_eventfd = 1;
                if (waitqueue_active(&ctx->wqh))
                        wake_up_locked_poll(&ctx->wqh, EPOLLIN);
                current->in_eventfd = 0;
        }
        spin_unlock_irq(&ctx->wqh.lock);

        return res;
}

#ifdef CONFIG_PROC_FS
static void eventfd_show_fdinfo(struct seq_file *m, struct file *f)
{
        struct eventfd_ctx *ctx = f->private_data;

        spin_lock_irq(&ctx->wqh.lock);
        seq_printf(m, "eventfd-count: %16llx\n",
                   (unsigned long long)ctx->count);
        spin_unlock_irq(&ctx->wqh.lock);
        seq_printf(m, "eventfd-id: %d\n", ctx->id);
}
#endif

static const struct file_operations eventfd_fops = {
#ifdef CONFIG_PROC_FS
        .show_fdinfo    = eventfd_show_fdinfo,
#endif
        .release        = eventfd_release,
        .poll           = eventfd_poll,
        .read_iter      = eventfd_read,
        .write          = eventfd_write,
        .llseek         = noop_llseek,
};

/**
 * eventfd_fget - Acquire a reference of an eventfd file descriptor.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the eventfd file structure in case of success, or one
 * of the following error pointers:
 *
 * -EBADF       : Invalid @fd file descriptor.
 * -EINVAL      : The @fd file descriptor is not an eventfd file.
 */
struct file *eventfd_fget(int fd)
{
        struct file *file;

        file = fget(fd);
        if (!file)
                return ERR_PTR(-EBADF);
        if (file->f_op != &eventfd_fops) {
                fput(file);
                return ERR_PTR(-EINVAL);
        }

        return file;
}
EXPORT_SYMBOL_GPL(eventfd_fget);

/**
 * eventfd_ctx_fdget - Acquires a reference to the internal eventfd context.
 * @fd: [in] Eventfd file descriptor.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointers returned by the following functions:
 *
 * eventfd_fget
 */
struct eventfd_ctx *eventfd_ctx_fdget(int fd)
{
        struct eventfd_ctx *ctx;
        struct fd f = fdget(fd);

        if (!f.file)
                return ERR_PTR(-EBADF);
        ctx = eventfd_ctx_fileget(f.file);
        fdput(f);
        return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fdget);
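
/*
 * A minimal sketch of the usual consumer pattern (hypothetical code,
 * not part of this file): a subsystem receives an eventfd file
 * descriptor "user_fd" from userspace, pins the context for later
 * signaling, and drops the reference when done.
 *
 *      struct eventfd_ctx *ctx = eventfd_ctx_fdget(user_fd);
 *      if (IS_ERR(ctx))
 *              return PTR_ERR(ctx);
 *      ...
 *      eventfd_signal(ctx, 1);         // notify the userspace waiter
 *      ...
 *      eventfd_ctx_put(ctx);           // release the pinned context
 */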

/**
 * eventfd_ctx_fileget - Acquires a reference to the internal eventfd context.
 * @file: [in] Eventfd file pointer.
 *
 * Returns a pointer to the internal eventfd context, otherwise the error
 * pointer:
 *
 * -EINVAL      : The @file pointer is not an eventfd file.
 */
struct eventfd_ctx *eventfd_ctx_fileget(struct file *file)
{
        struct eventfd_ctx *ctx;

        if (file->f_op != &eventfd_fops)
                return ERR_PTR(-EINVAL);

        ctx = file->private_data;
        kref_get(&ctx->kref);
        return ctx;
}
EXPORT_SYMBOL_GPL(eventfd_ctx_fileget);

static int do_eventfd(unsigned int count, int flags)
{
        struct eventfd_ctx *ctx;
        struct file *file;
        int fd;

        /* Check the EFD_* constants for consistency. */
        BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);

        if (flags & ~EFD_FLAGS_SET)
                return -EINVAL;

        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
        if (!ctx)
                return -ENOMEM;

        kref_init(&ctx->kref);
        init_waitqueue_head(&ctx->wqh);
        ctx->count = count;
        ctx->flags = flags;
        ctx->id = ida_simple_get(&eventfd_ida, 0, 0, GFP_KERNEL);

        flags &= EFD_SHARED_FCNTL_FLAGS;
        flags |= O_RDWR;
        fd = get_unused_fd_flags(flags);
        if (fd < 0)
                goto err;

        file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
        if (IS_ERR(file)) {
                put_unused_fd(fd);
                fd = PTR_ERR(file);
                goto err;
        }

        file->f_mode |= FMODE_NOWAIT;
        fd_install(fd, file);
        return fd;

err:
        eventfd_free_ctx(ctx);
        return fd;
}

SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
{
        return do_eventfd(count, flags);
}

SYSCALL_DEFINE1(eventfd, unsigned int, count)
{
        return do_eventfd(count, 0);
}
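
/*
 * Illustrative only (not part of this file): EFD_SEMAPHORE changes the
 * read side, see eventfd_ctx_do_read() above, so that each read(2)
 * decrements the counter by one and returns 1, instead of returning
 * and resetting the whole counter.
 *
 *      #include <sys/eventfd.h>
 *      #include <stdint.h>
 *      #include <unistd.h>
 *
 *      int efd = eventfd(3, EFD_SEMAPHORE);    // counter starts at 3
 *      uint64_t v;
 *      read(efd, &v, sizeof(v));               // v = 1, count = 2
 *      read(efd, &v, sizeof(v));               // v = 1, count = 1
 */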