// SPDX-License-Identifier: GPL-2.0
/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pseudo_fs.h>
#include <linux/magic.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>
#include <linux/fcntl.h>
#include <linux/memcontrol.h>
#include <linux/watch_queue.h>
#include <linux/sysctl.h>

#include <linux/uaccess.h>
#include <asm/ioctls.h>

#include "internal.h"

/*
 * New pipe buffers will be restricted to this size while the user is exceeding
 * their pipe buffer quota. The general pipe use case needs at least two
 * buffers: one for data yet to be read, and one for new data. If this is less
 * than two, then a write to a non-empty pipe may block even if the pipe is not
 * full. This can occur with GNU make jobserver or similar uses of pipes as
 * semaphores: multiple processes may be waiting to write tokens back to the
 * pipe before reading tokens: https://lore.kernel.org/lkml/1628086770.5rn8p04n6j.none@localhost/.
 *
 * Users can reduce their pipe buffers with F_SETPIPE_SZ below this at their
 * own risk, namely: pipe writes to non-full pipes may block until the pipe is
 * emptied.
 */
#define PIPE_MIN_DEF_BUFFERS 2

/*
 * The max size that a non-root user is allowed to grow the pipe. Can
 * be set by root in /proc/sys/fs/pipe-max-size
 */
static unsigned int pipe_max_size = 1048576;

/* Maximum allocatable pages per user. Hard limit is unset by default, soft
 * matches default values.
 */
static unsigned long pipe_user_pages_hard;
static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;

/*
 * We use head and tail indices that aren't masked off, except at the point of
 * dereference, but rather they're allowed to wrap naturally. This means there
 * isn't a dead spot in the buffer, but the ring has to be a power of two and
 * <= 2^31.
 * -- David Howells 2019-09-23.
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <[email protected]> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <[email protected]> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
	if (pipe->files)
		mutex_lock_nested(&pipe->mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
	/*
	 * pipe_lock() nests non-pipe inode locks (for writing to a file)
	 */
	pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
	if (pipe->files)
		mutex_unlock(&pipe->mutex);
}
EXPORT_SYMBOL(pipe_unlock);

static inline void __pipe_lock(struct pipe_inode_info *pipe)
{
	mutex_lock_nested(&pipe->mutex, I_MUTEX_PARENT);
}

static inline void __pipe_unlock(struct pipe_inode_info *pipe)
{
	mutex_unlock(&pipe->mutex);
}
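
/*
 * Take both pipe locks in a fixed order (by address), so that two tasks
 * locking the same pair of pipes cannot deadlock against each other.
 */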
void pipe_double_lock(struct pipe_inode_info *pipe1,
		      struct pipe_inode_info *pipe2)
{
	BUG_ON(pipe1 == pipe2);

	if (pipe1 < pipe2) {
		pipe_lock_nested(pipe1, I_MUTEX_PARENT);
		pipe_lock_nested(pipe2, I_MUTEX_CHILD);
	} else {
		pipe_lock_nested(pipe2, I_MUTEX_PARENT);
		pipe_lock_nested(pipe1, I_MUTEX_CHILD);
	}
}

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
				  struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * If nobody else uses this page, and we don't already have a
	 * temporary page, let's keep track of it as a one-deep
	 * allocation cache. (Otherwise just release our reference to it)
	 */
	if (page_count(page) == 1 && !pipe->tmp_page)
		pipe->tmp_page = page;
	else
		put_page(page);
}

static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
		struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	if (page_count(page) != 1)
		return false;
	memcg_kmem_uncharge_page(page, 0);
	__SetPageLocked(page);
	return true;
}
/**
 * generic_pipe_buf_try_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to attempt to steal
 *
 * Description:
 *	This function attempts to steal the &struct page attached to
 *	@buf. If successful, this function returns true and returns with
 *	the page locked. The caller may then reuse the page for whatever
 *	he wishes; the typical use is insertion into a different file
 *	page cache.
 */
bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
		struct pipe_buffer *buf)
{
	struct page *page = buf->page;

	/*
	 * A reference of one is golden, that means that the owner of this
	 * page is the only one holding a reference to it. lock the page
	 * and return OK.
	 */
	if (page_count(page) == 1) {
		lock_page(page);
		return true;
	}
	return false;
}
EXPORT_SYMBOL(generic_pipe_buf_try_steal);

/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to get a reference to
 *
 * Description:
 *	This function grabs an extra reference to @buf. It's used in
 *	the tee() system call, when we duplicate the buffers in one
 *	pipe into another.
 */
bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
	return try_get_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_get);

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:	the pipe that the buffer belongs to
 * @buf:	the buffer to put a reference to
 *
 * Description:
 *	This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
			      struct pipe_buffer *buf)
{
	put_page(buf->page);
}
EXPORT_SYMBOL(generic_pipe_buf_release);

static const struct pipe_buf_operations anon_pipe_buf_ops = {
	.release	= anon_pipe_buf_release,
	.try_steal	= anon_pipe_buf_try_steal,
	.get		= generic_pipe_buf_get,
};
/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_readable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int writers = READ_ONCE(pipe->writers);

	return !pipe_empty(head, tail) || !writers;
}

static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	bool was_full, wake_next_reader = false;
	ssize_t ret;

	/* Null read succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	ret = 0;
	__pipe_lock(pipe);

	/*
	 * We only wake up writers if the pipe was full when we started
	 * reading in order to avoid unnecessary wakeups.
	 *
	 * But when we do wake up writers, we do so using a sync wakeup
	 * (WF_SYNC), because we want them to get going and generate more
	 * data for us.
	 */
	was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
	for (;;) {
		/* Read ->head with a barrier vs post_one_notification() */
		unsigned int head = smp_load_acquire(&pipe->head);
		unsigned int tail = pipe->tail;
		unsigned int mask = pipe->ring_size - 1;

#ifdef CONFIG_WATCH_QUEUE
		if (pipe->note_loss) {
			struct watch_notification n;

			if (total_len < 8) {
				if (ret == 0)
					ret = -ENOBUFS;
				break;
			}

			n.type = WATCH_TYPE_META;
			n.subtype = WATCH_META_LOSS_NOTIFICATION;
			n.info = watch_sizeof(n);
			if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
				if (ret == 0)
					ret = -EFAULT;
				break;
			}
			ret += sizeof(n);
			total_len -= sizeof(n);
			pipe->note_loss = false;
		}
#endif

		if (!pipe_empty(head, tail)) {
			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
			size_t chars = buf->len;
			size_t written;
			int error;

			if (chars > total_len) {
				if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
					if (ret == 0)
						ret = -ENOBUFS;
					break;
				}
				chars = total_len;
			}

			error = pipe_buf_confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}

			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += chars;
			buf->offset += chars;
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			if (!buf->len) {
				pipe_buf_release(pipe, buf);
				spin_lock_irq(&pipe->rd_wait.lock);
#ifdef CONFIG_WATCH_QUEUE
				if (buf->flags & PIPE_BUF_FLAG_LOSS)
					pipe->note_loss = true;
#endif
				tail++;
				pipe->tail = tail;
				spin_unlock_irq(&pipe->rd_wait.lock);
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
			if (!pipe_empty(head, tail))	/* More to do? */
				continue;
		}

		if (!pipe->writers)
			break;
		if (ret)
			break;
		if (filp->f_flags & O_NONBLOCK) {
			ret = -EAGAIN;
			break;
		}
		__pipe_unlock(pipe);

		/*
		 * We only get here if we didn't actually read anything.
		 *
		 * However, we could have seen (and removed) a zero-sized
		 * pipe buffer, and might have made space in the buffers
		 * that way.
		 *
		 * You can't make zero-sized pipe buffers by doing an empty
		 * write (not even in packet mode), but they can happen if
		 * the writer gets an EFAULT when trying to fill a buffer
		 * that already got allocated and inserted in the buffer
		 * array.
		 *
		 * So we still need to wake up any pending writers in the
		 * _very_ unlikely case that the pipe was full, but we got
		 * no data.
		 */
		if (unlikely(was_full))
			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);

		/*
		 * But because we didn't read anything, at this point we can
		 * just return directly with -ERESTARTSYS if we're interrupted,
		 * since we've done any required wakeups and there's no need
		 * to mark anything accessed. And we've dropped the lock.
		 */
		if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0)
			return -ERESTARTSYS;

		__pipe_lock(pipe);
		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
		wake_next_reader = true;
	}
	if (pipe_empty(pipe->head, pipe->tail))
		wake_next_reader = false;
	__pipe_unlock(pipe);

	if (was_full)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (wake_next_reader)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	if (ret > 0)
		file_accessed(filp);
	return ret;
}
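
/*
 * O_DIRECT on a pipe selects "packet" mode (see pipe(2)): each write is
 * stored in its own buffer and a read returns at most one packet.
 */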
static inline int is_packetized(struct file *file)
{
	return (file->f_flags & O_DIRECT) != 0;
}

/* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
static inline bool pipe_writable(const struct pipe_inode_info *pipe)
{
	unsigned int head = READ_ONCE(pipe->head);
	unsigned int tail = READ_ONCE(pipe->tail);
	unsigned int max_usage = READ_ONCE(pipe->max_usage);

	return !pipe_full(head, tail, max_usage) ||
		!READ_ONCE(pipe->readers);
}

static ssize_t
pipe_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head;
	ssize_t ret = 0;
	size_t total_len = iov_iter_count(from);
	ssize_t chars;
	bool was_empty = false;
	bool wake_next_writer = false;

	/* Null write succeeds. */
	if (unlikely(total_len == 0))
		return 0;

	__pipe_lock(pipe);

	if (!pipe->readers) {
		send_sig(SIGPIPE, current, 0);
		ret = -EPIPE;
		goto out;
	}

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue) {
		ret = -EXDEV;
		goto out;
	}
#endif

	/*
	 * If it wasn't empty we try to merge new data into
	 * the last buffer.
	 *
	 * That naturally merges small writes, but it also
	 * page-aligns the rest of the writes for large writes
	 * spanning multiple pages.
	 */
	head = pipe->head;
	was_empty = pipe_empty(head, pipe->tail);
	chars = total_len & (PAGE_SIZE-1);
	if (chars && !was_empty) {
		unsigned int mask = pipe->ring_size - 1;
		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
		int offset = buf->offset + buf->len;

		if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
		    offset + chars <= PAGE_SIZE) {
			ret = pipe_buf_confirm(pipe, buf);
			if (ret)
				goto out;

			ret = copy_page_from_iter(buf->page, offset, chars, from);
			if (unlikely(ret < chars)) {
				ret = -EFAULT;
				goto out;
			}

			buf->len += ret;
			if (!iov_iter_count(from))
				goto out;
		}
	}

	for (;;) {
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		head = pipe->head;
		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
			unsigned int mask = pipe->ring_size - 1;
			struct pipe_buffer *buf = &pipe->bufs[head & mask];
			struct page *page = pipe->tmp_page;
			int copied;

			if (!page) {
				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
				if (unlikely(!page)) {
					ret = ret ? : -ENOMEM;
					break;
				}
				pipe->tmp_page = page;
			}

			/* Allocate a slot in the ring in advance and attach an
			 * empty buffer. If we fault or otherwise fail to use
			 * it, either the reader will consume it or it'll still
			 * be there for the next write.
			 */
			spin_lock_irq(&pipe->rd_wait.lock);

			head = pipe->head;
			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
				spin_unlock_irq(&pipe->rd_wait.lock);
				continue;
			}

			pipe->head = head + 1;
			spin_unlock_irq(&pipe->rd_wait.lock);

			/* Insert it into the buffer array */
			buf = &pipe->bufs[head & mask];
			buf->page = page;
			buf->ops = &anon_pipe_buf_ops;
			buf->offset = 0;
			buf->len = 0;
			if (is_packetized(filp))
				buf->flags = PIPE_BUF_FLAG_PACKET;
			else
				buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
			pipe->tmp_page = NULL;

			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			ret += copied;
			buf->offset = 0;
			buf->len = copied;

			if (!iov_iter_count(from))
				break;
		}

		if (!pipe_full(head, pipe->tail, pipe->max_usage))
			continue;

		/* Wait for buffer space to become available. */
		if (filp->f_flags & O_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/*
		 * We're going to release the pipe lock and wait for more
		 * space. We wake up any readers if necessary, and then
		 * after waiting we need to re-check whether the pipe
		 * became empty while we dropped the lock.
		 */
		__pipe_unlock(pipe);
		if (was_empty)
			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
		__pipe_lock(pipe);
		was_empty = pipe_empty(pipe->head, pipe->tail);
		wake_next_writer = true;
	}
out:
	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
		wake_next_writer = false;
	__pipe_unlock(pipe);

	/*
	 * If we do do a wakeup event, we do a 'sync' wakeup, because we
	 * want the reader to start processing things asap, rather than
	 * leave the data pending.
	 *
	 * This is particularly important for small writes, because of
	 * how (for example) the GNU make jobserver uses small writes to
	 * wake up pending jobs.
	 *
	 * Epoll nonsensically wants a wakeup whether the pipe
	 * was already empty or not.
	 */
	if (was_empty || pipe->poll_usage)
		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	if (wake_next_writer)
		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
		int err = file_update_time(filp);
		if (err)
			ret = err;
		sb_end_write(file_inode(filp)->i_sb);
	}
	return ret;
}
static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int count, head, tail, mask;

	switch (cmd) {
	case FIONREAD:
		__pipe_lock(pipe);
		count = 0;
		head = pipe->head;
		tail = pipe->tail;
		mask = pipe->ring_size - 1;

		while (tail != head) {
			count += pipe->bufs[tail & mask].len;
			tail++;
		}
		__pipe_unlock(pipe);

		return put_user(count, (int __user *)arg);

#ifdef CONFIG_WATCH_QUEUE
	case IOC_WATCH_QUEUE_SET_SIZE: {
		int ret;
		__pipe_lock(pipe);
		ret = watch_queue_set_size(pipe, arg);
		__pipe_unlock(pipe);
		return ret;
	}

	case IOC_WATCH_QUEUE_SET_FILTER:
		return watch_queue_set_filter(
			pipe, (struct watch_notification_filter __user *)arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
}

/* No kernel lock held - fine */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
	__poll_t mask;
	struct pipe_inode_info *pipe = filp->private_data;
	unsigned int head, tail;

	/* Epoll has some historical nasty semantics, this enables them */
	WRITE_ONCE(pipe->poll_usage, true);

	/*
	 * Reading pipe state only -- no need for acquiring the semaphore.
	 *
	 * But because this is racy, the code has to add the
	 * entry to the poll table _first_ ..
	 */
	if (filp->f_mode & FMODE_READ)
		poll_wait(filp, &pipe->rd_wait, wait);
	if (filp->f_mode & FMODE_WRITE)
		poll_wait(filp, &pipe->wr_wait, wait);

	/*
	 * .. and only then can you do the racy tests. That way,
	 * if something changes and you got it wrong, the poll
	 * table entry will wake you up and fix it.
	 */
	head = READ_ONCE(pipe->head);
	tail = READ_ONCE(pipe->tail);

	mask = 0;
	if (filp->f_mode & FMODE_READ) {
		if (!pipe_empty(head, tail))
			mask |= EPOLLIN | EPOLLRDNORM;
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= EPOLLHUP;
	}

	if (filp->f_mode & FMODE_WRITE) {
		if (!pipe_full(head, tail, pipe->max_usage))
			mask |= EPOLLOUT | EPOLLWRNORM;
		/*
		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= EPOLLERR;
	}

	return mask;
}
static void put_pipe_info(struct inode *inode, struct pipe_inode_info *pipe)
{
	int kill = 0;

	spin_lock(&inode->i_lock);
	if (!--pipe->files) {
		inode->i_pipe = NULL;
		kill = 1;
	}
	spin_unlock(&inode->i_lock);

	if (kill)
		free_pipe_info(pipe);
}

static int
pipe_release(struct inode *inode, struct file *file)
{
	struct pipe_inode_info *pipe = file->private_data;

	__pipe_lock(pipe);
	if (file->f_mode & FMODE_READ)
		pipe->readers--;
	if (file->f_mode & FMODE_WRITE)
		pipe->writers--;

	/* Was that the last reader or writer, but not the other side? */
	if (!pipe->readers != !pipe->writers) {
		wake_up_interruptible_all(&pipe->rd_wait);
		wake_up_interruptible_all(&pipe->wr_wait);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return 0;
}

static int
pipe_fasync(int fd, struct file *filp, int on)
{
	struct pipe_inode_info *pipe = filp->private_data;
	int retval = 0;

	__pipe_lock(pipe);
	if (filp->f_mode & FMODE_READ)
		retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
	if ((filp->f_mode & FMODE_WRITE) && retval >= 0) {
		retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
		if (retval < 0 && (filp->f_mode & FMODE_READ))
			/* this can happen only if on == T */
			fasync_helper(-1, filp, 0, &pipe->fasync_readers);
	}
	__pipe_unlock(pipe);
	return retval;
}
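
/*
 * Charge or uncharge pipe buffer pages against the owning user and return
 * the new total, which callers compare against the soft/hard limits below.
 */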
unsigned long account_pipe_buffers(struct user_struct *user,
				   unsigned long old, unsigned long new)
{
	return atomic_long_add_return(new - old, &user->pipe_bufs);
}

bool too_many_pipe_buffers_soft(unsigned long user_bufs)
{
	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);

	return soft_limit && user_bufs > soft_limit;
}

bool too_many_pipe_buffers_hard(unsigned long user_bufs)
{
	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);

	return hard_limit && user_bufs > hard_limit;
}

bool pipe_is_unprivileged_user(void)
{
	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
}

struct pipe_inode_info *alloc_pipe_info(void)
{
	struct pipe_inode_info *pipe;
	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
	struct user_struct *user = get_current_user();
	unsigned long user_bufs;
	unsigned int max_size = READ_ONCE(pipe_max_size);

	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
	if (pipe == NULL)
		goto out_free_uid;

	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
		pipe_bufs = max_size >> PAGE_SHIFT;

	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);

	if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
		user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
		pipe_bufs = PIPE_MIN_DEF_BUFFERS;
	}

	if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
		goto out_revert_acct;

	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
			     GFP_KERNEL_ACCOUNT);

	if (pipe->bufs) {
		init_waitqueue_head(&pipe->rd_wait);
		init_waitqueue_head(&pipe->wr_wait);
		pipe->r_counter = pipe->w_counter = 1;
		pipe->max_usage = pipe_bufs;
		pipe->ring_size = pipe_bufs;
		pipe->nr_accounted = pipe_bufs;
		pipe->user = user;
		mutex_init(&pipe->mutex);
		return pipe;
	}

out_revert_acct:
	(void) account_pipe_buffers(user, pipe_bufs, 0);
	kfree(pipe);
out_free_uid:
	free_uid(user);
	return NULL;
}

void free_pipe_info(struct pipe_inode_info *pipe)
{
	unsigned int i;

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		watch_queue_clear(pipe->watch_queue);
#endif

	(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
	free_uid(pipe->user);
	for (i = 0; i < pipe->ring_size; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;
		if (buf->ops)
			pipe_buf_release(pipe, buf);
	}
#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		put_watch_queue(pipe->watch_queue);
#endif
	if (pipe->tmp_page)
		__free_page(pipe->tmp_page);
	kfree(pipe->bufs);
	kfree(pipe);
}
static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
	return dynamic_dname(buffer, buflen, "pipe:[%lu]",
				d_inode(dentry)->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
	.d_dname	= pipefs_dname,
};
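
/*
 * Allocate a pipefs inode for an anonymous pipe and attach a freshly
 * allocated pipe_inode_info with one reader and one writer.
 */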
static struct inode * get_pipe_inode(void)
{
	struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb);
	struct pipe_inode_info *pipe;

	if (!inode)
		goto fail_inode;

	inode->i_ino = get_next_ino();

	pipe = alloc_pipe_info();
	if (!pipe)
		goto fail_iput;

	inode->i_pipe = pipe;
	pipe->files = 2;
	pipe->readers = pipe->writers = 1;
	inode->i_fop = &pipefifo_fops;

	/*
	 * Mark the inode dirty from the very beginning,
	 * that way it will never be moved to the dirty
	 * list because "mark_inode_dirty()" will think
	 * that it already _is_ on the dirty list.
	 */
	inode->i_state = I_DIRTY;
	inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);

	return inode;

fail_iput:
	iput(inode);

fail_inode:
	return NULL;
}

int create_pipe_files(struct file **res, int flags)
{
	struct inode *inode = get_pipe_inode();
	struct file *f;
	int error;

	if (!inode)
		return -ENFILE;

	if (flags & O_NOTIFICATION_PIPE) {
		error = watch_queue_init(inode->i_pipe);
		if (error) {
			free_pipe_info(inode->i_pipe);
			iput(inode);
			return error;
		}
	}

	f = alloc_file_pseudo(inode, pipe_mnt, "",
				O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)),
				&pipefifo_fops);
	if (IS_ERR(f)) {
		free_pipe_info(inode->i_pipe);
		iput(inode);
		return PTR_ERR(f);
	}

	f->private_data = inode->i_pipe;

	res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK),
				  &pipefifo_fops);
	if (IS_ERR(res[0])) {
		put_pipe_info(inode, inode->i_pipe);
		fput(f);
		return PTR_ERR(res[0]);
	}
	res[0]->private_data = inode->i_pipe;
	res[1] = f;
	stream_open(inode, res[0]);
	stream_open(inode, res[1]);
	return 0;
}
static int __do_pipe_flags(int *fd, struct file **files, int flags)
{
	int error;
	int fdw, fdr;

	if (flags & ~(O_CLOEXEC | O_NONBLOCK | O_DIRECT | O_NOTIFICATION_PIPE))
		return -EINVAL;

	error = create_pipe_files(files, flags);
	if (error)
		return error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_read_pipe;
	fdr = error;

	error = get_unused_fd_flags(flags);
	if (error < 0)
		goto err_fdr;
	fdw = error;

	audit_fd_pair(fdr, fdw);
	fd[0] = fdr;
	fd[1] = fdw;
	return 0;

 err_fdr:
	put_unused_fd(fdr);
 err_read_pipe:
	fput(files[0]);
	fput(files[1]);
	return error;
}

int do_pipe_flags(int *fd, int flags)
{
	struct file *files[2];
	int error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		fd_install(fd[0], files[0]);
		fd_install(fd[1], files[1]);
	}
	return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
static int do_pipe2(int __user *fildes, int flags)
{
	struct file *files[2];
	int fd[2];
	int error;

	error = __do_pipe_flags(fd, files, flags);
	if (!error) {
		if (unlikely(copy_to_user(fildes, fd, sizeof(fd)))) {
			fput(files[0]);
			fput(files[1]);
			put_unused_fd(fd[0]);
			put_unused_fd(fd[1]);
			error = -EFAULT;
		} else {
			fd_install(fd[0], files[0]);
			fd_install(fd[1], files[1]);
		}
	}
	return error;
}

SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
	return do_pipe2(fildes, flags);
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
	return do_pipe2(fildes, 0);
}
/*
 * This is the stupid "wait for pipe to be readable or writable"
 * model.
 *
 * See pipe_read/write() for the proper kind of exclusive wait,
 * but that requires that we wake up any other readers/writers
 * if we then do not end up reading everything (ie the whole
 * "wake_next_reader/writer" logic in pipe_read/write()).
 */
void pipe_wait_readable(struct pipe_inode_info *pipe)
{
	pipe_unlock(pipe);
	wait_event_interruptible(pipe->rd_wait, pipe_readable(pipe));
	pipe_lock(pipe);
}

void pipe_wait_writable(struct pipe_inode_info *pipe)
{
	pipe_unlock(pipe);
	wait_event_interruptible(pipe->wr_wait, pipe_writable(pipe));
	pipe_lock(pipe);
}

/*
 * This depends on both the wait (here) and the wakeup (wake_up_partner)
 * holding the pipe lock, so "*cnt" is stable and we know a wakeup cannot
 * race with the count check and waitqueue prep.
 *
 * Normally in order to avoid races, you'd do the prepare_to_wait() first,
 * then check the condition you're waiting for, and only then sleep. But
 * because of the pipe lock, we can check the condition before being on
 * the wait queue.
 *
 * We use the 'rd_wait' waitqueue for pipe partner waiting.
 */
static int wait_for_partner(struct pipe_inode_info *pipe, unsigned int *cnt)
{
	DEFINE_WAIT(rdwait);
	int cur = *cnt;

	while (cur == *cnt) {
		prepare_to_wait(&pipe->rd_wait, &rdwait, TASK_INTERRUPTIBLE);
		pipe_unlock(pipe);
		schedule();
		finish_wait(&pipe->rd_wait, &rdwait);
		pipe_lock(pipe);
		if (signal_pending(current))
			break;
	}
	return cur == *cnt ? -ERESTARTSYS : 0;
}

static void wake_up_partner(struct pipe_inode_info *pipe)
{
	wake_up_interruptible_all(&pipe->rd_wait);
}

static int fifo_open(struct inode *inode, struct file *filp)
{
	struct pipe_inode_info *pipe;
	bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC;
	int ret;

	filp->f_version = 0;

	spin_lock(&inode->i_lock);
	if (inode->i_pipe) {
		pipe = inode->i_pipe;
		pipe->files++;
		spin_unlock(&inode->i_lock);
	} else {
		spin_unlock(&inode->i_lock);
		pipe = alloc_pipe_info();
		if (!pipe)
			return -ENOMEM;
		pipe->files = 1;
		spin_lock(&inode->i_lock);
		if (unlikely(inode->i_pipe)) {
			inode->i_pipe->files++;
			spin_unlock(&inode->i_lock);
			free_pipe_info(pipe);
			pipe = inode->i_pipe;
		} else {
			inode->i_pipe = pipe;
			spin_unlock(&inode->i_lock);
		}
	}
	filp->private_data = pipe;
	/* OK, we have a pipe and it's pinned down */

	__pipe_lock(pipe);

	/* We can only do regular read/write on fifos */
	stream_open(inode, filp);

	switch (filp->f_mode & (FMODE_READ | FMODE_WRITE)) {
	case FMODE_READ:
	/*
	 *  O_RDONLY
	 *  POSIX.1 says that O_NONBLOCK means return with the FIFO
	 *  opened, even when there is no process writing the FIFO.
	 */
		pipe->r_counter++;
		if (pipe->readers++ == 0)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->writers) {
			if ((filp->f_flags & O_NONBLOCK)) {
				/* suppress EPOLLHUP until we have
				 * seen a writer */
				filp->f_version = pipe->w_counter;
			} else {
				if (wait_for_partner(pipe, &pipe->w_counter))
					goto err_rd;
			}
		}
		break;

	case FMODE_WRITE:
	/*
	 *  O_WRONLY
	 *  POSIX.1 says that O_NONBLOCK means return -1 with
	 *  errno=ENXIO when there is no process reading the FIFO.
	 */
		ret = -ENXIO;
		if (!is_pipe && (filp->f_flags & O_NONBLOCK) && !pipe->readers)
			goto err;

		pipe->w_counter++;
		if (!pipe->writers++)
			wake_up_partner(pipe);

		if (!is_pipe && !pipe->readers) {
			if (wait_for_partner(pipe, &pipe->r_counter))
				goto err_wr;
		}
		break;

	case FMODE_READ | FMODE_WRITE:
	/*
	 *  O_RDWR
	 *  POSIX.1 leaves this case "undefined" when O_NONBLOCK is set.
	 *  This implementation will NEVER block on an O_RDWR open, since
	 *  the process can at least talk to itself.
	 */
		pipe->readers++;
		pipe->writers++;
		pipe->r_counter++;
		pipe->w_counter++;
		if (pipe->readers == 1 || pipe->writers == 1)
			wake_up_partner(pipe);
		break;

	default:
		ret = -EINVAL;
		goto err;
	}

	/* Ok! */
	__pipe_unlock(pipe);
	return 0;

err_rd:
	if (!--pipe->readers)
		wake_up_interruptible(&pipe->wr_wait);
	ret = -ERESTARTSYS;
	goto err;

err_wr:
	if (!--pipe->writers)
		wake_up_interruptible_all(&pipe->rd_wait);
	ret = -ERESTARTSYS;
	goto err;

err:
	__pipe_unlock(pipe);

	put_pipe_info(inode, pipe);
	return ret;
}
const struct file_operations pipefifo_fops = {
	.open		= fifo_open,
	.llseek		= no_llseek,
	.read_iter	= pipe_read,
	.write_iter	= pipe_write,
	.poll		= pipe_poll,
	.unlocked_ioctl	= pipe_ioctl,
	.release	= pipe_release,
	.fasync		= pipe_fasync,
	.splice_write	= iter_file_splice_write,
};

/*
 * Currently we rely on the pipe array holding a power-of-2 number
 * of pages. Returns 0 on error.
 */
unsigned int round_pipe_size(unsigned long size)
{
	if (size > (1U << 31))
		return 0;

	/* Minimum pipe size, as required by POSIX */
	if (size < PAGE_SIZE)
		return PAGE_SIZE;

	return roundup_pow_of_two(size);
}

/*
 * Resize the pipe ring to a number of slots.
 *
 * Note the pipe can be reduced in capacity, but only if the current
 * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
 * returned instead.
 */
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
	struct pipe_buffer *bufs;
	unsigned int head, tail, mask, n;

	bufs = kcalloc(nr_slots, sizeof(*bufs),
		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
	if (unlikely(!bufs))
		return -ENOMEM;

	spin_lock_irq(&pipe->rd_wait.lock);
	mask = pipe->ring_size - 1;
	head = pipe->head;
	tail = pipe->tail;

	n = pipe_occupancy(head, tail);
	if (nr_slots < n) {
		spin_unlock_irq(&pipe->rd_wait.lock);
		kfree(bufs);
		return -EBUSY;
	}

	/*
	 * The pipe array wraps around, so just start the new one at zero
	 * and adjust the indices.
	 */
	if (n > 0) {
		unsigned int h = head & mask;
		unsigned int t = tail & mask;
		if (h > t) {
			memcpy(bufs, pipe->bufs + t,
			       n * sizeof(struct pipe_buffer));
		} else {
			unsigned int tsize = pipe->ring_size - t;
			if (h > 0)
				memcpy(bufs + tsize, pipe->bufs,
				       h * sizeof(struct pipe_buffer));
			memcpy(bufs, pipe->bufs + t,
			       tsize * sizeof(struct pipe_buffer));
		}
	}

	head = n;
	tail = 0;

	kfree(pipe->bufs);
	pipe->bufs = bufs;
	pipe->ring_size = nr_slots;
	if (pipe->max_usage > nr_slots)
		pipe->max_usage = nr_slots;
	pipe->tail = tail;
	pipe->head = head;

	spin_unlock_irq(&pipe->rd_wait.lock);

	/* This might have made more room for writers */
	wake_up_interruptible(&pipe->wr_wait);
	return 0;
}
/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
	unsigned long user_bufs;
	unsigned int nr_slots, size;
	long ret = 0;

#ifdef CONFIG_WATCH_QUEUE
	if (pipe->watch_queue)
		return -EBUSY;
#endif

	size = round_pipe_size(arg);
	nr_slots = size >> PAGE_SHIFT;

	if (!nr_slots)
		return -EINVAL;

	/*
	 * If trying to increase the pipe capacity, check that an
	 * unprivileged user is not trying to exceed various limits
	 * (soft limit check here, hard limit check just below).
	 * Decreasing the pipe capacity is always permitted, even
	 * if the user is currently over a limit.
	 */
	if (nr_slots > pipe->max_usage &&
			size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
		return -EPERM;

	user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);

	if (nr_slots > pipe->max_usage &&
			(too_many_pipe_buffers_hard(user_bufs) ||
			 too_many_pipe_buffers_soft(user_bufs)) &&
			pipe_is_unprivileged_user()) {
		ret = -EPERM;
		goto out_revert_acct;
	}

	ret = pipe_resize_ring(pipe, nr_slots);
	if (ret < 0)
		goto out_revert_acct;

	pipe->max_usage = nr_slots;
	pipe->nr_accounted = nr_slots;
	return pipe->max_usage * PAGE_SIZE;

out_revert_acct:
	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
	return ret;
}

/*
 * Note that i_pipe and i_cdev share the same location, so checking ->i_pipe is
 * not enough to verify that this is a pipe.
 */
struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice)
{
	struct pipe_inode_info *pipe = file->private_data;

	if (file->f_op != &pipefifo_fops || !pipe)
		return NULL;
#ifdef CONFIG_WATCH_QUEUE
	if (for_splice && pipe->watch_queue)
		return NULL;
#endif
	return pipe;
}

long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
	struct pipe_inode_info *pipe;
	long ret;

	pipe = get_pipe_info(file, false);
	if (!pipe)
		return -EBADF;

	__pipe_lock(pipe);

	switch (cmd) {
	case F_SETPIPE_SZ:
		ret = pipe_set_size(pipe, arg);
		break;
	case F_GETPIPE_SZ:
		ret = pipe->max_usage * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
		break;
	}

	__pipe_unlock(pipe);
	return ret;
}
static const struct super_operations pipefs_ops = {
	.destroy_inode = free_inode_nonrcu,
	.statfs = simple_statfs,
};

/*
 * pipefs should _never_ be mounted by userland - too much of a security
 * hassle, no real gain from having the whole whorehouse mounted. So we
 * don't need any operations on the root directory. However, we need a
 * non-trivial d_name - pipe: will go nicely and kill the special-casing
 * in procfs.
 */

static int pipefs_init_fs_context(struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = init_pseudo(fc, PIPEFS_MAGIC);
	if (!ctx)
		return -ENOMEM;
	ctx->ops = &pipefs_ops;
	ctx->dops = &pipefs_dentry_operations;
	return 0;
}

static struct file_system_type pipe_fs_type = {
	.name		= "pipefs",
	.init_fs_context = pipefs_init_fs_context,
	.kill_sb	= kill_anon_super,
};

#ifdef CONFIG_SYSCTL
static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
					unsigned int *valp,
					int write, void *data)
{
	if (write) {
		unsigned int val;

		val = round_pipe_size(*lvalp);
		if (val == 0)
			return -EINVAL;

		*valp = val;
	} else {
		unsigned int val = *valp;
		*lvalp = (unsigned long) val;
	}

	return 0;
}

static int proc_dopipe_max_size(struct ctl_table *table, int write,
				void *buffer, size_t *lenp, loff_t *ppos)
{
	return do_proc_douintvec(table, write, buffer, lenp, ppos,
				 do_proc_dopipe_max_size_conv, NULL);
}

static struct ctl_table fs_pipe_sysctls[] = {
	{
		.procname	= "pipe-max-size",
		.data		= &pipe_max_size,
		.maxlen		= sizeof(pipe_max_size),
		.mode		= 0644,
		.proc_handler	= proc_dopipe_max_size,
	},
	{
		.procname	= "pipe-user-pages-hard",
		.data		= &pipe_user_pages_hard,
		.maxlen		= sizeof(pipe_user_pages_hard),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{
		.procname	= "pipe-user-pages-soft",
		.data		= &pipe_user_pages_soft,
		.maxlen		= sizeof(pipe_user_pages_soft),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
	},
	{ }
};
#endif

static int __init init_pipe_fs(void)
{
	int err = register_filesystem(&pipe_fs_type);

	if (!err) {
		pipe_mnt = kern_mount(&pipe_fs_type);
		if (IS_ERR(pipe_mnt)) {
			err = PTR_ERR(pipe_mnt);
			unregister_filesystem(&pipe_fs_type);
		}
	}
#ifdef CONFIG_SYSCTL
	register_sysctl_init("fs", fs_pipe_sysctls);
#endif
	return err;
}

fs_initcall(init_pipe_fs);