// SPDX-License-Identifier: GPL-2.0
/*
 * linux/fs/file.c
 *
 * Copyright (C) 1998-1999, Stephen Tweedie and Bill Hawes
 *
 * Manage the dynamic fd arrays in the process files_struct.
 */

#include <linux/syscalls.h>
#include <linux/export.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/close_range.h>
#include <net/sock.h>

#include "internal.h"

unsigned int sysctl_nr_open __read_mostly = 1024*1024;
unsigned int sysctl_nr_open_min = BITS_PER_LONG;
/* our min() is unusable in constant expressions ;-/ */
#define __const_min(x, y) ((x) < (y) ? (x) : (y))
unsigned int sysctl_nr_open_max =
	__const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG;

static void __free_fdtable(struct fdtable *fdt)
{
	kvfree(fdt->fd);
	kvfree(fdt->open_fds);
	kfree(fdt);
}

static void free_fdtable_rcu(struct rcu_head *rcu)
{
	__free_fdtable(container_of(rcu, struct fdtable, rcu));
}

#define BITBIT_NR(nr)	BITS_TO_LONGS(BITS_TO_LONGS(nr))
#define BITBIT_SIZE(nr)	(BITBIT_NR(nr) * sizeof(long))

/*
 * Copy 'count' fd bits from the old table to the new table and clear the extra
 * space if any. This does not copy the file pointers. Called with the files
 * spinlock held for write.
 */
static void copy_fd_bitmaps(struct fdtable *nfdt, struct fdtable *ofdt,
			    unsigned int count)
{
	unsigned int cpy, set;

	cpy = count / BITS_PER_BYTE;
	set = (nfdt->max_fds - count) / BITS_PER_BYTE;
	memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
	memset((char *)nfdt->open_fds + cpy, 0, set);
	memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
	memset((char *)nfdt->close_on_exec + cpy, 0, set);

	cpy = BITBIT_SIZE(count);
	set = BITBIT_SIZE(nfdt->max_fds) - cpy;
	memcpy(nfdt->full_fds_bits, ofdt->full_fds_bits, cpy);
	memset((char *)nfdt->full_fds_bits + cpy, 0, set);
}

/*
 * Copy all file descriptors from the old table to the new, expanded table and
 * clear the extra space. Called with the files spinlock held for write.
 */
static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
	size_t cpy, set;

	BUG_ON(nfdt->max_fds < ofdt->max_fds);

	cpy = ofdt->max_fds * sizeof(struct file *);
	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
	memcpy(nfdt->fd, ofdt->fd, cpy);
	memset((char *)nfdt->fd + cpy, 0, set);

	copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
}

/*
 * Note how the fdtable bitmap allocations very much have to be a multiple of
 * BITS_PER_LONG. This is not only because we walk those things in chunks of
 * 'unsigned long' in some places, but simply because that is how the Linux
 * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
 * they are very much "bits in an array of unsigned long".
 *
 * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
 * by that "1024/sizeof(ptr)" before, we already know there are sufficient
 * clear low bits. Clang seems to realize that, gcc ends up being confused.
 *
 * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
 * let's consider it documentation (and maybe a test-case for gcc to improve
 * its code generation ;)
 */
static struct fdtable * alloc_fdtable(unsigned int nr)
{
	struct fdtable *fdt;
	void *data;

	/*
	 * Figure out how many fds we actually want to support in this fdtable.
	 * Allocation steps are keyed to the size of the fdarray, since it
	 * grows far faster than any of the other dynamic data. We try to fit
	 * the fdarray into comfortable page-tuned chunks: starting at 1024B
	 * and growing in powers of two from there on.
	 */
	nr /= (1024 / sizeof(struct file *));
	nr = roundup_pow_of_two(nr + 1);
	nr *= (1024 / sizeof(struct file *));
	nr = ALIGN(nr, BITS_PER_LONG);
	/*
	 * Note that this can drive nr *below* what we had passed if sysctl_nr_open
	 * had been set lower between the check in expand_files() and here. Deal
	 * with that in caller, it's cheaper that way.
	 *
	 * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise
	 * bitmaps handling below becomes unpleasant, to put it mildly...
	 */
	if (unlikely(nr > sysctl_nr_open))
		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;

	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
	if (!fdt)
		goto out;
	fdt->max_fds = nr;
	data = kvmalloc_array(nr, sizeof(struct file *), GFP_KERNEL_ACCOUNT);
	if (!data)
		goto out_fdt;
	fdt->fd = data;

	data = kvmalloc(max_t(size_t,
			      2 * nr / BITS_PER_BYTE + BITBIT_SIZE(nr), L1_CACHE_BYTES),
			      GFP_KERNEL_ACCOUNT);
	if (!data)
		goto out_arr;
	fdt->open_fds = data;
	data += nr / BITS_PER_BYTE;
	fdt->close_on_exec = data;
	data += nr / BITS_PER_BYTE;
	fdt->full_fds_bits = data;

	return fdt;

out_arr:
	kvfree(fdt->fd);
out_fdt:
	kfree(fdt);
out:
	return NULL;
}

/*
 * Expand the file descriptor table.
 * This function will allocate a new fdtable and both fd array and fdset, of
 * the given size.
 * Return <0 error code on error; 1 on successful completion.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_fdtable(struct files_struct *files, unsigned int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	struct fdtable *new_fdt, *cur_fdt;

	spin_unlock(&files->file_lock);
	new_fdt = alloc_fdtable(nr);

	/* make sure all fd_install() have seen resize_in_progress
	 * or have finished their rcu_read_lock_sched() section.
	 */
	if (atomic_read(&files->count) > 1)
		synchronize_rcu();

	spin_lock(&files->file_lock);
	if (!new_fdt)
		return -ENOMEM;
	/*
	 * extremely unlikely race - sysctl_nr_open decreased between the check in
	 * caller and alloc_fdtable(). Cheaper to catch it here...
	 */
	if (unlikely(new_fdt->max_fds <= nr)) {
		__free_fdtable(new_fdt);
		return -EMFILE;
	}
	cur_fdt = files_fdtable(files);
	BUG_ON(nr < cur_fdt->max_fds);
	copy_fdtable(new_fdt, cur_fdt);
	rcu_assign_pointer(files->fdt, new_fdt);
	if (cur_fdt != &files->fdtab)
		call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
	/* coupled with smp_rmb() in fd_install() */
	smp_wmb();
	return 1;
}

/*
 * Expand files.
 * This function will expand the file structures, if the requested size exceeds
 * the current capacity and there is room for expansion.
 * Return <0 error code on error; 0 when nothing done; 1 when files were
 * expanded and execution may have blocked.
 * The files->file_lock should be held on entry, and will be held on exit.
 */
static int expand_files(struct files_struct *files, unsigned int nr)
	__releases(files->file_lock)
	__acquires(files->file_lock)
{
	struct fdtable *fdt;
	int expanded = 0;

repeat:
	fdt = files_fdtable(files);

	/* Do we need to expand? */
	if (nr < fdt->max_fds)
		return expanded;

	/* Can we expand? */
	if (nr >= sysctl_nr_open)
		return -EMFILE;

	if (unlikely(files->resize_in_progress)) {
		spin_unlock(&files->file_lock);
		expanded = 1;
		wait_event(files->resize_wait, !files->resize_in_progress);
		spin_lock(&files->file_lock);
		goto repeat;
	}

	/* All good, so we try */
	files->resize_in_progress = true;
	expanded = expand_fdtable(files, nr);
	files->resize_in_progress = false;

	wake_up_all(&files->resize_wait);
	return expanded;
}

static inline void __set_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
	__set_bit(fd, fdt->close_on_exec);
}

static inline void __clear_close_on_exec(unsigned int fd, struct fdtable *fdt)
{
	if (test_bit(fd, fdt->close_on_exec))
		__clear_bit(fd, fdt->close_on_exec);
}
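
/*
 * Besides the per-fd open_fds bitmap, the fdtable keeps a second-level
 * full_fds_bits bitmap with one bit per BITS_PER_LONG-sized word of
 * open_fds; a bit is set there only while the corresponding word is
 * completely full. Keeping it up to date here lets find_next_fd() skip
 * whole words of already-allocated descriptors.
 */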
static inline void __set_open_fd(unsigned int fd, struct fdtable *fdt)
{
	__set_bit(fd, fdt->open_fds);
	fd /= BITS_PER_LONG;
	if (!~fdt->open_fds[fd])
		__set_bit(fd, fdt->full_fds_bits);
}

static inline void __clear_open_fd(unsigned int fd, struct fdtable *fdt)
{
	__clear_bit(fd, fdt->open_fds);
	__clear_bit(fd / BITS_PER_LONG, fdt->full_fds_bits);
}

static unsigned int count_open_files(struct fdtable *fdt)
{
	unsigned int size = fdt->max_fds;
	unsigned int i;

	/* Find the last open fd */
	for (i = size / BITS_PER_LONG; i > 0; ) {
		if (fdt->open_fds[--i])
			break;
	}
	i = (i + 1) * BITS_PER_LONG;
	return i;
}

/*
 * Note that a sane fdtable size always has to be a multiple of
 * BITS_PER_LONG, since we have bitmaps that are sized by this.
 *
 * 'max_fds' will normally already be properly aligned, but it
 * turns out that in the close_range() -> __close_range() ->
 * unshare_fd() -> dup_fd() -> sane_fdtable_size() path we can
 * end up having a 'max_fds' value that isn't already aligned.
 *
 * Rather than make close_range() have to worry about this,
 * just make that BITS_PER_LONG alignment be part of a sane
 * fdtable size. Because that's really what it is.
 */
static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
{
	unsigned int count;

	count = count_open_files(fdt);
	if (max_fds < NR_OPEN_DEFAULT)
		max_fds = NR_OPEN_DEFAULT;
	return ALIGN(min(count, max_fds), BITS_PER_LONG);
}

/*
 * Allocate a new files structure and copy contents from the
 * passed in files structure.
 * errorp will be valid only when the returned files_struct is NULL.
 */
struct files_struct *dup_fd(struct files_struct *oldf, unsigned int max_fds, int *errorp)
{
	struct files_struct *newf;
	struct file **old_fds, **new_fds;
	unsigned int open_files, i;
	struct fdtable *old_fdt, *new_fdt;

	*errorp = -ENOMEM;
	newf = kmem_cache_alloc(files_cachep, GFP_KERNEL);
	if (!newf)
		goto out;

	atomic_set(&newf->count, 1);

	spin_lock_init(&newf->file_lock);
	newf->resize_in_progress = false;
	init_waitqueue_head(&newf->resize_wait);
	newf->next_fd = 0;
	new_fdt = &newf->fdtab;
	new_fdt->max_fds = NR_OPEN_DEFAULT;
	new_fdt->close_on_exec = newf->close_on_exec_init;
	new_fdt->open_fds = newf->open_fds_init;
	new_fdt->full_fds_bits = newf->full_fds_bits_init;
	new_fdt->fd = &newf->fd_array[0];

	spin_lock(&oldf->file_lock);
	old_fdt = files_fdtable(oldf);
	open_files = sane_fdtable_size(old_fdt, max_fds);

	/*
	 * Check whether we need to allocate a larger fd array and fd set.
	 */
	while (unlikely(open_files > new_fdt->max_fds)) {
		spin_unlock(&oldf->file_lock);

		if (new_fdt != &newf->fdtab)
			__free_fdtable(new_fdt);

		new_fdt = alloc_fdtable(open_files - 1);
		if (!new_fdt) {
			*errorp = -ENOMEM;
			goto out_release;
		}

		/* beyond sysctl_nr_open; nothing to do */
		if (unlikely(new_fdt->max_fds < open_files)) {
			__free_fdtable(new_fdt);
			*errorp = -EMFILE;
			goto out_release;
		}

		/*
		 * Reacquire the oldf lock and a pointer to its fd table;
		 * it may have grown a new, bigger fd table in the meantime,
		 * so we need the latest pointer.
		 */
		spin_lock(&oldf->file_lock);
		old_fdt = files_fdtable(oldf);
		open_files = sane_fdtable_size(old_fdt, max_fds);
	}

	copy_fd_bitmaps(new_fdt, old_fdt, open_files);

	old_fds = old_fdt->fd;
	new_fds = new_fdt->fd;

	for (i = open_files; i != 0; i--) {
		struct file *f = *old_fds++;
		if (f) {
			get_file(f);
		} else {
			/*
			 * The fd may be claimed in the fd bitmap but not yet
			 * instantiated in the files array if a sibling thread
			 * is partway through open(). So make sure that this
			 * fd is available to the new process.
			 */
			__clear_open_fd(open_files - i, new_fdt);
		}
		rcu_assign_pointer(*new_fds++, f);
	}
	spin_unlock(&oldf->file_lock);

	/* clear the remainder */
	memset(new_fds, 0, (new_fdt->max_fds - open_files) * sizeof(struct file *));

	rcu_assign_pointer(newf->fdt, new_fdt);
	return newf;

out_release:
	kmem_cache_free(files_cachep, newf);
out:
	return NULL;
}

static struct fdtable *close_files(struct files_struct * files)
{
	/*
	 * It is safe to dereference the fd table without RCU or
	 * ->file_lock because this is the last reference to the
	 * files structure.
	 */
	struct fdtable *fdt = rcu_dereference_raw(files->fdt);
	unsigned int i, j = 0;

	for (;;) {
		unsigned long set;
		i = j * BITS_PER_LONG;
		if (i >= fdt->max_fds)
			break;
		set = fdt->open_fds[j++];
		while (set) {
			if (set & 1) {
				struct file * file = xchg(&fdt->fd[i], NULL);
				if (file) {
					filp_close(file, files);
					cond_resched();
				}
			}
			i++;
			set >>= 1;
		}
	}

	return fdt;
}

void put_files_struct(struct files_struct *files)
{
	if (atomic_dec_and_test(&files->count)) {
		struct fdtable *fdt = close_files(files);

		/* free the arrays if they are not embedded */
		if (fdt != &files->fdtab)
			__free_fdtable(fdt);
		kmem_cache_free(files_cachep, files);
	}
}

void exit_files(struct task_struct *tsk)
{
	struct files_struct * files = tsk->files;

	if (files) {
		task_lock(tsk);
		tsk->files = NULL;
		task_unlock(tsk);
		put_files_struct(files);
	}
}

struct files_struct init_files = {
	.count = ATOMIC_INIT(1),
	.fdt = &init_files.fdtab,
	.fdtab = {
		.max_fds = NR_OPEN_DEFAULT,
		.fd = &init_files.fd_array[0],
		.close_on_exec = init_files.close_on_exec_init,
		.open_fds = init_files.open_fds_init,
		.full_fds_bits = init_files.full_fds_bits_init,
	},
	.file_lock = __SPIN_LOCK_UNLOCKED(init_files.file_lock),
	.resize_wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_files.resize_wait),
};
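
/*
 * Search for the first unused descriptor at or above 'start'. full_fds_bits
 * is consulted first so that words of open_fds known to be completely
 * allocated can be skipped; the scan of open_fds itself then begins at the
 * first word that still has room, or at 'start', whichever is higher.
 */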
static unsigned int find_next_fd(struct fdtable *fdt, unsigned int start)
{
	unsigned int maxfd = fdt->max_fds;
	unsigned int maxbit = maxfd / BITS_PER_LONG;
	unsigned int bitbit = start / BITS_PER_LONG;

	bitbit = find_next_zero_bit(fdt->full_fds_bits, maxbit, bitbit) * BITS_PER_LONG;
	if (bitbit > maxfd)
		return maxfd;
	if (bitbit > start)
		start = bitbit;
	return find_next_zero_bit(fdt->open_fds, maxfd, start);
}

/*
 * allocate a file descriptor, mark it busy.
 */
static int alloc_fd(unsigned start, unsigned end, unsigned flags)
{
	struct files_struct *files = current->files;
	unsigned int fd;
	int error;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
repeat:
	fdt = files_fdtable(files);
	fd = start;
	if (fd < files->next_fd)
		fd = files->next_fd;

	if (fd < fdt->max_fds)
		fd = find_next_fd(fdt, fd);

	/*
	 * N.B. For clone tasks sharing a files structure, this test
	 * will limit the total number of files that can be opened.
	 */
	error = -EMFILE;
	if (fd >= end)
		goto out;

	error = expand_files(files, fd);
	if (error < 0)
		goto out;

	/*
	 * If we needed to expand the fs array we
	 * might have blocked - try again.
	 */
	if (error)
		goto repeat;

	if (start <= files->next_fd)
		files->next_fd = fd + 1;

	__set_open_fd(fd, fdt);
	if (flags & O_CLOEXEC)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	error = fd;
#if 1
	/* Sanity check */
	if (rcu_access_pointer(fdt->fd[fd]) != NULL) {
		printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd);
		rcu_assign_pointer(fdt->fd[fd], NULL);
	}
#endif

out:
	spin_unlock(&files->file_lock);
	return error;
}

int __get_unused_fd_flags(unsigned flags, unsigned long nofile)
{
	return alloc_fd(0, nofile, flags);
}

int get_unused_fd_flags(unsigned flags)
{
	return __get_unused_fd_flags(flags, rlimit(RLIMIT_NOFILE));
}
EXPORT_SYMBOL(get_unused_fd_flags);

static void __put_unused_fd(struct files_struct *files, unsigned int fd)
{
	struct fdtable *fdt = files_fdtable(files);
	__clear_open_fd(fd, fdt);
	if (fd < files->next_fd)
		files->next_fd = fd;
}

void put_unused_fd(unsigned int fd)
{
	struct files_struct *files = current->files;
	spin_lock(&files->file_lock);
	__put_unused_fd(files, fd);
	spin_unlock(&files->file_lock);
}

EXPORT_SYMBOL(put_unused_fd);

/*
 * Install a file pointer in the fd array.
 *
 * The VFS is full of places where we drop the files lock between
 * setting the open_fds bitmap and installing the file in the file
 * array. At any such point, we are vulnerable to a dup2() race
 * installing a file in the array before us. We need to detect this and
 * fput() the struct file we are about to overwrite in this case.
 *
 * It should never happen - if we allow dup2() do it, _really_ bad things
 * will follow.
 *
 * This consumes the "file" refcount, so callers should treat it
 * as if they had called fput(file).
 */
void fd_install(unsigned int fd, struct file *file)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;

	rcu_read_lock_sched();

	if (unlikely(files->resize_in_progress)) {
		rcu_read_unlock_sched();
		spin_lock(&files->file_lock);
		fdt = files_fdtable(files);
		BUG_ON(fdt->fd[fd] != NULL);
		rcu_assign_pointer(fdt->fd[fd], file);
		spin_unlock(&files->file_lock);
		return;
	}
	/* coupled with smp_wmb() in expand_fdtable() */
	smp_rmb();
	fdt = rcu_dereference_sched(files->fdt);
	BUG_ON(fdt->fd[fd] != NULL);
	rcu_assign_pointer(fdt->fd[fd], file);
	rcu_read_unlock_sched();
}

EXPORT_SYMBOL(fd_install);

/**
 * pick_file - return file associated with fd
 * @files: file struct to retrieve file from
 * @fd: file descriptor to retrieve file for
 *
 * Context: files_lock must be held.
 *
 * Returns: The file associated with @fd (NULL if @fd is not open)
 */
static struct file *pick_file(struct files_struct *files, unsigned fd)
{
	struct fdtable *fdt = files_fdtable(files);
	struct file *file;

	if (fd >= fdt->max_fds)
		return NULL;

	fd = array_index_nospec(fd, fdt->max_fds);
	file = fdt->fd[fd];
	if (file) {
		rcu_assign_pointer(fdt->fd[fd], NULL);
		__put_unused_fd(files, fd);
	}
	return file;
}

int close_fd(unsigned fd)
{
	struct files_struct *files = current->files;
	struct file *file;

	spin_lock(&files->file_lock);
	file = pick_file(files, fd);
	spin_unlock(&files->file_lock);
	if (!file)
		return -EBADF;

	return filp_close(file, files);
}
EXPORT_SYMBOL(close_fd); /* for ksys_close() */

/**
 * last_fd - return last valid index into fd table
 * @fdt: file descriptor table
 *
 * Context: Either rcu read lock or files_lock must be held.
 *
 * Returns: Last valid index into fdtable.
 */
static inline unsigned last_fd(struct fdtable *fdt)
{
	return fdt->max_fds - 1;
}

static inline void __range_cloexec(struct files_struct *cur_fds,
				   unsigned int fd, unsigned int max_fd)
{
	struct fdtable *fdt;

	/* make sure we're using the correct maximum value */
	spin_lock(&cur_fds->file_lock);
	fdt = files_fdtable(cur_fds);
	max_fd = min(last_fd(fdt), max_fd);
	if (fd <= max_fd)
		bitmap_set(fdt->close_on_exec, fd, max_fd - fd + 1);
	spin_unlock(&cur_fds->file_lock);
}

static inline void __range_close(struct files_struct *cur_fds, unsigned int fd,
				 unsigned int max_fd)
{
	unsigned n;

	rcu_read_lock();
	n = last_fd(files_fdtable(cur_fds));
	rcu_read_unlock();
	max_fd = min(max_fd, n);

	while (fd <= max_fd) {
		struct file *file;

		spin_lock(&cur_fds->file_lock);
		file = pick_file(cur_fds, fd++);
		spin_unlock(&cur_fds->file_lock);

		if (file) {
			/* found a valid file to close */
			filp_close(file, cur_fds);
			cond_resched();
		}
	}
}

/**
 * __close_range() - Close all file descriptors in a given range.
 *
 * @fd: starting file descriptor to close
 * @max_fd: last file descriptor to close
 *
 * This closes a range of file descriptors. All file descriptors
 * from @fd up to and including @max_fd are closed.
 */
int __close_range(unsigned fd, unsigned max_fd, unsigned int flags)
{
	struct task_struct *me = current;
	struct files_struct *cur_fds = me->files, *fds = NULL;

	if (flags & ~(CLOSE_RANGE_UNSHARE | CLOSE_RANGE_CLOEXEC))
		return -EINVAL;

	if (fd > max_fd)
		return -EINVAL;

	if (flags & CLOSE_RANGE_UNSHARE) {
		int ret;
		unsigned int max_unshare_fds = NR_OPEN_MAX;

		/*
		 * If the caller requested all fds to be made cloexec we always
		 * copy all of the file descriptors since they still want to
		 * use them.
		 */
		if (!(flags & CLOSE_RANGE_CLOEXEC)) {
			/*
			 * If the requested range is greater than the current
			 * maximum, we're closing everything so only copy all
			 * file descriptors beneath the lowest file descriptor.
			 */
			rcu_read_lock();
			if (max_fd >= last_fd(files_fdtable(cur_fds)))
				max_unshare_fds = fd;
			rcu_read_unlock();
		}

		ret = unshare_fd(CLONE_FILES, max_unshare_fds, &fds);
		if (ret)
			return ret;

		/*
		 * We used to share our file descriptor table, and have now
		 * created a private one, make sure we're using it below.
		 */
		if (fds)
			swap(cur_fds, fds);
	}

	if (flags & CLOSE_RANGE_CLOEXEC)
		__range_cloexec(cur_fds, fd, max_fd);
	else
		__range_close(cur_fds, fd, max_fd);

	if (fds) {
		/*
		 * We're done closing the files we were supposed to. Time to install
		 * the new file descriptor table and drop the old one.
		 */
		task_lock(me);
		me->files = cur_fds;
		task_unlock(me);
		put_files_struct(fds);
	}

	return 0;
}

/*
 * See close_fd_get_file() below, this variant assumes current->files->file_lock
 * is held.
 */
struct file *__close_fd_get_file(unsigned int fd)
{
	return pick_file(current->files, fd);
}

/*
 * variant of close_fd that gets a ref on the file for later fput.
 * The caller must ensure that filp_close() is called on the file.
 */
struct file *close_fd_get_file(unsigned int fd)
{
	struct files_struct *files = current->files;
	struct file *file;

	spin_lock(&files->file_lock);
	file = pick_file(files, fd);
	spin_unlock(&files->file_lock);

	return file;
}
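
/*
 * Close every descriptor marked close-on-exec. Called from the exec path
 * after the fd table has been unshared, so only the caller's own table is
 * affected; ->file_lock is dropped around each filp_close() since closing
 * a file may sleep.
 */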
void do_close_on_exec(struct files_struct *files)
{
	unsigned i;
	struct fdtable *fdt;

	/* exec unshares first */
	spin_lock(&files->file_lock);
	for (i = 0; ; i++) {
		unsigned long set;
		unsigned fd = i * BITS_PER_LONG;
		fdt = files_fdtable(files);
		if (fd >= fdt->max_fds)
			break;
		set = fdt->close_on_exec[i];
		if (!set)
			continue;
		fdt->close_on_exec[i] = 0;
		for ( ; set ; fd++, set >>= 1) {
			struct file *file;
			if (!(set & 1))
				continue;
			file = fdt->fd[fd];
			if (!file)
				continue;
			rcu_assign_pointer(fdt->fd[fd], NULL);
			__put_unused_fd(files, fd);
			spin_unlock(&files->file_lock);
			filp_close(file, files);
			cond_resched();
			spin_lock(&files->file_lock);
		}
	}
	spin_unlock(&files->file_lock);
}
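
/*
 * Core of the lockless fd-to-file lookup: runs entirely under RCU, takes a
 * reference with get_file_rcu(), and then re-checks both the fdtable pointer
 * and the table slot; if either changed, the reference is dropped and the
 * lookup is retried.
 */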
static inline struct file *__fget_files_rcu(struct files_struct *files,
	unsigned int fd, fmode_t mask)
{
	for (;;) {
		struct file *file;
		struct fdtable *fdt = rcu_dereference_raw(files->fdt);
		struct file __rcu **fdentry;

		if (unlikely(fd >= fdt->max_fds))
			return NULL;

		fdentry = fdt->fd + array_index_nospec(fd, fdt->max_fds);
		file = rcu_dereference_raw(*fdentry);
		if (unlikely(!file))
			return NULL;

		if (unlikely(file->f_mode & mask))
			return NULL;

		/*
		 * Ok, we have a file pointer. However, because we do
		 * this all locklessly under RCU, we may be racing with
		 * that file being closed.
		 *
		 * Such a race can take two forms:
		 *
		 *  (a) the file ref already went down to zero,
		 *      and get_file_rcu() fails. Just try again:
		 */
		if (unlikely(!get_file_rcu(file)))
			continue;

		/*
		 *  (b) the file table entry has changed under us.
		 *      Note that we don't need to re-check the 'fdt->fd'
		 *      pointer having changed, because it always goes
		 *      hand-in-hand with 'fdt'.
		 *
		 * If so, we need to put our ref and try again.
		 */
		if (unlikely(rcu_dereference_raw(files->fdt) != fdt) ||
		    unlikely(rcu_dereference_raw(*fdentry) != file)) {
			fput(file);
			continue;
		}

		/*
		 * Ok, we have a ref to the file, and checked that it
		 * still exists.
		 */
		return file;
	}
}

static struct file *__fget_files(struct files_struct *files, unsigned int fd,
				 fmode_t mask)
{
	struct file *file;

	rcu_read_lock();
	file = __fget_files_rcu(files, fd, mask);
	rcu_read_unlock();

	return file;
}

static inline struct file *__fget(unsigned int fd, fmode_t mask)
{
	return __fget_files(current->files, fd, mask);
}

struct file *fget(unsigned int fd)
{
	return __fget(fd, FMODE_PATH);
}
EXPORT_SYMBOL(fget);

struct file *fget_raw(unsigned int fd)
{
	return __fget(fd, 0);
}
EXPORT_SYMBOL(fget_raw);

struct file *fget_task(struct task_struct *task, unsigned int fd)
{
	struct file *file = NULL;

	task_lock(task);
	if (task->files)
		file = __fget_files(task->files, fd, 0);
	task_unlock(task);

	return file;
}

struct file *task_lookup_fd_rcu(struct task_struct *task, unsigned int fd)
{
	/* Must be called with rcu_read_lock held */
	struct files_struct *files;
	struct file *file = NULL;

	task_lock(task);
	files = task->files;
	if (files)
		file = files_lookup_fd_rcu(files, fd);
	task_unlock(task);

	return file;
}

struct file *task_lookup_next_fd_rcu(struct task_struct *task, unsigned int *ret_fd)
{
	/* Must be called with rcu_read_lock held */
	struct files_struct *files;
	unsigned int fd = *ret_fd;
	struct file *file = NULL;

	task_lock(task);
	files = task->files;
	if (files) {
		for (; fd < files_fdtable(files)->max_fds; fd++) {
			file = files_lookup_fd_rcu(files, fd);
			if (file)
				break;
		}
	}
	task_unlock(task);
	*ret_fd = fd;
	return file;
}
EXPORT_SYMBOL(task_lookup_next_fd_rcu);

/*
 * Lightweight file lookup - no refcnt increment if fd table isn't shared.
 *
 * You can use this instead of fget if you satisfy all of the following
 * conditions:
 * 1) You must call fput_light before exiting the syscall and returning control
 *    to userspace (i.e. you cannot remember the returned struct file * after
 *    returning to userspace).
 * 2) You must not call filp_close on the returned struct file * in between
 *    calls to fget_light and fput_light.
 * 3) You must not clone the current task in between the calls to fget_light
 *    and fput_light.
 *
 * The fput_needed flag returned by fget_light should be passed to the
 * corresponding fput_light.
 */
static unsigned long __fget_light(unsigned int fd, fmode_t mask)
{
	struct files_struct *files = current->files;
	struct file *file;

	/*
	 * If another thread is concurrently calling close_fd() followed
	 * by put_files_struct(), we must not observe the old table
	 * entry combined with the new refcount - otherwise we could
	 * return a file that is concurrently being freed.
	 *
	 * atomic_read_acquire() pairs with atomic_dec_and_test() in
	 * put_files_struct().
	 */
	if (atomic_read_acquire(&files->count) == 1) {
		file = files_lookup_fd_raw(files, fd);
		if (!file || unlikely(file->f_mode & mask))
			return 0;
		return (unsigned long)file;
	} else {
		file = __fget(fd, mask);
		if (!file)
			return 0;
		return FDPUT_FPUT | (unsigned long)file;
	}
}

unsigned long __fdget(unsigned int fd)
{
	return __fget_light(fd, FMODE_PATH);
}
EXPORT_SYMBOL(__fdget);

unsigned long __fdget_raw(unsigned int fd)
{
	return __fget_light(fd, 0);
}

/*
 * Try to avoid f_pos locking. We only need it if the
 * file is marked for FMODE_ATOMIC_POS, and it can be
 * accessed multiple ways.
 *
 * Always do it for directories, because pidfd_getfd()
 * can make a file accessible even if it otherwise would
 * not be, and for directories this is a correctness
 * issue, not a "POSIX requirement".
 */
static inline bool file_needs_f_pos_lock(struct file *file)
{
	return (file->f_mode & FMODE_ATOMIC_POS) &&
		(file_count(file) > 1 || S_ISDIR(file_inode(file)->i_mode));
}

unsigned long __fdget_pos(unsigned int fd)
{
	unsigned long v = __fdget(fd);
	struct file *file = (struct file *)(v & ~3);

	if (file && file_needs_f_pos_lock(file)) {
		v |= FDPUT_POS_UNLOCK;
		mutex_lock(&file->f_pos_lock);
	}
	return v;
}

void __f_unlock_pos(struct file *f)
{
	mutex_unlock(&f->f_pos_lock);
}

/*
 * We only lock f_pos if we have threads or if the file might be
 * shared with another process. In both cases we'll have an elevated
 * file count (done either by fdget() or by fork()).
 */

void set_close_on_exec(unsigned int fd, int flag)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;

	spin_lock(&files->file_lock);
	fdt = files_fdtable(files);
	if (flag)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	spin_unlock(&files->file_lock);
}

bool get_close_on_exec(unsigned int fd)
{
	struct files_struct *files = current->files;
	struct fdtable *fdt;
	bool res;

	rcu_read_lock();
	fdt = files_fdtable(files);
	res = close_on_exec(fd, fdt);
	rcu_read_unlock();
	return res;
}

static int do_dup2(struct files_struct *files,
	struct file *file, unsigned fd, unsigned flags)
__releases(&files->file_lock)
{
	struct file *tofree;
	struct fdtable *fdt;

	/*
	 * We need to detect attempts to do dup2() over allocated but still
	 * not finished descriptor. NB: OpenBSD avoids that at the price of
	 * extra work in their equivalent of fget() - they insert struct
	 * file immediately after grabbing descriptor, mark it larval if
	 * more work (e.g. actual opening) is needed and make sure that
	 * fget() treats larval files as absent. Potentially interesting,
	 * but while extra work in fget() is trivial, locking implications
	 * and amount of surgery on open()-related paths in VFS are not.
	 * FreeBSD fails with -EBADF in the same situation, NetBSD "solution"
	 * deadlocks in rather amusing ways, AFAICS. All of that is out of
	 * scope of POSIX or SUS, since neither considers shared descriptor
	 * tables and this condition does not arise without those.
	 */
	fdt = files_fdtable(files);
	tofree = fdt->fd[fd];
	if (!tofree && fd_is_open(fd, fdt))
		goto Ebusy;
	get_file(file);
	rcu_assign_pointer(fdt->fd[fd], file);
	__set_open_fd(fd, fdt);
	if (flags & O_CLOEXEC)
		__set_close_on_exec(fd, fdt);
	else
		__clear_close_on_exec(fd, fdt);
	spin_unlock(&files->file_lock);

	if (tofree)
		filp_close(tofree, files);

	return fd;

Ebusy:
	spin_unlock(&files->file_lock);
	return -EBUSY;
}

int replace_fd(unsigned fd, struct file *file, unsigned flags)
{
	int err;
	struct files_struct *files = current->files;

	if (!file)
		return close_fd(fd);

	if (fd >= rlimit(RLIMIT_NOFILE))
		return -EBADF;

	spin_lock(&files->file_lock);
	err = expand_files(files, fd);
	if (unlikely(err < 0))
		goto out_unlock;
	return do_dup2(files, file, fd, flags);

out_unlock:
	spin_unlock(&files->file_lock);
	return err;
}

/**
 * __receive_fd() - Install received file into file descriptor table
 * @file: struct file that was received from another process
 * @ufd: __user pointer to write new fd number to
 * @o_flags: the O_* flags to apply to the new fd entry
 *
 * Installs a received file into the file descriptor table, with appropriate
 * checks and count updates. Optionally writes the fd number to userspace, if
 * @ufd is non-NULL.
 *
 * This helper handles its own reference counting of the incoming
 * struct file.
 *
 * Returns newly installed fd or -ve on error.
 */
int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags)
{
	int new_fd;
	int error;

	error = security_file_receive(file);
	if (error)
		return error;

	new_fd = get_unused_fd_flags(o_flags);
	if (new_fd < 0)
		return new_fd;

	if (ufd) {
		error = put_user(new_fd, ufd);
		if (error) {
			put_unused_fd(new_fd);
			return error;
		}
	}

	fd_install(new_fd, get_file(file));
	__receive_sock(file);
	return new_fd;
}

int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags)
{
	int error;

	error = security_file_receive(file);
	if (error)
		return error;
	error = replace_fd(new_fd, file, o_flags);
	if (error)
		return error;
	__receive_sock(file);
	return new_fd;
}

int receive_fd(struct file *file, unsigned int o_flags)
{
	return __receive_fd(file, NULL, o_flags);
}
EXPORT_SYMBOL_GPL(receive_fd);

static int ksys_dup3(unsigned int oldfd, unsigned int newfd, int flags)
{
	int err = -EBADF;
	struct file *file;
	struct files_struct *files = current->files;

	if ((flags & ~O_CLOEXEC) != 0)
		return -EINVAL;

	if (unlikely(oldfd == newfd))
		return -EINVAL;

	if (newfd >= rlimit(RLIMIT_NOFILE))
		return -EBADF;

	spin_lock(&files->file_lock);
	err = expand_files(files, newfd);
	file = files_lookup_fd_locked(files, oldfd);
	if (unlikely(!file))
		goto Ebadf;
	if (unlikely(err < 0)) {
		if (err == -EMFILE)
			goto Ebadf;
		goto out_unlock;
	}
	return do_dup2(files, file, newfd, flags);

Ebadf:
	err = -EBADF;
out_unlock:
	spin_unlock(&files->file_lock);
	return err;
}

SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
{
	return ksys_dup3(oldfd, newfd, flags);
}

SYSCALL_DEFINE2(dup2, unsigned int, oldfd, unsigned int, newfd)
{
	if (unlikely(newfd == oldfd)) { /* corner case */
		struct files_struct *files = current->files;
		int retval = oldfd;

		rcu_read_lock();
		if (!files_lookup_fd_rcu(files, oldfd))
			retval = -EBADF;
		rcu_read_unlock();
		return retval;
	}
	return ksys_dup3(oldfd, newfd, 0);
}

SYSCALL_DEFINE1(dup, unsigned int, fildes)
{
	int ret = -EBADF;
	struct file *file = fget_raw(fildes);

	if (file) {
		ret = get_unused_fd_flags(0);
		if (ret >= 0)
			fd_install(ret, file);
		else
			fput(file);
	}
	return ret;
}

int f_dupfd(unsigned int from, struct file *file, unsigned flags)
{
	unsigned long nofile = rlimit(RLIMIT_NOFILE);
	int err;

	if (from >= nofile)
		return -EINVAL;
	err = alloc_fd(from, nofile, flags);
	if (err >= 0) {
		get_file(file);
		fd_install(err, file);
	}
	return err;
}

int iterate_fd(struct files_struct *files, unsigned n,
		int (*f)(const void *, struct file *, unsigned),
		const void *p)
{
	struct fdtable *fdt;
	int res = 0;

	if (!files)
		return 0;
	spin_lock(&files->file_lock);
	for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
		struct file *file;
		file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
		if (!file)
			continue;
		res = f(p, file, n);
		if (res)
			break;
	}
	spin_unlock(&files->file_lock);
	return res;
}
EXPORT_SYMBOL(iterate_fd);