// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/readahead.c - address_space-level file readahead.
 *
 * Copyright (C) 2002, Linus Torvalds
 *
 * 09Apr2002	Andrew Morton
 *		Initial version.
 */

/**
 * DOC: Readahead Overview
 *
 * Readahead is used to read content into the page cache before it is
 * explicitly requested by the application. Readahead only ever
 * attempts to read folios that are not yet in the page cache. If a
 * folio is present but not up-to-date, readahead will not try to read
 * it. In that case a simple ->read_folio() will be requested.
 *
 * Readahead is triggered when an application read request (whether a
 * system call or a page fault) finds that the requested folio is not in
 * the page cache, or that it is in the page cache and has the
 * readahead flag set. This flag indicates that the folio was read
 * as part of a previous readahead request and now that it has been
 * accessed, it is time for the next readahead.
 *
 * Each readahead request is partly a synchronous read and partly async
 * readahead. This is reflected in the struct file_ra_state which
 * contains ->size being the total number of pages, and ->async_size
 * which is the number of pages in the async section. The readahead
 * flag will be set on the first folio in this async section to trigger
 * a subsequent readahead. Once a series of sequential reads has been
 * established, there should be no need for a synchronous component and
 * all readahead requests will be fully asynchronous.
 *
 * When either of the triggers causes a readahead, three numbers need
 * to be determined: the start of the region to read, the size of the
 * region, and the size of the async tail.
 *
 * The start of the region is simply the first page address at or after
 * the accessed address which is not currently populated in the page
 * cache. This is found with a simple search in the page cache.
 *
 * The size of the async tail is determined by subtracting the size that
 * was explicitly requested from the determined request size, unless
 * this would be less than zero - then zero is used. NOTE THIS
 * CALCULATION IS WRONG WHEN THE START OF THE REGION IS NOT THE ACCESSED
 * PAGE. ALSO THIS CALCULATION IS NOT USED CONSISTENTLY.
 *
 * The size of the region is normally determined from the size of the
 * previous readahead which loaded the preceding pages. This may be
 * discovered from the struct file_ra_state for simple sequential reads,
 * or from examining the state of the page cache when multiple
 * sequential reads are interleaved. Specifically: where the readahead
 * was triggered by the readahead flag, the size of the previous
 * readahead is assumed to be the number of pages from the triggering
 * page to the start of the new readahead. In these cases, the size of
 * the previous readahead is scaled, often doubled, for the new
 * readahead, though see get_next_ra_size() for details.
 *
 * If the size of the previous read cannot be determined, the number of
 * preceding pages in the page cache is used to estimate the size of
 * a previous read. This estimate could easily be misled by random
 * reads being coincidentally adjacent, so it is ignored unless it is
 * larger than the current request, and it is not scaled up, unless it
 * is at the start of file.
 *
 * In general readahead is accelerated at the start of the file, as
 * reads from there are often sequential. There are other minor
 * adjustments to the readahead size in various special cases and these
 * are best discovered by reading the code.
 *
 * The above calculation, based on the previous readahead size,
 * determines the size of the readahead, to which any requested read
 * size may be added.
 *
 * Readahead requests are sent to the filesystem using the ->readahead()
 * address space operation, for which mpage_readahead() is a canonical
 * implementation. ->readahead() should normally initiate reads on all
 * folios, but may fail to read any or all folios without causing an I/O
 * error. The page cache reading code will issue a ->read_folio() request
 * for any folio which ->readahead() did not read, and only an error
 * from this will be final.
 *
 * ->readahead() will generally call readahead_folio() repeatedly to get
 * each folio from those prepared for readahead. It may fail to read a
 * folio by:
 *
 * * not calling readahead_folio() sufficiently many times, effectively
 *   ignoring some folios, as might be appropriate if the path to
 *   storage is congested.
 *
 * * failing to actually submit a read request for a given folio,
 *   possibly due to insufficient resources, or
 *
 * * getting an error during subsequent processing of a request.
 *
 * In the last two cases, the folio should be unlocked by the filesystem
 * to indicate that the read attempt has failed. In the first case the
 * folio will be unlocked by the VFS.
 *
 * Those folios not in the final ``async_size`` of the request should be
 * considered to be important and ->readahead() should not fail them due
 * to congestion or temporary resource unavailability, but should wait
 * for necessary resources (e.g. memory or indexing information) to
 * become available. Folios in the final ``async_size`` may be
 * considered less urgent and failure to read them is more acceptable.
 * In this case it is best to use filemap_remove_folio() to remove the
 * folios from the page cache as is automatically done for folios that
 * were not fetched with readahead_folio(). This will allow a
 * subsequent synchronous readahead request to try them again. If they
 * are left in the page cache, then they will be read individually using
 * ->read_folio() which may be less efficient.
 */
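
/*
 * Illustrative sketch (not part of this file): the general shape of a
 * minimal ->readahead() implementation following the contract described
 * above. example_submit_read() stands in for a hypothetical filesystem
 * helper that queues an asynchronous read and, on completion, marks the
 * folio uptodate and unlocks it.
 *
 *	static void example_readahead(struct readahead_control *rac)
 *	{
 *		struct folio *folio;
 *
 *		while ((folio = readahead_folio(rac)) != NULL) {
 *			// On submission failure, unlock the folio so the
 *			// VFS can fall back to ->read_folio() for it.
 *			if (example_submit_read(rac->file, folio) < 0)
 *				folio_unlock(folio);
 *		}
 *	}
 */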
#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
#include <linux/pagemap.h>
#include <linux/psi.h>
#include <linux/syscalls.h>
#include <linux/file.h>
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
#include <linux/sched/mm.h>
#include <trace/hooks/mm.h>

#include "internal.h"

/*
 * Initialise a struct file's readahead state. Assumes that the caller has
 * memset *ra to zero.
 */
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
        ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
        ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);

gfp_t readahead_gfp_mask(struct address_space *x)
{
        gfp_t mask = __readahead_gfp_mask(x);

        trace_android_rvh_set_readahead_gfp_mask(&mask);
        return mask;
}
EXPORT_SYMBOL_GPL(readahead_gfp_mask);

static void read_pages(struct readahead_control *rac)
{
        const struct address_space_operations *aops = rac->mapping->a_ops;
        struct folio *folio;
        struct blk_plug plug;

        if (!readahead_count(rac))
                return;

        if (unlikely(rac->_workingset))
                psi_memstall_enter(&rac->_pflags);
        blk_start_plug(&plug);

        trace_android_vh_read_pages(rac);

        if (aops->readahead) {
                aops->readahead(rac);
                /*
                 * Clean up the remaining folios. The sizes in ->ra
                 * may be used to size the next readahead, so make sure
                 * they accurately reflect what happened.
                 */
                while ((folio = readahead_folio(rac)) != NULL) {
                        unsigned long nr = folio_nr_pages(folio);

                        folio_get(folio);
                        rac->ra->size -= nr;
                        if (rac->ra->async_size >= nr) {
                                rac->ra->async_size -= nr;
                                filemap_remove_folio(folio);
                        }
                        folio_unlock(folio);
                        folio_put(folio);
                }
        } else {
                while ((folio = readahead_folio(rac)) != NULL)
                        aops->read_folio(rac->file, folio);
        }

        blk_finish_plug(&plug);
        if (unlikely(rac->_workingset))
                psi_memstall_leave(&rac->_pflags);
        rac->_workingset = false;

        BUG_ON(readahead_count(rac));
}

/**
 * page_cache_ra_unbounded - Start unchecked readahead.
 * @ractl: Readahead control.
 * @nr_to_read: The number of pages to read.
 * @lookahead_size: Where to start the next readahead.
 *
 * This function is for filesystems to call when they want to start
 * readahead beyond a file's stated i_size. This is almost certainly
 * not the function you want to call. Use page_cache_async_readahead()
 * or page_cache_sync_readahead() instead.
 *
 * Context: File is referenced by caller. Mutexes may be held by caller.
 * May sleep, but will not reenter filesystem to reclaim memory.
 */
void page_cache_ra_unbounded(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
{
        struct address_space *mapping = ractl->mapping;
        unsigned long index = readahead_index(ractl);
        gfp_t gfp_mask = readahead_gfp_mask(mapping);
        unsigned long i;

        /*
         * Partway through the readahead operation, we will have added
         * locked pages to the page cache, but will not yet have submitted
         * them for I/O. Adding another page may need to allocate memory,
         * which can trigger memory reclaim. Telling the VM we're in
         * the middle of a filesystem operation will cause it to not
         * touch file-backed pages, preventing a deadlock. Most (all?)
         * filesystems already specify __GFP_NOFS in their mapping's
         * gfp_mask, but let's be explicit here.
         */
        unsigned int nofs = memalloc_nofs_save();

        filemap_invalidate_lock_shared(mapping);
        /*
         * Preallocate as many pages as we will need.
         */
        for (i = 0; i < nr_to_read; i++) {
                struct folio *folio = xa_load(&mapping->i_pages, index + i);

                if (folio && !xa_is_value(folio)) {
                        /*
                         * Page already present? Kick off the current batch
                         * of contiguous pages before continuing with the
                         * next batch. This page may be the one we would
                         * have intended to mark as Readahead, but we don't
                         * have a stable reference to this page, and it's
                         * not worth getting one just for that.
                         */
                        read_pages(ractl);
                        ractl->_index++;
                        i = ractl->_index + ractl->_nr_pages - index - 1;
                        continue;
                }

                folio = filemap_alloc_folio(gfp_mask, 0);
                if (!folio)
                        break;
                if (filemap_add_folio(mapping, folio, index + i,
                                        gfp_mask) < 0) {
                        folio_put(folio);
                        read_pages(ractl);
                        ractl->_index++;
                        i = ractl->_index + ractl->_nr_pages - index - 1;
                        continue;
                }
                if (i == nr_to_read - lookahead_size)
                        folio_set_readahead(folio);
                ractl->_workingset |= folio_test_workingset(folio);
                ractl->_nr_pages++;
        }

        /*
         * Now start the IO. We ignore I/O errors - if the folio is not
         * uptodate then the caller will launch read_folio again, and
         * will then handle the error.
         */
        read_pages(ractl);
        filemap_invalidate_unlock_shared(mapping);
        memalloc_nofs_restore(nofs);
}
EXPORT_SYMBOL_GPL(page_cache_ra_unbounded);
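
/*
 * Illustrative sketch (not part of this file): how a filesystem that keeps
 * metadata beyond i_size might preload it, in the spirit of the existing
 * callers of page_cache_ra_unbounded(). example_preload() and the counts
 * are made up for illustration.
 *
 *	static void example_preload(struct file *file, pgoff_t index)
 *	{
 *		DEFINE_READAHEAD(ractl, file, &file->f_ra,
 *				 file->f_mapping, index);
 *
 *		// Read 16 pages, with no readahead marker (lookahead 0).
 *		page_cache_ra_unbounded(&ractl, 16, 0);
 *	}
 */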
/*
 * do_page_cache_ra() actually reads a chunk of disk. It allocates
 * the pages first, then submits them for I/O. This avoids the very bad
 * behaviour which would occur if page allocations are causing VM writeback.
 * We really don't want to intermingle reads and writes like that.
 */
static void do_page_cache_ra(struct readahead_control *ractl,
                unsigned long nr_to_read, unsigned long lookahead_size)
{
        struct inode *inode = ractl->mapping->host;
        unsigned long index = readahead_index(ractl);
        loff_t isize = i_size_read(inode);
        pgoff_t end_index;      /* The last page we want to read */

        if (isize == 0)
                return;

        end_index = (isize - 1) >> PAGE_SHIFT;
        if (index > end_index)
                return;
        /* Don't read past the page containing the last byte of the file */
        if (nr_to_read > end_index - index)
                nr_to_read = end_index - index + 1;

        page_cache_ra_unbounded(ractl, nr_to_read, lookahead_size);
}

/*
 * Chunk the readahead into 2 megabyte units, so that we don't pin too much
 * memory at once.
 */
void force_page_cache_ra(struct readahead_control *ractl,
                unsigned long nr_to_read)
{
        struct address_space *mapping = ractl->mapping;
        struct file_ra_state *ra = ractl->ra;
        struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
        unsigned long max_pages, index;

        if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead))
                return;

        /*
         * If the request exceeds the readahead window, allow the read to
         * be up to the optimal hardware IO size
         */
        index = readahead_index(ractl);
        max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages);
        nr_to_read = min_t(unsigned long, nr_to_read, max_pages);
        while (nr_to_read) {
                unsigned long this_chunk = (2 * 1024 * 1024) / PAGE_SIZE;

                if (this_chunk > nr_to_read)
                        this_chunk = nr_to_read;
                ractl->_index = index;
                do_page_cache_ra(ractl, this_chunk, 0);

                index += this_chunk;
                nr_to_read -= this_chunk;
        }
}
/*
 * Set the initial window size: round the request up to the next power of 2,
 * then scale it up - x4 for small sizes, x2 for medium sizes, and cap at
 * max for large ones.
 * For a 128k (32 page) max ra:
 * 1-2 page read = 16k, 3-4 page = 32k, 5-8 page = 64k, > 8 page = 128k initial
 */
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
        unsigned long newsize = roundup_pow_of_two(size);

        if (newsize <= max / 32)
                newsize = newsize * 4;
        else if (newsize <= max / 4)
                newsize = newsize * 2;
        else
                newsize = max;

        return newsize;
}
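
/*
 * Worked example (illustrative): with max = 32 pages (128k of 4kB pages),
 * a first read of 4 pages gives roundup_pow_of_two(4) = 4, which is
 * greater than 32/32 but <= 32/4, so the initial window is 4 * 2 = 8
 * pages (32k). A 6-page read rounds up to 8 = 32/4 and also doubles,
 * giving 16 pages (64k).
 */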
/*
 * Get the previous window size, ramp it up, and
 * return it as the new window size.
 */
static unsigned long get_next_ra_size(struct file_ra_state *ra,
                                      unsigned long max)
{
        unsigned long cur = ra->size;

        if (cur < max / 16)
                return 4 * cur;
        if (cur <= max / 2)
                return 2 * cur;
        return max;
}
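
/*
 * Worked example (illustrative): with max = 32 pages, a 1-page window
 * (< 32/16) quadruples to 4, an 8-page window (<= 32/2) doubles to 16,
 * and anything larger than 16 pages is clamped to the 32-page maximum.
 */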
/*
 * On-demand readahead design.
 *
 * The fields in struct file_ra_state represent the most-recently-executed
 * readahead attempt:
 *
 *                        |<----- async_size ---------->|
 *     |------------------- size -------------------->|
 *     |==================#===========================|
 *     ^start             ^page marked with PG_readahead
 *
 * To overlap application thinking time and disk I/O time, we do
 * `readahead pipelining': Do not wait until the application consumed all
 * readahead pages and stalled on the missing page at readahead_index;
 * Instead, submit an asynchronous readahead I/O as soon as there are
 * only async_size pages left in the readahead window. Normally async_size
 * will be equal to size, for maximum pipelining.
 *
 * In interleaved sequential reads, concurrent streams on the same fd can
 * be invalidating each other's readahead state. So we flag the new readahead
 * page at (start+size-async_size) with PG_readahead, and use it as readahead
 * indicator. The flag won't be set on already cached pages, to avoid the
 * readahead-for-nothing fuss, saving pointless page cache lookups.
 *
 * prev_pos tracks the last visited byte in the _previous_ read request.
 * It should be maintained by the caller, and will be used for detecting
 * small random reads. Note that the readahead algorithm checks loosely
 * for sequential patterns. Hence interleaved reads might be served as
 * sequential ones.
 *
 * There is a special-case: if the first page which the application tries to
 * read happens to be the first page of the file, it is assumed that a linear
 * read is about to happen and the window is immediately set to the initial size
 * based on I/O request size and the max_readahead.
 *
 * The code ramps up the readahead size aggressively at first, but slows down
 * as it approaches max_readahead.
 */
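
/*
 * Worked example (illustrative, assuming a 32 page max ra and 4kB pages):
 * an application reading 4 pages at a time from offset 0 first gets an
 * initial window of start=0, size=8, async_size=4, with the PG_readahead
 * marker on page 4. When the application reaches page 4, the async trigger
 * fires and the window moves to start=8, size=16, async_size=16, marker on
 * page 8. Hitting page 8 moves it to start=24, size=32, async_size=32,
 * after which the window stays at the 32-page maximum.
 */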
/*
 * Count contiguously cached pages from @index-1 to @index-@max,
 * this count is a conservative estimation of
 * 	- length of the sequential read sequence, or
 * 	- thrashing threshold in memory tight systems
 */
static pgoff_t count_history_pages(struct address_space *mapping,
                                   pgoff_t index, unsigned long max)
{
        pgoff_t head;

        rcu_read_lock();
        head = page_cache_prev_miss(mapping, index - 1, max);
        rcu_read_unlock();

        return index - 1 - head;
}

/*
 * page cache context based readahead
 */
static int try_context_readahead(struct address_space *mapping,
                                 struct file_ra_state *ra,
                                 pgoff_t index,
                                 unsigned long req_size,
                                 unsigned long max)
{
        pgoff_t size;

        size = count_history_pages(mapping, index, max);

        /*
         * not enough history pages:
         * it could be a random read
         */
        if (size <= req_size)
                return 0;

        /*
         * starts from beginning of file:
         * it is a strong indication of long-run stream (or whole-file-read)
         */
        if (size >= index)
                size *= 2;

        ra->start = index;
        ra->size = min(size + req_size, max);
        ra->async_size = 1;

        return 1;
}
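
/*
 * Worked example (illustrative): a read of req_size = 8 at index = 100 with
 * max = 32 that finds 30 contiguously cached pages ending at index 99 is
 * treated as a sequential stream (30 > 8): ra->start = 100,
 * ra->size = min(30 + 8, 32) = 32, ra->async_size = 1. Had the cached run
 * reached back to index 0 (size >= index), it would first have been doubled.
 */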
/*
 * There are some parts of the kernel which assume that PMD entries
 * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
 * limit the maximum allocation order to PMD size. I'm not aware of any
 * assumptions about maximum order if THP are disabled, but 8 seems like
 * a good order (that's 1MB if you're using 4kB pages)
 */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
#define MAX_PAGECACHE_ORDER	HPAGE_PMD_ORDER
#else
#define MAX_PAGECACHE_ORDER	8
#endif

static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
                pgoff_t mark, unsigned int order, gfp_t gfp)
{
        int err;
        struct folio *folio = filemap_alloc_folio(gfp, order);

        if (!folio)
                return -ENOMEM;
        mark = round_up(mark, 1UL << order);
        if (index == mark)
                folio_set_readahead(folio);
        err = filemap_add_folio(ractl->mapping, folio, index, gfp);
        if (err) {
                folio_put(folio);
                return err;
        }

        ractl->_nr_pages += 1UL << order;
        ractl->_workingset |= folio_test_workingset(folio);
        return 0;
}

void page_cache_ra_order(struct readahead_control *ractl,
                struct file_ra_state *ra, unsigned int new_order)
{
        struct address_space *mapping = ractl->mapping;
        pgoff_t index = readahead_index(ractl);
        pgoff_t limit = (i_size_read(mapping->host) - 1) >> PAGE_SHIFT;
        pgoff_t mark = index + ra->size - ra->async_size;
        int err = 0;
        gfp_t gfp = readahead_gfp_mask(mapping);

        if (!mapping_large_folio_support(mapping) || ra->size < 4)
                goto fallback;

        limit = min(limit, index + ra->size - 1);

        if (new_order < MAX_PAGECACHE_ORDER) {
                new_order += 2;
                if (new_order > MAX_PAGECACHE_ORDER)
                        new_order = MAX_PAGECACHE_ORDER;
                while ((1 << new_order) > ra->size)
                        new_order--;
        }

        filemap_invalidate_lock_shared(mapping);
        while (index <= limit) {
                unsigned int order = new_order;

                /* Align with smaller pages if needed */
                if (index & ((1UL << order) - 1)) {
                        order = __ffs(index);
                        if (order == 1)
                                order = 0;
                }
                /* Don't allocate pages past EOF */
                while (index + (1UL << order) - 1 > limit) {
                        if (--order == 1)
                                order = 0;
                }
                err = ra_alloc_folio(ractl, index, mark, order, gfp);
                if (err)
                        break;
                index += 1UL << order;
        }

        if (index > limit) {
                ra->size += index - limit - 1;
                ra->async_size += index - limit - 1;
        }

        read_pages(ractl);
        filemap_invalidate_unlock_shared(mapping);

        /*
         * If there were already pages in the page cache, then we may have
         * left some gaps. Let the regular readahead code take care of this
         * situation.
         */
        if (!err)
                return;
fallback:
        do_page_cache_ra(ractl, ra->size, ra->async_size);
}

/*
 * A minimal readahead algorithm for trivial sequential/random reads.
 */
static void ondemand_readahead(struct readahead_control *ractl,
                struct folio *folio, unsigned long req_size)
{
        struct backing_dev_info *bdi = inode_to_bdi(ractl->mapping->host);
        struct file_ra_state *ra = ractl->ra;
        unsigned long max_pages = ra->ra_pages;
        unsigned long add_pages;
        pgoff_t index = readahead_index(ractl);
        pgoff_t expected, prev_index;
        unsigned int order = folio ? folio_order(folio) : 0;

        /*
         * If the request exceeds the readahead window, allow the read to
         * be up to the optimal hardware IO size
         */
        if (req_size > max_pages && bdi->io_pages > max_pages)
                max_pages = min(req_size, bdi->io_pages);

        trace_android_vh_ra_tuning_max_page(ractl, &max_pages);

        /*
         * start of file
         */
        if (!index)
                goto initial_readahead;

        /*
         * It's the expected callback index, assume sequential access.
         * Ramp up sizes, and push forward the readahead window.
         */
        expected = round_up(ra->start + ra->size - ra->async_size,
                        1UL << order);
        if (index == expected || index == (ra->start + ra->size)) {
                ra->start += ra->size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * Hit a marked folio without valid readahead state.
         * E.g. interleaved reads.
         * Query the pagecache for async_size, which normally equals to
         * readahead size. Ramp it up and use it as the new readahead size.
         */
        if (folio) {
                pgoff_t start;

                rcu_read_lock();
                start = page_cache_next_miss(ractl->mapping, index + 1,
                                max_pages);
                rcu_read_unlock();

                if (!start || start - index > max_pages)
                        return;

                ra->start = start;
                ra->size = start - index;       /* old async_size */
                ra->size += req_size;
                ra->size = get_next_ra_size(ra, max_pages);
                ra->async_size = ra->size;
                goto readit;
        }

        /*
         * oversize read
         */
        if (req_size > max_pages)
                goto initial_readahead;

        /*
         * sequential cache miss
         * trivial case: (index - prev_index) == 1
         * unaligned reads: (index - prev_index) == 0
         */
        prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
        if (index - prev_index <= 1UL)
                goto initial_readahead;

        /*
         * Query the page cache and look for the traces(cached history pages)
         * that a sequential stream would leave behind.
         */
        if (try_context_readahead(ractl->mapping, ra, index, req_size,
                        max_pages))
                goto readit;

        /*
         * standalone, small random read
         * Read as is, and do not pollute the readahead state.
         */
        do_page_cache_ra(ractl, req_size, 0);
        return;

initial_readahead:
        ra->start = index;
        ra->size = get_init_ra_size(req_size, max_pages);
        ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;

readit:
        /*
         * Will this read hit the readahead marker made by itself?
         * If so, trigger the readahead marker hit now, and merge
         * the resulted next readahead window into the current one.
         * Take care of maximum IO pages as above.
         */
        if (index == ra->start && ra->size == ra->async_size) {
                add_pages = get_next_ra_size(ra, max_pages);
                if (ra->size + add_pages <= max_pages) {
                        ra->async_size = add_pages;
                        ra->size += add_pages;
                } else {
                        ra->size = max_pages;
                        ra->async_size = max_pages >> 1;
                }
        }

        ractl->_index = ra->start;
        page_cache_ra_order(ractl, ra, order);
}

void page_cache_sync_ra(struct readahead_control *ractl,
                unsigned long req_count)
{
        bool do_forced_ra = ractl->file && (ractl->file->f_mode & FMODE_RANDOM);

        /*
         * Even if readahead is disabled, issue this request as readahead
         * as we'll need it to satisfy the requested range. The forced
         * readahead will do the right thing and limit the read to just the
         * requested range, which we'll set to 1 page for this case.
         */
        if (!ractl->ra->ra_pages || blk_cgroup_congested()) {
                if (!ractl->file)
                        return;
                req_count = 1;
                do_forced_ra = true;
        }

        /* be dumb */
        if (do_forced_ra) {
                force_page_cache_ra(ractl, req_count);
                return;
        }

        ondemand_readahead(ractl, NULL, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_ra);

void page_cache_async_ra(struct readahead_control *ractl,
                struct folio *folio, unsigned long req_count)
{
        /* no readahead */
        if (!ractl->ra->ra_pages)
                return;

        /*
         * Same bit is used for PG_readahead and PG_reclaim.
         */
        if (folio_test_writeback(folio))
                return;

        folio_clear_readahead(folio);

        if (blk_cgroup_congested())
                return;

        ondemand_readahead(ractl, folio, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_ra);
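
/*
 * Illustrative sketch (not part of this file): how a read path typically
 * drives these two entry points - synchronous readahead on a page cache
 * miss, asynchronous readahead when a folio carrying the readahead flag
 * is found. This mirrors what the filemap code does; example_touch() and
 * its arguments are made up, and the NULL-on-miss return convention of
 * filemap_get_folio() is assumed for this kernel generation.
 *
 *	static void example_touch(struct file *file, pgoff_t index,
 *				  unsigned long nr_pages)
 *	{
 *		DEFINE_READAHEAD(ractl, file, &file->f_ra,
 *				 file->f_mapping, index);
 *		struct folio *folio = filemap_get_folio(file->f_mapping, index);
 *
 *		if (!folio) {
 *			page_cache_sync_ra(&ractl, nr_pages);
 *		} else {
 *			if (folio_test_readahead(folio))
 *				page_cache_async_ra(&ractl, folio, nr_pages);
 *			folio_put(folio);
 *		}
 *	}
 */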
ssize_t ksys_readahead(int fd, loff_t offset, size_t count)
{
        ssize_t ret;
        struct fd f;

        ret = -EBADF;
        f = fdget(fd);
        if (!f.file || !(f.file->f_mode & FMODE_READ))
                goto out;

        /*
         * The readahead() syscall is intended to run only on files
         * that can execute readahead. If readahead is not possible
         * on this file, then we must return -EINVAL.
         */
        ret = -EINVAL;
        if (!f.file->f_mapping || !f.file->f_mapping->a_ops ||
            (!S_ISREG(file_inode(f.file)->i_mode) &&
            !S_ISBLK(file_inode(f.file)->i_mode)))
                goto out;

        ret = vfs_fadvise(f.file, offset, count, POSIX_FADV_WILLNEED);
out:
        fdput(f);
        return ret;
}

SYSCALL_DEFINE3(readahead, int, fd, loff_t, offset, size_t, count)
{
        return ksys_readahead(fd, offset, count);
}

#if defined(CONFIG_COMPAT) && defined(__ARCH_WANT_COMPAT_READAHEAD)
COMPAT_SYSCALL_DEFINE4(readahead, int, fd, compat_arg_u64_dual(offset), size_t, count)
{
        return ksys_readahead(fd, compat_arg_u64_glue(offset), count);
}
#endif
/**
 * readahead_expand - Expand a readahead request
 * @ractl: The request to be expanded
 * @new_start: The revised start
 * @new_len: The revised size of the request
 *
 * Attempt to expand a readahead request outwards from the current size to the
 * specified size by inserting locked pages before and after the current window
 * to increase the size to the new window. This may involve the insertion of
 * THPs, in which case the window may get expanded even beyond what was
 * requested.
 *
 * The algorithm will stop if it encounters a conflicting page already in the
 * pagecache and leave a smaller expansion than requested.
 *
 * The caller must check for this by examining the revised @ractl object for a
 * different expansion than was requested.
 */
void readahead_expand(struct readahead_control *ractl,
                      loff_t new_start, size_t new_len)
{
        struct address_space *mapping = ractl->mapping;
        struct file_ra_state *ra = ractl->ra;
        pgoff_t new_index, new_nr_pages;
        gfp_t gfp_mask = readahead_gfp_mask(mapping);

        new_index = new_start / PAGE_SIZE;

        /* Expand the leading edge downwards */
        while (ractl->_index > new_index) {
                unsigned long index = ractl->_index - 1;
                struct page *page = xa_load(&mapping->i_pages, index);

                if (page && !xa_is_value(page))
                        return; /* Page apparently present */

                page = __page_cache_alloc(gfp_mask);
                if (!page)
                        return;
                if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
                        put_page(page);
                        return;
                }

                ractl->_nr_pages++;
                ractl->_index = page->index;
        }

        new_len += new_start - readahead_pos(ractl);
        new_nr_pages = DIV_ROUND_UP(new_len, PAGE_SIZE);

        /* Expand the trailing edge upwards */
        while (ractl->_nr_pages < new_nr_pages) {
                unsigned long index = ractl->_index + ractl->_nr_pages;
                struct page *page = xa_load(&mapping->i_pages, index);

                if (page && !xa_is_value(page))
                        return; /* Page apparently present */

                page = __page_cache_alloc(gfp_mask);
                if (!page)
                        return;
                if (add_to_page_cache_lru(page, mapping, index, gfp_mask) < 0) {
                        put_page(page);
                        return;
                }
                if (unlikely(PageWorkingset(page)) && !ractl->_workingset) {
                        ractl->_workingset = true;
                        psi_memstall_enter(&ractl->_pflags);
                }
                ractl->_nr_pages++;
                if (ra) {
                        ra->size++;
                        ra->async_size++;
                }
        }
}
EXPORT_SYMBOL(readahead_expand);
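
/*
 * Illustrative sketch (not part of this file): a filesystem whose reads must
 * cover aligned 64kB blocks (a made-up constraint) could widen the window
 * from its ->readahead() before consuming any folios:
 *
 *	static void example_readahead_aligned(struct readahead_control *rac)
 *	{
 *		loff_t start = round_down(readahead_pos(rac), SZ_64K);
 *		size_t len = round_up(readahead_pos(rac) +
 *				      readahead_length(rac), SZ_64K) - start;
 *
 *		readahead_expand(rac, start, len);
 *		// Then drain folios with readahead_folio() as usual, checking
 *		// readahead_pos()/readahead_length() to see how much of the
 *		// requested expansion actually succeeded.
 *	}
 */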