buffered_read.c

// SPDX-License-Identifier: GPL-2.0-or-later
/* Network filesystem high-level buffered read support.
 *
 * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells ([email protected])
 */

#include <linux/export.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"

/*
 * Unlock the folios in a read operation. We need to set PG_fscache on any
 * folios we're going to write back before we unlock them.
 */
void netfs_rreq_unlock_folios(struct netfs_io_request *rreq)
{
        struct netfs_io_subrequest *subreq;
        struct folio *folio;
        pgoff_t start_page = rreq->start / PAGE_SIZE;
        pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
        size_t account = 0;
        bool subreq_failed = false;

        XA_STATE(xas, &rreq->mapping->i_pages, start_page);

        if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) {
                __clear_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags);
                list_for_each_entry(subreq, &rreq->subrequests, rreq_link) {
                        __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
                }
        }

        /* Walk through the pagecache and the I/O request lists simultaneously.
         * We may have a mixture of cached and uncached sections and we only
         * really want to write out the uncached sections. This is slightly
         * complicated by the possibility that we might have huge pages with a
         * mixture inside.
         */
        subreq = list_first_entry(&rreq->subrequests,
                                  struct netfs_io_subrequest, rreq_link);
        subreq_failed = (subreq->error < 0);

        trace_netfs_rreq(rreq, netfs_rreq_trace_unlock);

        rcu_read_lock();
        xas_for_each(&xas, folio, last_page) {
                loff_t pg_end;
                bool pg_failed = false;
                bool folio_started;

                if (xas_retry(&xas, folio))
                        continue;

                pg_end = folio_pos(folio) + folio_size(folio) - 1;

                folio_started = false;
                for (;;) {
                        loff_t sreq_end;

                        if (!subreq) {
                                pg_failed = true;
                                break;
                        }
                        if (!folio_started && test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) {
                                folio_start_fscache(folio);
                                folio_started = true;
                        }
                        pg_failed |= subreq_failed;
                        sreq_end = subreq->start + subreq->len - 1;
                        if (pg_end < sreq_end)
                                break;

                        account += subreq->transferred;
                        if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) {
                                subreq = list_next_entry(subreq, rreq_link);
                                subreq_failed = (subreq->error < 0);
                        } else {
                                subreq = NULL;
                                subreq_failed = false;
                        }

                        if (pg_end == sreq_end)
                                break;
                }

                if (!pg_failed) {
                        flush_dcache_folio(folio);
                        folio_mark_uptodate(folio);
                }

                if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) {
                        if (folio_index(folio) == rreq->no_unlock_folio &&
                            test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags))
                                _debug("no unlock");
                        else
                                folio_unlock(folio);
                }
        }
        rcu_read_unlock();

        task_io_account_read(account);
        if (rreq->netfs_ops->done)
                rreq->netfs_ops->done(rreq);
}
/*
 * Ask the cache backend, if one is attached to this request, how it would
 * like the readahead region expanded.
 */
static void netfs_cache_expand_readahead(struct netfs_io_request *rreq,
                                         loff_t *_start, size_t *_len, loff_t i_size)
{
        struct netfs_cache_resources *cres = &rreq->cache_resources;

        if (cres->ops && cres->ops->expand_readahead)
                cres->ops->expand_readahead(cres, _start, _len, i_size);
}

static void netfs_rreq_expand(struct netfs_io_request *rreq,
                              struct readahead_control *ractl)
{
        /* Give the cache a chance to change the request parameters. The
         * resultant request must contain the original region.
         */
        netfs_cache_expand_readahead(rreq, &rreq->start, &rreq->len, rreq->i_size);

        /* Give the netfs a chance to change the request parameters. The
         * resultant request must contain the original region.
         */
        if (rreq->netfs_ops->expand_readahead)
                rreq->netfs_ops->expand_readahead(rreq);

        /* Expand the request if the cache wants it to start earlier. Note
         * that the expansion may get further extended if the VM wishes to
         * insert THPs and the preferred start and/or end wind up in the middle
         * of THPs.
         *
         * If this is the case, however, the THP size should be an integer
         * multiple of the cache granule size, so we get a whole number of
         * granules to deal with.
         */
        if (rreq->start != readahead_pos(ractl) ||
            rreq->len != readahead_length(ractl)) {
                readahead_expand(ractl, rreq->start, rreq->len);
                rreq->start = readahead_pos(ractl);
                rreq->len = readahead_length(ractl);

                trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
                                 netfs_read_trace_expanded);
        }
}
/**
 * netfs_readahead - Helper to manage a read request
 * @ractl: The description of the readahead request
 *
 * Fulfil a readahead request by drawing data from the cache if possible, or
 * the netfs if not. Space beyond the EOF is zero-filled. Multiple I/O
 * requests from different sources will get munged together. If necessary, the
 * readahead window can be expanded in either direction to a more convenient
 * alignment for RPC efficiency or to make storage in the cache feasible.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
void netfs_readahead(struct readahead_control *ractl)
{
        struct netfs_io_request *rreq;
        struct netfs_inode *ctx = netfs_inode(ractl->mapping->host);
        int ret;

        _enter("%lx,%x", readahead_index(ractl), readahead_count(ractl));

        if (readahead_count(ractl) == 0)
                return;

        rreq = netfs_alloc_request(ractl->mapping, ractl->file,
                                   readahead_pos(ractl),
                                   readahead_length(ractl),
                                   NETFS_READAHEAD);
        if (IS_ERR(rreq))
                return;

        if (ctx->ops->begin_cache_operation) {
                ret = ctx->ops->begin_cache_operation(rreq);
                if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
                        goto cleanup_free;
        }

        netfs_stat(&netfs_n_rh_readahead);
        trace_netfs_read(rreq, readahead_pos(ractl), readahead_length(ractl),
                         netfs_read_trace_readahead);

        netfs_rreq_expand(rreq, ractl);

        /* Drop the refs on the folios here rather than in the cache or
         * filesystem. The locks will be dropped in netfs_rreq_unlock_folios().
         */
        while (readahead_folio(ractl))
                ;

        netfs_begin_read(rreq, false);
        return;

cleanup_free:
        netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
        return;
}
EXPORT_SYMBOL(netfs_readahead);
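
/* Illustrative sketch, not part of this file: the kernel-doc above requires a
 * netfs context contiguous to the VFS inode. A filesystem would normally get
 * that by embedding struct netfs_inode at the start of its own inode and
 * initialising it when the inode is set up. "myfs", myfs_inode and
 * myfs_req_ops are hypothetical names, and the two-argument form of
 * netfs_inode_init() used in this kernel generation is assumed.
 *
 *      struct myfs_inode {
 *              struct netfs_inode netfs;       // must be first: wraps the VFS inode
 *              // ... filesystem-private fields ...
 *      };
 *
 *      static void myfs_set_up_inode(struct inode *inode)
 *      {
 *              struct myfs_inode *mi =
 *                      container_of(inode, struct myfs_inode, netfs.inode);
 *
 *              // Point the generic read helpers at this filesystem's ops table.
 *              netfs_inode_init(&mi->netfs, &myfs_req_ops);
 *      }
 */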
/**
 * netfs_read_folio - Helper to manage a read_folio request
 * @file: The file to read from
 * @folio: The folio to read
 *
 * Fulfil a read_folio request by drawing data from the cache if
 * possible, or the netfs if not. Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_read_folio(struct file *file, struct folio *folio)
{
        struct address_space *mapping = folio_file_mapping(folio);
        struct netfs_io_request *rreq;
        struct netfs_inode *ctx = netfs_inode(mapping->host);
        int ret;

        _enter("%lx", folio_index(folio));

        rreq = netfs_alloc_request(mapping, file,
                                   folio_file_pos(folio), folio_size(folio),
                                   NETFS_READPAGE);
        if (IS_ERR(rreq)) {
                ret = PTR_ERR(rreq);
                goto alloc_error;
        }

        if (ctx->ops->begin_cache_operation) {
                ret = ctx->ops->begin_cache_operation(rreq);
                if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
                        goto discard;
        }

        netfs_stat(&netfs_n_rh_readpage);
        trace_netfs_read(rreq, rreq->start, rreq->len, netfs_read_trace_readpage);
        return netfs_begin_read(rreq, true);

discard:
        netfs_put_request(rreq, false, netfs_rreq_trace_put_discard);
alloc_error:
        folio_unlock(folio);
        return ret;
}
EXPORT_SYMBOL(netfs_read_folio);
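
/* Illustrative sketch, not part of this file: netfs_read_folio() and
 * netfs_readahead() match the address_space_operations prototypes of this
 * kernel generation, so a network filesystem can usually wire them in
 * directly rather than wrapping them. "myfs" is a hypothetical name.
 *
 *      const struct address_space_operations myfs_aops = {
 *              .read_folio     = netfs_read_folio,
 *              .readahead      = netfs_readahead,
 *              // ... write-side and other methods ...
 *      };
 */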
/*
 * Prepare a folio for writing without reading first
 * @folio: The folio being prepared
 * @pos: starting position for the write
 * @len: length of write
 * @always_fill: T if the folio should always be completely filled/cleared
 *
 * In some cases, write_begin doesn't need to read at all:
 * - full folio write
 * - write that lies in a folio that is completely beyond EOF
 * - write that covers the folio from start to EOF or beyond it
 *
 * If any of these criteria are met, then zero out the unwritten parts
 * of the folio and return true. Otherwise, return false.
 */
static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len,
                                  bool always_fill)
{
        struct inode *inode = folio_inode(folio);
        loff_t i_size = i_size_read(inode);
        size_t offset = offset_in_folio(folio, pos);
        size_t plen = folio_size(folio);

        if (unlikely(always_fill)) {
                if (pos - offset + len <= i_size)
                        return false; /* Page entirely before EOF */
                zero_user_segment(&folio->page, 0, plen);
                folio_mark_uptodate(folio);
                return true;
        }

        /* Full folio write */
        if (offset == 0 && len >= plen)
                return true;

        /* Page entirely beyond the end of the file */
        if (pos - offset >= i_size)
                goto zero_out;

        /* Write that covers from the start of the folio to EOF or beyond */
        if (offset == 0 && (pos + len) >= i_size)
                goto zero_out;

        return false;
zero_out:
        zero_user_segments(&folio->page, 0, offset, offset + len, plen);
        return true;
}
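
/* Worked example of the criteria above (illustrative; assumes 4096-byte
 * folios and an i_size of 10000 bytes):
 *
 *  - pos 8192, len 4096: offset == 0 and len >= plen, so the write covers the
 *    whole folio and no read or zeroing is needed.
 *  - pos 16384, len 100: the containing folio starts at or beyond i_size, so
 *    it lies entirely past EOF and is simply zeroed.
 *  - pos 8192, len 2000: offset == 0 and pos + len (10192) >= i_size, so the
 *    write runs from the folio start to beyond EOF; the rest of the folio is
 *    zeroed rather than read.
 *  - pos 8292, len 100: none of the criteria apply, so the caller must read
 *    the folio before the write can proceed.
 */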
/**
 * netfs_write_begin - Helper to prepare for writing
 * @ctx: The netfs context
 * @file: The file to read from
 * @mapping: The mapping to read from
 * @pos: File position at which the write will begin
 * @len: The length of the write (may extend beyond the end of the folio chosen)
 * @_folio: Where to put the resultant folio
 * @_fsdata: Place for the netfs to store a cookie
 *
 * Pre-read data for a write-begin request by drawing data from the cache if
 * possible, or the netfs if not. Space beyond the EOF is zero-filled.
 * Multiple I/O requests from different sources will get munged together. If
 * necessary, the readahead window can be expanded in either direction to a
 * more convenient alignment for RPC efficiency or to make storage in the cache
 * feasible.
 *
 * The calling netfs must provide a table of operations, only one of which,
 * issue_op, is mandatory.
 *
 * The check_write_begin() operation can be provided to check for and flush
 * conflicting writes once the folio is grabbed and locked. It is passed a
 * pointer to the fsdata cookie that gets returned to the VM to be passed to
 * write_end. It is permitted to sleep. It should return 0 if the request
 * should go ahead or it may return an error. It may also unlock and put the
 * folio, provided it sets ``*foliop`` to NULL, in which case a return of 0
 * will cause the folio to be re-got and the process to be retried.
 *
 * The calling netfs must initialise a netfs context contiguous to the vfs
 * inode before calling this.
 *
 * This is usable whether or not caching is enabled.
 */
int netfs_write_begin(struct netfs_inode *ctx,
                      struct file *file, struct address_space *mapping,
                      loff_t pos, unsigned int len, struct folio **_folio,
                      void **_fsdata)
{
        struct netfs_io_request *rreq;
        struct folio *folio;
        unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
        pgoff_t index = pos >> PAGE_SHIFT;
        int ret;

        DEFINE_READAHEAD(ractl, file, NULL, mapping, index);

retry:
        folio = __filemap_get_folio(mapping, index, fgp_flags,
                                    mapping_gfp_mask(mapping));
        if (!folio)
                return -ENOMEM;

        if (ctx->ops->check_write_begin) {
                /* Allow the netfs (eg. ceph) to flush conflicts. */
                ret = ctx->ops->check_write_begin(file, pos, len, &folio, _fsdata);
                if (ret < 0) {
                        trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin);
                        goto error;
                }
                if (!folio)
                        goto retry;
        }

        if (folio_test_uptodate(folio))
                goto have_folio;

        /* If the page is beyond the EOF, we want to clear it - unless it's
         * within the cache granule containing the EOF, in which case we need
         * to preload the granule.
         */
        if (!netfs_is_cache_enabled(ctx) &&
            netfs_skip_folio_read(folio, pos, len, false)) {
                netfs_stat(&netfs_n_rh_write_zskip);
                goto have_folio_no_wait;
        }

        rreq = netfs_alloc_request(mapping, file,
                                   folio_file_pos(folio), folio_size(folio),
                                   NETFS_READ_FOR_WRITE);
        if (IS_ERR(rreq)) {
                ret = PTR_ERR(rreq);
                goto error;
        }
        rreq->no_unlock_folio = folio_index(folio);
        __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags);

        if (ctx->ops->begin_cache_operation) {
                ret = ctx->ops->begin_cache_operation(rreq);
                if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS)
                        goto error_put;
        }

        netfs_stat(&netfs_n_rh_write_begin);
        trace_netfs_read(rreq, pos, len, netfs_read_trace_write_begin);

        /* Expand the request to meet caching requirements and download
         * preferences.
         */
        ractl._nr_pages = folio_nr_pages(folio);
        netfs_rreq_expand(rreq, &ractl);

        /* We hold the folio locks, so we can drop the references */
        folio_get(folio);
        while (readahead_folio(&ractl))
                ;

        ret = netfs_begin_read(rreq, true);
        if (ret < 0)
                goto error;

have_folio:
        ret = folio_wait_fscache_killable(folio);
        if (ret < 0)
                goto error;
have_folio_no_wait:
        *_folio = folio;
        _leave(" = 0");
        return 0;

error_put:
        netfs_put_request(rreq, false, netfs_rreq_trace_put_failed);
error:
        if (folio) {
                folio_unlock(folio);
                folio_put(folio);
        }
        _leave(" = %d", ret);
        return ret;
}
EXPORT_SYMBOL(netfs_write_begin);
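
/* Illustrative sketch, not part of this file: a filesystem's own
 * ->write_begin() would typically wrap netfs_write_begin(), passing in its
 * embedded netfs context and handing the resulting locked folio back to the
 * VM. "myfs" and myfs_inode are hypothetical, and the flag-less
 * address_space_operations write_begin prototype of this kernel generation is
 * assumed.
 *
 *      static int myfs_write_begin(struct file *file, struct address_space *mapping,
 *                                  loff_t pos, unsigned len,
 *                                  struct page **_page, void **fsdata)
 *      {
 *              struct myfs_inode *mi =
 *                      container_of(mapping->host, struct myfs_inode, netfs.inode);
 *              struct folio *folio;
 *              int ret;
 *
 *              ret = netfs_write_begin(&mi->netfs, file, mapping, pos, len,
 *                                      &folio, fsdata);
 *              if (ret < 0)
 *                      return ret;
 *
 *              // The folio comes back locked and uptodate (or suitably zeroed).
 *              *_page = folio_file_page(folio, pos / PAGE_SIZE);
 *              return 0;
 *      }
 */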