// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * Copyright (c) 2016-2018 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_reflink.h"

struct xfs_writepage_ctx {
	struct iomap_writepage_ctx ctx;
	unsigned int		data_seq;
	unsigned int		cow_seq;
};

static inline struct xfs_writepage_ctx *
XFS_WPC(struct iomap_writepage_ctx *ctx)
{
	return container_of(ctx, struct xfs_writepage_ctx, ctx);
}

/*
 * Fast and loose check if this write could update the on-disk inode size.
 */
static inline bool xfs_ioend_is_append(struct iomap_ioend *ioend)
{
	return ioend->io_offset + ioend->io_size >
		XFS_I(ioend->io_inode)->i_disk_size;
}

/*
 * Update on-disk file size now that data has been written to disk.
 */
int
xfs_setfilesize(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	size_t			size)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	xfs_fsize_t		isize;
	int			error;

	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	isize = xfs_new_eof(ip, offset + size);
	if (!isize) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		xfs_trans_cancel(tp);
		return 0;
	}

	trace_xfs_setfilesize(ip, offset, size);

	ip->i_disk_size = isize;
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	return xfs_trans_commit(tp);
}

/*
 * IO write completion.
 */
STATIC void
xfs_end_ioend(
	struct iomap_ioend	*ioend)
{
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	struct xfs_mount	*mp = ip->i_mount;
	xfs_off_t		offset = ioend->io_offset;
	size_t			size = ioend->io_size;
	unsigned int		nofs_flag;
	int			error;

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim. To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	/*
	 * Just clean up the in-memory structures if the fs has been shut down.
	 */
	if (xfs_is_shutdown(mp)) {
		error = -EIO;
		goto done;
	}

	/*
	 * Clean up all COW blocks and underlying data fork delalloc blocks on
	 * I/O error. The delalloc punch is required because this ioend was
	 * mapped to blocks in the COW fork and the associated pages are no
	 * longer dirty. If we don't remove delalloc blocks here, they become
	 * stale and can corrupt free space accounting on unmount.
	 */
	error = blk_status_to_errno(ioend->io_bio->bi_status);
	if (unlikely(error)) {
		if (ioend->io_flags & IOMAP_F_SHARED) {
			xfs_reflink_cancel_cow_range(ip, offset, size, true);
			xfs_bmap_punch_delalloc_range(ip,
					XFS_B_TO_FSBT(mp, offset),
					XFS_B_TO_FSB(mp, size));
		}
		goto done;
	}

	/*
	 * Success: commit the COW or unwritten blocks if needed.
	 */
	if (ioend->io_flags & IOMAP_F_SHARED)
		error = xfs_reflink_end_cow(ip, offset, size);
	else if (ioend->io_type == IOMAP_UNWRITTEN)
		error = xfs_iomap_write_unwritten(ip, offset, size, false);

	if (!error && xfs_ioend_is_append(ioend))
		error = xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
done:
	iomap_finish_ioends(ioend, error);
	memalloc_nofs_restore(nofs_flag);
}

/*
 * Finish all pending IO completions that require transactional modifications.
 *
 * We try to merge physically and logically contiguous ioends before completion
 * to minimise the number of transactions we need to perform during IO
 * completion. Both unwritten extent conversion and COW remapping need to
 * iterate and modify one physical extent at a time, so we gain nothing by
 * merging physically discontiguous extents here.
 *
 * The ioend chain we process here is largely unbound in length and we may have
 * to perform significant amounts of work on each ioend to complete it. Hence
 * we have to be careful about holding the CPU for too long in this loop.
 */
void
xfs_end_io(
	struct work_struct	*work)
{
	struct xfs_inode	*ip =
		container_of(work, struct xfs_inode, i_ioend_work);
	struct iomap_ioend	*ioend;
	struct list_head	tmp;
	unsigned long		flags;

	spin_lock_irqsave(&ip->i_ioend_lock, flags);
	list_replace_init(&ip->i_ioend_list, &tmp);
	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);

	iomap_sort_ioends(&tmp);
	while ((ioend = list_first_entry_or_null(&tmp, struct iomap_ioend,
			io_list))) {
		list_del_init(&ioend->io_list);
		iomap_ioend_try_merge(ioend, &tmp);
		xfs_end_ioend(ioend);
		cond_resched();
	}
}

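/*
 * Bio completion handler for buffered writeback. Completion work that may need
 * a transaction (size updates, unwritten conversion, COW remapping) cannot run
 * in bio completion context, so batch the ioend onto the inode's i_ioend_list
 * and kick the per-inode completion work item the first time the list goes
 * from empty to non-empty.
 */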
STATIC void
xfs_end_bio(
	struct bio		*bio)
{
	struct iomap_ioend	*ioend = bio->bi_private;
	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
	unsigned long		flags;

	spin_lock_irqsave(&ip->i_ioend_lock, flags);
	if (list_empty(&ip->i_ioend_list))
		WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue,
					 &ip->i_ioend_work));
	list_add_tail(&ioend->io_list, &ip->i_ioend_list);
	spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
}

/*
 * Fast revalidation of the cached writeback mapping. Return true if the
 * current mapping is valid, false otherwise.
 */
static bool
xfs_imap_valid(
	struct iomap_writepage_ctx	*wpc,
	struct xfs_inode		*ip,
	loff_t				offset)
{
	if (offset < wpc->iomap.offset ||
	    offset >= wpc->iomap.offset + wpc->iomap.length)
		return false;
	/*
	 * If this is a COW mapping, it is sufficient to check that the mapping
	 * covers the offset. Be careful to check this first because the caller
	 * can revalidate a COW mapping without updating the data seqno.
	 */
	if (wpc->iomap.flags & IOMAP_F_SHARED)
		return true;

	/*
	 * This is not a COW mapping. Check the sequence number of the data
	 * fork because concurrent changes could have invalidated the extent.
	 * Check the COW fork because concurrent changes since the last time we
	 * checked (and found nothing at this offset) could have added
	 * overlapping blocks.
	 */
	if (XFS_WPC(wpc)->data_seq != READ_ONCE(ip->i_df.if_seq))
		return false;
	if (xfs_inode_has_cow_data(ip) &&
	    XFS_WPC(wpc)->cow_seq != READ_ONCE(ip->i_cowfp->if_seq))
		return false;
	return true;
}

/*
 * Pass in a delalloc extent and convert it to real extents, return the real
 * extent that maps offset_fsb in wpc->iomap.
 *
 * The current page is held locked so nothing could have removed the block
 * backing offset_fsb, although it could have moved from the COW to the data
 * fork by another thread.
 */
static int
xfs_convert_blocks(
	struct iomap_writepage_ctx *wpc,
	struct xfs_inode	*ip,
	int			whichfork,
	loff_t			offset)
{
	int			error;
	unsigned		*seq;

	if (whichfork == XFS_COW_FORK)
		seq = &XFS_WPC(wpc)->cow_seq;
	else
		seq = &XFS_WPC(wpc)->data_seq;

	/*
	 * Attempt to allocate whatever delalloc extent currently backs offset
	 * and put the result into wpc->iomap. Allocate in a loop because it
	 * may take several attempts to allocate real blocks for a contiguous
	 * delalloc extent if free space is sufficiently fragmented.
	 */
	do {
		error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
				&wpc->iomap, seq);
		if (error)
			return error;
	} while (wpc->iomap.offset + wpc->iomap.length <= offset);

	return 0;
}

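/*
 * ->map_blocks callback for iomap writeback. Find (or allocate) the extent
 * that backs @offset and cache it in wpc->iomap so that subsequent blocks in
 * the same extent can reuse the mapping without taking the ILOCK again.
 */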
static int
xfs_map_blocks(
	struct iomap_writepage_ctx *wpc,
	struct inode		*inode,
	loff_t			offset)
{
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	ssize_t			count = i_blocksize(inode);
	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
	xfs_fileoff_t		cow_fsb;
	int			whichfork;
	struct xfs_bmbt_irec	imap;
	struct xfs_iext_cursor	icur;
	int			retries = 0;
	int			error = 0;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/*
	 * COW fork blocks can overlap data fork blocks even if the blocks
	 * aren't shared. COW I/O always takes precedence, so we must always
	 * check for overlap on reflink inodes unless the mapping is already a
	 * COW one, or the COW fork hasn't changed from the last time we looked
	 * at it.
	 *
	 * It's safe to check the COW fork if_seq here without the ILOCK because
	 * we've indirectly protected against concurrent updates: writeback has
	 * the page locked, which prevents concurrent invalidations by reflink
	 * and directio and prevents concurrent buffered writes to the same
	 * page. Changes to if_seq always happen under i_lock, which protects
	 * against concurrent updates and provides a memory barrier on the way
	 * out that ensures that we always see the current value.
	 */
	if (xfs_imap_valid(wpc, ip, offset))
		return 0;

	/*
	 * If we don't have a valid map, now it's time to get a new one for this
	 * offset. This will convert delayed allocations (including COW ones)
	 * into real extents. If we return without a valid map, it means we
	 * landed in a hole and we skip the block.
	 */
retry:
	cow_fsb = NULLFILEOFF;
	whichfork = XFS_DATA_FORK;
	xfs_ilock(ip, XFS_ILOCK_SHARED);
	ASSERT(!xfs_need_iread_extents(&ip->i_df));

	/*
	 * Check if this offset is covered by a COW extent, and if so use it
	 * directly instead of looking up anything in the data fork.
	 */
	if (xfs_inode_has_cow_data(ip) &&
	    xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap))
		cow_fsb = imap.br_startoff;
	if (cow_fsb != NULLFILEOFF && cow_fsb <= offset_fsb) {
		XFS_WPC(wpc)->cow_seq = READ_ONCE(ip->i_cowfp->if_seq);
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		whichfork = XFS_COW_FORK;
		goto allocate_blocks;
	}

	/*
	 * No COW extent overlap. Revalidate now that we may have updated
	 * ->cow_seq. If the data mapping is still valid, we're done.
	 */
	if (xfs_imap_valid(wpc, ip, offset)) {
		xfs_iunlock(ip, XFS_ILOCK_SHARED);
		return 0;
	}

	/*
	 * If we don't have a valid map, now it's time to get a new one for this
	 * offset. This will convert delayed allocations (including COW ones)
	 * into real extents.
	 */
	if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
		imap.br_startoff = end_fsb;	/* fake a hole past EOF */
	XFS_WPC(wpc)->data_seq = READ_ONCE(ip->i_df.if_seq);
	xfs_iunlock(ip, XFS_ILOCK_SHARED);

	/* landed in a hole or beyond EOF? */
	if (imap.br_startoff > offset_fsb) {
		imap.br_blockcount = imap.br_startoff - offset_fsb;
		imap.br_startoff = offset_fsb;
		imap.br_startblock = HOLESTARTBLOCK;
		imap.br_state = XFS_EXT_NORM;
	}

	/*
	 * Truncate to the next COW extent if there is one. This is the only
	 * opportunity to do this because we can skip COW fork lookups for the
	 * subsequent blocks in the mapping; however, the requirement to treat
	 * the COW range separately remains.
	 */
	if (cow_fsb != NULLFILEOFF &&
	    cow_fsb < imap.br_startoff + imap.br_blockcount)
		imap.br_blockcount = cow_fsb - imap.br_startoff;

	/* got a delalloc extent? */
	if (imap.br_startblock != HOLESTARTBLOCK &&
	    isnullstartblock(imap.br_startblock))
		goto allocate_blocks;

	xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0);
	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
	return 0;
allocate_blocks:
	error = xfs_convert_blocks(wpc, ip, whichfork, offset);
	if (error) {
		/*
		 * If we failed to find the extent in the COW fork we might have
		 * raced with a COW to data fork conversion or truncate.
		 * Restart the lookup to catch the extent in the data fork for
		 * the former case, but prevent additional retries to avoid
		 * looping forever for the latter case.
		 */
		if (error == -EAGAIN && whichfork == XFS_COW_FORK && !retries++)
			goto retry;
		ASSERT(error != -EAGAIN);
		return error;
	}

	/*
	 * Due to merging the return real extent might be larger than the
	 * original delalloc one. Trim the return extent to the next COW
	 * boundary again to force a re-lookup.
	 */
	if (whichfork != XFS_COW_FORK && cow_fsb != NULLFILEOFF) {
		loff_t		cow_offset = XFS_FSB_TO_B(mp, cow_fsb);

		if (cow_offset < wpc->iomap.offset + wpc->iomap.length)
			wpc->iomap.length = cow_offset - wpc->iomap.offset;
	}

	ASSERT(wpc->iomap.offset <= offset);
	ASSERT(wpc->iomap.offset + wpc->iomap.length > offset);
	trace_xfs_map_blocks_alloc(ip, offset, count, whichfork, &imap);
	return 0;
}

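/*
 * Called just before the ioend's bio is submitted. Convert any CoW extents
 * covered by a shared ioend and, if completion will require a transaction,
 * point the bio's end_io at xfs_end_bio so the work is deferred to a
 * workqueue.
 */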
static int
xfs_prepare_ioend(
	struct iomap_ioend	*ioend,
	int			status)
{
	unsigned int		nofs_flag;

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim. To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	/* Convert CoW extents to regular */
	if (!status && (ioend->io_flags & IOMAP_F_SHARED)) {
		status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
				ioend->io_offset, ioend->io_size);
	}

	memalloc_nofs_restore(nofs_flag);

	/* send ioends that might require a transaction to the completion wq */
	if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN ||
	    (ioend->io_flags & IOMAP_F_SHARED))
		ioend->io_bio->bi_end_io = xfs_end_bio;
	return status;
}

/*
 * If the page has delalloc blocks on it, we need to punch them out before we
 * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 * inode that can trip up a later direct I/O read operation on the same region.
 *
 * We prevent this by truncating away the delalloc regions on the page. Because
 * they are delalloc, we can do this without needing a transaction. Indeed - if
 * we get ENOSPC errors, we have to be able to do this truncation without a
 * transaction as there is no space left for block reservation (typically why
 * we see an ENOSPC in writeback).
 */
static void
xfs_discard_folio(
	struct folio		*folio,
	loff_t			pos)
{
	struct inode		*inode = folio->mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	struct xfs_mount	*mp = ip->i_mount;
	size_t			offset = offset_in_folio(folio, pos);
	xfs_fileoff_t		start_fsb = XFS_B_TO_FSBT(mp, pos);
	xfs_fileoff_t		pageoff_fsb = XFS_B_TO_FSBT(mp, offset);
	int			error;

	if (xfs_is_shutdown(mp))
		return;

	xfs_alert_ratelimited(mp,
		"page discard on page "PTR_FMT", inode 0x%llx, pos %llu.",
			folio, ip->i_ino, pos);

	error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
			i_blocks_per_folio(inode, folio) - pageoff_fsb);
	if (error && !xfs_is_shutdown(mp))
		xfs_alert(mp, "page discard unable to remove delalloc mapping.");
}

static const struct iomap_writeback_ops xfs_writeback_ops = {
	.map_blocks		= xfs_map_blocks,
	.prepare_ioend		= xfs_prepare_ioend,
	.discard_folio		= xfs_discard_folio,
};

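/*
 * ->writepages for regular (non-DAX) files: push dirty pagecache through the
 * iomap writeback machinery using a fresh writepage context for this call.
 */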
STATIC int
xfs_vm_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_writepage_ctx wpc = { };

	/*
	 * Writing back data in a transaction context can result in recursive
	 * transactions. This is bad, so issue a warning and get out of here.
	 */
	if (WARN_ON_ONCE(current->journal_info))
		return 0;

	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
	return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}

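/*
 * ->writepages for DAX files: flush dirty DAX mappings to persistence via the
 * inode's backing dax device.
 */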
STATIC int
xfs_dax_writepages(
	struct address_space	*mapping,
	struct writeback_control *wbc)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	xfs_iflags_clear(ip, XFS_ITRUNCATED);
	return dax_writeback_mapping_range(mapping,
			xfs_inode_buftarg(ip)->bt_daxdev, wbc);
}

STATIC sector_t
xfs_vm_bmap(
	struct address_space	*mapping,
	sector_t		block)
{
	struct xfs_inode	*ip = XFS_I(mapping->host);

	trace_xfs_vm_bmap(ip);

	/*
	 * The swap code (ab-)uses ->bmap to get a block mapping and then
	 * bypasses the file system for actual I/O. We really can't allow
	 * that on reflink inodes, so we have to skip out here. And yes,
	 * 0 is the magic code for a bmap error.
	 *
	 * Since we don't pass back blockdev info, we can't return bmap
	 * information for rt files either.
	 */
	if (xfs_is_cow_inode(ip) || XFS_IS_REALTIME_INODE(ip))
		return 0;
	return iomap_bmap(mapping, block, &xfs_read_iomap_ops);
}

STATIC int
xfs_vm_read_folio(
	struct file		*unused,
	struct folio		*folio)
{
	return iomap_read_folio(folio, &xfs_read_iomap_ops);
}

STATIC void
xfs_vm_readahead(
	struct readahead_control	*rac)
{
	iomap_readahead(rac, &xfs_read_iomap_ops);
}

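/*
 * Activate a swapfile on XFS: hand the swap code an iomap-based extent walk
 * and point it at the data device backing this inode.
 */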
static int
xfs_iomap_swapfile_activate(
	struct swap_info_struct		*sis,
	struct file			*swap_file,
	sector_t			*span)
{
	sis->bdev = xfs_inode_buftarg(XFS_I(file_inode(swap_file)))->bt_bdev;
	return iomap_swapfile_activate(sis, swap_file, span,
			&xfs_read_iomap_ops);
}

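/* Address space operations for regular (non-DAX) XFS files. */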
const struct address_space_operations xfs_address_space_operations = {
	.read_folio		= xfs_vm_read_folio,
	.readahead		= xfs_vm_readahead,
	.writepages		= xfs_vm_writepages,
	.dirty_folio		= filemap_dirty_folio,
	.release_folio		= iomap_release_folio,
	.invalidate_folio	= iomap_invalidate_folio,
	.bmap			= xfs_vm_bmap,
	.direct_IO		= noop_direct_IO,
	.migrate_folio		= filemap_migrate_folio,
	.is_partially_uptodate  = iomap_is_partially_uptodate,
	.error_remove_page	= generic_error_remove_page,
	.swap_activate		= xfs_iomap_swapfile_activate,
};

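/* Address space operations for DAX files, which bypass the pagecache. */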
const struct address_space_operations xfs_dax_aops = {
	.writepages		= xfs_dax_writepages,
	.direct_IO		= noop_direct_IO,
	.dirty_folio		= noop_dirty_folio,
	.swap_activate		= xfs_iomap_swapfile_activate,
};