xfs_file.c

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_dir2.h"
#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_icache.h"
#include "xfs_pnfs.h"
#include "xfs_iomap.h"
#include "xfs_reflink.h"

#include <linux/dax.h>
#include <linux/falloc.h>
#include <linux/backing-dev.h>
#include <linux/mman.h>
#include <linux/fadvise.h>
#include <linux/mount.h>

static const struct vm_operations_struct xfs_file_vm_ops;

/*
 * Decide if the given file range is aligned to the size of the fundamental
 * allocation unit for the file.
 */
static bool
xfs_is_falloc_aligned(
	struct xfs_inode	*ip,
	loff_t			pos,
	long long int		len)
{
	struct xfs_mount	*mp = ip->i_mount;
	uint64_t		mask;

	if (XFS_IS_REALTIME_INODE(ip)) {
		if (!is_power_of_2(mp->m_sb.sb_rextsize)) {
			u64	rextbytes;
			u32	mod;

			rextbytes = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize);
			div_u64_rem(pos, rextbytes, &mod);
			if (mod)
				return false;
			div_u64_rem(len, rextbytes, &mod);
			return mod == 0;
		}
		mask = XFS_FSB_TO_B(mp, mp->m_sb.sb_rextsize) - 1;
	} else {
		mask = mp->m_sb.sb_blocksize - 1;
	}

	return !((pos | len) & mask);
}
/*
 * Fsync operations on directories are much simpler than on regular files,
 * as there is no file data to flush, and thus also no need for explicit
 * cache flush operations, and there are no non-transaction metadata updates
 * on directories either.
 */
STATIC int
xfs_dir_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);

	trace_xfs_dir_fsync(ip);
	return xfs_log_force_inode(ip);
}
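
/*
 * Return the log commit sequence that needs to be forced to cover this
 * inode's dirty metadata, or 0 if the inode is not pinned or this is a
 * datasync and only timestamps are dirty.
 */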
static xfs_csn_t
xfs_fsync_seq(
	struct xfs_inode	*ip,
	bool			datasync)
{
	if (!xfs_ipincount(ip))
		return 0;
	if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
		return 0;
	return ip->i_itemp->ili_commit_seq;
}

/*
 * All metadata updates are logged, which means that we just have to flush the
 * log up to the latest LSN that touched the inode.
 *
 * If we have concurrent fsync/fdatasync() calls, we need them to all block on
 * the log force before we clear the ili_fsync_fields field. This ensures that
 * we don't get a racing sync operation that does not wait for the metadata to
 * hit the journal before returning. If we race with clearing ili_fsync_fields,
 * then all that will happen is the log force will do nothing as the lsn will
 * already be on disk. We can't race with setting ili_fsync_fields because that
 * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
 * shared until after the ili_fsync_fields is cleared.
 */
static int
xfs_fsync_flush_log(
	struct xfs_inode	*ip,
	bool			datasync,
	int			*log_flushed)
{
	int			error = 0;
	xfs_csn_t		seq;

	xfs_ilock(ip, XFS_ILOCK_SHARED);
	seq = xfs_fsync_seq(ip, datasync);
	if (seq) {
		error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
					  log_flushed);

		spin_lock(&ip->i_itemp->ili_lock);
		ip->i_itemp->ili_fsync_fields = 0;
		spin_unlock(&ip->i_itemp->ili_lock);
	}
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return error;
}
STATIC int
xfs_file_fsync(
	struct file		*file,
	loff_t			start,
	loff_t			end,
	int			datasync)
{
	struct xfs_inode	*ip = XFS_I(file->f_mapping->host);
	struct xfs_mount	*mp = ip->i_mount;
	int			error, err2;
	int			log_flushed = 0;

	trace_xfs_file_fsync(ip);

	error = file_write_and_wait_range(file, start, end);
	if (error)
		return error;

	if (xfs_is_shutdown(mp))
		return -EIO;

	xfs_iflags_clear(ip, XFS_ITRUNCATED);
	/*
	 * If we have an RT and/or log subvolume we need to make sure to flush
	 * the write cache of the device used for file data first. This is to
	 * ensure newly written file data makes it to disk before logging the
	 * new inode size in case of an extending write.
	 */
	if (XFS_IS_REALTIME_INODE(ip))
		error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev);
	else if (mp->m_logdev_targp != mp->m_ddev_targp)
		error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);

	/*
	 * Any inode that has dirty modifications in the log is pinned. The
	 * racy check here for a pinned inode will not catch modifications
	 * that happen concurrently to the fsync call, but fsync semantics
	 * only require to sync previously completed I/O.
	 */
	if (xfs_ipincount(ip)) {
		err2 = xfs_fsync_flush_log(ip, datasync, &log_flushed);
		if (err2 && !error)
			error = err2;
	}

	/*
	 * If we only have a single device, and the log force above was
	 * a no-op we might have to flush the data device cache here.
	 * This can only happen for fdatasync/O_DSYNC if we were overwriting
	 * an already allocated file and thus do not have any metadata to
	 * commit.
	 */
	if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) &&
	    mp->m_logdev_targp == mp->m_ddev_targp) {
		err2 = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev);
		if (err2 && !error)
			error = err2;
	}

	return error;
}
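
/*
 * Take the inode iolock in the given mode on behalf of an iocb. If the caller
 * asked for non-blocking semantics (IOCB_NOWAIT), try the lock and return
 * -EAGAIN rather than sleeping when it is contended.
 */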
static int
xfs_ilock_iocb(
	struct kiocb		*iocb,
	unsigned int		lock_mode)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));

	if (iocb->ki_flags & IOCB_NOWAIT) {
		if (!xfs_ilock_nowait(ip, lock_mode))
			return -EAGAIN;
	} else {
		xfs_ilock(ip, lock_mode);
	}

	return 0;
}

STATIC ssize_t
xfs_file_dio_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_direct_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	file_accessed(iocb->ki_filp);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}

static noinline ssize_t
xfs_file_dax_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(iocb->ki_filp->f_mapping->host);
	ssize_t			ret = 0;

	trace_xfs_file_dax_read(iocb, to);

	if (!iov_iter_count(to))
		return 0; /* skip atime */

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = dax_iomap_rw(iocb, to, &xfs_read_iomap_ops);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	file_accessed(iocb->ki_filp);
	return ret;
}

STATIC ssize_t
xfs_file_buffered_read(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	ssize_t			ret;

	trace_xfs_file_buffered_read(iocb, to);

	ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
	if (ret)
		return ret;
	ret = generic_file_read_iter(iocb, to);
	xfs_iunlock(ip, XFS_IOLOCK_SHARED);

	return ret;
}
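
/*
 * ->read_iter entry point: dispatch the read to the DAX, direct or buffered
 * I/O path and account the bytes read.
 */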
STATIC ssize_t
xfs_file_read_iter(
	struct kiocb		*iocb,
	struct iov_iter		*to)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_mount	*mp = XFS_I(inode)->i_mount;
	ssize_t			ret = 0;

	XFS_STATS_INC(mp, xs_read_calls);

	if (xfs_is_shutdown(mp))
		return -EIO;

	if (IS_DAX(inode))
		ret = xfs_file_dax_read(iocb, to);
	else if (iocb->ki_flags & IOCB_DIRECT)
		ret = xfs_file_dio_read(iocb, to);
	else
		ret = xfs_file_buffered_read(iocb, to);

	if (ret > 0)
		XFS_STATS_ADD(mp, xs_read_bytes, ret);
	return ret;
}
/*
 * Common pre-write limit and setup checks.
 *
 * Called with the iolock held either shared or exclusive according to
 * @iolock, and returns with it held. Might upgrade the iolock to exclusive
 * if called for a direct write beyond i_size.
 */
STATIC ssize_t
xfs_file_write_checks(
	struct kiocb		*iocb,
	struct iov_iter		*from,
	unsigned int		*iolock)
{
	struct file		*file = iocb->ki_filp;
	struct inode		*inode = file->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			error = 0;
	size_t			count = iov_iter_count(from);
	bool			drained_dio = false;
	loff_t			isize;

restart:
	error = generic_write_checks(iocb, from);
	if (error <= 0)
		return error;

	if (iocb->ki_flags & IOCB_NOWAIT) {
		error = break_layout(inode, false);
		if (error == -EWOULDBLOCK)
			error = -EAGAIN;
	} else {
		error = xfs_break_layouts(inode, iolock, BREAK_WRITE);
	}

	if (error)
		return error;

	/*
	 * For changing security info in file_remove_privs() we need i_rwsem
	 * exclusively.
	 */
	if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) {
		xfs_iunlock(ip, *iolock);
		*iolock = XFS_IOLOCK_EXCL;
		error = xfs_ilock_iocb(iocb, *iolock);
		if (error) {
			*iolock = 0;
			return error;
		}
		goto restart;
	}

	/*
	 * If the offset is beyond the size of the file, we need to zero any
	 * blocks that fall between the existing EOF and the start of this
	 * write. If zeroing is needed and we are currently holding the iolock
	 * shared, we need to update it to exclusive which implies having to
	 * redo all checks before.
	 *
	 * We need to serialise against EOF updates that occur in IO completions
	 * here. We want to make sure that nobody is changing the size while we
	 * do this check until we have placed an IO barrier (i.e. hold the
	 * XFS_IOLOCK_EXCL) that prevents new IO from being dispatched. The
	 * spinlock effectively forms a memory barrier once we have the
	 * XFS_IOLOCK_EXCL so we are guaranteed to see the latest EOF value and
	 * hence be able to correctly determine if we need to run zeroing.
	 *
	 * We can do an unlocked check here safely as IO completion can only
	 * extend EOF. Truncate is locked out at this point, so the EOF can
	 * not move backwards, only forwards. Hence we only need to take the
	 * slow path and spin locks when we are at or beyond the current EOF.
	 */
	if (iocb->ki_pos <= i_size_read(inode))
		goto out;

	spin_lock(&ip->i_flags_lock);
	isize = i_size_read(inode);
	if (iocb->ki_pos > isize) {
		spin_unlock(&ip->i_flags_lock);

		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;

		if (!drained_dio) {
			if (*iolock == XFS_IOLOCK_SHARED) {
				xfs_iunlock(ip, *iolock);
				*iolock = XFS_IOLOCK_EXCL;
				xfs_ilock(ip, *iolock);
				iov_iter_reexpand(from, count);
			}
			/*
			 * We now have an IO submission barrier in place, but
			 * AIO can do EOF updates during IO completion and hence
			 * we now need to wait for all of them to drain. Non-AIO
			 * DIO will have drained before we are given the
			 * XFS_IOLOCK_EXCL, and so for most cases this wait is a
			 * no-op.
			 */
			inode_dio_wait(inode);
			drained_dio = true;
			goto restart;
		}

		trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize);
		error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL);
		if (error)
			return error;
	} else
		spin_unlock(&ip->i_flags_lock);

out:
	return kiocb_modified(iocb);
}
static int
xfs_dio_write_end_io(
	struct kiocb		*iocb,
	ssize_t			size,
	int			error,
	unsigned		flags)
{
	struct inode		*inode = file_inode(iocb->ki_filp);
	struct xfs_inode	*ip = XFS_I(inode);
	loff_t			offset = iocb->ki_pos;
	unsigned int		nofs_flag;

	trace_xfs_end_io_direct_write(ip, offset, size);

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (error)
		return error;
	if (!size)
		return 0;

	/*
	 * Capture amount written on completion as we can't reliably account
	 * for it on submission.
	 */
	XFS_STATS_ADD(ip->i_mount, xs_write_bytes, size);

	/*
	 * We can allocate memory here while doing writeback on behalf of
	 * memory reclaim. To avoid memory allocation deadlocks set the
	 * task-wide nofs context for the following operations.
	 */
	nofs_flag = memalloc_nofs_save();

	if (flags & IOMAP_DIO_COW) {
		error = xfs_reflink_end_cow(ip, offset, size);
		if (error)
			goto out;
	}

	/*
	 * Unwritten conversion updates the in-core isize after extent
	 * conversion but before updating the on-disk size. Updating isize any
	 * earlier allows a racing dio read to find unwritten extents before
	 * they are converted.
	 */
	if (flags & IOMAP_DIO_UNWRITTEN) {
		error = xfs_iomap_write_unwritten(ip, offset, size, true);
		goto out;
	}

	/*
	 * We need to update the in-core inode size here so that we don't end up
	 * with the on-disk inode size being outside the in-core inode size. We
	 * have no other method of updating EOF for AIO, so always do it here
	 * if necessary.
	 *
	 * We need to lock the test/set EOF update as we can be racing with
	 * other IO completions here to update the EOF. Failing to serialise
	 * here can result in EOF moving backwards and Bad Things Happen when
	 * that occurs.
	 *
	 * As IO completion only ever extends EOF, we can do an unlocked check
	 * here to avoid taking the spinlock. If we land within the current EOF,
	 * then we do not need to do an extending update at all, and we don't
	 * need to take the lock to check this. If we race with an update moving
	 * EOF, then we'll either still be beyond EOF and need to take the lock,
	 * or we'll be within EOF and we don't need to take it at all.
	 */
	if (offset + size <= i_size_read(inode))
		goto out;

	spin_lock(&ip->i_flags_lock);
	if (offset + size > i_size_read(inode)) {
		i_size_write(inode, offset + size);
		spin_unlock(&ip->i_flags_lock);
		error = xfs_setfilesize(ip, offset, size);
	} else {
		spin_unlock(&ip->i_flags_lock);
	}

out:
	memalloc_nofs_restore(nofs_flag);
	return error;
}

static const struct iomap_dio_ops xfs_dio_write_ops = {
	.end_io		= xfs_dio_write_end_io,
};
/*
 * Handle block aligned direct I/O writes
 */
static noinline ssize_t
xfs_file_dio_write_aligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	ssize_t			ret;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock);
	if (ret)
		goto out_unlock;

	/*
	 * We don't need to hold the IOLOCK exclusively across the IO, so demote
	 * the iolock back to shared if we had to take the exclusive lock in
	 * xfs_file_write_checks() for other reasons.
	 */
	if (iolock == XFS_IOLOCK_EXCL) {
		xfs_ilock_demote(ip, XFS_IOLOCK_EXCL);
		iolock = XFS_IOLOCK_SHARED;
	}
	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			   &xfs_dio_write_ops, 0, NULL, 0);
out_unlock:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}
/*
 * Handle block unaligned direct I/O writes
 *
 * In most cases direct I/O writes will be done holding IOLOCK_SHARED, allowing
 * them to be done in parallel with reads and other direct I/O writes. However,
 * if the I/O is not aligned to filesystem blocks, the direct I/O layer may need
 * to do sub-block zeroing and that requires serialisation against other direct
 * I/O to the same block. In this case we need to serialise the submission of
 * the unaligned I/O so that we don't get racing block zeroing in the dio layer.
 * In the case where sub-block zeroing is not required, we can do concurrent
 * sub-block dios to the same block successfully.
 *
 * Optimistically submit the I/O using the shared lock first, but use the
 * IOMAP_DIO_OVERWRITE_ONLY flag to tell the lower layers to return -EAGAIN
 * if block allocation or partial block zeroing would be required. In that case
 * we try again with the exclusive lock.
 */
static noinline ssize_t
xfs_file_dio_write_unaligned(
	struct xfs_inode	*ip,
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	size_t			isize = i_size_read(VFS_I(ip));
	size_t			count = iov_iter_count(from);
	unsigned int		iolock = XFS_IOLOCK_SHARED;
	unsigned int		flags = IOMAP_DIO_OVERWRITE_ONLY;
	ssize_t			ret;

	/*
	 * Extending writes need exclusivity because of the sub-block zeroing
	 * that the DIO code always does for partial tail blocks beyond EOF, so
	 * don't even bother trying the fast path in this case.
	 */
	if (iocb->ki_pos > isize || iocb->ki_pos + count >= isize) {
		if (iocb->ki_flags & IOCB_NOWAIT)
			return -EAGAIN;
retry_exclusive:
		iolock = XFS_IOLOCK_EXCL;
		flags = IOMAP_DIO_FORCE_WAIT;
	}

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;

	/*
	 * We can't properly handle unaligned direct I/O to reflink files yet,
	 * as we can't unshare a partial block.
	 */
	if (xfs_is_cow_inode(ip)) {
		trace_xfs_reflink_bounce_dio_write(iocb, from);
		ret = -ENOTBLK;
		goto out_unlock;
	}

	ret = xfs_file_write_checks(iocb, from, &iolock);
	if (ret)
		goto out_unlock;

	/*
	 * If we are doing exclusive unaligned I/O, this must be the only I/O
	 * in-flight. Otherwise we risk data corruption due to unwritten extent
	 * conversions from the AIO end_io handler. Wait for all other I/O to
	 * drain first.
	 */
	if (flags & IOMAP_DIO_FORCE_WAIT)
		inode_dio_wait(VFS_I(ip));

	trace_xfs_file_direct_write(iocb, from);
	ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
			   &xfs_dio_write_ops, flags, NULL, 0);

	/*
	 * Retry unaligned I/O with exclusive blocking semantics if the DIO
	 * layer rejected it for mapping or locking reasons. If we are doing
	 * nonblocking user I/O, propagate the error.
	 */
	if (ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT)) {
		ASSERT(flags & IOMAP_DIO_OVERWRITE_ONLY);
		xfs_iunlock(ip, iolock);
		goto retry_exclusive;
	}

out_unlock:
	if (iolock)
		xfs_iunlock(ip, iolock);
	return ret;
}
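
/*
 * Direct I/O writes: check the device sector alignment and then hand the I/O
 * off to the block aligned or block unaligned submission path.
 */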
static ssize_t
xfs_file_dio_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct xfs_inode	*ip = XFS_I(file_inode(iocb->ki_filp));
	struct xfs_buftarg	*target = xfs_inode_buftarg(ip);
	size_t			count = iov_iter_count(from);

	/* direct I/O must be aligned to device logical sector size */
	if ((iocb->ki_pos | count) & target->bt_logical_sectormask)
		return -EINVAL;
	if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask)
		return xfs_file_dio_write_unaligned(ip, iocb, from);
	return xfs_file_dio_write_aligned(ip, iocb, from);
}
static noinline ssize_t
xfs_file_dax_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	unsigned int		iolock = XFS_IOLOCK_EXCL;
	ssize_t			ret, error = 0;
	loff_t			pos;

	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;
	ret = xfs_file_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	pos = iocb->ki_pos;

	trace_xfs_file_dax_write(iocb, from);
	ret = dax_iomap_rw(iocb, from, &xfs_dax_write_iomap_ops);
	if (ret > 0 && iocb->ki_pos > i_size_read(inode)) {
		i_size_write(inode, iocb->ki_pos);
		error = xfs_setfilesize(ip, pos, ret);
	}
out:
	if (iolock)
		xfs_iunlock(ip, iolock);
	if (error)
		return error;

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);

		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}
STATIC ssize_t
xfs_file_buffered_write(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	bool			cleared_space = false;
	unsigned int		iolock;

write_retry:
	iolock = XFS_IOLOCK_EXCL;
	ret = xfs_ilock_iocb(iocb, iolock);
	if (ret)
		return ret;

	ret = xfs_file_write_checks(iocb, from, &iolock);
	if (ret)
		goto out;

	/* We can write back this queue in page reclaim */
	current->backing_dev_info = inode_to_bdi(inode);

	trace_xfs_file_buffered_write(iocb, from);
	ret = iomap_file_buffered_write(iocb, from,
			&xfs_buffered_write_iomap_ops);
	if (likely(ret >= 0))
		iocb->ki_pos += ret;

	/*
	 * If we hit a space limit, try to free up some lingering preallocated
	 * space before returning an error. In the case of ENOSPC, first try to
	 * write back all dirty inodes to free up some of the excess reserved
	 * metadata space. This reduces the chances that the eofblocks scan
	 * waits on dirty mappings. Since xfs_flush_inodes() is serialized, this
	 * also behaves as a filter to prevent too many eofblocks scans from
	 * running at the same time. Use a synchronous scan to increase the
	 * effectiveness of the scan.
	 */
	if (ret == -EDQUOT && !cleared_space) {
		xfs_iunlock(ip, iolock);
		xfs_blockgc_free_quota(ip, XFS_ICWALK_FLAG_SYNC);
		cleared_space = true;
		goto write_retry;
	} else if (ret == -ENOSPC && !cleared_space) {
		struct xfs_icwalk	icw = {0};

		cleared_space = true;
		xfs_flush_inodes(ip->i_mount);

		xfs_iunlock(ip, iolock);
		icw.icw_flags = XFS_ICWALK_FLAG_SYNC;
		xfs_blockgc_free_space(ip->i_mount, &icw);
		goto write_retry;
	}

	current->backing_dev_info = NULL;
out:
	if (iolock)
		xfs_iunlock(ip, iolock);

	if (ret > 0) {
		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
		/* Handle various SYNC-type writes */
		ret = generic_write_sync(iocb, ret);
	}
	return ret;
}
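
/*
 * ->write_iter entry point: dispatch the write to the DAX, direct or buffered
 * I/O path. Direct I/O may fall back to buffered I/O only for reflink CoW.
 */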
STATIC ssize_t
xfs_file_write_iter(
	struct kiocb		*iocb,
	struct iov_iter		*from)
{
	struct inode		*inode = iocb->ki_filp->f_mapping->host;
	struct xfs_inode	*ip = XFS_I(inode);
	ssize_t			ret;
	size_t			ocount = iov_iter_count(from);

	XFS_STATS_INC(ip->i_mount, xs_write_calls);

	if (ocount == 0)
		return 0;

	if (xfs_is_shutdown(ip->i_mount))
		return -EIO;

	if (IS_DAX(inode))
		return xfs_file_dax_write(iocb, from);

	if (iocb->ki_flags & IOCB_DIRECT) {
		/*
		 * Allow a directio write to fall back to a buffered
		 * write *only* in the case that we're doing a reflink
		 * CoW. In all other directio scenarios we do not
		 * allow an operation to fall back to buffered mode.
		 */
		ret = xfs_file_dio_write(iocb, from);
		if (ret != -ENOTBLK)
			return ret;
	}

	return xfs_file_buffered_write(iocb, from);
}
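
/*
 * Drop the mmap lock while we sleep waiting for a busy DAX page to be
 * released, then retake it so the caller can re-check for busy pages.
 */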
static void
xfs_wait_dax_page(
	struct inode		*inode)
{
	struct xfs_inode	*ip = XFS_I(inode);

	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
	schedule();
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
}

int
xfs_break_dax_layouts(
	struct inode		*inode,
	bool			*retry)
{
	struct page		*page;

	ASSERT(xfs_isilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL));

	page = dax_layout_busy_page(inode->i_mapping);
	if (!page)
		return 0;

	*retry = true;
	return ___wait_var_event(&page->_refcount,
			atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE,
			0, 0, xfs_wait_dax_page(inode));
}

int
xfs_break_layouts(
	struct inode		*inode,
	uint			*iolock,
	enum layout_break_reason reason)
{
	bool			retry;
	int			error;

	ASSERT(xfs_isilocked(XFS_I(inode), XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));

	do {
		retry = false;
		switch (reason) {
		case BREAK_UNMAP:
			error = xfs_break_dax_layouts(inode, &retry);
			if (error || retry)
				break;
			fallthrough;
		case BREAK_WRITE:
			error = xfs_break_leased_layouts(inode, iolock, &retry);
			break;
		default:
			WARN_ON_ONCE(1);
			error = -EINVAL;
		}
	} while (error == 0 && retry);

	return error;
}

/* Does this file, inode, or mount want synchronous writes? */
static inline bool xfs_file_sync_writes(struct file *filp)
{
	struct xfs_inode	*ip = XFS_I(file_inode(filp));

	if (xfs_has_wsync(ip->i_mount))
		return true;
	if (filp->f_flags & (__O_SYNC | O_DSYNC))
		return true;
	if (IS_SYNC(file_inode(filp)))
		return true;

	return false;
}
#define	XFS_FALLOC_FL_SUPPORTED						\
		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
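
/*
 * Preallocate, punch, zero, collapse, insert or unshare a range of the file
 * via fallocate(). All operations run with both the IOLOCK and the MMAPLOCK
 * held exclusively.
 */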
STATIC long
xfs_file_fallocate(
	struct file		*file,
	int			mode,
	loff_t			offset,
	loff_t			len)
{
	struct inode		*inode = file_inode(file);
	struct xfs_inode	*ip = XFS_I(inode);
	long			error;
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	loff_t			new_size = 0;
	bool			do_file_insert = false;

	if (!S_ISREG(inode->i_mode))
		return -EINVAL;
	if (mode & ~XFS_FALLOC_FL_SUPPORTED)
		return -EOPNOTSUPP;

	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP);
	if (error)
		goto out_unlock;

	/*
	 * Must wait for all AIO to complete before we continue as AIO can
	 * change the file size on completion without holding any locks we
	 * currently hold. We must do this first because AIO can update both
	 * the on disk and in memory inode sizes, and the operations that follow
	 * require the in-memory size to be fully up-to-date.
	 */
	inode_dio_wait(inode);
	/*
	 * Now that AIO and DIO have drained, we flush and (if necessary)
	 * invalidate the cached range over the first operation we are about
	 * to run.
	 *
	 * We care about zero and collapse here because they both run a hole
	 * punch over the range first. Because that can zero data, and the range
	 * of invalidation for the shift operations is much larger, we still do
	 * the required flush for collapse in xfs_prepare_shift().
	 *
	 * Insert has the same range requirements as collapse, and we extend the
	 * file first which can zero data. Hence insert has the same
	 * flush/invalidate requirements as collapse and so they are both
	 * handled at the right time by xfs_prepare_shift().
	 */
	if (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE |
		    FALLOC_FL_COLLAPSE_RANGE)) {
		error = xfs_flush_unmap_range(ip, offset, len);
		if (error)
			goto out_unlock;
	}
	error = file_modified(file);
	if (error)
		goto out_unlock;

	if (mode & FALLOC_FL_PUNCH_HOLE) {
		error = xfs_free_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_COLLAPSE_RANGE) {
		if (!xfs_is_falloc_aligned(ip, offset, len)) {
			error = -EINVAL;
			goto out_unlock;
		}

		/*
		 * There is no need to overlap collapse range with EOF,
		 * in which case it is effectively a truncate operation
		 */
		if (offset + len >= i_size_read(inode)) {
			error = -EINVAL;
			goto out_unlock;
		}

		new_size = i_size_read(inode) - len;
		error = xfs_collapse_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	} else if (mode & FALLOC_FL_INSERT_RANGE) {
		loff_t		isize = i_size_read(inode);

		if (!xfs_is_falloc_aligned(ip, offset, len)) {
			error = -EINVAL;
			goto out_unlock;
		}

		/*
		 * New inode size must not exceed ->s_maxbytes, accounting for
		 * possible signed overflow.
		 */
		if (inode->i_sb->s_maxbytes - isize < len) {
			error = -EFBIG;
			goto out_unlock;
		}
		new_size = isize + len;

		/* Offset should be less than i_size */
		if (offset >= isize) {
			error = -EINVAL;
			goto out_unlock;
		}
		do_file_insert = true;
	} else {
		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
		    offset + len > i_size_read(inode)) {
			new_size = offset + len;
			error = inode_newsize_ok(inode, new_size);
			if (error)
				goto out_unlock;
		}

		if (mode & FALLOC_FL_ZERO_RANGE) {
			/*
			 * Punch a hole and prealloc the range. We use a hole
			 * punch rather than unwritten extent conversion for two
			 * reasons:
			 *
			 * 1.) Hole punch handles partial block zeroing for us.
			 * 2.) If prealloc returns ENOSPC, the file range is
			 * still zero-valued by virtue of the hole punch.
			 */
			unsigned int blksize = i_blocksize(inode);

			trace_xfs_zero_file_space(ip);

			error = xfs_free_file_space(ip, offset, len);
			if (error)
				goto out_unlock;

			len = round_up(offset + len, blksize) -
			      round_down(offset, blksize);
			offset = round_down(offset, blksize);
		} else if (mode & FALLOC_FL_UNSHARE_RANGE) {
			error = xfs_reflink_unshare(ip, offset, len);
			if (error)
				goto out_unlock;
		} else {
			/*
			 * If always_cow mode we can't use preallocations and
			 * thus should not create them.
			 */
			if (xfs_is_always_cow_inode(ip)) {
				error = -EOPNOTSUPP;
				goto out_unlock;
			}
		}

		if (!xfs_is_always_cow_inode(ip)) {
			error = xfs_alloc_file_space(ip, offset, len);
			if (error)
				goto out_unlock;
		}
	}

	/* Change file size if needed */
	if (new_size) {
		struct iattr iattr;

		iattr.ia_valid = ATTR_SIZE;
		iattr.ia_size = new_size;
		error = xfs_vn_setattr_size(file_mnt_user_ns(file),
					    file_dentry(file), &iattr);
		if (error)
			goto out_unlock;
	}
	/*
	 * Perform hole insertion now that the file size has been updated so
	 * that if we crash during the operation we don't leave shifted extents
	 * past EOF and hence lose access to the data that is contained within
	 * them.
	 */
	if (do_file_insert) {
		error = xfs_insert_file_space(ip, offset, len);
		if (error)
			goto out_unlock;
	}

	if (xfs_file_sync_writes(file))
		error = xfs_log_force_inode(ip);

out_unlock:
	xfs_iunlock(ip, iolock);
	return error;
}
STATIC int
xfs_file_fadvise(
	struct file	*file,
	loff_t		start,
	loff_t		end,
	int		advice)
{
	struct xfs_inode *ip = XFS_I(file_inode(file));
	int ret;
	int lockflags = 0;

	/*
	 * Operations creating pages in page cache need protection from hole
	 * punching and similar ops
	 */
	if (advice == POSIX_FADV_WILLNEED) {
		lockflags = XFS_IOLOCK_SHARED;
		xfs_ilock(ip, lockflags);
	}
	ret = generic_fadvise(file, start, end, advice);
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	return ret;
}
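
/*
 * ->remap_file_range entry point: share (reflink) or deduplicate a range of
 * blocks from file_in into file_out.
 */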
STATIC loff_t
xfs_file_remap_range(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	struct xfs_mount	*mp = src->i_mount;
	loff_t			remapped = 0;
	xfs_extlen_t		cowextsize;
	int			ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	if (!xfs_has_reflink(mp))
		return -EOPNOTSUPP;

	if (xfs_is_shutdown(mp))
		return -EIO;

	/* Prepare and then clone file data. */
	ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
			&len, remap_flags);
	if (ret || len == 0)
		return ret;

	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);

	ret = xfs_reflink_remap_blocks(src, pos_in, dest, pos_out, len,
			&remapped);
	if (ret)
		goto out_unlock;

	/*
	 * Carry the cowextsize hint from src to dest if we're sharing the
	 * entire source file to the entire destination file, the source file
	 * has a cowextsize hint, and the destination file does not.
	 */
	cowextsize = 0;
	if (pos_in == 0 && len == i_size_read(inode_in) &&
	    (src->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE) &&
	    pos_out == 0 && len >= i_size_read(inode_out) &&
	    !(dest->i_diflags2 & XFS_DIFLAG2_COWEXTSIZE))
		cowextsize = src->i_cowextsize;

	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize,
			remap_flags);
	if (ret)
		goto out_unlock;

	if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out))
		xfs_log_force_inode(dest);
out_unlock:
	xfs_iunlock2_io_mmap(src, dest);
	if (ret)
		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
	return remapped > 0 ? remapped : ret;
}
STATIC int
xfs_file_open(
	struct inode	*inode,
	struct file	*file)
{
	if (xfs_is_shutdown(XFS_M(inode->i_sb)))
		return -EIO;
	file->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC;
	return generic_file_open(inode, file);
}

STATIC int
xfs_dir_open(
	struct inode	*inode,
	struct file	*file)
{
	struct xfs_inode *ip = XFS_I(inode);
	unsigned int	mode;
	int		error;

	error = xfs_file_open(inode, file);
	if (error)
		return error;

	/*
	 * If there are any blocks, read-ahead block 0 as we're almost
	 * certain to have the next operation be a read there.
	 */
	mode = xfs_ilock_data_map_shared(ip);
	if (ip->i_df.if_nextents > 0)
		error = xfs_dir3_data_readahead(ip, 0, 0);
	xfs_iunlock(ip, mode);
	return error;
}
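
/*
 * Called when the last reference to this struct file is dropped; xfs_release()
 * uses the opportunity to trim speculative post-EOF preallocation where
 * appropriate.
 */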
STATIC int
xfs_file_release(
	struct inode	*inode,
	struct file	*filp)
{
	return xfs_release(XFS_I(inode));
}
STATIC int
xfs_file_readdir(
	struct file	*file,
	struct dir_context *ctx)
{
	struct inode	*inode = file_inode(file);
	xfs_inode_t	*ip = XFS_I(inode);
	size_t		bufsize;

	/*
	 * The Linux API doesn't pass the total size of the buffer we read
	 * into down to the filesystem. With the filldir concept it's not
	 * needed for correct information, but the XFS dir2 leaf code wants an
	 * estimate of the buffer size to calculate its readahead window and
	 * size the buffers used for mapping to physical blocks.
	 *
	 * Try to give it an estimate that's good enough, maybe at some
	 * point we can change the ->readdir prototype to include the
	 * buffer size. For now we use the current glibc buffer size.
	 */
	bufsize = (size_t)min_t(loff_t, XFS_READDIR_BUFSIZE, ip->i_disk_size);

	return xfs_readdir(NULL, ip, ctx, bufsize);
}
STATIC loff_t
xfs_file_llseek(
	struct file	*file,
	loff_t		offset,
	int		whence)
{
	struct inode		*inode = file->f_mapping->host;

	if (xfs_is_shutdown(XFS_I(inode)->i_mount))
		return -EIO;

	switch (whence) {
	default:
		return generic_file_llseek(file, offset, whence);
	case SEEK_HOLE:
		offset = iomap_seek_hole(inode, offset, &xfs_seek_iomap_ops);
		break;
	case SEEK_DATA:
		offset = iomap_seek_data(inode, offset, &xfs_seek_iomap_ops);
		break;
	}

	if (offset < 0)
		return offset;
	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
}
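
/*
 * Thin wrapper around dax_iomap_fault() that selects the write or read iomap
 * ops for the fault. Without CONFIG_FS_DAX this should never be reached, so
 * the stub asserts and returns SIGBUS.
 */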
#ifdef CONFIG_FS_DAX
static inline vm_fault_t
xfs_dax_fault(
	struct vm_fault		*vmf,
	enum page_entry_size	pe_size,
	bool			write_fault,
	pfn_t			*pfn)
{
	return dax_iomap_fault(vmf, pe_size, pfn, NULL,
			(write_fault && !vmf->cow_page) ?
				&xfs_dax_write_iomap_ops :
				&xfs_read_iomap_ops);
}
#else
static inline vm_fault_t
xfs_dax_fault(
	struct vm_fault		*vmf,
	enum page_entry_size	pe_size,
	bool			write_fault,
	pfn_t			*pfn)
{
	ASSERT(0);
	return VM_FAULT_SIGBUS;
}
#endif

/*
 * Locking for serialisation of IO during page faults. This results in a lock
 * ordering of:
 *
 * mmap_lock (MM)
 *   sb_start_pagefault(vfs, freeze)
 *     invalidate_lock (vfs/XFS_MMAPLOCK - truncate serialisation)
 *       page_lock (MM)
 *         i_lock (XFS - extent map serialisation)
 */
static vm_fault_t
__xfs_filemap_fault(
	struct vm_fault		*vmf,
	enum page_entry_size	pe_size,
	bool			write_fault)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	struct xfs_inode	*ip = XFS_I(inode);
	vm_fault_t		ret;

	trace_xfs_filemap_fault(ip, pe_size, write_fault);

	if (write_fault) {
		sb_start_pagefault(inode->i_sb);
		file_update_time(vmf->vma->vm_file);
	}

	if (IS_DAX(inode)) {
		pfn_t pfn;

		xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
		ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn);
		if (ret & VM_FAULT_NEEDDSYNC)
			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
		xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	} else {
		if (write_fault) {
			xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
			ret = iomap_page_mkwrite(vmf,
					&xfs_buffered_write_iomap_ops);
			xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
		} else {
			ret = filemap_fault(vmf);
		}
	}

	if (write_fault)
		sb_end_pagefault(inode->i_sb);
	return ret;
}
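
/*
 * A fault only counts as a write fault if it is a write to a shared mapping;
 * writes to MAP_PRIVATE mappings take a copy-on-write copy of the page and so
 * never dirty the file itself.
 */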
static inline bool
xfs_is_write_fault(
	struct vm_fault		*vmf)
{
	return (vmf->flags & FAULT_FLAG_WRITE) &&
	       (vmf->vma->vm_flags & VM_SHARED);
}

static vm_fault_t
xfs_filemap_fault(
	struct vm_fault		*vmf)
{
	/* DAX can shortcut the normal fault path on write faults! */
	return __xfs_filemap_fault(vmf, PE_SIZE_PTE,
			IS_DAX(file_inode(vmf->vma->vm_file)) &&
			xfs_is_write_fault(vmf));
}

static vm_fault_t
xfs_filemap_huge_fault(
	struct vm_fault		*vmf,
	enum page_entry_size	pe_size)
{
	if (!IS_DAX(file_inode(vmf->vma->vm_file)))
		return VM_FAULT_FALLBACK;

	/* DAX can shortcut the normal fault path on write faults! */
	return __xfs_filemap_fault(vmf, pe_size,
			xfs_is_write_fault(vmf));
}

static vm_fault_t
xfs_filemap_page_mkwrite(
	struct vm_fault		*vmf)
{
	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
/*
 * pfn_mkwrite was originally intended to ensure we capture time stamp updates
 * on write faults. In reality, it needs to serialise against truncate and
 * prepare memory for writing, so handle it as a standard write fault.
 */
static vm_fault_t
xfs_filemap_pfn_mkwrite(
	struct vm_fault		*vmf)
{
	return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
}
static vm_fault_t
xfs_filemap_map_pages(
	struct vm_fault		*vmf,
	pgoff_t			start_pgoff,
	pgoff_t			end_pgoff)
{
	struct inode		*inode = file_inode(vmf->vma->vm_file);
	vm_fault_t		ret;

	xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	ret = filemap_map_pages(vmf, start_pgoff, end_pgoff);
	xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
	return ret;
}

static const struct vm_operations_struct xfs_file_vm_ops = {
	.fault		= xfs_filemap_fault,
	.huge_fault	= xfs_filemap_huge_fault,
	.map_pages	= xfs_filemap_map_pages,
	.page_mkwrite	= xfs_filemap_page_mkwrite,
	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
};

STATIC int
xfs_file_mmap(
	struct file		*file,
	struct vm_area_struct	*vma)
{
	struct inode		*inode = file_inode(file);
	struct xfs_buftarg	*target = xfs_inode_buftarg(XFS_I(inode));
	/*
	 * We don't support synchronous mappings for non-DAX files, nor for
	 * DAX files if the underlying dax_device is not synchronous.
	 */
	if (!daxdev_mapping_supported(vma, target->bt_daxdev))
		return -EOPNOTSUPP;

	file_accessed(file);
	vma->vm_ops = &xfs_file_vm_ops;
	if (IS_DAX(inode))
		vm_flags_set(vma, VM_HUGEPAGE);
	return 0;
}
const struct file_operations xfs_file_operations = {
	.llseek		= xfs_file_llseek,
	.read_iter	= xfs_file_read_iter,
	.write_iter	= xfs_file_write_iter,
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
	.iopoll		= iocb_bio_iopoll,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.mmap		= xfs_file_mmap,
	.mmap_supported_flags = MAP_SYNC,
	.open		= xfs_file_open,
	.release	= xfs_file_release,
	.fsync		= xfs_file_fsync,
	.get_unmapped_area = thp_get_unmapped_area,
	.fallocate	= xfs_file_fallocate,
	.fadvise	= xfs_file_fadvise,
	.remap_file_range = xfs_file_remap_range,
};

const struct file_operations xfs_dir_file_operations = {
	.open		= xfs_dir_open,
	.read		= generic_read_dir,
	.iterate_shared	= xfs_file_readdir,
	.llseek		= generic_file_llseek,
	.unlocked_ioctl	= xfs_file_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= xfs_file_compat_ioctl,
#endif
	.fsync		= xfs_dir_fsync,
};