/* remap_range.c (14 KB) */
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. #include <linux/slab.h>
  3. #include <linux/stat.h>
  4. #include <linux/sched/xacct.h>
  5. #include <linux/fcntl.h>
  6. #include <linux/file.h>
  7. #include <linux/uio.h>
  8. #include <linux/fsnotify.h>
  9. #include <linux/security.h>
  10. #include <linux/export.h>
  11. #include <linux/syscalls.h>
  12. #include <linux/pagemap.h>
  13. #include <linux/splice.h>
  14. #include <linux/compat.h>
  15. #include <linux/mount.h>
  16. #include <linux/fs.h>
  17. #include <linux/dax.h>
  18. #include "internal.h"
  19. #include <linux/uaccess.h>
  20. #include <asm/unistd.h>
  21. /*
  22. * Performs necessary checks before doing a clone.
  23. *
  24. * Can adjust amount of bytes to clone via @req_count argument.
  25. * Returns appropriate error code that caller should return or
  26. * zero in case the clone should be allowed.
  27. */
  28. static int generic_remap_checks(struct file *file_in, loff_t pos_in,
  29. struct file *file_out, loff_t pos_out,
  30. loff_t *req_count, unsigned int remap_flags)
  31. {
  32. struct inode *inode_in = file_in->f_mapping->host;
  33. struct inode *inode_out = file_out->f_mapping->host;
  34. uint64_t count = *req_count;
  35. uint64_t bcount;
  36. loff_t size_in, size_out;
  37. loff_t bs = inode_out->i_sb->s_blocksize;
  38. int ret;
  39. /* The start of both ranges must be aligned to an fs block. */
  40. if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
  41. return -EINVAL;
  42. /* Ensure offsets don't wrap. */
  43. if (pos_in + count < pos_in || pos_out + count < pos_out)
  44. return -EINVAL;
  45. size_in = i_size_read(inode_in);
  46. size_out = i_size_read(inode_out);
  47. /* Dedupe requires both ranges to be within EOF. */
  48. if ((remap_flags & REMAP_FILE_DEDUP) &&
  49. (pos_in >= size_in || pos_in + count > size_in ||
  50. pos_out >= size_out || pos_out + count > size_out))
  51. return -EINVAL;
  52. /* Ensure the infile range is within the infile. */
  53. if (pos_in >= size_in)
  54. return -EINVAL;
  55. count = min(count, size_in - (uint64_t)pos_in);
  56. ret = generic_write_check_limits(file_out, pos_out, &count);
  57. if (ret)
  58. return ret;
  59. /*
  60. * If the user wanted us to link to the infile's EOF, round up to the
  61. * next block boundary for this check.
  62. *
  63. * Otherwise, make sure the count is also block-aligned, having
  64. * already confirmed the starting offsets' block alignment.
  65. */
  66. if (pos_in + count == size_in &&
  67. (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) {
  68. bcount = ALIGN(size_in, bs) - pos_in;
  69. } else {
  70. if (!IS_ALIGNED(count, bs))
  71. count = ALIGN_DOWN(count, bs);
  72. bcount = count;
  73. }
  74. /* Don't allow overlapped cloning within the same file. */
  75. if (inode_in == inode_out &&
  76. pos_out + bcount > pos_in &&
  77. pos_out < pos_in + bcount)
  78. return -EINVAL;
  79. /*
  80. * We shortened the request but the caller can't deal with that, so
  81. * bounce the request back to userspace.
  82. */
  83. if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
  84. return -EINVAL;
  85. *req_count = count;
  86. return 0;
  87. }
  88. static int remap_verify_area(struct file *file, loff_t pos, loff_t len,
  89. bool write)
  90. {
  91. if (unlikely(pos < 0 || len < 0))
  92. return -EINVAL;
  93. if (unlikely((loff_t) (pos + len) < 0))
  94. return -EINVAL;
  95. return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
  96. }
  97. /*
  98. * Ensure that we don't remap a partial EOF block in the middle of something
  99. * else. Assume that the offsets have already been checked for block
  100. * alignment.
  101. *
  102. * For clone we only link a partial EOF block above or at the destination file's
  103. * EOF. For deduplication we accept a partial EOF block only if it ends at the
  104. * destination file's EOF (can not link it into the middle of a file).
  105. *
  106. * Shorten the request if possible.
  107. */
  108. static int generic_remap_check_len(struct inode *inode_in,
  109. struct inode *inode_out,
  110. loff_t pos_out,
  111. loff_t *len,
  112. unsigned int remap_flags)
  113. {
  114. u64 blkmask = i_blocksize(inode_in) - 1;
  115. loff_t new_len = *len;
  116. if ((*len & blkmask) == 0)
  117. return 0;
  118. if (pos_out + *len < i_size_read(inode_out))
  119. new_len &= ~blkmask;
  120. if (new_len == *len)
  121. return 0;
  122. if (remap_flags & REMAP_FILE_CAN_SHORTEN) {
  123. *len = new_len;
  124. return 0;
  125. }
  126. return (remap_flags & REMAP_FILE_DEDUP) ? -EBADE : -EINVAL;
  127. }
  128. /* Read a page's worth of file data into the page cache. */
  129. static struct folio *vfs_dedupe_get_folio(struct file *file, loff_t pos)
  130. {
  131. return read_mapping_folio(file->f_mapping, pos >> PAGE_SHIFT, file);
  132. }
  133. /*
  134. * Lock two folios, ensuring that we lock in offset order if the folios
  135. * are from the same file.
  136. */
  137. static void vfs_lock_two_folios(struct folio *folio1, struct folio *folio2)
  138. {
  139. /* Always lock in order of increasing index. */
  140. if (folio1->index > folio2->index)
  141. swap(folio1, folio2);
  142. folio_lock(folio1);
  143. if (folio1 != folio2)
  144. folio_lock(folio2);
  145. }
  /* Drop the locks on two folios; a folio passed twice is unlocked once. */
  static void vfs_unlock_two_folios(struct folio *folio1, struct folio *folio2)
  {
          folio_unlock(folio1);
          if (folio1 == folio2)
                  return;
          folio_unlock(folio2);
  }
  153. /*
  154. * Compare extents of two files to see if they are the same.
  155. * Caller must have locked both inodes to prevent write races.
  156. */
  157. static int vfs_dedupe_file_range_compare(struct file *src, loff_t srcoff,
  158. struct file *dest, loff_t dstoff,
  159. loff_t len, bool *is_same)
  160. {
  161. bool same = true;
  162. int error = -EINVAL;
  163. while (len) {
  164. struct folio *src_folio, *dst_folio;
  165. void *src_addr, *dst_addr;
  166. loff_t cmp_len = min(PAGE_SIZE - offset_in_page(srcoff),
  167. PAGE_SIZE - offset_in_page(dstoff));
  168. cmp_len = min(cmp_len, len);
  169. if (cmp_len <= 0)
  170. goto out_error;
  171. src_folio = vfs_dedupe_get_folio(src, srcoff);
  172. if (IS_ERR(src_folio)) {
  173. error = PTR_ERR(src_folio);
  174. goto out_error;
  175. }
  176. dst_folio = vfs_dedupe_get_folio(dest, dstoff);
  177. if (IS_ERR(dst_folio)) {
  178. error = PTR_ERR(dst_folio);
  179. folio_put(src_folio);
  180. goto out_error;
  181. }
  182. vfs_lock_two_folios(src_folio, dst_folio);
  183. /*
  184. * Now that we've locked both folios, make sure they're still
  185. * mapped to the file data we're interested in. If not,
  186. * someone is invalidating pages on us and we lose.
  187. */
  188. if (!folio_test_uptodate(src_folio) || !folio_test_uptodate(dst_folio) ||
  189. src_folio->mapping != src->f_mapping ||
  190. dst_folio->mapping != dest->f_mapping) {
  191. same = false;
  192. goto unlock;
  193. }
  194. src_addr = kmap_local_folio(src_folio,
  195. offset_in_folio(src_folio, srcoff));
  196. dst_addr = kmap_local_folio(dst_folio,
  197. offset_in_folio(dst_folio, dstoff));
  198. flush_dcache_folio(src_folio);
  199. flush_dcache_folio(dst_folio);
  200. if (memcmp(src_addr, dst_addr, cmp_len))
  201. same = false;
  202. kunmap_local(dst_addr);
  203. kunmap_local(src_addr);
  204. unlock:
  205. vfs_unlock_two_folios(src_folio, dst_folio);
  206. folio_put(dst_folio);
  207. folio_put(src_folio);
  208. if (!same)
  209. break;
  210. srcoff += cmp_len;
  211. dstoff += cmp_len;
  212. len -= cmp_len;
  213. }
  214. *is_same = same;
  215. return 0;
  216. out_error:
  217. return error;
  218. }
  219. /*
  220. * Check that the two inodes are eligible for cloning, the ranges make
  221. * sense, and then flush all dirty data. Caller must ensure that the
  222. * inodes have been locked against any other modifications.
  223. *
  224. * If there's an error, then the usual negative error code is returned.
  225. * Otherwise returns 0 with *len set to the request length.
  226. */
  227. int
  228. __generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
  229. struct file *file_out, loff_t pos_out,
  230. loff_t *len, unsigned int remap_flags,
  231. const struct iomap_ops *dax_read_ops)
  232. {
  233. struct inode *inode_in = file_inode(file_in);
  234. struct inode *inode_out = file_inode(file_out);
  235. bool same_inode = (inode_in == inode_out);
  236. int ret;
  237. /* Don't touch certain kinds of inodes */
  238. if (IS_IMMUTABLE(inode_out))
  239. return -EPERM;
  240. if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
  241. return -ETXTBSY;
  242. /* Don't reflink dirs, pipes, sockets... */
  243. if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
  244. return -EISDIR;
  245. if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
  246. return -EINVAL;
  247. /* Zero length dedupe exits immediately; reflink goes to EOF. */
  248. if (*len == 0) {
  249. loff_t isize = i_size_read(inode_in);
  250. if ((remap_flags & REMAP_FILE_DEDUP) || pos_in == isize)
  251. return 0;
  252. if (pos_in > isize)
  253. return -EINVAL;
  254. *len = isize - pos_in;
  255. if (*len == 0)
  256. return 0;
  257. }
  258. /* Check that we don't violate system file offset limits. */
  259. ret = generic_remap_checks(file_in, pos_in, file_out, pos_out, len,
  260. remap_flags);
  261. if (ret)
  262. return ret;
  263. /* Wait for the completion of any pending IOs on both files */
  264. inode_dio_wait(inode_in);
  265. if (!same_inode)
  266. inode_dio_wait(inode_out);
  267. ret = filemap_write_and_wait_range(inode_in->i_mapping,
  268. pos_in, pos_in + *len - 1);
  269. if (ret)
  270. return ret;
  271. ret = filemap_write_and_wait_range(inode_out->i_mapping,
  272. pos_out, pos_out + *len - 1);
  273. if (ret)
  274. return ret;
  275. /*
  276. * Check that the extents are the same.
  277. */
  278. if (remap_flags & REMAP_FILE_DEDUP) {
  279. bool is_same = false;
  280. if (*len == 0)
  281. return 0;
  282. if (!IS_DAX(inode_in))
  283. ret = vfs_dedupe_file_range_compare(file_in, pos_in,
  284. file_out, pos_out, *len, &is_same);
  285. else if (dax_read_ops)
  286. ret = dax_dedupe_file_range_compare(inode_in, pos_in,
  287. inode_out, pos_out, *len, &is_same,
  288. dax_read_ops);
  289. else
  290. return -EINVAL;
  291. if (ret)
  292. return ret;
  293. if (!is_same)
  294. return -EBADE;
  295. }
  296. ret = generic_remap_check_len(inode_in, inode_out, pos_out, len,
  297. remap_flags);
  298. if (ret)
  299. return ret;
  300. /* If can't alter the file contents, we're done. */
  301. if (!(remap_flags & REMAP_FILE_DEDUP))
  302. ret = file_modified(file_out);
  303. return ret;
  304. }
  305. int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
  306. struct file *file_out, loff_t pos_out,
  307. loff_t *len, unsigned int remap_flags)
  308. {
  309. return __generic_remap_file_range_prep(file_in, pos_in, file_out,
  310. pos_out, len, remap_flags, NULL);
  311. }
  312. EXPORT_SYMBOL(generic_remap_file_range_prep);
  313. loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
  314. struct file *file_out, loff_t pos_out,
  315. loff_t len, unsigned int remap_flags)
  316. {
  317. loff_t ret;
  318. WARN_ON_ONCE(remap_flags & REMAP_FILE_DEDUP);
  319. if (file_inode(file_in)->i_sb != file_inode(file_out)->i_sb)
  320. return -EXDEV;
  321. ret = generic_file_rw_checks(file_in, file_out);
  322. if (ret < 0)
  323. return ret;
  324. if (!file_in->f_op->remap_file_range)
  325. return -EOPNOTSUPP;
  326. ret = remap_verify_area(file_in, pos_in, len, false);
  327. if (ret)
  328. return ret;
  329. ret = remap_verify_area(file_out, pos_out, len, true);
  330. if (ret)
  331. return ret;
  332. ret = file_in->f_op->remap_file_range(file_in, pos_in,
  333. file_out, pos_out, len, remap_flags);
  334. if (ret < 0)
  335. return ret;
  336. fsnotify_access(file_in);
  337. fsnotify_modify(file_out);
  338. return ret;
  339. }
  340. EXPORT_SYMBOL(do_clone_file_range);
  341. loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
  342. struct file *file_out, loff_t pos_out,
  343. loff_t len, unsigned int remap_flags)
  344. {
  345. loff_t ret;
  346. file_start_write(file_out);
  347. ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len,
  348. remap_flags);
  349. file_end_write(file_out);
  350. return ret;
  351. }
  352. EXPORT_SYMBOL(vfs_clone_file_range);
  353. /* Check whether we are allowed to dedupe the destination file */
  354. static bool allow_file_dedupe(struct file *file)
  355. {
  356. struct user_namespace *mnt_userns = file_mnt_user_ns(file);
  357. struct inode *inode = file_inode(file);
  358. if (capable(CAP_SYS_ADMIN))
  359. return true;
  360. if (file->f_mode & FMODE_WRITE)
  361. return true;
  362. if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)))
  363. return true;
  364. if (!inode_permission(mnt_userns, inode, MAY_WRITE))
  365. return true;
  366. return false;
  367. }
  368. loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
  369. struct file *dst_file, loff_t dst_pos,
  370. loff_t len, unsigned int remap_flags)
  371. {
  372. loff_t ret;
  373. WARN_ON_ONCE(remap_flags & ~(REMAP_FILE_DEDUP |
  374. REMAP_FILE_CAN_SHORTEN));
  375. ret = mnt_want_write_file(dst_file);
  376. if (ret)
  377. return ret;
  378. /*
  379. * This is redundant if called from vfs_dedupe_file_range(), but other
  380. * callers need it and it's not performance sesitive...
  381. */
  382. ret = remap_verify_area(src_file, src_pos, len, false);
  383. if (ret)
  384. goto out_drop_write;
  385. ret = remap_verify_area(dst_file, dst_pos, len, true);
  386. if (ret)
  387. goto out_drop_write;
  388. ret = -EPERM;
  389. if (!allow_file_dedupe(dst_file))
  390. goto out_drop_write;
  391. ret = -EXDEV;
  392. if (file_inode(src_file)->i_sb != file_inode(dst_file)->i_sb)
  393. goto out_drop_write;
  394. ret = -EISDIR;
  395. if (S_ISDIR(file_inode(dst_file)->i_mode))
  396. goto out_drop_write;
  397. ret = -EINVAL;
  398. if (!dst_file->f_op->remap_file_range)
  399. goto out_drop_write;
  400. if (len == 0) {
  401. ret = 0;
  402. goto out_drop_write;
  403. }
  404. ret = dst_file->f_op->remap_file_range(src_file, src_pos, dst_file,
  405. dst_pos, len, remap_flags | REMAP_FILE_DEDUP);
  406. out_drop_write:
  407. mnt_drop_write_file(dst_file);
  408. return ret;
  409. }
  410. EXPORT_SYMBOL(vfs_dedupe_file_range_one);
  411. int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
  412. {
  413. struct file_dedupe_range_info *info;
  414. struct inode *src = file_inode(file);
  415. u64 off;
  416. u64 len;
  417. int i;
  418. int ret;
  419. u16 count = same->dest_count;
  420. loff_t deduped;
  421. if (!(file->f_mode & FMODE_READ))
  422. return -EINVAL;
  423. if (same->reserved1 || same->reserved2)
  424. return -EINVAL;
  425. off = same->src_offset;
  426. len = same->src_length;
  427. if (S_ISDIR(src->i_mode))
  428. return -EISDIR;
  429. if (!S_ISREG(src->i_mode))
  430. return -EINVAL;
  431. if (!file->f_op->remap_file_range)
  432. return -EOPNOTSUPP;
  433. ret = remap_verify_area(file, off, len, false);
  434. if (ret < 0)
  435. return ret;
  436. ret = 0;
  437. if (off + len > i_size_read(src))
  438. return -EINVAL;
  439. /* Arbitrary 1G limit on a single dedupe request, can be raised. */
  440. len = min_t(u64, len, 1 << 30);
  441. /* pre-format output fields to sane values */
  442. for (i = 0; i < count; i++) {
  443. same->info[i].bytes_deduped = 0ULL;
  444. same->info[i].status = FILE_DEDUPE_RANGE_SAME;
  445. }
  446. for (i = 0, info = same->info; i < count; i++, info++) {
  447. struct fd dst_fd = fdget(info->dest_fd);
  448. struct file *dst_file = dst_fd.file;
  449. if (!dst_file) {
  450. info->status = -EBADF;
  451. goto next_loop;
  452. }
  453. if (info->reserved) {
  454. info->status = -EINVAL;
  455. goto next_fdput;
  456. }
  457. deduped = vfs_dedupe_file_range_one(file, off, dst_file,
  458. info->dest_offset, len,
  459. REMAP_FILE_CAN_SHORTEN);
  460. if (deduped == -EBADE)
  461. info->status = FILE_DEDUPE_RANGE_DIFFERS;
  462. else if (deduped < 0)
  463. info->status = deduped;
  464. else
  465. info->bytes_deduped = len;
  466. next_fdput:
  467. fdput(dst_fd);
  468. next_loop:
  469. if (fatal_signal_pending(current))
  470. break;
  471. }
  472. return ret;
  473. }
  474. EXPORT_SYMBOL(vfs_dedupe_file_range);