delalloc-space.c

// SPDX-License-Identifier: GPL-2.0

#include "ctree.h"
#include "delalloc-space.h"
#include "block-rsv.h"
#include "btrfs_inode.h"
#include "space-info.h"
#include "transaction.h"
#include "qgroup.h"
#include "block-group.h"

/*
 * HOW DOES THIS WORK
 *
 * There are two stages to data reservations, one for data and one for metadata
 * to handle the new extents and checksums generated by writing data.
 *
 * DATA RESERVATION
 *   The general flow of the data reservation is as follows
 *
 *   -> Reserve
 *     We call into btrfs_reserve_data_bytes() for the user request bytes that
 *     they wish to write. We make this reservation and add it to
 *     space_info->bytes_may_use. We set EXTENT_DELALLOC on the inode io_tree
 *     for the range and carry on if this is buffered, or follow up trying to
 *     make a real allocation if we are pre-allocating or doing O_DIRECT.
 *
 *   -> Use
 *     At writepages()/prealloc/O_DIRECT time we will call into
 *     btrfs_reserve_extent() for some part or all of this range of bytes. We
 *     will make the allocation and subtract space_info->bytes_may_use by the
 *     original requested length and increase the space_info->bytes_reserved by
 *     the allocated length. This distinction is important because compression
 *     may allocate a smaller on disk extent than we previously reserved.
 *
 *   -> Allocation
 *     finish_ordered_io() will insert the new file extent item for this range,
 *     and then add a delayed ref update for the extent tree. Once that delayed
 *     ref is written the extent size is subtracted from
 *     space_info->bytes_reserved and added to space_info->bytes_used.
 *
 *   Error handling
 *
 *   -> By the reservation maker
 *     This is the simplest case, we haven't completed our operation and we know
 *     how much we reserved, we can simply call
 *     btrfs_free_reserved_data_space*() and it will be removed from
 *     space_info->bytes_may_use.
 *
 *   -> After the reservation has been made, but before cow_file_range()
 *     This is specifically for the delalloc case. You must clear
 *     EXTENT_DELALLOC with the EXTENT_CLEAR_DATA_RESV bit, and the range will
 *     be subtracted from space_info->bytes_may_use.
 *
 * METADATA RESERVATION
 *   The general metadata reservation lifetimes are discussed elsewhere; this
 *   will just focus on how it is used for delalloc space.
 *
 *   We keep track of two things on a per-inode basis
 *
 *   ->outstanding_extents
 *     This is the number of file extent items we'll need to handle all of the
 *     outstanding DELALLOC space we have in this inode. We limit the maximum
 *     size of an extent, so a large contiguous dirty area may require more than
 *     one outstanding_extent, which is why count_max_extents() is used to
 *     determine how many outstanding_extents get added.
 *
 *   ->csum_bytes
 *     This is essentially how many dirty bytes we have for this inode, so we
 *     can calculate the number of checksum items we would have to add in order
 *     to checksum our outstanding data.
 *
 *   We keep a per-inode block_rsv in order to make it easier to keep track of
 *   our reservation. We use btrfs_calculate_inode_block_rsv_size() to
 *   calculate the current theoretical maximum reservation we would need for
 *   the metadata for this inode. We call this and then adjust our reservation
 *   as necessary, either by attempting to reserve more space, or freeing up
 *   excess space.
 *
 * OUTSTANDING_EXTENTS HANDLING
 *
 *  ->outstanding_extents is used for keeping track of how many extents we will
 *  need to use for this inode, and it will fluctuate depending on where you are
 *  in the life cycle of the dirty data. Consider the following normal case for
 *  a completely clean inode, with a num_bytes < our maximum allowed extent size
 *
 *  -> reserve
 *    ->outstanding_extents += 1 (current value is 1)
 *
 *  -> set_delalloc
 *    ->outstanding_extents += 1 (current value is 2)
 *
 *  -> btrfs_delalloc_release_extents()
 *    ->outstanding_extents -= 1 (current value is 1)
 *
 *  We must call this once we are done, as we hold our reservation for the
 *  duration of our operation, and then assume set_delalloc will update the
 *  counter appropriately.
 *
 *  -> add ordered extent
 *    ->outstanding_extents += 1 (current value is 2)
 *
 *  -> btrfs_clear_delalloc_extent
 *    ->outstanding_extents -= 1 (current value is 1)
 *
 *  -> finish_ordered_io/btrfs_remove_ordered_extent
 *    ->outstanding_extents -= 1 (current value is 0)
 *
 *  Each stage is responsible for its own accounting of the extent, thus
 *  making error handling and cleanup easier.
 */
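
/*
 * Example (illustrative sketch, not part of the original file): the "by the
 * reservation maker" error case described above, written against the data
 * reservation helpers defined below. The caller and its failure are
 * hypothetical; a real caller would normally go on to set EXTENT_DELALLOC
 * on the range instead of bailing out.
 */
static int __maybe_unused example_data_reserve_then_bail(struct btrfs_inode *inode,
							  u64 start, u64 len)
{
	struct extent_changeset *reserved = NULL;
	int ret;

	/* Reserve data space plus the matching qgroup data reservation. */
	ret = btrfs_check_data_free_space(inode, &reserved, start, len, false);
	if (ret < 0)
		return ret;

	/*
	 * Pretend a later step failed before EXTENT_DELALLOC was set: we still
	 * know exactly how much we reserved, so hand it all back.
	 */
	btrfs_free_reserved_data_space(inode, reserved, start, len);
	extent_changeset_free(reserved);
	return -ENOSPC;	/* stand-in error for the sketch */
}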

int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;

	/* Make sure bytes are sectorsize aligned */
	bytes = ALIGN(bytes, fs_info->sectorsize);

	if (btrfs_is_free_space_inode(inode))
		flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;

	return btrfs_reserve_data_bytes(fs_info, bytes, flush);
}

int btrfs_check_data_free_space(struct btrfs_inode *inode,
				struct extent_changeset **reserved, u64 start,
				u64 len, bool noflush)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA;
	int ret;

	/* Align the range */
	len = round_up(start + len, fs_info->sectorsize) -
	      round_down(start, fs_info->sectorsize);
	start = round_down(start, fs_info->sectorsize);

	if (noflush)
		flush = BTRFS_RESERVE_NO_FLUSH;
	else if (btrfs_is_free_space_inode(inode))
		flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE;

	ret = btrfs_reserve_data_bytes(fs_info, len, flush);
	if (ret < 0)
		return ret;

	/* Use new btrfs_qgroup_reserve_data to reserve precise data space. */
	ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
	if (ret < 0) {
		btrfs_free_reserved_data_space_noquota(fs_info, len);
		extent_changeset_free(*reserved);
		*reserved = NULL;
	} else {
		ret = 0;
	}
	return ret;
}

/*
 * Called if we need to clear a data reservation for this inode
 * Normally in an error case.
 *
 * This one will *NOT* use the accurate qgroup reserved space API; it is only
 * for cases where we can't sleep and are sure it won't affect qgroup reserved
 * space, like clear_bit_hook().
 */
void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info,
					    u64 len)
{
	struct btrfs_space_info *data_sinfo;

	ASSERT(IS_ALIGNED(len, fs_info->sectorsize));

	data_sinfo = fs_info->data_sinfo;
	btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len);
}

/*
 * Called if we need to clear a data reservation for this inode
 * Normally in an error case.
 *
 * This one will handle the per-inode data rsv map for accurate reserved
 * space framework.
 */
void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
			struct extent_changeset *reserved, u64 start, u64 len)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	/* Make sure the range is aligned to sectorsize */
	len = round_up(start + len, fs_info->sectorsize) -
	      round_down(start, fs_info->sectorsize);
	start = round_down(start, fs_info->sectorsize);

	btrfs_free_reserved_data_space_noquota(fs_info, len);
	btrfs_qgroup_free_data(inode, reserved, start, len);
}

/**
 * Release any excessive reservation
 *
 * @inode:       the inode we need to release from
 * @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup
 *               meta reservation needs to know if we are freeing qgroup
 *               reservation or just converting it into per-trans. Normally
 *               @qgroup_free is true for error handling, and false for normal
 *               release.
 *
 * This is the same as btrfs_block_rsv_release, except that it handles the
 * tracepoint for the reservation.
 */
static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
	u64 released = 0;
	u64 qgroup_to_release = 0;

	/*
	 * Since we statically set the block_rsv->size we just want to say we
	 * are releasing 0 bytes, and then we'll just get the reservation over
	 * the size freed.
	 */
	released = btrfs_block_rsv_release(fs_info, block_rsv, 0,
					   &qgroup_to_release);
	if (released > 0)
		trace_btrfs_space_reservation(fs_info, "delalloc",
					      btrfs_ino(inode), released, 0);
	if (qgroup_free)
		btrfs_qgroup_free_meta_prealloc(inode->root, qgroup_to_release);
	else
		btrfs_qgroup_convert_reserved_meta(inode->root,
						   qgroup_to_release);
}

static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info,
						 struct btrfs_inode *inode)
{
	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
	u64 reserve_size = 0;
	u64 qgroup_rsv_size = 0;
	u64 csum_leaves;
	unsigned outstanding_extents;

	lockdep_assert_held(&inode->lock);
	outstanding_extents = inode->outstanding_extents;

	/*
	 * Insert size for the number of outstanding extents, 1 normal size for
	 * updating the inode.
	 */
	if (outstanding_extents) {
		reserve_size = btrfs_calc_insert_metadata_size(fs_info,
						outstanding_extents);
		reserve_size += btrfs_calc_metadata_size(fs_info, 1);
	}
	csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
	reserve_size += btrfs_calc_insert_metadata_size(fs_info, csum_leaves);
	/*
	 * For qgroup rsv, the calculation is very simple:
	 * account one nodesize for each outstanding extent
	 *
	 * This is overestimating in most cases.
	 */
	qgroup_rsv_size = (u64)outstanding_extents * fs_info->nodesize;

	spin_lock(&block_rsv->lock);
	block_rsv->size = reserve_size;
	block_rsv->qgroup_rsv_size = qgroup_rsv_size;
	spin_unlock(&block_rsv->lock);
}
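
/*
 * Worked example (added for illustration, not upstream text): with N
 * outstanding extents and C bytes of dirty data, the function above sets
 *
 *   reserve_size    = insert_metadata_size(N) + metadata_size(1)
 *                     + insert_metadata_size(csum_bytes_to_leaves(C))
 *   qgroup_rsv_size = N * nodesize
 *
 * e.g. with a 16KiB nodesize and 4 outstanding extents the qgroup reservation
 * alone is 64KiB regardless of how small the dirty ranges actually are, which
 * is the "overestimating in most cases" noted above.
 */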

static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
				    u64 num_bytes, u64 disk_num_bytes,
				    u64 *meta_reserve, u64 *qgroup_reserve)
{
	u64 nr_extents = count_max_extents(fs_info, num_bytes);
	u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes);
	u64 inode_update = btrfs_calc_metadata_size(fs_info, 1);

	*meta_reserve = btrfs_calc_insert_metadata_size(fs_info,
						nr_extents + csum_leaves);

	/*
	 * finish_ordered_io has to update the inode, so add the space required
	 * for an inode update.
	 */
	*meta_reserve += inode_update;
	*qgroup_reserve = nr_extents * fs_info->nodesize;
}

int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
				    u64 disk_num_bytes, bool noflush)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
	u64 meta_reserve, qgroup_reserve;
	unsigned nr_extents;
	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
	int ret = 0;

	/*
	 * If we are a free space inode we need to not flush since we will be in
	 * the middle of a transaction commit. We also don't need the delalloc
	 * mutex since we won't race with anybody. We need this mostly to make
	 * lockdep shut its filthy mouth.
	 *
	 * If we have a transaction open (can happen if we call truncate_block
	 * from truncate), then we need FLUSH_LIMIT so we don't deadlock.
	 */
	if (noflush || btrfs_is_free_space_inode(inode)) {
		flush = BTRFS_RESERVE_NO_FLUSH;
	} else {
		if (current->journal_info)
			flush = BTRFS_RESERVE_FLUSH_LIMIT;
	}

	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
	disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize);

	/*
	 * We always want to do it this way, every other way is wrong and ends
	 * in tears. Pre-reserving the amount we are going to add will always
	 * be the right way, because otherwise if we have enough parallelism we
	 * could end up with thousands of inodes all holding little bits of
	 * reservations they were able to make previously and the only way to
	 * reclaim that space is to ENOSPC out the operations and clear
	 * everything out and try again, which is bad. This way we just
	 * over-reserve slightly, and clean up the mess when we are done.
	 */
	calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
				&meta_reserve, &qgroup_reserve);
	ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
						 noflush);
	if (ret)
		return ret;
	ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
	if (ret) {
		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserve);
		return ret;
	}

	/*
	 * Now we need to update our outstanding extents and csum bytes _first_
	 * and then add the reservation to the block_rsv. This keeps us from
	 * racing with an ordered completion or some such that would think it
	 * needs to free the reservation we just made.
	 */
	spin_lock(&inode->lock);
	nr_extents = count_max_extents(fs_info, num_bytes);
	btrfs_mod_outstanding_extents(inode, nr_extents);
	inode->csum_bytes += disk_num_bytes;
	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
	spin_unlock(&inode->lock);

	/* Now we can safely add our space to our block rsv */
	btrfs_block_rsv_add_bytes(block_rsv, meta_reserve, false);
	trace_btrfs_space_reservation(root->fs_info, "delalloc",
				      btrfs_ino(inode), meta_reserve, 1);

	spin_lock(&block_rsv->lock);
	block_rsv->qgroup_rsv_reserved += qgroup_reserve;
	spin_unlock(&block_rsv->lock);

	return 0;
}
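
/*
 * Example (illustrative sketch, not part of the original file): unwinding a
 * metadata-only reservation made with btrfs_delalloc_reserve_metadata() when
 * no delalloc range or ordered extent was ever created. The caller is
 * hypothetical; a real caller reserves only metadata when data space is
 * handled separately or not needed for the range.
 */
static int __maybe_unused example_metadata_reserve_unwind(struct btrfs_inode *inode,
							   u64 num_bytes)
{
	int ret;

	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, num_bytes, false);
	if (ret < 0)
		return ret;

	/* ... pretend the work that would follow the reservation failed ... */

	/* Drop the temporary outstanding_extents taken at reserve time. */
	btrfs_delalloc_release_extents(inode, num_bytes);
	/* Give back csum_bytes and the rsv, freeing the qgroup prealloc too. */
	btrfs_delalloc_release_metadata(inode, num_bytes, true);
	return -ENOSPC;	/* stand-in error for the sketch */
}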

/**
 * Release a metadata reservation for an inode
 *
 * @inode:       the inode to release the reservation for.
 * @num_bytes:   the number of bytes we are releasing.
 * @qgroup_free: free qgroup reservation or convert it to per-trans reservation
 *
 * This will release the metadata reservation for an inode. This can be called
 * once we complete IO for a given set of bytes to release their metadata
 * reservations, or on error for the same reason.
 */
void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
				     bool qgroup_free)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;

	num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
	spin_lock(&inode->lock);
	inode->csum_bytes -= num_bytes;
	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
	spin_unlock(&inode->lock);

	if (btrfs_is_testing(fs_info))
		return;

	btrfs_inode_rsv_release(inode, qgroup_free);
}

/**
 * btrfs_delalloc_release_extents - release our outstanding_extents
 * @inode: the inode to balance the reservation for.
 * @num_bytes: the number of bytes we originally reserved with
 *
 * When we reserve space we increase outstanding_extents for the extents we may
 * add. Once we've set the range as delalloc or created our ordered extents we
 * have outstanding_extents to track the real usage, so we use this to free our
 * temporarily tracked outstanding_extents. This _must_ be used in conjunction
 * with btrfs_delalloc_reserve_metadata.
 */
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	unsigned num_extents;

	spin_lock(&inode->lock);
	num_extents = count_max_extents(fs_info, num_bytes);
	btrfs_mod_outstanding_extents(inode, -num_extents);
	btrfs_calculate_inode_block_rsv_size(fs_info, inode);
	spin_unlock(&inode->lock);

	if (btrfs_is_testing(fs_info))
		return;

	btrfs_inode_rsv_release(inode, true);
}

/**
 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
 * @inode: inode we're writing to
 * @start: start of the range we are writing to
 * @len: length of the range we are writing to
 * @reserved: mandatory parameter, records the actually reserved qgroup ranges
 *            of the current reservation.
 *
 * This will do the following things
 *
 * - reserve space in data space info for num bytes
 *   and reserve precise corresponding qgroup space
 *   (Done in check_data_free_space)
 *
 * - reserve space for metadata space, based on the number of outstanding
 *   extents and how many csums will be needed
 *   also reserve metadata space in a per root over-reserve method.
 * - add to the inodes->delalloc_bytes
 * - add it to the fs_info's delalloc inodes list.
 *   (Above 3 all done in delalloc_reserve_metadata)
 *
 * Return 0 for success
 * Return <0 for error (-ENOSPC or -EDQUOT)
 */
int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
			struct extent_changeset **reserved, u64 start, u64 len)
{
	int ret;

	ret = btrfs_check_data_free_space(inode, reserved, start, len, false);
	if (ret < 0)
		return ret;
	ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
	if (ret < 0) {
		btrfs_free_reserved_data_space(inode, *reserved, start, len);
		extent_changeset_free(*reserved);
		*reserved = NULL;
	}
	return ret;
}
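
/*
 * Example (illustrative sketch, not part of the original file): a minimal
 * caller pairing btrfs_delalloc_reserve_space() with the release helpers,
 * following the outstanding_extents lifecycle documented at the top of this
 * file. The function name and the "failed" flag are hypothetical.
 */
static int __maybe_unused example_delalloc_cycle(struct btrfs_inode *inode,
						 u64 start, u64 len, bool failed)
{
	struct extent_changeset *reserved = NULL;
	int ret;

	/* Data space, qgroup data and metadata reserved in one call. */
	ret = btrfs_delalloc_reserve_space(inode, &reserved, start, len);
	if (ret < 0)
		return ret;

	if (failed) {
		/*
		 * Nothing was marked delalloc, so return data and metadata and
		 * free (rather than convert) the qgroup meta reservation.
		 */
		btrfs_delalloc_release_space(inode, reserved, start, len, true);
		ret = -ENOSPC;	/* stand-in error for the sketch */
	}
	/*
	 * Done with the operation either way: drop the temporary
	 * outstanding_extents taken at reserve time.
	 */
	btrfs_delalloc_release_extents(inode, len);
	extent_changeset_free(reserved);
	return ret;
}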

/**
 * Release data and metadata space for delalloc
 *
 * @inode:       inode we're releasing space for
 * @reserved:    list of changed/reserved ranges
 * @start:       start position of the space already reserved
 * @len:         length of the space already reserved
 * @qgroup_free: should qgroup reserved-space also be freed
 *
 * This function will release the metadata space that was not used and will
 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
 * list if there are no delalloc bytes left.
 * Also it will handle the qgroup reserved space.
 */
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
				  struct extent_changeset *reserved,
				  u64 start, u64 len, bool qgroup_free)
{
	btrfs_delalloc_release_metadata(inode, len, qgroup_free);
	btrfs_free_reserved_data_space(inode, reserved, start, len);
}