xfs_ag_resv.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426
  1. // SPDX-License-Identifier: GPL-2.0+
  2. /*
  3. * Copyright (C) 2016 Oracle. All Rights Reserved.
  4. * Author: Darrick J. Wong <[email protected]>
  5. */
  6. #include "xfs.h"
  7. #include "xfs_fs.h"
  8. #include "xfs_shared.h"
  9. #include "xfs_format.h"
  10. #include "xfs_log_format.h"
  11. #include "xfs_trans_resv.h"
  12. #include "xfs_mount.h"
  13. #include "xfs_alloc.h"
  14. #include "xfs_errortag.h"
  15. #include "xfs_error.h"
  16. #include "xfs_trace.h"
  17. #include "xfs_trans.h"
  18. #include "xfs_rmap_btree.h"
  19. #include "xfs_btree.h"
  20. #include "xfs_refcount_btree.h"
  21. #include "xfs_ialloc_btree.h"
  22. #include "xfs_ag.h"
  23. #include "xfs_ag_resv.h"
  24. /*
  25. * Per-AG Block Reservations
  26. *
  27. * For some kinds of allocation group metadata structures, it is advantageous
  28. * to reserve a small number of blocks in each AG so that future expansions of
  29. * that data structure do not encounter ENOSPC because errors during a btree
  30. * split cause the filesystem to go offline.
  31. *
  32. * Prior to the introduction of reflink, this wasn't an issue because the free
  33. * space btrees maintain a reserve of space (the AGFL) to handle any expansion
  34. * that may be necessary; and allocations of other metadata (inodes, BMBT,
  35. * dir/attr) aren't restricted to a single AG. However, with reflink it is
  36. * possible to allocate all the space in an AG, have subsequent reflink/CoW
  37. * activity expand the refcount btree, and discover that there's no space left
  38. * to handle that expansion. Since we can calculate the maximum size of the
  39. * refcount btree, we can reserve space for it and avoid ENOSPC.
  40. *
  41. * Handling per-AG reservations consists of four changes to the allocator's
  42. * behavior: First, because these reservations are always needed, we decrease
  43. * the ag_max_usable counter to reflect the size of the AG after the reserved
  44. * blocks are taken. Second, the reservations must be reflected in the
  45. * fdblocks count to maintain proper accounting. Third, each AG must maintain
  46. * its own reserved block counter so that we can calculate the amount of space
  47. * that must remain free to maintain the reservations. Fourth, the "remaining
  48. * reserved blocks" count must be used when calculating the length of the
  49. * longest free extent in an AG and to clamp maxlen in the per-AG allocation
  50. * functions. In other words, we maintain a virtual allocation via in-core
  51. * accounting tricks so that we don't have to clean up after a crash. :)
  52. *
  53. * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
  54. * values via struct xfs_alloc_arg or directly to the xfs_free_extent
  55. * function. It might seem a little funny to maintain a reservoir of blocks
  56. * to feed another reservoir, but the AGFL only holds enough blocks to get
  57. * through the next transaction. The per-AG reservation is to ensure (we
  58. * hope) that each AG never runs out of blocks. Each data structure wanting
  59. * to use the reservation system should update ask/used in xfs_ag_resv_init.
  60. */
  61. /*
  62. * Are we critically low on blocks? For now we'll define that as the number
  63. * of blocks we can get our hands on being less than 10% of what we reserved
  64. * or less than some arbitrary number (maximum btree height).
  65. */
  66. bool
  67. xfs_ag_resv_critical(
  68. struct xfs_perag *pag,
  69. enum xfs_ag_resv_type type)
  70. {
  71. xfs_extlen_t avail;
  72. xfs_extlen_t orig;
  73. switch (type) {
  74. case XFS_AG_RESV_METADATA:
  75. avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
  76. orig = pag->pag_meta_resv.ar_asked;
  77. break;
  78. case XFS_AG_RESV_RMAPBT:
  79. avail = pag->pagf_freeblks + pag->pagf_flcount -
  80. pag->pag_meta_resv.ar_reserved;
  81. orig = pag->pag_rmapbt_resv.ar_asked;
  82. break;
  83. default:
  84. ASSERT(0);
  85. return false;
  86. }
  87. trace_xfs_ag_resv_critical(pag, type, avail);
  88. /* Critically low if less than 10% or max btree height remains. */
  89. return XFS_TEST_ERROR(avail < orig / 10 ||
  90. avail < pag->pag_mount->m_agbtree_maxlevels,
  91. pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
  92. }
  93. /*
  94. * How many blocks are reserved but not used, and therefore must not be
  95. * allocated away?
  96. */
  97. xfs_extlen_t
  98. xfs_ag_resv_needed(
  99. struct xfs_perag *pag,
  100. enum xfs_ag_resv_type type)
  101. {
  102. xfs_extlen_t len;
  103. len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
  104. switch (type) {
  105. case XFS_AG_RESV_METADATA:
  106. case XFS_AG_RESV_RMAPBT:
  107. len -= xfs_perag_resv(pag, type)->ar_reserved;
  108. break;
  109. case XFS_AG_RESV_NONE:
  110. /* empty */
  111. break;
  112. default:
  113. ASSERT(0);
  114. }
  115. trace_xfs_ag_resv_needed(pag, type, len);
  116. return len;
  117. }
  118. /* Clean out a reservation */
  119. static int
  120. __xfs_ag_resv_free(
  121. struct xfs_perag *pag,
  122. enum xfs_ag_resv_type type)
  123. {
  124. struct xfs_ag_resv *resv;
  125. xfs_extlen_t oldresv;
  126. int error;
  127. trace_xfs_ag_resv_free(pag, type, 0);
  128. resv = xfs_perag_resv(pag, type);
  129. if (pag->pag_agno == 0)
  130. pag->pag_mount->m_ag_max_usable += resv->ar_asked;
  131. /*
  132. * RMAPBT blocks come from the AGFL and AGFL blocks are always
  133. * considered "free", so whatever was reserved at mount time must be
  134. * given back at umount.
  135. */
  136. if (type == XFS_AG_RESV_RMAPBT)
  137. oldresv = resv->ar_orig_reserved;
  138. else
  139. oldresv = resv->ar_reserved;
  140. error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
  141. resv->ar_reserved = 0;
  142. resv->ar_asked = 0;
  143. resv->ar_orig_reserved = 0;
  144. if (error)
  145. trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
  146. error, _RET_IP_);
  147. return error;
  148. }
  149. /* Free a per-AG reservation. */
  150. int
  151. xfs_ag_resv_free(
  152. struct xfs_perag *pag)
  153. {
  154. int error;
  155. int err2;
  156. error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
  157. err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
  158. if (err2 && !error)
  159. error = err2;
  160. return error;
  161. }
  162. static int
  163. __xfs_ag_resv_init(
  164. struct xfs_perag *pag,
  165. enum xfs_ag_resv_type type,
  166. xfs_extlen_t ask,
  167. xfs_extlen_t used)
  168. {
  169. struct xfs_mount *mp = pag->pag_mount;
  170. struct xfs_ag_resv *resv;
  171. int error;
  172. xfs_extlen_t hidden_space;
  173. if (used > ask)
  174. ask = used;
  175. switch (type) {
  176. case XFS_AG_RESV_RMAPBT:
  177. /*
  178. * Space taken by the rmapbt is not subtracted from fdblocks
  179. * because the rmapbt lives in the free space. Here we must
  180. * subtract the entire reservation from fdblocks so that we
  181. * always have blocks available for rmapbt expansion.
  182. */
  183. hidden_space = ask;
  184. break;
  185. case XFS_AG_RESV_METADATA:
  186. /*
  187. * Space taken by all other metadata btrees are accounted
  188. * on-disk as used space. We therefore only hide the space
  189. * that is reserved but not used by the trees.
  190. */
  191. hidden_space = ask - used;
  192. break;
  193. default:
  194. ASSERT(0);
  195. return -EINVAL;
  196. }
  197. if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_AG_RESV_FAIL))
  198. error = -ENOSPC;
  199. else
  200. error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
  201. if (error) {
  202. trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
  203. error, _RET_IP_);
  204. xfs_warn(mp,
  205. "Per-AG reservation for AG %u failed. Filesystem may run out of space.",
  206. pag->pag_agno);
  207. return error;
  208. }
  209. /*
  210. * Reduce the maximum per-AG allocation length by however much we're
  211. * trying to reserve for an AG. Since this is a filesystem-wide
  212. * counter, we only make the adjustment for AG 0. This assumes that
  213. * there aren't any AGs hungrier for per-AG reservation than AG 0.
  214. */
  215. if (pag->pag_agno == 0)
  216. mp->m_ag_max_usable -= ask;
  217. resv = xfs_perag_resv(pag, type);
  218. resv->ar_asked = ask;
  219. resv->ar_orig_reserved = hidden_space;
  220. resv->ar_reserved = ask - used;
  221. trace_xfs_ag_resv_init(pag, type, ask);
  222. return 0;
  223. }
  224. /* Create a per-AG block reservation. */
  225. int
  226. xfs_ag_resv_init(
  227. struct xfs_perag *pag,
  228. struct xfs_trans *tp)
  229. {
  230. struct xfs_mount *mp = pag->pag_mount;
  231. xfs_extlen_t ask;
  232. xfs_extlen_t used;
  233. int error = 0, error2;
  234. bool has_resv = false;
  235. /* Create the metadata reservation. */
  236. if (pag->pag_meta_resv.ar_asked == 0) {
  237. ask = used = 0;
  238. error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask, &used);
  239. if (error)
  240. goto out;
  241. error = xfs_finobt_calc_reserves(mp, tp, pag, &ask, &used);
  242. if (error)
  243. goto out;
  244. error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
  245. ask, used);
  246. if (error) {
  247. /*
  248. * Because we didn't have per-AG reservations when the
  249. * finobt feature was added we might not be able to
  250. * reserve all needed blocks. Warn and fall back to the
  251. * old and potentially buggy code in that case, but
  252. * ensure we do have the reservation for the refcountbt.
  253. */
  254. ask = used = 0;
  255. mp->m_finobt_nores = true;
  256. error = xfs_refcountbt_calc_reserves(mp, tp, pag, &ask,
  257. &used);
  258. if (error)
  259. goto out;
  260. error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
  261. ask, used);
  262. if (error)
  263. goto out;
  264. }
  265. if (ask)
  266. has_resv = true;
  267. }
  268. /* Create the RMAPBT metadata reservation */
  269. if (pag->pag_rmapbt_resv.ar_asked == 0) {
  270. ask = used = 0;
  271. error = xfs_rmapbt_calc_reserves(mp, tp, pag, &ask, &used);
  272. if (error)
  273. goto out;
  274. error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
  275. if (error)
  276. goto out;
  277. if (ask)
  278. has_resv = true;
  279. }
  280. out:
  281. /*
  282. * Initialize the pagf if we have at least one active reservation on the
  283. * AG. This may have occurred already via reservation calculation, but
  284. * fall back to an explicit init to ensure the in-core allocbt usage
  285. * counters are initialized as soon as possible. This is important
  286. * because filesystems with large perag reservations are susceptible to
  287. * free space reservation problems that the allocbt counter is used to
  288. * address.
  289. */
  290. if (has_resv) {
  291. error2 = xfs_alloc_read_agf(pag, tp, 0, NULL);
  292. if (error2)
  293. return error2;
  294. /*
  295. * If there isn't enough space in the AG to satisfy the
  296. * reservation, let the caller know that there wasn't enough
  297. * space. Callers are responsible for deciding what to do
  298. * next, since (in theory) we can stumble along with
  299. * insufficient reservation if data blocks are being freed to
  300. * replenish the AG's free space.
  301. */
  302. if (!error &&
  303. xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
  304. xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved >
  305. pag->pagf_freeblks + pag->pagf_flcount)
  306. error = -ENOSPC;
  307. }
  308. return error;
  309. }
  310. /* Allocate a block from the reservation. */
  311. void
  312. xfs_ag_resv_alloc_extent(
  313. struct xfs_perag *pag,
  314. enum xfs_ag_resv_type type,
  315. struct xfs_alloc_arg *args)
  316. {
  317. struct xfs_ag_resv *resv;
  318. xfs_extlen_t len;
  319. uint field;
  320. trace_xfs_ag_resv_alloc_extent(pag, type, args->len);
  321. switch (type) {
  322. case XFS_AG_RESV_AGFL:
  323. return;
  324. case XFS_AG_RESV_METADATA:
  325. case XFS_AG_RESV_RMAPBT:
  326. resv = xfs_perag_resv(pag, type);
  327. break;
  328. default:
  329. ASSERT(0);
  330. fallthrough;
  331. case XFS_AG_RESV_NONE:
  332. field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
  333. XFS_TRANS_SB_FDBLOCKS;
  334. xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
  335. return;
  336. }
  337. len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
  338. resv->ar_reserved -= len;
  339. if (type == XFS_AG_RESV_RMAPBT)
  340. return;
  341. /* Allocations of reserved blocks only need on-disk sb updates... */
  342. xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
  343. /* ...but non-reserved blocks need in-core and on-disk updates. */
  344. if (args->len > len)
  345. xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
  346. -((int64_t)args->len - len));
  347. }
  348. /* Free a block to the reservation. */
  349. void
  350. xfs_ag_resv_free_extent(
  351. struct xfs_perag *pag,
  352. enum xfs_ag_resv_type type,
  353. struct xfs_trans *tp,
  354. xfs_extlen_t len)
  355. {
  356. xfs_extlen_t leftover;
  357. struct xfs_ag_resv *resv;
  358. trace_xfs_ag_resv_free_extent(pag, type, len);
  359. switch (type) {
  360. case XFS_AG_RESV_AGFL:
  361. return;
  362. case XFS_AG_RESV_METADATA:
  363. case XFS_AG_RESV_RMAPBT:
  364. resv = xfs_perag_resv(pag, type);
  365. break;
  366. default:
  367. ASSERT(0);
  368. fallthrough;
  369. case XFS_AG_RESV_NONE:
  370. xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
  371. return;
  372. }
  373. leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
  374. resv->ar_reserved += leftover;
  375. if (type == XFS_AG_RESV_RMAPBT)
  376. return;
  377. /* Freeing into the reserved pool only requires on-disk update... */
  378. xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
  379. /* ...but freeing beyond that requires in-core and on-disk update. */
  380. if (len > leftover)
  381. xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
  382. }