space-info.c

  1. // SPDX-License-Identifier: GPL-2.0
  2. #include "misc.h"
  3. #include "ctree.h"
  4. #include "space-info.h"
  5. #include "sysfs.h"
  6. #include "volumes.h"
  7. #include "free-space-cache.h"
  8. #include "ordered-data.h"
  9. #include "transaction.h"
  10. #include "block-group.h"
  11. #include "zoned.h"
  12. /*
  13. * HOW DOES SPACE RESERVATION WORK
  14. *
  15. * If you want to know about delalloc specifically, there is a separate comment
  16. * for that with the delalloc code. This comment is about how the whole system
  17. * works generally.
  18. *
  19. * BASIC CONCEPTS
  20. *
  21. * 1) space_info. This is the ultimate arbiter of how much space we can use.
  22. * There's a description of the bytes_ fields with the struct declaration,
  23. * refer to that for specifics on each field. Suffice it to say that for
  24. * reservations we care about total_bytes - SUM(space_info->bytes_) when
  25. * determining if there is space to make an allocation. There is a space_info
  26. * for METADATA, SYSTEM, and DATA areas.
  27. *
  28. * 2) block_rsv's. These are basically buckets for every different type of
  29. * metadata reservation we have. You can see the comment in the block_rsv
  30. * code on the rules for each type, but generally block_rsv->reserved is how
  31. * much space is accounted for in space_info->bytes_may_use.
  32. *
  33. * 3) btrfs_calc*_size. These are the worst case calculations we use based
  34. * on the number of items we will want to modify. We have one for changing
  35. * items, and one for inserting new items. Generally we use these helpers to
  36. * determine the size of the block reserves, and then use the actual bytes
  37. * values to adjust the space_info counters.
  38. *
  39. * MAKING RESERVATIONS, THE NORMAL CASE
  40. *
  41. * We call into either btrfs_reserve_data_bytes() or
  42. * btrfs_reserve_metadata_bytes(), depending on which we're looking for, with
  43. * num_bytes we want to reserve.
  44. *
  45. * ->reserve
  46. * space_info->bytes_may_use += num_bytes
  47. *
  48. * ->extent allocation
  49. * Call btrfs_add_reserved_bytes() which does
  50. * space_info->bytes_may_use -= num_bytes
  51. * space_info->bytes_reserved += extent_bytes
  52. *
  53. * ->insert reference
  54. * Call btrfs_update_block_group() which does
  55. * space_info->bytes_reserved -= extent_bytes
  56. * space_info->bytes_used += extent_bytes
  57. *
  58. * MAKING RESERVATIONS, FLUSHING NORMALLY (non-priority)
  59. *
  60. * Assume we are unable to simply make the reservation because we do not have
  61. * enough space
  62. *
  63. * -> __reserve_bytes
  64. * create a reserve_ticket with ->bytes set to our reservation, add it to
  65. * the tail of space_info->tickets, kick async flush thread
  66. *
  67. * ->handle_reserve_ticket
  68. * wait on ticket->wait for ->bytes to be reduced to 0, or ->error to be set
  69. * on the ticket.
  70. *
  71. * -> btrfs_async_reclaim_metadata_space/btrfs_async_reclaim_data_space
  72. * Flushes various things attempting to free up space.
  73. *
  74. * -> btrfs_try_granting_tickets()
  75. * This is called by anything that either subtracts space from
  76. * space_info->bytes_may_use, ->bytes_pinned, etc, or adds to the
  77. * space_info->total_bytes. This loops through the ->priority_tickets and
  78. * then the ->tickets list checking to see if the reservation can be
  79. * completed. If it can the space is added to space_info->bytes_may_use and
  80. * the ticket is woken up.
  81. *
  82. * -> ticket wakeup
  83. * Check if ->bytes == 0, if it does we got our reservation and we can carry
  84. * on, if not return the appropriate error (ENOSPC, but can be EINTR if we
  85. * were interrupted.)
  86. *
  87. * MAKING RESERVATIONS, FLUSHING HIGH PRIORITY
  88. *
  89. * Same as the above, except we add ourselves to the
  90. * space_info->priority_tickets, and we do not use ticket->wait, we simply
  91. * call flush_space() ourselves for the states that are safe for us to call
  92. * without deadlocking and hope for the best.
  93. *
  94. * THE FLUSHING STATES
  95. *
  96. * Generally speaking we will have two cases for each state, a "nice" state
  97. * and an "ALL THE THINGS" state. In btrfs we delay a lot of work in order to
  98. * reduce the locking overhead on the various trees, and even to keep from
  99. * doing any work at all in the case of delayed refs. Each of these delayed
  100. * things however hold reservations, and so letting them run allows us to
  101. * reclaim space so we can make new reservations.
  102. *
  103. * FLUSH_DELAYED_ITEMS
  104. * Every inode has a delayed item to update the inode. Take a simple write
  105. * for example, we would update the inode item at write time to update the
  106. * mtime, and then again at finish_ordered_io() time in order to update the
  107. * isize or bytes. We keep these delayed items to coalesce these operations
  108. * into a single operation done on demand. These are an easy way to reclaim
  109. * metadata space.
  110. *
  111. * FLUSH_DELALLOC
  112. * Look at the delalloc comment to get an idea of how much space is reserved
  113. * for delayed allocation. We can reclaim some of this space simply by
  114. * running delalloc, but usually we need to wait for ordered extents to
  115. * reclaim the bulk of this space.
  116. *
  117. * FLUSH_DELAYED_REFS
  118. * We have a block reserve for the outstanding delayed refs space, and every
  119. * delayed ref operation holds a reservation. Running these is a quick way
  120. * to reclaim space, but we want to hold this until the end because COW can
  121. * churn a lot and we can avoid making some extent tree modifications if we
  122. * are able to delay for as long as possible.
  123. *
  124. * ALLOC_CHUNK
  125. * We will skip this the first time through space reservation, because of
  126. * overcommit: we don't want to have a lot of useless metadata space when
  127. * our worst case reservations will likely never come true.
  128. *
  129. * RUN_DELAYED_IPUTS
  130. * If we're freeing inodes we're likely freeing checksums, file extent
  131. * items, and extent tree items. Loads of space could be freed up by these
  132. * operations, however they won't be usable until the transaction commits.
  133. *
  134. * COMMIT_TRANS
  135. * This will commit the transaction. Historically we had a lot of logic
  136. * surrounding whether or not we'd commit the transaction, but this was born
  137. * out of a pre-tickets era where we could end up committing the transaction
  138. * thousands of times in a row without making progress. Now thanks to our
  139. * ticketing system we know if we're not making progress and can error
  140. * everybody out after a few commits rather than burning the disk hoping for
  141. * a different answer.
  142. *
  143. * OVERCOMMIT
  144. *
  145. * Because we hold so many reservations for metadata we will allow you to
  146. * reserve more space than is currently free in the currently allocated
  147. * metadata space. This only happens with metadata, data does not allow
  148. * overcommitting.
  149. *
  150. * You can see the current logic for when we allow overcommit in
  151. * btrfs_can_overcommit(), but it only applies to unallocated space. If there
  152. * is no unallocated space to be had, all reservations are kept within the
  153. * free space in the allocated metadata chunks.
  154. *
  155. * Because of overcommitting, you generally want to use the
  156. * btrfs_can_overcommit() logic for metadata allocations, as it does the right
  157. * thing with or without extra unallocated space.
  158. */
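/*
 * Illustrative walk-through of the "normal case" above (a worked example,
 * not part of the code itself): reserving and then consuming 16KiB of
 * metadata for a single new tree block, assuming the reservation is used
 * in full by one 16KiB extent.
 *
 *	btrfs_reserve_metadata_bytes(..., 16K, ...)
 *		bytes_may_use:    0 -> 16K
 *	btrfs_add_reserved_bytes(...)
 *		bytes_may_use:  16K ->  0
 *		bytes_reserved:   0 -> 16K
 *	btrfs_update_block_group(...)
 *		bytes_reserved: 16K ->  0
 *		bytes_used:       0 -> 16K
 */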
  159. u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info,
  160. bool may_use_included)
  161. {
  162. ASSERT(s_info);
  163. return s_info->bytes_used + s_info->bytes_reserved +
  164. s_info->bytes_pinned + s_info->bytes_readonly +
  165. s_info->bytes_zone_unusable +
  166. (may_use_included ? s_info->bytes_may_use : 0);
  167. }
  168. /*
  169. * after adding space to the filesystem, we need to clear the full flags
  170. * on all the space infos.
  171. */
  172. void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
  173. {
  174. struct list_head *head = &info->space_info;
  175. struct btrfs_space_info *found;
  176. list_for_each_entry(found, head, list)
  177. found->full = 0;
  178. }
  179. /*
  180. * Block groups with more than this value (percents) of unusable space will be
  181. * scheduled for background reclaim.
  182. */
  183. #define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
  184. /*
  185. * Calculate chunk size depending on volume type (regular or zoned).
  186. */
  187. static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags)
  188. {
  189. if (btrfs_is_zoned(fs_info))
  190. return fs_info->zone_size;
  191. ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
  192. if (flags & BTRFS_BLOCK_GROUP_DATA)
  193. return BTRFS_MAX_DATA_CHUNK_SIZE;
  194. else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
  195. return SZ_32M;
  196. /* Handle BTRFS_BLOCK_GROUP_METADATA */
  197. if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G)
  198. return SZ_1G;
  199. return SZ_256M;
  200. }
  201. /*
  202. * Update default chunk size.
  203. */
  204. void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info,
  205. u64 chunk_size)
  206. {
  207. WRITE_ONCE(space_info->chunk_size, chunk_size);
  208. }
  209. static int create_space_info(struct btrfs_fs_info *info, u64 flags)
  210. {
  211. struct btrfs_space_info *space_info;
  212. int i;
  213. int ret;
  214. space_info = kzalloc(sizeof(*space_info), GFP_NOFS);
  215. if (!space_info)
  216. return -ENOMEM;
  217. for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
  218. INIT_LIST_HEAD(&space_info->block_groups[i]);
  219. init_rwsem(&space_info->groups_sem);
  220. spin_lock_init(&space_info->lock);
  221. space_info->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
  222. space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
  223. INIT_LIST_HEAD(&space_info->ro_bgs);
  224. INIT_LIST_HEAD(&space_info->tickets);
  225. INIT_LIST_HEAD(&space_info->priority_tickets);
  226. space_info->clamp = 1;
  227. btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags));
  228. if (btrfs_is_zoned(info))
  229. space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
  230. ret = btrfs_sysfs_add_space_info_type(info, space_info);
  231. if (ret)
  232. return ret;
  233. list_add(&space_info->list, &info->space_info);
  234. if (flags & BTRFS_BLOCK_GROUP_DATA)
  235. info->data_sinfo = space_info;
  236. return ret;
  237. }
  238. int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
  239. {
  240. struct btrfs_super_block *disk_super;
  241. u64 features;
  242. u64 flags;
  243. int mixed = 0;
  244. int ret;
  245. disk_super = fs_info->super_copy;
  246. if (!btrfs_super_root(disk_super))
  247. return -EINVAL;
  248. features = btrfs_super_incompat_flags(disk_super);
  249. if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
  250. mixed = 1;
  251. flags = BTRFS_BLOCK_GROUP_SYSTEM;
  252. ret = create_space_info(fs_info, flags);
  253. if (ret)
  254. goto out;
  255. if (mixed) {
  256. flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
  257. ret = create_space_info(fs_info, flags);
  258. } else {
  259. flags = BTRFS_BLOCK_GROUP_METADATA;
  260. ret = create_space_info(fs_info, flags);
  261. if (ret)
  262. goto out;
  263. flags = BTRFS_BLOCK_GROUP_DATA;
  264. ret = create_space_info(fs_info, flags);
  265. }
  266. out:
  267. return ret;
  268. }
  269. void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info,
  270. struct btrfs_block_group *block_group)
  271. {
  272. struct btrfs_space_info *found;
  273. int factor, index;
  274. factor = btrfs_bg_type_to_factor(block_group->flags);
  275. found = btrfs_find_space_info(info, block_group->flags);
  276. ASSERT(found);
  277. spin_lock(&found->lock);
  278. found->total_bytes += block_group->length;
  279. if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
  280. found->active_total_bytes += block_group->length;
  281. found->disk_total += block_group->length * factor;
  282. found->bytes_used += block_group->used;
  283. found->disk_used += block_group->used * factor;
  284. found->bytes_readonly += block_group->bytes_super;
  285. found->bytes_zone_unusable += block_group->zone_unusable;
  286. if (block_group->length > 0)
  287. found->full = 0;
  288. btrfs_try_granting_tickets(info, found);
  289. spin_unlock(&found->lock);
  290. block_group->space_info = found;
  291. index = btrfs_bg_flags_to_raid_index(block_group->flags);
  292. down_write(&found->groups_sem);
  293. list_add_tail(&block_group->list, &found->block_groups[index]);
  294. up_write(&found->groups_sem);
  295. }
  296. struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info,
  297. u64 flags)
  298. {
  299. struct list_head *head = &info->space_info;
  300. struct btrfs_space_info *found;
  301. flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
  302. list_for_each_entry(found, head, list) {
  303. if (found->flags & flags)
  304. return found;
  305. }
  306. return NULL;
  307. }
  308. static u64 calc_available_free_space(struct btrfs_fs_info *fs_info,
  309. struct btrfs_space_info *space_info,
  310. enum btrfs_reserve_flush_enum flush)
  311. {
  312. u64 profile;
  313. u64 avail;
  314. int factor;
  315. if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
  316. profile = btrfs_system_alloc_profile(fs_info);
  317. else
  318. profile = btrfs_metadata_alloc_profile(fs_info);
  319. avail = atomic64_read(&fs_info->free_chunk_space);
  320. /*
  321. * If we have dup, raid1 or raid10 then only half of the free
  322. * space is actually usable. For raid56, the space info used
  323. * doesn't include the parity drive, so we don't have to
  324. * change the math
  325. */
  326. factor = btrfs_bg_type_to_factor(profile);
  327. avail = div_u64(avail, factor);
  328. /*
  329. * If we aren't flushing all things, let us overcommit up to
  330. * 1/2 of the space. If we can flush, don't let us overcommit
  331. * too much, let it overcommit up to 1/8 of the space.
  332. */
  333. if (flush == BTRFS_RESERVE_FLUSH_ALL)
  334. avail >>= 3;
  335. else
  336. avail >>= 1;
  337. return avail;
  338. }
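/*
 * Worked example for calc_available_free_space() (illustrative numbers,
 * assuming a RAID1 metadata profile): with 10GiB of unallocated space,
 * the RAID1 factor of 2 leaves ~5GiB that could actually become metadata
 * chunks; a BTRFS_RESERVE_FLUSH_ALL caller is then allowed to count only
 * 1/8 of that (~640MiB) towards overcommit, any other flush level 1/2
 * (~2.5GiB).
 */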
  339. static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info,
  340. struct btrfs_space_info *space_info)
  341. {
  342. /*
  343. * On a regular filesystem, all total_bytes are always writable. On a zoned
  344. * filesystem, there may be a limitation imposed by max_active_zones.
  345. * For metadata allocation, we cannot finish an existing active block
  346. * group to avoid a deadlock. Thus, we need to consider only the active
  347. * groups to be writable for metadata space.
  348. */
  349. if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA))
  350. return space_info->total_bytes;
  351. return space_info->active_total_bytes;
  352. }
  353. int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
  354. struct btrfs_space_info *space_info, u64 bytes,
  355. enum btrfs_reserve_flush_enum flush)
  356. {
  357. u64 avail;
  358. u64 used;
  359. /* Don't overcommit for data, including mixed block groups */
  360. if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
  361. return 0;
  362. used = btrfs_space_info_used(space_info, true);
  363. avail = calc_available_free_space(fs_info, space_info, flush);
  364. if (used + bytes < writable_total_bytes(fs_info, space_info) + avail)
  365. return 1;
  366. return 0;
  367. }
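/*
 * Illustrative numbers for btrfs_can_overcommit(): with writable
 * total_bytes = 1GiB, used = 900MiB and avail = 256MiB from the helper
 * above, a 200MiB metadata reservation is allowed (900M + 200M < 1G + 256M)
 * while a 400MiB one is not. Data reservations never overcommit.
 */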
  368. static void remove_ticket(struct btrfs_space_info *space_info,
  369. struct reserve_ticket *ticket)
  370. {
  371. if (!list_empty(&ticket->list)) {
  372. list_del_init(&ticket->list);
  373. ASSERT(space_info->reclaim_size >= ticket->bytes);
  374. space_info->reclaim_size -= ticket->bytes;
  375. }
  376. }
  377. /*
  378. * This is for space we already have accounted in space_info->bytes_may_use, so
  379. * basically when we're returning space from block_rsv's.
  380. */
  381. void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
  382. struct btrfs_space_info *space_info)
  383. {
  384. struct list_head *head;
  385. enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_NO_FLUSH;
  386. lockdep_assert_held(&space_info->lock);
  387. head = &space_info->priority_tickets;
  388. again:
  389. while (!list_empty(head)) {
  390. struct reserve_ticket *ticket;
  391. u64 used = btrfs_space_info_used(space_info, true);
  392. ticket = list_first_entry(head, struct reserve_ticket, list);
  393. /* Check and see if our ticket can be satisfied now. */
  394. if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) ||
  395. btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
  396. flush)) {
  397. btrfs_space_info_update_bytes_may_use(fs_info,
  398. space_info,
  399. ticket->bytes);
  400. remove_ticket(space_info, ticket);
  401. ticket->bytes = 0;
  402. space_info->tickets_id++;
  403. wake_up(&ticket->wait);
  404. } else {
  405. break;
  406. }
  407. }
  408. if (head == &space_info->priority_tickets) {
  409. head = &space_info->tickets;
  410. flush = BTRFS_RESERVE_FLUSH_ALL;
  411. goto again;
  412. }
  413. }
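/*
 * Note on the flush levels used above (illustrative): priority tickets are
 * checked with BTRFS_RESERVE_NO_FLUSH, which lets them overcommit against
 * up to half of the unallocated space, while the regular ticket list is
 * checked with BTRFS_RESERVE_FLUSH_ALL and only gets the more conservative
 * 1/8 (see calc_available_free_space()).
 */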
  414. #define DUMP_BLOCK_RSV(fs_info, rsv_name) \
  415. do { \
  416. struct btrfs_block_rsv *__rsv = &(fs_info)->rsv_name; \
  417. spin_lock(&__rsv->lock); \
  418. btrfs_info(fs_info, #rsv_name ": size %llu reserved %llu", \
  419. __rsv->size, __rsv->reserved); \
  420. spin_unlock(&__rsv->lock); \
  421. } while (0)
  422. static const char *space_info_flag_to_str(const struct btrfs_space_info *space_info)
  423. {
  424. switch (space_info->flags) {
  425. case BTRFS_BLOCK_GROUP_SYSTEM:
  426. return "SYSTEM";
  427. case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA:
  428. return "DATA+METADATA";
  429. case BTRFS_BLOCK_GROUP_DATA:
  430. return "DATA";
  431. case BTRFS_BLOCK_GROUP_METADATA:
  432. return "METADATA";
  433. default:
  434. return "UNKNOWN";
  435. }
  436. }
  437. static void dump_global_block_rsv(struct btrfs_fs_info *fs_info)
  438. {
  439. DUMP_BLOCK_RSV(fs_info, global_block_rsv);
  440. DUMP_BLOCK_RSV(fs_info, trans_block_rsv);
  441. DUMP_BLOCK_RSV(fs_info, chunk_block_rsv);
  442. DUMP_BLOCK_RSV(fs_info, delayed_block_rsv);
  443. DUMP_BLOCK_RSV(fs_info, delayed_refs_rsv);
  444. }
  445. static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
  446. struct btrfs_space_info *info)
  447. {
  448. const char *flag_str = space_info_flag_to_str(info);
  449. lockdep_assert_held(&info->lock);
  450. /* The free space could be negative in case of overcommit */
  451. btrfs_info(fs_info, "space_info %s has %lld free, is %sfull",
  452. flag_str,
  453. (s64)(info->total_bytes - btrfs_space_info_used(info, true)),
  454. info->full ? "" : "not ");
  455. btrfs_info(fs_info,
  456. "space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu zone_unusable=%llu",
  457. info->total_bytes, info->bytes_used, info->bytes_pinned,
  458. info->bytes_reserved, info->bytes_may_use,
  459. info->bytes_readonly, info->bytes_zone_unusable);
  460. }
  461. void btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
  462. struct btrfs_space_info *info, u64 bytes,
  463. int dump_block_groups)
  464. {
  465. struct btrfs_block_group *cache;
  466. int index = 0;
  467. spin_lock(&info->lock);
  468. __btrfs_dump_space_info(fs_info, info);
  469. dump_global_block_rsv(fs_info);
  470. spin_unlock(&info->lock);
  471. if (!dump_block_groups)
  472. return;
  473. down_read(&info->groups_sem);
  474. again:
  475. list_for_each_entry(cache, &info->block_groups[index], list) {
  476. spin_lock(&cache->lock);
  477. btrfs_info(fs_info,
  478. "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %llu zone_unusable %s",
  479. cache->start, cache->length, cache->used, cache->pinned,
  480. cache->reserved, cache->zone_unusable,
  481. cache->ro ? "[readonly]" : "");
  482. spin_unlock(&cache->lock);
  483. btrfs_dump_free_space(cache, bytes);
  484. }
  485. if (++index < BTRFS_NR_RAID_TYPES)
  486. goto again;
  487. up_read(&info->groups_sem);
  488. }
  489. static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info,
  490. u64 to_reclaim)
  491. {
  492. u64 bytes;
  493. u64 nr;
  494. bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
  495. nr = div64_u64(to_reclaim, bytes);
  496. if (!nr)
  497. nr = 1;
  498. return nr;
  499. }
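/*
 * Worked example for calc_reclaim_items_nr(), assuming the default 16KiB
 * nodesize: the worst-case insert size for one item is
 * 2 * BTRFS_MAX_LEVEL * 16KiB = 256KiB, so a request to reclaim 4MiB maps
 * to 16 items.
 */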
  500. #define EXTENT_SIZE_PER_ITEM SZ_256K
  501. /*
  502. * shrink metadata reservation for delalloc
  503. */
  504. static void shrink_delalloc(struct btrfs_fs_info *fs_info,
  505. struct btrfs_space_info *space_info,
  506. u64 to_reclaim, bool wait_ordered,
  507. bool for_preempt)
  508. {
  509. struct btrfs_trans_handle *trans;
  510. u64 delalloc_bytes;
  511. u64 ordered_bytes;
  512. u64 items;
  513. long time_left;
  514. int loops;
  515. delalloc_bytes = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
  516. ordered_bytes = percpu_counter_sum_positive(&fs_info->ordered_bytes);
  517. if (delalloc_bytes == 0 && ordered_bytes == 0)
  518. return;
  519. /* Calc the number of pages we need to flush for space reservation */
  520. if (to_reclaim == U64_MAX) {
  521. items = U64_MAX;
  522. } else {
  523. /*
  524. * to_reclaim is set to however much metadata we need to
  525. * reclaim, but reclaiming that much data doesn't really track
  526. * exactly. What we really want to do is reclaim full inode's
  527. * worth of reservations, however that's not available to us
  528. * here. We will take a fraction of the delalloc bytes for our
  529. * flushing loops and hope for the best. Delalloc will expand
  530. * the amount we write to cover an entire dirty extent, which
  531. * will reclaim the metadata reservation for that range. If
  532. * it's not enough subsequent flush stages will be more
  533. * aggressive.
  534. */
  535. to_reclaim = max(to_reclaim, delalloc_bytes >> 3);
  536. items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
  537. }
  538. trans = current->journal_info;
  539. /*
  540. * If we are doing more ordered than delalloc we need to just wait on
  541. * ordered extents, otherwise we'll waste time trying to flush delalloc
  542. * that likely won't give us the space back we need.
  543. */
  544. if (ordered_bytes > delalloc_bytes && !for_preempt)
  545. wait_ordered = true;
  546. loops = 0;
  547. while ((delalloc_bytes || ordered_bytes) && loops < 3) {
  548. u64 temp = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT;
  549. long nr_pages = min_t(u64, temp, LONG_MAX);
  550. int async_pages;
  551. btrfs_start_delalloc_roots(fs_info, nr_pages, true);
  552. /*
  553. * We need to make sure any outstanding async pages are now
  554. * processed before we continue. This is because things like
  555. * sync_inode() try to be smart and skip writing if the inode is
  556. * marked clean. We don't use filemap_fdatawrite() for flushing
  557. * because we want to control how many pages we write out at a
  558. * time, thus this is the only safe way to make sure we've
  559. * waited for outstanding compressed workers to have started
  560. * their jobs and thus have ordered extents set up properly.
  561. *
  562. * This exists because we do not want to wait for each
  563. * individual inode to finish its async work, we simply want to
  564. * start the IO on everybody, and then come back here and wait
  565. * for all of the async work to catch up. Once we're done with
  566. * that we know we'll have ordered extents for everything and we
  567. * can decide if we wait for that or not.
  568. *
  569. * If we choose to replace this in the future, make absolutely
  570. * sure that the proper waiting is being done in the async case,
  571. * as there have been bugs in that area before.
  572. */
  573. async_pages = atomic_read(&fs_info->async_delalloc_pages);
  574. if (!async_pages)
  575. goto skip_async;
  576. /*
  577. * We don't want to wait forever, if we wrote fewer pages in this
  578. * loop than we have outstanding, only wait for that number of
  579. * pages, otherwise we can wait for all async pages to finish
  580. * before continuing.
  581. */
  582. if (async_pages > nr_pages)
  583. async_pages -= nr_pages;
  584. else
  585. async_pages = 0;
  586. wait_event(fs_info->async_submit_wait,
  587. atomic_read(&fs_info->async_delalloc_pages) <=
  588. async_pages);
  589. skip_async:
  590. loops++;
  591. if (wait_ordered && !trans) {
  592. btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1);
  593. } else {
  594. time_left = schedule_timeout_killable(1);
  595. if (time_left)
  596. break;
  597. }
  598. /*
  599. * If we are flushing for preemption we just want a one-shot of delalloc
  600. * flushing so we can stop flushing if we decide we don't need
  601. * to anymore.
  602. */
  603. if (for_preempt)
  604. break;
  605. spin_lock(&space_info->lock);
  606. if (list_empty(&space_info->tickets) &&
  607. list_empty(&space_info->priority_tickets)) {
  608. spin_unlock(&space_info->lock);
  609. break;
  610. }
  611. spin_unlock(&space_info->lock);
  612. delalloc_bytes = percpu_counter_sum_positive(
  613. &fs_info->delalloc_bytes);
  614. ordered_bytes = percpu_counter_sum_positive(
  615. &fs_info->ordered_bytes);
  616. }
  617. }
  618. /*
  619. * Try to flush some data based on policy set by @state. This is only advisory
  620. * and may fail for various reasons. The caller is supposed to examine the
  621. * state of @space_info to detect the outcome.
  622. */
  623. static void flush_space(struct btrfs_fs_info *fs_info,
  624. struct btrfs_space_info *space_info, u64 num_bytes,
  625. enum btrfs_flush_state state, bool for_preempt)
  626. {
  627. struct btrfs_root *root = fs_info->tree_root;
  628. struct btrfs_trans_handle *trans;
  629. int nr;
  630. int ret = 0;
  631. switch (state) {
  632. case FLUSH_DELAYED_ITEMS_NR:
  633. case FLUSH_DELAYED_ITEMS:
  634. if (state == FLUSH_DELAYED_ITEMS_NR)
  635. nr = calc_reclaim_items_nr(fs_info, num_bytes) * 2;
  636. else
  637. nr = -1;
  638. trans = btrfs_join_transaction(root);
  639. if (IS_ERR(trans)) {
  640. ret = PTR_ERR(trans);
  641. break;
  642. }
  643. ret = btrfs_run_delayed_items_nr(trans, nr);
  644. btrfs_end_transaction(trans);
  645. break;
  646. case FLUSH_DELALLOC:
  647. case FLUSH_DELALLOC_WAIT:
  648. case FLUSH_DELALLOC_FULL:
  649. if (state == FLUSH_DELALLOC_FULL)
  650. num_bytes = U64_MAX;
  651. shrink_delalloc(fs_info, space_info, num_bytes,
  652. state != FLUSH_DELALLOC, for_preempt);
  653. break;
  654. case FLUSH_DELAYED_REFS_NR:
  655. case FLUSH_DELAYED_REFS:
  656. trans = btrfs_join_transaction(root);
  657. if (IS_ERR(trans)) {
  658. ret = PTR_ERR(trans);
  659. break;
  660. }
  661. if (state == FLUSH_DELAYED_REFS_NR)
  662. nr = calc_reclaim_items_nr(fs_info, num_bytes);
  663. else
  664. nr = 0;
  665. btrfs_run_delayed_refs(trans, nr);
  666. btrfs_end_transaction(trans);
  667. break;
  668. case ALLOC_CHUNK:
  669. case ALLOC_CHUNK_FORCE:
  670. /*
  671. * For metadata space on a zoned filesystem, reaching here means we
  672. * don't have enough space left in active_total_bytes. Try to
  673. * activate a block group first, because we may have an inactive
  674. * block group already allocated.
  675. */
  676. ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false);
  677. if (ret < 0)
  678. break;
  679. else if (ret == 1)
  680. break;
  681. trans = btrfs_join_transaction(root);
  682. if (IS_ERR(trans)) {
  683. ret = PTR_ERR(trans);
  684. break;
  685. }
  686. ret = btrfs_chunk_alloc(trans,
  687. btrfs_get_alloc_profile(fs_info, space_info->flags),
  688. (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE :
  689. CHUNK_ALLOC_FORCE);
  690. btrfs_end_transaction(trans);
  691. /*
  692. * For metadata space on a zoned filesystem, allocating a new chunk
  693. * is not enough. We still need to activate the block group.
  694. * Activate the newly allocated block group by (maybe) finishing
  695. * a block group.
  696. */
  697. if (ret == 1) {
  698. ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true);
  699. /*
  700. * Revert to the original ret regardless of whether we could finish
  701. * one block group or not.
  702. */
  703. if (ret >= 0)
  704. ret = 1;
  705. }
  706. if (ret > 0 || ret == -ENOSPC)
  707. ret = 0;
  708. break;
  709. case RUN_DELAYED_IPUTS:
  710. /*
  711. * If we have pending delayed iputs then we could free up a
  712. * bunch of pinned space, so make sure we run the iputs before
  713. * we do our pinned bytes check below.
  714. */
  715. btrfs_run_delayed_iputs(fs_info);
  716. btrfs_wait_on_delayed_iputs(fs_info);
  717. break;
  718. case COMMIT_TRANS:
  719. ASSERT(current->journal_info == NULL);
  720. trans = btrfs_join_transaction(root);
  721. if (IS_ERR(trans)) {
  722. ret = PTR_ERR(trans);
  723. break;
  724. }
  725. ret = btrfs_commit_transaction(trans);
  726. break;
  727. default:
  728. ret = -ENOSPC;
  729. break;
  730. }
  731. trace_btrfs_flush_space(fs_info, space_info->flags, num_bytes, state,
  732. ret, for_preempt);
  733. return;
  734. }
  735. static inline u64
  736. btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
  737. struct btrfs_space_info *space_info)
  738. {
  739. u64 used;
  740. u64 avail;
  741. u64 total;
  742. u64 to_reclaim = space_info->reclaim_size;
  743. lockdep_assert_held(&space_info->lock);
  744. avail = calc_available_free_space(fs_info, space_info,
  745. BTRFS_RESERVE_FLUSH_ALL);
  746. used = btrfs_space_info_used(space_info, true);
  747. /*
  748. * We may be flushing because suddenly we have less space than we had
  749. * before, and now we're well over-committed based on our current free
  750. * space. If that's the case add in our overage so we make sure to put
  751. * appropriate pressure on the flushing state machine.
  752. */
  753. total = writable_total_bytes(fs_info, space_info);
  754. if (total + avail < used)
  755. to_reclaim += used - (total + avail);
  756. return to_reclaim;
  757. }
  758. static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
  759. struct btrfs_space_info *space_info)
  760. {
  761. u64 global_rsv_size = fs_info->global_block_rsv.reserved;
  762. u64 ordered, delalloc;
  763. u64 total = writable_total_bytes(fs_info, space_info);
  764. u64 thresh;
  765. u64 used;
  766. thresh = div_factor_fine(total, 90);
  767. lockdep_assert_held(&space_info->lock);
  768. /* If we're just plain full then async reclaim just slows us down. */
  769. if ((space_info->bytes_used + space_info->bytes_reserved +
  770. global_rsv_size) >= thresh)
  771. return false;
  772. used = space_info->bytes_may_use + space_info->bytes_pinned;
  773. /* The total flushable belongs to the global rsv, don't flush. */
  774. if (global_rsv_size >= used)
  775. return false;
  776. /*
  777. * 128MiB is 1/4 of the maximum global rsv size. If we have less than
  778. * that devoted to other reservations then there's no sense in flushing,
  779. * we don't have a lot of things that need flushing.
  780. */
  781. if (used - global_rsv_size <= SZ_128M)
  782. return false;
  783. /*
  784. * We have tickets queued, bail so we don't compete with the async
  785. * flushers.
  786. */
  787. if (space_info->reclaim_size)
  788. return false;
  789. /*
  790. * If we have over half of the free space occupied by reservations or
  791. * pinned then we want to start flushing.
  792. *
  793. * We do not do the traditional thing here, which is to say
  794. *
  795. * if (used >= ((total_bytes + avail) / 2))
  796. * return 1;
  797. *
  798. * because this doesn't quite work how we want. If we had more than 50%
  799. * of the space_info used by bytes_used and we had 0 available we'd just
  800. * constantly run the background flusher. Instead we want it to kick in
  801. * if our reclaimable space exceeds our clamped free space.
  802. *
  803. * Our clamping range is 2^1 -> 2^8. Practically speaking that means
  804. * the following:
  805. *
  806. * Amount of RAM Minimum threshold Maximum threshold
  807. *
  808. * 256GiB 1GiB 128GiB
  809. * 128GiB 512MiB 64GiB
  810. * 64GiB 256MiB 32GiB
  811. * 32GiB 128MiB 16GiB
  812. * 16GiB 64MiB 8GiB
  813. *
  814. * These are the range our thresholds will fall in, corresponding to how
  815. * much delalloc we need for the background flusher to kick in.
  816. */
  817. thresh = calc_available_free_space(fs_info, space_info,
  818. BTRFS_RESERVE_FLUSH_ALL);
  819. used = space_info->bytes_used + space_info->bytes_reserved +
  820. space_info->bytes_readonly + global_rsv_size;
  821. if (used < total)
  822. thresh += total - used;
  823. thresh >>= space_info->clamp;
  824. used = space_info->bytes_pinned;
  825. /*
  826. * If we have more ordered bytes than delalloc bytes then we're either
  827. * doing a lot of DIO, or we simply don't have a lot of delalloc waiting
  828. * around. Preemptive flushing is only useful in that it can free up
  829. * space before tickets need to wait for things to finish. In the case
  830. * of ordered extents, preemptively waiting on ordered extents gets us
  831. * nothing, if our reservations are tied up in ordered extents we'll
  832. * simply have to slow down writers by forcing them to wait on ordered
  833. * extents.
  834. *
  835. * In the case that ordered is larger than delalloc, only include the
  836. * block reserves that we would actually be able to directly reclaim
  837. * from. In this case if we're heavy on metadata operations this will
  838. * clearly be heavy enough to warrant preemptive flushing. In the case
  839. * of heavy DIO or ordered reservations, preemptive flushing will just
  840. * waste time and cause us to slow down.
  841. *
  842. * We want to make sure we truly are maxed out on ordered however, so
  843. * cut ordered in half, and if it's still higher than delalloc then we
  844. * can keep flushing. This is to avoid the case where we start
  845. * flushing, and now delalloc == ordered and we stop preemptively
  846. * flushing when we could still have several gigs of delalloc to flush.
  847. */
  848. ordered = percpu_counter_read_positive(&fs_info->ordered_bytes) >> 1;
  849. delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
  850. if (ordered >= delalloc)
  851. used += fs_info->delayed_refs_rsv.reserved +
  852. fs_info->delayed_block_rsv.reserved;
  853. else
  854. used += space_info->bytes_may_use - global_rsv_size;
  855. return (used >= thresh && !btrfs_fs_closing(fs_info) &&
  856. !test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
  857. }
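/*
 * Illustrative numbers for the clamp logic in need_preemptive_reclaim():
 * with roughly 64GiB of effectively free space and clamp = 3, preemptive
 * flushing kicks in once the reclaimable bytes (pinned plus either the
 * non-global bytes_may_use or the delayed refs/items reserves, depending
 * on the ordered-vs-delalloc check) exceed 64GiB >> 3 = 8GiB.
 */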
  858. static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
  859. struct btrfs_space_info *space_info,
  860. struct reserve_ticket *ticket)
  861. {
  862. struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
  863. u64 min_bytes;
  864. if (!ticket->steal)
  865. return false;
  866. if (global_rsv->space_info != space_info)
  867. return false;
  868. spin_lock(&global_rsv->lock);
  869. min_bytes = div_factor(global_rsv->size, 1);
  870. if (global_rsv->reserved < min_bytes + ticket->bytes) {
  871. spin_unlock(&global_rsv->lock);
  872. return false;
  873. }
  874. global_rsv->reserved -= ticket->bytes;
  875. remove_ticket(space_info, ticket);
  876. ticket->bytes = 0;
  877. wake_up(&ticket->wait);
  878. space_info->tickets_id++;
  879. if (global_rsv->reserved < global_rsv->size)
  880. global_rsv->full = 0;
  881. spin_unlock(&global_rsv->lock);
  882. return true;
  883. }
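/*
 * Example for steal_from_global_rsv() (hypothetical numbers): the ticket
 * must have ->steal set and belong to the global rsv's space_info, and we
 * always keep 10% of the rsv size as a floor. With a 512MiB global rsv
 * holding 400MiB reserved, a 300MiB ticket is granted (leaving 100MiB),
 * while a 360MiB ticket is refused because 400MiB < 51MiB + 360MiB.
 */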
  884. /*
  885. * maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
  886. * @fs_info - fs_info for this fs
  887. * @space_info - the space info we were flushing
  888. *
  889. * We call this when we've exhausted our flushing ability and haven't made
  890. * progress in satisfying tickets. The reservation code handles tickets in
  891. * order, so if there is a large ticket first and then smaller ones we could
  892. * very well satisfy the smaller tickets. This will attempt to wake up any
  893. * tickets in the list to catch this case.
  894. *
  895. * This function returns true if it was able to make progress by clearing out
  896. * other tickets, or if it stumbles across a ticket that was smaller than the
  897. * first ticket.
  898. */
  899. static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
  900. struct btrfs_space_info *space_info)
  901. {
  902. struct reserve_ticket *ticket;
  903. u64 tickets_id = space_info->tickets_id;
  904. const bool aborted = BTRFS_FS_ERROR(fs_info);
  905. trace_btrfs_fail_all_tickets(fs_info, space_info);
  906. if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
  907. btrfs_info(fs_info, "cannot satisfy tickets, dumping space info");
  908. __btrfs_dump_space_info(fs_info, space_info);
  909. }
  910. while (!list_empty(&space_info->tickets) &&
  911. tickets_id == space_info->tickets_id) {
  912. ticket = list_first_entry(&space_info->tickets,
  913. struct reserve_ticket, list);
  914. if (!aborted && steal_from_global_rsv(fs_info, space_info, ticket))
  915. return true;
  916. if (!aborted && btrfs_test_opt(fs_info, ENOSPC_DEBUG))
  917. btrfs_info(fs_info, "failing ticket with %llu bytes",
  918. ticket->bytes);
  919. remove_ticket(space_info, ticket);
  920. if (aborted)
  921. ticket->error = -EIO;
  922. else
  923. ticket->error = -ENOSPC;
  924. wake_up(&ticket->wait);
  925. /*
  926. * We're just throwing tickets away, so more flushing may not
  927. * trip over btrfs_try_granting_tickets, so we need to call it
  928. * here to see if we can make progress with the next ticket in
  929. * the list.
  930. */
  931. if (!aborted)
  932. btrfs_try_granting_tickets(fs_info, space_info);
  933. }
  934. return (tickets_id != space_info->tickets_id);
  935. }
  936. /*
  937. * This is for normal flushers, we can wait all goddamned day if we want to. We
  938. * will loop and continuously try to flush as long as we are making progress.
  939. * We count progress as clearing off tickets each time we have to loop.
  940. */
  941. static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
  942. {
  943. struct btrfs_fs_info *fs_info;
  944. struct btrfs_space_info *space_info;
  945. u64 to_reclaim;
  946. enum btrfs_flush_state flush_state;
  947. int commit_cycles = 0;
  948. u64 last_tickets_id;
  949. fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
  950. space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
  951. spin_lock(&space_info->lock);
  952. to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
  953. if (!to_reclaim) {
  954. space_info->flush = 0;
  955. spin_unlock(&space_info->lock);
  956. return;
  957. }
  958. last_tickets_id = space_info->tickets_id;
  959. spin_unlock(&space_info->lock);
  960. flush_state = FLUSH_DELAYED_ITEMS_NR;
  961. do {
  962. flush_space(fs_info, space_info, to_reclaim, flush_state, false);
  963. spin_lock(&space_info->lock);
  964. if (list_empty(&space_info->tickets)) {
  965. space_info->flush = 0;
  966. spin_unlock(&space_info->lock);
  967. return;
  968. }
  969. to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
  970. space_info);
  971. if (last_tickets_id == space_info->tickets_id) {
  972. flush_state++;
  973. } else {
  974. last_tickets_id = space_info->tickets_id;
  975. flush_state = FLUSH_DELAYED_ITEMS_NR;
  976. if (commit_cycles)
  977. commit_cycles--;
  978. }
  979. /*
  980. * We do not want to empty the system of delalloc unless we're
  981. * under heavy pressure, so allow one trip through the flushing
  982. * logic before we start doing a FLUSH_DELALLOC_FULL.
  983. */
  984. if (flush_state == FLUSH_DELALLOC_FULL && !commit_cycles)
  985. flush_state++;
  986. /*
  987. * We don't want to force a chunk allocation until we've tried
  988. * pretty hard to reclaim space. Think of the case where we
  989. * freed up a bunch of space and so have a lot of pinned space
  990. * to reclaim. We would rather use that than possibly create an
  991. * underutilized metadata chunk. So if this is our first run
  992. * through the flushing state machine skip ALLOC_CHUNK_FORCE and
  993. * commit the transaction. If nothing has changed the next go
  994. * around then we can force a chunk allocation.
  995. */
  996. if (flush_state == ALLOC_CHUNK_FORCE && !commit_cycles)
  997. flush_state++;
  998. if (flush_state > COMMIT_TRANS) {
  999. commit_cycles++;
  1000. if (commit_cycles > 2) {
  1001. if (maybe_fail_all_tickets(fs_info, space_info)) {
  1002. flush_state = FLUSH_DELAYED_ITEMS_NR;
  1003. commit_cycles--;
  1004. } else {
  1005. space_info->flush = 0;
  1006. }
  1007. } else {
  1008. flush_state = FLUSH_DELAYED_ITEMS_NR;
  1009. }
  1010. }
  1011. spin_unlock(&space_info->lock);
  1012. } while (flush_state <= COMMIT_TRANS);
  1013. }
  1014. /*
  1015. * This handles pre-flushing of metadata space before we get to the point that
  1016. * we need to start blocking threads on tickets. The logic here is different
  1017. * from the other flush paths because it doesn't rely on tickets to tell us how
  1018. * much we need to flush, instead it attempts to keep us below the 80% full
  1019. * watermark of space by flushing whichever reservation pool is currently the
  1020. * largest.
  1021. */
  1022. static void btrfs_preempt_reclaim_metadata_space(struct work_struct *work)
  1023. {
  1024. struct btrfs_fs_info *fs_info;
  1025. struct btrfs_space_info *space_info;
  1026. struct btrfs_block_rsv *delayed_block_rsv;
  1027. struct btrfs_block_rsv *delayed_refs_rsv;
  1028. struct btrfs_block_rsv *global_rsv;
  1029. struct btrfs_block_rsv *trans_rsv;
  1030. int loops = 0;
  1031. fs_info = container_of(work, struct btrfs_fs_info,
  1032. preempt_reclaim_work);
  1033. space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
  1034. delayed_block_rsv = &fs_info->delayed_block_rsv;
  1035. delayed_refs_rsv = &fs_info->delayed_refs_rsv;
  1036. global_rsv = &fs_info->global_block_rsv;
  1037. trans_rsv = &fs_info->trans_block_rsv;
  1038. spin_lock(&space_info->lock);
  1039. while (need_preemptive_reclaim(fs_info, space_info)) {
  1040. enum btrfs_flush_state flush;
  1041. u64 delalloc_size = 0;
  1042. u64 to_reclaim, block_rsv_size;
  1043. u64 global_rsv_size = global_rsv->reserved;
  1044. loops++;
  1045. /*
  1046. * We don't have a precise counter for the metadata being
  1047. * reserved for delalloc, so we'll approximate it by subtracting
  1048. * out the block rsv's space from the bytes_may_use. If that
  1049. * amount is higher than the individual reserves, then we can
  1050. * assume it's tied up in delalloc reservations.
  1051. */
  1052. block_rsv_size = global_rsv_size +
  1053. delayed_block_rsv->reserved +
  1054. delayed_refs_rsv->reserved +
  1055. trans_rsv->reserved;
  1056. if (block_rsv_size < space_info->bytes_may_use)
  1057. delalloc_size = space_info->bytes_may_use - block_rsv_size;
  1058. /*
  1059. * We don't want to include the global_rsv in our calculation,
  1060. * because that's space we can't touch. Subtract it from the
  1061. * block_rsv_size for the next checks.
  1062. */
  1063. block_rsv_size -= global_rsv_size;
  1064. /*
  1065. * We really want to avoid flushing delalloc too much, as it
  1066. * could result in poor allocation patterns, so only flush it if
  1067. * it's larger than the rest of the pools combined.
  1068. */
  1069. if (delalloc_size > block_rsv_size) {
  1070. to_reclaim = delalloc_size;
  1071. flush = FLUSH_DELALLOC;
  1072. } else if (space_info->bytes_pinned >
  1073. (delayed_block_rsv->reserved +
  1074. delayed_refs_rsv->reserved)) {
  1075. to_reclaim = space_info->bytes_pinned;
  1076. flush = COMMIT_TRANS;
  1077. } else if (delayed_block_rsv->reserved >
  1078. delayed_refs_rsv->reserved) {
  1079. to_reclaim = delayed_block_rsv->reserved;
  1080. flush = FLUSH_DELAYED_ITEMS_NR;
  1081. } else {
  1082. to_reclaim = delayed_refs_rsv->reserved;
  1083. flush = FLUSH_DELAYED_REFS_NR;
  1084. }
  1085. spin_unlock(&space_info->lock);
  1086. /*
  1087. * We don't want to reclaim everything, just a portion, so scale
  1088. * down the to_reclaim by 1/4. If it takes us down to 0,
  1089. * reclaim 1 items worth.
  1090. */
  1091. to_reclaim >>= 2;
  1092. if (!to_reclaim)
  1093. to_reclaim = btrfs_calc_insert_metadata_size(fs_info, 1);
  1094. flush_space(fs_info, space_info, to_reclaim, flush, true);
  1095. cond_resched();
  1096. spin_lock(&space_info->lock);
  1097. }
  1098. /* We only went through once, back off our clamping. */
  1099. if (loops == 1 && !space_info->reclaim_size)
  1100. space_info->clamp = max(1, space_info->clamp - 1);
  1101. trace_btrfs_done_preemptive_reclaim(fs_info, space_info);
  1102. spin_unlock(&space_info->lock);
  1103. }
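/*
 * Example of the pool selection in the preemptive flusher (illustrative
 * numbers): if bytes_may_use is 3GiB and the global, delayed refs,
 * delayed items and trans rsvs together hold 1GiB, the remaining ~2GiB is
 * attributed to delalloc and is the largest pool, so we run FLUSH_DELALLOC
 * with to_reclaim = 2GiB >> 2 = 512MiB.
 */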
  1104. /*
  1105. * FLUSH_DELALLOC_WAIT:
  1106. * Space is freed from flushing delalloc in one of two ways.
  1107. *
  1108. * 1) compression is on and we allocate less space than we reserved
  1109. * 2) we are overwriting existing space
  1110. *
  1111. * For #1 that extra space is reclaimed as soon as the delalloc pages are
  1112. * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent
  1113. * length to ->bytes_reserved, and subtracts the reserved space from
  1114. * ->bytes_may_use.
  1115. *
  1116. * For #2 this is trickier. Once the ordered extent runs we will drop the
  1117. * extent in the range we are overwriting, which creates a delayed ref for
  1118. * that freed extent. This however is not reclaimed until the transaction
  1119. * commits, thus the next stages.
  1120. *
  1121. * RUN_DELAYED_IPUTS
  1122. * If we are freeing inodes, we want to make sure all delayed iputs have
  1123. * completed, because they could have been on an inode with i_nlink == 0, and
  1124. * thus have been truncated and freed up space. But again this space is not
  1125. * immediately re-usable, it comes in the form of a delayed ref, which must be
  1126. * run and then the transaction must be committed.
  1127. *
  1128. * COMMIT_TRANS
  1129. * This is where we reclaim all of the pinned space generated by running the
  1130. * iputs
  1131. *
  1132. * ALLOC_CHUNK_FORCE
  1133. * For data we start with alloc chunk force, however we could have been full
  1134. * before, and then the transaction commit could have freed new block groups,
  1135. * so if we now have space to allocate do the force chunk allocation.
  1136. */
  1137. static const enum btrfs_flush_state data_flush_states[] = {
  1138. FLUSH_DELALLOC_FULL,
  1139. RUN_DELAYED_IPUTS,
  1140. COMMIT_TRANS,
  1141. ALLOC_CHUNK_FORCE,
  1142. };
  1143. static void btrfs_async_reclaim_data_space(struct work_struct *work)
  1144. {
  1145. struct btrfs_fs_info *fs_info;
  1146. struct btrfs_space_info *space_info;
  1147. u64 last_tickets_id;
  1148. enum btrfs_flush_state flush_state = 0;
  1149. fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work);
  1150. space_info = fs_info->data_sinfo;
  1151. spin_lock(&space_info->lock);
  1152. if (list_empty(&space_info->tickets)) {
  1153. space_info->flush = 0;
  1154. spin_unlock(&space_info->lock);
  1155. return;
  1156. }
  1157. last_tickets_id = space_info->tickets_id;
  1158. spin_unlock(&space_info->lock);
  1159. while (!space_info->full) {
  1160. flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
  1161. spin_lock(&space_info->lock);
  1162. if (list_empty(&space_info->tickets)) {
  1163. space_info->flush = 0;
  1164. spin_unlock(&space_info->lock);
  1165. return;
  1166. }
  1167. /* Something happened, fail everything and bail. */
  1168. if (BTRFS_FS_ERROR(fs_info))
  1169. goto aborted_fs;
  1170. last_tickets_id = space_info->tickets_id;
  1171. spin_unlock(&space_info->lock);
  1172. }
  1173. while (flush_state < ARRAY_SIZE(data_flush_states)) {
  1174. flush_space(fs_info, space_info, U64_MAX,
  1175. data_flush_states[flush_state], false);
  1176. spin_lock(&space_info->lock);
  1177. if (list_empty(&space_info->tickets)) {
  1178. space_info->flush = 0;
  1179. spin_unlock(&space_info->lock);
  1180. return;
  1181. }
  1182. if (last_tickets_id == space_info->tickets_id) {
  1183. flush_state++;
  1184. } else {
  1185. last_tickets_id = space_info->tickets_id;
  1186. flush_state = 0;
  1187. }
  1188. if (flush_state >= ARRAY_SIZE(data_flush_states)) {
  1189. if (space_info->full) {
  1190. if (maybe_fail_all_tickets(fs_info, space_info))
  1191. flush_state = 0;
  1192. else
  1193. space_info->flush = 0;
  1194. } else {
  1195. flush_state = 0;
  1196. }
  1197. /* Something happened, fail everything and bail. */
  1198. if (BTRFS_FS_ERROR(fs_info))
  1199. goto aborted_fs;
  1200. }
  1201. spin_unlock(&space_info->lock);
  1202. }
  1203. return;
  1204. aborted_fs:
  1205. maybe_fail_all_tickets(fs_info, space_info);
  1206. space_info->flush = 0;
  1207. spin_unlock(&space_info->lock);
  1208. }
  1209. void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info)
  1210. {
  1211. INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space);
  1212. INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space);
  1213. INIT_WORK(&fs_info->preempt_reclaim_work,
  1214. btrfs_preempt_reclaim_metadata_space);
  1215. }
  1216. static const enum btrfs_flush_state priority_flush_states[] = {
  1217. FLUSH_DELAYED_ITEMS_NR,
  1218. FLUSH_DELAYED_ITEMS,
  1219. ALLOC_CHUNK,
  1220. };
  1221. static const enum btrfs_flush_state evict_flush_states[] = {
  1222. FLUSH_DELAYED_ITEMS_NR,
  1223. FLUSH_DELAYED_ITEMS,
  1224. FLUSH_DELAYED_REFS_NR,
  1225. FLUSH_DELAYED_REFS,
  1226. FLUSH_DELALLOC,
  1227. FLUSH_DELALLOC_WAIT,
  1228. FLUSH_DELALLOC_FULL,
  1229. ALLOC_CHUNK,
  1230. COMMIT_TRANS,
  1231. };
static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket,
				const enum btrfs_flush_state *states,
				int states_nr)
{
	u64 to_reclaim;
	int flush_state = 0;

	spin_lock(&space_info->lock);
	to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
	/*
	 * This is the priority reclaim path, so to_reclaim could be >0 still
	 * because we may have only satisfied the priority tickets and still
	 * left non priority tickets on the list. We would then have
	 * to_reclaim but ->bytes == 0.
	 */
	if (ticket->bytes == 0) {
		spin_unlock(&space_info->lock);
		return;
	}

	while (flush_state < states_nr) {
		spin_unlock(&space_info->lock);
		flush_space(fs_info, space_info, to_reclaim, states[flush_state],
			    false);
		flush_state++;
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
	}

	/* Attempt to steal from the global rsv if we can. */
	if (!steal_from_global_rsv(fs_info, space_info, ticket)) {
		ticket->error = -ENOSPC;
		remove_ticket(space_info, ticket);
	}

	/*
	 * We must run try_granting_tickets here because we could be a large
	 * ticket in front of a smaller ticket that can now be satisfied with
	 * the available space.
	 */
	btrfs_try_granting_tickets(fs_info, space_info);
	spin_unlock(&space_info->lock);
}
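
/*
 * Data variant of the priority reclaim path: force chunk allocations until
 * either the ticket is satisfied or the space_info is full, in which case the
 * ticket fails with -ENOSPC.
 */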
static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info,
					struct btrfs_space_info *space_info,
					struct reserve_ticket *ticket)
{
	spin_lock(&space_info->lock);

	/* We could have been granted before we got here. */
	if (ticket->bytes == 0) {
		spin_unlock(&space_info->lock);
		return;
	}

	while (!space_info->full) {
		spin_unlock(&space_info->lock);
		flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE, false);
		spin_lock(&space_info->lock);
		if (ticket->bytes == 0) {
			spin_unlock(&space_info->lock);
			return;
		}
	}

	ticket->error = -ENOSPC;
	remove_ticket(space_info, ticket);
	btrfs_try_granting_tickets(fs_info, space_info);
	spin_unlock(&space_info->lock);
}
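
/*
 * Sleep until the ticket is either satisfied (ticket->bytes reaches 0) or
 * marked with an error by the flushing code. The wait is killable; if a fatal
 * signal arrives the ticket is removed from its list and fails with -EINTR.
 */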
static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
				struct btrfs_space_info *space_info,
				struct reserve_ticket *ticket)
{
	DEFINE_WAIT(wait);
	int ret = 0;

	spin_lock(&space_info->lock);
	while (ticket->bytes > 0 && ticket->error == 0) {
		ret = prepare_to_wait_event(&ticket->wait, &wait, TASK_KILLABLE);
		if (ret) {
			/*
			 * Delete us from the list. After we unlock the space
			 * info, we don't want the async reclaim job to reserve
			 * space for this ticket. If that would happen, then the
			 * ticket's task would not know that space was reserved
			 * despite getting an error, resulting in a space leak
			 * (bytes_may_use counter of our space_info).
			 */
			remove_ticket(space_info, ticket);
			ticket->error = -EINTR;
			break;
		}
		spin_unlock(&space_info->lock);

		schedule();

		finish_wait(&ticket->wait, &wait);
		spin_lock(&space_info->lock);
	}
	spin_unlock(&space_info->lock);
}

/**
 * Do the appropriate flushing and waiting for a ticket
 *
 * @fs_info:    the filesystem
 * @space_info: space info for the reservation
 * @ticket:     ticket for the reservation
 * @start_ns:   timestamp when the reservation started
 * @orig_bytes: amount of bytes originally reserved
 * @flush:      how much we can flush
 *
 * This does the work of figuring out how to flush for the ticket, waiting for
 * the reservation, and returning the appropriate error if there is one.
 */
static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
				 struct btrfs_space_info *space_info,
				 struct reserve_ticket *ticket,
				 u64 start_ns, u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	int ret;

	switch (flush) {
	case BTRFS_RESERVE_FLUSH_DATA:
	case BTRFS_RESERVE_FLUSH_ALL:
	case BTRFS_RESERVE_FLUSH_ALL_STEAL:
		wait_reserve_ticket(fs_info, space_info, ticket);
		break;
	case BTRFS_RESERVE_FLUSH_LIMIT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						priority_flush_states,
						ARRAY_SIZE(priority_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_EVICT:
		priority_reclaim_metadata_space(fs_info, space_info, ticket,
						evict_flush_states,
						ARRAY_SIZE(evict_flush_states));
		break;
	case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE:
		priority_reclaim_data_space(fs_info, space_info, ticket);
		break;
	default:
		ASSERT(0);
		break;
	}

	ret = ticket->error;
	ASSERT(list_empty(&ticket->list));
	/*
	 * Check that we can't have an error set if the reservation succeeded,
	 * as that would confuse tasks and lead them to error out without
	 * releasing reserved space (if an error happens the expectation is that
	 * space wasn't reserved at all).
	 */
	ASSERT(!(ticket->bytes == 0 && ticket->error));
	trace_btrfs_reserve_ticket(fs_info, space_info->flags, orig_bytes,
				   start_ns, flush, ticket->error);
	return ret;
}

/*
 * This returns true if this flush state will go through the ordinary flushing
 * code.
 */
static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush)
{
	return	(flush == BTRFS_RESERVE_FLUSH_ALL) ||
		(flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
}
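
/*
 * Called from __reserve_bytes() when a reserve ticket has to be queued, i.e.
 * when preemptive flushing has not kept up with the workload.
 */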
static inline void maybe_clamp_preempt(struct btrfs_fs_info *fs_info,
				       struct btrfs_space_info *space_info)
{
	u64 ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
	u64 delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);

	/*
	 * If we're heavy on ordered operations then clamping won't help us. We
	 * need to clamp specifically to keep up with dirty'ing buffered
	 * writers, because there's not a 1:1 correlation of writing delalloc
	 * and freeing space, like there is with flushing delayed refs or
	 * delayed nodes. If we're already more ordered than delalloc then
	 * we're keeping up, otherwise we aren't and should probably clamp.
	 */
	if (ordered < delalloc)
		space_info->clamp = min(space_info->clamp + 1, 8);
}
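
/*
 * Return true if reservations with this flush type may fall back to stealing
 * from the global block reserve; used to set ticket->steal in
 * __reserve_bytes().
 */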
static inline bool can_steal(enum btrfs_reserve_flush_enum flush)
{
	return (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
		flush == BTRFS_RESERVE_FLUSH_EVICT);
}

/**
 * Try to reserve bytes from the block_rsv's space
 *
 * @fs_info:    the filesystem
 * @space_info: space info we want to allocate from
 * @orig_bytes: number of bytes we want
 * @flush:      whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
static int __reserve_bytes(struct btrfs_fs_info *fs_info,
			   struct btrfs_space_info *space_info, u64 orig_bytes,
			   enum btrfs_reserve_flush_enum flush)
{
	struct work_struct *async_work;
	struct reserve_ticket ticket;
	u64 start_ns = 0;
	u64 used;
	int ret = 0;
	bool pending_tickets;

	ASSERT(orig_bytes);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL);

	if (flush == BTRFS_RESERVE_FLUSH_DATA)
		async_work = &fs_info->async_data_reclaim_work;
	else
		async_work = &fs_info->async_reclaim_work;

	spin_lock(&space_info->lock);
	ret = -ENOSPC;
	used = btrfs_space_info_used(space_info, true);

	/*
	 * We don't want NO_FLUSH allocations to jump everybody, they can
	 * generally handle ENOSPC in a different way, so treat them the same as
	 * normal flushers when it comes to skipping pending tickets.
	 */
	if (is_normal_flushing(flush) || (flush == BTRFS_RESERVE_NO_FLUSH))
		pending_tickets = !list_empty(&space_info->tickets) ||
			!list_empty(&space_info->priority_tickets);
	else
		pending_tickets = !list_empty(&space_info->priority_tickets);

	/*
	 * Carry on if we have enough space (short-circuit) OR call
	 * can_overcommit() to ensure we can overcommit to continue.
	 */
	if (!pending_tickets &&
	    ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) ||
	     btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
		btrfs_space_info_update_bytes_may_use(fs_info, space_info,
						      orig_bytes);
		ret = 0;
	}

	/*
	 * If we couldn't make a reservation then setup our reservation ticket
	 * and kick the async worker if it's not already running.
	 *
	 * If we are a priority flusher then we just need to add our ticket to
	 * the list and we will do our own flushing further down.
	 */
	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
		ticket.bytes = orig_bytes;
		ticket.error = 0;
		space_info->reclaim_size += ticket.bytes;
		init_waitqueue_head(&ticket.wait);
		ticket.steal = can_steal(flush);
		if (trace_btrfs_reserve_ticket_enabled())
			start_ns = ktime_get_ns();

		if (flush == BTRFS_RESERVE_FLUSH_ALL ||
		    flush == BTRFS_RESERVE_FLUSH_ALL_STEAL ||
		    flush == BTRFS_RESERVE_FLUSH_DATA) {
			list_add_tail(&ticket.list, &space_info->tickets);
			if (!space_info->flush) {
				/*
				 * We were forced to add a reserve ticket, so
				 * our preemptive flushing is unable to keep
				 * up. Clamp down on the threshold for the
				 * preemptive flushing in order to keep up with
				 * the workload.
				 */
				maybe_clamp_preempt(fs_info, space_info);

				space_info->flush = 1;
				trace_btrfs_trigger_flush(fs_info,
							  space_info->flags,
							  orig_bytes, flush,
							  "enospc");
				queue_work(system_unbound_wq, async_work);
			}
		} else {
			list_add_tail(&ticket.list,
				      &space_info->priority_tickets);
		}
	} else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
		/*
		 * We will do the space reservation dance during log replay,
		 * which means we won't have fs_info->fs_root set, so don't do
		 * the async reclaim as we will panic.
		 */
		if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
		    !work_busy(&fs_info->preempt_reclaim_work) &&
		    need_preemptive_reclaim(fs_info, space_info)) {
			trace_btrfs_trigger_flush(fs_info, space_info->flags,
						  orig_bytes, flush, "preempt");
			queue_work(system_unbound_wq,
				   &fs_info->preempt_reclaim_work);
		}
	}
	spin_unlock(&space_info->lock);
	if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
		return ret;

	return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
				     orig_bytes, flush);
}

/**
 * Try to reserve metadata bytes from the block_rsv's space
 *
 * @fs_info:    the filesystem
 * @block_rsv:  block_rsv we're allocating for
 * @orig_bytes: number of bytes we want
 * @flush:      whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv. If there is not enough space it will make an attempt to
 * flush out space to make room. It will do this by flushing delalloc if
 * possible or committing the transaction. If flush is 0 then no attempts to
 * regain reservations will be made and this will fail if there is not enough
 * space already.
 */
int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
				 struct btrfs_block_rsv *block_rsv,
				 u64 orig_bytes,
				 enum btrfs_reserve_flush_enum flush)
{
	int ret;

	ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush);
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      block_rsv->space_info->flags,
					      orig_bytes, 1);

		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, block_rsv->space_info,
					      orig_bytes, 0);
	}
	return ret;
}

/**
 * Try to reserve data bytes for an allocation
 *
 * @fs_info: the filesystem
 * @bytes:   number of bytes we need
 * @flush:   how we are allowed to flush
 *
 * This will reserve bytes from the data space info. If there is not enough
 * space then we will attempt to flush space as specified by flush.
 */
int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
			     enum btrfs_reserve_flush_enum flush)
{
	struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
	int ret;

	ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA ||
	       flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE ||
	       flush == BTRFS_RESERVE_NO_FLUSH);
	ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA);

	ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush);
	if (ret == -ENOSPC) {
		trace_btrfs_space_reservation(fs_info, "space_info:enospc",
					      data_sinfo->flags, bytes, 1);
		if (btrfs_test_opt(fs_info, ENOSPC_DEBUG))
			btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0);
	}
	return ret;
}

/* Dump all the space infos when we abort a transaction due to ENOSPC. */
__cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
{
	struct btrfs_space_info *space_info;

	btrfs_info(fs_info, "dumping space info:");
	list_for_each_entry(space_info, &fs_info->space_info, list) {
		spin_lock(&space_info->lock);
		__btrfs_dump_space_info(fs_info, space_info);
		spin_unlock(&space_info->lock);
	}
	dump_global_block_rsv(fs_info);
}