block-group.c

  1. // SPDX-License-Identifier: GPL-2.0
  2. #include <linux/list_sort.h>
  3. #include "misc.h"
  4. #include "ctree.h"
  5. #include "block-group.h"
  6. #include "space-info.h"
  7. #include "disk-io.h"
  8. #include "free-space-cache.h"
  9. #include "free-space-tree.h"
  10. #include "volumes.h"
  11. #include "transaction.h"
  12. #include "ref-verify.h"
  13. #include "sysfs.h"
  14. #include "tree-log.h"
  15. #include "delalloc-space.h"
  16. #include "discard.h"
  17. #include "raid56.h"
  18. #include "zoned.h"
  19. /*
  20. * Return target flags in extended format or 0 if restripe for this chunk_type
  21. * is not in progress
  22. *
  23. * Should be called with balance_lock held
  24. */
  25. static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
  26. {
  27. struct btrfs_balance_control *bctl = fs_info->balance_ctl;
  28. u64 target = 0;
  29. if (!bctl)
  30. return 0;
  31. if (flags & BTRFS_BLOCK_GROUP_DATA &&
  32. bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  33. target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
  34. } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
  35. bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  36. target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
  37. } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
  38. bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
  39. target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
  40. }
  41. return target;
  42. }
  43. /*
  44. * @flags: available profiles in extended format (see ctree.h)
  45. *
  46. * Return reduced profile in chunk format. If profile changing is in progress
  47. * (either running or paused) picks the target profile (if it's already
  48. * available), otherwise falls back to plain reducing.
  49. */
  50. static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags)
  51. {
  52. u64 num_devices = fs_info->fs_devices->rw_devices;
  53. u64 target;
  54. u64 raid_type;
  55. u64 allowed = 0;
  56. /*
  57. * See if restripe for this chunk_type is in progress, if so try to
  58. * reduce to the target profile
  59. */
  60. spin_lock(&fs_info->balance_lock);
  61. target = get_restripe_target(fs_info, flags);
  62. if (target) {
  63. spin_unlock(&fs_info->balance_lock);
  64. return extended_to_chunk(target);
  65. }
  66. spin_unlock(&fs_info->balance_lock);
  67. /* First, mask out the RAID levels which aren't possible */
  68. for (raid_type = 0; raid_type < BTRFS_NR_RAID_TYPES; raid_type++) {
  69. if (num_devices >= btrfs_raid_array[raid_type].devs_min)
  70. allowed |= btrfs_raid_array[raid_type].bg_flag;
  71. }
  72. allowed &= flags;
  73. /* Select the highest-redundancy RAID level. */
  74. if (allowed & BTRFS_BLOCK_GROUP_RAID1C4)
  75. allowed = BTRFS_BLOCK_GROUP_RAID1C4;
  76. else if (allowed & BTRFS_BLOCK_GROUP_RAID6)
  77. allowed = BTRFS_BLOCK_GROUP_RAID6;
  78. else if (allowed & BTRFS_BLOCK_GROUP_RAID1C3)
  79. allowed = BTRFS_BLOCK_GROUP_RAID1C3;
  80. else if (allowed & BTRFS_BLOCK_GROUP_RAID5)
  81. allowed = BTRFS_BLOCK_GROUP_RAID5;
  82. else if (allowed & BTRFS_BLOCK_GROUP_RAID10)
  83. allowed = BTRFS_BLOCK_GROUP_RAID10;
  84. else if (allowed & BTRFS_BLOCK_GROUP_RAID1)
  85. allowed = BTRFS_BLOCK_GROUP_RAID1;
  86. else if (allowed & BTRFS_BLOCK_GROUP_DUP)
  87. allowed = BTRFS_BLOCK_GROUP_DUP;
  88. else if (allowed & BTRFS_BLOCK_GROUP_RAID0)
  89. allowed = BTRFS_BLOCK_GROUP_RAID0;
  90. flags &= ~BTRFS_BLOCK_GROUP_PROFILE_MASK;
  91. return extended_to_chunk(flags | allowed);
  92. }
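/*
 * Descriptive comment added for clarity: compute the allocation profile for
 * the given block group type (data, metadata or system) by folding in the
 * currently available profile bits under the profiles seqlock and reducing
 * them to a single chunk-format profile via btrfs_reduce_alloc_profile().
 */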
  93. u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags)
  94. {
  95. unsigned seq;
  96. u64 flags;
  97. do {
  98. flags = orig_flags;
  99. seq = read_seqbegin(&fs_info->profiles_lock);
  100. if (flags & BTRFS_BLOCK_GROUP_DATA)
  101. flags |= fs_info->avail_data_alloc_bits;
  102. else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
  103. flags |= fs_info->avail_system_alloc_bits;
  104. else if (flags & BTRFS_BLOCK_GROUP_METADATA)
  105. flags |= fs_info->avail_metadata_alloc_bits;
  106. } while (read_seqretry(&fs_info->profiles_lock, seq));
  107. return btrfs_reduce_alloc_profile(fs_info, flags);
  108. }
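/*
 * Reference counting helpers for block groups. Every btrfs_get_block_group()
 * must be paired with a btrfs_put_block_group(); dropping the last reference
 * frees the free space control structure, the physical map and the block
 * group itself.
 */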
  109. void btrfs_get_block_group(struct btrfs_block_group *cache)
  110. {
  111. refcount_inc(&cache->refs);
  112. }
  113. void btrfs_put_block_group(struct btrfs_block_group *cache)
  114. {
  115. if (refcount_dec_and_test(&cache->refs)) {
  116. WARN_ON(cache->pinned > 0);
  117. /*
  118. * If there was a failure to clean up a log tree, very likely due
  119. * to an IO failure on a writeback attempt of one or more of its
  120. * extent buffers, we could not do proper (and cheap) unaccounting
  121. * of their reserved space, so don't warn on reserved > 0 in that
  122. * case.
  123. */
  124. if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) ||
  125. !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info))
  126. WARN_ON(cache->reserved > 0);
  127. /*
  128. * A block_group shouldn't be on the discard_list anymore.
  129. * Remove the block_group from the discard_list to prevent us
  130. * from causing a panic due to NULL pointer dereference.
  131. */
  132. if (WARN_ON(!list_empty(&cache->discard_list)))
  133. btrfs_discard_cancel_work(&cache->fs_info->discard_ctl,
  134. cache);
  135. /*
  136. * If not empty, someone is still holding the full_stripe_lock
  137. * mutex, which can only be released by its caller, and a
  138. * use-after-free will occur when that caller tries to release
  139. * the full stripe lock.
  140. *
  141. * There is no better way to resolve this, so just warn.
  142. */
  143. WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root));
  144. kfree(cache->free_space_ctl);
  145. kfree(cache->physical_map);
  146. kfree(cache);
  147. }
  148. }
  149. /*
  150. * This adds the block group to the fs_info rb tree for the block group cache
  151. */
  152. static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
  153. struct btrfs_block_group *block_group)
  154. {
  155. struct rb_node **p;
  156. struct rb_node *parent = NULL;
  157. struct btrfs_block_group *cache;
  158. bool leftmost = true;
  159. ASSERT(block_group->length != 0);
  160. write_lock(&info->block_group_cache_lock);
  161. p = &info->block_group_cache_tree.rb_root.rb_node;
  162. while (*p) {
  163. parent = *p;
  164. cache = rb_entry(parent, struct btrfs_block_group, cache_node);
  165. if (block_group->start < cache->start) {
  166. p = &(*p)->rb_left;
  167. } else if (block_group->start > cache->start) {
  168. p = &(*p)->rb_right;
  169. leftmost = false;
  170. } else {
  171. write_unlock(&info->block_group_cache_lock);
  172. return -EEXIST;
  173. }
  174. }
  175. rb_link_node(&block_group->cache_node, parent, p);
  176. rb_insert_color_cached(&block_group->cache_node,
  177. &info->block_group_cache_tree, leftmost);
  178. write_unlock(&info->block_group_cache_lock);
  179. return 0;
  180. }
  181. /*
  182. * This will return the block group at or after bytenr if contains is 0, else
  183. * it will return the block group that contains the bytenr
  184. */
  185. static struct btrfs_block_group *block_group_cache_tree_search(
  186. struct btrfs_fs_info *info, u64 bytenr, int contains)
  187. {
  188. struct btrfs_block_group *cache, *ret = NULL;
  189. struct rb_node *n;
  190. u64 end, start;
  191. read_lock(&info->block_group_cache_lock);
  192. n = info->block_group_cache_tree.rb_root.rb_node;
  193. while (n) {
  194. cache = rb_entry(n, struct btrfs_block_group, cache_node);
  195. end = cache->start + cache->length - 1;
  196. start = cache->start;
  197. if (bytenr < start) {
  198. if (!contains && (!ret || start < ret->start))
  199. ret = cache;
  200. n = n->rb_left;
  201. } else if (bytenr > start) {
  202. if (contains && bytenr <= end) {
  203. ret = cache;
  204. break;
  205. }
  206. n = n->rb_right;
  207. } else {
  208. ret = cache;
  209. break;
  210. }
  211. }
  212. if (ret)
  213. btrfs_get_block_group(ret);
  214. read_unlock(&info->block_group_cache_lock);
  215. return ret;
  216. }
  217. /*
  218. * Return the block group that starts at or after bytenr
  219. */
  220. struct btrfs_block_group *btrfs_lookup_first_block_group(
  221. struct btrfs_fs_info *info, u64 bytenr)
  222. {
  223. return block_group_cache_tree_search(info, bytenr, 0);
  224. }
  225. /*
  226. * Return the block group that contains the given bytenr
  227. */
  228. struct btrfs_block_group *btrfs_lookup_block_group(
  229. struct btrfs_fs_info *info, u64 bytenr)
  230. {
  231. return block_group_cache_tree_search(info, bytenr, 1);
  232. }
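/*
 * Return the block group that logically follows @cache, taking a reference on
 * it and dropping the caller's reference on @cache, or NULL if @cache was the
 * last one. If @cache was removed from the rbtree in the meantime, fall back
 * to a full search starting at the end of its range.
 */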
  233. struct btrfs_block_group *btrfs_next_block_group(
  234. struct btrfs_block_group *cache)
  235. {
  236. struct btrfs_fs_info *fs_info = cache->fs_info;
  237. struct rb_node *node;
  238. read_lock(&fs_info->block_group_cache_lock);
  239. /* If our block group was removed, we need a full search. */
  240. if (RB_EMPTY_NODE(&cache->cache_node)) {
  241. const u64 next_bytenr = cache->start + cache->length;
  242. read_unlock(&fs_info->block_group_cache_lock);
  243. btrfs_put_block_group(cache);
  244. return btrfs_lookup_first_block_group(fs_info, next_bytenr);
  245. }
  246. node = rb_next(&cache->cache_node);
  247. btrfs_put_block_group(cache);
  248. if (node) {
  249. cache = rb_entry(node, struct btrfs_block_group, cache_node);
  250. btrfs_get_block_group(cache);
  251. } else
  252. cache = NULL;
  253. read_unlock(&fs_info->block_group_cache_lock);
  254. return cache;
  255. }
  256. /**
  257. * Check if we can do a NOCOW write for a given extent.
  258. *
  259. * @fs_info: The filesystem information object.
  260. * @bytenr: Logical start address of the extent.
  261. *
  262. * Check if we can do a NOCOW write for the given extent, and increment the
  263. * number of NOCOW writers in the block group that contains the extent, as long
  264. * as the block group exists and is not currently in read-only mode.
  265. *
  266. * Returns: A non-NULL block group pointer if we can do a NOCOW write; the caller
  267. * is responsible for calling btrfs_dec_nocow_writers() later.
  268. *
  269. * Or NULL if we cannot do a NOCOW write.
  270. */
  271. struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
  272. u64 bytenr)
  273. {
  274. struct btrfs_block_group *bg;
  275. bool can_nocow = true;
  276. bg = btrfs_lookup_block_group(fs_info, bytenr);
  277. if (!bg)
  278. return NULL;
  279. spin_lock(&bg->lock);
  280. if (bg->ro)
  281. can_nocow = false;
  282. else
  283. atomic_inc(&bg->nocow_writers);
  284. spin_unlock(&bg->lock);
  285. if (!can_nocow) {
  286. btrfs_put_block_group(bg);
  287. return NULL;
  288. }
  289. /* No put on block group, done by btrfs_dec_nocow_writers(). */
  290. return bg;
  291. }
  292. /**
  293. * Decrement the number of NOCOW writers in a block group.
  294. *
  295. * @bg: The block group.
  296. *
  297. * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
  298. * and on the block group returned by that call. Typically this is called after
  299. * creating an ordered extent for a NOCOW write, to prevent races with scrub and
  300. * relocation.
  301. *
  302. * After this call, the caller should not use the block group anymore. If it
  303. * wants to use it, it should get a reference on it before calling this function.
  304. */
  305. void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
  306. {
  307. if (atomic_dec_and_test(&bg->nocow_writers))
  308. wake_up_var(&bg->nocow_writers);
  309. /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
  310. btrfs_put_block_group(bg);
  311. }
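/*
 * Illustrative sketch (not part of the original file): the expected pairing of
 * the NOCOW writer helpers around creating an ordered extent for a NOCOW
 * write, as described in the comments above. The function name is
 * hypothetical, so the sketch is kept out of the build with #if 0.
 */
#if 0
static void example_nocow_write(struct btrfs_fs_info *fs_info, u64 bytenr)
{
        struct btrfs_block_group *bg;

        /* Takes a block group reference and bumps bg->nocow_writers. */
        bg = btrfs_inc_nocow_writers(fs_info, bytenr);
        if (!bg)
                return; /* Block group missing or read-only, must do a COW write. */

        /* ... create the ordered extent for the NOCOW write here ... */

        /* Drops the writer count and the lookup reference taken above. */
        btrfs_dec_nocow_writers(bg);
}
#endif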
  312. void btrfs_wait_nocow_writers(struct btrfs_block_group *bg)
  313. {
  314. wait_var_event(&bg->nocow_writers, !atomic_read(&bg->nocow_writers));
  315. }
  316. void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
  317. const u64 start)
  318. {
  319. struct btrfs_block_group *bg;
  320. bg = btrfs_lookup_block_group(fs_info, start);
  321. ASSERT(bg);
  322. if (atomic_dec_and_test(&bg->reservations))
  323. wake_up_var(&bg->reservations);
  324. btrfs_put_block_group(bg);
  325. }
  326. void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg)
  327. {
  328. struct btrfs_space_info *space_info = bg->space_info;
  329. ASSERT(bg->ro);
  330. if (!(bg->flags & BTRFS_BLOCK_GROUP_DATA))
  331. return;
  332. /*
  333. * Our block group is read only but before we set it to read only,
  334. * some task might have allocated an extent from it already, but it
  335. * has not yet created a respective ordered extent (and added it to a
  336. * root's list of ordered extents).
  337. * Therefore wait for any task currently allocating extents, since the
  338. * block group's reservations counter is incremented while a read lock
  339. * on the groups' semaphore is held and decremented after releasing
  340. * the read access on that semaphore and creating the ordered extent.
  341. */
  342. down_write(&space_info->groups_sem);
  343. up_write(&space_info->groups_sem);
  344. wait_var_event(&bg->reservations, !atomic_read(&bg->reservations));
  345. }
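/*
 * Return the caching control of @cache with an extra reference taken, or NULL
 * if the block group is not currently being cached. The reference must be
 * dropped with btrfs_put_caching_control().
 */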
  346. struct btrfs_caching_control *btrfs_get_caching_control(
  347. struct btrfs_block_group *cache)
  348. {
  349. struct btrfs_caching_control *ctl;
  350. spin_lock(&cache->lock);
  351. if (!cache->caching_ctl) {
  352. spin_unlock(&cache->lock);
  353. return NULL;
  354. }
  355. ctl = cache->caching_ctl;
  356. refcount_inc(&ctl->count);
  357. spin_unlock(&cache->lock);
  358. return ctl;
  359. }
  360. void btrfs_put_caching_control(struct btrfs_caching_control *ctl)
  361. {
  362. if (refcount_dec_and_test(&ctl->count))
  363. kfree(ctl);
  364. }
  365. /*
  366. * When we wait for progress in the block group caching, it's because our
  367. * allocation attempt failed at least once. So, we must sleep and let some
  368. * progress happen before we try again.
  369. *
  370. * This function will sleep at least once waiting for new free space to show
  371. * up, and then it will check the block group free space numbers for our min
  372. * num_bytes. Another option is to have it go ahead and look in the rbtree for
  373. * a free extent of a given size, but this is a good start.
  374. *
  375. * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
  376. * any of the information in this block group.
  377. */
  378. void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
  379. u64 num_bytes)
  380. {
  381. struct btrfs_caching_control *caching_ctl;
  382. int progress;
  383. caching_ctl = btrfs_get_caching_control(cache);
  384. if (!caching_ctl)
  385. return;
  386. /*
  387. * We've already failed to allocate from this block group, so even if
  388. * there's enough space in the block group it isn't contiguous enough to
  389. * allow for an allocation, so wait for at least the next wakeup tick,
  390. * or for the thing to be done.
  391. */
  392. progress = atomic_read(&caching_ctl->progress);
  393. wait_event(caching_ctl->wait, btrfs_block_group_done(cache) ||
  394. (progress != atomic_read(&caching_ctl->progress) &&
  395. (cache->free_space_ctl->free_space >= num_bytes)));
  396. btrfs_put_caching_control(caching_ctl);
  397. }
  398. static int btrfs_caching_ctl_wait_done(struct btrfs_block_group *cache,
  399. struct btrfs_caching_control *caching_ctl)
  400. {
  401. wait_event(caching_ctl->wait, btrfs_block_group_done(cache));
  402. return cache->cached == BTRFS_CACHE_ERROR ? -EIO : 0;
  403. }
  404. static int btrfs_wait_block_group_cache_done(struct btrfs_block_group *cache)
  405. {
  406. struct btrfs_caching_control *caching_ctl;
  407. int ret;
  408. caching_ctl = btrfs_get_caching_control(cache);
  409. if (!caching_ctl)
  410. return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
  411. ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
  412. btrfs_put_caching_control(caching_ctl);
  413. return ret;
  414. }
  415. #ifdef CONFIG_BTRFS_DEBUG
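/*
 * Debug helper: when free space fragmentation is enabled, artificially
 * fragment the block group by removing every other chunk (nodesize for
 * metadata, sectorsize for data) from its free space.
 */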
  416. static void fragment_free_space(struct btrfs_block_group *block_group)
  417. {
  418. struct btrfs_fs_info *fs_info = block_group->fs_info;
  419. u64 start = block_group->start;
  420. u64 len = block_group->length;
  421. u64 chunk = block_group->flags & BTRFS_BLOCK_GROUP_METADATA ?
  422. fs_info->nodesize : fs_info->sectorsize;
  423. u64 step = chunk << 1;
  424. while (len > chunk) {
  425. btrfs_remove_free_space(block_group, start, chunk);
  426. start += step;
  427. if (len < step)
  428. len = 0;
  429. else
  430. len -= step;
  431. }
  432. }
  433. #endif
  434. /*
  435. * This is only called by btrfs_cache_block_group(). Since we could have freed
  436. * extents, we need to check the pinned_extents for any extents that can't be
  437. * used yet because their free space will be released as soon as the transaction
  438. * commits.
  439. */
  440. int add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end,
  441. u64 *total_added_ret)
  442. {
  443. struct btrfs_fs_info *info = block_group->fs_info;
  444. u64 extent_start, extent_end, size;
  445. int ret;
  446. if (total_added_ret)
  447. *total_added_ret = 0;
  448. while (start < end) {
  449. ret = find_first_extent_bit(&info->excluded_extents, start,
  450. &extent_start, &extent_end,
  451. EXTENT_DIRTY | EXTENT_UPTODATE,
  452. NULL);
  453. if (ret)
  454. break;
  455. if (extent_start <= start) {
  456. start = extent_end + 1;
  457. } else if (extent_start > start && extent_start < end) {
  458. size = extent_start - start;
  459. ret = btrfs_add_free_space_async_trimmed(block_group,
  460. start, size);
  461. if (ret)
  462. return ret;
  463. if (total_added_ret)
  464. *total_added_ret += size;
  465. start = extent_end + 1;
  466. } else {
  467. break;
  468. }
  469. }
  470. if (start < end) {
  471. size = end - start;
  472. ret = btrfs_add_free_space_async_trimmed(block_group, start,
  473. size);
  474. if (ret)
  475. return ret;
  476. if (total_added_ret)
  477. *total_added_ret += size;
  478. }
  479. return 0;
  480. }
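/*
 * Slow caching path: walk the extent tree (using the commit root, unlocked)
 * over the block group's range and add the gaps between extent items as free
 * space, waking up waiters each time enough new space has been found.
 */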
  481. static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
  482. {
  483. struct btrfs_block_group *block_group = caching_ctl->block_group;
  484. struct btrfs_fs_info *fs_info = block_group->fs_info;
  485. struct btrfs_root *extent_root;
  486. struct btrfs_path *path;
  487. struct extent_buffer *leaf;
  488. struct btrfs_key key;
  489. u64 total_found = 0;
  490. u64 last = 0;
  491. u32 nritems;
  492. int ret;
  493. bool wakeup = true;
  494. path = btrfs_alloc_path();
  495. if (!path)
  496. return -ENOMEM;
  497. last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET);
  498. extent_root = btrfs_extent_root(fs_info, last);
  499. #ifdef CONFIG_BTRFS_DEBUG
  500. /*
  501. * If we're fragmenting we don't want to make anybody think we can
  502. * allocate from this block group until we've had a chance to fragment
  503. * the free space.
  504. */
  505. if (btrfs_should_fragment_free_space(block_group))
  506. wakeup = false;
  507. #endif
  508. /*
  509. * We don't want to deadlock with somebody trying to allocate a new
  510. * extent for the extent root while also trying to search the extent
  511. * root to add free space. So we skip locking and search the commit
  512. * root, since its read-only
  513. */
  514. path->skip_locking = 1;
  515. path->search_commit_root = 1;
  516. path->reada = READA_FORWARD;
  517. key.objectid = last;
  518. key.offset = 0;
  519. key.type = BTRFS_EXTENT_ITEM_KEY;
  520. next:
  521. ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
  522. if (ret < 0)
  523. goto out;
  524. leaf = path->nodes[0];
  525. nritems = btrfs_header_nritems(leaf);
  526. while (1) {
  527. if (btrfs_fs_closing(fs_info) > 1) {
  528. last = (u64)-1;
  529. break;
  530. }
  531. if (path->slots[0] < nritems) {
  532. btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
  533. } else {
  534. ret = btrfs_find_next_key(extent_root, path, &key, 0, 0);
  535. if (ret)
  536. break;
  537. if (need_resched() ||
  538. rwsem_is_contended(&fs_info->commit_root_sem)) {
  539. btrfs_release_path(path);
  540. up_read(&fs_info->commit_root_sem);
  541. mutex_unlock(&caching_ctl->mutex);
  542. cond_resched();
  543. mutex_lock(&caching_ctl->mutex);
  544. down_read(&fs_info->commit_root_sem);
  545. goto next;
  546. }
  547. ret = btrfs_next_leaf(extent_root, path);
  548. if (ret < 0)
  549. goto out;
  550. if (ret)
  551. break;
  552. leaf = path->nodes[0];
  553. nritems = btrfs_header_nritems(leaf);
  554. continue;
  555. }
  556. if (key.objectid < last) {
  557. key.objectid = last;
  558. key.offset = 0;
  559. key.type = BTRFS_EXTENT_ITEM_KEY;
  560. btrfs_release_path(path);
  561. goto next;
  562. }
  563. if (key.objectid < block_group->start) {
  564. path->slots[0]++;
  565. continue;
  566. }
  567. if (key.objectid >= block_group->start + block_group->length)
  568. break;
  569. if (key.type == BTRFS_EXTENT_ITEM_KEY ||
  570. key.type == BTRFS_METADATA_ITEM_KEY) {
  571. u64 space_added;
  572. ret = add_new_free_space(block_group, last, key.objectid,
  573. &space_added);
  574. if (ret)
  575. goto out;
  576. total_found += space_added;
  577. if (key.type == BTRFS_METADATA_ITEM_KEY)
  578. last = key.objectid +
  579. fs_info->nodesize;
  580. else
  581. last = key.objectid + key.offset;
  582. if (total_found > CACHING_CTL_WAKE_UP) {
  583. total_found = 0;
  584. if (wakeup) {
  585. atomic_inc(&caching_ctl->progress);
  586. wake_up(&caching_ctl->wait);
  587. }
  588. }
  589. }
  590. path->slots[0]++;
  591. }
  592. ret = add_new_free_space(block_group, last,
  593. block_group->start + block_group->length,
  594. NULL);
  595. out:
  596. btrfs_free_path(path);
  597. return ret;
  598. }
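/*
 * Worker that populates a block group's free space: try the on-disk space
 * cache first (if enabled), then fall back to the free space tree or to
 * scanning the extent tree, and finally mark the block group as finished or
 * errored and wake up any waiters.
 */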
  599. static noinline void caching_thread(struct btrfs_work *work)
  600. {
  601. struct btrfs_block_group *block_group;
  602. struct btrfs_fs_info *fs_info;
  603. struct btrfs_caching_control *caching_ctl;
  604. int ret;
  605. caching_ctl = container_of(work, struct btrfs_caching_control, work);
  606. block_group = caching_ctl->block_group;
  607. fs_info = block_group->fs_info;
  608. mutex_lock(&caching_ctl->mutex);
  609. down_read(&fs_info->commit_root_sem);
  610. if (btrfs_test_opt(fs_info, SPACE_CACHE)) {
  611. ret = load_free_space_cache(block_group);
  612. if (ret == 1) {
  613. ret = 0;
  614. goto done;
  615. }
  616. /*
  617. * We failed to load the space cache, so set ourselves to
  618. * BTRFS_CACHE_STARTED and carry on.
  619. */
  620. spin_lock(&block_group->lock);
  621. block_group->cached = BTRFS_CACHE_STARTED;
  622. spin_unlock(&block_group->lock);
  623. wake_up(&caching_ctl->wait);
  624. }
  625. /*
  626. * If we are in the transaction that populated the free space tree we
  627. * can't actually cache from the free space tree as our commit root and
  628. * real root are the same, so we could change the contents of the blocks
  629. * while caching. Instead do the slow caching in this case, and after
  630. * the transaction has committed we will be safe.
  631. */
  632. if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
  633. !(test_bit(BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, &fs_info->flags)))
  634. ret = load_free_space_tree(caching_ctl);
  635. else
  636. ret = load_extent_tree_free(caching_ctl);
  637. done:
  638. spin_lock(&block_group->lock);
  639. block_group->caching_ctl = NULL;
  640. block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
  641. spin_unlock(&block_group->lock);
  642. #ifdef CONFIG_BTRFS_DEBUG
  643. if (btrfs_should_fragment_free_space(block_group)) {
  644. u64 bytes_used;
  645. spin_lock(&block_group->space_info->lock);
  646. spin_lock(&block_group->lock);
  647. bytes_used = block_group->length - block_group->used;
  648. block_group->space_info->bytes_used += bytes_used >> 1;
  649. spin_unlock(&block_group->lock);
  650. spin_unlock(&block_group->space_info->lock);
  651. fragment_free_space(block_group);
  652. }
  653. #endif
  654. up_read(&fs_info->commit_root_sem);
  655. btrfs_free_excluded_extents(block_group);
  656. mutex_unlock(&caching_ctl->mutex);
  657. wake_up(&caching_ctl->wait);
  658. btrfs_put_caching_control(caching_ctl);
  659. btrfs_put_block_group(block_group);
  660. }
  661. int btrfs_cache_block_group(struct btrfs_block_group *cache, bool wait)
  662. {
  663. struct btrfs_fs_info *fs_info = cache->fs_info;
  664. struct btrfs_caching_control *caching_ctl = NULL;
  665. int ret = 0;
  666. /* Allocator for zoned filesystems does not use the cache at all */
  667. if (btrfs_is_zoned(fs_info))
  668. return 0;
  669. caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
  670. if (!caching_ctl)
  671. return -ENOMEM;
  672. INIT_LIST_HEAD(&caching_ctl->list);
  673. mutex_init(&caching_ctl->mutex);
  674. init_waitqueue_head(&caching_ctl->wait);
  675. caching_ctl->block_group = cache;
  676. refcount_set(&caching_ctl->count, 2);
  677. atomic_set(&caching_ctl->progress, 0);
  678. btrfs_init_work(&caching_ctl->work, caching_thread, NULL, NULL);
  679. spin_lock(&cache->lock);
  680. if (cache->cached != BTRFS_CACHE_NO) {
  681. kfree(caching_ctl);
  682. caching_ctl = cache->caching_ctl;
  683. if (caching_ctl)
  684. refcount_inc(&caching_ctl->count);
  685. spin_unlock(&cache->lock);
  686. goto out;
  687. }
  688. WARN_ON(cache->caching_ctl);
  689. cache->caching_ctl = caching_ctl;
  690. cache->cached = BTRFS_CACHE_STARTED;
  691. spin_unlock(&cache->lock);
  692. write_lock(&fs_info->block_group_cache_lock);
  693. refcount_inc(&caching_ctl->count);
  694. list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
  695. write_unlock(&fs_info->block_group_cache_lock);
  696. btrfs_get_block_group(cache);
  697. btrfs_queue_work(fs_info->caching_workers, &caching_ctl->work);
  698. out:
  699. if (wait && caching_ctl)
  700. ret = btrfs_caching_ctl_wait_done(cache, caching_ctl);
  701. if (caching_ctl)
  702. btrfs_put_caching_control(caching_ctl);
  703. return ret;
  704. }
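/*
 * Remove the profile bits of @flags from the available allocation bits;
 * called from btrfs_remove_block_group() once the last block group with that
 * profile is gone.
 */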
  705. static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
  706. {
  707. u64 extra_flags = chunk_to_extended(flags) &
  708. BTRFS_EXTENDED_PROFILE_MASK;
  709. write_seqlock(&fs_info->profiles_lock);
  710. if (flags & BTRFS_BLOCK_GROUP_DATA)
  711. fs_info->avail_data_alloc_bits &= ~extra_flags;
  712. if (flags & BTRFS_BLOCK_GROUP_METADATA)
  713. fs_info->avail_metadata_alloc_bits &= ~extra_flags;
  714. if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
  715. fs_info->avail_system_alloc_bits &= ~extra_flags;
  716. write_sequnlock(&fs_info->profiles_lock);
  717. }
  718. /*
  719. * Clear incompat bits for the following feature(s):
  720. *
  721. * - RAID56 - in case there's neither RAID5 nor RAID6 profile block group
  722. * in the whole filesystem
  723. *
  724. * - RAID1C34 - same as above for RAID1C3 and RAID1C4 block groups
  725. */
  726. static void clear_incompat_bg_bits(struct btrfs_fs_info *fs_info, u64 flags)
  727. {
  728. bool found_raid56 = false;
  729. bool found_raid1c34 = false;
  730. if ((flags & BTRFS_BLOCK_GROUP_RAID56_MASK) ||
  731. (flags & BTRFS_BLOCK_GROUP_RAID1C3) ||
  732. (flags & BTRFS_BLOCK_GROUP_RAID1C4)) {
  733. struct list_head *head = &fs_info->space_info;
  734. struct btrfs_space_info *sinfo;
  735. list_for_each_entry_rcu(sinfo, head, list) {
  736. down_read(&sinfo->groups_sem);
  737. if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID5]))
  738. found_raid56 = true;
  739. if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID6]))
  740. found_raid56 = true;
  741. if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C3]))
  742. found_raid1c34 = true;
  743. if (!list_empty(&sinfo->block_groups[BTRFS_RAID_RAID1C4]))
  744. found_raid1c34 = true;
  745. up_read(&sinfo->groups_sem);
  746. }
  747. if (!found_raid56)
  748. btrfs_clear_fs_incompat(fs_info, RAID56);
  749. if (!found_raid1c34)
  750. btrfs_clear_fs_incompat(fs_info, RAID1C34);
  751. }
  752. }
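/*
 * Delete the block group item of @block_group (keyed by its start offset and
 * length) from the block group root.
 */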
  753. static int remove_block_group_item(struct btrfs_trans_handle *trans,
  754. struct btrfs_path *path,
  755. struct btrfs_block_group *block_group)
  756. {
  757. struct btrfs_fs_info *fs_info = trans->fs_info;
  758. struct btrfs_root *root;
  759. struct btrfs_key key;
  760. int ret;
  761. root = btrfs_block_group_root(fs_info);
  762. key.objectid = block_group->start;
  763. key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
  764. key.offset = block_group->length;
  765. ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
  766. if (ret > 0)
  767. ret = -ENOENT;
  768. if (ret < 0)
  769. return ret;
  770. ret = btrfs_del_item(trans, root, path);
  771. return ret;
  772. }
  773. int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
  774. u64 group_start, struct extent_map *em)
  775. {
  776. struct btrfs_fs_info *fs_info = trans->fs_info;
  777. struct btrfs_path *path;
  778. struct btrfs_block_group *block_group;
  779. struct btrfs_free_cluster *cluster;
  780. struct inode *inode;
  781. struct kobject *kobj = NULL;
  782. int ret;
  783. int index;
  784. int factor;
  785. struct btrfs_caching_control *caching_ctl = NULL;
  786. bool remove_em;
  787. bool remove_rsv = false;
  788. block_group = btrfs_lookup_block_group(fs_info, group_start);
  789. BUG_ON(!block_group);
  790. BUG_ON(!block_group->ro);
  791. trace_btrfs_remove_block_group(block_group);
  792. /*
  793. * Free the reserved super bytes from this block group before
  794. * removing it.
  795. */
  796. btrfs_free_excluded_extents(block_group);
  797. btrfs_free_ref_tree_range(fs_info, block_group->start,
  798. block_group->length);
  799. index = btrfs_bg_flags_to_raid_index(block_group->flags);
  800. factor = btrfs_bg_type_to_factor(block_group->flags);
  801. /* make sure this block group isn't part of an allocation cluster */
  802. cluster = &fs_info->data_alloc_cluster;
  803. spin_lock(&cluster->refill_lock);
  804. btrfs_return_cluster_to_free_space(block_group, cluster);
  805. spin_unlock(&cluster->refill_lock);
  806. /*
  807. * make sure this block group isn't part of a metadata
  808. * allocation cluster
  809. */
  810. cluster = &fs_info->meta_alloc_cluster;
  811. spin_lock(&cluster->refill_lock);
  812. btrfs_return_cluster_to_free_space(block_group, cluster);
  813. spin_unlock(&cluster->refill_lock);
  814. btrfs_clear_treelog_bg(block_group);
  815. btrfs_clear_data_reloc_bg(block_group);
  816. path = btrfs_alloc_path();
  817. if (!path) {
  818. ret = -ENOMEM;
  819. goto out;
  820. }
  821. /*
  822. * get the inode first so any iput calls done for the io_list
  823. * aren't the final iput (no unlinks allowed now)
  824. */
  825. inode = lookup_free_space_inode(block_group, path);
  826. mutex_lock(&trans->transaction->cache_write_mutex);
  827. /*
  828. * Make sure our free space cache IO is done before removing the
  829. * free space inode
  830. */
  831. spin_lock(&trans->transaction->dirty_bgs_lock);
  832. if (!list_empty(&block_group->io_list)) {
  833. list_del_init(&block_group->io_list);
  834. WARN_ON(!IS_ERR(inode) && inode != block_group->io_ctl.inode);
  835. spin_unlock(&trans->transaction->dirty_bgs_lock);
  836. btrfs_wait_cache_io(trans, block_group, path);
  837. btrfs_put_block_group(block_group);
  838. spin_lock(&trans->transaction->dirty_bgs_lock);
  839. }
  840. if (!list_empty(&block_group->dirty_list)) {
  841. list_del_init(&block_group->dirty_list);
  842. remove_rsv = true;
  843. btrfs_put_block_group(block_group);
  844. }
  845. spin_unlock(&trans->transaction->dirty_bgs_lock);
  846. mutex_unlock(&trans->transaction->cache_write_mutex);
  847. ret = btrfs_remove_free_space_inode(trans, inode, block_group);
  848. if (ret)
  849. goto out;
  850. write_lock(&fs_info->block_group_cache_lock);
  851. rb_erase_cached(&block_group->cache_node,
  852. &fs_info->block_group_cache_tree);
  853. RB_CLEAR_NODE(&block_group->cache_node);
  854. /* Once for the block groups rbtree */
  855. btrfs_put_block_group(block_group);
  856. write_unlock(&fs_info->block_group_cache_lock);
  857. down_write(&block_group->space_info->groups_sem);
  858. /*
  859. * we must use list_del_init so people can check to see if they
  860. * are still on the list after taking the semaphore
  861. */
  862. list_del_init(&block_group->list);
  863. if (list_empty(&block_group->space_info->block_groups[index])) {
  864. kobj = block_group->space_info->block_group_kobjs[index];
  865. block_group->space_info->block_group_kobjs[index] = NULL;
  866. clear_avail_alloc_bits(fs_info, block_group->flags);
  867. }
  868. up_write(&block_group->space_info->groups_sem);
  869. clear_incompat_bg_bits(fs_info, block_group->flags);
  870. if (kobj) {
  871. kobject_del(kobj);
  872. kobject_put(kobj);
  873. }
  874. if (block_group->cached == BTRFS_CACHE_STARTED)
  875. btrfs_wait_block_group_cache_done(block_group);
  876. write_lock(&fs_info->block_group_cache_lock);
  877. caching_ctl = btrfs_get_caching_control(block_group);
  878. if (!caching_ctl) {
  879. struct btrfs_caching_control *ctl;
  880. list_for_each_entry(ctl, &fs_info->caching_block_groups, list) {
  881. if (ctl->block_group == block_group) {
  882. caching_ctl = ctl;
  883. refcount_inc(&caching_ctl->count);
  884. break;
  885. }
  886. }
  887. }
  888. if (caching_ctl)
  889. list_del_init(&caching_ctl->list);
  890. write_unlock(&fs_info->block_group_cache_lock);
  891. if (caching_ctl) {
  892. /* Once for the caching bgs list and once for us. */
  893. btrfs_put_caching_control(caching_ctl);
  894. btrfs_put_caching_control(caching_ctl);
  895. }
  896. spin_lock(&trans->transaction->dirty_bgs_lock);
  897. WARN_ON(!list_empty(&block_group->dirty_list));
  898. WARN_ON(!list_empty(&block_group->io_list));
  899. spin_unlock(&trans->transaction->dirty_bgs_lock);
  900. btrfs_remove_free_space_cache(block_group);
  901. spin_lock(&block_group->space_info->lock);
  902. list_del_init(&block_group->ro_list);
  903. if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
  904. WARN_ON(block_group->space_info->total_bytes
  905. < block_group->length);
  906. WARN_ON(block_group->space_info->bytes_readonly
  907. < block_group->length - block_group->zone_unusable);
  908. WARN_ON(block_group->space_info->bytes_zone_unusable
  909. < block_group->zone_unusable);
  910. WARN_ON(block_group->space_info->disk_total
  911. < block_group->length * factor);
  912. WARN_ON(test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
  913. &block_group->runtime_flags) &&
  914. block_group->space_info->active_total_bytes
  915. < block_group->length);
  916. }
  917. block_group->space_info->total_bytes -= block_group->length;
  918. if (test_bit(BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE, &block_group->runtime_flags))
  919. block_group->space_info->active_total_bytes -= block_group->length;
  920. block_group->space_info->bytes_readonly -=
  921. (block_group->length - block_group->zone_unusable);
  922. block_group->space_info->bytes_zone_unusable -=
  923. block_group->zone_unusable;
  924. block_group->space_info->disk_total -= block_group->length * factor;
  925. spin_unlock(&block_group->space_info->lock);
  926. /*
  927. * Remove the free space for the block group from the free space tree
  928. * and the block group's item from the extent tree before marking the
  929. * block group as removed. This is to prevent races with tasks that
  930. * freeze and unfreeze a block group, this task and another task
  931. * allocating a new block group - the unfreeze task ends up removing
  932. * the block group's extent map before the task calling this function
  933. * deletes the block group item from the extent tree, allowing for
  934. * another task to attempt to create another block group with the same
  935. * item key (and failing with -EEXIST and a transaction abort).
  936. */
  937. ret = remove_block_group_free_space(trans, block_group);
  938. if (ret)
  939. goto out;
  940. ret = remove_block_group_item(trans, path, block_group);
  941. if (ret < 0)
  942. goto out;
  943. spin_lock(&block_group->lock);
  944. set_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags);
  945. /*
  946. * At this point trimming or scrub can't start on this block group,
  947. * because we removed the block group from the rbtree
  948. * fs_info->block_group_cache_tree, so no one can find it anymore, and
  949. * even if someone already got this block group before we removed it
  950. * from the rbtree, they have already incremented block_group->frozen -
  951. * if they didn't, for the trimming case they won't find any free space
  952. * entries because we already removed them all when we called
  953. * btrfs_remove_free_space_cache().
  954. *
  955. * And we must not remove the extent map from the fs_info->mapping_tree
  956. * to prevent the same logical address range and physical device space
  957. * ranges from being reused for a new block group. This is needed to
  958. * avoid races with trimming and scrub.
  959. *
  960. * An fs trim operation (btrfs_trim_fs() / btrfs_ioctl_fitrim()) is
  961. * completely transactionless, so while it is trimming a range the
  962. * currently running transaction might finish and a new one start,
  963. * allowing for new block groups to be created that can reuse the same
  964. * physical device locations unless we take this special care.
  965. *
  966. * There may also be an implicit trim operation if the file system
  967. * is mounted with -odiscard. The same protections must remain
  968. * in place until the extents have been discarded completely when
  969. * the transaction commit has completed.
  970. */
  971. remove_em = (atomic_read(&block_group->frozen) == 0);
  972. spin_unlock(&block_group->lock);
  973. if (remove_em) {
  974. struct extent_map_tree *em_tree;
  975. em_tree = &fs_info->mapping_tree;
  976. write_lock(&em_tree->lock);
  977. remove_extent_mapping(em_tree, em);
  978. write_unlock(&em_tree->lock);
  979. /* once for the tree */
  980. free_extent_map(em);
  981. }
  982. out:
  983. /* Once for the lookup reference */
  984. btrfs_put_block_group(block_group);
  985. if (remove_rsv)
  986. btrfs_delayed_refs_rsv_release(fs_info, 1);
  987. btrfs_free_path(path);
  988. return ret;
  989. }
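/*
 * Start a transaction with enough metadata units reserved to remove the block
 * group at @chunk_offset: 3 fixed items plus one device extent item per
 * stripe. For example, a chunk with 2 stripes reserves 3 + 2 = 5 items.
 */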
  990. struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
  991. struct btrfs_fs_info *fs_info, const u64 chunk_offset)
  992. {
  993. struct btrfs_root *root = btrfs_block_group_root(fs_info);
  994. struct extent_map_tree *em_tree = &fs_info->mapping_tree;
  995. struct extent_map *em;
  996. struct map_lookup *map;
  997. unsigned int num_items;
  998. read_lock(&em_tree->lock);
  999. em = lookup_extent_mapping(em_tree, chunk_offset, 1);
  1000. read_unlock(&em_tree->lock);
  1001. ASSERT(em && em->start == chunk_offset);
  1002. /*
  1003. * We need to reserve 3 + N units from the metadata space info in order
  1004. * to remove a block group (done at btrfs_remove_chunk() and at
  1005. * btrfs_remove_block_group()), which are used for:
  1006. *
  1007. * 1 unit for adding the free space inode's orphan (located in the tree
  1008. * of tree roots).
  1009. * 1 unit for deleting the block group item (located in the extent
  1010. * tree).
  1011. * 1 unit for deleting the free space item (located in tree of tree
  1012. * roots).
  1013. * N units for deleting N device extent items corresponding to each
  1014. * stripe (located in the device tree).
  1015. *
  1016. * In order to remove a block group we also need to reserve units in the
  1017. * system space info in order to update the chunk tree (update one or
  1018. * more device items and remove one chunk item), but this is done at
  1019. * btrfs_remove_chunk() through a call to check_system_chunk().
  1020. */
  1021. map = em->map_lookup;
  1022. num_items = 3 + map->num_stripes;
  1023. free_extent_map(em);
  1024. return btrfs_start_transaction_fallback_global_rsv(root, num_items);
  1025. }
  1026. /*
  1027. * Mark block group @cache read-only, so later writes won't happen to block
  1028. * group @cache.
  1029. *
  1030. * If @force is not set, this function will only mark the block group readonly
  1031. * if we have enough free space (1M) in other metadata/system block groups.
  1032. * If @force is set, this function will mark the block group readonly
  1033. * without checking free space.
  1034. *
  1035. * NOTE: This function doesn't care if other block groups can contain all the
  1036. * data in this block group. That check should be done by the relocation
  1037. * routine, not this function.
  1038. */
  1039. static int inc_block_group_ro(struct btrfs_block_group *cache, int force)
  1040. {
  1041. struct btrfs_space_info *sinfo = cache->space_info;
  1042. u64 num_bytes;
  1043. int ret = -ENOSPC;
  1044. spin_lock(&sinfo->lock);
  1045. spin_lock(&cache->lock);
  1046. if (cache->swap_extents) {
  1047. ret = -ETXTBSY;
  1048. goto out;
  1049. }
  1050. if (cache->ro) {
  1051. cache->ro++;
  1052. ret = 0;
  1053. goto out;
  1054. }
  1055. num_bytes = cache->length - cache->reserved - cache->pinned -
  1056. cache->bytes_super - cache->zone_unusable - cache->used;
  1057. /*
1058. * Data never overcommits, even in mixed mode, so just do the straight
1059. * check of leftover space against how much we have allocated.
  1060. */
  1061. if (force) {
  1062. ret = 0;
  1063. } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
  1064. u64 sinfo_used = btrfs_space_info_used(sinfo, true);
  1065. /*
1066. * Here we make sure that if we mark this bg RO, we still have enough
1067. * free space left as a buffer.
  1068. */
  1069. if (sinfo_used + num_bytes <= sinfo->total_bytes)
  1070. ret = 0;
  1071. } else {
  1072. /*
  1073. * We overcommit metadata, so we need to do the
  1074. * btrfs_can_overcommit check here, and we need to pass in
  1075. * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
  1076. * leeway to allow us to mark this block group as read only.
  1077. */
  1078. if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
  1079. BTRFS_RESERVE_NO_FLUSH))
  1080. ret = 0;
  1081. }
  1082. if (!ret) {
  1083. sinfo->bytes_readonly += num_bytes;
  1084. if (btrfs_is_zoned(cache->fs_info)) {
  1085. /* Migrate zone_unusable bytes to readonly */
  1086. sinfo->bytes_readonly += cache->zone_unusable;
  1087. sinfo->bytes_zone_unusable -= cache->zone_unusable;
  1088. cache->zone_unusable = 0;
  1089. }
  1090. cache->ro++;
  1091. list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
  1092. }
  1093. out:
  1094. spin_unlock(&cache->lock);
  1095. spin_unlock(&sinfo->lock);
  1096. if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) {
  1097. btrfs_info(cache->fs_info,
  1098. "unable to make block group %llu ro", cache->start);
  1099. btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
  1100. }
  1101. return ret;
  1102. }
  1103. static bool clean_pinned_extents(struct btrfs_trans_handle *trans,
  1104. struct btrfs_block_group *bg)
  1105. {
  1106. struct btrfs_fs_info *fs_info = bg->fs_info;
  1107. struct btrfs_transaction *prev_trans = NULL;
  1108. const u64 start = bg->start;
  1109. const u64 end = start + bg->length - 1;
  1110. int ret;
  1111. spin_lock(&fs_info->trans_lock);
  1112. if (trans->transaction->list.prev != &fs_info->trans_list) {
  1113. prev_trans = list_last_entry(&trans->transaction->list,
  1114. struct btrfs_transaction, list);
  1115. refcount_inc(&prev_trans->use_count);
  1116. }
  1117. spin_unlock(&fs_info->trans_lock);
  1118. /*
  1119. * Hold the unused_bg_unpin_mutex lock to avoid racing with
  1120. * btrfs_finish_extent_commit(). If we are at transaction N, another
  1121. * task might be running finish_extent_commit() for the previous
  1122. * transaction N - 1, and have seen a range belonging to the block
  1123. * group in pinned_extents before we were able to clear the whole block
1124. * group range from pinned_extents. This means that task can look up
1125. * the block group after we unpinned it from pinned_extents and removed
  1126. * it, leading to a BUG_ON() at unpin_extent_range().
  1127. */
  1128. mutex_lock(&fs_info->unused_bg_unpin_mutex);
  1129. if (prev_trans) {
  1130. ret = clear_extent_bits(&prev_trans->pinned_extents, start, end,
  1131. EXTENT_DIRTY);
  1132. if (ret)
  1133. goto out;
  1134. }
  1135. ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end,
  1136. EXTENT_DIRTY);
  1137. out:
  1138. mutex_unlock(&fs_info->unused_bg_unpin_mutex);
  1139. if (prev_trans)
  1140. btrfs_put_transaction(prev_trans);
  1141. return ret == 0;
  1142. }
  1143. /*
  1144. * Process the unused_bgs list and remove any that don't have any allocated
  1145. * space inside of them.
  1146. */
  1147. void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
  1148. {
  1149. struct btrfs_block_group *block_group;
  1150. struct btrfs_space_info *space_info;
  1151. struct btrfs_trans_handle *trans;
  1152. const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC);
  1153. int ret = 0;
  1154. if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
  1155. return;
  1156. if (btrfs_fs_closing(fs_info))
  1157. return;
  1158. /*
  1159. * Long running balances can keep us blocked here for eternity, so
  1160. * simply skip deletion if we're unable to get the mutex.
  1161. */
  1162. if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
  1163. return;
  1164. spin_lock(&fs_info->unused_bgs_lock);
  1165. while (!list_empty(&fs_info->unused_bgs)) {
  1166. int trimming;
  1167. block_group = list_first_entry(&fs_info->unused_bgs,
  1168. struct btrfs_block_group,
  1169. bg_list);
  1170. list_del_init(&block_group->bg_list);
  1171. space_info = block_group->space_info;
  1172. if (ret || btrfs_mixed_space_info(space_info)) {
  1173. btrfs_put_block_group(block_group);
  1174. continue;
  1175. }
  1176. spin_unlock(&fs_info->unused_bgs_lock);
  1177. btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
  1178. /* Don't want to race with allocators so take the groups_sem */
  1179. down_write(&space_info->groups_sem);
  1180. /*
  1181. * Async discard moves the final block group discard to be prior
  1182. * to the unused_bgs code path. Therefore, if it's not fully
  1183. * trimmed, punt it back to the async discard lists.
  1184. */
  1185. if (btrfs_test_opt(fs_info, DISCARD_ASYNC) &&
  1186. !btrfs_is_free_space_trimmed(block_group)) {
  1187. trace_btrfs_skip_unused_block_group(block_group);
  1188. up_write(&space_info->groups_sem);
  1189. /* Requeue if we failed because of async discard */
  1190. btrfs_discard_queue_work(&fs_info->discard_ctl,
  1191. block_group);
  1192. goto next;
  1193. }
  1194. spin_lock(&block_group->lock);
  1195. if (block_group->reserved || block_group->pinned ||
  1196. block_group->used || block_group->ro ||
  1197. list_is_singular(&block_group->list)) {
  1198. /*
  1199. * We want to bail if we made new allocations or have
  1200. * outstanding allocations in this block group. We do
  1201. * the ro check in case balance is currently acting on
  1202. * this block group.
  1203. */
  1204. trace_btrfs_skip_unused_block_group(block_group);
  1205. spin_unlock(&block_group->lock);
  1206. up_write(&space_info->groups_sem);
  1207. goto next;
  1208. }
  1209. spin_unlock(&block_group->lock);
  1210. /* We don't want to force the issue, only flip if it's ok. */
  1211. ret = inc_block_group_ro(block_group, 0);
  1212. up_write(&space_info->groups_sem);
  1213. if (ret < 0) {
  1214. ret = 0;
  1215. goto next;
  1216. }
  1217. ret = btrfs_zone_finish(block_group);
  1218. if (ret < 0) {
  1219. btrfs_dec_block_group_ro(block_group);
  1220. if (ret == -EAGAIN)
  1221. ret = 0;
  1222. goto next;
  1223. }
  1224. /*
  1225. * Want to do this before we do anything else so we can recover
  1226. * properly if we fail to join the transaction.
  1227. */
  1228. trans = btrfs_start_trans_remove_block_group(fs_info,
  1229. block_group->start);
  1230. if (IS_ERR(trans)) {
  1231. btrfs_dec_block_group_ro(block_group);
  1232. ret = PTR_ERR(trans);
  1233. goto next;
  1234. }
  1235. /*
  1236. * We could have pending pinned extents for this block group,
  1237. * just delete them, we don't care about them anymore.
  1238. */
  1239. if (!clean_pinned_extents(trans, block_group)) {
  1240. btrfs_dec_block_group_ro(block_group);
  1241. goto end_trans;
  1242. }
  1243. /*
  1244. * At this point, the block_group is read only and should fail
  1245. * new allocations. However, btrfs_finish_extent_commit() can
  1246. * cause this block_group to be placed back on the discard
  1247. * lists because now the block_group isn't fully discarded.
  1248. * Bail here and try again later after discarding everything.
  1249. */
  1250. spin_lock(&fs_info->discard_ctl.lock);
  1251. if (!list_empty(&block_group->discard_list)) {
  1252. spin_unlock(&fs_info->discard_ctl.lock);
  1253. btrfs_dec_block_group_ro(block_group);
  1254. btrfs_discard_queue_work(&fs_info->discard_ctl,
  1255. block_group);
  1256. goto end_trans;
  1257. }
  1258. spin_unlock(&fs_info->discard_ctl.lock);
  1259. /* Reset pinned so btrfs_put_block_group doesn't complain */
  1260. spin_lock(&space_info->lock);
  1261. spin_lock(&block_group->lock);
  1262. btrfs_space_info_update_bytes_pinned(fs_info, space_info,
  1263. -block_group->pinned);
  1264. space_info->bytes_readonly += block_group->pinned;
  1265. block_group->pinned = 0;
  1266. spin_unlock(&block_group->lock);
  1267. spin_unlock(&space_info->lock);
  1268. /*
1269. * In the normal path an unused block group is passed here and
  1270. * then trimming is handled in the transaction commit path.
  1271. * Async discard interposes before this to do the trimming
  1272. * before coming down the unused block group path as trimming
  1273. * will no longer be done later in the transaction commit path.
  1274. */
  1275. if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC))
  1276. goto flip_async;
  1277. /*
  1278. * DISCARD can flip during remount. On zoned filesystems, we
  1279. * need to reset sequential-required zones.
  1280. */
  1281. trimming = btrfs_test_opt(fs_info, DISCARD_SYNC) ||
  1282. btrfs_is_zoned(fs_info);
  1283. /* Implicit trim during transaction commit. */
  1284. if (trimming)
  1285. btrfs_freeze_block_group(block_group);
  1286. /*
1287. * btrfs_remove_chunk() will abort the transaction if things go
  1288. * horribly wrong.
  1289. */
  1290. ret = btrfs_remove_chunk(trans, block_group->start);
  1291. if (ret) {
  1292. if (trimming)
  1293. btrfs_unfreeze_block_group(block_group);
  1294. goto end_trans;
  1295. }
  1296. /*
  1297. * If we're not mounted with -odiscard, we can just forget
  1298. * about this block group. Otherwise we'll need to wait
  1299. * until transaction commit to do the actual discard.
  1300. */
  1301. if (trimming) {
  1302. spin_lock(&fs_info->unused_bgs_lock);
  1303. /*
  1304. * A concurrent scrub might have added us to the list
  1305. * fs_info->unused_bgs, so use a list_move operation
  1306. * to add the block group to the deleted_bgs list.
  1307. */
  1308. list_move(&block_group->bg_list,
  1309. &trans->transaction->deleted_bgs);
  1310. spin_unlock(&fs_info->unused_bgs_lock);
  1311. btrfs_get_block_group(block_group);
  1312. }
  1313. end_trans:
  1314. btrfs_end_transaction(trans);
  1315. next:
  1316. btrfs_put_block_group(block_group);
  1317. spin_lock(&fs_info->unused_bgs_lock);
  1318. }
  1319. spin_unlock(&fs_info->unused_bgs_lock);
  1320. mutex_unlock(&fs_info->reclaim_bgs_lock);
  1321. return;
  1322. flip_async:
  1323. btrfs_end_transaction(trans);
  1324. mutex_unlock(&fs_info->reclaim_bgs_lock);
  1325. btrfs_put_block_group(block_group);
  1326. btrfs_discard_punt_unused_bgs_list(fs_info);
  1327. }
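/*
 * Queue @bg on the unused_bgs list so the cleaner thread can consider deleting
 * it, taking a reference the first time it is added. If the block group is
 * already queued for reclaim (and is not a newly created one), move it over to
 * the unused_bgs list instead.
 */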
  1328. void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
  1329. {
  1330. struct btrfs_fs_info *fs_info = bg->fs_info;
  1331. spin_lock(&fs_info->unused_bgs_lock);
  1332. if (list_empty(&bg->bg_list)) {
  1333. btrfs_get_block_group(bg);
  1334. trace_btrfs_add_unused_block_group(bg);
  1335. list_add_tail(&bg->bg_list, &fs_info->unused_bgs);
  1336. } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) {
  1337. /* Pull out the block group from the reclaim_bgs list. */
  1338. trace_btrfs_add_unused_block_group(bg);
  1339. list_move_tail(&bg->bg_list, &fs_info->unused_bgs);
  1340. }
  1341. spin_unlock(&fs_info->unused_bgs_lock);
  1342. }
  1343. /*
  1344. * We want block groups with a low number of used bytes to be in the beginning
  1345. * of the list, so they will get reclaimed first.
  1346. */
  1347. static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
  1348. const struct list_head *b)
  1349. {
  1350. const struct btrfs_block_group *bg1, *bg2;
  1351. bg1 = list_entry(a, struct btrfs_block_group, bg_list);
  1352. bg2 = list_entry(b, struct btrfs_block_group, bg_list);
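/* list_sort() comparator: non-zero means @a sorts after @b, i.e. ascending by ->used. */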
  1353. return bg1->used > bg2->used;
  1354. }
  1355. static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
  1356. {
  1357. if (btrfs_is_zoned(fs_info))
  1358. return btrfs_zoned_should_reclaim(fs_info);
  1359. return true;
  1360. }
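/*
 * Background worker that walks fs_info->reclaim_bgs, marks each candidate
 * block group read-only and relocates its chunk, skipping groups that have
 * become busy (reserved, pinned or already read-only) in the meantime.
 */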
  1361. void btrfs_reclaim_bgs_work(struct work_struct *work)
  1362. {
  1363. struct btrfs_fs_info *fs_info =
  1364. container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
  1365. struct btrfs_block_group *bg;
  1366. struct btrfs_space_info *space_info;
  1367. if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
  1368. return;
  1369. if (btrfs_fs_closing(fs_info))
  1370. return;
  1371. if (!btrfs_should_reclaim(fs_info))
  1372. return;
  1373. sb_start_write(fs_info->sb);
  1374. if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
  1375. sb_end_write(fs_info->sb);
  1376. return;
  1377. }
  1378. /*
  1379. * Long running balances can keep us blocked here for eternity, so
  1380. * simply skip reclaim if we're unable to get the mutex.
  1381. */
  1382. if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) {
  1383. btrfs_exclop_finish(fs_info);
  1384. sb_end_write(fs_info->sb);
  1385. return;
  1386. }
  1387. spin_lock(&fs_info->unused_bgs_lock);
  1388. /*
  1389. * Sort happens under lock because we can't simply splice it and sort.
  1390. * The block groups might still be in use and reachable via bg_list,
  1391. * and their presence in the reclaim_bgs list must be preserved.
  1392. */
  1393. list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp);
  1394. while (!list_empty(&fs_info->reclaim_bgs)) {
  1395. u64 zone_unusable;
  1396. int ret = 0;
  1397. bg = list_first_entry(&fs_info->reclaim_bgs,
  1398. struct btrfs_block_group,
  1399. bg_list);
  1400. list_del_init(&bg->bg_list);
  1401. space_info = bg->space_info;
  1402. spin_unlock(&fs_info->unused_bgs_lock);
  1403. /* Don't race with allocators so take the groups_sem */
  1404. down_write(&space_info->groups_sem);
  1405. spin_lock(&bg->lock);
  1406. if (bg->reserved || bg->pinned || bg->ro) {
  1407. /*
  1408. * We want to bail if we made new allocations or have
  1409. * outstanding allocations in this block group. We do
  1410. * the ro check in case balance is currently acting on
  1411. * this block group.
  1412. */
  1413. spin_unlock(&bg->lock);
  1414. up_write(&space_info->groups_sem);
  1415. goto next;
  1416. }
  1417. spin_unlock(&bg->lock);
  1418. /*
  1419. * Get out fast, in case we're read-only or unmounting the
  1420. * filesystem. It is OK to drop block groups from the list even
  1421. * for the read-only case. As we did sb_start_write(),
  1422. * "mount -o remount,ro" won't happen and read-only filesystem
  1423. * means it is forced read-only due to a fatal error. So, it
  1424. * never gets back to read-write to let us reclaim again.
  1425. */
  1426. if (btrfs_need_cleaner_sleep(fs_info)) {
  1427. up_write(&space_info->groups_sem);
  1428. goto next;
  1429. }
  1430. /*
  1431. * Cache the zone_unusable value before turning the block group
1432. * to read only. As soon as the block group is read only, its
  1433. * zone_unusable value gets moved to the block group's read-only
  1434. * bytes and isn't available for calculations anymore.
  1435. */
  1436. zone_unusable = bg->zone_unusable;
  1437. ret = inc_block_group_ro(bg, 0);
  1438. up_write(&space_info->groups_sem);
  1439. if (ret < 0)
  1440. goto next;
  1441. btrfs_info(fs_info,
  1442. "reclaiming chunk %llu with %llu%% used %llu%% unusable",
  1443. bg->start,
  1444. div64_u64(bg->used * 100, bg->length),
  1445. div64_u64(zone_unusable * 100, bg->length));
  1446. trace_btrfs_reclaim_block_group(bg);
  1447. ret = btrfs_relocate_chunk(fs_info, bg->start);
  1448. if (ret) {
  1449. btrfs_dec_block_group_ro(bg);
  1450. btrfs_err(fs_info, "error relocating chunk %llu",
  1451. bg->start);
  1452. }
  1453. next:
  1454. if (ret)
  1455. btrfs_mark_bg_to_reclaim(bg);
  1456. btrfs_put_block_group(bg);
  1457. mutex_unlock(&fs_info->reclaim_bgs_lock);
  1458. /*
  1459. * Reclaiming all the block groups in the list can take really
  1460. * long. Prioritize cleaning up unused block groups.
  1461. */
  1462. btrfs_delete_unused_bgs(fs_info);
  1463. /*
  1464. * If we are interrupted by a balance, we can just bail out. The
1465. * cleaner thread will restart it again if necessary.
  1466. */
  1467. if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
  1468. goto end;
  1469. spin_lock(&fs_info->unused_bgs_lock);
  1470. }
  1471. spin_unlock(&fs_info->unused_bgs_lock);
  1472. mutex_unlock(&fs_info->reclaim_bgs_lock);
  1473. end:
  1474. btrfs_exclop_finish(fs_info);
  1475. sb_end_write(fs_info->sb);
  1476. }
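/* Kick off the reclaim worker if any block groups are queued for reclaim. */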
  1477. void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
  1478. {
  1479. spin_lock(&fs_info->unused_bgs_lock);
  1480. if (!list_empty(&fs_info->reclaim_bgs))
  1481. queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
  1482. spin_unlock(&fs_info->unused_bgs_lock);
  1483. }
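/*
 * Queue @bg on the reclaim_bgs list (if it is not on any list yet), taking a
 * reference on it; the reclaim worker will pick it up later.
 */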
  1484. void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
  1485. {
  1486. struct btrfs_fs_info *fs_info = bg->fs_info;
  1487. spin_lock(&fs_info->unused_bgs_lock);
  1488. if (list_empty(&bg->bg_list)) {
  1489. btrfs_get_block_group(bg);
  1490. trace_btrfs_add_reclaim_block_group(bg);
  1491. list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
  1492. }
  1493. spin_unlock(&fs_info->unused_bgs_lock);
  1494. }
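/*
 * Sanity check a block group item against its chunk mapping: the chunk must
 * exist, cover exactly the same logical range and have matching type flags.
 */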
  1495. static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
  1496. struct btrfs_path *path)
  1497. {
  1498. struct extent_map_tree *em_tree;
  1499. struct extent_map *em;
  1500. struct btrfs_block_group_item bg;
  1501. struct extent_buffer *leaf;
  1502. int slot;
  1503. u64 flags;
  1504. int ret = 0;
  1505. slot = path->slots[0];
  1506. leaf = path->nodes[0];
  1507. em_tree = &fs_info->mapping_tree;
  1508. read_lock(&em_tree->lock);
  1509. em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
  1510. read_unlock(&em_tree->lock);
  1511. if (!em) {
  1512. btrfs_err(fs_info,
  1513. "logical %llu len %llu found bg but no related chunk",
  1514. key->objectid, key->offset);
  1515. return -ENOENT;
  1516. }
  1517. if (em->start != key->objectid || em->len != key->offset) {
  1518. btrfs_err(fs_info,
  1519. "block group %llu len %llu mismatch with chunk %llu len %llu",
  1520. key->objectid, key->offset, em->start, em->len);
  1521. ret = -EUCLEAN;
  1522. goto out_free_em;
  1523. }
  1524. read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
  1525. sizeof(bg));
  1526. flags = btrfs_stack_block_group_flags(&bg) &
  1527. BTRFS_BLOCK_GROUP_TYPE_MASK;
  1528. if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
  1529. btrfs_err(fs_info,
  1530. "block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
  1531. key->objectid, key->offset, flags,
  1532. (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
  1533. ret = -EUCLEAN;
  1534. }
  1535. out_free_em:
  1536. free_extent_map(em);
  1537. return ret;
  1538. }
  1539. static int find_first_block_group(struct btrfs_fs_info *fs_info,
  1540. struct btrfs_path *path,
  1541. struct btrfs_key *key)
  1542. {
  1543. struct btrfs_root *root = btrfs_block_group_root(fs_info);
  1544. int ret;
  1545. struct btrfs_key found_key;
  1546. btrfs_for_each_slot(root, key, &found_key, path, ret) {
  1547. if (found_key.objectid >= key->objectid &&
  1548. found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
  1549. return read_bg_from_eb(fs_info, &found_key, path);
  1550. }
  1551. }
  1552. return ret;
  1553. }
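/*
 * Record the extended profile bits of @flags in the per-type (data, metadata,
 * system) available allocation bit masks, under the profiles seqlock.
 */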
  1554. static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
  1555. {
  1556. u64 extra_flags = chunk_to_extended(flags) &
  1557. BTRFS_EXTENDED_PROFILE_MASK;
  1558. write_seqlock(&fs_info->profiles_lock);
  1559. if (flags & BTRFS_BLOCK_GROUP_DATA)
  1560. fs_info->avail_data_alloc_bits |= extra_flags;
  1561. if (flags & BTRFS_BLOCK_GROUP_METADATA)
  1562. fs_info->avail_metadata_alloc_bits |= extra_flags;
  1563. if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
  1564. fs_info->avail_system_alloc_bits |= extra_flags;
  1565. write_sequnlock(&fs_info->profiles_lock);
  1566. }
1567. /**
1568. * btrfs_rmap_block - Map a physical disk address to a list of logical addresses
  1569. *
  1570. * @fs_info: the filesystem
  1571. * @chunk_start: logical address of block group
  1572. * @bdev: physical device to resolve, can be NULL to indicate any device
  1573. * @physical: physical address to map to logical addresses
  1574. * @logical: return array of logical addresses which map to @physical
  1575. * @naddrs: length of @logical
  1576. * @stripe_len: size of IO stripe for the given block group
  1577. *
  1578. * Maps a particular @physical disk address to a list of @logical addresses.
  1579. * Used primarily to exclude those portions of a block group that contain super
  1580. * block copies.
  1581. */
  1582. int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
  1583. struct block_device *bdev, u64 physical, u64 **logical,
  1584. int *naddrs, int *stripe_len)
  1585. {
  1586. struct extent_map *em;
  1587. struct map_lookup *map;
  1588. u64 *buf;
  1589. u64 bytenr;
  1590. u64 data_stripe_length;
  1591. u64 io_stripe_size;
  1592. int i, nr = 0;
  1593. int ret = 0;
  1594. em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
  1595. if (IS_ERR(em))
  1596. return -EIO;
  1597. map = em->map_lookup;
  1598. data_stripe_length = em->orig_block_len;
  1599. io_stripe_size = map->stripe_len;
  1600. chunk_start = em->start;
  1601. /* For RAID5/6 adjust to a full IO stripe length */
  1602. if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
  1603. io_stripe_size = map->stripe_len * nr_data_stripes(map);
  1604. buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS);
  1605. if (!buf) {
  1606. ret = -ENOMEM;
  1607. goto out;
  1608. }
  1609. for (i = 0; i < map->num_stripes; i++) {
  1610. bool already_inserted = false;
  1611. u64 stripe_nr;
  1612. u64 offset;
  1613. int j;
  1614. if (!in_range(physical, map->stripes[i].physical,
  1615. data_stripe_length))
  1616. continue;
  1617. if (bdev && map->stripes[i].dev->bdev != bdev)
  1618. continue;
  1619. stripe_nr = physical - map->stripes[i].physical;
  1620. stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
  1621. if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
  1622. BTRFS_BLOCK_GROUP_RAID10)) {
  1623. stripe_nr = stripe_nr * map->num_stripes + i;
  1624. stripe_nr = div_u64(stripe_nr, map->sub_stripes);
  1625. }
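/*
 * Illustrative example: for a two-device RAID0 chunk with a 64KiB stripe_len,
 * a physical address 192KiB into stripe 1 gives a per-device stripe_nr of 3,
 * a logical stripe number of 3 * 2 + 1 = 7, and thus
 * bytenr = chunk_start + 7 * 64KiB + offset.
 */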
  1626. /*
  1627. * The remaining case would be for RAID56, multiply by
1628. * nr_data_stripes(). Alternatively, just use io_stripe_size below
1629. * instead of map->stripe_len.
  1630. */
  1631. bytenr = chunk_start + stripe_nr * io_stripe_size + offset;
  1632. /* Ensure we don't add duplicate addresses */
  1633. for (j = 0; j < nr; j++) {
  1634. if (buf[j] == bytenr) {
  1635. already_inserted = true;
  1636. break;
  1637. }
  1638. }
  1639. if (!already_inserted)
  1640. buf[nr++] = bytenr;
  1641. }
  1642. *logical = buf;
  1643. *naddrs = nr;
  1644. *stripe_len = io_stripe_size;
  1645. out:
  1646. free_extent_map(em);
  1647. return ret;
  1648. }
  1649. static int exclude_super_stripes(struct btrfs_block_group *cache)
  1650. {
  1651. struct btrfs_fs_info *fs_info = cache->fs_info;
  1652. const bool zoned = btrfs_is_zoned(fs_info);
  1653. u64 bytenr;
  1654. u64 *logical;
  1655. int stripe_len;
  1656. int i, nr, ret;
  1657. if (cache->start < BTRFS_SUPER_INFO_OFFSET) {
  1658. stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start;
  1659. cache->bytes_super += stripe_len;
  1660. ret = btrfs_add_excluded_extent(fs_info, cache->start,
  1661. stripe_len);
  1662. if (ret)
  1663. return ret;
  1664. }
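/*
 * Check every super block mirror (the primary copy at 64KiB plus the copies
 * at 64MiB and 256GiB) and exclude any that fall inside this block group.
 */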
  1665. for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
  1666. bytenr = btrfs_sb_offset(i);
  1667. ret = btrfs_rmap_block(fs_info, cache->start, NULL,
  1668. bytenr, &logical, &nr, &stripe_len);
  1669. if (ret)
  1670. return ret;
  1671. /* Shouldn't have super stripes in sequential zones */
  1672. if (zoned && nr) {
  1673. kfree(logical);
  1674. btrfs_err(fs_info,
  1675. "zoned: block group %llu must not contain super block",
  1676. cache->start);
  1677. return -EUCLEAN;
  1678. }
  1679. while (nr--) {
  1680. u64 len = min_t(u64, stripe_len,
  1681. cache->start + cache->length - logical[nr]);
  1682. cache->bytes_super += len;
  1683. ret = btrfs_add_excluded_extent(fs_info, logical[nr],
  1684. len);
  1685. if (ret) {
  1686. kfree(logical);
  1687. return ret;
  1688. }
  1689. }
  1690. kfree(logical);
  1691. }
  1692. return 0;
  1693. }
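/*
 * Allocate and initialize the in-memory cache object for the block group
 * starting at @start. Returns NULL if memory allocation fails.
 */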
  1694. static struct btrfs_block_group *btrfs_create_block_group_cache(
  1695. struct btrfs_fs_info *fs_info, u64 start)
  1696. {
  1697. struct btrfs_block_group *cache;
  1698. cache = kzalloc(sizeof(*cache), GFP_NOFS);
  1699. if (!cache)
  1700. return NULL;
  1701. cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
  1702. GFP_NOFS);
  1703. if (!cache->free_space_ctl) {
  1704. kfree(cache);
  1705. return NULL;
  1706. }
  1707. cache->start = start;
  1708. cache->fs_info = fs_info;
  1709. cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start);
  1710. cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
  1711. refcount_set(&cache->refs, 1);
  1712. spin_lock_init(&cache->lock);
  1713. init_rwsem(&cache->data_rwsem);
  1714. INIT_LIST_HEAD(&cache->list);
  1715. INIT_LIST_HEAD(&cache->cluster_list);
  1716. INIT_LIST_HEAD(&cache->bg_list);
  1717. INIT_LIST_HEAD(&cache->ro_list);
  1718. INIT_LIST_HEAD(&cache->discard_list);
  1719. INIT_LIST_HEAD(&cache->dirty_list);
  1720. INIT_LIST_HEAD(&cache->io_list);
  1721. INIT_LIST_HEAD(&cache->active_bg_list);
  1722. btrfs_init_free_space_ctl(cache, cache->free_space_ctl);
  1723. atomic_set(&cache->frozen, 0);
  1724. mutex_init(&cache->free_space_lock);
  1725. cache->full_stripe_locks_root.root = RB_ROOT;
  1726. mutex_init(&cache->full_stripe_locks_root.lock);
  1727. return cache;
  1728. }
  1729. /*
  1730. * Iterate all chunks and verify that each of them has the corresponding block
  1731. * group
  1732. */
  1733. static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
  1734. {
  1735. struct extent_map_tree *map_tree = &fs_info->mapping_tree;
  1736. struct extent_map *em;
  1737. struct btrfs_block_group *bg;
  1738. u64 start = 0;
  1739. int ret = 0;
  1740. while (1) {
  1741. read_lock(&map_tree->lock);
  1742. /*
  1743. * lookup_extent_mapping will return the first extent map
  1744. * intersecting the range, so setting @len to 1 is enough to
  1745. * get the first chunk.
  1746. */
  1747. em = lookup_extent_mapping(map_tree, start, 1);
  1748. read_unlock(&map_tree->lock);
  1749. if (!em)
  1750. break;
  1751. bg = btrfs_lookup_block_group(fs_info, em->start);
  1752. if (!bg) {
  1753. btrfs_err(fs_info,
  1754. "chunk start=%llu len=%llu doesn't have corresponding block group",
  1755. em->start, em->len);
  1756. ret = -EUCLEAN;
  1757. free_extent_map(em);
  1758. break;
  1759. }
  1760. if (bg->start != em->start || bg->length != em->len ||
  1761. (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
  1762. (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
  1763. btrfs_err(fs_info,
  1764. "chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
  1765. em->start, em->len,
  1766. em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
  1767. bg->start, bg->length,
  1768. bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
  1769. ret = -EUCLEAN;
  1770. free_extent_map(em);
  1771. btrfs_put_block_group(bg);
  1772. break;
  1773. }
  1774. start = em->start + em->len;
  1775. free_extent_map(em);
  1776. btrfs_put_block_group(bg);
  1777. }
  1778. return ret;
  1779. }
  1780. static int read_one_block_group(struct btrfs_fs_info *info,
  1781. struct btrfs_block_group_item *bgi,
  1782. const struct btrfs_key *key,
  1783. int need_clear)
  1784. {
  1785. struct btrfs_block_group *cache;
  1786. const bool mixed = btrfs_fs_incompat(info, MIXED_GROUPS);
  1787. int ret;
  1788. ASSERT(key->type == BTRFS_BLOCK_GROUP_ITEM_KEY);
  1789. cache = btrfs_create_block_group_cache(info, key->objectid);
  1790. if (!cache)
  1791. return -ENOMEM;
  1792. cache->length = key->offset;
  1793. cache->used = btrfs_stack_block_group_used(bgi);
  1794. cache->flags = btrfs_stack_block_group_flags(bgi);
  1795. cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
  1796. set_free_space_tree_thresholds(cache);
  1797. if (need_clear) {
  1798. /*
  1799. * When we mount with old space cache, we need to
  1800. * set BTRFS_DC_CLEAR and set dirty flag.
  1801. *
  1802. * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
  1803. * truncate the old free space cache inode and
1804. * set up a new one.
  1805. * b) Setting 'dirty flag' makes sure that we flush
  1806. * the new space cache info onto disk.
  1807. */
  1808. if (btrfs_test_opt(info, SPACE_CACHE))
  1809. cache->disk_cache_state = BTRFS_DC_CLEAR;
  1810. }
  1811. if (!mixed && ((cache->flags & BTRFS_BLOCK_GROUP_METADATA) &&
  1812. (cache->flags & BTRFS_BLOCK_GROUP_DATA))) {
  1813. btrfs_err(info,
  1814. "bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
  1815. cache->start);
  1816. ret = -EINVAL;
  1817. goto error;
  1818. }
  1819. ret = btrfs_load_block_group_zone_info(cache, false);
  1820. if (ret) {
  1821. btrfs_err(info, "zoned: failed to load zone info of bg %llu",
  1822. cache->start);
  1823. goto error;
  1824. }
  1825. /*
  1826. * We need to exclude the super stripes now so that the space info has
  1827. * super bytes accounted for, otherwise we'll think we have more space
  1828. * than we actually do.
  1829. */
  1830. ret = exclude_super_stripes(cache);
  1831. if (ret) {
  1832. /* We may have excluded something, so call this just in case. */
  1833. btrfs_free_excluded_extents(cache);
  1834. goto error;
  1835. }
  1836. /*
1837. * For a zoned filesystem, space after the allocation offset is the only
  1838. * free space for a block group. So, we don't need any caching work.
  1839. * btrfs_calc_zone_unusable() will set the amount of free space and
  1840. * zone_unusable space.
  1841. *
1842. * For a regular filesystem, check two cases: either we are full, and
  1843. * therefore don't need to bother with the caching work since we won't
  1844. * find any space, or we are empty, and we can just add all the space
  1845. * in and be done with it. This saves us _a_lot_ of time, particularly
  1846. * in the full case.
  1847. */
  1848. if (btrfs_is_zoned(info)) {
  1849. btrfs_calc_zone_unusable(cache);
  1850. /* Should not have any excluded extents. Just in case, though. */
  1851. btrfs_free_excluded_extents(cache);
  1852. } else if (cache->length == cache->used) {
  1853. cache->cached = BTRFS_CACHE_FINISHED;
  1854. btrfs_free_excluded_extents(cache);
  1855. } else if (cache->used == 0) {
  1856. cache->cached = BTRFS_CACHE_FINISHED;
  1857. ret = add_new_free_space(cache, cache->start,
  1858. cache->start + cache->length, NULL);
  1859. btrfs_free_excluded_extents(cache);
  1860. if (ret)
  1861. goto error;
  1862. }
  1863. ret = btrfs_add_block_group_cache(info, cache);
  1864. if (ret) {
  1865. btrfs_remove_free_space_cache(cache);
  1866. goto error;
  1867. }
  1868. trace_btrfs_add_block_group(info, cache, 0);
  1869. btrfs_add_bg_to_space_info(info, cache);
  1870. set_avail_alloc_bits(info, cache->flags);
  1871. if (btrfs_chunk_writeable(info, cache->start)) {
  1872. if (cache->used == 0) {
  1873. ASSERT(list_empty(&cache->bg_list));
  1874. if (btrfs_test_opt(info, DISCARD_ASYNC))
  1875. btrfs_discard_queue_work(&info->discard_ctl, cache);
  1876. else
  1877. btrfs_mark_bg_unused(cache);
  1878. }
  1879. } else {
  1880. inc_block_group_ro(cache, 1);
  1881. }
  1882. return 0;
  1883. error:
  1884. btrfs_put_block_group(cache);
  1885. return ret;
  1886. }
  1887. static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
  1888. {
  1889. struct extent_map_tree *em_tree = &fs_info->mapping_tree;
  1890. struct rb_node *node;
  1891. int ret = 0;
  1892. for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
  1893. struct extent_map *em;
  1894. struct map_lookup *map;
  1895. struct btrfs_block_group *bg;
  1896. em = rb_entry(node, struct extent_map, rb_node);
  1897. map = em->map_lookup;
  1898. bg = btrfs_create_block_group_cache(fs_info, em->start);
  1899. if (!bg) {
  1900. ret = -ENOMEM;
  1901. break;
  1902. }
  1903. /* Fill dummy cache as FULL */
  1904. bg->length = em->len;
1905. bg->flags = map->type;
1906. bg->cached = BTRFS_CACHE_FINISHED;
1907. bg->used = em->len;
  1909. ret = btrfs_add_block_group_cache(fs_info, bg);
  1910. /*
  1911. * We may have some valid block group cache added already, in
  1912. * that case we skip to the next one.
  1913. */
  1914. if (ret == -EEXIST) {
  1915. ret = 0;
  1916. btrfs_put_block_group(bg);
  1917. continue;
  1918. }
  1919. if (ret) {
  1920. btrfs_remove_free_space_cache(bg);
  1921. btrfs_put_block_group(bg);
  1922. break;
  1923. }
  1924. btrfs_add_bg_to_space_info(fs_info, bg);
  1925. set_avail_alloc_bits(fs_info, bg->flags);
  1926. }
  1927. if (!ret)
  1928. btrfs_init_global_block_rsv(fs_info);
  1929. return ret;
  1930. }
  1931. int btrfs_read_block_groups(struct btrfs_fs_info *info)
  1932. {
  1933. struct btrfs_root *root = btrfs_block_group_root(info);
  1934. struct btrfs_path *path;
  1935. int ret;
  1936. struct btrfs_block_group *cache;
  1937. struct btrfs_space_info *space_info;
  1938. struct btrfs_key key;
  1939. int need_clear = 0;
  1940. u64 cache_gen;
  1941. /*
  1942. * Either no extent root (with ibadroots rescue option) or we have
  1943. * unsupported RO options. The fs can never be mounted read-write, so no
  1944. * need to waste time searching block group items.
  1945. *
  1946. * This also allows new extent tree related changes to be RO compat,
  1947. * no need for a full incompat flag.
  1948. */
  1949. if (!root || (btrfs_super_compat_ro_flags(info->super_copy) &
  1950. ~BTRFS_FEATURE_COMPAT_RO_SUPP))
  1951. return fill_dummy_bgs(info);
  1952. key.objectid = 0;
  1953. key.offset = 0;
  1954. key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
  1955. path = btrfs_alloc_path();
  1956. if (!path)
  1957. return -ENOMEM;
  1958. cache_gen = btrfs_super_cache_generation(info->super_copy);
  1959. if (btrfs_test_opt(info, SPACE_CACHE) &&
  1960. btrfs_super_generation(info->super_copy) != cache_gen)
  1961. need_clear = 1;
  1962. if (btrfs_test_opt(info, CLEAR_CACHE))
  1963. need_clear = 1;
  1964. while (1) {
  1965. struct btrfs_block_group_item bgi;
  1966. struct extent_buffer *leaf;
  1967. int slot;
  1968. ret = find_first_block_group(info, path, &key);
  1969. if (ret > 0)
  1970. break;
  1971. if (ret != 0)
  1972. goto error;
  1973. leaf = path->nodes[0];
  1974. slot = path->slots[0];
  1975. read_extent_buffer(leaf, &bgi, btrfs_item_ptr_offset(leaf, slot),
  1976. sizeof(bgi));
  1977. btrfs_item_key_to_cpu(leaf, &key, slot);
  1978. btrfs_release_path(path);
  1979. ret = read_one_block_group(info, &bgi, &key, need_clear);
  1980. if (ret < 0)
  1981. goto error;
  1982. key.objectid += key.offset;
  1983. key.offset = 0;
  1984. }
  1985. btrfs_release_path(path);
  1986. list_for_each_entry(space_info, &info->space_info, list) {
  1987. int i;
  1988. for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
  1989. if (list_empty(&space_info->block_groups[i]))
  1990. continue;
  1991. cache = list_first_entry(&space_info->block_groups[i],
  1992. struct btrfs_block_group,
  1993. list);
  1994. btrfs_sysfs_add_block_group_type(cache);
  1995. }
  1996. if (!(btrfs_get_alloc_profile(info, space_info->flags) &
  1997. (BTRFS_BLOCK_GROUP_RAID10 |
  1998. BTRFS_BLOCK_GROUP_RAID1_MASK |
  1999. BTRFS_BLOCK_GROUP_RAID56_MASK |
  2000. BTRFS_BLOCK_GROUP_DUP)))
  2001. continue;
  2002. /*
  2003. * Avoid allocating from un-mirrored block group if there are
  2004. * mirrored block groups.
  2005. */
  2006. list_for_each_entry(cache,
  2007. &space_info->block_groups[BTRFS_RAID_RAID0],
  2008. list)
  2009. inc_block_group_ro(cache, 1);
  2010. list_for_each_entry(cache,
  2011. &space_info->block_groups[BTRFS_RAID_SINGLE],
  2012. list)
  2013. inc_block_group_ro(cache, 1);
  2014. }
  2015. btrfs_init_global_block_rsv(info);
  2016. ret = check_chunk_block_group_mappings(info);
  2017. error:
  2018. btrfs_free_path(path);
  2019. /*
2020. * We've hit some error while reading the extent tree, and have the
2021. * rescue=ibadroots mount option set.
  2022. * Try to fill the tree using dummy block groups so that the user can
  2023. * continue to mount and grab their data.
  2024. */
  2025. if (ret && btrfs_test_opt(info, IGNOREBADROOTS))
  2026. ret = fill_dummy_bgs(info);
  2027. return ret;
  2028. }
  2029. /*
2030. * This function, insert_block_group_item(), belongs to phase 2 of chunk
  2031. * allocation.
  2032. *
  2033. * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
  2034. * phases.
  2035. */
  2036. static int insert_block_group_item(struct btrfs_trans_handle *trans,
  2037. struct btrfs_block_group *block_group)
  2038. {
  2039. struct btrfs_fs_info *fs_info = trans->fs_info;
  2040. struct btrfs_block_group_item bgi;
  2041. struct btrfs_root *root = btrfs_block_group_root(fs_info);
  2042. struct btrfs_key key;
  2043. spin_lock(&block_group->lock);
  2044. btrfs_set_stack_block_group_used(&bgi, block_group->used);
  2045. btrfs_set_stack_block_group_chunk_objectid(&bgi,
  2046. block_group->global_root_id);
  2047. btrfs_set_stack_block_group_flags(&bgi, block_group->flags);
  2048. key.objectid = block_group->start;
  2049. key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
  2050. key.offset = block_group->length;
  2051. spin_unlock(&block_group->lock);
  2052. return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi));
  2053. }
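/*
 * Insert a device extent item into the device tree, recording that the
 * @num_bytes at @start on @device belong to the chunk at @chunk_offset.
 */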
  2054. static int insert_dev_extent(struct btrfs_trans_handle *trans,
  2055. struct btrfs_device *device, u64 chunk_offset,
  2056. u64 start, u64 num_bytes)
  2057. {
  2058. struct btrfs_fs_info *fs_info = device->fs_info;
  2059. struct btrfs_root *root = fs_info->dev_root;
  2060. struct btrfs_path *path;
  2061. struct btrfs_dev_extent *extent;
  2062. struct extent_buffer *leaf;
  2063. struct btrfs_key key;
  2064. int ret;
  2065. WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
  2066. WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
  2067. path = btrfs_alloc_path();
  2068. if (!path)
  2069. return -ENOMEM;
  2070. key.objectid = device->devid;
  2071. key.type = BTRFS_DEV_EXTENT_KEY;
  2072. key.offset = start;
  2073. ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent));
  2074. if (ret)
  2075. goto out;
  2076. leaf = path->nodes[0];
  2077. extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
  2078. btrfs_set_dev_extent_chunk_tree(leaf, extent, BTRFS_CHUNK_TREE_OBJECTID);
  2079. btrfs_set_dev_extent_chunk_objectid(leaf, extent,
  2080. BTRFS_FIRST_CHUNK_TREE_OBJECTID);
  2081. btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
  2082. btrfs_set_dev_extent_length(leaf, extent, num_bytes);
  2083. btrfs_mark_buffer_dirty(leaf);
  2084. out:
  2085. btrfs_free_path(path);
  2086. return ret;
  2087. }
  2088. /*
  2089. * This function belongs to phase 2.
  2090. *
  2091. * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
  2092. * phases.
  2093. */
  2094. static int insert_dev_extents(struct btrfs_trans_handle *trans,
  2095. u64 chunk_offset, u64 chunk_size)
  2096. {
  2097. struct btrfs_fs_info *fs_info = trans->fs_info;
  2098. struct btrfs_device *device;
  2099. struct extent_map *em;
  2100. struct map_lookup *map;
  2101. u64 dev_offset;
  2102. u64 stripe_size;
  2103. int i;
  2104. int ret = 0;
  2105. em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
  2106. if (IS_ERR(em))
  2107. return PTR_ERR(em);
  2108. map = em->map_lookup;
  2109. stripe_size = em->orig_block_len;
  2110. /*
  2111. * Take the device list mutex to prevent races with the final phase of
  2112. * a device replace operation that replaces the device object associated
  2113. * with the map's stripes, because the device object's id can change
  2114. * at any time during that final phase of the device replace operation
  2115. * (dev-replace.c:btrfs_dev_replace_finishing()), so we could grab the
  2116. * replaced device and then see it with an ID of BTRFS_DEV_REPLACE_DEVID,
  2117. * resulting in persisting a device extent item with such ID.
  2118. */
  2119. mutex_lock(&fs_info->fs_devices->device_list_mutex);
  2120. for (i = 0; i < map->num_stripes; i++) {
  2121. device = map->stripes[i].dev;
  2122. dev_offset = map->stripes[i].physical;
  2123. ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
  2124. stripe_size);
  2125. if (ret)
  2126. break;
  2127. }
  2128. mutex_unlock(&fs_info->fs_devices->device_list_mutex);
  2129. free_extent_map(em);
  2130. return ret;
  2131. }
  2132. /*
2133. * This function, btrfs_create_pending_block_groups(), belongs to phase 2 of
  2134. * chunk allocation.
  2135. *
  2136. * See the comment at btrfs_chunk_alloc() for details about the chunk allocation
  2137. * phases.
  2138. */
  2139. void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
  2140. {
  2141. struct btrfs_fs_info *fs_info = trans->fs_info;
  2142. struct btrfs_block_group *block_group;
  2143. int ret = 0;
  2144. while (!list_empty(&trans->new_bgs)) {
  2145. int index;
  2146. block_group = list_first_entry(&trans->new_bgs,
  2147. struct btrfs_block_group,
  2148. bg_list);
  2149. if (ret)
  2150. goto next;
  2151. index = btrfs_bg_flags_to_raid_index(block_group->flags);
  2152. ret = insert_block_group_item(trans, block_group);
  2153. if (ret)
  2154. btrfs_abort_transaction(trans, ret);
  2155. if (!test_bit(BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
  2156. &block_group->runtime_flags)) {
  2157. mutex_lock(&fs_info->chunk_mutex);
  2158. ret = btrfs_chunk_alloc_add_chunk_item(trans, block_group);
  2159. mutex_unlock(&fs_info->chunk_mutex);
  2160. if (ret)
  2161. btrfs_abort_transaction(trans, ret);
  2162. }
  2163. ret = insert_dev_extents(trans, block_group->start,
  2164. block_group->length);
  2165. if (ret)
  2166. btrfs_abort_transaction(trans, ret);
  2167. add_block_group_free_space(trans, block_group);
  2168. /*
  2169. * If we restriped during balance, we may have added a new raid
  2170. * type, so now add the sysfs entries when it is safe to do so.
  2171. * We don't have to worry about locking here as it's handled in
  2172. * btrfs_sysfs_add_block_group_type.
  2173. */
  2174. if (block_group->space_info->block_group_kobjs[index] == NULL)
  2175. btrfs_sysfs_add_block_group_type(block_group);
  2176. /* Already aborted the transaction if it failed. */
  2177. next:
  2178. btrfs_delayed_refs_rsv_release(fs_info, 1);
  2179. list_del_init(&block_group->bg_list);
  2180. clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags);
  2181. }
  2182. btrfs_trans_release_chunk_metadata(trans);
  2183. }
  2184. /*
  2185. * For extent tree v2 we use the block_group_item->chunk_offset to point at our
  2186. * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID.
  2187. */
  2188. static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset)
  2189. {
  2190. u64 div = SZ_1G;
  2191. u64 index;
  2192. if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
  2193. return BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2194. /* For a smaller fs (<= 10GiB), base the index on 128MiB instead of 1GiB. */
  2195. if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL))
  2196. div = SZ_128M;
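/*
 * Worked example (illustrative): on a filesystem larger than 10GiB with 4
 * global roots, a block group at offset 5GiB maps to (5GiB / 1GiB) % 4 = 1.
 */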
  2197. offset = div64_u64(offset, div);
  2198. div64_u64_rem(offset, fs_info->nr_global_roots, &index);
  2199. return index;
  2200. }
  2201. struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans,
  2202. u64 bytes_used, u64 type,
  2203. u64 chunk_offset, u64 size)
  2204. {
  2205. struct btrfs_fs_info *fs_info = trans->fs_info;
  2206. struct btrfs_block_group *cache;
  2207. int ret;
  2208. btrfs_set_log_full_commit(trans);
  2209. cache = btrfs_create_block_group_cache(fs_info, chunk_offset);
  2210. if (!cache)
  2211. return ERR_PTR(-ENOMEM);
  2212. /*
  2213. * Mark it as new before adding it to the rbtree of block groups or any
  2214. * list, so that no other task finds it and calls btrfs_mark_bg_unused()
  2215. * before the new flag is set.
  2216. */
  2217. set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags);
  2218. cache->length = size;
  2219. set_free_space_tree_thresholds(cache);
  2220. cache->used = bytes_used;
  2221. cache->flags = type;
  2222. cache->cached = BTRFS_CACHE_FINISHED;
  2223. cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
  2224. if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
  2225. set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
  2226. ret = btrfs_load_block_group_zone_info(cache, true);
  2227. if (ret) {
  2228. btrfs_put_block_group(cache);
  2229. return ERR_PTR(ret);
  2230. }
  2231. ret = exclude_super_stripes(cache);
  2232. if (ret) {
  2233. /* We may have excluded something, so call this just in case */
  2234. btrfs_free_excluded_extents(cache);
  2235. btrfs_put_block_group(cache);
  2236. return ERR_PTR(ret);
  2237. }
  2238. ret = add_new_free_space(cache, chunk_offset, chunk_offset + size, NULL);
  2239. btrfs_free_excluded_extents(cache);
  2240. if (ret) {
  2241. btrfs_put_block_group(cache);
  2242. return ERR_PTR(ret);
  2243. }
  2244. /*
  2245. * Ensure the corresponding space_info object is created and
  2246. * assigned to our block group. We want our bg to be added to the rbtree
  2247. * with its ->space_info set.
  2248. */
  2249. cache->space_info = btrfs_find_space_info(fs_info, cache->flags);
  2250. ASSERT(cache->space_info);
  2251. ret = btrfs_add_block_group_cache(fs_info, cache);
  2252. if (ret) {
  2253. btrfs_remove_free_space_cache(cache);
  2254. btrfs_put_block_group(cache);
  2255. return ERR_PTR(ret);
  2256. }
  2257. /*
  2258. * Now that our block group has its ->space_info set and is inserted in
  2259. * the rbtree, update the space info's counters.
  2260. */
  2261. trace_btrfs_add_block_group(fs_info, cache, 1);
  2262. btrfs_add_bg_to_space_info(fs_info, cache);
  2263. btrfs_update_global_block_rsv(fs_info);
  2264. #ifdef CONFIG_BTRFS_DEBUG
  2265. if (btrfs_should_fragment_free_space(cache)) {
  2266. u64 new_bytes_used = size - bytes_used;
  2267. cache->space_info->bytes_used += new_bytes_used >> 1;
  2268. fragment_free_space(cache);
  2269. }
  2270. #endif
  2271. list_add_tail(&cache->bg_list, &trans->new_bgs);
  2272. trans->delayed_ref_updates++;
  2273. btrfs_update_delayed_refs_rsv(trans);
  2274. set_avail_alloc_bits(fs_info, type);
  2275. return cache;
  2276. }
  2277. /*
  2278. * Mark one block group RO, can be called several times for the same block
  2279. * group.
  2280. *
  2281. * @cache: the destination block group
2282. * @do_chunk_alloc: whether we need to do chunk pre-allocation; this is to
  2283. * ensure we still have some free space after marking this
  2284. * block group RO.
  2285. */
  2286. int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
  2287. bool do_chunk_alloc)
  2288. {
  2289. struct btrfs_fs_info *fs_info = cache->fs_info;
  2290. struct btrfs_trans_handle *trans;
  2291. struct btrfs_root *root = btrfs_block_group_root(fs_info);
  2292. u64 alloc_flags;
  2293. int ret;
  2294. bool dirty_bg_running;
  2295. /*
  2296. * This can only happen when we are doing read-only scrub on read-only
  2297. * mount.
  2298. * In that case we should not start a new transaction on read-only fs.
  2299. * Thus here we skip all chunk allocations.
  2300. */
  2301. if (sb_rdonly(fs_info->sb)) {
  2302. mutex_lock(&fs_info->ro_block_group_mutex);
  2303. ret = inc_block_group_ro(cache, 0);
  2304. mutex_unlock(&fs_info->ro_block_group_mutex);
  2305. return ret;
  2306. }
  2307. do {
  2308. trans = btrfs_join_transaction(root);
  2309. if (IS_ERR(trans))
  2310. return PTR_ERR(trans);
  2311. dirty_bg_running = false;
  2312. /*
  2313. * We're not allowed to set block groups readonly after the dirty
  2314. * block group cache has started writing. If it already started,
  2315. * back off and let this transaction commit.
  2316. */
  2317. mutex_lock(&fs_info->ro_block_group_mutex);
  2318. if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
  2319. u64 transid = trans->transid;
  2320. mutex_unlock(&fs_info->ro_block_group_mutex);
  2321. btrfs_end_transaction(trans);
  2322. ret = btrfs_wait_for_commit(fs_info, transid);
  2323. if (ret)
  2324. return ret;
  2325. dirty_bg_running = true;
  2326. }
  2327. } while (dirty_bg_running);
  2328. if (do_chunk_alloc) {
  2329. /*
  2330. * If we are changing raid levels, try to allocate a
  2331. * corresponding block group with the new raid level.
  2332. */
  2333. alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
  2334. if (alloc_flags != cache->flags) {
  2335. ret = btrfs_chunk_alloc(trans, alloc_flags,
  2336. CHUNK_ALLOC_FORCE);
  2337. /*
  2338. * ENOSPC is allowed here, we may have enough space
  2339. * already allocated at the new raid level to carry on
  2340. */
  2341. if (ret == -ENOSPC)
  2342. ret = 0;
  2343. if (ret < 0)
  2344. goto out;
  2345. }
  2346. }
  2347. ret = inc_block_group_ro(cache, 0);
  2348. if (!ret)
  2349. goto out;
  2350. if (ret == -ETXTBSY)
  2351. goto unlock_out;
  2352. /*
2353. * Skip chunk allocation if the bg is SYSTEM, this is to avoid a system
2354. * chunk allocation storm exhausting the system chunk array. Otherwise
  2355. * we still want to try our best to mark the block group read-only.
  2356. */
  2357. if (!do_chunk_alloc && ret == -ENOSPC &&
  2358. (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
  2359. goto unlock_out;
  2360. alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags);
  2361. ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
  2362. if (ret < 0)
  2363. goto out;
  2364. /*
  2365. * We have allocated a new chunk. We also need to activate that chunk to
  2366. * grant metadata tickets for zoned filesystem.
  2367. */
  2368. ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
  2369. if (ret < 0)
  2370. goto out;
  2371. ret = inc_block_group_ro(cache, 0);
  2372. if (ret == -ETXTBSY)
  2373. goto unlock_out;
  2374. out:
  2375. if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
  2376. alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags);
  2377. mutex_lock(&fs_info->chunk_mutex);
  2378. check_system_chunk(trans, alloc_flags);
  2379. mutex_unlock(&fs_info->chunk_mutex);
  2380. }
  2381. unlock_out:
  2382. mutex_unlock(&fs_info->ro_block_group_mutex);
  2383. btrfs_end_transaction(trans);
  2384. return ret;
  2385. }
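/*
 * Drop one read-only reference on @cache. When the last reference is dropped,
 * move its bytes out of the space_info's read-only accounting (and restore the
 * zone_unusable accounting on zoned filesystems).
 */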
  2386. void btrfs_dec_block_group_ro(struct btrfs_block_group *cache)
  2387. {
  2388. struct btrfs_space_info *sinfo = cache->space_info;
  2389. u64 num_bytes;
  2390. BUG_ON(!cache->ro);
  2391. spin_lock(&sinfo->lock);
  2392. spin_lock(&cache->lock);
  2393. if (!--cache->ro) {
  2394. if (btrfs_is_zoned(cache->fs_info)) {
  2395. /* Migrate zone_unusable bytes back */
  2396. cache->zone_unusable =
  2397. (cache->alloc_offset - cache->used) +
  2398. (cache->length - cache->zone_capacity);
  2399. sinfo->bytes_zone_unusable += cache->zone_unusable;
  2400. sinfo->bytes_readonly -= cache->zone_unusable;
  2401. }
  2402. num_bytes = cache->length - cache->reserved -
  2403. cache->pinned - cache->bytes_super -
  2404. cache->zone_unusable - cache->used;
  2405. sinfo->bytes_readonly -= num_bytes;
  2406. list_del_init(&cache->ro_list);
  2407. }
  2408. spin_unlock(&cache->lock);
  2409. spin_unlock(&sinfo->lock);
  2410. }
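/*
 * Write the cached used bytes, global root id and flags of @cache back into
 * its block group item on disk.
 */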
  2411. static int update_block_group_item(struct btrfs_trans_handle *trans,
  2412. struct btrfs_path *path,
  2413. struct btrfs_block_group *cache)
  2414. {
  2415. struct btrfs_fs_info *fs_info = trans->fs_info;
  2416. int ret;
  2417. struct btrfs_root *root = btrfs_block_group_root(fs_info);
  2418. unsigned long bi;
  2419. struct extent_buffer *leaf;
  2420. struct btrfs_block_group_item bgi;
  2421. struct btrfs_key key;
  2422. key.objectid = cache->start;
  2423. key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
  2424. key.offset = cache->length;
  2425. ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
  2426. if (ret) {
  2427. if (ret > 0)
  2428. ret = -ENOENT;
  2429. goto fail;
  2430. }
  2431. leaf = path->nodes[0];
  2432. bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
  2433. btrfs_set_stack_block_group_used(&bgi, cache->used);
  2434. btrfs_set_stack_block_group_chunk_objectid(&bgi,
  2435. cache->global_root_id);
  2436. btrfs_set_stack_block_group_flags(&bgi, cache->flags);
  2437. write_extent_buffer(leaf, &bgi, bi, sizeof(bgi));
  2438. btrfs_mark_buffer_dirty(leaf);
  2439. fail:
  2440. btrfs_release_path(path);
  2441. return ret;
  2442. }
  2443. static int cache_save_setup(struct btrfs_block_group *block_group,
  2444. struct btrfs_trans_handle *trans,
  2445. struct btrfs_path *path)
  2446. {
  2447. struct btrfs_fs_info *fs_info = block_group->fs_info;
  2448. struct btrfs_root *root = fs_info->tree_root;
  2449. struct inode *inode = NULL;
  2450. struct extent_changeset *data_reserved = NULL;
  2451. u64 alloc_hint = 0;
  2452. int dcs = BTRFS_DC_ERROR;
  2453. u64 cache_size = 0;
  2454. int retries = 0;
  2455. int ret = 0;
  2456. if (!btrfs_test_opt(fs_info, SPACE_CACHE))
  2457. return 0;
  2458. /*
2459. * If this block group is smaller than 100MiB, don't bother caching the
  2460. * block group.
  2461. */
  2462. if (block_group->length < (100 * SZ_1M)) {
  2463. spin_lock(&block_group->lock);
  2464. block_group->disk_cache_state = BTRFS_DC_WRITTEN;
  2465. spin_unlock(&block_group->lock);
  2466. return 0;
  2467. }
  2468. if (TRANS_ABORTED(trans))
  2469. return 0;
  2470. again:
  2471. inode = lookup_free_space_inode(block_group, path);
  2472. if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
  2473. ret = PTR_ERR(inode);
  2474. btrfs_release_path(path);
  2475. goto out;
  2476. }
  2477. if (IS_ERR(inode)) {
  2478. BUG_ON(retries);
  2479. retries++;
  2480. if (block_group->ro)
  2481. goto out_free;
  2482. ret = create_free_space_inode(trans, block_group, path);
  2483. if (ret)
  2484. goto out_free;
  2485. goto again;
  2486. }
  2487. /*
  2488. * We want to set the generation to 0, that way if anything goes wrong
  2489. * from here on out we know not to trust this cache when we load up next
  2490. * time.
  2491. */
  2492. BTRFS_I(inode)->generation = 0;
  2493. ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
  2494. if (ret) {
  2495. /*
  2496. * So theoretically we could recover from this, simply set the
  2497. * super cache generation to 0 so we know to invalidate the
  2498. * cache, but then we'd have to keep track of the block groups
  2499. * that fail this way so we know we _have_ to reset this cache
  2500. * before the next commit or risk reading stale cache. So to
2501. * limit our exposure to horrible edge cases, let's just abort the
  2502. * transaction, this only happens in really bad situations
  2503. * anyway.
  2504. */
  2505. btrfs_abort_transaction(trans, ret);
  2506. goto out_put;
  2507. }
  2508. WARN_ON(ret);
  2509. /* We've already setup this transaction, go ahead and exit */
  2510. if (block_group->cache_generation == trans->transid &&
  2511. i_size_read(inode)) {
  2512. dcs = BTRFS_DC_SETUP;
  2513. goto out_put;
  2514. }
  2515. if (i_size_read(inode) > 0) {
  2516. ret = btrfs_check_trunc_cache_free_space(fs_info,
  2517. &fs_info->global_block_rsv);
  2518. if (ret)
  2519. goto out_put;
  2520. ret = btrfs_truncate_free_space_cache(trans, NULL, inode);
  2521. if (ret)
  2522. goto out_put;
  2523. }
  2524. spin_lock(&block_group->lock);
  2525. if (block_group->cached != BTRFS_CACHE_FINISHED ||
  2526. !btrfs_test_opt(fs_info, SPACE_CACHE)) {
  2527. /*
  2528. * don't bother trying to write stuff out _if_
  2529. * a) we're not cached,
  2530. * b) we're with nospace_cache mount option,
  2531. * c) we're with v2 space_cache (FREE_SPACE_TREE).
  2532. */
  2533. dcs = BTRFS_DC_WRITTEN;
  2534. spin_unlock(&block_group->lock);
  2535. goto out_put;
  2536. }
  2537. spin_unlock(&block_group->lock);
  2538. /*
  2539. * We hit an ENOSPC when setting up the cache in this transaction, just
  2540. * skip doing the setup, we've already cleared the cache so we're safe.
  2541. */
  2542. if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) {
  2543. ret = -ENOSPC;
  2544. goto out_put;
  2545. }
  2546. /*
  2547. * Try to preallocate enough space based on how big the block group is.
  2548. * Keep in mind this has to include any pinned space which could end up
  2549. * taking up quite a bit since it's not folded into the other space
  2550. * cache.
  2551. */
  2552. cache_size = div_u64(block_group->length, SZ_256M);
  2553. if (!cache_size)
  2554. cache_size = 1;
  2555. cache_size *= 16;
  2556. cache_size *= fs_info->sectorsize;
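	/*
	 * For example, with a 4K sectorsize a 1 GiB block group ends up
	 * preallocating (1G / 256M) * 16 * 4K = 256 KiB for its cache file.
	 */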
	ret = btrfs_check_data_free_space(BTRFS_I(inode), &data_reserved, 0,
					  cache_size, false);
	if (ret)
		goto out_put;

	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, cache_size,
					      cache_size, cache_size,
					      &alloc_hint);
	/*
	 * Our cache requires contiguous chunks so that we don't modify a bunch
	 * of metadata or split extents when writing the cache out, which means
	 * we can enospc if we are heavily fragmented in addition to just normal
	 * out of space conditions. So if we hit this just skip setting up any
	 * other block groups for this transaction, maybe we'll unpin enough
	 * space the next time around.
	 */
	if (!ret)
		dcs = BTRFS_DC_SETUP;
	else if (ret == -ENOSPC)
		set_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags);

out_put:
	iput(inode);
out_free:
	btrfs_release_path(path);
out:
	spin_lock(&block_group->lock);
	if (!ret && dcs == BTRFS_DC_SETUP)
		block_group->cache_generation = trans->transid;
	block_group->disk_cache_state = dcs;
	spin_unlock(&block_group->lock);
	extent_changeset_free(data_reserved);
	return ret;
}
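
/*
 * Run cache_save_setup() for every block group on the transaction's dirty list
 * whose v1 free space cache was invalidated (BTRFS_DC_CLEAR), so that each
 * cache inode is ready before the caches are written out.
 */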
int btrfs_setup_space_cache(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group *cache, *tmp;
	struct btrfs_transaction *cur_trans = trans->transaction;
	struct btrfs_path *path;

	if (list_empty(&cur_trans->dirty_bgs) ||
	    !btrfs_test_opt(fs_info, SPACE_CACHE))
		return 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Could add new block groups, use _safe just in case */
	list_for_each_entry_safe(cache, tmp, &cur_trans->dirty_bgs,
				 dirty_list) {
		if (cache->disk_cache_state == BTRFS_DC_CLEAR)
			cache_save_setup(cache, trans, path);
	}

	btrfs_free_path(path);
	return 0;
}

/*
 * Transaction commit does final block group cache writeback during a critical
 * section where nothing is allowed to change the FS. This is required in
 * order for the cache to actually match the block group, but can introduce a
 * lot of latency into the commit.
 *
 * So, btrfs_start_dirty_block_groups is here to kick off block group cache IO.
 * There's a chance we'll have to redo some of it if the block group changes
 * again during the commit, but it greatly reduces the commit latency by
 * getting rid of the easy block groups while we're still allowing others to
 * join the commit.
 */
int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group *cache;
	struct btrfs_transaction *cur_trans = trans->transaction;
	int ret = 0;
	int should_put;
	struct btrfs_path *path = NULL;
	LIST_HEAD(dirty);
	struct list_head *io = &cur_trans->io_bgs;
	int loops = 0;

	spin_lock(&cur_trans->dirty_bgs_lock);
	if (list_empty(&cur_trans->dirty_bgs)) {
		spin_unlock(&cur_trans->dirty_bgs_lock);
		return 0;
	}
	list_splice_init(&cur_trans->dirty_bgs, &dirty);
	spin_unlock(&cur_trans->dirty_bgs_lock);

again:
	/* Make sure all the block groups on our dirty list actually exist */
	btrfs_create_pending_block_groups(trans);

	if (!path) {
		path = btrfs_alloc_path();
		if (!path) {
			ret = -ENOMEM;
			goto out;
		}
	}

	/*
	 * cache_write_mutex is here only to save us from balance or automatic
	 * removal of empty block groups deleting this block group while we are
	 * writing out the cache
	 */
	mutex_lock(&trans->transaction->cache_write_mutex);
	while (!list_empty(&dirty)) {
		bool drop_reserve = true;

		cache = list_first_entry(&dirty, struct btrfs_block_group,
					 dirty_list);
		/*
		 * This can happen if something re-dirties a block group that
		 * is already under IO. Just wait for it to finish and then do
		 * it all again
		 */
		if (!list_empty(&cache->io_list)) {
			list_del_init(&cache->io_list);
			btrfs_wait_cache_io(trans, cache, path);
			btrfs_put_block_group(cache);
		}

		/*
		 * btrfs_wait_cache_io uses the cache->dirty_list to decide if
		 * it should update the cache_state. Don't delete until after
		 * we wait.
		 *
		 * Since we're not running in the commit critical section
		 * we need the dirty_bgs_lock to protect from update_block_group
		 */
		spin_lock(&cur_trans->dirty_bgs_lock);
		list_del_init(&cache->dirty_list);
		spin_unlock(&cur_trans->dirty_bgs_lock);

		should_put = 1;

		cache_save_setup(cache, trans, path);

		if (cache->disk_cache_state == BTRFS_DC_SETUP) {
			cache->io_ctl.inode = NULL;
			ret = btrfs_write_out_cache(trans, cache, path);
			if (ret == 0 && cache->io_ctl.inode) {
				should_put = 0;

				/*
				 * The cache_write_mutex is protecting the
				 * io_list, also refer to the definition of
				 * btrfs_transaction::io_bgs for more details
				 */
				list_add_tail(&cache->io_list, io);
			} else {
				/*
				 * If we failed to write the cache, the
				 * generation will be bad and life goes on
				 */
				ret = 0;
			}
		}
		if (!ret) {
			ret = update_block_group_item(trans, path, cache);
			/*
			 * Our block group might still be attached to the list
			 * of new block groups in the transaction handle of some
			 * other task (struct btrfs_trans_handle->new_bgs). This
			 * means its block group item isn't yet in the extent
			 * tree. If this happens ignore the error, as we will
			 * try again later in the critical section of the
			 * transaction commit.
			 */
			if (ret == -ENOENT) {
				ret = 0;
				spin_lock(&cur_trans->dirty_bgs_lock);
				if (list_empty(&cache->dirty_list)) {
					list_add_tail(&cache->dirty_list,
						      &cur_trans->dirty_bgs);
					btrfs_get_block_group(cache);
					drop_reserve = false;
				}
				spin_unlock(&cur_trans->dirty_bgs_lock);
			} else if (ret) {
				btrfs_abort_transaction(trans, ret);
			}
		}

		/* If it's not on the io list, we need to put the block group */
		if (should_put)
			btrfs_put_block_group(cache);
		if (drop_reserve)
			btrfs_delayed_refs_rsv_release(fs_info, 1);
		/*
		 * Avoid blocking other tasks for too long. It might even save
		 * us from writing caches for block groups that are going to be
		 * removed.
		 */
		mutex_unlock(&trans->transaction->cache_write_mutex);
		if (ret)
			goto out;
		mutex_lock(&trans->transaction->cache_write_mutex);
	}
	mutex_unlock(&trans->transaction->cache_write_mutex);

	/*
	 * Go through delayed refs for all the stuff we've just kicked off
	 * and then loop back (just once)
	 */
	if (!ret)
		ret = btrfs_run_delayed_refs(trans, 0);
	if (!ret && loops == 0) {
		loops++;
		spin_lock(&cur_trans->dirty_bgs_lock);
		list_splice_init(&cur_trans->dirty_bgs, &dirty);
		/*
		 * dirty_bgs_lock protects us from concurrent block group
		 * deletes too (not just cache_write_mutex).
		 */
		if (!list_empty(&dirty)) {
			spin_unlock(&cur_trans->dirty_bgs_lock);
			goto again;
		}
		spin_unlock(&cur_trans->dirty_bgs_lock);
	}
out:
	if (ret < 0) {
		spin_lock(&cur_trans->dirty_bgs_lock);
		list_splice_init(&dirty, &cur_trans->dirty_bgs);
		spin_unlock(&cur_trans->dirty_bgs_lock);
		btrfs_cleanup_dirty_bgs(cur_trans, fs_info);
	}

	btrfs_free_path(path);
	return ret;
}
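
/*
 * Called during the critical section of the transaction commit: write out the
 * free space cache and block group item of every block group that is still
 * dirty, then wait for all the cache IO started here and earlier by
 * btrfs_start_dirty_block_groups().
 */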
int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_group *cache;
	struct btrfs_transaction *cur_trans = trans->transaction;
	int ret = 0;
	int should_put;
	struct btrfs_path *path;
	struct list_head *io = &cur_trans->io_bgs;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/*
	 * Even though we are in the critical section of the transaction commit,
	 * we can still have concurrent tasks adding elements to this
	 * transaction's list of dirty block groups. These tasks correspond to
	 * endio free space workers started when writeback finishes for a
	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
	 * allocate new block groups as a result of COWing nodes of the root
	 * tree when updating the free space inode. The writeback for the space
	 * caches is triggered by an earlier call to
	 * btrfs_start_dirty_block_groups() and iterations of the following
	 * loop.
	 * Also we want to do the cache_save_setup first and then run the
	 * delayed refs to make sure we have the best chance at doing this all
	 * in one shot.
	 */
	spin_lock(&cur_trans->dirty_bgs_lock);
	while (!list_empty(&cur_trans->dirty_bgs)) {
		cache = list_first_entry(&cur_trans->dirty_bgs,
					 struct btrfs_block_group,
					 dirty_list);

		/*
		 * This can happen if cache_save_setup re-dirties a block group
		 * that is already under IO. Just wait for it to finish and
		 * then do it all again
		 */
		if (!list_empty(&cache->io_list)) {
			spin_unlock(&cur_trans->dirty_bgs_lock);
			list_del_init(&cache->io_list);
			btrfs_wait_cache_io(trans, cache, path);
			btrfs_put_block_group(cache);
			spin_lock(&cur_trans->dirty_bgs_lock);
		}

		/*
		 * Don't remove from the dirty list until after we've waited on
		 * any pending IO
		 */
		list_del_init(&cache->dirty_list);
		spin_unlock(&cur_trans->dirty_bgs_lock);
		should_put = 1;

		cache_save_setup(cache, trans, path);

		if (!ret)
			ret = btrfs_run_delayed_refs(trans,
						     (unsigned long) -1);

		if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP) {
			cache->io_ctl.inode = NULL;
			ret = btrfs_write_out_cache(trans, cache, path);
			if (ret == 0 && cache->io_ctl.inode) {
				should_put = 0;
				list_add_tail(&cache->io_list, io);
			} else {
				/*
				 * If we failed to write the cache, the
				 * generation will be bad and life goes on
				 */
				ret = 0;
			}
		}
		if (!ret) {
			ret = update_block_group_item(trans, path, cache);
			/*
			 * One of the free space endio workers might have
			 * created a new block group while updating a free space
			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
			 * and hasn't released its transaction handle yet, in
			 * which case the new block group is still attached to
			 * its transaction handle and its creation has not
			 * finished yet (no block group item in the extent tree
			 * yet, etc). If this is the case, wait for all free
			 * space endio workers to finish and retry. This is a
			 * very rare case so no need for a more efficient and
			 * complex approach.
			 */
			if (ret == -ENOENT) {
				wait_event(cur_trans->writer_wait,
				   atomic_read(&cur_trans->num_writers) == 1);
				ret = update_block_group_item(trans, path, cache);
			}
			if (ret)
				btrfs_abort_transaction(trans, ret);
		}
		/* If it's not on the io list, we need to put the block group */
		if (should_put)
			btrfs_put_block_group(cache);
		btrfs_delayed_refs_rsv_release(fs_info, 1);
		spin_lock(&cur_trans->dirty_bgs_lock);
	}
	spin_unlock(&cur_trans->dirty_bgs_lock);

	/*
	 * Refer to the definition of io_bgs member for details why it's safe
	 * to use it without any locking
	 */
	while (!list_empty(io)) {
		cache = list_first_entry(io, struct btrfs_block_group,
					 io_list);
		list_del_init(&cache->io_list);
		btrfs_wait_cache_io(trans, cache, path);
		btrfs_put_block_group(cache);
	}

	btrfs_free_path(path);
	return ret;
}
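
/*
 * Return true if freeing @bytes_freed made the block group's usage drop from
 * at or above the space_info's bg_reclaim_threshold to below it, i.e. this
 * free is the one that crossed the reclaim threshold.
 */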
static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
					      u64 bytes_freed)
{
	const struct btrfs_space_info *space_info = bg->space_info;
	const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
	const u64 new_val = bg->used;
	const u64 old_val = new_val + bytes_freed;
	u64 thresh;

	if (reclaim_thresh == 0)
		return false;

	thresh = div_factor_fine(bg->length, reclaim_thresh);

	/*
	 * If we were below the threshold before don't reclaim, we are likely a
	 * brand new block group and we don't want to relocate new block groups.
	 */
	if (old_val < thresh)
		return false;
	if (new_val >= thresh)
		return false;
	return true;
}
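
/*
 * Adjust the used bytes of the block group(s) covering @num_bytes starting at
 * @bytenr, after an extent allocation (@alloc == true) or free. This also
 * updates the super block and space_info accounting, marks the affected block
 * groups dirty and, on a free, pins the range and may queue the block group
 * for unused or reclaim handling.
 */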
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
			     u64 bytenr, u64 num_bytes, bool alloc)
{
	struct btrfs_fs_info *info = trans->fs_info;
	struct btrfs_block_group *cache = NULL;
	u64 total = num_bytes;
	u64 old_val;
	u64 byte_in_group;
	int factor;
	int ret = 0;

	/* Block accounting for super block */
	spin_lock(&info->delalloc_root_lock);
	old_val = btrfs_super_bytes_used(info->super_copy);
	if (alloc)
		old_val += num_bytes;
	else
		old_val -= num_bytes;
	btrfs_set_super_bytes_used(info->super_copy, old_val);
	spin_unlock(&info->delalloc_root_lock);

	while (total) {
		struct btrfs_space_info *space_info;
		bool reclaim = false;

		cache = btrfs_lookup_block_group(info, bytenr);
		if (!cache) {
			ret = -ENOENT;
			break;
		}
		space_info = cache->space_info;
		factor = btrfs_bg_type_to_factor(cache->flags);

		/*
		 * If this block group has free space cache written out, we
		 * need to make sure to load it if we are removing space. This
		 * is because we need the unpinning stage to actually add the
		 * space back to the block group, otherwise we will leak space.
		 */
		if (!alloc && !btrfs_block_group_done(cache))
			btrfs_cache_block_group(cache, true);

		byte_in_group = bytenr - cache->start;
		WARN_ON(byte_in_group > cache->length);

		spin_lock(&space_info->lock);
		spin_lock(&cache->lock);

		if (btrfs_test_opt(info, SPACE_CACHE) &&
		    cache->disk_cache_state < BTRFS_DC_CLEAR)
			cache->disk_cache_state = BTRFS_DC_CLEAR;

		old_val = cache->used;
		num_bytes = min(total, cache->length - byte_in_group);
		if (alloc) {
			old_val += num_bytes;
			cache->used = old_val;
			cache->reserved -= num_bytes;
			space_info->bytes_reserved -= num_bytes;
			space_info->bytes_used += num_bytes;
			space_info->disk_used += num_bytes * factor;
			spin_unlock(&cache->lock);
			spin_unlock(&space_info->lock);
		} else {
			old_val -= num_bytes;
			cache->used = old_val;
			cache->pinned += num_bytes;
			btrfs_space_info_update_bytes_pinned(info, space_info,
							     num_bytes);
			space_info->bytes_used -= num_bytes;
			space_info->disk_used -= num_bytes * factor;

			reclaim = should_reclaim_block_group(cache, num_bytes);

			spin_unlock(&cache->lock);
			spin_unlock(&space_info->lock);

			set_extent_dirty(&trans->transaction->pinned_extents,
					 bytenr, bytenr + num_bytes - 1,
					 GFP_NOFS | __GFP_NOFAIL);
		}

		spin_lock(&trans->transaction->dirty_bgs_lock);
		if (list_empty(&cache->dirty_list)) {
			list_add_tail(&cache->dirty_list,
				      &trans->transaction->dirty_bgs);
			trans->delayed_ref_updates++;
			btrfs_get_block_group(cache);
		}
		spin_unlock(&trans->transaction->dirty_bgs_lock);

		/*
		 * No longer have used bytes in this block group, queue it for
		 * deletion. We do this after adding the block group to the
		 * dirty list to avoid races between cleaner kthread and space
		 * cache writeout.
		 */
		if (!alloc && old_val == 0) {
			if (!btrfs_test_opt(info, DISCARD_ASYNC))
				btrfs_mark_bg_unused(cache);
		} else if (!alloc && reclaim) {
			btrfs_mark_bg_to_reclaim(cache);
		}

		btrfs_put_block_group(cache);
		total -= num_bytes;
		bytenr += num_bytes;
	}

	/* Modified block groups are accounted for in the delayed_refs_rsv. */
	btrfs_update_delayed_refs_rsv(trans);
	return ret;
}

/**
 * btrfs_add_reserved_bytes - update the block_group and space info counters
 * @cache:     The cache we are manipulating
 * @ram_bytes: The number of bytes of file content, and will be same to
 *             @num_bytes except for the compress path.
 * @num_bytes: The number of bytes in question
 * @delalloc:  The blocks are allocated for the delalloc write
 *
 * This is called by the allocator when it reserves space. If this is a
 * reservation and the block group has become read only we cannot make the
 * reservation and return -EAGAIN, otherwise this function always succeeds.
 */
int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
			     u64 ram_bytes, u64 num_bytes, int delalloc)
{
	struct btrfs_space_info *space_info = cache->space_info;
	int ret = 0;

	spin_lock(&space_info->lock);
	spin_lock(&cache->lock);
	if (cache->ro) {
		ret = -EAGAIN;
	} else {
		cache->reserved += num_bytes;
		space_info->bytes_reserved += num_bytes;
		trace_btrfs_space_reservation(cache->fs_info, "space_info",
					      space_info->flags, num_bytes, 1);
		btrfs_space_info_update_bytes_may_use(cache->fs_info,
						      space_info, -ram_bytes);
		if (delalloc)
			cache->delalloc_bytes += num_bytes;

		/*
		 * Compression can use less space than we reserved, so wake
		 * tickets if that happens
		 */
		if (num_bytes < ram_bytes)
			btrfs_try_granting_tickets(cache->fs_info, space_info);
	}
	spin_unlock(&cache->lock);
	spin_unlock(&space_info->lock);
	return ret;
}

/**
 * btrfs_free_reserved_bytes - update the block_group and space info counters
 * @cache:     The cache we are manipulating
 * @num_bytes: The number of bytes in question
 * @delalloc:  The blocks are allocated for the delalloc write
 *
 * This is called by somebody who is freeing space that was never actually used
 * on disk. For example if you reserve some space for a new leaf in transaction
 * A and before transaction A commits you free that leaf, you call this with
 * reserve set to 0 in order to clear the reservation.
 */
void btrfs_free_reserved_bytes(struct btrfs_block_group *cache,
			       u64 num_bytes, int delalloc)
{
	struct btrfs_space_info *space_info = cache->space_info;

	spin_lock(&space_info->lock);
	spin_lock(&cache->lock);
	if (cache->ro)
		space_info->bytes_readonly += num_bytes;
	cache->reserved -= num_bytes;
	space_info->bytes_reserved -= num_bytes;
	space_info->max_extent_size = 0;

	if (delalloc)
		cache->delalloc_bytes -= num_bytes;
	spin_unlock(&cache->lock);

	btrfs_try_granting_tickets(cache->fs_info, space_info);
	spin_unlock(&space_info->lock);
}
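
/*
 * Mark every metadata space_info so that the next allocation attempt is forced
 * to allocate a new metadata chunk. Used to maintain the configured
 * metadata_ratio between data and metadata chunk allocations.
 */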
static void force_metadata_allocation(struct btrfs_fs_info *info)
{
	struct list_head *head = &info->space_info;
	struct btrfs_space_info *found;

	list_for_each_entry(found, head, list) {
		if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
			found->force_alloc = CHUNK_ALLOC_FORCE;
	}
}

static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
			      struct btrfs_space_info *sinfo, int force)
{
	u64 bytes_used = btrfs_space_info_used(sinfo, false);
	u64 thresh;

	if (force == CHUNK_ALLOC_FORCE)
		return 1;

	/*
	 * in limited mode, we want to have some free space up to
	 * about 1% of the FS size.
	 */
	if (force == CHUNK_ALLOC_LIMITED) {
		thresh = btrfs_super_total_bytes(fs_info->super_copy);
		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));

		if (sinfo->total_bytes - bytes_used < thresh)
			return 1;
	}
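
	/*
	 * Otherwise only allocate once the space_info is more than ~80% used:
	 * div_factor(x, 8) is x * 8 / 10, so e.g. a 10 GiB metadata space_info
	 * only gets a new chunk once roughly 8 GiB of it is in use.
	 */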
	if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
		return 0;
	return 1;
}

int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type)
{
	u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type);

	return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
}
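
/*
 * Phase 1 of chunk allocation for a data or metadata chunk: reserve system
 * space, create the chunk and insert its chunk item (retrying once with a
 * freshly allocated system chunk if that fails with -ENOSPC), and return the
 * new block group with an extra reference held for the caller.
 */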
static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags)
{
	struct btrfs_block_group *bg;
	int ret;

	/*
	 * Check if we have enough space in the system space info because we
	 * will need to update device items in the chunk btree and insert a new
	 * chunk item in the chunk btree as well. This will allocate a new
	 * system block group if needed.
	 */
	check_system_chunk(trans, flags);

	bg = btrfs_create_chunk(trans, flags);
	if (IS_ERR(bg)) {
		ret = PTR_ERR(bg);
		goto out;
	}

	ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
	/*
	 * Normally we are not expected to fail with -ENOSPC here, since we have
	 * previously reserved space in the system space_info and allocated one
	 * new system chunk if necessary. However there are three exceptions:
	 *
	 * 1) We may have enough free space in the system space_info but all the
	 *    existing system block groups have a profile which can not be used
	 *    for extent allocation.
	 *
	 *    This happens when mounting in degraded mode. For example we have a
	 *    RAID1 filesystem with 2 devices, lose one device and mount the fs
	 *    using the other device in degraded mode. If we then allocate a chunk,
	 *    we may have enough free space in the existing system space_info, but
	 *    none of the block groups can be used for extent allocation since they
	 *    have a RAID1 profile, and because we are in degraded mode with a
	 *    single device, we are forced to allocate a new system chunk with a
	 *    SINGLE profile. Making check_system_chunk() iterate over all system
	 *    block groups and check if they have a usable profile and enough space
	 *    can be slow on very large filesystems, so we tolerate the -ENOSPC and
	 *    try again after forcing allocation of a new system chunk. Like this
	 *    we avoid paying the cost of that search in normal circumstances, when
	 *    we were not mounted in degraded mode;
	 *
	 * 2) We had enough free space in the system space_info, and one suitable
	 *    block group to allocate from when we called check_system_chunk()
	 *    above. However right after we called it, the only system block group
	 *    with enough free space got turned into RO mode by a running scrub,
	 *    and in this case we have to allocate a new one and retry. We only
	 *    need do this allocate and retry once, since we have a transaction
	 *    handle and scrub uses the commit root to search for block groups;
	 *
	 * 3) We had one system block group with enough free space when we called
	 *    check_system_chunk(), but after that, right before we tried to
	 *    allocate the last extent buffer we needed, a discard operation came
	 *    in and it temporarily removed the last free space entry from the
	 *    block group (discard removes a free space entry, discards it, and
	 *    then adds back the entry to the block group cache).
	 */
	if (ret == -ENOSPC) {
		const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info);
		struct btrfs_block_group *sys_bg;

		sys_bg = btrfs_create_chunk(trans, sys_flags);
		if (IS_ERR(sys_bg)) {
			ret = PTR_ERR(sys_bg);
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, sys_bg);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}

		ret = btrfs_chunk_alloc_add_chunk_item(trans, bg);
		if (ret) {
			btrfs_abort_transaction(trans, ret);
			goto out;
		}
	} else if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}
out:
	btrfs_trans_release_chunk_metadata(trans);

	if (ret)
		return ERR_PTR(ret);

	btrfs_get_block_group(bg);
	return bg;
}

/*
 * Chunk allocation is done in 2 phases:
 *
 * 1) Phase 1 - through btrfs_chunk_alloc() we allocate device extents for
 *    the chunk, the chunk mapping, create its block group and add the items
 *    that belong in the chunk btree to it - more specifically, we need to
 *    update device items in the chunk btree and add a new chunk item to it.
 *
 * 2) Phase 2 - through btrfs_create_pending_block_groups(), we add the block
 *    group item to the extent btree and the device extent items to the devices
 *    btree.
 *
 * This is done to prevent deadlocks. For example when COWing a node from the
 * extent btree we are holding a write lock on the node's parent and if we
 * trigger chunk allocation and attempt to insert the new block group item
 * in the extent btree right away, we could deadlock because the path for the
 * insertion can include that parent node. At first glance it seems impossible
 * to trigger chunk allocation after starting a transaction since tasks should
 * reserve enough transaction units (metadata space), however while that is true
 * most of the time, chunk allocation may still be triggered for several reasons:
 *
 * 1) When reserving metadata, we check if there is enough free space in the
 *    metadata space_info and therefore don't trigger allocation of a new chunk.
 *    However later when the task actually tries to COW an extent buffer from
 *    the extent btree or from the device btree for example, it is forced to
 *    allocate a new block group (chunk) because the only one that had enough
 *    free space was just turned to RO mode by a running scrub for example (or
 *    device replace, block group reclaim thread, etc), so we can not use it
 *    for allocating an extent and end up being forced to allocate a new one;
 *
 * 2) Because we only check that the metadata space_info has enough free bytes,
 *    we end up not allocating a new metadata chunk in that case. However if
 *    the filesystem was mounted in degraded mode, none of the existing block
 *    groups might be suitable for extent allocation due to their incompatible
 *    profile (for e.g. mounting a 2 devices filesystem, where all block groups
 *    use a RAID1 profile, in degraded mode using a single device). In this case
 *    when the task attempts to COW some extent buffer of the extent btree for
 *    example, it will trigger allocation of a new metadata block group with a
 *    suitable profile (SINGLE profile in the example of the degraded mount of
 *    the RAID1 filesystem);
 *
 * 3) The task has reserved enough transaction units / metadata space, but when
 *    it attempts to COW an extent buffer from the extent or device btree for
 *    example, it does not find any free extent in any metadata block group,
 *    therefore forced to try to allocate a new metadata block group.
 *    This is because some other task allocated all available extents in the
 *    meanwhile - this typically happens with tasks that don't reserve space
 *    properly, either intentionally or as a bug. One example where this is
 *    done intentionally is fsync, as it does not reserve any transaction units
 *    and ends up allocating a variable number of metadata extents for log
 *    tree extent buffers;
 *
 * 4) The task has reserved enough transaction units / metadata space, but right
 *    before it tries to allocate the last extent buffer it needs, a discard
 *    operation comes in and, temporarily, removes the last free space entry from
 *    the only metadata block group that had free space (discard starts by
 *    removing a free space entry from a block group, then does the discard
 *    operation and, once it's done, it adds back the free space entry to the
 *    block group).
 *
 * We also need this 2 phases setup when adding a device to a filesystem with
 * a seed device - we must create new metadata and system chunks without adding
 * any of the block group items to the chunk, extent and device btrees. If we
 * did not do it this way, we would get ENOSPC when attempting to update those
 * btrees, since all the chunks from the seed device are read-only.
 *
 * Phase 1 does the updates and insertions to the chunk btree because if we had
 * it done in phase 2 and have a thundering herd of tasks allocating chunks in
 * parallel, we risk having too many system chunks allocated by many tasks if
 * many tasks reach phase 1 without the previous ones completing phase 2. In the
 * extreme case this leads to exhaustion of the system chunk array in the
 * superblock. This is easier to trigger if using a btree node/leaf size of 64K
 * and with RAID filesystems (so we have more device items in the chunk btree).
 * This has happened before and commit eafa4fd0ad0607 ("btrfs: fix exhaustion of
 * the system chunk array due to concurrent allocations") provides more details.
 *
 * Allocation of system chunks does not happen through this function. A task that
 * needs to update the chunk btree (the only btree that uses system chunks), must
 * preallocate chunk space by calling either check_system_chunk() or
 * btrfs_reserve_chunk_metadata() - the former is used when allocating a data or
 * metadata chunk or when removing a chunk, while the latter is used before doing
 * a modification to the chunk btree - use cases for the latter are adding,
 * removing and resizing a device as well as relocation of a system chunk.
 * See the comment below for more details.
 *
 * The reservation of system space, done through check_system_chunk(), as well
 * as all the updates and insertions into the chunk btree must be done while
 * holding fs_info->chunk_mutex. This is important to guarantee that while COWing
 * an extent buffer from the chunks btree we never trigger allocation of a new
 * system chunk, which would result in a deadlock (trying to lock twice an
 * extent buffer of the chunk btree, first time before triggering the chunk
 * allocation and the second time during chunk allocation while attempting to
 * update the chunks btree). The system chunk array is also updated while holding
 * that mutex. The same logic applies to removing chunks - we must reserve system
 * space, update the chunk btree and the system chunk array in the superblock
 * while holding fs_info->chunk_mutex.
 *
 * This function, btrfs_chunk_alloc(), belongs to phase 1.
 *
 * If @force is CHUNK_ALLOC_FORCE:
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 * If @force is NOT CHUNK_ALLOC_FORCE:
 *    - return 0 if it doesn't need to allocate a new chunk,
 *    - return 1 if it successfully allocates a chunk,
 *    - return errors including -ENOSPC otherwise.
 */
int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
		      enum btrfs_chunk_alloc_enum force)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_space_info *space_info;
	struct btrfs_block_group *ret_bg;
	bool wait_for_alloc = false;
	bool should_alloc = false;
	bool from_extent_allocation = false;
	int ret = 0;

	if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) {
		from_extent_allocation = true;
		force = CHUNK_ALLOC_FORCE;
	}

	/* Don't re-enter if we're already allocating a chunk */
	if (trans->allocating_chunk)
		return -ENOSPC;
	/*
	 * Allocation of system chunks can not happen through this path, as we
	 * could end up in a deadlock if we are allocating a data or metadata
	 * chunk and there is another task modifying the chunk btree.
	 *
	 * This is because while we are holding the chunk mutex, we will attempt
	 * to add the new chunk item to the chunk btree or update an existing
	 * device item in the chunk btree, while the other task that is modifying
	 * the chunk btree is attempting to COW an extent buffer while holding a
	 * lock on it and on its parent - if the COW operation triggers a system
	 * chunk allocation, then we can deadlock because we are holding the
	 * chunk mutex and we may need to access that extent buffer or its parent
	 * in order to add the chunk item or update a device item.
	 *
	 * Tasks that want to modify the chunk tree should reserve system space
	 * before updating the chunk btree, by calling either
	 * btrfs_reserve_chunk_metadata() or check_system_chunk().
	 * It's possible that after a task reserves the space, it still ends up
	 * here - this happens in the cases described above at do_chunk_alloc().
	 * The task will have to either retry or fail.
	 */
	if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		return -ENOSPC;

	space_info = btrfs_find_space_info(fs_info, flags);
	ASSERT(space_info);

	do {
		spin_lock(&space_info->lock);
		if (force < space_info->force_alloc)
			force = space_info->force_alloc;
		should_alloc = should_alloc_chunk(fs_info, space_info, force);
		if (space_info->full) {
			/* No more free physical space */
			if (should_alloc)
				ret = -ENOSPC;
			else
				ret = 0;
			spin_unlock(&space_info->lock);
			return ret;
		} else if (!should_alloc) {
			spin_unlock(&space_info->lock);
			return 0;
		} else if (space_info->chunk_alloc) {
			/*
			 * Someone is already allocating, so we need to block
			 * until this someone is finished and then loop to
			 * recheck if we should continue with our allocation
			 * attempt.
			 */
			wait_for_alloc = true;
			force = CHUNK_ALLOC_NO_FORCE;
			spin_unlock(&space_info->lock);
			mutex_lock(&fs_info->chunk_mutex);
			mutex_unlock(&fs_info->chunk_mutex);
		} else {
			/* Proceed with allocation */
			space_info->chunk_alloc = 1;
			wait_for_alloc = false;
			spin_unlock(&space_info->lock);
		}

		cond_resched();
	} while (wait_for_alloc);

	mutex_lock(&fs_info->chunk_mutex);
	trans->allocating_chunk = true;

	/*
	 * If we have mixed data/metadata chunks we want to make sure we keep
	 * allocating mixed chunks instead of individual chunks.
	 */
	if (btrfs_mixed_space_info(space_info))
		flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);

	/*
	 * if we're doing a data chunk, go ahead and make sure that
	 * we keep a reasonable number of metadata chunks allocated in the
	 * FS as well.
	 */
	if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
		fs_info->data_chunk_allocations++;
		if (!(fs_info->data_chunk_allocations %
		      fs_info->metadata_ratio))
			force_metadata_allocation(fs_info);
	}

	ret_bg = do_chunk_alloc(trans, flags);
	trans->allocating_chunk = false;

	if (IS_ERR(ret_bg)) {
		ret = PTR_ERR(ret_bg);
	} else if (from_extent_allocation) {
		/*
		 * New block group is likely to be used soon. Try to activate
		 * it now. Failure is OK for now.
		 */
		btrfs_zone_activate(ret_bg);
	}

	if (!ret)
		btrfs_put_block_group(ret_bg);

	spin_lock(&space_info->lock);
	if (ret < 0) {
		if (ret == -ENOSPC)
			space_info->full = 1;
		else
			goto out;
	} else {
		ret = 1;
		space_info->max_extent_size = 0;
	}
	space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
out:
	space_info->chunk_alloc = 0;
	spin_unlock(&space_info->lock);
	mutex_unlock(&fs_info->chunk_mutex);

	return ret;
}
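
/*
 * Return the number of devices a chunk of the given @type spans: the profile's
 * devs_max, or all writable devices when devs_max is 0 (profiles such as
 * RAID0 that stripe across as many devices as are available).
 */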
static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
{
	u64 num_dev;

	num_dev = btrfs_raid_array[btrfs_bg_flags_to_raid_index(type)].devs_max;
	if (!num_dev)
		num_dev = fs_info->fs_devices->rw_devices;

	return num_dev;
}
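
/*
 * Reserve @bytes in the chunk block reserve for upcoming chunk btree
 * modifications, allocating a new system chunk first if the system space_info
 * does not have enough free space. The caller must hold fs_info->chunk_mutex.
 */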
static void reserve_chunk_space(struct btrfs_trans_handle *trans,
				u64 bytes,
				u64 type)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_space_info *info;
	u64 left;
	int ret = 0;

	/*
	 * Needed because we can end up allocating a system chunk and for an
	 * atomic and race free space reservation in the chunk block reserve.
	 */
	lockdep_assert_held(&fs_info->chunk_mutex);

	info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
	spin_lock(&info->lock);
	left = info->total_bytes - btrfs_space_info_used(info, true);
	spin_unlock(&info->lock);

	if (left < bytes && btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
		btrfs_info(fs_info, "left=%llu, need=%llu, flags=%llu",
			   left, bytes, type);
		btrfs_dump_space_info(fs_info, info, 0, 0);
	}

	if (left < bytes) {
		u64 flags = btrfs_system_alloc_profile(fs_info);
		struct btrfs_block_group *bg;

		/*
		 * Ignore failure to create system chunk. We might end up not
		 * needing it, as we might not need to COW all nodes/leafs from
		 * the paths we visit in the chunk tree (they were already COWed
		 * or created in the current transaction for example).
		 */
		bg = btrfs_create_chunk(trans, flags);
		if (IS_ERR(bg)) {
			ret = PTR_ERR(bg);
		} else {
			/*
			 * We have a new chunk. We also need to activate it for
			 * zoned filesystem.
			 */
			ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
			if (ret < 0)
				return;

			/*
			 * If we fail to add the chunk item here, we end up
			 * trying again at phase 2 of chunk allocation, at
			 * btrfs_create_pending_block_groups(). So ignore
			 * any error here. An ENOSPC here could happen, due to
			 * the cases described at do_chunk_alloc() - the system
			 * block group we just created was just turned into RO
			 * mode by a scrub for example, or a running discard
			 * temporarily removed its free space entries, etc.
			 */
			btrfs_chunk_alloc_add_chunk_item(trans, bg);
		}
	}

	if (!ret) {
		ret = btrfs_block_rsv_add(fs_info,
					  &fs_info->chunk_block_rsv,
					  bytes, BTRFS_RESERVE_NO_FLUSH);
		if (!ret)
			trans->chunk_bytes_reserved += bytes;
	}
}

/*
 * Reserve space in the system space for allocating or removing a chunk.
 * The caller must be holding fs_info->chunk_mutex.
 */
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	const u64 num_devs = get_profile_num_devs(fs_info, type);
	u64 bytes;

	/* num_devs device items to update and 1 chunk item to add or remove. */
	bytes = btrfs_calc_metadata_size(fs_info, num_devs) +
		btrfs_calc_insert_metadata_size(fs_info, 1);

	reserve_chunk_space(trans, bytes, type);
}

/*
 * Reserve space in the system space, if needed, for doing a modification to the
 * chunk btree.
 *
 * @trans:             A transaction handle.
 * @is_item_insertion: Indicate if the modification is for inserting a new item
 *                     in the chunk btree or if it's for the deletion or update
 *                     of an existing item.
 *
 * This is used in a context where we need to update the chunk btree outside
 * block group allocation and removal, to avoid a deadlock with a concurrent
 * task that is allocating a metadata or data block group and therefore needs to
 * update the chunk btree while holding the chunk mutex. After the update to the
 * chunk btree is done, btrfs_trans_release_chunk_metadata() should be called.
 *
 */
void btrfs_reserve_chunk_metadata(struct btrfs_trans_handle *trans,
				  bool is_item_insertion)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	u64 bytes;

	if (is_item_insertion)
		bytes = btrfs_calc_insert_metadata_size(fs_info, 1);
	else
		bytes = btrfs_calc_metadata_size(fs_info, 1);

	mutex_lock(&fs_info->chunk_mutex);
	reserve_chunk_space(trans, bytes, BTRFS_BLOCK_GROUP_SYSTEM);
	mutex_unlock(&fs_info->chunk_mutex);
}
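
/*
 * Walk all block groups, wait for any caching still in progress and drop the
 * inode reference (BLOCK_GROUP_FLAG_IREF) that a block group may hold on its
 * free space cache inode.
 */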
void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
{
	struct btrfs_block_group *block_group;

	block_group = btrfs_lookup_first_block_group(info, 0);
	while (block_group) {
		btrfs_wait_block_group_cache_done(block_group);
		spin_lock(&block_group->lock);
		if (test_and_clear_bit(BLOCK_GROUP_FLAG_IREF,
				       &block_group->runtime_flags)) {
			struct inode *inode = block_group->inode;

			block_group->inode = NULL;
			spin_unlock(&block_group->lock);

			ASSERT(block_group->io_ctl.inode == NULL);
			iput(inode);
		} else {
			spin_unlock(&block_group->lock);
		}
		block_group = btrfs_next_block_group(block_group);
	}
}

/*
 * Must be called only after stopping all workers, since we could have block
 * group caching kthreads running, and therefore they could race with us if we
 * freed the block groups before stopping them.
 */
int btrfs_free_block_groups(struct btrfs_fs_info *info)
{
	struct btrfs_block_group *block_group;
	struct btrfs_space_info *space_info;
	struct btrfs_caching_control *caching_ctl;
	struct rb_node *n;

	write_lock(&info->block_group_cache_lock);
	while (!list_empty(&info->caching_block_groups)) {
		caching_ctl = list_entry(info->caching_block_groups.next,
					 struct btrfs_caching_control, list);
		list_del(&caching_ctl->list);
		btrfs_put_caching_control(caching_ctl);
	}
	write_unlock(&info->block_group_cache_lock);

	spin_lock(&info->unused_bgs_lock);
	while (!list_empty(&info->unused_bgs)) {
		block_group = list_first_entry(&info->unused_bgs,
					       struct btrfs_block_group,
					       bg_list);
		list_del_init(&block_group->bg_list);
		btrfs_put_block_group(block_group);
	}

	while (!list_empty(&info->reclaim_bgs)) {
		block_group = list_first_entry(&info->reclaim_bgs,
					       struct btrfs_block_group,
					       bg_list);
		list_del_init(&block_group->bg_list);
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&info->unused_bgs_lock);

	spin_lock(&info->zone_active_bgs_lock);
	while (!list_empty(&info->zone_active_bgs)) {
		block_group = list_first_entry(&info->zone_active_bgs,
					       struct btrfs_block_group,
					       active_bg_list);
		list_del_init(&block_group->active_bg_list);
		btrfs_put_block_group(block_group);
	}
	spin_unlock(&info->zone_active_bgs_lock);

	write_lock(&info->block_group_cache_lock);
	while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
		block_group = rb_entry(n, struct btrfs_block_group,
				       cache_node);
		rb_erase_cached(&block_group->cache_node,
				&info->block_group_cache_tree);
		RB_CLEAR_NODE(&block_group->cache_node);
		write_unlock(&info->block_group_cache_lock);

		down_write(&block_group->space_info->groups_sem);
		list_del(&block_group->list);
		up_write(&block_group->space_info->groups_sem);

		/*
		 * We haven't cached this block group, which means we could
		 * possibly have excluded extents on this block group.
		 */
		if (block_group->cached == BTRFS_CACHE_NO ||
		    block_group->cached == BTRFS_CACHE_ERROR)
			btrfs_free_excluded_extents(block_group);

		btrfs_remove_free_space_cache(block_group);
		ASSERT(block_group->cached != BTRFS_CACHE_STARTED);
		ASSERT(list_empty(&block_group->dirty_list));
		ASSERT(list_empty(&block_group->io_list));
		ASSERT(list_empty(&block_group->bg_list));
		ASSERT(refcount_read(&block_group->refs) == 1);
		ASSERT(block_group->swap_extents == 0);
		btrfs_put_block_group(block_group);

		write_lock(&info->block_group_cache_lock);
	}
	write_unlock(&info->block_group_cache_lock);

	btrfs_release_global_block_rsv(info);

	while (!list_empty(&info->space_info)) {
		space_info = list_entry(info->space_info.next,
					struct btrfs_space_info,
					list);

		/*
		 * Do not hide this behind enospc_debug, this is actually
		 * important and indicates a real bug if this happens.
		 */
		if (WARN_ON(space_info->bytes_pinned > 0 ||
			    space_info->bytes_may_use > 0))
			btrfs_dump_space_info(info, space_info, 0, 0);

		/*
		 * If there was a failure to cleanup a log tree, very likely due
		 * to an IO failure on a writeback attempt of one or more of its
		 * extent buffers, we could not do proper (and cheap) unaccounting
		 * of their reserved space, so don't warn on bytes_reserved > 0 in
		 * that case.
		 */
		if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) ||
		    !BTRFS_FS_LOG_CLEANUP_ERROR(info)) {
			if (WARN_ON(space_info->bytes_reserved > 0))
				btrfs_dump_space_info(info, space_info, 0, 0);
		}

		WARN_ON(space_info->reclaim_size > 0);
		list_del(&space_info->list);
		btrfs_sysfs_remove_space_info(space_info);
	}
	return 0;
}
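
/*
 * btrfs_freeze_block_group() pairs with btrfs_unfreeze_block_group(). While
 * the freeze count is elevated, a block group that gets removed keeps its
 * extent map and free space cache alive so that concurrent users (for example
 * tasks trimming it) can still rely on them; the last unfreeze after removal
 * performs that cleanup.
 */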
void btrfs_freeze_block_group(struct btrfs_block_group *cache)
{
	atomic_inc(&cache->frozen);
}

void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
{
	struct btrfs_fs_info *fs_info = block_group->fs_info;
	struct extent_map_tree *em_tree;
	struct extent_map *em;
	bool cleanup;

	spin_lock(&block_group->lock);
	cleanup = (atomic_dec_and_test(&block_group->frozen) &&
		   test_bit(BLOCK_GROUP_FLAG_REMOVED, &block_group->runtime_flags));
	spin_unlock(&block_group->lock);

	if (cleanup) {
		em_tree = &fs_info->mapping_tree;
		write_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, block_group->start,
					   1);
		BUG_ON(!em); /* logic error, can't happen */
		remove_extent_mapping(em_tree, em);
		write_unlock(&em_tree->lock);

		/* once for us and once for the tree */
		free_extent_map(em);
		free_extent_map(em);

		/*
		 * We may have left one free space entry and other possible
		 * tasks trimming this block group have left 1 entry each one.
		 * Free them if any.
		 */
		btrfs_remove_free_space_cache(block_group);
	}
}
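
/*
 * Account one more active swap file extent in this block group. Returns false
 * if the block group is read-only, in which case the swap file must not use
 * it; together with btrfs_dec_block_group_swap_extents() this keeps a block
 * group that backs an active swap file from being made read-only and
 * relocated.
 */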
bool btrfs_inc_block_group_swap_extents(struct btrfs_block_group *bg)
{
	bool ret = true;

	spin_lock(&bg->lock);
	if (bg->ro)
		ret = false;
	else
		bg->swap_extents++;
	spin_unlock(&bg->lock);

	return ret;
}

void btrfs_dec_block_group_swap_extents(struct btrfs_block_group *bg, int amount)
{
	spin_lock(&bg->lock);
	ASSERT(!bg->ro);
	ASSERT(bg->swap_extents >= amount);
	bg->swap_extents -= amount;
	spin_unlock(&bg->lock);
}