// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *
 * Swap reorganised 29.12.95, Stephen Tweedie.
 * kswapd added: 7.1.96  sct
 * Removed kswapd_ctl limits, and swap out as many pages as needed
 * to bring the system back to freepages.high: 2.4.97, Rik van Riel.
 * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
 * Multiqueue VM started 5.8.00, Rik van Riel.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/module.h>
#include <linux/gfp.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/vmpressure.h>
#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>	/* for buffer_heads_over_limit */
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/rmap.h>
#include <linux/topology.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/compaction.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/memcontrol.h>
#include <linux/migrate.h>
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/memory-tiers.h>
#include <linux/oom.h>
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
#include <linux/psi.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
#include <linux/rculist_nulls.h>
#include <linux/random.h>

#include <asm/tlbflush.h>
#include <asm/div64.h>

#include <linux/swapops.h>
#include <linux/balloon_compaction.h>
#include <linux/sched/sysctl.h>

#include "internal.h"
#include "swap.h"

#define CREATE_TRACE_POINTS
#include <trace/events/vmscan.h>
#undef CREATE_TRACE_POINTS

#include <trace/hooks/vmscan.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(mm_vmscan_direct_reclaim_begin);
EXPORT_TRACEPOINT_SYMBOL_GPL(mm_vmscan_direct_reclaim_end);
EXPORT_TRACEPOINT_SYMBOL_GPL(mm_vmscan_kswapd_wake);

struct scan_control {
	/* How many pages shrink_list() should reclaim */
	unsigned long nr_to_reclaim;

	/*
	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
	 * are scanned.
	 */
	nodemask_t *nodemask;

	/*
	 * The memory cgroup that hit its limit and as a result is the
	 * primary target of this reclaim invocation.
	 */
	struct mem_cgroup *target_mem_cgroup;

	/*
	 * Scan pressure balancing between anon and file LRUs
	 */
	unsigned long anon_cost;
	unsigned long file_cost;

	/* Can active folios be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
	unsigned int may_deactivate:2;
	unsigned int force_deactivate:1;
	unsigned int skipped_deactivate:1;

	/* Writepage batching in laptop mode; RECLAIM_WRITE */
	unsigned int may_writepage:1;

	/* Can mapped folios be reclaimed? */
	unsigned int may_unmap:1;

	/* Can folios be swapped as part of reclaim? */
	unsigned int may_swap:1;

	/* Proactive reclaim invoked by userspace through memory.reclaim */
	unsigned int proactive:1;

	/*
	 * Cgroup memory below memory.low is protected as long as we
	 * don't threaten to OOM. If any cgroup is reclaimed at
	 * reduced force or passed over entirely due to its memory.low
	 * setting (memcg_low_skipped), and nothing is reclaimed as a
	 * result, then go back for one more cycle that reclaims the protected
	 * memory (memcg_low_reclaim) to avert OOM.
	 */
	unsigned int memcg_low_reclaim:1;
	unsigned int memcg_low_skipped:1;

	unsigned int hibernation_mode:1;

	/* One of the zones is ready for compaction */
	unsigned int compaction_ready:1;

	/* There is easily reclaimable cold cache in the current node */
	unsigned int cache_trim_mode:1;

	/* The file folios on the current node are dangerously low */
	unsigned int file_is_tiny:1;

	/* Always discard instead of demoting to lower tier memory */
	unsigned int no_demotion:1;

	/* Allocation order */
	s8 order;

	/* Scan (total_size >> priority) pages at once */
	s8 priority;

	/* The highest zone to isolate folios for reclaim from */
	s8 reclaim_idx;

	/* This context's GFP mask */
	gfp_t gfp_mask;

	/* Incremented by the number of inactive pages that were scanned */
	unsigned long nr_scanned;

	/* Number of pages freed so far during a call to shrink_zones() */
	unsigned long nr_reclaimed;

	struct {
		unsigned int dirty;
		unsigned int unqueued_dirty;
		unsigned int congested;
		unsigned int writeback;
		unsigned int immediate;
		unsigned int file_taken;
		unsigned int taken;
	} nr;

	/* for recording the reclaimed slab by now */
	struct reclaim_state reclaim_state;
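
	/*
	 * Android-specific (not in the upstream file): ANDROID_VENDOR_DATA()
	 * appears to be the Android Common Kernel's reserved vendor-data slot,
	 * letting GKI vendor hooks attach their own per-reclaim state without
	 * changing the struct layout.
	 */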
	ANDROID_VENDOR_DATA(1);
};

#ifdef ARCH_HAS_PREFETCHW
#define prefetchw_prev_lru_folio(_folio, _base, _field)			\
	do {								\
		if ((_folio)->lru.prev != _base) {			\
			struct folio *prev;				\
									\
			prev = lru_to_folio(&(_folio->lru));		\
			prefetchw(&prev->_field);			\
		}							\
	} while (0)
#else
#define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0)
#endif

/*
 * From 0 .. 200. Higher means more swappy.
 */
int vm_swappiness = 60;
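
/*
 * Interpretation (paraphrasing the sysctl documentation): vm_swappiness
 * expresses the relative IO cost of swapping anon pages versus reclaiming
 * file pages. A value of 100 treats the two as roughly equal; values above
 * 100 (up to 200) favour swapping and mainly make sense when swap is very
 * cheap, e.g. an in-memory device such as zram.
 */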

static void set_task_reclaim_state(struct task_struct *task,
				   struct reclaim_state *rs)
{
	/* Check for an overwrite */
	WARN_ON_ONCE(rs && task->reclaim_state);

	/* Check for the nulling of an already-nulled member */
	WARN_ON_ONCE(!rs && !task->reclaim_state);

	task->reclaim_state = rs;
}

LIST_HEAD(shrinker_list);
DECLARE_RWSEM(shrinker_rwsem);

#ifdef CONFIG_MEMCG
static int shrinker_nr_max;

/* The shrinker_info is expanded in a batch of BITS_PER_LONG */
static inline int shrinker_map_size(int nr_items)
{
	return (DIV_ROUND_UP(nr_items, BITS_PER_LONG) * sizeof(unsigned long));
}

static inline int shrinker_defer_size(int nr_items)
{
	return (round_up(nr_items, BITS_PER_LONG) * sizeof(atomic_long_t));
}
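
/*
 * Worked example (assuming 64-bit longs): for nr_items == 65 registered
 * memcg-aware shrinkers, shrinker_map_size() needs DIV_ROUND_UP(65, 64) == 2
 * longs, i.e. 16 bytes of bitmap, while shrinker_defer_size() rounds up to
 * 128 counters, i.e. 128 * sizeof(atomic_long_t) == 1024 bytes, per node.
 */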

static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg,
						     int nid)
{
	return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info,
					 lockdep_is_held(&shrinker_rwsem));
}

static int expand_one_shrinker_info(struct mem_cgroup *memcg,
				    int map_size, int defer_size,
				    int old_map_size, int old_defer_size)
{
	struct shrinker_info *new, *old;
	struct mem_cgroup_per_node *pn;
	int nid;
	int size = map_size + defer_size;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		old = shrinker_info_protected(memcg, nid);
		/* Not yet online memcg */
		if (!old)
			return 0;

		new = kvmalloc_node(sizeof(*new) + size, GFP_KERNEL, nid);
		if (!new)
			return -ENOMEM;

		new->nr_deferred = (atomic_long_t *)(new + 1);
		new->map = (void *)new->nr_deferred + defer_size;

		/* map: set all old bits, clear all new bits */
		memset(new->map, (int)0xff, old_map_size);
		memset((void *)new->map + old_map_size, 0, map_size - old_map_size);
		/* nr_deferred: copy old values, clear all new values */
		memcpy(new->nr_deferred, old->nr_deferred, old_defer_size);
		memset((void *)new->nr_deferred + old_defer_size, 0,
		       defer_size - old_defer_size);

		rcu_assign_pointer(pn->shrinker_info, new);
		kvfree_rcu(old, rcu);
	}

	return 0;
}

void free_shrinker_info(struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_node *pn;
	struct shrinker_info *info;
	int nid;

	for_each_node(nid) {
		pn = memcg->nodeinfo[nid];
		info = rcu_dereference_protected(pn->shrinker_info, true);
		kvfree(info);
		rcu_assign_pointer(pn->shrinker_info, NULL);
	}
}

int alloc_shrinker_info(struct mem_cgroup *memcg)
{
	struct shrinker_info *info;
	int nid, size, ret = 0;
	int map_size, defer_size = 0;

	down_write(&shrinker_rwsem);
	map_size = shrinker_map_size(shrinker_nr_max);
	defer_size = shrinker_defer_size(shrinker_nr_max);
	size = map_size + defer_size;
	for_each_node(nid) {
		info = kvzalloc_node(sizeof(*info) + size, GFP_KERNEL, nid);
		if (!info) {
			free_shrinker_info(memcg);
			ret = -ENOMEM;
			break;
		}
		info->nr_deferred = (atomic_long_t *)(info + 1);
		info->map = (void *)info->nr_deferred + defer_size;
		rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info);
	}
	up_write(&shrinker_rwsem);

	return ret;
}

static inline bool need_expand(int nr_max)
{
	return round_up(nr_max, BITS_PER_LONG) >
	       round_up(shrinker_nr_max, BITS_PER_LONG);
}

static int expand_shrinker_info(int new_id)
{
	int ret = 0;
	int new_nr_max = new_id + 1;
	int map_size, defer_size = 0;
	int old_map_size, old_defer_size = 0;
	struct mem_cgroup *memcg;

	if (!need_expand(new_nr_max))
		goto out;

	if (!root_mem_cgroup)
		goto out;

	lockdep_assert_held(&shrinker_rwsem);

	map_size = shrinker_map_size(new_nr_max);
	defer_size = shrinker_defer_size(new_nr_max);
	old_map_size = shrinker_map_size(shrinker_nr_max);
	old_defer_size = shrinker_defer_size(shrinker_nr_max);

	memcg = mem_cgroup_iter(NULL, NULL, NULL);
	do {
		ret = expand_one_shrinker_info(memcg, map_size, defer_size,
					       old_map_size, old_defer_size);
		if (ret) {
			mem_cgroup_iter_break(NULL, memcg);
			goto out;
		}
	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
out:
	if (!ret)
		shrinker_nr_max = new_nr_max;

	return ret;
}

void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
{
	if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
		struct shrinker_info *info;

		rcu_read_lock();
		info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info);
		/* Pairs with smp mb in shrink_slab() */
		smp_mb__before_atomic();
		set_bit(shrinker_id, info->map);
		rcu_read_unlock();
	}
}

static DEFINE_IDR(shrinker_idr);

static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
	int id, ret = -ENOMEM;

	if (mem_cgroup_disabled())
		return -ENOSYS;

	down_write(&shrinker_rwsem);
	/* This may call shrinker, so it must use down_read_trylock() */
	id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL);
	if (id < 0)
		goto unlock;

	if (id >= shrinker_nr_max) {
		if (expand_shrinker_info(id)) {
			idr_remove(&shrinker_idr, id);
			goto unlock;
		}
	}
	shrinker->id = id;
	ret = 0;
unlock:
	up_write(&shrinker_rwsem);
	return ret;
}

static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
	int id = shrinker->id;

	BUG_ON(id < 0);

	lockdep_assert_held(&shrinker_rwsem);

	idr_remove(&shrinker_idr, id);
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	struct shrinker_info *info;

	info = shrinker_info_protected(memcg, nid);
	return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0);
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	struct shrinker_info *info;

	info = shrinker_info_protected(memcg, nid);
	return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]);
}
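
/*
 * Called when a memcg is being taken offline: fold the child's outstanding
 * deferred shrinker work into its parent so the backlog is not lost together
 * with the dying cgroup.
 */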
void reparent_shrinker_deferred(struct mem_cgroup *memcg)
{
	int i, nid;
	long nr;
	struct mem_cgroup *parent;
	struct shrinker_info *child_info, *parent_info;

	parent = parent_mem_cgroup(memcg);
	if (!parent)
		parent = root_mem_cgroup;

	/* Prevent from concurrent shrinker_info expand */
	down_read(&shrinker_rwsem);
	for_each_node(nid) {
		child_info = shrinker_info_protected(memcg, nid);
		parent_info = shrinker_info_protected(parent, nid);
		for (i = 0; i < shrinker_nr_max; i++) {
			nr = atomic_long_read(&child_info->nr_deferred[i]);
			atomic_long_add(nr, &parent_info->nr_deferred[i]);
		}
	}
	up_read(&shrinker_rwsem);
}

static bool cgroup_reclaim(struct scan_control *sc)
{
	return sc->target_mem_cgroup;
}

static bool global_reclaim(struct scan_control *sc)
{
	return !sc->target_mem_cgroup || mem_cgroup_is_root(sc->target_mem_cgroup);
}

/**
 * writeback_throttling_sane - is the usual dirty throttling mechanism available?
 * @sc: scan_control in question
 *
 * The normal page dirty throttling mechanism in balance_dirty_pages() is
 * completely broken with the legacy memcg and direct stalling in
 * shrink_folio_list() is used for throttling instead, which lacks all the
 * niceties such as fairness, adaptive pausing, bandwidth proportional
 * allocation and configurability.
 *
 * This function tests whether the vmscan currently in progress can assume
 * that the normal dirty throttling mechanism is operational.
 */
static bool writeback_throttling_sane(struct scan_control *sc)
{
	if (!cgroup_reclaim(sc))
		return true;
#ifdef CONFIG_CGROUP_WRITEBACK
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return true;
#endif
	return false;
}
#else
static int prealloc_memcg_shrinker(struct shrinker *shrinker)
{
	return -ENOSYS;
}

static void unregister_memcg_shrinker(struct shrinker *shrinker)
{
}

static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker,
				   struct mem_cgroup *memcg)
{
	return 0;
}

static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker,
				  struct mem_cgroup *memcg)
{
	return 0;
}

static bool cgroup_reclaim(struct scan_control *sc)
{
	return false;
}

static bool global_reclaim(struct scan_control *sc)
{
	return true;
}

static bool writeback_throttling_sane(struct scan_control *sc)
{
	return true;
}
#endif
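
/*
 * The two helpers below route the shrinker's "deferred work" bookkeeping:
 * scan work that an earlier caller could not finish (for example because
 * scan_objects() bailed out with SHRINK_STOP) is parked in a nr_deferred
 * counter and handed to the next caller, kept per node and, for memcg-aware
 * shrinkers, per memcg.
 */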
static long xchg_nr_deferred(struct shrinker *shrinker,
			     struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return xchg_nr_deferred_memcg(nid, shrinker,
					      sc->memcg);

	return atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
}

static long add_nr_deferred(long nr, struct shrinker *shrinker,
			    struct shrink_control *sc)
{
	int nid = sc->nid;

	if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
		nid = 0;

	if (sc->memcg &&
	    (shrinker->flags & SHRINKER_MEMCG_AWARE))
		return add_nr_deferred_memcg(nr, nid, shrinker,
					     sc->memcg);

	return atomic_long_add_return(nr, &shrinker->nr_deferred[nid]);
}

static bool can_demote(int nid, struct scan_control *sc)
{
	if (!numa_demotion_enabled)
		return false;
	if (sc && sc->no_demotion)
		return false;
	if (next_demotion_node(nid) == NUMA_NO_NODE)
		return false;

	return true;
}

static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg,
					  int nid,
					  struct scan_control *sc)
{
	if (memcg == NULL) {
		/*
		 * For non-memcg reclaim, is there
		 * space in any swap device?
		 */
		if (get_nr_swap_pages() > 0)
			return true;
	} else {
		/* Is the memcg below its swap limit? */
		if (mem_cgroup_get_nr_swap_pages(memcg) > 0)
			return true;
	}

	/*
	 * The page can not be swapped.
	 *
	 * Can it be reclaimed from this node via demotion?
	 */
	return can_demote(nid, sc);
}

/*
 * This misses isolated folios which are not accounted for to save counters.
 * As the data only determines if reclaim or compaction continues, it is
 * not expected that isolated folios will be a dominating factor.
 */
unsigned long zone_reclaimable_pages(struct zone *zone)
{
	unsigned long nr;

	nr = zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_FILE) +
		zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_FILE);
	if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
		nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
			zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);

	return nr;
}

/**
 * lruvec_lru_size -  Returns the number of pages on the given LRU list.
 * @lruvec: lru vector
 * @lru: lru to use
 * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list)
 */
static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru,
				     int zone_idx)
{
	unsigned long size = 0;
	int zid;

	for (zid = 0; zid <= zone_idx; zid++) {
		struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid];

		if (!managed_zone(zone))
			continue;

		if (!mem_cgroup_disabled())
			size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
		else
			size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru);
	}
	return size;
}

/*
 * Add a shrinker callback to be called from the vm.
 */
static int __prealloc_shrinker(struct shrinker *shrinker)
{
	unsigned int size;
	int err;

	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
		err = prealloc_memcg_shrinker(shrinker);
		if (err != -ENOSYS)
			return err;

		shrinker->flags &= ~SHRINKER_MEMCG_AWARE;
	}

	size = sizeof(*shrinker->nr_deferred);
	if (shrinker->flags & SHRINKER_NUMA_AWARE)
		size *= nr_node_ids;

	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
	if (!shrinker->nr_deferred)
		return -ENOMEM;

	return 0;
}

#ifdef CONFIG_SHRINKER_DEBUG
int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
	va_list ap;
	int err;

	va_start(ap, fmt);
	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
	va_end(ap);
	if (!shrinker->name)
		return -ENOMEM;

	err = __prealloc_shrinker(shrinker);
	if (err) {
		kfree_const(shrinker->name);
		shrinker->name = NULL;
	}

	return err;
}
#else
int prealloc_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
	return __prealloc_shrinker(shrinker);
}
#endif

void free_prealloced_shrinker(struct shrinker *shrinker)
{
#ifdef CONFIG_SHRINKER_DEBUG
	kfree_const(shrinker->name);
	shrinker->name = NULL;
#endif
	if (shrinker->flags & SHRINKER_MEMCG_AWARE) {
		down_write(&shrinker_rwsem);
		unregister_memcg_shrinker(shrinker);
		up_write(&shrinker_rwsem);
		return;
	}

	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}

void register_shrinker_prepared(struct shrinker *shrinker)
{
	down_write(&shrinker_rwsem);
	list_add_tail(&shrinker->list, &shrinker_list);
	shrinker->flags |= SHRINKER_REGISTERED;
	shrinker_debugfs_add(shrinker);
	up_write(&shrinker_rwsem);
}

static int __register_shrinker(struct shrinker *shrinker)
{
	int err = __prealloc_shrinker(shrinker);

	if (err)
		return err;
	register_shrinker_prepared(shrinker);
	return 0;
}

#ifdef CONFIG_SHRINKER_DEBUG
int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
	va_list ap;
	int err;

	va_start(ap, fmt);
	shrinker->name = kvasprintf_const(GFP_KERNEL, fmt, ap);
	va_end(ap);
	if (!shrinker->name)
		return -ENOMEM;

	err = __register_shrinker(shrinker);
	if (err) {
		kfree_const(shrinker->name);
		shrinker->name = NULL;
	}
	return err;
}
#else
int register_shrinker(struct shrinker *shrinker, const char *fmt, ...)
{
	return __register_shrinker(shrinker);
}
#endif
EXPORT_SYMBOL(register_shrinker);

/*
 * Remove one
 */
void unregister_shrinker(struct shrinker *shrinker)
{
	struct dentry *debugfs_entry;

	if (!(shrinker->flags & SHRINKER_REGISTERED))
		return;

	down_write(&shrinker_rwsem);
	list_del(&shrinker->list);
	shrinker->flags &= ~SHRINKER_REGISTERED;
	if (shrinker->flags & SHRINKER_MEMCG_AWARE)
		unregister_memcg_shrinker(shrinker);
	debugfs_entry = shrinker_debugfs_remove(shrinker);
	up_write(&shrinker_rwsem);

	debugfs_remove_recursive(debugfs_entry);

	kfree(shrinker->nr_deferred);
	shrinker->nr_deferred = NULL;
}
EXPORT_SYMBOL(unregister_shrinker);

/**
 * synchronize_shrinkers - Wait for all running shrinkers to complete.
 *
 * This is equivalent to calling unregister_shrinker() and register_shrinker(),
 * but atomically and with less overhead. This is useful to guarantee that all
 * shrinker invocations have seen an update, before freeing memory, similar to
 * rcu.
 */
void synchronize_shrinkers(void)
{
	down_write(&shrinker_rwsem);
	up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(synchronize_shrinkers);
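
/*
 * Typical use of the registration API above (an illustrative sketch, not code
 * from this file): a subsystem embeds a struct shrinker, supplies
 * count_objects() and scan_objects() callbacks and registers it. The names
 * "foo_count", "foo_scan", "foo_nr_cached", "foo_cache_trim" and
 * "foo_shrinker" are made up for the example.
 *
 *	static unsigned long foo_count(struct shrinker *s,
 *				       struct shrink_control *sc)
 *	{
 *		return READ_ONCE(foo_nr_cached);	// 0 means nothing to do
 *	}
 *
 *	static unsigned long foo_scan(struct shrinker *s,
 *				      struct shrink_control *sc)
 *	{
 *		// return the number of objects freed, or SHRINK_STOP
 *		return foo_cache_trim(sc->nr_to_scan);
 *	}
 *
 *	static struct shrinker foo_shrinker = {
 *		.count_objects	= foo_count,
 *		.scan_objects	= foo_scan,
 *		.seeks		= DEFAULT_SEEKS,
 *	};
 *
 *	err = register_shrinker(&foo_shrinker, "foo-cache");
 *	...
 *	unregister_shrinker(&foo_shrinker);
 */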
  659. #define SHRINK_BATCH 128
  660. static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
  661. struct shrinker *shrinker, int priority)
  662. {
  663. unsigned long freed = 0;
  664. unsigned long long delta;
  665. long total_scan;
  666. long freeable;
  667. long nr;
  668. long new_nr;
  669. long batch_size = shrinker->batch ? shrinker->batch
  670. : SHRINK_BATCH;
  671. long scanned = 0, next_deferred;
  672. freeable = shrinker->count_objects(shrinker, shrinkctl);
  673. trace_android_vh_do_shrink_slab(shrinker, &freeable);
  674. if (freeable == 0 || freeable == SHRINK_EMPTY)
  675. return freeable;
  676. /*
  677. * copy the current shrinker scan count into a local variable
  678. * and zero it so that other concurrent shrinker invocations
  679. * don't also do this scanning work.
  680. */
  681. nr = xchg_nr_deferred(shrinker, shrinkctl);
  682. if (shrinker->seeks) {
  683. delta = freeable >> priority;
  684. delta *= 4;
  685. do_div(delta, shrinker->seeks);
  686. } else {
  687. /*
  688. * These objects don't require any IO to create. Trim
  689. * them aggressively under memory pressure to keep
  690. * them from causing refetches in the IO caches.
  691. */
  692. delta = freeable / 2;
  693. }
  694. total_scan = nr >> priority;
  695. total_scan += delta;
  696. total_scan = min(total_scan, (2 * freeable));
  697. trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
  698. freeable, delta, total_scan, priority);
  699. /*
  700. * Normally, we should not scan less than batch_size objects in one
  701. * pass to avoid too frequent shrinker calls, but if the slab has less
  702. * than batch_size objects in total and we are really tight on memory,
  703. * we will try to reclaim all available objects, otherwise we can end
  704. * up failing allocations although there are plenty of reclaimable
  705. * objects spread over several slabs with usage less than the
  706. * batch_size.
  707. *
  708. * We detect the "tight on memory" situations by looking at the total
  709. * number of objects we want to scan (total_scan). If it is greater
  710. * than the total number of objects on slab (freeable), we must be
  711. * scanning at high prio and therefore should try to reclaim as much as
  712. * possible.
  713. */
  714. while (total_scan >= batch_size ||
  715. total_scan >= freeable) {
  716. unsigned long ret;
  717. unsigned long nr_to_scan = min(batch_size, total_scan);
  718. shrinkctl->nr_to_scan = nr_to_scan;
  719. shrinkctl->nr_scanned = nr_to_scan;
  720. ret = shrinker->scan_objects(shrinker, shrinkctl);
  721. if (ret == SHRINK_STOP)
  722. break;
  723. freed += ret;
  724. count_vm_events(SLABS_SCANNED, shrinkctl->nr_scanned);
  725. total_scan -= shrinkctl->nr_scanned;
  726. scanned += shrinkctl->nr_scanned;
  727. cond_resched();
  728. }
/*
 * The deferred work is increased by any new work (delta) that wasn't
 * done, and decreased by the old deferred work that was done now.
 *
 * It is capped at twice the number of freeable items.
 */
  735. next_deferred = max_t(long, (nr + delta - scanned), 0);
  736. next_deferred = min(next_deferred, (2 * freeable));
  737. /*
  738. * move the unused scan count back into the shrinker in a
  739. * manner that handles concurrent updates.
  740. */
  741. new_nr = add_nr_deferred(next_deferred, shrinker, shrinkctl);
  742. trace_mm_shrink_slab_end(shrinker, shrinkctl->nid, freed, nr, new_nr, total_scan);
  743. return freed;
  744. }
  745. #ifdef CONFIG_MEMCG
  746. static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
  747. struct mem_cgroup *memcg, int priority)
  748. {
  749. struct shrinker_info *info;
  750. unsigned long ret, freed = 0;
  751. int i;
  752. if (!mem_cgroup_online(memcg))
  753. return 0;
  754. if (!down_read_trylock(&shrinker_rwsem))
  755. return 0;
  756. info = shrinker_info_protected(memcg, nid);
  757. if (unlikely(!info))
  758. goto unlock;
  759. for_each_set_bit(i, info->map, shrinker_nr_max) {
  760. struct shrink_control sc = {
  761. .gfp_mask = gfp_mask,
  762. .nid = nid,
  763. .memcg = memcg,
  764. };
  765. struct shrinker *shrinker;
  766. shrinker = idr_find(&shrinker_idr, i);
  767. if (unlikely(!shrinker || !(shrinker->flags & SHRINKER_REGISTERED))) {
  768. if (!shrinker)
  769. clear_bit(i, info->map);
  770. continue;
  771. }
  772. /* Call non-slab shrinkers even though kmem is disabled */
  773. if (!memcg_kmem_enabled() &&
  774. !(shrinker->flags & SHRINKER_NONSLAB))
  775. continue;
  776. ret = do_shrink_slab(&sc, shrinker, priority);
  777. if (ret == SHRINK_EMPTY) {
  778. clear_bit(i, info->map);
/*
 * After the shrinker reported that it had no objects to
 * free, but before we cleared the corresponding bit in
 * the memcg shrinker map, a new object might have been
 * added. To make sure we have the bit set in this
 * case, we invoke the shrinker one more time and reset
 * the bit if it reports that it is not empty anymore.
 * The memory barrier here pairs with the barrier in
 * set_shrinker_bit():
 *
 * list_lru_add()          shrink_slab_memcg()
 *   list_add_tail()         clear_bit()
 *   <MB>                    <MB>
 *   set_bit()               do_shrink_slab()
 */
  794. smp_mb__after_atomic();
  795. ret = do_shrink_slab(&sc, shrinker, priority);
  796. if (ret == SHRINK_EMPTY)
  797. ret = 0;
  798. else
  799. set_shrinker_bit(memcg, nid, i);
  800. }
  801. freed += ret;
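/*
 * As in shrink_slab() below: bail out if another task is waiting to
 * take shrinker_rwsem for write (register/unregister of a shrinker)
 * so that it is not stalled for long by our scan. Returning at least
 * 1 presumably keeps this early bailout from looking like a scan
 * that found nothing at all to reclaim.
 */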
  802. if (rwsem_is_contended(&shrinker_rwsem)) {
  803. freed = freed ? : 1;
  804. break;
  805. }
  806. }
  807. unlock:
  808. up_read(&shrinker_rwsem);
  809. return freed;
  810. }
  811. #else /* CONFIG_MEMCG */
  812. static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
  813. struct mem_cgroup *memcg, int priority)
  814. {
  815. return 0;
  816. }
  817. #endif /* CONFIG_MEMCG */
  818. /**
  819. * shrink_slab - shrink slab caches
  820. * @gfp_mask: allocation context
  821. * @nid: node whose slab caches to target
  822. * @memcg: memory cgroup whose slab caches to target
  823. * @priority: the reclaim priority
  824. *
  825. * Call the shrink functions to age shrinkable caches.
  826. *
 * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set;
 * unaware shrinkers will receive a node id of 0 instead.
 *
 * @memcg specifies the memory cgroup to target. Unaware shrinkers
 * are called only if it is the root cgroup.
 *
 * @priority is sc->priority: the number of freeable objects is shifted
 * right by @priority to obtain the scan target for each shrinker.
  835. *
  836. * Returns the number of reclaimed slab objects.
  837. */
  838. static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
  839. struct mem_cgroup *memcg,
  840. int priority)
  841. {
  842. unsigned long ret, freed = 0;
  843. struct shrinker *shrinker;
  844. bool bypass = false;
  845. trace_android_vh_shrink_slab_bypass(gfp_mask, nid, memcg, priority, &bypass);
  846. if (bypass)
  847. return 0;
/*
 * The root memcg might be allocated even though memcg is disabled
 * via the "cgroup_disable=memory" boot parameter. This could make
 * mem_cgroup_is_root() return false and cause us to run only the
 * memcg slab shrink while skipping the global shrink, which may
 * result in a premature OOM.
 */
  855. if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
  856. return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
  857. if (!down_read_trylock(&shrinker_rwsem))
  858. goto out;
  859. list_for_each_entry(shrinker, &shrinker_list, list) {
  860. struct shrink_control sc = {
  861. .gfp_mask = gfp_mask,
  862. .nid = nid,
  863. .memcg = memcg,
  864. };
  865. ret = do_shrink_slab(&sc, shrinker, priority);
  866. if (ret == SHRINK_EMPTY)
  867. ret = 0;
  868. freed += ret;
/*
 * Bail out if someone wants to register a new shrinker, to
 * prevent the registration from being stalled for long periods
 * by parallel ongoing shrinking.
 */
  874. if (rwsem_is_contended(&shrinker_rwsem)) {
  875. freed = freed ? : 1;
  876. break;
  877. }
  878. }
  879. up_read(&shrinker_rwsem);
  880. out:
  881. cond_resched();
  882. return freed;
  883. }
  884. static void drop_slab_node(int nid)
  885. {
  886. unsigned long freed;
  887. int shift = 0;
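/*
 * Keep iterating while each full pass over all memcgs frees a
 * meaningful number of objects. The bar doubles every pass
 * ((freed >> shift++) > 1), so the loop terminates even if small
 * amounts keep trickling back in.
 */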
  888. do {
  889. struct mem_cgroup *memcg = NULL;
  890. if (fatal_signal_pending(current))
  891. return;
  892. freed = 0;
  893. memcg = mem_cgroup_iter(NULL, NULL, NULL);
  894. do {
  895. freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
  896. } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
  897. } while ((freed >> shift++) > 1);
  898. }
  899. void drop_slab(void)
  900. {
  901. int nid;
  902. for_each_online_node(nid)
  903. drop_slab_node(nid);
  904. }
  905. static inline int is_page_cache_freeable(struct folio *folio)
  906. {
  907. /*
  908. * A freeable page cache folio is referenced only by the caller
  909. * that isolated the folio, the page cache and optional filesystem
  910. * private data at folio->private.
  911. */
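/*
 * For example (order-0 folio, illustrative): the isolating caller
 * holds one reference, the page cache holds one, and buffer heads
 * at folio->private hold one more, so a freeable folio satisfies
 * 3 - 1 == 1 + 1; any additional reference means "not freeable".
 */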
  912. return folio_ref_count(folio) - folio_test_private(folio) ==
  913. 1 + folio_nr_pages(folio);
  914. }
  915. /*
  916. * We detected a synchronous write error writing a folio out. Probably
  917. * -ENOSPC. We need to propagate that into the address_space for a subsequent
  918. * fsync(), msync() or close().
  919. *
  920. * The tricky part is that after writepage we cannot touch the mapping: nothing
  921. * prevents it from being freed up. But we have a ref on the folio and once
  922. * that folio is locked, the mapping is pinned.
  923. *
  924. * We're allowed to run sleeping folio_lock() here because we know the caller has
  925. * __GFP_FS.
  926. */
  927. static void handle_write_error(struct address_space *mapping,
  928. struct folio *folio, int error)
  929. {
  930. folio_lock(folio);
  931. if (folio_mapping(folio) == mapping)
  932. mapping_set_error(mapping, error);
  933. folio_unlock(folio);
  934. }
  935. static bool skip_throttle_noprogress(pg_data_t *pgdat)
  936. {
  937. int reclaimable = 0, write_pending = 0;
  938. int i;
  939. /*
  940. * If kswapd is disabled, reschedule if necessary but do not
  941. * throttle as the system is likely near OOM.
  942. */
  943. if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
  944. return true;
  945. /*
  946. * If there are a lot of dirty/writeback folios then do not
  947. * throttle as throttling will occur when the folios cycle
  948. * towards the end of the LRU if still under writeback.
  949. */
  950. for (i = 0; i < MAX_NR_ZONES; i++) {
  951. struct zone *zone = pgdat->node_zones + i;
  952. if (!managed_zone(zone))
  953. continue;
  954. reclaimable += zone_reclaimable_pages(zone);
  955. write_pending += zone_page_state_snapshot(zone,
  956. NR_ZONE_WRITE_PENDING);
  957. }
  958. if (2 * write_pending <= reclaimable)
  959. return true;
  960. return false;
  961. }
  962. void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
  963. {
  964. wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
  965. long timeout, ret;
  966. DEFINE_WAIT(wait);
  967. /*
  968. * Do not throttle IO workers, kthreads other than kswapd or
  969. * workqueues. They may be required for reclaim to make
  970. * forward progress (e.g. journalling workqueues or kthreads).
  971. */
  972. if (!current_is_kswapd() &&
  973. current->flags & (PF_IO_WORKER|PF_KTHREAD)) {
  974. cond_resched();
  975. return;
  976. }
  977. /*
  978. * These figures are pulled out of thin air.
  979. * VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
  980. * parallel reclaimers which is a short-lived event so the timeout is
  981. * short. Failing to make progress or waiting on writeback are
  982. * potentially long-lived events so use a longer timeout. This is shaky
  983. * logic as a failure to make progress could be due to anything from
  984. * writeback to a slow device to excessive referenced folios at the tail
  985. * of the inactive LRU.
  986. */
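/*
 * Summary of the timeouts chosen below: WRITEBACK waits HZ/10,
 * ISOLATED waits HZ/50, CONGESTED and NOPROGRESS wait a single jiffy
 * (or skip the throttle entirely), and the WARN_ON_ONCE default
 * falls back to a full HZ.
 */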
  987. switch(reason) {
  988. case VMSCAN_THROTTLE_WRITEBACK:
  989. timeout = HZ/10;
  990. if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
  991. WRITE_ONCE(pgdat->nr_reclaim_start,
  992. node_page_state(pgdat, NR_THROTTLED_WRITTEN));
  993. }
  994. break;
  995. case VMSCAN_THROTTLE_CONGESTED:
  996. fallthrough;
  997. case VMSCAN_THROTTLE_NOPROGRESS:
  998. if (skip_throttle_noprogress(pgdat)) {
  999. cond_resched();
  1000. return;
  1001. }
  1002. timeout = 1;
  1003. break;
  1004. case VMSCAN_THROTTLE_ISOLATED:
  1005. timeout = HZ/50;
  1006. break;
  1007. default:
  1008. WARN_ON_ONCE(1);
  1009. timeout = HZ;
  1010. break;
  1011. }
  1012. prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
  1013. ret = schedule_timeout(timeout);
  1014. finish_wait(wqh, &wait);
  1015. if (reason == VMSCAN_THROTTLE_WRITEBACK)
  1016. atomic_dec(&pgdat->nr_writeback_throttled);
  1017. trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
  1018. jiffies_to_usecs(timeout - ret),
  1019. reason);
  1020. }
  1021. /*
  1022. * Account for folios written if tasks are throttled waiting on dirty
  1023. * folios to clean. If enough folios have been cleaned since throttling
  1024. * started then wakeup the throttled tasks.
  1025. */
  1026. void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
  1027. int nr_throttled)
  1028. {
  1029. unsigned long nr_written;
  1030. node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);
  1031. /*
  1032. * This is an inaccurate read as the per-cpu deltas may not
  1033. * be synchronised. However, given that the system is
  1034. * writeback throttled, it is not worth taking the penalty
  1035. * of getting an accurate count. At worst, the throttle
  1036. * timeout guarantees forward progress.
  1037. */
  1038. nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
  1039. READ_ONCE(pgdat->nr_reclaim_start);
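/*
 * Illustrative threshold: with SWAP_CLUSTER_MAX = 32 and, say, four
 * throttled tasks, the waiters are woken once more than 128 folios
 * have been written back since throttling began.
 */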
  1040. if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
  1041. wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
  1042. }
  1043. /* possible outcome of pageout() */
  1044. typedef enum {
  1045. /* failed to write folio out, folio is locked */
  1046. PAGE_KEEP,
  1047. /* move folio to the active list, folio is locked */
  1048. PAGE_ACTIVATE,
  1049. /* folio has been sent to the disk successfully, folio is unlocked */
  1050. PAGE_SUCCESS,
  1051. /* folio is clean and locked */
  1052. PAGE_CLEAN,
  1053. } pageout_t;
  1054. /*
  1055. * pageout is called by shrink_folio_list() for each dirty folio.
  1056. * Calls ->writepage().
  1057. */
  1058. static pageout_t pageout(struct folio *folio, struct address_space *mapping,
  1059. struct swap_iocb **plug)
  1060. {
/*
 * If the folio is dirty, only perform writeback if that write
 * will be non-blocking, to prevent this allocation from being
 * stalled by pagecache activity. But note that there may still
 * be stalls if we need to run get_block(). We could test
 * PagePrivate for that.
  1067. *
  1068. * If this process is currently in __generic_file_write_iter() against
  1069. * this folio's queue, we can perform writeback even if that
  1070. * will block.
  1071. *
  1072. * If the folio is swapcache, write it back even if that would
  1073. * block, for some throttling. This happens by accident, because
  1074. * swap_backing_dev_info is bust: it doesn't reflect the
  1075. * congestion state of the swapdevs. Easy to fix, if needed.
  1076. */
  1077. if (!is_page_cache_freeable(folio))
  1078. return PAGE_KEEP;
  1079. if (!mapping) {
  1080. /*
  1081. * Some data journaling orphaned folios can have
  1082. * folio->mapping == NULL while being dirty with clean buffers.
  1083. */
  1084. if (folio_test_private(folio)) {
  1085. if (try_to_free_buffers(folio)) {
  1086. folio_clear_dirty(folio);
  1087. pr_info("%s: orphaned folio\n", __func__);
  1088. return PAGE_CLEAN;
  1089. }
  1090. }
  1091. return PAGE_KEEP;
  1092. }
  1093. if (mapping->a_ops->writepage == NULL)
  1094. return PAGE_ACTIVATE;
  1095. if (folio_clear_dirty_for_io(folio)) {
  1096. int res;
  1097. struct writeback_control wbc = {
  1098. .sync_mode = WB_SYNC_NONE,
  1099. .nr_to_write = SWAP_CLUSTER_MAX,
  1100. .range_start = 0,
  1101. .range_end = LLONG_MAX,
  1102. .for_reclaim = 1,
  1103. .swap_plug = plug,
  1104. };
  1105. folio_set_reclaim(folio);
  1106. res = mapping->a_ops->writepage(&folio->page, &wbc);
  1107. if (res < 0)
  1108. handle_write_error(mapping, folio, res);
  1109. if (res == AOP_WRITEPAGE_ACTIVATE) {
  1110. folio_clear_reclaim(folio);
  1111. return PAGE_ACTIVATE;
  1112. }
  1113. if (!folio_test_writeback(folio)) {
  1114. /* synchronous write or broken a_ops? */
  1115. folio_clear_reclaim(folio);
  1116. }
  1117. trace_mm_vmscan_write_folio(folio);
  1118. node_stat_add_folio(folio, NR_VMSCAN_WRITE);
  1119. return PAGE_SUCCESS;
  1120. }
  1121. return PAGE_CLEAN;
  1122. }
  1123. /*
  1124. * Same as remove_mapping, but if the folio is removed from the mapping, it
  1125. * gets returned with a refcount of 0.
  1126. */
  1127. static int __remove_mapping(struct address_space *mapping, struct folio *folio,
  1128. bool reclaimed, struct mem_cgroup *target_memcg)
  1129. {
  1130. int refcount;
  1131. void *shadow = NULL;
  1132. BUG_ON(!folio_test_locked(folio));
  1133. BUG_ON(mapping != folio_mapping(folio));
  1134. if (!folio_test_swapcache(folio))
  1135. spin_lock(&mapping->host->i_lock);
  1136. xa_lock_irq(&mapping->i_pages);
  1137. /*
 * The non-racy check for a busy folio.
  1139. *
  1140. * Must be careful with the order of the tests. When someone has
  1141. * a ref to the folio, it may be possible that they dirty it then
  1142. * drop the reference. So if the dirty flag is tested before the
  1143. * refcount here, then the following race may occur:
  1144. *
  1145. * get_user_pages(&page);
  1146. * [user mapping goes away]
  1147. * write_to(page);
  1148. * !folio_test_dirty(folio) [good]
  1149. * folio_set_dirty(folio);
  1150. * folio_put(folio);
  1151. * !refcount(folio) [good, discard it]
  1152. *
  1153. * [oops, our write_to data is lost]
  1154. *
  1155. * Reversing the order of the tests ensures such a situation cannot
  1156. * escape unnoticed. The smp_rmb is needed to ensure the folio->flags
  1157. * load is not satisfied before that of folio->_refcount.
  1158. *
  1159. * Note that if the dirty flag is always set via folio_mark_dirty,
  1160. * and thus under the i_pages lock, then this ordering is not required.
  1161. */
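/*
 * The expected references at this point are one held by the isolating
 * caller plus folio_nr_pages() held by the page cache (or swap cache).
 * If anyone else holds a reference, the freeze fails and the folio is
 * kept.
 */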
  1162. refcount = 1 + folio_nr_pages(folio);
  1163. if (!folio_ref_freeze(folio, refcount))
  1164. goto cannot_free;
  1165. /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */
  1166. if (unlikely(folio_test_dirty(folio))) {
  1167. folio_ref_unfreeze(folio, refcount);
  1168. goto cannot_free;
  1169. }
  1170. if (folio_test_swapcache(folio)) {
  1171. swp_entry_t swap = folio_swap_entry(folio);
  1172. /* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */
  1173. if (reclaimed && !mapping_exiting(mapping))
  1174. shadow = workingset_eviction(folio, target_memcg);
  1175. mem_cgroup_swapout(folio, swap);
  1176. __delete_from_swap_cache(folio, swap, shadow);
  1177. xa_unlock_irq(&mapping->i_pages);
  1178. put_swap_folio(folio, swap);
  1179. } else {
  1180. void (*free_folio)(struct folio *);
  1181. free_folio = mapping->a_ops->free_folio;
  1182. /*
  1183. * Remember a shadow entry for reclaimed file cache in
  1184. * order to detect refaults, thus thrashing, later on.
  1185. *
  1186. * But don't store shadows in an address space that is
  1187. * already exiting. This is not just an optimization,
  1188. * inode reclaim needs to empty out the radix tree or
  1189. * the nodes are lost. Don't plant shadows behind its
  1190. * back.
  1191. *
  1192. * We also don't store shadows for DAX mappings because the
  1193. * only page cache folios found in these are zero pages
  1194. * covering holes, and because we don't want to mix DAX
  1195. * exceptional entries and shadow exceptional entries in the
  1196. * same address_space.
  1197. */
  1198. if (reclaimed && folio_is_file_lru(folio) &&
  1199. !mapping_exiting(mapping) && !dax_mapping(mapping))
  1200. shadow = workingset_eviction(folio, target_memcg);
  1201. __filemap_remove_folio(folio, shadow);
  1202. xa_unlock_irq(&mapping->i_pages);
  1203. if (mapping_shrinkable(mapping))
  1204. inode_add_lru(mapping->host);
  1205. spin_unlock(&mapping->host->i_lock);
  1206. if (free_folio)
  1207. free_folio(folio);
  1208. }
  1209. return 1;
  1210. cannot_free:
  1211. xa_unlock_irq(&mapping->i_pages);
  1212. if (!folio_test_swapcache(folio))
  1213. spin_unlock(&mapping->host->i_lock);
  1214. return 0;
  1215. }
  1216. /**
  1217. * remove_mapping() - Attempt to remove a folio from its mapping.
  1218. * @mapping: The address space.
  1219. * @folio: The folio to remove.
  1220. *
  1221. * If the folio is dirty, under writeback or if someone else has a ref
  1222. * on it, removal will fail.
  1223. * Return: The number of pages removed from the mapping. 0 if the folio
  1224. * could not be removed.
  1225. * Context: The caller should have a single refcount on the folio and
  1226. * hold its lock.
  1227. */
  1228. long remove_mapping(struct address_space *mapping, struct folio *folio)
  1229. {
  1230. if (__remove_mapping(mapping, folio, false, NULL)) {
/*
 * Unfreezing the refcount to 1 effectively
 * drops the pagecache ref for us without requiring another
 * atomic operation.
 */
  1236. folio_ref_unfreeze(folio, 1);
  1237. return folio_nr_pages(folio);
  1238. }
  1239. return 0;
  1240. }
  1241. /**
  1242. * folio_putback_lru - Put previously isolated folio onto appropriate LRU list.
  1243. * @folio: Folio to be returned to an LRU list.
  1244. *
  1245. * Add previously isolated @folio to appropriate LRU list.
  1246. * The folio may still be unevictable for other reasons.
  1247. *
  1248. * Context: lru_lock must not be held, interrupts must be enabled.
  1249. */
  1250. void folio_putback_lru(struct folio *folio)
  1251. {
  1252. folio_add_lru(folio);
  1253. folio_put(folio); /* drop ref from isolate */
  1254. }
  1255. enum folio_references {
  1256. FOLIOREF_RECLAIM,
  1257. FOLIOREF_RECLAIM_CLEAN,
  1258. FOLIOREF_KEEP,
  1259. FOLIOREF_ACTIVATE,
  1260. };
  1261. static enum folio_references folio_check_references(struct folio *folio,
  1262. struct scan_control *sc)
  1263. {
  1264. int referenced_ptes, referenced_folio;
  1265. unsigned long vm_flags;
  1266. int ret = 0;
  1267. trace_android_vh_check_folio_look_around_ref(folio, &ret);
  1268. if (ret)
  1269. return ret;
  1270. referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup,
  1271. &vm_flags);
  1272. referenced_folio = folio_test_clear_referenced(folio);
  1273. /*
  1274. * The supposedly reclaimable folio was found to be in a VM_LOCKED vma.
  1275. * Let the folio, now marked Mlocked, be moved to the unevictable list.
  1276. */
  1277. if (vm_flags & VM_LOCKED)
  1278. return FOLIOREF_ACTIVATE;
  1279. /* rmap lock contention: rotate */
  1280. if (referenced_ptes == -1)
  1281. return FOLIOREF_KEEP;
  1282. if (referenced_ptes) {
  1283. /*
  1284. * All mapped folios start out with page table
  1285. * references from the instantiating fault, so we need
  1286. * to look twice if a mapped file/anon folio is used more
  1287. * than once.
  1288. *
  1289. * Mark it and spare it for another trip around the
  1290. * inactive list. Another page table reference will
  1291. * lead to its activation.
  1292. *
  1293. * Note: the mark is set for activated folios as well
  1294. * so that recently deactivated but used folios are
  1295. * quickly recovered.
  1296. */
  1297. folio_set_referenced(folio);
  1298. if (referenced_folio || referenced_ptes > 1)
  1299. return FOLIOREF_ACTIVATE;
  1300. /*
  1301. * Activate file-backed executable folios after first usage.
  1302. */
  1303. if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio))
  1304. return FOLIOREF_ACTIVATE;
  1305. return FOLIOREF_KEEP;
  1306. }
  1307. /* Reclaim if clean, defer dirty folios to writeback */
  1308. if (referenced_folio && folio_is_file_lru(folio))
  1309. return FOLIOREF_RECLAIM_CLEAN;
  1310. return FOLIOREF_RECLAIM;
  1311. }
  1312. /* Check if a folio is dirty or under writeback */
  1313. static void folio_check_dirty_writeback(struct folio *folio,
  1314. bool *dirty, bool *writeback)
  1315. {
  1316. struct address_space *mapping;
/*
 * Anonymous folios are not handled by flushers and must be written
 * from reclaim context. Do not stall reclaim based on them.
 * MADV_FREE anonymous folios are put on the inactive file list too,
 * so they could be mistakenly treated as file LRU folios; an
 * additional anon test is needed to tell them apart.
 */
  1324. if (!folio_is_file_lru(folio) ||
  1325. (folio_test_anon(folio) && !folio_test_swapbacked(folio))) {
  1326. *dirty = false;
  1327. *writeback = false;
  1328. return;
  1329. }
  1330. /* By default assume that the folio flags are accurate */
  1331. *dirty = folio_test_dirty(folio);
  1332. *writeback = folio_test_writeback(folio);
  1333. /* Verify dirty/writeback state if the filesystem supports it */
  1334. if (!folio_test_private(folio))
  1335. return;
  1336. mapping = folio_mapping(folio);
  1337. if (mapping && mapping->a_ops->is_dirty_writeback)
  1338. mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
  1339. }
  1340. static struct page *alloc_demote_page(struct page *page, unsigned long private)
  1341. {
  1342. struct page *target_page;
  1343. nodemask_t *allowed_mask;
  1344. struct migration_target_control *mtc;
  1345. mtc = (struct migration_target_control *)private;
  1346. allowed_mask = mtc->nmask;
/*
 * Make sure we allocate from the target node first, also trying to
 * demote or reclaim pages from the target node via kswapd if we are
 * low on free memory on the target node. If we don't do this and we
 * have free memory on a slower (lower) memory tier, we would start
 * allocating pages from the slower (lower) tiers without even forcing
 * a demotion of cold pages from the target tier. This can result
 * in the kernel placing hot pages in slower (lower) memory tiers.
 */
  1356. mtc->nmask = NULL;
  1357. mtc->gfp_mask |= __GFP_THISNODE;
  1358. target_page = alloc_migration_target(page, (unsigned long)mtc);
  1359. if (target_page)
  1360. return target_page;
  1361. mtc->gfp_mask &= ~__GFP_THISNODE;
  1362. mtc->nmask = allowed_mask;
  1363. return alloc_migration_target(page, (unsigned long)mtc);
  1364. }
  1365. /*
  1366. * Take folios on @demote_folios and attempt to demote them to another node.
  1367. * Folios which are not demoted are left on @demote_folios.
  1368. */
  1369. static unsigned int demote_folio_list(struct list_head *demote_folios,
  1370. struct pglist_data *pgdat)
  1371. {
  1372. int target_nid = next_demotion_node(pgdat->node_id);
  1373. unsigned int nr_succeeded;
  1374. nodemask_t allowed_mask;
  1375. struct migration_target_control mtc = {
  1376. /*
  1377. * Allocate from 'node', or fail quickly and quietly.
  1378. * When this happens, 'page' will likely just be discarded
  1379. * instead of migrated.
  1380. */
  1381. .gfp_mask = (GFP_HIGHUSER_MOVABLE & ~__GFP_RECLAIM) | __GFP_NOWARN |
  1382. __GFP_NOMEMALLOC | GFP_NOWAIT,
  1383. .nid = target_nid,
  1384. .nmask = &allowed_mask
  1385. };
  1386. if (list_empty(demote_folios))
  1387. return 0;
  1388. if (target_nid == NUMA_NO_NODE)
  1389. return 0;
  1390. node_get_allowed_targets(pgdat, &allowed_mask);
  1391. /* Demotion ignores all cpuset and mempolicy settings */
  1392. migrate_pages(demote_folios, alloc_demote_page, NULL,
  1393. (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
  1394. &nr_succeeded);
  1395. if (current_is_kswapd())
  1396. __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
  1397. else
  1398. __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
  1399. return nr_succeeded;
  1400. }
  1401. static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask)
  1402. {
  1403. if (gfp_mask & __GFP_FS)
  1404. return true;
  1405. if (!folio_test_swapcache(folio) || !(gfp_mask & __GFP_IO))
  1406. return false;
  1407. /*
  1408. * We can "enter_fs" for swap-cache with only __GFP_IO
  1409. * providing this isn't SWP_FS_OPS.
 * ->flags can be updated non-atomically (scan_swap_map_slots),
  1411. * but that will never affect SWP_FS_OPS, so the data_race
  1412. * is safe.
  1413. */
  1414. return !data_race(folio_swap_flags(folio) & SWP_FS_OPS);
  1415. }
  1416. /*
  1417. * shrink_folio_list() returns the number of reclaimed pages
  1418. */
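/*
 * Rough flow for each folio on the list (descriptive summary; the
 * per-step comments below are authoritative): lock the folio; skip it
 * if it is unevictable, or mapped while !may_unmap; account its
 * dirty/writeback state and possibly stall on writeback; check
 * references; optionally queue the folio for demotion; allocate swap
 * for anonymous folios; unmap; write back dirty folios; drop buffers;
 * and finally detach the folio from its mapping so it can be freed.
 * Anything that cannot complete is activated or kept for another pass.
 */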
  1419. static unsigned int shrink_folio_list(struct list_head *folio_list,
  1420. struct pglist_data *pgdat, struct scan_control *sc,
  1421. struct reclaim_stat *stat, bool ignore_references)
  1422. {
  1423. LIST_HEAD(ret_folios);
  1424. LIST_HEAD(free_folios);
  1425. LIST_HEAD(demote_folios);
  1426. unsigned int nr_reclaimed = 0;
  1427. unsigned int pgactivate = 0;
  1428. bool do_demote_pass;
  1429. struct swap_iocb *plug = NULL;
  1430. memset(stat, 0, sizeof(*stat));
  1431. cond_resched();
  1432. do_demote_pass = can_demote(pgdat->node_id, sc);
  1433. retry:
  1434. while (!list_empty(folio_list)) {
  1435. struct address_space *mapping;
  1436. struct folio *folio;
  1437. enum folio_references references = FOLIOREF_RECLAIM;
  1438. bool dirty, writeback;
  1439. unsigned int nr_pages;
  1440. cond_resched();
  1441. folio = lru_to_folio(folio_list);
  1442. list_del(&folio->lru);
  1443. if (!folio_trylock(folio))
  1444. goto keep;
  1445. VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
  1446. nr_pages = folio_nr_pages(folio);
  1447. /* Account the number of base pages */
  1448. sc->nr_scanned += nr_pages;
  1449. if (unlikely(!folio_evictable(folio)))
  1450. goto activate_locked;
  1451. if (!sc->may_unmap && folio_mapped(folio))
  1452. goto keep_locked;
  1453. /* folio_update_gen() tried to promote this page? */
  1454. if (lru_gen_enabled() && !ignore_references &&
  1455. folio_mapped(folio) && folio_test_referenced(folio))
  1456. goto keep_locked;
  1457. /*
  1458. * The number of dirty pages determines if a node is marked
  1459. * reclaim_congested. kswapd will stall and start writing
  1460. * folios if the tail of the LRU is all dirty unqueued folios.
  1461. */
  1462. folio_check_dirty_writeback(folio, &dirty, &writeback);
  1463. if (dirty || writeback)
  1464. stat->nr_dirty += nr_pages;
  1465. if (dirty && !writeback)
  1466. stat->nr_unqueued_dirty += nr_pages;
  1467. /*
  1468. * Treat this folio as congested if folios are cycling
  1469. * through the LRU so quickly that the folios marked
  1470. * for immediate reclaim are making it to the end of
  1471. * the LRU a second time.
  1472. */
  1473. if (writeback && folio_test_reclaim(folio))
  1474. stat->nr_congested += nr_pages;
  1475. /*
  1476. * If a folio at the tail of the LRU is under writeback, there
  1477. * are three cases to consider.
  1478. *
  1479. * 1) If reclaim is encountering an excessive number
  1480. * of folios under writeback and this folio has both
  1481. * the writeback and reclaim flags set, then it
  1482. * indicates that folios are being queued for I/O but
  1483. * are being recycled through the LRU before the I/O
  1484. * can complete. Waiting on the folio itself risks an
  1485. * indefinite stall if it is impossible to writeback
  1486. * the folio due to I/O error or disconnected storage
  1487. * so instead note that the LRU is being scanned too
  1488. * quickly and the caller can stall after the folio
  1489. * list has been processed.
  1490. *
  1491. * 2) Global or new memcg reclaim encounters a folio that is
  1492. * not marked for immediate reclaim, or the caller does not
  1493. * have __GFP_FS (or __GFP_IO if it's simply going to swap,
  1494. * not to fs). In this case mark the folio for immediate
  1495. * reclaim and continue scanning.
  1496. *
  1497. * Require may_enter_fs() because we would wait on fs, which
  1498. * may not have submitted I/O yet. And the loop driver might
  1499. * enter reclaim, and deadlock if it waits on a folio for
  1500. * which it is needed to do the write (loop masks off
  1501. * __GFP_IO|__GFP_FS for this reason); but more thought
  1502. * would probably show more reasons.
  1503. *
  1504. * 3) Legacy memcg encounters a folio that already has the
  1505. * reclaim flag set. memcg does not have any dirty folio
  1506. * throttling so we could easily OOM just because too many
  1507. * folios are in writeback and there is nothing else to
  1508. * reclaim. Wait for the writeback to complete.
  1509. *
  1510. * In cases 1) and 2) we activate the folios to get them out of
  1511. * the way while we continue scanning for clean folios on the
  1512. * inactive list and refilling from the active list. The
  1513. * observation here is that waiting for disk writes is more
  1514. * expensive than potentially causing reloads down the line.
  1515. * Since they're marked for immediate reclaim, they won't put
  1516. * memory pressure on the cache working set any longer than it
  1517. * takes to write them to disk.
  1518. */
  1519. if (folio_test_writeback(folio)) {
  1520. /* Case 1 above */
  1521. if (current_is_kswapd() &&
  1522. folio_test_reclaim(folio) &&
  1523. test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
  1524. stat->nr_immediate += nr_pages;
  1525. goto activate_locked;
  1526. /* Case 2 above */
  1527. } else if (writeback_throttling_sane(sc) ||
  1528. !folio_test_reclaim(folio) ||
  1529. !may_enter_fs(folio, sc->gfp_mask)) {
  1530. /*
  1531. * This is slightly racy -
  1532. * folio_end_writeback() might have
  1533. * just cleared the reclaim flag, then
  1534. * setting the reclaim flag here ends up
  1535. * interpreted as the readahead flag - but
  1536. * that does not matter enough to care.
  1537. * What we do want is for this folio to
  1538. * have the reclaim flag set next time
  1539. * memcg reclaim reaches the tests above,
  1540. * so it will then wait for writeback to
  1541. * avoid OOM; and it's also appropriate
  1542. * in global reclaim.
  1543. */
  1544. folio_set_reclaim(folio);
  1545. stat->nr_writeback += nr_pages;
  1546. goto activate_locked;
  1547. /* Case 3 above */
  1548. } else {
  1549. folio_unlock(folio);
  1550. folio_wait_writeback(folio);
  1551. /* then go back and try same folio again */
  1552. list_add_tail(&folio->lru, folio_list);
  1553. continue;
  1554. }
  1555. }
  1556. if (!ignore_references)
  1557. references = folio_check_references(folio, sc);
  1558. switch (references) {
  1559. case FOLIOREF_ACTIVATE:
  1560. goto activate_locked;
  1561. case FOLIOREF_KEEP:
  1562. stat->nr_ref_keep += nr_pages;
  1563. goto keep_locked;
  1564. case FOLIOREF_RECLAIM:
  1565. case FOLIOREF_RECLAIM_CLEAN:
  1566. ; /* try to reclaim the folio below */
  1567. }
  1568. /*
  1569. * Before reclaiming the folio, try to relocate
  1570. * its contents to another node.
  1571. */
  1572. if (do_demote_pass &&
  1573. (thp_migration_supported() || !folio_test_large(folio))) {
  1574. list_add(&folio->lru, &demote_folios);
  1575. folio_unlock(folio);
  1576. continue;
  1577. }
  1578. /*
  1579. * Anonymous process memory has backing store?
  1580. * Try to allocate it some swap space here.
  1581. * Lazyfree folio could be freed directly
  1582. */
  1583. if (folio_test_anon(folio) && folio_test_swapbacked(folio)) {
  1584. if (!folio_test_swapcache(folio)) {
  1585. if (!(sc->gfp_mask & __GFP_IO))
  1586. goto keep_locked;
  1587. if (folio_maybe_dma_pinned(folio))
  1588. goto keep_locked;
  1589. if (folio_test_large(folio)) {
  1590. /* cannot split folio, skip it */
  1591. if (!can_split_folio(folio, NULL))
  1592. goto activate_locked;
  1593. /*
  1594. * Split folios without a PMD map right
  1595. * away. Chances are some or all of the
  1596. * tail pages can be freed without IO.
  1597. */
  1598. if (!folio_entire_mapcount(folio) &&
  1599. split_folio_to_list(folio,
  1600. folio_list))
  1601. goto activate_locked;
  1602. }
  1603. if (!add_to_swap(folio)) {
  1604. if (!folio_test_large(folio))
  1605. goto activate_locked_split;
  1606. /* Fallback to swap normal pages */
  1607. if (split_folio_to_list(folio,
  1608. folio_list))
  1609. goto activate_locked;
  1610. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  1611. count_vm_event(THP_SWPOUT_FALLBACK);
  1612. #endif
  1613. if (!add_to_swap(folio))
  1614. goto activate_locked_split;
  1615. }
  1616. }
  1617. } else if (folio_test_swapbacked(folio) &&
  1618. folio_test_large(folio)) {
  1619. /* Split shmem folio */
  1620. if (split_folio_to_list(folio, folio_list))
  1621. goto keep_locked;
  1622. }
  1623. /*
  1624. * If the folio was split above, the tail pages will make
  1625. * their own pass through this function and be accounted
  1626. * then.
  1627. */
  1628. if ((nr_pages > 1) && !folio_test_large(folio)) {
  1629. sc->nr_scanned -= (nr_pages - 1);
  1630. nr_pages = 1;
  1631. }
  1632. /*
  1633. * The folio is mapped into the page tables of one or more
  1634. * processes. Try to unmap it here.
  1635. */
  1636. if (folio_mapped(folio)) {
  1637. enum ttu_flags flags = TTU_BATCH_FLUSH;
  1638. bool was_swapbacked = folio_test_swapbacked(folio);
  1639. if (folio_test_pmd_mappable(folio))
  1640. flags |= TTU_SPLIT_HUGE_PMD;
  1641. try_to_unmap(folio, flags);
  1642. if (folio_mapped(folio)) {
  1643. stat->nr_unmap_fail += nr_pages;
  1644. if (!was_swapbacked &&
  1645. folio_test_swapbacked(folio))
  1646. stat->nr_lazyfree_fail += nr_pages;
  1647. goto activate_locked;
  1648. }
  1649. }
  1650. /*
  1651. * Folio is unmapped now so it cannot be newly pinned anymore.
  1652. * No point in trying to reclaim folio if it is pinned.
  1653. * Furthermore we don't want to reclaim underlying fs metadata
  1654. * if the folio is pinned and thus potentially modified by the
  1655. * pinning process as that may upset the filesystem.
  1656. */
  1657. if (folio_maybe_dma_pinned(folio))
  1658. goto activate_locked;
  1659. mapping = folio_mapping(folio);
  1660. if (folio_test_dirty(folio)) {
  1661. /*
  1662. * Only kswapd can writeback filesystem folios
  1663. * to avoid risk of stack overflow. But avoid
  1664. * injecting inefficient single-folio I/O into
  1665. * flusher writeback as much as possible: only
  1666. * write folios when we've encountered many
  1667. * dirty folios, and when we've already scanned
  1668. * the rest of the LRU for clean folios and see
  1669. * the same dirty folios again (with the reclaim
  1670. * flag set).
  1671. */
  1672. if (folio_is_file_lru(folio) &&
  1673. (!current_is_kswapd() ||
  1674. !folio_test_reclaim(folio) ||
  1675. !test_bit(PGDAT_DIRTY, &pgdat->flags))) {
  1676. /*
  1677. * Immediately reclaim when written back.
  1678. * Similar in principle to deactivate_page()
  1679. * except we already have the folio isolated
  1680. * and know it's dirty
  1681. */
  1682. node_stat_mod_folio(folio, NR_VMSCAN_IMMEDIATE,
  1683. nr_pages);
  1684. folio_set_reclaim(folio);
  1685. goto activate_locked;
  1686. }
  1687. if (references == FOLIOREF_RECLAIM_CLEAN)
  1688. goto keep_locked;
  1689. if (!may_enter_fs(folio, sc->gfp_mask))
  1690. goto keep_locked;
  1691. if (!sc->may_writepage)
  1692. goto keep_locked;
  1693. /*
  1694. * Folio is dirty. Flush the TLB if a writable entry
  1695. * potentially exists to avoid CPU writes after I/O
  1696. * starts and then write it out here.
  1697. */
  1698. try_to_unmap_flush_dirty();
  1699. switch (pageout(folio, mapping, &plug)) {
  1700. case PAGE_KEEP:
  1701. goto keep_locked;
  1702. case PAGE_ACTIVATE:
  1703. goto activate_locked;
  1704. case PAGE_SUCCESS:
  1705. stat->nr_pageout += nr_pages;
  1706. if (folio_test_writeback(folio))
  1707. goto keep;
  1708. if (folio_test_dirty(folio))
  1709. goto keep;
  1710. /*
  1711. * A synchronous write - probably a ramdisk. Go
  1712. * ahead and try to reclaim the folio.
  1713. */
  1714. if (!folio_trylock(folio))
  1715. goto keep;
  1716. if (folio_test_dirty(folio) ||
  1717. folio_test_writeback(folio))
  1718. goto keep_locked;
  1719. mapping = folio_mapping(folio);
  1720. fallthrough;
  1721. case PAGE_CLEAN:
  1722. ; /* try to free the folio below */
  1723. }
  1724. }
  1725. /*
  1726. * If the folio has buffers, try to free the buffer
  1727. * mappings associated with this folio. If we succeed
  1728. * we try to free the folio as well.
  1729. *
  1730. * We do this even if the folio is dirty.
  1731. * filemap_release_folio() does not perform I/O, but it
  1732. * is possible for a folio to have the dirty flag set,
  1733. * but it is actually clean (all its buffers are clean).
  1734. * This happens if the buffers were written out directly,
  1735. * with submit_bh(). ext3 will do this, as well as
  1736. * the blockdev mapping. filemap_release_folio() will
  1737. * discover that cleanness and will drop the buffers
  1738. * and mark the folio clean - it can be freed.
  1739. *
  1740. * Rarely, folios can have buffers and no ->mapping.
  1741. * These are the folios which were not successfully
  1742. * invalidated in truncate_cleanup_folio(). We try to
  1743. * drop those buffers here and if that worked, and the
  1744. * folio is no longer mapped into process address space
  1745. * (refcount == 1) it can be freed. Otherwise, leave
  1746. * the folio on the LRU so it is swappable.
  1747. */
  1748. if (folio_has_private(folio)) {
  1749. if (!filemap_release_folio(folio, sc->gfp_mask))
  1750. goto activate_locked;
  1751. if (!mapping && folio_ref_count(folio) == 1) {
  1752. folio_unlock(folio);
  1753. if (folio_put_testzero(folio))
  1754. goto free_it;
  1755. else {
/*
 * Rare race with a speculative
 * reference: that reference will free
 * this folio shortly, so we may
 * increment nr_reclaimed here (and
 * leave it off the LRU).
 */
  1763. nr_reclaimed += nr_pages;
  1764. continue;
  1765. }
  1766. }
  1767. }
  1768. if (folio_test_anon(folio) && !folio_test_swapbacked(folio)) {
  1769. /* follow __remove_mapping for reference */
  1770. if (!folio_ref_freeze(folio, 1))
  1771. goto keep_locked;
  1772. /*
  1773. * The folio has only one reference left, which is
  1774. * from the isolation. After the caller puts the
  1775. * folio back on the lru and drops the reference, the
  1776. * folio will be freed anyway. It doesn't matter
  1777. * which lru it goes on. So we don't bother checking
  1778. * the dirty flag here.
  1779. */
  1780. count_vm_events(PGLAZYFREED, nr_pages);
  1781. count_memcg_folio_events(folio, PGLAZYFREED, nr_pages);
  1782. } else if (!mapping || !__remove_mapping(mapping, folio, true,
  1783. sc->target_mem_cgroup))
  1784. goto keep_locked;
  1785. folio_unlock(folio);
  1786. free_it:
  1787. /*
  1788. * Folio may get swapped out as a whole, need to account
  1789. * all pages in it.
  1790. */
  1791. nr_reclaimed += nr_pages;
/*
 * Is there a need to periodically free the accumulated free_folios
 * list? It would appear not, as the counts should be low.
 */
  1796. if (unlikely(folio_test_large(folio)))
  1797. destroy_large_folio(folio);
  1798. else
  1799. list_add(&folio->lru, &free_folios);
  1800. continue;
  1801. activate_locked_split:
/*
 * The tail pages that failed to be added to the swap cache
 * reach here. Fix up nr_scanned and nr_pages.
 */
  1806. if (nr_pages > 1) {
  1807. sc->nr_scanned -= (nr_pages - 1);
  1808. nr_pages = 1;
  1809. }
  1810. activate_locked:
  1811. /* Not a candidate for swapping, so reclaim swap space. */
  1812. if (folio_test_swapcache(folio) &&
  1813. (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio)))
  1814. folio_free_swap(folio);
  1815. VM_BUG_ON_FOLIO(folio_test_active(folio), folio);
  1816. if (!folio_test_mlocked(folio)) {
  1817. int type = folio_is_file_lru(folio);
  1818. folio_set_active(folio);
  1819. stat->nr_activate[type] += nr_pages;
  1820. count_memcg_folio_events(folio, PGACTIVATE, nr_pages);
  1821. }
  1822. keep_locked:
  1823. folio_unlock(folio);
  1824. keep:
  1825. list_add(&folio->lru, &ret_folios);
  1826. VM_BUG_ON_FOLIO(folio_test_lru(folio) ||
  1827. folio_test_unevictable(folio), folio);
  1828. }
  1829. /* 'folio_list' is always empty here */
  1830. /* Migrate folios selected for demotion */
  1831. nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
  1832. /* Folios that could not be demoted are still in @demote_folios */
  1833. if (!list_empty(&demote_folios)) {
  1834. /* Folios which weren't demoted go back on @folio_list for retry: */
  1835. list_splice_init(&demote_folios, folio_list);
  1836. do_demote_pass = false;
  1837. goto retry;
  1838. }
  1839. pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
  1840. mem_cgroup_uncharge_list(&free_folios);
  1841. try_to_unmap_flush();
  1842. free_unref_page_list(&free_folios);
  1843. list_splice(&ret_folios, folio_list);
  1844. count_vm_events(PGACTIVATE, pgactivate);
  1845. if (plug)
  1846. swap_write_unplug(plug);
  1847. return nr_reclaimed;
  1848. }
  1849. unsigned int reclaim_clean_pages_from_list(struct zone *zone,
  1850. struct list_head *folio_list)
  1851. {
  1852. struct scan_control sc = {
  1853. .gfp_mask = GFP_KERNEL,
  1854. .may_unmap = 1,
  1855. };
  1856. struct reclaim_stat stat;
  1857. unsigned int nr_reclaimed;
  1858. struct folio *folio, *next;
  1859. LIST_HEAD(clean_folios);
  1860. unsigned int noreclaim_flag;
  1861. list_for_each_entry_safe(folio, next, folio_list, lru) {
  1862. if (!folio_test_hugetlb(folio) && folio_is_file_lru(folio) &&
  1863. !folio_test_dirty(folio) && !__folio_test_movable(folio) &&
  1864. !folio_test_unevictable(folio)) {
  1865. folio_clear_active(folio);
  1866. list_move(&folio->lru, &clean_folios);
  1867. }
  1868. }
  1869. /*
  1870. * We should be safe here since we are only dealing with file pages and
  1871. * we are not kswapd and therefore cannot write dirty file pages. But
  1872. * call memalloc_noreclaim_save() anyway, just in case these conditions
  1873. * change in the future.
  1874. */
  1875. noreclaim_flag = memalloc_noreclaim_save();
  1876. nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc,
  1877. &stat, true);
  1878. memalloc_noreclaim_restore(noreclaim_flag);
  1879. list_splice(&clean_folios, folio_list);
  1880. mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
  1881. -(long)nr_reclaimed);
/*
 * Since lazyfree pages are isolated from the file LRU from the
 * beginning, they will rotate back to the anonymous LRU in the end
 * if we failed to discard them, so the isolated counts would be
 * mismatched. Compensate the isolated count for both LRU lists.
 */
  1888. mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
  1889. stat.nr_lazyfree_fail);
  1890. mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
  1891. -(long)stat.nr_lazyfree_fail);
  1892. return nr_reclaimed;
  1893. }
  1894. /*
  1895. * Update LRU sizes after isolating pages. The LRU size updates must
  1896. * be complete before mem_cgroup_update_lru_size due to a sanity check.
  1897. */
  1898. static __always_inline void update_lru_sizes(struct lruvec *lruvec,
  1899. enum lru_list lru, unsigned long *nr_zone_taken)
  1900. {
  1901. int zid;
  1902. for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  1903. if (!nr_zone_taken[zid])
  1904. continue;
  1905. update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
  1906. }
  1907. }
  1908. #ifdef CONFIG_CMA
/*
 * It is a waste of effort to scan and reclaim CMA pages if they are not
 * available for the current allocation context. Kswapd cannot be enrolled
 * because, using sc->gfp_mask = GFP_KERNEL, it cannot distinguish this
 * scenario.
 */
  1914. static bool skip_cma(struct folio *folio, struct scan_control *sc)
  1915. {
  1916. return !current_is_kswapd() &&
  1917. gfp_migratetype(sc->gfp_mask) != MIGRATE_MOVABLE &&
  1918. get_pageblock_migratetype(&folio->page) == MIGRATE_CMA;
  1919. }
  1920. #else
  1921. static bool skip_cma(struct folio *folio, struct scan_control *sc)
  1922. {
  1923. return false;
  1924. }
  1925. #endif
/*
 * Isolate folios from the lruvec, filling @dst with up to nr_to_scan folios.
 *
 * lruvec->lru_lock is heavily contended. Some of the functions that
 * shrink the lists perform better by taking out a batch of pages
 * and working on them outside the LRU lock.
 *
 * For pagecache intensive workloads, this function is the hottest
 * spot in the kernel (apart from copy_*_user functions).
 *
 * lruvec->lru_lock must be held before calling this function.
 *
 * @nr_to_scan: The number of eligible pages to look through on the list.
 * @lruvec: The LRU vector to pull pages from.
 * @dst: The temp list to put pages on to.
 * @nr_scanned: The number of pages that were scanned.
 * @sc: The scan_control struct for this reclaim session.
 * @lru: LRU list id for isolating.
 *
 * Returns how many pages were moved onto *@dst.
 */
  1947. static unsigned long isolate_lru_folios(unsigned long nr_to_scan,
  1948. struct lruvec *lruvec, struct list_head *dst,
  1949. unsigned long *nr_scanned, struct scan_control *sc,
  1950. enum lru_list lru)
  1951. {
  1952. struct list_head *src = &lruvec->lists[lru];
  1953. unsigned long nr_taken = 0;
  1954. unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 };
  1955. unsigned long nr_skipped[MAX_NR_ZONES] = { 0, };
  1956. unsigned long skipped = 0;
  1957. unsigned long scan, total_scan, nr_pages;
  1958. LIST_HEAD(folios_skipped);
  1959. total_scan = 0;
  1960. scan = 0;
  1961. while (scan < nr_to_scan && !list_empty(src)) {
  1962. struct list_head *move_to = src;
  1963. struct folio *folio;
  1964. folio = lru_to_folio(src);
  1965. prefetchw_prev_lru_folio(folio, src, flags);
  1966. nr_pages = folio_nr_pages(folio);
  1967. total_scan += nr_pages;
  1968. if (folio_zonenum(folio) > sc->reclaim_idx ||
  1969. skip_cma(folio, sc)) {
  1970. nr_skipped[folio_zonenum(folio)] += nr_pages;
  1971. move_to = &folios_skipped;
  1972. goto move;
  1973. }
  1974. /*
  1975. * Do not count skipped folios because that makes the function
  1976. * return with no isolated folios if the LRU mostly contains
  1977. * ineligible folios. This causes the VM to not reclaim any
  1978. * folios, triggering a premature OOM.
  1979. * Account all pages in a folio.
  1980. */
  1981. scan += nr_pages;
  1982. if (!folio_test_lru(folio))
  1983. goto move;
  1984. if (!sc->may_unmap && folio_mapped(folio))
  1985. goto move;
  1986. /*
  1987. * Be careful not to clear the lru flag until after we're
  1988. * sure the folio is not being freed elsewhere -- the
  1989. * folio release code relies on it.
  1990. */
  1991. if (unlikely(!folio_try_get(folio)))
  1992. goto move;
  1993. if (!folio_test_clear_lru(folio)) {
  1994. /* Another thread is already isolating this folio */
  1995. folio_put(folio);
  1996. goto move;
  1997. }
  1998. nr_taken += nr_pages;
  1999. nr_zone_taken[folio_zonenum(folio)] += nr_pages;
  2000. move_to = dst;
  2001. move:
  2002. list_move(&folio->lru, move_to);
  2003. }
  2004. /*
  2005. * Splice any skipped folios to the start of the LRU list. Note that
  2006. * this disrupts the LRU order when reclaiming for lower zones but
  2007. * we cannot splice to the tail. If we did then the SWAP_CLUSTER_MAX
  2008. * scanning would soon rescan the same folios to skip and waste lots
  2009. * of cpu cycles.
  2010. */
  2011. if (!list_empty(&folios_skipped)) {
  2012. int zid;
  2013. list_splice(&folios_skipped, src);
  2014. for (zid = 0; zid < MAX_NR_ZONES; zid++) {
  2015. if (!nr_skipped[zid])
  2016. continue;
  2017. __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]);
  2018. skipped += nr_skipped[zid];
  2019. }
  2020. }
  2021. *nr_scanned = total_scan;
  2022. trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan,
  2023. total_scan, skipped, nr_taken,
  2024. sc->may_unmap ? 0 : ISOLATE_UNMAPPED, lru);
  2025. update_lru_sizes(lruvec, lru, nr_zone_taken);
  2026. return nr_taken;
  2027. }
  2028. /**
  2029. * folio_isolate_lru() - Try to isolate a folio from its LRU list.
  2030. * @folio: Folio to isolate from its LRU list.
  2031. *
  2032. * Isolate a @folio from an LRU list and adjust the vmstat statistic
  2033. * corresponding to whatever LRU list the folio was on.
  2034. *
  2035. * The folio will have its LRU flag cleared. If it was found on the
  2036. * active list, it will have the Active flag set. If it was found on the
  2037. * unevictable list, it will have the Unevictable flag set. These flags
  2038. * may need to be cleared by the caller before letting the page go.
  2039. *
  2040. * Context:
  2041. *
  2042. * (1) Must be called with an elevated refcount on the folio. This is a
  2043. * fundamental difference from isolate_lru_folios() (which is called
  2044. * without a stable reference).
  2045. * (2) The lru_lock must not be held.
  2046. * (3) Interrupts must be enabled.
  2047. *
  2048. * Return: 0 if the folio was removed from an LRU list.
  2049. * -EBUSY if the folio was not on an LRU list.
  2050. */
  2051. int folio_isolate_lru(struct folio *folio)
  2052. {
  2053. int ret = -EBUSY;
  2054. VM_BUG_ON_FOLIO(!folio_ref_count(folio), folio);
  2055. if (folio_test_clear_lru(folio)) {
  2056. struct lruvec *lruvec;
  2057. folio_get(folio);
  2058. lruvec = folio_lruvec_lock_irq(folio);
  2059. lruvec_del_folio(lruvec, folio);
  2060. unlock_page_lruvec_irq(lruvec);
  2061. ret = 0;
  2062. }
  2063. return ret;
  2064. }
/*
 * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
 * then get rescheduled. When there is a massive number of tasks doing page
 * allocation, such sleeping direct reclaimers may keep piling up on each CPU;
 * the LRU list will shrink and be scanned faster than necessary, leading to
 * unnecessary swapping, thrashing and OOM.
 */
  2072. static int too_many_isolated(struct pglist_data *pgdat, int file,
  2073. struct scan_control *sc)
  2074. {
  2075. unsigned long inactive, isolated;
  2076. bool too_many;
  2077. if (current_is_kswapd())
  2078. return 0;
  2079. if (!writeback_throttling_sane(sc))
  2080. return 0;
  2081. if (file) {
  2082. inactive = node_page_state(pgdat, NR_INACTIVE_FILE);
  2083. isolated = node_page_state(pgdat, NR_ISOLATED_FILE);
  2084. } else {
  2085. inactive = node_page_state(pgdat, NR_INACTIVE_ANON);
  2086. isolated = node_page_state(pgdat, NR_ISOLATED_ANON);
  2087. }
  2088. /*
  2089. * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
  2090. * won't get blocked by normal direct-reclaimers, forming a circular
  2091. * deadlock.
  2092. */
  2093. if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
  2094. inactive >>= 3;
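/*
 * Net effect: a fully capable GFP_KERNEL-style reclaimer is considered
 * "too many isolated" once isolated pages exceed one eighth of the
 * inactive list, while GFP_NOIO/GFP_NOFS callers are only limited by
 * the full inactive count.
 */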
  2095. too_many = isolated > inactive;
  2096. /* Wake up tasks throttled due to too_many_isolated. */
  2097. if (!too_many)
  2098. wake_throttle_isolated(pgdat);
  2099. return too_many;
  2100. }
  2101. /*
  2102. * move_folios_to_lru() moves folios from private @list to appropriate LRU list.
  2103. * On return, @list is reused as a list of folios to be freed by the caller.
  2104. *
  2105. * Returns the number of pages moved to the given lruvec.
  2106. */
  2107. static unsigned int move_folios_to_lru(struct lruvec *lruvec,
  2108. struct list_head *list)
  2109. {
  2110. int nr_pages, nr_moved = 0;
  2111. LIST_HEAD(folios_to_free);
  2112. while (!list_empty(list)) {
  2113. struct folio *folio = lru_to_folio(list);
  2114. VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
  2115. list_del(&folio->lru);
  2116. if (unlikely(!folio_evictable(folio))) {
  2117. spin_unlock_irq(&lruvec->lru_lock);
  2118. folio_putback_lru(folio);
  2119. spin_lock_irq(&lruvec->lru_lock);
  2120. continue;
  2121. }
/*
 * The folio_set_lru needs to be kept here for list integrity.
 * Otherwise:
 *   #0 move_folios_to_lru             #1 release_pages
 *   if (!folio_put_testzero())
 *                                     if (folio_put_testzero())
 *                                       !lru //skip lru_lock
 *   folio_set_lru()
 *   list_add(&folio->lru,)
 *                                     list_add(&folio->lru,)
 */
  2133. folio_set_lru(folio);
  2134. if (unlikely(folio_put_testzero(folio))) {
  2135. __folio_clear_lru_flags(folio);
  2136. if (unlikely(folio_test_large(folio))) {
  2137. spin_unlock_irq(&lruvec->lru_lock);
  2138. destroy_large_folio(folio);
  2139. spin_lock_irq(&lruvec->lru_lock);
  2140. } else
  2141. list_add(&folio->lru, &folios_to_free);
  2142. continue;
  2143. }
  2144. /*
  2145. * All pages were isolated from the same lruvec (and isolation
  2146. * inhibits memcg migration).
  2147. */
  2148. VM_BUG_ON_FOLIO(!folio_matches_lruvec(folio, lruvec), folio);
  2149. lruvec_add_folio(lruvec, folio);
  2150. nr_pages = folio_nr_pages(folio);
  2151. nr_moved += nr_pages;
  2152. if (folio_test_active(folio))
  2153. workingset_age_nonresident(lruvec, nr_pages);
  2154. }
  2155. /*
  2156. * To save our caller's stack, now use input list for pages to free.
  2157. */
  2158. list_splice(&folios_to_free, list);
  2159. return nr_moved;
  2160. }
  2161. /*
  2162. * If a kernel thread (such as nfsd for loop-back mounts) services a backing
  2163. * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case
  2164. * we should not throttle. Otherwise it is safe to do so.
  2165. */
  2166. static int current_may_throttle(void)
  2167. {
  2168. return !(current->flags & PF_LOCAL_THROTTLE);
  2169. }
  2170. /*
  2171. * shrink_inactive_list() is a helper for shrink_node(). It returns the number
  2172. * of reclaimed pages
  2173. */
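/*
 * Descriptive outline: throttle if too many folios are already isolated,
 * isolate a batch from the inactive list under lru_lock, reclaim it via
 * shrink_folio_list(), put the survivors back with move_folios_to_lru(),
 * then update the vmstat/memcg counters and wake the flusher threads if
 * every isolated folio turned out to be an unqueued dirty folio.
 */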
  2174. static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
  2175. struct lruvec *lruvec, struct scan_control *sc,
  2176. enum lru_list lru)
  2177. {
  2178. LIST_HEAD(folio_list);
  2179. unsigned long nr_scanned;
  2180. unsigned int nr_reclaimed = 0;
  2181. unsigned long nr_taken;
  2182. struct reclaim_stat stat;
  2183. bool file = is_file_lru(lru);
  2184. enum vm_event_item item;
  2185. struct pglist_data *pgdat = lruvec_pgdat(lruvec);
  2186. bool stalled = false;
  2187. while (unlikely(too_many_isolated(pgdat, file, sc))) {
  2188. if (stalled)
  2189. return 0;
  2190. /* wait a bit for the reclaimer. */
  2191. stalled = true;
  2192. reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
  2193. /* We are about to die and free our memory. Return now. */
  2194. if (fatal_signal_pending(current))
  2195. return SWAP_CLUSTER_MAX;
  2196. }
  2197. lru_add_drain();
  2198. spin_lock_irq(&lruvec->lru_lock);
  2199. nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list,
  2200. &nr_scanned, sc, lru);
  2201. __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
  2202. item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
  2203. if (!cgroup_reclaim(sc))
  2204. __count_vm_events(item, nr_scanned);
  2205. __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
  2206. __count_vm_events(PGSCAN_ANON + file, nr_scanned);
  2207. spin_unlock_irq(&lruvec->lru_lock);
  2208. if (nr_taken == 0)
  2209. return 0;
  2210. nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false);
  2211. spin_lock_irq(&lruvec->lru_lock);
  2212. move_folios_to_lru(lruvec, &folio_list);
  2213. __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
  2214. item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
  2215. if (!cgroup_reclaim(sc))
  2216. __count_vm_events(item, nr_reclaimed);
  2217. __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
  2218. __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
  2219. spin_unlock_irq(&lruvec->lru_lock);
  2220. lru_note_cost(lruvec, file, stat.nr_pageout);
  2221. mem_cgroup_uncharge_list(&folio_list);
  2222. free_unref_page_list(&folio_list);
  2223. /*
  2224. * If dirty folios are scanned that are not queued for IO, it
  2225. * implies that flushers are not doing their job. This can
  2226. * happen when memory pressure pushes dirty folios to the end of
  2227. * the LRU before the dirty limits are breached and the dirty
  2228. * data has expired. It can also happen when the proportion of
  2229. * dirty folios grows not through writes but through memory
  2230. * pressure reclaiming all the clean cache. And in some cases,
  2231. * the flushers simply cannot keep up with the allocation
  2232. * rate. Nudge the flusher threads in case they are asleep.
  2233. */
  2234. if (stat.nr_unqueued_dirty == nr_taken) {
  2235. wakeup_flusher_threads(WB_REASON_VMSCAN);
  2236. /*
  2237. * For cgroupv1 dirty throttling is achieved by waking up
  2238. * the kernel flusher here and later waiting on folios
  2239. * which are in writeback to finish (see shrink_folio_list()).
  2240. *
  2241. * Flusher may not be able to issue writeback quickly
  2242. * enough for cgroupv1 writeback throttling to work
  2243. * on a large system.
  2244. */
  2245. if (!writeback_throttling_sane(sc))
  2246. reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
  2247. }
  2248. sc->nr.dirty += stat.nr_dirty;
  2249. sc->nr.congested += stat.nr_congested;
  2250. sc->nr.unqueued_dirty += stat.nr_unqueued_dirty;
  2251. sc->nr.writeback += stat.nr_writeback;
  2252. sc->nr.immediate += stat.nr_immediate;
  2253. sc->nr.taken += nr_taken;
  2254. if (file)
  2255. sc->nr.file_taken += nr_taken;
  2256. trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id,
  2257. nr_scanned, nr_reclaimed, &stat, sc->priority, file);
  2258. return nr_reclaimed;
  2259. }
  2260. /*
  2261. * shrink_active_list() moves folios from the active LRU to the inactive LRU.
  2262. *
  2263. * We move them the other way if the folio is referenced by one or more
  2264. * processes.
  2265. *
  2266. * If the folios are mostly unmapped, the processing is fast and it is
  2267. * appropriate to hold lru_lock across the whole operation. But if
  2268. * the folios are mapped, the processing is slow (folio_referenced()), so
  2269. * we should drop lru_lock around each folio. It's impossible to balance
  2270. * this, so instead we remove the folios from the LRU while processing them.
  2271. * It is safe to rely on the active flag against the non-LRU folios in here
  2272. * because nobody will play with that bit on a non-LRU folio.
  2273. *
  2274. * The downside is that we have to touch folio->_refcount against each folio.
  2275. * But we had to alter folio->flags anyway.
  2276. */
  2277. static void shrink_active_list(unsigned long nr_to_scan,
  2278. struct lruvec *lruvec,
  2279. struct scan_control *sc,
  2280. enum lru_list lru)
  2281. {
  2282. unsigned long nr_taken;
  2283. unsigned long nr_scanned;
  2284. unsigned long vm_flags;
  2285. LIST_HEAD(l_hold); /* The folios which were snipped off */
  2286. LIST_HEAD(l_active);
  2287. LIST_HEAD(l_inactive);
  2288. unsigned nr_deactivate, nr_activate;
  2289. unsigned nr_rotated = 0;
  2290. int file = is_file_lru(lru);
  2291. struct pglist_data *pgdat = lruvec_pgdat(lruvec);
  2292. lru_add_drain();
  2293. spin_lock_irq(&lruvec->lru_lock);
  2294. nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold,
  2295. &nr_scanned, sc, lru);
  2296. __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
  2297. if (!cgroup_reclaim(sc))
  2298. __count_vm_events(PGREFILL, nr_scanned);
  2299. __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
  2300. spin_unlock_irq(&lruvec->lru_lock);
  2301. while (!list_empty(&l_hold)) {
  2302. struct folio *folio;
  2303. cond_resched();
  2304. folio = lru_to_folio(&l_hold);
  2305. list_del(&folio->lru);
  2306. if (unlikely(!folio_evictable(folio))) {
  2307. folio_putback_lru(folio);
  2308. continue;
  2309. }
  2310. if (unlikely(buffer_heads_over_limit)) {
  2311. if (folio_test_private(folio) && folio_trylock(folio)) {
  2312. if (folio_test_private(folio))
  2313. filemap_release_folio(folio, 0);
  2314. folio_unlock(folio);
  2315. }
  2316. }
  2317. /* Referenced or rmap lock contention: rotate */
  2318. if (folio_referenced(folio, 0, sc->target_mem_cgroup,
  2319. &vm_flags) != 0) {
/*
 * Identify referenced, file-backed active folios and give them one
 * more trip around the active list, so that executable code gets a
 * better chance to stay in memory under moderate memory pressure.
 * Anon folios are not likely to be evicted by use-once streaming IO,
 * plus the JVM can create lots of anon VM_EXEC folios, so we ignore
 * them here.
 */
  2329. if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) {
  2330. nr_rotated += folio_nr_pages(folio);
  2331. list_add(&folio->lru, &l_active);
  2332. continue;
  2333. }
  2334. }
  2335. folio_clear_active(folio); /* we are de-activating */
  2336. folio_set_workingset(folio);
  2337. list_add(&folio->lru, &l_inactive);
  2338. }
  2339. /*
  2340. * Move folios back to the lru list.
  2341. */
  2342. spin_lock_irq(&lruvec->lru_lock);
  2343. nr_activate = move_folios_to_lru(lruvec, &l_active);
  2344. nr_deactivate = move_folios_to_lru(lruvec, &l_inactive);
  2345. /* Keep all free folios in l_active list */
  2346. list_splice(&l_inactive, &l_active);
  2347. __count_vm_events(PGDEACTIVATE, nr_deactivate);
  2348. __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
  2349. __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
  2350. spin_unlock_irq(&lruvec->lru_lock);
  2351. mem_cgroup_uncharge_list(&l_active);
  2352. free_unref_page_list(&l_active);
  2353. trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
  2354. nr_deactivate, nr_rotated, sc->priority, file);
  2355. }
  2356. static unsigned int reclaim_folio_list(struct list_head *folio_list,
  2357. struct pglist_data *pgdat)
  2358. {
  2359. struct reclaim_stat dummy_stat;
  2360. unsigned int nr_reclaimed;
  2361. struct folio *folio;
  2362. struct scan_control sc = {
  2363. .gfp_mask = GFP_KERNEL,
  2364. .may_writepage = 1,
  2365. .may_unmap = 1,
  2366. .may_swap = 1,
  2367. .no_demotion = 1,
  2368. };
  2369. nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false);
  2370. while (!list_empty(folio_list)) {
  2371. folio = lru_to_folio(folio_list);
  2372. list_del(&folio->lru);
  2373. folio_putback_lru(folio);
  2374. }
  2375. return nr_reclaimed;
  2376. }
  2377. unsigned long reclaim_pages(struct list_head *folio_list)
  2378. {
  2379. int nid;
  2380. unsigned int nr_reclaimed = 0;
  2381. LIST_HEAD(node_folio_list);
  2382. unsigned int noreclaim_flag;
  2383. if (list_empty(folio_list))
  2384. return nr_reclaimed;
  2385. noreclaim_flag = memalloc_noreclaim_save();
  2386. nid = folio_nid(lru_to_folio(folio_list));
  2387. do {
  2388. struct folio *folio = lru_to_folio(folio_list);
  2389. if (nid == folio_nid(folio)) {
  2390. folio_clear_active(folio);
  2391. list_move(&folio->lru, &node_folio_list);
  2392. continue;
  2393. }
  2394. nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
  2395. nid = folio_nid(lru_to_folio(folio_list));
  2396. } while (!list_empty(folio_list));
  2397. nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid));
  2398. memalloc_noreclaim_restore(noreclaim_flag);
  2399. return nr_reclaimed;
  2400. }
  2401. static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
  2402. struct lruvec *lruvec, struct scan_control *sc)
  2403. {
  2404. if (is_active_lru(lru)) {
  2405. if (sc->may_deactivate & (1 << is_file_lru(lru)))
  2406. shrink_active_list(nr_to_scan, lruvec, sc, lru);
  2407. else
  2408. sc->skipped_deactivate = 1;
  2409. return 0;
  2410. }
  2411. return shrink_inactive_list(nr_to_scan, lruvec, sc, lru);
  2412. }
/*
 * The inactive anon list should be small enough that the VM never has
 * to do too much work.
 *
 * The inactive file list should be small enough to leave most memory
 * to the established workingset on the scan-resistant active list,
 * but large enough to avoid thrashing the aggregate readahead window.
 *
 * Both inactive lists should also be large enough that each inactive
 * folio has a chance to be referenced again before it is reclaimed.
 *
 * If that fails and refaulting is observed, the inactive list grows.
 *
 * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios
 * on this LRU, maintained by the pageout code. An inactive_ratio
 * of 3 means 3:1 or 25% of the folios are kept on the inactive list.
 *
 * total     target    max
 * memory    ratio     inactive
 * -------------------------------------
 *   10MB       1         5MB
 *  100MB       1        50MB
 *    1GB       3       250MB
 *   10GB      10       0.9GB
 *  100GB      31         3GB
 *    1TB     101        10GB
 *   10TB     320        32GB
 */
  2441. static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru)
  2442. {
  2443. enum lru_list active_lru = inactive_lru + LRU_ACTIVE;
  2444. unsigned long inactive, active;
  2445. unsigned long inactive_ratio;
  2446. unsigned long gb;
  2447. inactive = lruvec_page_state(lruvec, NR_LRU_BASE + inactive_lru);
  2448. active = lruvec_page_state(lruvec, NR_LRU_BASE + active_lru);
  2449. gb = (inactive + active) >> (30 - PAGE_SHIFT);
  2450. if (gb)
  2451. inactive_ratio = int_sqrt(10 * gb);
  2452. else
  2453. inactive_ratio = 1;
  2454. return inactive * inactive_ratio < active;
  2455. }
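/*
 * Illustrative sketch, not part of the original file: the table above follows
 * directly from int_sqrt(). For a 1GB LRU (inactive + active), gb = 1 and
 * int_sqrt(10 * 1) = 3, so inactive_is_low() fires once the inactive list
 * drops below roughly total / (ratio + 1), i.e. about 250MB. The hypothetical
 * helper below only restates that arithmetic; it is not used anywhere.
 */
static unsigned long __maybe_unused demo_inactive_target(unsigned long total_pages)
{
	unsigned long gb = total_pages >> (30 - PAGE_SHIFT);
	unsigned long ratio = gb ? int_sqrt(10 * gb) : 1;

	/* inactive * ratio < active  <=>  inactive < total / (ratio + 1) */
	return total_pages / (ratio + 1);
}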
  2456. enum scan_balance {
  2457. SCAN_EQUAL,
  2458. SCAN_FRACT,
  2459. SCAN_ANON,
  2460. SCAN_FILE,
  2461. };
  2462. static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc)
  2463. {
  2464. unsigned long file;
  2465. struct lruvec *target_lruvec;
  2466. bool bypass = false;
  2467. if (lru_gen_enabled())
  2468. return;
  2469. target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
  2470. /*
  2471. * Flush the memory cgroup stats, so that we read accurate per-memcg
  2472. * lruvec stats for heuristics.
  2473. */
  2474. mem_cgroup_flush_stats();
  2475. /*
  2476. * Determine the scan balance between anon and file LRUs.
  2477. */
  2478. spin_lock_irq(&target_lruvec->lru_lock);
  2479. sc->anon_cost = target_lruvec->anon_cost;
  2480. sc->file_cost = target_lruvec->file_cost;
  2481. spin_unlock_irq(&target_lruvec->lru_lock);
  2482. /*
  2483. * Target desirable inactive:active list ratios for the anon
  2484. * and file LRU lists.
  2485. */
  2486. if (!sc->force_deactivate) {
  2487. unsigned long refaults;
  2488. /*
  2489. * When refaults are being observed, it means a new
  2490. * workingset is being established. Deactivate to get
  2491. * rid of any stale active pages quickly.
  2492. */
  2493. refaults = lruvec_page_state(target_lruvec,
  2494. WORKINGSET_ACTIVATE_ANON);
  2495. if (refaults != target_lruvec->refaults[WORKINGSET_ANON] ||
  2496. inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
  2497. sc->may_deactivate |= DEACTIVATE_ANON;
  2498. else
  2499. sc->may_deactivate &= ~DEACTIVATE_ANON;
  2500. refaults = lruvec_page_state(target_lruvec,
  2501. WORKINGSET_ACTIVATE_FILE);
  2502. if (refaults != target_lruvec->refaults[WORKINGSET_FILE] ||
  2503. inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
  2504. sc->may_deactivate |= DEACTIVATE_FILE;
  2505. else
  2506. sc->may_deactivate &= ~DEACTIVATE_FILE;
  2507. } else
  2508. sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE;
  2509. /*
  2510. * If we have plenty of inactive file pages that aren't
  2511. * thrashing, try to reclaim those first before touching
  2512. * anonymous pages.
  2513. */
  2514. file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE);
  2515. if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE))
  2516. sc->cache_trim_mode = 1;
  2517. else
  2518. sc->cache_trim_mode = 0;
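/*
 * Worked example (illustrative, assuming 4KB pages and DEF_PRIORITY == 12):
 * "file >> sc->priority" is non-zero at the default priority only when there
 * are at least 4096 inactive file folios, i.e. roughly 16MB of inactive page
 * cache, so cache trimming is not attempted on nearly empty file LRUs.
 */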
  2519. trace_android_vh_file_is_tiny_bypass(sc->file_is_tiny, &bypass);
  2520. if (bypass)
  2521. return;
  2522. /*
  2523. * Prevent the reclaimer from falling into the cache trap: as
  2524. * cache pages start out inactive, every cache fault will tip
  2525. * the scan balance towards the file LRU. And as the file LRU
  2526. * shrinks, so does the window for rotation from references.
  2527. * This means we have a runaway feedback loop where a tiny
  2528. * thrashing file LRU becomes infinitely more attractive than
  2529. * anon pages. Try to detect this based on file LRU size.
  2530. */
  2531. if (!cgroup_reclaim(sc)) {
  2532. unsigned long total_high_wmark = 0;
  2533. unsigned long free, anon;
  2534. int z;
  2535. free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES);
  2536. file = node_page_state(pgdat, NR_ACTIVE_FILE) +
  2537. node_page_state(pgdat, NR_INACTIVE_FILE);
  2538. for (z = 0; z < MAX_NR_ZONES; z++) {
  2539. struct zone *zone = &pgdat->node_zones[z];
  2540. if (!managed_zone(zone))
  2541. continue;
  2542. total_high_wmark += high_wmark_pages(zone);
  2543. }
  2544. /*
  2545. * Consider anon: if that's low too, this isn't a
  2546. * runaway file reclaim problem, but rather just
  2547. * extreme pressure. Reclaim as per usual then.
  2548. */
  2549. anon = node_page_state(pgdat, NR_INACTIVE_ANON);
  2550. sc->file_is_tiny =
  2551. file + free <= total_high_wmark &&
  2552. !(sc->may_deactivate & DEACTIVATE_ANON) &&
  2553. anon >> sc->priority;
  2554. }
  2555. }
  2556. /*
  2557. * Determine how aggressively the anon and file LRU lists should be
  2558. * scanned.
  2559. *
  2560. * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan
  2561. * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan
  2562. */
  2563. static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
  2564. unsigned long *nr)
  2565. {
  2566. struct pglist_data *pgdat = lruvec_pgdat(lruvec);
  2567. struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  2568. unsigned long anon_cost, file_cost, total_cost;
  2569. int swappiness = mem_cgroup_swappiness(memcg);
  2570. u64 fraction[ANON_AND_FILE];
  2571. u64 denominator = 0; /* gcc */
  2572. enum scan_balance scan_balance;
  2573. unsigned long ap, fp;
  2574. enum lru_list lru;
  2575. bool balance_anon_file_reclaim = false;
  2576. /* If we have no swap space, do not bother scanning anon folios. */
  2577. if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) {
  2578. scan_balance = SCAN_FILE;
  2579. goto out;
  2580. }
  2581. trace_android_vh_tune_swappiness(&swappiness);
  2582. /*
  2583. * Global reclaim will swap to prevent OOM even with no
  2584. * swappiness, but memcg users want to use this knob to
  2585. * disable swapping for individual groups completely when
  2586. * using the memory controller's swap limit feature would be
  2587. * too expensive.
  2588. */
  2589. if (cgroup_reclaim(sc) && !swappiness) {
  2590. scan_balance = SCAN_FILE;
  2591. goto out;
  2592. }
  2593. /*
  2594. * Do not apply any pressure balancing cleverness when the
  2595. * system is close to OOM, scan both anon and file equally
  2596. * (unless the swappiness setting disagrees with swapping).
  2597. */
  2598. if (!sc->priority && swappiness) {
  2599. scan_balance = SCAN_EQUAL;
  2600. goto out;
  2601. }
  2602. /*
  2603. * If the system is almost out of file pages, force-scan anon.
  2604. */
  2605. if (sc->file_is_tiny) {
  2606. scan_balance = SCAN_ANON;
  2607. goto out;
  2608. }
  2609. trace_android_rvh_set_balance_anon_file_reclaim(&balance_anon_file_reclaim);
/*
 * If there is enough inactive page cache, we do not reclaim
 * anything from the anonymous working set right now. But when
 * balancing anon and page cache files for reclaim, allow swapping
 * of anon pages even if there are a number of inactive file cache
 * pages.
 */
  2616. if (!balance_anon_file_reclaim && sc->cache_trim_mode) {
  2617. scan_balance = SCAN_FILE;
  2618. goto out;
  2619. }
  2620. scan_balance = SCAN_FRACT;
  2621. /*
  2622. * Calculate the pressure balance between anon and file pages.
  2623. *
  2624. * The amount of pressure we put on each LRU is inversely
  2625. * proportional to the cost of reclaiming each list, as
  2626. * determined by the share of pages that are refaulting, times
  2627. * the relative IO cost of bringing back a swapped out
  2628. * anonymous page vs reloading a filesystem page (swappiness).
  2629. *
  2630. * Although we limit that influence to ensure no list gets
  2631. * left behind completely: at least a third of the pressure is
  2632. * applied, before swappiness.
  2633. *
  2634. * With swappiness at 100, anon and file have equal IO cost.
  2635. */
  2636. total_cost = sc->anon_cost + sc->file_cost;
  2637. anon_cost = total_cost + sc->anon_cost;
  2638. file_cost = total_cost + sc->file_cost;
  2639. total_cost = anon_cost + file_cost;
  2640. ap = swappiness * (total_cost + 1);
  2641. ap /= anon_cost + 1;
  2642. fp = (200 - swappiness) * (total_cost + 1);
  2643. fp /= file_cost + 1;
  2644. fraction[0] = ap;
  2645. fraction[1] = fp;
  2646. denominator = ap + fp;
  2647. out:
  2648. trace_android_vh_tune_scan_type(&scan_balance);
  2649. for_each_evictable_lru(lru) {
  2650. int file = is_file_lru(lru);
  2651. unsigned long lruvec_size;
  2652. unsigned long low, min;
  2653. unsigned long scan;
  2654. lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
  2655. mem_cgroup_protection(sc->target_mem_cgroup, memcg,
  2656. &min, &low);
  2657. if (min || low) {
  2658. /*
  2659. * Scale a cgroup's reclaim pressure by proportioning
  2660. * its current usage to its memory.low or memory.min
  2661. * setting.
  2662. *
  2663. * This is important, as otherwise scanning aggression
  2664. * becomes extremely binary -- from nothing as we
  2665. * approach the memory protection threshold, to totally
  2666. * nominal as we exceed it. This results in requiring
  2667. * setting extremely liberal protection thresholds. It
  2668. * also means we simply get no protection at all if we
  2669. * set it too low, which is not ideal.
  2670. *
  2671. * If there is any protection in place, we reduce scan
  2672. * pressure by how much of the total memory used is
  2673. * within protection thresholds.
  2674. *
  2675. * There is one special case: in the first reclaim pass,
  2676. * we skip over all groups that are within their low
  2677. * protection. If that fails to reclaim enough pages to
  2678. * satisfy the reclaim goal, we come back and override
  2679. * the best-effort low protection. However, we still
  2680. * ideally want to honor how well-behaved groups are in
  2681. * that case instead of simply punishing them all
  2682. * equally. As such, we reclaim them based on how much
  2683. * memory they are using, reducing the scan pressure
  2684. * again by how much of the total memory used is under
  2685. * hard protection.
  2686. */
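/*
 * Worked example (illustrative only): suppose a memcg uses 1000 pages,
 * memory.low protects 800 of them and this lruvec holds 512 pages. Then
 * scan = 512 - 512 * 800 / 1001 ~= 103, i.e. roughly the 20% of usage that
 * sits above the protection threshold still sees reclaim pressure.
 */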
  2687. unsigned long cgroup_size = mem_cgroup_size(memcg);
  2688. unsigned long protection;
  2689. /* memory.low scaling, make sure we retry before OOM */
  2690. if (!sc->memcg_low_reclaim && low > min) {
  2691. protection = low;
  2692. sc->memcg_low_skipped = 1;
  2693. } else {
  2694. protection = min;
  2695. }
  2696. /* Avoid TOCTOU with earlier protection check */
  2697. cgroup_size = max(cgroup_size, protection);
  2698. scan = lruvec_size - lruvec_size * protection /
  2699. (cgroup_size + 1);
  2700. /*
  2701. * Minimally target SWAP_CLUSTER_MAX pages to keep
  2702. * reclaim moving forwards, avoiding decrementing
  2703. * sc->priority further than desirable.
  2704. */
  2705. scan = max(scan, SWAP_CLUSTER_MAX);
  2706. } else {
  2707. scan = lruvec_size;
  2708. }
  2709. scan >>= sc->priority;
  2710. /*
  2711. * If the cgroup's already been deleted, make sure to
  2712. * scrape out the remaining cache.
  2713. */
  2714. if (!scan && !mem_cgroup_online(memcg))
  2715. scan = min(lruvec_size, SWAP_CLUSTER_MAX);
  2716. switch (scan_balance) {
  2717. case SCAN_EQUAL:
  2718. /* Scan lists relative to size */
  2719. break;
  2720. case SCAN_FRACT:
  2721. /*
  2722. * Scan types proportional to swappiness and
  2723. * their relative recent reclaim efficiency.
  2724. * Make sure we don't miss the last page on
  2725. * the offlined memory cgroups because of a
  2726. * round-off error.
  2727. */
  2728. scan = mem_cgroup_online(memcg) ?
  2729. div64_u64(scan * fraction[file], denominator) :
  2730. DIV64_U64_ROUND_UP(scan * fraction[file],
  2731. denominator);
  2732. break;
  2733. case SCAN_FILE:
  2734. case SCAN_ANON:
  2735. /* Scan one type exclusively */
  2736. if ((scan_balance == SCAN_FILE) != file)
  2737. scan = 0;
  2738. break;
  2739. default:
  2740. /* Look ma, no brain */
  2741. BUG();
  2742. }
  2743. nr[lru] = scan;
  2744. }
  2745. }
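/*
 * Illustrative sketch, not part of the original file: the SCAN_FRACT math in
 * get_scan_count() with made-up numbers. For swappiness = 60, sc->anon_cost =
 * 100 and sc->file_cost = 300, the "+ total_cost" baseline gives anon_cost =
 * 500, file_cost = 700 and total_cost = 1200, so ap ~= 143 and fp ~= 239:
 * about 37% of the pressure goes to anon and 63% to file. The hypothetical
 * helper below repeats the anon-side computation and is not used anywhere.
 */
static u64 __maybe_unused demo_anon_pressure(int swappiness,
					     unsigned long recent_anon_cost,
					     unsigned long recent_file_cost)
{
	unsigned long baseline = recent_anon_cost + recent_file_cost;
	unsigned long anon_cost = baseline + recent_anon_cost;
	unsigned long file_cost = baseline + recent_file_cost;
	unsigned long total_cost = anon_cost + file_cost;

	/* at least a third of the pressure applies regardless of swappiness */
	return div64_u64((u64)swappiness * (total_cost + 1), anon_cost + 1);
}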
  2746. /*
  2747. * Anonymous LRU management is a waste if there is
  2748. * ultimately no way to reclaim the memory.
  2749. */
  2750. static bool can_age_anon_pages(struct pglist_data *pgdat,
  2751. struct scan_control *sc)
  2752. {
  2753. /* Aging the anon LRU is valuable if swap is present: */
  2754. if (total_swap_pages > 0)
  2755. return true;
  2756. /* Also valuable if anon pages can be demoted: */
  2757. return can_demote(pgdat->node_id, sc);
  2758. }
  2759. #ifdef CONFIG_LRU_GEN
  2760. #ifdef CONFIG_LRU_GEN_ENABLED
  2761. DEFINE_STATIC_KEY_ARRAY_TRUE(lru_gen_caps, NR_LRU_GEN_CAPS);
  2762. #define get_cap(cap) static_branch_likely(&lru_gen_caps[cap])
  2763. #else
  2764. DEFINE_STATIC_KEY_ARRAY_FALSE(lru_gen_caps, NR_LRU_GEN_CAPS);
  2765. #define get_cap(cap) static_branch_unlikely(&lru_gen_caps[cap])
  2766. #endif
  2767. /******************************************************************************
  2768. * shorthand helpers
  2769. ******************************************************************************/
  2770. #define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset))
  2771. #define DEFINE_MAX_SEQ(lruvec) \
  2772. unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq)
  2773. #define DEFINE_MIN_SEQ(lruvec) \
  2774. unsigned long min_seq[ANON_AND_FILE] = { \
  2775. READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \
  2776. READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \
  2777. }
  2778. #define for_each_gen_type_zone(gen, type, zone) \
  2779. for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \
  2780. for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \
  2781. for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
  2782. #define get_memcg_gen(seq) ((seq) % MEMCG_NR_GENS)
  2783. #define get_memcg_bin(bin) ((bin) % MEMCG_NR_BINS)
  2784. static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
  2785. {
  2786. struct pglist_data *pgdat = NODE_DATA(nid);
  2787. #ifdef CONFIG_MEMCG
  2788. if (memcg) {
  2789. struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
  2790. /* for hotadd_new_pgdat() */
  2791. if (!lruvec->pgdat)
  2792. lruvec->pgdat = pgdat;
  2793. return lruvec;
  2794. }
  2795. #endif
  2796. VM_WARN_ON_ONCE(!mem_cgroup_disabled());
  2797. return pgdat ? &pgdat->__lruvec : NULL;
  2798. }
  2799. static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
  2800. {
  2801. int swappiness;
  2802. struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  2803. struct pglist_data *pgdat = lruvec_pgdat(lruvec);
  2804. if (!sc->may_swap)
  2805. return 0;
  2806. if (!can_demote(pgdat->node_id, sc) &&
  2807. mem_cgroup_get_nr_swap_pages(memcg) <= 0)
  2808. return 0;
  2809. swappiness = mem_cgroup_swappiness(memcg);
  2810. trace_android_vh_tune_swappiness(&swappiness);
  2811. return swappiness;
  2812. }
  2813. static int get_nr_gens(struct lruvec *lruvec, int type)
  2814. {
  2815. return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1;
  2816. }
  2817. static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
  2818. {
  2819. /* see the comment on lru_gen_folio */
  2820. return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS &&
  2821. get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) &&
  2822. get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS;
  2823. }
  2824. /******************************************************************************
  2825. * Bloom filters
  2826. ******************************************************************************/
/*
 * Bloom filters with m=1<<15, k=2 and the false positive rates of ~1/5 when
 * n=10,000 and ~1/2 when n=20,000, where, conventionally, m is the number of
 * bits in a bitmap, k is the number of hash functions and n is the number of
 * inserted items.
 *
 * Page table walkers use one of the two filters to reduce their search space.
 * To get rid of non-leaf entries that no longer have enough leaf entries, the
 * aging uses the double-buffering technique to flip to the other filter each
 * time it produces a new generation. For non-leaf entries that have enough
 * leaf entries, the aging carries them over to the next generation in
 * walk_pmd_range(); the eviction also reports them when walking the rmap
 * in lru_gen_look_around().
 *
 * For future optimizations:
 * 1. It's not necessary to keep both filters all the time. The spare one can be
 *    freed after the RCU grace period and reallocated if needed again.
 * 2. When reallocating, it's worth scaling its size according to the number
 *    of inserted entries in the other filter, to reduce the memory overhead on
 *    small systems and false positives on large systems.
 * 3. Jenkins' hash function is an alternative to Knuth's.
 */
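/*
 * Rough check of the quoted rates (illustrative, using the standard Bloom
 * filter approximation FPR ~= (1 - e^(-k*n/m))^k): with m = 1<<15 and k = 2,
 * n = 10,000 gives (1 - e^-0.61)^2 ~= 0.21 (~1/5) and n = 20,000 gives
 * (1 - e^-1.22)^2 ~= 0.50 (~1/2), matching the comment above.
 */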
  2849. #define BLOOM_FILTER_SHIFT 15
  2850. static inline int filter_gen_from_seq(unsigned long seq)
  2851. {
  2852. return seq % NR_BLOOM_FILTERS;
  2853. }
  2854. static void get_item_key(void *item, int *key)
  2855. {
  2856. u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
  2857. BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
  2858. key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
  2859. key[1] = hash >> BLOOM_FILTER_SHIFT;
  2860. }
  2861. static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
  2862. {
  2863. int key[2];
  2864. unsigned long *filter;
  2865. int gen = filter_gen_from_seq(seq);
  2866. filter = READ_ONCE(lruvec->mm_state.filters[gen]);
  2867. if (!filter)
  2868. return true;
  2869. get_item_key(item, key);
  2870. return test_bit(key[0], filter) && test_bit(key[1], filter);
  2871. }
  2872. static void update_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
  2873. {
  2874. int key[2];
  2875. unsigned long *filter;
  2876. int gen = filter_gen_from_seq(seq);
  2877. filter = READ_ONCE(lruvec->mm_state.filters[gen]);
  2878. if (!filter)
  2879. return;
  2880. get_item_key(item, key);
  2881. if (!test_bit(key[0], filter))
  2882. set_bit(key[0], filter);
  2883. if (!test_bit(key[1], filter))
  2884. set_bit(key[1], filter);
  2885. }
  2886. static void reset_bloom_filter(struct lruvec *lruvec, unsigned long seq)
  2887. {
  2888. unsigned long *filter;
  2889. int gen = filter_gen_from_seq(seq);
  2890. filter = lruvec->mm_state.filters[gen];
  2891. if (filter) {
  2892. bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
  2893. return;
  2894. }
  2895. filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT),
  2896. __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
  2897. WRITE_ONCE(lruvec->mm_state.filters[gen], filter);
  2898. }
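/*
 * Usage note (descriptive only): a page table walker typically calls
 * test_bloom_filter() with the current max_seq to decide whether a non-leaf
 * PMD entry is worth revisiting, and update_bloom_filter() with max_seq + 1
 * to carry entries that still have young PTEs over to the next generation;
 * see walk_pmd_range() further down.
 */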
  2899. /******************************************************************************
  2900. * mm_struct list
  2901. ******************************************************************************/
  2902. static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
  2903. {
  2904. static struct lru_gen_mm_list mm_list = {
  2905. .fifo = LIST_HEAD_INIT(mm_list.fifo),
  2906. .lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
  2907. };
  2908. #ifdef CONFIG_MEMCG
  2909. if (memcg)
  2910. return &memcg->mm_list;
  2911. #endif
  2912. VM_WARN_ON_ONCE(!mem_cgroup_disabled());
  2913. return &mm_list;
  2914. }
  2915. void lru_gen_add_mm(struct mm_struct *mm)
  2916. {
  2917. int nid;
  2918. struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
  2919. struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
  2920. VM_WARN_ON_ONCE(!list_empty(&mm->lru_gen.list));
  2921. #ifdef CONFIG_MEMCG
  2922. VM_WARN_ON_ONCE(mm->lru_gen.memcg);
  2923. mm->lru_gen.memcg = memcg;
  2924. #endif
  2925. spin_lock(&mm_list->lock);
  2926. for_each_node_state(nid, N_MEMORY) {
  2927. struct lruvec *lruvec = get_lruvec(memcg, nid);
  2928. if (!lruvec)
  2929. continue;
  2930. /* the first addition since the last iteration */
  2931. if (lruvec->mm_state.tail == &mm_list->fifo)
  2932. lruvec->mm_state.tail = &mm->lru_gen.list;
  2933. }
  2934. list_add_tail(&mm->lru_gen.list, &mm_list->fifo);
  2935. spin_unlock(&mm_list->lock);
  2936. }
  2937. void lru_gen_del_mm(struct mm_struct *mm)
  2938. {
  2939. int nid;
  2940. struct lru_gen_mm_list *mm_list;
  2941. struct mem_cgroup *memcg = NULL;
  2942. if (list_empty(&mm->lru_gen.list))
  2943. return;
  2944. #ifdef CONFIG_MEMCG
  2945. memcg = mm->lru_gen.memcg;
  2946. #endif
  2947. mm_list = get_mm_list(memcg);
  2948. spin_lock(&mm_list->lock);
  2949. for_each_node(nid) {
  2950. struct lruvec *lruvec = get_lruvec(memcg, nid);
  2951. if (!lruvec)
  2952. continue;
  2953. /* where the current iteration continues after */
  2954. if (lruvec->mm_state.head == &mm->lru_gen.list)
  2955. lruvec->mm_state.head = lruvec->mm_state.head->prev;
  2956. /* where the last iteration ended before */
  2957. if (lruvec->mm_state.tail == &mm->lru_gen.list)
  2958. lruvec->mm_state.tail = lruvec->mm_state.tail->next;
  2959. }
  2960. list_del_init(&mm->lru_gen.list);
  2961. spin_unlock(&mm_list->lock);
  2962. #ifdef CONFIG_MEMCG
  2963. mem_cgroup_put(mm->lru_gen.memcg);
  2964. mm->lru_gen.memcg = NULL;
  2965. #endif
  2966. }
  2967. #ifdef CONFIG_MEMCG
  2968. void lru_gen_migrate_mm(struct mm_struct *mm)
  2969. {
  2970. struct mem_cgroup *memcg;
  2971. struct task_struct *task = rcu_dereference_protected(mm->owner, true);
  2972. VM_WARN_ON_ONCE(task->mm != mm);
  2973. lockdep_assert_held(&task->alloc_lock);
  2974. /* for mm_update_next_owner() */
  2975. if (mem_cgroup_disabled())
  2976. return;
  2977. /* migration can happen before addition */
  2978. if (!mm->lru_gen.memcg)
  2979. return;
  2980. rcu_read_lock();
  2981. memcg = mem_cgroup_from_task(task);
  2982. rcu_read_unlock();
  2983. if (memcg == mm->lru_gen.memcg)
  2984. return;
  2985. VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
  2986. lru_gen_del_mm(mm);
  2987. lru_gen_add_mm(mm);
  2988. }
  2989. #endif
  2990. static void reset_mm_stats(struct lruvec *lruvec, struct lru_gen_mm_walk *walk, bool last)
  2991. {
  2992. int i;
  2993. int hist;
  2994. lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
  2995. if (walk) {
  2996. hist = lru_hist_from_seq(walk->max_seq);
  2997. for (i = 0; i < NR_MM_STATS; i++) {
  2998. WRITE_ONCE(lruvec->mm_state.stats[hist][i],
  2999. lruvec->mm_state.stats[hist][i] + walk->mm_stats[i]);
  3000. walk->mm_stats[i] = 0;
  3001. }
  3002. }
  3003. if (NR_HIST_GENS > 1 && last) {
  3004. hist = lru_hist_from_seq(lruvec->mm_state.seq + 1);
  3005. for (i = 0; i < NR_MM_STATS; i++)
  3006. WRITE_ONCE(lruvec->mm_state.stats[hist][i], 0);
  3007. }
  3008. }
  3009. static bool should_skip_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk)
  3010. {
  3011. int type;
  3012. unsigned long size = 0;
  3013. struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
  3014. int key = pgdat->node_id % BITS_PER_TYPE(mm->lru_gen.bitmap);
  3015. if (!walk->force_scan && !test_bit(key, &mm->lru_gen.bitmap))
  3016. return true;
  3017. clear_bit(key, &mm->lru_gen.bitmap);
  3018. for (type = !walk->can_swap; type < ANON_AND_FILE; type++) {
  3019. size += type ? get_mm_counter(mm, MM_FILEPAGES) :
  3020. get_mm_counter(mm, MM_ANONPAGES) +
  3021. get_mm_counter(mm, MM_SHMEMPAGES);
  3022. }
  3023. if (size < MIN_LRU_BATCH)
  3024. return true;
  3025. return !mmget_not_zero(mm);
  3026. }
  3027. static bool iterate_mm_list(struct lruvec *lruvec, struct lru_gen_mm_walk *walk,
  3028. struct mm_struct **iter)
  3029. {
  3030. bool first = false;
  3031. bool last = false;
  3032. struct mm_struct *mm = NULL;
  3033. struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  3034. struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
  3035. struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
  3036. /*
  3037. * mm_state->seq is incremented after each iteration of mm_list. There
  3038. * are three interesting cases for this page table walker:
  3039. * 1. It tries to start a new iteration with a stale max_seq: there is
  3040. * nothing left to do.
  3041. * 2. It started the next iteration: it needs to reset the Bloom filter
  3042. * so that a fresh set of PTE tables can be recorded.
  3043. * 3. It ended the current iteration: it needs to reset the mm stats
  3044. * counters and tell its caller to increment max_seq.
  3045. */
  3046. spin_lock(&mm_list->lock);
  3047. VM_WARN_ON_ONCE(mm_state->seq + 1 < walk->max_seq);
  3048. if (walk->max_seq <= mm_state->seq)
  3049. goto done;
  3050. if (!mm_state->head)
  3051. mm_state->head = &mm_list->fifo;
  3052. if (mm_state->head == &mm_list->fifo)
  3053. first = true;
  3054. do {
  3055. mm_state->head = mm_state->head->next;
  3056. if (mm_state->head == &mm_list->fifo) {
  3057. WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
  3058. last = true;
  3059. break;
  3060. }
  3061. /* force scan for those added after the last iteration */
  3062. if (!mm_state->tail || mm_state->tail == mm_state->head) {
  3063. mm_state->tail = mm_state->head->next;
  3064. walk->force_scan = true;
  3065. }
  3066. mm = list_entry(mm_state->head, struct mm_struct, lru_gen.list);
  3067. if (should_skip_mm(mm, walk))
  3068. mm = NULL;
  3069. } while (!mm);
  3070. done:
  3071. if (*iter || last)
  3072. reset_mm_stats(lruvec, walk, last);
  3073. spin_unlock(&mm_list->lock);
  3074. if (mm && first)
  3075. reset_bloom_filter(lruvec, walk->max_seq + 1);
  3076. if (*iter)
  3077. mmput_async(*iter);
  3078. *iter = mm;
  3079. return last;
  3080. }
  3081. static bool iterate_mm_list_nowalk(struct lruvec *lruvec, unsigned long max_seq)
  3082. {
  3083. bool success = false;
  3084. struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  3085. struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
  3086. struct lru_gen_mm_state *mm_state = &lruvec->mm_state;
  3087. spin_lock(&mm_list->lock);
  3088. VM_WARN_ON_ONCE(mm_state->seq + 1 < max_seq);
  3089. if (max_seq > mm_state->seq) {
  3090. mm_state->head = NULL;
  3091. mm_state->tail = NULL;
  3092. WRITE_ONCE(mm_state->seq, mm_state->seq + 1);
  3093. reset_mm_stats(lruvec, NULL, true);
  3094. success = true;
  3095. }
  3096. spin_unlock(&mm_list->lock);
  3097. return success;
  3098. }
  3099. /******************************************************************************
  3100. * refault feedback loop
  3101. ******************************************************************************/
  3102. /*
  3103. * A feedback loop based on Proportional-Integral-Derivative (PID) controller.
  3104. *
  3105. * The P term is refaulted/(evicted+protected) from a tier in the generation
  3106. * currently being evicted; the I term is the exponential moving average of the
  3107. * P term over the generations previously evicted, using the smoothing factor
  3108. * 1/2; the D term isn't supported.
  3109. *
  3110. * The setpoint (SP) is always the first tier of one type; the process variable
  3111. * (PV) is either any tier of the other type or any other tier of the same
  3112. * type.
  3113. *
  3114. * The error is the difference between the SP and the PV; the correction is to
  3115. * turn off protection when SP>PV or turn on protection when SP<PV.
  3116. *
  3117. * For future optimizations:
  3118. * 1. The D term may discount the other two terms over time so that long-lived
  3119. * generations can resist stale information.
  3120. */
  3121. struct ctrl_pos {
  3122. unsigned long refaulted;
  3123. unsigned long total;
  3124. int gain;
  3125. };
  3126. static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain,
  3127. struct ctrl_pos *pos)
  3128. {
  3129. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  3130. int hist = lru_hist_from_seq(lrugen->min_seq[type]);
  3131. pos->refaulted = lrugen->avg_refaulted[type][tier] +
  3132. atomic_long_read(&lrugen->refaulted[hist][type][tier]);
  3133. pos->total = lrugen->avg_total[type][tier] +
  3134. atomic_long_read(&lrugen->evicted[hist][type][tier]);
  3135. if (tier)
  3136. pos->total += lrugen->protected[hist][type][tier - 1];
  3137. pos->gain = gain;
  3138. }
  3139. static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover)
  3140. {
  3141. int hist, tier;
  3142. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  3143. bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1;
  3144. unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1;
  3145. lockdep_assert_held(&lruvec->lru_lock);
  3146. if (!carryover && !clear)
  3147. return;
  3148. hist = lru_hist_from_seq(seq);
  3149. for (tier = 0; tier < MAX_NR_TIERS; tier++) {
  3150. if (carryover) {
  3151. unsigned long sum;
  3152. sum = lrugen->avg_refaulted[type][tier] +
  3153. atomic_long_read(&lrugen->refaulted[hist][type][tier]);
  3154. WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2);
  3155. sum = lrugen->avg_total[type][tier] +
  3156. atomic_long_read(&lrugen->evicted[hist][type][tier]);
  3157. if (tier)
  3158. sum += lrugen->protected[hist][type][tier - 1];
  3159. WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2);
  3160. }
  3161. if (clear) {
  3162. atomic_long_set(&lrugen->refaulted[hist][type][tier], 0);
  3163. atomic_long_set(&lrugen->evicted[hist][type][tier], 0);
  3164. if (tier)
  3165. WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0);
  3166. }
  3167. }
  3168. }
  3169. static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv)
  3170. {
  3171. /*
  3172. * Return true if the PV has a limited number of refaults or a lower
  3173. * refaulted/total than the SP.
  3174. */
  3175. return pv->refaulted < MIN_LRU_BATCH ||
  3176. pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
  3177. (sp->refaulted + 1) * pv->total * pv->gain;
  3178. }
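/*
 * Worked example (illustrative, assuming MIN_LRU_BATCH == 64): for
 * sp = { .refaulted = 100, .total = 1000, .gain = 1 } and
 * pv = { .refaulted = 300, .total = 1000, .gain = 1 }, the PV refaults three
 * times as often as the SP, so 300 * (1000 + 64) > (100 + 1) * 1000 and
 * positive_ctrl_err() returns false: protection stays on for that tier.
 */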
  3179. /******************************************************************************
  3180. * the aging
  3181. ******************************************************************************/
  3182. /* promote pages accessed through page tables */
  3183. static int folio_update_gen(struct folio *folio, int gen)
  3184. {
  3185. unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
  3186. VM_WARN_ON_ONCE(gen >= MAX_NR_GENS);
  3187. VM_WARN_ON_ONCE(!rcu_read_lock_held());
  3188. do {
  3189. /* lru_gen_del_folio() has isolated this page? */
  3190. if (!(old_flags & LRU_GEN_MASK)) {
  3191. /* for shrink_folio_list() */
  3192. new_flags = old_flags | BIT(PG_referenced);
  3193. continue;
  3194. }
  3195. new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
  3196. new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
  3197. } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
  3198. return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
  3199. }
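/*
 * Descriptive note (not in the original source): the generation number is
 * stored in folio->flags offset by one, so a raw field value of 0 means the
 * folio is not on a multi-gen LRU list; that is why the store above uses
 * "gen + 1UL" and the return path subtracts 1 again.
 */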
  3200. /* protect pages accessed multiple times through file descriptors */
  3201. static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming)
  3202. {
  3203. int type = folio_is_file_lru(folio);
  3204. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  3205. int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
  3206. unsigned long new_flags, old_flags = READ_ONCE(folio->flags);
  3207. VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio);
  3208. do {
  3209. new_gen = ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
  3210. /* folio_update_gen() has promoted this page? */
  3211. if (new_gen >= 0 && new_gen != old_gen)
  3212. return new_gen;
  3213. new_gen = (old_gen + 1) % MAX_NR_GENS;
  3214. new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS);
  3215. new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF;
  3216. /* for folio_end_writeback() */
  3217. if (reclaiming)
  3218. new_flags |= BIT(PG_reclaim);
  3219. } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags));
  3220. lru_gen_update_size(lruvec, folio, old_gen, new_gen);
  3221. return new_gen;
  3222. }
  3223. static void update_batch_size(struct lru_gen_mm_walk *walk, struct folio *folio,
  3224. int old_gen, int new_gen)
  3225. {
  3226. int type = folio_is_file_lru(folio);
  3227. int zone = folio_zonenum(folio);
  3228. int delta = folio_nr_pages(folio);
  3229. VM_WARN_ON_ONCE(old_gen >= MAX_NR_GENS);
  3230. VM_WARN_ON_ONCE(new_gen >= MAX_NR_GENS);
  3231. walk->batched++;
  3232. walk->nr_pages[old_gen][type][zone] -= delta;
  3233. walk->nr_pages[new_gen][type][zone] += delta;
  3234. }
  3235. static void reset_batch_size(struct lruvec *lruvec, struct lru_gen_mm_walk *walk)
  3236. {
  3237. int gen, type, zone;
  3238. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  3239. walk->batched = 0;
  3240. for_each_gen_type_zone(gen, type, zone) {
  3241. enum lru_list lru = type * LRU_INACTIVE_FILE;
  3242. int delta = walk->nr_pages[gen][type][zone];
  3243. if (!delta)
  3244. continue;
  3245. walk->nr_pages[gen][type][zone] = 0;
  3246. WRITE_ONCE(lrugen->nr_pages[gen][type][zone],
  3247. lrugen->nr_pages[gen][type][zone] + delta);
  3248. if (lru_gen_is_active(lruvec, gen))
  3249. lru += LRU_ACTIVE;
  3250. __update_lru_size(lruvec, lru, zone, delta);
  3251. }
  3252. }
  3253. static int should_skip_vma(unsigned long start, unsigned long end, struct mm_walk *args)
  3254. {
  3255. struct address_space *mapping;
  3256. struct vm_area_struct *vma = args->vma;
  3257. struct lru_gen_mm_walk *walk = args->private;
  3258. if (!vma_is_accessible(vma))
  3259. return true;
  3260. if (is_vm_hugetlb_page(vma))
  3261. return true;
  3262. if (!vma_has_recency(vma))
  3263. return true;
  3264. if (vma->vm_flags & (VM_LOCKED | VM_SPECIAL))
  3265. return true;
  3266. if (vma == get_gate_vma(vma->vm_mm))
  3267. return true;
  3268. if (vma_is_anonymous(vma))
  3269. return !walk->can_swap;
  3270. if (WARN_ON_ONCE(!vma->vm_file || !vma->vm_file->f_mapping))
  3271. return true;
  3272. mapping = vma->vm_file->f_mapping;
  3273. if (mapping_unevictable(mapping))
  3274. return true;
  3275. if (shmem_mapping(mapping))
  3276. return !walk->can_swap;
  3277. /* to exclude special mappings like dax, etc. */
  3278. return !mapping->a_ops->read_folio;
  3279. }
/*
 * Some userspace memory allocators map many single-page VMAs. Instead of
 * returning to the PGD table for each such VMA, finish an entire PMD table
 * to reduce zigzags and improve cache performance.
 */
  3285. static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk *args,
  3286. unsigned long *vm_start, unsigned long *vm_end)
  3287. {
  3288. unsigned long start = round_up(*vm_end, size);
  3289. unsigned long end = (start | ~mask) + 1;
  3290. VMA_ITERATOR(vmi, args->mm, start);
  3291. VM_WARN_ON_ONCE(mask & size);
  3292. VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask));
  3293. for_each_vma(vmi, args->vma) {
  3294. if (end && end <= args->vma->vm_start)
  3295. return false;
  3296. if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args))
  3297. continue;
  3298. *vm_start = max(start, args->vma->vm_start);
  3299. *vm_end = min(end - 1, args->vma->vm_end - 1) + 1;
  3300. return true;
  3301. }
  3302. return false;
  3303. }
  3304. static unsigned long get_pte_pfn(pte_t pte, struct vm_area_struct *vma, unsigned long addr)
  3305. {
  3306. unsigned long pfn = pte_pfn(pte);
  3307. VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
  3308. if (!pte_present(pte) || is_zero_pfn(pfn))
  3309. return -1;
  3310. if (WARN_ON_ONCE(pte_devmap(pte) || pte_special(pte)))
  3311. return -1;
  3312. if (WARN_ON_ONCE(!pfn_valid(pfn)))
  3313. return -1;
  3314. return pfn;
  3315. }
  3316. #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
  3317. static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned long addr)
  3318. {
  3319. unsigned long pfn = pmd_pfn(pmd);
  3320. VM_WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end);
  3321. if (!pmd_present(pmd) || is_huge_zero_pmd(pmd))
  3322. return -1;
  3323. if (WARN_ON_ONCE(pmd_devmap(pmd)))
  3324. return -1;
  3325. if (WARN_ON_ONCE(!pfn_valid(pfn)))
  3326. return -1;
  3327. return pfn;
  3328. }
  3329. #endif
  3330. static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg,
  3331. struct pglist_data *pgdat, bool can_swap)
  3332. {
  3333. struct folio *folio;
  3334. /* try to avoid unnecessary memory loads */
  3335. if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
  3336. return NULL;
  3337. folio = pfn_folio(pfn);
  3338. if (folio_nid(folio) != pgdat->node_id)
  3339. return NULL;
  3340. if (folio_memcg_rcu(folio) != memcg)
  3341. return NULL;
  3342. /* file VMAs can contain anon pages from COW */
  3343. if (!folio_is_file_lru(folio) && !can_swap)
  3344. return NULL;
  3345. return folio;
  3346. }
  3347. static bool suitable_to_scan(int total, int young)
  3348. {
  3349. int n = clamp_t(int, cache_line_size() / sizeof(pte_t), 2, 8);
  3350. /* suitable if the average number of young PTEs per cacheline is >=1 */
  3351. return young * n >= total;
  3352. }
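/*
 * Worked example (illustrative, assuming 64-byte cache lines and 8-byte PTEs):
 * n = clamp(64 / 8, 2, 8) = 8, so a fully scanned 512-entry PTE table is
 * considered suitable once at least 64 of its PTEs were found young.
 */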
  3353. static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end,
  3354. struct mm_walk *args)
  3355. {
  3356. int i;
  3357. pte_t *pte;
  3358. spinlock_t *ptl;
  3359. unsigned long addr;
  3360. int total = 0;
  3361. int young = 0;
  3362. struct lru_gen_mm_walk *walk = args->private;
  3363. struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
  3364. struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
  3365. int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
  3366. VM_WARN_ON_ONCE(pmd_leaf(*pmd));
  3367. ptl = pte_lockptr(args->mm, pmd);
  3368. if (!spin_trylock(ptl))
  3369. return false;
  3370. arch_enter_lazy_mmu_mode();
  3371. pte = pte_offset_map(pmd, start & PMD_MASK);
  3372. restart:
  3373. for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) {
  3374. unsigned long pfn;
  3375. struct folio *folio;
  3376. total++;
  3377. walk->mm_stats[MM_LEAF_TOTAL]++;
  3378. pfn = get_pte_pfn(pte[i], args->vma, addr);
  3379. if (pfn == -1)
  3380. continue;
  3381. if (!pte_young(pte[i])) {
  3382. walk->mm_stats[MM_LEAF_OLD]++;
  3383. continue;
  3384. }
  3385. folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
  3386. if (!folio)
  3387. continue;
  3388. if (!ptep_test_and_clear_young(args->vma, addr, pte + i))
  3389. VM_WARN_ON_ONCE(true);
  3390. young++;
  3391. walk->mm_stats[MM_LEAF_YOUNG]++;
  3392. if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
  3393. !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
  3394. !folio_test_swapcache(folio)))
  3395. folio_mark_dirty(folio);
  3396. old_gen = folio_update_gen(folio, new_gen);
  3397. if (old_gen >= 0 && old_gen != new_gen)
  3398. update_batch_size(walk, folio, old_gen, new_gen);
  3399. }
  3400. if (i < PTRS_PER_PTE && get_next_vma(PMD_MASK, PAGE_SIZE, args, &start, &end))
  3401. goto restart;
  3402. pte_unmap(pte);
  3403. arch_leave_lazy_mmu_mode();
  3404. spin_unlock(ptl);
  3405. return suitable_to_scan(total, young);
  3406. }
  3407. #if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG)
  3408. static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
  3409. struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
  3410. {
  3411. int i;
  3412. pmd_t *pmd;
  3413. spinlock_t *ptl;
  3414. struct lru_gen_mm_walk *walk = args->private;
  3415. struct mem_cgroup *memcg = lruvec_memcg(walk->lruvec);
  3416. struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
  3417. int old_gen, new_gen = lru_gen_from_seq(walk->max_seq);
  3418. VM_WARN_ON_ONCE(pud_leaf(*pud));
  3419. /* try to batch at most 1+MIN_LRU_BATCH+1 entries */
  3420. if (*first == -1) {
  3421. *first = addr;
  3422. bitmap_zero(bitmap, MIN_LRU_BATCH);
  3423. return;
  3424. }
  3425. i = addr == -1 ? 0 : pmd_index(addr) - pmd_index(*first);
  3426. if (i && i <= MIN_LRU_BATCH) {
  3427. __set_bit(i - 1, bitmap);
  3428. return;
  3429. }
  3430. pmd = pmd_offset(pud, *first);
  3431. ptl = pmd_lockptr(args->mm, pmd);
  3432. if (!spin_trylock(ptl))
  3433. goto done;
  3434. arch_enter_lazy_mmu_mode();
  3435. do {
  3436. unsigned long pfn;
  3437. struct folio *folio;
  3438. /* don't round down the first address */
  3439. addr = i ? (*first & PMD_MASK) + i * PMD_SIZE : *first;
  3440. pfn = get_pmd_pfn(pmd[i], vma, addr);
  3441. if (pfn == -1)
  3442. goto next;
  3443. if (!pmd_trans_huge(pmd[i])) {
  3444. if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
  3445. pmdp_test_and_clear_young(vma, addr, pmd + i);
  3446. goto next;
  3447. }
  3448. folio = get_pfn_folio(pfn, memcg, pgdat, walk->can_swap);
  3449. if (!folio)
  3450. goto next;
  3451. if (!pmdp_test_and_clear_young(vma, addr, pmd + i))
  3452. goto next;
  3453. walk->mm_stats[MM_LEAF_YOUNG]++;
  3454. if (pmd_dirty(pmd[i]) && !folio_test_dirty(folio) &&
  3455. !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
  3456. !folio_test_swapcache(folio)))
  3457. folio_mark_dirty(folio);
  3458. old_gen = folio_update_gen(folio, new_gen);
  3459. if (old_gen >= 0 && old_gen != new_gen)
  3460. update_batch_size(walk, folio, old_gen, new_gen);
  3461. next:
  3462. i = i > MIN_LRU_BATCH ? 0 : find_next_bit(bitmap, MIN_LRU_BATCH, i) + 1;
  3463. } while (i <= MIN_LRU_BATCH);
  3464. arch_leave_lazy_mmu_mode();
  3465. spin_unlock(ptl);
  3466. done:
  3467. *first = -1;
  3468. }
  3469. #else
  3470. static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area_struct *vma,
  3471. struct mm_walk *args, unsigned long *bitmap, unsigned long *first)
  3472. {
  3473. }
  3474. #endif
  3475. static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
  3476. struct mm_walk *args)
  3477. {
  3478. int i;
  3479. pmd_t *pmd;
  3480. unsigned long next;
  3481. unsigned long addr;
  3482. struct vm_area_struct *vma;
  3483. unsigned long bitmap[BITS_TO_LONGS(MIN_LRU_BATCH)];
  3484. unsigned long first = -1;
  3485. struct lru_gen_mm_walk *walk = args->private;
  3486. VM_WARN_ON_ONCE(pud_leaf(*pud));
  3487. /*
  3488. * Finish an entire PMD in two passes: the first only reaches to PTE
  3489. * tables to avoid taking the PMD lock; the second, if necessary, takes
  3490. * the PMD lock to clear the accessed bit in PMD entries.
  3491. */
  3492. pmd = pmd_offset(pud, start & PUD_MASK);
  3493. restart:
  3494. /* walk_pte_range() may call get_next_vma() */
  3495. vma = args->vma;
  3496. for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
  3497. pmd_t val = pmd_read_atomic(pmd + i);
  3498. /* for pmd_read_atomic() */
  3499. barrier();
  3500. next = pmd_addr_end(addr, end);
  3501. if (!pmd_present(val) || is_huge_zero_pmd(val)) {
  3502. walk->mm_stats[MM_LEAF_TOTAL]++;
  3503. continue;
  3504. }
  3505. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  3506. if (pmd_trans_huge(val)) {
  3507. unsigned long pfn = pmd_pfn(val);
  3508. struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec);
  3509. walk->mm_stats[MM_LEAF_TOTAL]++;
  3510. if (!pmd_young(val)) {
  3511. walk->mm_stats[MM_LEAF_OLD]++;
  3512. continue;
  3513. }
  3514. /* try to avoid unnecessary memory loads */
  3515. if (pfn < pgdat->node_start_pfn || pfn >= pgdat_end_pfn(pgdat))
  3516. continue;
  3517. walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
  3518. continue;
  3519. }
  3520. #endif
  3521. walk->mm_stats[MM_NONLEAF_TOTAL]++;
  3522. if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG)) {
  3523. if (!pmd_young(val))
  3524. continue;
  3525. walk_pmd_range_locked(pud, addr, vma, args, bitmap, &first);
  3526. }
  3527. if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
  3528. continue;
  3529. walk->mm_stats[MM_NONLEAF_FOUND]++;
  3530. if (!walk_pte_range(&val, addr, next, args))
  3531. continue;
  3532. walk->mm_stats[MM_NONLEAF_ADDED]++;
  3533. /* carry over to the next generation */
  3534. update_bloom_filter(walk->lruvec, walk->max_seq + 1, pmd + i);
  3535. }
  3536. walk_pmd_range_locked(pud, -1, vma, args, bitmap, &first);
  3537. if (i < PTRS_PER_PMD && get_next_vma(PUD_MASK, PMD_SIZE, args, &start, &end))
  3538. goto restart;
  3539. }
  3540. static int walk_pud_range(p4d_t *p4d, unsigned long start, unsigned long end,
  3541. struct mm_walk *args)
  3542. {
  3543. int i;
  3544. pud_t *pud;
  3545. unsigned long addr;
  3546. unsigned long next;
  3547. struct lru_gen_mm_walk *walk = args->private;
  3548. VM_WARN_ON_ONCE(p4d_leaf(*p4d));
  3549. pud = pud_offset(p4d, start & P4D_MASK);
  3550. restart:
  3551. for (i = pud_index(start), addr = start; addr != end; i++, addr = next) {
  3552. pud_t val = READ_ONCE(pud[i]);
  3553. next = pud_addr_end(addr, end);
  3554. if (!pud_present(val) || WARN_ON_ONCE(pud_leaf(val)))
  3555. continue;
  3556. walk_pmd_range(&val, addr, next, args);
  3557. if (need_resched() || walk->batched >= MAX_LRU_BATCH) {
  3558. end = (addr | ~PUD_MASK) + 1;
  3559. goto done;
  3560. }
  3561. }
  3562. if (i < PTRS_PER_PUD && get_next_vma(P4D_MASK, PUD_SIZE, args, &start, &end))
  3563. goto restart;
  3564. end = round_up(end, P4D_SIZE);
  3565. done:
  3566. if (!end || !args->vma)
  3567. return 1;
  3568. walk->next_addr = max(end, args->vma->vm_start);
  3569. return -EAGAIN;
  3570. }
  3571. static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_mm_walk *walk)
  3572. {
  3573. static const struct mm_walk_ops mm_walk_ops = {
  3574. .test_walk = should_skip_vma,
  3575. .p4d_entry = walk_pud_range,
  3576. .walk_lock = PGWALK_RDLOCK,
  3577. };
  3578. int err;
  3579. struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  3580. walk->next_addr = FIRST_USER_ADDRESS;
  3581. do {
  3582. DEFINE_MAX_SEQ(lruvec);
  3583. err = -EBUSY;
  3584. /* another thread might have called inc_max_seq() */
  3585. if (walk->max_seq != max_seq)
  3586. break;
  3587. /* folio_update_gen() requires stable folio_memcg() */
  3588. if (!mem_cgroup_trylock_pages(memcg))
  3589. break;
  3590. /* the caller might be holding the lock for write */
  3591. if (mmap_read_trylock(mm)) {
  3592. err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk);
  3593. mmap_read_unlock(mm);
  3594. }
  3595. mem_cgroup_unlock_pages();
  3596. if (walk->batched) {
  3597. spin_lock_irq(&lruvec->lru_lock);
  3598. reset_batch_size(lruvec, walk);
  3599. spin_unlock_irq(&lruvec->lru_lock);
  3600. }
  3601. cond_resched();
  3602. } while (err == -EAGAIN);
  3603. }
  3604. static struct lru_gen_mm_walk *set_mm_walk(struct pglist_data *pgdat, bool force_alloc)
  3605. {
  3606. struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
  3607. if (pgdat && current_is_kswapd()) {
  3608. VM_WARN_ON_ONCE(walk);
  3609. walk = &pgdat->mm_walk;
  3610. } else if (!walk && force_alloc) {
  3611. VM_WARN_ON_ONCE(current_is_kswapd());
  3612. walk = kzalloc(sizeof(*walk), __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN);
  3613. }
  3614. current->reclaim_state->mm_walk = walk;
  3615. return walk;
  3616. }
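/*
* A quick reading of set_mm_walk() above, stated informally: kswapd reuses
* the preallocated pgdat->mm_walk buffer, while other reclaimers only get a
* buffer when force_alloc is set and the kzalloc() succeeds; a NULL return
* simply makes callers fall back to the non-walk path.
*/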
  3617. static void clear_mm_walk(void)
  3618. {
  3619. struct lru_gen_mm_walk *walk = current->reclaim_state->mm_walk;
  3620. VM_WARN_ON_ONCE(walk && memchr_inv(walk->nr_pages, 0, sizeof(walk->nr_pages)));
  3621. VM_WARN_ON_ONCE(walk && memchr_inv(walk->mm_stats, 0, sizeof(walk->mm_stats)));
  3622. current->reclaim_state->mm_walk = NULL;
  3623. if (!current_is_kswapd())
  3624. kfree(walk);
  3625. }
  3626. static bool inc_min_seq(struct lruvec *lruvec, int type, bool can_swap)
  3627. {
  3628. int zone;
  3629. int remaining = MAX_LRU_BATCH;
  3630. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  3631. int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]);
  3632. if (type == LRU_GEN_ANON && !can_swap)
  3633. goto done;
  3634. /* prevent cold/hot inversion if force_scan is true */
  3635. for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  3636. struct list_head *head = &lrugen->folios[old_gen][type][zone];
  3637. while (!list_empty(head)) {
  3638. struct folio *folio = lru_to_folio(head);
  3639. VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
  3640. VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
  3641. VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
  3642. VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
  3643. new_gen = folio_inc_gen(lruvec, folio, false);
  3644. list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]);
  3645. if (!--remaining)
  3646. return false;
  3647. }
  3648. }
  3649. done:
  3650. reset_ctrl_pos(lruvec, type, true);
  3651. WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1);
  3652. return true;
  3653. }
  3654. static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap)
  3655. {
  3656. int gen, type, zone;
  3657. bool success = false;
  3658. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  3659. DEFINE_MIN_SEQ(lruvec);
  3660. VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
  3661. /* find the oldest populated generation */
  3662. for (type = !can_swap; type < ANON_AND_FILE; type++) {
  3663. while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) {
  3664. gen = lru_gen_from_seq(min_seq[type]);
  3665. for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  3666. if (!list_empty(&lrugen->folios[gen][type][zone]))
  3667. goto next;
  3668. }
  3669. min_seq[type]++;
  3670. }
  3671. next:
  3672. ;
  3673. }
  3674. /* see the comment on lru_gen_folio */
  3675. if (can_swap) {
  3676. min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]);
  3677. min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]);
  3678. }
  3679. for (type = !can_swap; type < ANON_AND_FILE; type++) {
  3680. if (min_seq[type] == lrugen->min_seq[type])
  3681. continue;
  3682. reset_ctrl_pos(lruvec, type, true);
  3683. WRITE_ONCE(lrugen->min_seq[type], min_seq[type]);
  3684. success = true;
  3685. }
  3686. return success;
  3687. }
  3688. static void inc_max_seq(struct lruvec *lruvec, bool can_swap, bool force_scan)
  3689. {
  3690. int prev, next;
  3691. int type, zone;
  3692. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  3693. restart:
  3694. spin_lock_irq(&lruvec->lru_lock);
  3695. VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
  3696. for (type = ANON_AND_FILE - 1; type >= 0; type--) {
  3697. if (get_nr_gens(lruvec, type) != MAX_NR_GENS)
  3698. continue;
  3699. VM_WARN_ON_ONCE(!force_scan && (type == LRU_GEN_FILE || can_swap));
  3700. if (inc_min_seq(lruvec, type, can_swap))
  3701. continue;
  3702. spin_unlock_irq(&lruvec->lru_lock);
  3703. cond_resched();
  3704. goto restart;
  3705. }
  3706. /*
  3707. * Update the active/inactive LRU sizes for compatibility. Both sides of
  3708. * the current max_seq need to be covered, since max_seq+1 can overlap
  3709. * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do
  3710. * overlap, cold/hot inversion happens.
  3711. */
  3712. prev = lru_gen_from_seq(lrugen->max_seq - 1);
  3713. next = lru_gen_from_seq(lrugen->max_seq + 1);
  3714. for (type = 0; type < ANON_AND_FILE; type++) {
  3715. for (zone = 0; zone < MAX_NR_ZONES; zone++) {
  3716. enum lru_list lru = type * LRU_INACTIVE_FILE;
  3717. long delta = lrugen->nr_pages[prev][type][zone] -
  3718. lrugen->nr_pages[next][type][zone];
  3719. if (!delta)
  3720. continue;
  3721. __update_lru_size(lruvec, lru, zone, delta);
  3722. __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta);
  3723. }
  3724. }
  3725. for (type = 0; type < ANON_AND_FILE; type++)
  3726. reset_ctrl_pos(lruvec, type, false);
  3727. WRITE_ONCE(lrugen->timestamps[next], jiffies);
  3728. /* make sure preceding modifications appear */
  3729. smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1);
  3730. spin_unlock_irq(&lruvec->lru_lock);
  3731. }
  3732. static bool try_to_inc_max_seq(struct lruvec *lruvec, unsigned long max_seq,
  3733. struct scan_control *sc, bool can_swap, bool force_scan)
  3734. {
  3735. bool success;
  3736. struct lru_gen_mm_walk *walk;
  3737. struct mm_struct *mm = NULL;
  3738. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  3739. VM_WARN_ON_ONCE(max_seq > READ_ONCE(lrugen->max_seq));
  3740. /* see the comment in iterate_mm_list() */
  3741. if (max_seq <= READ_ONCE(lruvec->mm_state.seq)) {
  3742. success = false;
  3743. goto done;
  3744. }
  3745. /*
3746. * If the hardware doesn't automatically set the accessed bit, fall back

  3747. * to lru_gen_look_around(), which only clears the accessed bit in a
  3748. * handful of PTEs. Spreading the work out over a period of time usually
  3749. * is less efficient, but it avoids bursty page faults.
  3750. */
  3751. if (!arch_has_hw_pte_young() || !get_cap(LRU_GEN_MM_WALK)) {
  3752. success = iterate_mm_list_nowalk(lruvec, max_seq);
  3753. goto done;
  3754. }
  3755. walk = set_mm_walk(NULL, true);
  3756. if (!walk) {
  3757. success = iterate_mm_list_nowalk(lruvec, max_seq);
  3758. goto done;
  3759. }
  3760. walk->lruvec = lruvec;
  3761. walk->max_seq = max_seq;
  3762. walk->can_swap = can_swap;
  3763. walk->force_scan = force_scan;
  3764. do {
  3765. success = iterate_mm_list(lruvec, walk, &mm);
  3766. if (mm)
  3767. walk_mm(lruvec, mm, walk);
  3768. } while (mm);
  3769. done:
  3770. if (success)
  3771. inc_max_seq(lruvec, can_swap, force_scan);
  3772. return success;
  3773. }
  3774. /******************************************************************************
  3775. * working set protection
  3776. ******************************************************************************/
  3777. static bool lruvec_is_sizable(struct lruvec *lruvec, struct scan_control *sc)
  3778. {
  3779. int gen, type, zone;
  3780. unsigned long total = 0;
  3781. bool can_swap = get_swappiness(lruvec, sc);
  3782. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  3783. struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  3784. DEFINE_MAX_SEQ(lruvec);
  3785. DEFINE_MIN_SEQ(lruvec);
  3786. for (type = !can_swap; type < ANON_AND_FILE; type++) {
  3787. unsigned long seq;
  3788. for (seq = min_seq[type]; seq <= max_seq; seq++) {
  3789. gen = lru_gen_from_seq(seq);
  3790. for (zone = 0; zone < MAX_NR_ZONES; zone++)
  3791. total += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
  3792. }
  3793. }
  3794. /* whether the size is big enough to be helpful */
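/*
* Illustrative arithmetic, not authoritative: total >> sc->priority is
* nonzero only when total >= 2^priority pages, so at DEF_PRIORITY (12) an
* online memcg needs at least 4096 pages, about 16 MiB with 4 KiB pages,
* to count as sizable here.
*/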
  3795. return mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
  3796. }
  3797. static bool lruvec_is_reclaimable(struct lruvec *lruvec, struct scan_control *sc,
  3798. unsigned long min_ttl)
  3799. {
  3800. int gen;
  3801. unsigned long birth;
  3802. struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  3803. DEFINE_MIN_SEQ(lruvec);
  3804. /* see the comment on lru_gen_folio */
  3805. gen = lru_gen_from_seq(min_seq[LRU_GEN_FILE]);
  3806. birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
  3807. if (time_is_after_jiffies(birth + min_ttl))
  3808. return false;
  3809. if (!lruvec_is_sizable(lruvec, sc))
  3810. return false;
  3811. mem_cgroup_calculate_protection(NULL, memcg);
  3812. return !mem_cgroup_below_min(memcg);
  3813. }
  3814. /* to protect the working set of the last N jiffies */
  3815. static unsigned long lru_gen_min_ttl __read_mostly;
  3816. static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  3817. {
  3818. struct mem_cgroup *memcg;
  3819. unsigned long min_ttl = READ_ONCE(lru_gen_min_ttl);
  3820. VM_WARN_ON_ONCE(!current_is_kswapd());
  3821. /* check the order to exclude compaction-induced reclaim */
  3822. if (!min_ttl || sc->order || sc->priority == DEF_PRIORITY)
  3823. return;
  3824. memcg = mem_cgroup_iter(NULL, NULL, NULL);
  3825. do {
  3826. struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
  3827. if (lruvec_is_reclaimable(lruvec, sc, min_ttl)) {
  3828. mem_cgroup_iter_break(NULL, memcg);
  3829. return;
  3830. }
  3831. cond_resched();
  3832. } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
  3833. /*
  3834. * The main goal is to OOM kill if every generation from all memcgs is
  3835. * younger than min_ttl. However, another possibility is all memcgs are
  3836. * either too small or below min.
  3837. */
  3838. if (mutex_trylock(&oom_lock)) {
  3839. struct oom_control oc = {
  3840. .gfp_mask = sc->gfp_mask,
  3841. };
  3842. out_of_memory(&oc);
  3843. mutex_unlock(&oom_lock);
  3844. }
  3845. }
  3846. /******************************************************************************
  3847. * rmap/PT walk feedback
  3848. ******************************************************************************/
  3849. /*
  3850. * This function exploits spatial locality when shrink_folio_list() walks the
  3851. * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If
  3852. * the scan was done cacheline efficiently, it adds the PMD entry pointing to
  3853. * the PTE table to the Bloom filter. This forms a feedback loop between the
  3854. * eviction and the aging.
  3855. */
  3856. void lru_gen_look_around(struct page_vma_mapped_walk *pvmw)
  3857. {
  3858. int i;
  3859. unsigned long start;
  3860. unsigned long end;
  3861. struct lru_gen_mm_walk *walk;
  3862. int young = 0;
  3863. pte_t *pte = pvmw->pte;
  3864. unsigned long addr = pvmw->address;
  3865. struct folio *folio = pfn_folio(pvmw->pfn);
  3866. bool can_swap = !folio_is_file_lru(folio);
  3867. struct mem_cgroup *memcg = folio_memcg(folio);
  3868. struct pglist_data *pgdat = folio_pgdat(folio);
  3869. struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
  3870. DEFINE_MAX_SEQ(lruvec);
  3871. int old_gen, new_gen = lru_gen_from_seq(max_seq);
  3872. lockdep_assert_held(pvmw->ptl);
  3873. VM_WARN_ON_ONCE_FOLIO(folio_test_lru(folio), folio);
  3874. if (spin_is_contended(pvmw->ptl))
  3875. return;
  3876. /* avoid taking the LRU lock under the PTL when possible */
  3877. walk = current->reclaim_state ? current->reclaim_state->mm_walk : NULL;
  3878. start = max(addr & PMD_MASK, pvmw->vma->vm_start);
  3879. end = min(addr | ~PMD_MASK, pvmw->vma->vm_end - 1) + 1;
  3880. if (end - start > MIN_LRU_BATCH * PAGE_SIZE) {
  3881. if (addr - start < MIN_LRU_BATCH * PAGE_SIZE / 2)
  3882. end = start + MIN_LRU_BATCH * PAGE_SIZE;
  3883. else if (end - addr < MIN_LRU_BATCH * PAGE_SIZE / 2)
  3884. start = end - MIN_LRU_BATCH * PAGE_SIZE;
  3885. else {
  3886. start = addr - MIN_LRU_BATCH * PAGE_SIZE / 2;
  3887. end = addr + MIN_LRU_BATCH * PAGE_SIZE / 2;
  3888. }
  3889. }
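/*
* Rough sketch of the clamping above, assuming MIN_LRU_BATCH is defined as
* BITS_PER_LONG (64 on 64-bit kernels): the look-around window is capped at
* 64 PTEs (256 KiB with 4 KiB pages), centered on the faulting address when
* possible and otherwise pinned to whichever PMD/VMA boundary is closer.
*/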
  3890. /* folio_update_gen() requires stable folio_memcg() */
  3891. if (!mem_cgroup_trylock_pages(memcg))
  3892. return;
  3893. arch_enter_lazy_mmu_mode();
  3894. pte -= (addr - start) / PAGE_SIZE;
  3895. for (i = 0, addr = start; addr != end; i++, addr += PAGE_SIZE) {
  3896. unsigned long pfn;
  3897. pfn = get_pte_pfn(pte[i], pvmw->vma, addr);
  3898. if (pfn == -1)
  3899. continue;
  3900. if (!pte_young(pte[i]))
  3901. continue;
  3902. folio = get_pfn_folio(pfn, memcg, pgdat, can_swap);
  3903. if (!folio)
  3904. continue;
  3905. if (!ptep_test_and_clear_young(pvmw->vma, addr, pte + i))
  3906. VM_WARN_ON_ONCE(true);
  3907. young++;
  3908. if (pte_dirty(pte[i]) && !folio_test_dirty(folio) &&
  3909. !(folio_test_anon(folio) && folio_test_swapbacked(folio) &&
  3910. !folio_test_swapcache(folio)))
  3911. folio_mark_dirty(folio);
  3912. if (walk) {
  3913. old_gen = folio_update_gen(folio, new_gen);
  3914. if (old_gen >= 0 && old_gen != new_gen)
  3915. update_batch_size(walk, folio, old_gen, new_gen);
  3916. continue;
  3917. }
  3918. old_gen = folio_lru_gen(folio);
  3919. if (old_gen < 0)
  3920. folio_set_referenced(folio);
  3921. else if (old_gen != new_gen)
  3922. folio_activate(folio);
  3923. }
  3924. arch_leave_lazy_mmu_mode();
  3925. mem_cgroup_unlock_pages();
  3926. /* feedback from rmap walkers to page table walkers */
  3927. if (suitable_to_scan(i, young))
  3928. update_bloom_filter(lruvec, max_seq, pvmw->pmd);
  3929. }
  3930. /******************************************************************************
  3931. * memcg LRU
  3932. ******************************************************************************/
  3933. /* see the comment on MEMCG_NR_GENS */
  3934. enum {
  3935. MEMCG_LRU_NOP,
  3936. MEMCG_LRU_HEAD,
  3937. MEMCG_LRU_TAIL,
  3938. MEMCG_LRU_OLD,
  3939. MEMCG_LRU_YOUNG,
  3940. };
  3941. #ifdef CONFIG_MEMCG
  3942. static int lru_gen_memcg_seg(struct lruvec *lruvec)
  3943. {
  3944. return READ_ONCE(lruvec->lrugen.seg);
  3945. }
  3946. static void lru_gen_rotate_memcg(struct lruvec *lruvec, int op)
  3947. {
  3948. int seg;
  3949. int old, new;
  3950. unsigned long flags;
  3951. int bin = get_random_u32_below(MEMCG_NR_BINS);
  3952. struct pglist_data *pgdat = lruvec_pgdat(lruvec);
  3953. spin_lock_irqsave(&pgdat->memcg_lru.lock, flags);
  3954. VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
  3955. seg = 0;
  3956. new = old = lruvec->lrugen.gen;
  3957. /* see the comment on MEMCG_NR_GENS */
  3958. if (op == MEMCG_LRU_HEAD)
  3959. seg = MEMCG_LRU_HEAD;
  3960. else if (op == MEMCG_LRU_TAIL)
  3961. seg = MEMCG_LRU_TAIL;
  3962. else if (op == MEMCG_LRU_OLD)
  3963. new = get_memcg_gen(pgdat->memcg_lru.seq);
  3964. else if (op == MEMCG_LRU_YOUNG)
  3965. new = get_memcg_gen(pgdat->memcg_lru.seq + 1);
  3966. else
  3967. VM_WARN_ON_ONCE(true);
  3968. hlist_nulls_del_rcu(&lruvec->lrugen.list);
  3969. if (op == MEMCG_LRU_HEAD || op == MEMCG_LRU_OLD)
  3970. hlist_nulls_add_head_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
  3971. else
  3972. hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[new][bin]);
  3973. pgdat->memcg_lru.nr_memcgs[old]--;
  3974. pgdat->memcg_lru.nr_memcgs[new]++;
  3975. lruvec->lrugen.gen = new;
  3976. WRITE_ONCE(lruvec->lrugen.seg, seg);
  3977. if (!pgdat->memcg_lru.nr_memcgs[old] && old == get_memcg_gen(pgdat->memcg_lru.seq))
  3978. WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
  3979. spin_unlock_irqrestore(&pgdat->memcg_lru.lock, flags);
  3980. }
  3981. void lru_gen_online_memcg(struct mem_cgroup *memcg)
  3982. {
  3983. int gen;
  3984. int nid;
  3985. int bin = get_random_u32_below(MEMCG_NR_BINS);
  3986. for_each_node(nid) {
  3987. struct pglist_data *pgdat = NODE_DATA(nid);
  3988. struct lruvec *lruvec = get_lruvec(memcg, nid);
  3989. spin_lock_irq(&pgdat->memcg_lru.lock);
  3990. VM_WARN_ON_ONCE(!hlist_nulls_unhashed(&lruvec->lrugen.list));
  3991. gen = get_memcg_gen(pgdat->memcg_lru.seq);
  3992. hlist_nulls_add_tail_rcu(&lruvec->lrugen.list, &pgdat->memcg_lru.fifo[gen][bin]);
  3993. pgdat->memcg_lru.nr_memcgs[gen]++;
  3994. lruvec->lrugen.gen = gen;
  3995. spin_unlock_irq(&pgdat->memcg_lru.lock);
  3996. }
  3997. }
  3998. void lru_gen_offline_memcg(struct mem_cgroup *memcg)
  3999. {
  4000. int nid;
  4001. for_each_node(nid) {
  4002. struct lruvec *lruvec = get_lruvec(memcg, nid);
  4003. lru_gen_rotate_memcg(lruvec, MEMCG_LRU_OLD);
  4004. }
  4005. }
  4006. void lru_gen_release_memcg(struct mem_cgroup *memcg)
  4007. {
  4008. int gen;
  4009. int nid;
  4010. for_each_node(nid) {
  4011. struct pglist_data *pgdat = NODE_DATA(nid);
  4012. struct lruvec *lruvec = get_lruvec(memcg, nid);
  4013. spin_lock_irq(&pgdat->memcg_lru.lock);
  4014. if (hlist_nulls_unhashed(&lruvec->lrugen.list))
  4015. goto unlock;
  4016. gen = lruvec->lrugen.gen;
  4017. hlist_nulls_del_init_rcu(&lruvec->lrugen.list);
  4018. pgdat->memcg_lru.nr_memcgs[gen]--;
  4019. if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
  4020. WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
  4021. unlock:
  4022. spin_unlock_irq(&pgdat->memcg_lru.lock);
  4023. }
  4024. }
  4025. void lru_gen_soft_reclaim(struct lruvec *lruvec)
  4026. {
  4027. /* see the comment on MEMCG_NR_GENS */
  4028. if (lru_gen_memcg_seg(lruvec) != MEMCG_LRU_HEAD)
  4029. lru_gen_rotate_memcg(lruvec, MEMCG_LRU_HEAD);
  4030. }
  4031. #else /* !CONFIG_MEMCG */
  4032. static int lru_gen_memcg_seg(struct lruvec *lruvec)
  4033. {
  4034. return 0;
  4035. }
  4036. #endif
  4037. /******************************************************************************
  4038. * the eviction
  4039. ******************************************************************************/
  4040. static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc,
  4041. int tier_idx)
  4042. {
  4043. bool success;
  4044. int gen = folio_lru_gen(folio);
  4045. int type = folio_is_file_lru(folio);
  4046. int zone = folio_zonenum(folio);
  4047. int delta = folio_nr_pages(folio);
  4048. int refs = folio_lru_refs(folio);
  4049. int tier = lru_tier_from_refs(refs);
  4050. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  4051. VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio);
  4052. /* unevictable */
  4053. if (!folio_evictable(folio)) {
  4054. success = lru_gen_del_folio(lruvec, folio, true);
  4055. VM_WARN_ON_ONCE_FOLIO(!success, folio);
  4056. folio_set_unevictable(folio);
  4057. lruvec_add_folio(lruvec, folio);
  4058. __count_vm_events(UNEVICTABLE_PGCULLED, delta);
  4059. return true;
  4060. }
  4061. /* dirty lazyfree */
  4062. if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) {
  4063. success = lru_gen_del_folio(lruvec, folio, true);
  4064. VM_WARN_ON_ONCE_FOLIO(!success, folio);
  4065. folio_set_swapbacked(folio);
  4066. lruvec_add_folio_tail(lruvec, folio);
  4067. return true;
  4068. }
  4069. /* promoted */
  4070. if (gen != lru_gen_from_seq(lrugen->min_seq[type])) {
  4071. list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
  4072. return true;
  4073. }
  4074. /* protected */
  4075. if (tier > tier_idx) {
  4076. int hist = lru_hist_from_seq(lrugen->min_seq[type]);
  4077. gen = folio_inc_gen(lruvec, folio, false);
  4078. list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
  4079. WRITE_ONCE(lrugen->protected[hist][type][tier - 1],
  4080. lrugen->protected[hist][type][tier - 1] + delta);
  4081. return true;
  4082. }
  4083. /* ineligible */
  4084. if (zone > sc->reclaim_idx || skip_cma(folio, sc)) {
  4085. gen = folio_inc_gen(lruvec, folio, false);
  4086. list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]);
  4087. return true;
  4088. }
  4095. /* waiting for writeback */
  4096. if (folio_test_locked(folio) || folio_test_writeback(folio) ||
  4097. (type == LRU_GEN_FILE && folio_test_dirty(folio))) {
  4098. gen = folio_inc_gen(lruvec, folio, true);
  4099. list_move(&folio->lru, &lrugen->folios[gen][type][zone]);
  4100. return true;
  4101. }
  4102. return false;
  4103. }
  4104. static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc)
  4105. {
  4106. bool success;
  4107. /* swapping inhibited */
  4108. if (!(sc->gfp_mask & __GFP_IO) &&
  4109. (folio_test_dirty(folio) ||
  4110. (folio_test_anon(folio) && !folio_test_swapcache(folio))))
  4111. return false;
  4112. /* raced with release_pages() */
  4113. if (!folio_try_get(folio))
  4114. return false;
  4115. /* raced with another isolation */
  4116. if (!folio_test_clear_lru(folio)) {
  4117. folio_put(folio);
  4118. return false;
  4119. }
  4120. /* see the comment on MAX_NR_TIERS */
  4121. if (!folio_test_referenced(folio))
  4122. set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);
  4123. /* for shrink_folio_list() */
  4124. folio_clear_reclaim(folio);
  4125. folio_clear_referenced(folio);
  4126. success = lru_gen_del_folio(lruvec, folio, true);
  4127. VM_WARN_ON_ONCE_FOLIO(!success, folio);
  4128. return true;
  4129. }
  4130. static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
  4131. int type, int tier, struct list_head *list)
  4132. {
  4133. int i;
  4134. int gen;
  4135. enum vm_event_item item;
  4136. int sorted = 0;
  4137. int scanned = 0;
  4138. int isolated = 0;
  4139. int remaining = MAX_LRU_BATCH;
  4140. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  4141. struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  4142. VM_WARN_ON_ONCE(!list_empty(list));
  4143. if (get_nr_gens(lruvec, type) == MIN_NR_GENS)
  4144. return 0;
  4145. gen = lru_gen_from_seq(lrugen->min_seq[type]);
  4146. for (i = MAX_NR_ZONES; i > 0; i--) {
  4147. LIST_HEAD(moved);
  4148. int skipped = 0;
  4149. int zone = (sc->reclaim_idx + i) % MAX_NR_ZONES;
  4150. struct list_head *head = &lrugen->folios[gen][type][zone];
  4151. while (!list_empty(head)) {
  4152. struct folio *folio = lru_to_folio(head);
  4153. int delta = folio_nr_pages(folio);
  4154. VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
  4155. VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
  4156. VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
  4157. VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
  4158. scanned += delta;
  4159. if (sort_folio(lruvec, folio, sc, tier))
  4160. sorted += delta;
  4161. else if (isolate_folio(lruvec, folio, sc)) {
  4162. list_add(&folio->lru, list);
  4163. isolated += delta;
  4164. } else {
  4165. list_move(&folio->lru, &moved);
  4166. skipped += delta;
  4167. }
  4168. if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH)
  4169. break;
  4170. }
  4171. if (skipped) {
  4172. list_splice(&moved, head);
  4173. __count_zid_vm_events(PGSCAN_SKIP, zone, skipped);
  4174. }
  4175. if (!remaining || isolated >= MIN_LRU_BATCH)
  4176. break;
  4177. }
  4178. item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
  4179. if (!cgroup_reclaim(sc)) {
  4180. __count_vm_events(item, isolated);
  4181. __count_vm_events(PGREFILL, sorted);
  4182. }
  4183. __count_memcg_events(memcg, item, isolated);
  4184. __count_memcg_events(memcg, PGREFILL, sorted);
  4185. __count_vm_events(PGSCAN_ANON + type, isolated);
  4186. /*
  4187. * There might not be eligible folios due to reclaim_idx. Check the
  4188. * remaining to prevent livelock if it's not making progress.
  4189. */
  4190. return isolated || !remaining ? scanned : 0;
  4191. }
  4192. static int get_tier_idx(struct lruvec *lruvec, int type)
  4193. {
  4194. int tier;
  4195. struct ctrl_pos sp, pv;
  4196. /*
  4197. * To leave a margin for fluctuations, use a larger gain factor (1:2).
  4198. * This value is chosen because any other tier would have at least twice
  4199. * as many refaults as the first tier.
  4200. */
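/*
* Rough reading of the loop below, not authoritative: tier 0 is sampled
* with gain 1 and higher tiers with gain 2, so the search stops at the
* first tier refaulting more than roughly twice as often as tier 0; that
* tier and everything above it remain protected, and the returned index is
* the last tier sort_folio() may evict.
*/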
  4201. read_ctrl_pos(lruvec, type, 0, 1, &sp);
  4202. for (tier = 1; tier < MAX_NR_TIERS; tier++) {
  4203. read_ctrl_pos(lruvec, type, tier, 2, &pv);
  4204. if (!positive_ctrl_err(&sp, &pv))
  4205. break;
  4206. }
  4207. return tier - 1;
  4208. }
  4209. static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx)
  4210. {
  4211. int type, tier;
  4212. struct ctrl_pos sp, pv;
  4213. int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness };
  4214. /*
  4215. * Compare the first tier of anon with that of file to determine which
  4216. * type to scan. Also need to compare other tiers of the selected type
  4217. * with the first tier of the other type to determine the last tier (of
  4218. * the selected type) to evict.
  4219. */
  4220. read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp);
  4221. read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv);
  4222. type = positive_ctrl_err(&sp, &pv);
  4223. read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp);
  4224. for (tier = 1; tier < MAX_NR_TIERS; tier++) {
  4225. read_ctrl_pos(lruvec, type, tier, gain[type], &pv);
  4226. if (!positive_ctrl_err(&sp, &pv))
  4227. break;
  4228. }
  4229. *tier_idx = tier - 1;
  4230. return type;
  4231. }
  4232. static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness,
  4233. int *type_scanned, struct list_head *list)
  4234. {
  4235. int i;
  4236. int type;
  4237. int scanned;
  4238. int tier = -1;
  4239. DEFINE_MIN_SEQ(lruvec);
  4240. /*
  4241. * Try to make the obvious choice first. When anon and file are both
  4242. * available from the same generation, interpret swappiness 1 as file
  4243. * first and 200 as anon first.
  4244. */
  4245. if (!swappiness)
  4246. type = LRU_GEN_FILE;
  4247. else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE])
  4248. type = LRU_GEN_ANON;
  4249. else if (swappiness == 1)
  4250. type = LRU_GEN_FILE;
  4251. else if (swappiness == 200)
  4252. type = LRU_GEN_ANON;
  4253. else
  4254. type = get_type_to_scan(lruvec, swappiness, &tier);
  4255. for (i = !swappiness; i < ANON_AND_FILE; i++) {
  4256. if (tier < 0)
  4257. tier = get_tier_idx(lruvec, type);
  4258. scanned = scan_folios(lruvec, sc, type, tier, list);
  4259. if (scanned)
  4260. break;
  4261. type = !type;
  4262. tier = -1;
  4263. }
  4264. *type_scanned = type;
  4265. return scanned;
  4266. }
  4267. static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness)
  4268. {
  4269. int type;
  4270. int scanned;
  4271. int reclaimed;
  4272. LIST_HEAD(list);
  4273. LIST_HEAD(clean);
  4274. struct folio *folio;
  4275. struct folio *next;
  4276. enum vm_event_item item;
  4277. struct reclaim_stat stat;
  4278. struct lru_gen_mm_walk *walk;
  4279. bool skip_retry = false;
  4280. struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  4281. struct pglist_data *pgdat = lruvec_pgdat(lruvec);
  4282. spin_lock_irq(&lruvec->lru_lock);
  4283. scanned = isolate_folios(lruvec, sc, swappiness, &type, &list);
  4284. scanned += try_to_inc_min_seq(lruvec, swappiness);
  4285. if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS)
  4286. scanned = 0;
  4287. spin_unlock_irq(&lruvec->lru_lock);
  4288. if (list_empty(&list))
  4289. return scanned;
  4290. retry:
  4291. reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
  4292. sc->nr_reclaimed += reclaimed;
  4293. list_for_each_entry_safe_reverse(folio, next, &list, lru) {
  4294. if (!folio_evictable(folio)) {
  4295. list_del(&folio->lru);
  4296. folio_putback_lru(folio);
  4297. continue;
  4298. }
  4299. if (folio_test_reclaim(folio) &&
  4300. (folio_test_dirty(folio) || folio_test_writeback(folio))) {
  4301. /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
  4302. if (folio_test_workingset(folio))
  4303. folio_set_referenced(folio);
  4304. continue;
  4305. }
  4306. if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
  4307. folio_mapped(folio) || folio_test_locked(folio) ||
  4308. folio_test_dirty(folio) || folio_test_writeback(folio)) {
  4309. /* don't add rejected folios to the oldest generation */
  4310. set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
  4311. BIT(PG_active));
  4312. continue;
  4313. }
  4314. /* retry folios that may have missed folio_rotate_reclaimable() */
  4315. list_move(&folio->lru, &clean);
  4316. sc->nr_scanned -= folio_nr_pages(folio);
  4317. }
  4318. spin_lock_irq(&lruvec->lru_lock);
  4319. move_folios_to_lru(lruvec, &list);
  4320. walk = current->reclaim_state->mm_walk;
  4321. if (walk && walk->batched)
  4322. reset_batch_size(lruvec, walk);
  4323. item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
  4324. if (!cgroup_reclaim(sc))
  4325. __count_vm_events(item, reclaimed);
  4326. __count_memcg_events(memcg, item, reclaimed);
  4327. __count_vm_events(PGSTEAL_ANON + type, reclaimed);
  4328. spin_unlock_irq(&lruvec->lru_lock);
  4329. mem_cgroup_uncharge_list(&list);
  4330. free_unref_page_list(&list);
  4331. INIT_LIST_HEAD(&list);
  4332. list_splice_init(&clean, &list);
  4333. if (!list_empty(&list)) {
  4334. skip_retry = true;
  4335. goto retry;
  4336. }
  4337. return scanned;
  4338. }
  4339. static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq,
  4340. struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan)
  4341. {
  4342. int gen, type, zone;
  4343. unsigned long old = 0;
  4344. unsigned long young = 0;
  4345. unsigned long total = 0;
  4346. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  4347. struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  4348. DEFINE_MIN_SEQ(lruvec);
  4349. /* whether this lruvec is completely out of cold folios */
  4350. if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) {
  4351. *nr_to_scan = 0;
  4352. return true;
  4353. }
  4354. for (type = !can_swap; type < ANON_AND_FILE; type++) {
  4355. unsigned long seq;
  4356. for (seq = min_seq[type]; seq <= max_seq; seq++) {
  4357. unsigned long size = 0;
  4358. gen = lru_gen_from_seq(seq);
  4359. for (zone = 0; zone < MAX_NR_ZONES; zone++)
  4360. size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
  4361. total += size;
  4362. if (seq == max_seq)
  4363. young += size;
  4364. else if (seq + MIN_NR_GENS == max_seq)
  4365. old += size;
  4366. }
  4367. }
  4368. /* try to scrape all its memory if this memcg was deleted */
  4369. *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total;
  4370. /*
  4371. * The aging tries to be lazy to reduce the overhead, while the eviction
  4372. * stalls when the number of generations reaches MIN_NR_GENS. Hence, the
  4373. * ideal number of generations is MIN_NR_GENS+1.
  4374. */
  4375. if (min_seq[!can_swap] + MIN_NR_GENS < max_seq)
  4376. return false;
  4377. /*
  4378. * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1)
  4379. * of the total number of pages for each generation. A reasonable range
  4380. * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The
  4381. * aging cares about the upper bound of hot pages, while the eviction
  4382. * cares about the lower bound of cold pages.
  4383. */
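/*
* Worked example, assuming MIN_NR_GENS == 2 as in current kernels: aging is
* requested when the youngest generation holds more than half of the pages
* (young * 2 > total) or the "old" generation counted above holds less than
* a quarter (old * 4 < total); otherwise the spread is considered good
* enough and the eviction proceeds.
*/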
  4384. if (young * MIN_NR_GENS > total)
  4385. return true;
  4386. if (old * (MIN_NR_GENS + 2) < total)
  4387. return true;
  4388. return false;
  4389. }
  4390. /*
  4391. * For future optimizations:
  4392. * 1. Defer try_to_inc_max_seq() to workqueues to reduce latency for memcg
  4393. * reclaim.
  4394. */
  4395. static long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, bool can_swap)
  4396. {
  4397. unsigned long nr_to_scan;
  4398. struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  4399. DEFINE_MAX_SEQ(lruvec);
  4400. if (mem_cgroup_below_min(memcg))
  4401. return 0;
  4402. if (!should_run_aging(lruvec, max_seq, sc, can_swap, &nr_to_scan))
  4403. return nr_to_scan;
  4404. /* skip the aging path at the default priority */
  4405. if (sc->priority == DEF_PRIORITY)
  4406. return nr_to_scan;
  4407. /* skip this lruvec as it's low on cold folios */
  4408. return try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, false) ? -1 : 0;
  4409. }
  4410. static unsigned long get_nr_to_reclaim(struct scan_control *sc)
  4411. {
  4412. /* don't abort memcg reclaim to ensure fairness */
  4413. if (!global_reclaim(sc))
  4414. return -1;
  4415. return max(sc->nr_to_reclaim, compact_gap(sc->order));
  4416. }
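/*
* Note: the -1 above becomes ULONG_MAX in the unsigned return type, so
* memcg (non-global) reclaim effectively never trips the nr_to_reclaim
* check in should_abort_scan().
*/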
  4417. static bool should_abort_scan(struct lruvec *lruvec, struct scan_control *sc)
  4418. {
  4419. unsigned long nr_to_reclaim = get_nr_to_reclaim(sc);
  4420. bool check_wmarks = false;
  4421. int i;
  4422. if (sc->nr_reclaimed >= nr_to_reclaim)
  4423. return true;
  4424. trace_android_vh_scan_abort_check_wmarks(&check_wmarks);
  4425. if (!check_wmarks)
  4426. return false;
  4427. if (!current_is_kswapd())
  4428. return false;
  4429. for (i = 0; i <= sc->reclaim_idx; i++) {
  4430. unsigned long wmark;
  4431. struct zone *zone = lruvec_pgdat(lruvec)->node_zones + i;
  4432. if (!managed_zone(zone))
  4433. continue;
  4434. if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
  4435. wmark = wmark_pages(zone, WMARK_PROMO);
  4436. else
  4437. wmark = high_wmark_pages(zone);
  4438. /*
4439. * Abort the scan once the order-0 watermark is met in every eligible zone;
4440. * reclaiming MIN_LRU_BATCH << 2 extra pages lets kswapd go back to sleep immediately.
  4441. */
  4442. wmark += MIN_LRU_BATCH << 2;
  4443. if (!zone_watermark_ok_safe(zone, 0, wmark, sc->reclaim_idx))
  4444. return false;
  4445. }
  4446. return true;
  4447. }
  4448. static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
  4449. {
  4450. long nr_to_scan;
  4451. unsigned long scanned = 0;
  4452. int swappiness = get_swappiness(lruvec, sc);
  4453. /* clean file folios are more likely to exist */
  4454. if (swappiness && !(sc->gfp_mask & __GFP_IO))
  4455. swappiness = 1;
  4456. while (true) {
  4457. int delta;
  4458. nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness);
  4459. if (nr_to_scan <= 0)
  4460. break;
  4461. delta = evict_folios(lruvec, sc, swappiness);
  4462. if (!delta)
  4463. break;
  4464. scanned += delta;
  4465. if (scanned >= nr_to_scan)
  4466. break;
  4467. if (should_abort_scan(lruvec, sc))
  4468. break;
  4469. cond_resched();
  4470. }
  4471. /* whether try_to_inc_max_seq() was successful */
  4472. return nr_to_scan < 0;
  4473. }
  4474. static int shrink_one(struct lruvec *lruvec, struct scan_control *sc)
  4475. {
  4476. bool success;
  4477. unsigned long scanned = sc->nr_scanned;
  4478. unsigned long reclaimed = sc->nr_reclaimed;
  4479. int seg = lru_gen_memcg_seg(lruvec);
  4480. struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  4481. struct pglist_data *pgdat = lruvec_pgdat(lruvec);
  4482. /* see the comment on MEMCG_NR_GENS */
  4483. if (!lruvec_is_sizable(lruvec, sc))
  4484. return seg != MEMCG_LRU_TAIL ? MEMCG_LRU_TAIL : MEMCG_LRU_YOUNG;
  4485. mem_cgroup_calculate_protection(NULL, memcg);
  4486. if (mem_cgroup_below_min(memcg))
  4487. return MEMCG_LRU_YOUNG;
  4488. if (mem_cgroup_below_low(memcg)) {
  4489. /* see the comment on MEMCG_NR_GENS */
  4490. if (seg != MEMCG_LRU_TAIL)
  4491. return MEMCG_LRU_TAIL;
  4492. memcg_memory_event(memcg, MEMCG_LOW);
  4493. }
  4494. success = try_to_shrink_lruvec(lruvec, sc);
  4495. shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority);
  4496. if (!sc->proactive)
  4497. vmpressure(sc->gfp_mask, memcg, false, sc->nr_scanned - scanned,
  4498. sc->nr_reclaimed - reclaimed);
  4499. sc->nr_reclaimed += current->reclaim_state->reclaimed_slab;
  4500. current->reclaim_state->reclaimed_slab = 0;
  4501. return success ? MEMCG_LRU_YOUNG : 0;
  4502. }
  4503. #ifdef CONFIG_MEMCG
  4504. static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
  4505. {
  4506. int op;
  4507. int gen;
  4508. int bin;
  4509. int first_bin;
4510. struct lruvec *lruvec = NULL;
  4511. struct lru_gen_folio *lrugen = NULL;
  4512. struct mem_cgroup *memcg;
  4513. const struct hlist_nulls_node *pos;
  4514. bin = first_bin = get_random_u32_below(MEMCG_NR_BINS);
  4515. restart:
  4516. op = 0;
  4517. memcg = NULL;
  4518. gen = get_memcg_gen(READ_ONCE(pgdat->memcg_lru.seq));
  4519. rcu_read_lock();
  4520. hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
  4521. if (op) {
  4522. lru_gen_rotate_memcg(lruvec, op);
  4523. op = 0;
  4524. }
  4525. mem_cgroup_put(memcg);
  4526. lruvec = container_of(lrugen, struct lruvec, lrugen);
  4527. memcg = lruvec_memcg(lruvec);
  4528. if (!mem_cgroup_tryget(memcg)) {
  4529. lru_gen_release_memcg(memcg);
  4530. memcg = NULL;
  4531. continue;
  4532. }
  4533. rcu_read_unlock();
  4534. op = shrink_one(lruvec, sc);
  4535. rcu_read_lock();
  4536. if (should_abort_scan(lruvec, sc))
  4537. break;
  4538. }
  4539. rcu_read_unlock();
  4540. if (op)
  4541. lru_gen_rotate_memcg(lruvec, op);
  4542. mem_cgroup_put(memcg);
  4543. if (lruvec && should_abort_scan(lruvec, sc))
  4544. return;
  4545. /* restart if raced with lru_gen_rotate_memcg() */
  4546. if (gen != get_nulls_value(pos))
  4547. goto restart;
  4548. /* try the rest of the bins of the current generation */
  4549. bin = get_memcg_bin(bin + 1);
  4550. if (bin != first_bin)
  4551. goto restart;
  4552. }
  4553. static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
  4554. {
  4555. struct blk_plug plug;
  4556. VM_WARN_ON_ONCE(global_reclaim(sc));
  4557. VM_WARN_ON_ONCE(!sc->may_writepage || !sc->may_unmap);
  4558. lru_add_drain();
  4559. blk_start_plug(&plug);
  4560. set_mm_walk(NULL, sc->proactive);
  4561. if (try_to_shrink_lruvec(lruvec, sc))
  4562. lru_gen_rotate_memcg(lruvec, MEMCG_LRU_YOUNG);
  4563. clear_mm_walk();
  4564. blk_finish_plug(&plug);
  4565. }
  4566. #else /* !CONFIG_MEMCG */
  4567. static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc)
  4568. {
  4569. BUILD_BUG();
  4570. }
  4571. static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
  4572. {
  4573. BUILD_BUG();
  4574. }
  4575. #endif
  4576. static void set_initial_priority(struct pglist_data *pgdat, struct scan_control *sc)
  4577. {
  4578. int priority;
  4579. unsigned long reclaimable;
  4580. struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
  4581. if (sc->priority != DEF_PRIORITY || sc->nr_to_reclaim < MIN_LRU_BATCH)
  4582. return;
  4583. /*
  4584. * Determine the initial priority based on ((total / MEMCG_NR_GENS) >>
  4585. * priority) * reclaimed_to_scanned_ratio = nr_to_reclaim, where the
  4586. * estimated reclaimed_to_scanned_ratio = inactive / total.
  4587. */
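/*
* Hypothetical numbers, for illustration only: if the per-generation
* estimate works out to ~1M reclaimable pages (2^20), fls_long() - 1 gives
* 20; with sc->nr_to_reclaim = 32, fls_long(31) = 5, so priority = 15,
* which clamp() then caps at DEF_PRIORITY.
*/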
  4588. reclaimable = node_page_state(pgdat, NR_INACTIVE_FILE);
  4589. if (get_swappiness(lruvec, sc))
  4590. reclaimable += node_page_state(pgdat, NR_INACTIVE_ANON);
  4591. reclaimable /= MEMCG_NR_GENS;
  4592. /* round down reclaimable and round up sc->nr_to_reclaim */
  4593. priority = fls_long(reclaimable) - 1 - fls_long(sc->nr_to_reclaim - 1);
  4594. sc->priority = clamp(priority, 0, DEF_PRIORITY);
  4595. }
  4596. static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
  4597. {
  4598. struct blk_plug plug;
  4599. unsigned long reclaimed = sc->nr_reclaimed;
  4600. VM_WARN_ON_ONCE(!global_reclaim(sc));
  4601. /*
  4602. * Unmapped clean folios are already prioritized. Scanning for more of
  4603. * them is likely futile and can cause high reclaim latency when there
  4604. * is a large number of memcgs.
  4605. */
  4606. if (!sc->may_writepage || !sc->may_unmap)
  4607. goto done;
  4608. lru_add_drain();
  4609. blk_start_plug(&plug);
  4610. set_mm_walk(pgdat, sc->proactive);
  4611. set_initial_priority(pgdat, sc);
  4612. if (current_is_kswapd())
  4613. sc->nr_reclaimed = 0;
  4614. if (mem_cgroup_disabled())
  4615. shrink_one(&pgdat->__lruvec, sc);
  4616. else
  4617. shrink_many(pgdat, sc);
  4618. if (current_is_kswapd())
  4619. sc->nr_reclaimed += reclaimed;
  4620. clear_mm_walk();
  4621. blk_finish_plug(&plug);
  4622. done:
  4623. /* kswapd should never fail */
  4624. pgdat->kswapd_failures = 0;
  4625. }
  4626. /******************************************************************************
  4627. * state change
  4628. ******************************************************************************/
  4629. static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
  4630. {
  4631. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  4632. if (lrugen->enabled) {
  4633. enum lru_list lru;
  4634. for_each_evictable_lru(lru) {
  4635. if (!list_empty(&lruvec->lists[lru]))
  4636. return false;
  4637. }
  4638. } else {
  4639. int gen, type, zone;
  4640. for_each_gen_type_zone(gen, type, zone) {
  4641. if (!list_empty(&lrugen->folios[gen][type][zone]))
  4642. return false;
  4643. }
  4644. }
  4645. return true;
  4646. }
  4647. static bool fill_evictable(struct lruvec *lruvec)
  4648. {
  4649. enum lru_list lru;
  4650. int remaining = MAX_LRU_BATCH;
  4651. for_each_evictable_lru(lru) {
  4652. int type = is_file_lru(lru);
  4653. bool active = is_active_lru(lru);
  4654. struct list_head *head = &lruvec->lists[lru];
  4655. while (!list_empty(head)) {
  4656. bool success;
  4657. struct folio *folio = lru_to_folio(head);
  4658. VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
  4659. VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio) != active, folio);
  4660. VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
  4661. VM_WARN_ON_ONCE_FOLIO(folio_lru_gen(folio) != -1, folio);
  4662. lruvec_del_folio(lruvec, folio);
  4663. success = lru_gen_add_folio(lruvec, folio, false);
  4664. VM_WARN_ON_ONCE(!success);
  4665. if (!--remaining)
  4666. return false;
  4667. }
  4668. }
  4669. return true;
  4670. }
  4671. static bool drain_evictable(struct lruvec *lruvec)
  4672. {
  4673. int gen, type, zone;
  4674. int remaining = MAX_LRU_BATCH;
  4675. for_each_gen_type_zone(gen, type, zone) {
  4676. struct list_head *head = &lruvec->lrugen.folios[gen][type][zone];
  4677. while (!list_empty(head)) {
  4678. bool success;
  4679. struct folio *folio = lru_to_folio(head);
  4680. VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio);
  4681. VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio);
  4682. VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio);
  4683. VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio);
  4684. success = lru_gen_del_folio(lruvec, folio, false);
  4685. VM_WARN_ON_ONCE(!success);
  4686. lruvec_add_folio(lruvec, folio);
  4687. if (!--remaining)
  4688. return false;
  4689. }
  4690. }
  4691. return true;
  4692. }
  4693. static void lru_gen_change_state(bool enabled)
  4694. {
  4695. static DEFINE_MUTEX(state_mutex);
  4696. struct mem_cgroup *memcg;
  4697. cgroup_lock();
  4698. cpus_read_lock();
  4699. get_online_mems();
  4700. mutex_lock(&state_mutex);
  4701. if (enabled == lru_gen_enabled())
  4702. goto unlock;
  4703. if (enabled)
  4704. static_branch_enable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
  4705. else
  4706. static_branch_disable_cpuslocked(&lru_gen_caps[LRU_GEN_CORE]);
  4707. memcg = mem_cgroup_iter(NULL, NULL, NULL);
  4708. do {
  4709. int nid;
  4710. for_each_node(nid) {
  4711. struct lruvec *lruvec = get_lruvec(memcg, nid);
  4712. if (!lruvec)
  4713. continue;
  4714. spin_lock_irq(&lruvec->lru_lock);
  4715. VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
  4716. VM_WARN_ON_ONCE(!state_is_valid(lruvec));
  4717. lruvec->lrugen.enabled = enabled;
  4718. while (!(enabled ? fill_evictable(lruvec) : drain_evictable(lruvec))) {
  4719. spin_unlock_irq(&lruvec->lru_lock);
  4720. cond_resched();
  4721. spin_lock_irq(&lruvec->lru_lock);
  4722. }
  4723. spin_unlock_irq(&lruvec->lru_lock);
  4724. }
  4725. cond_resched();
  4726. } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
  4727. unlock:
  4728. mutex_unlock(&state_mutex);
  4729. put_online_mems();
  4730. cpus_read_unlock();
  4731. cgroup_unlock();
  4732. }
  4733. /******************************************************************************
  4734. * sysfs interface
  4735. ******************************************************************************/
  4736. static ssize_t show_min_ttl(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
  4737. {
  4738. return sprintf(buf, "%u\n", jiffies_to_msecs(READ_ONCE(lru_gen_min_ttl)));
  4739. }
  4740. /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
  4741. static ssize_t store_min_ttl(struct kobject *kobj, struct kobj_attribute *attr,
  4742. const char *buf, size_t len)
  4743. {
  4744. unsigned int msecs;
  4745. if (kstrtouint(buf, 0, &msecs))
  4746. return -EINVAL;
  4747. WRITE_ONCE(lru_gen_min_ttl, msecs_to_jiffies(msecs));
  4748. return len;
  4749. }
  4750. static struct kobj_attribute lru_gen_min_ttl_attr = __ATTR(
  4751. min_ttl_ms, 0644, show_min_ttl, store_min_ttl
  4752. );
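/*
* Usage sketch (value in milliseconds; 0 disables the protection and the
* OOM path in lru_gen_age_node() above), per
* Documentation/admin-guide/mm/multigen_lru.rst:
*
*   echo 1000 >/sys/kernel/mm/lru_gen/min_ttl_ms
*/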
  4753. static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, char *buf)
  4754. {
  4755. unsigned int caps = 0;
  4756. if (get_cap(LRU_GEN_CORE))
  4757. caps |= BIT(LRU_GEN_CORE);
  4758. if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
  4759. caps |= BIT(LRU_GEN_MM_WALK);
  4760. if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
  4761. caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
  4762. return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
  4763. }
  4764. /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
  4765. static ssize_t store_enabled(struct kobject *kobj, struct kobj_attribute *attr,
  4766. const char *buf, size_t len)
  4767. {
  4768. int i;
  4769. unsigned int caps;
  4770. if (tolower(*buf) == 'n')
  4771. caps = 0;
  4772. else if (tolower(*buf) == 'y')
  4773. caps = -1;
  4774. else if (kstrtouint(buf, 0, &caps))
  4775. return -EINVAL;
  4776. for (i = 0; i < NR_LRU_GEN_CAPS; i++) {
  4777. bool enabled = caps & BIT(i);
  4778. if (i == LRU_GEN_CORE)
  4779. lru_gen_change_state(enabled);
  4780. else if (enabled)
  4781. static_branch_enable(&lru_gen_caps[i]);
  4782. else
  4783. static_branch_disable(&lru_gen_caps[i]);
  4784. }
  4785. return len;
  4786. }
  4787. static struct kobj_attribute lru_gen_enabled_attr = __ATTR(
  4788. enabled, 0644, show_enabled, store_enabled
  4789. );
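/*
* Usage sketch, per Documentation/admin-guide/mm/multigen_lru.rst: "y"/"n"
* set or clear all capabilities, while a hex mask toggles them bit by bit,
* with bit 0 being LRU_GEN_CORE:
*
*   echo y      >/sys/kernel/mm/lru_gen/enabled
*   echo 0x0001 >/sys/kernel/mm/lru_gen/enabled
*/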
  4790. static struct attribute *lru_gen_attrs[] = {
  4791. &lru_gen_min_ttl_attr.attr,
  4792. &lru_gen_enabled_attr.attr,
  4793. NULL
  4794. };
  4795. static struct attribute_group lru_gen_attr_group = {
  4796. .name = "lru_gen",
  4797. .attrs = lru_gen_attrs,
  4798. };
  4799. /******************************************************************************
  4800. * debugfs interface
  4801. ******************************************************************************/
  4802. static void *lru_gen_seq_start(struct seq_file *m, loff_t *pos)
  4803. {
  4804. struct mem_cgroup *memcg;
  4805. loff_t nr_to_skip = *pos;
  4806. m->private = kvmalloc(PATH_MAX, GFP_KERNEL);
  4807. if (!m->private)
  4808. return ERR_PTR(-ENOMEM);
  4809. memcg = mem_cgroup_iter(NULL, NULL, NULL);
  4810. do {
  4811. int nid;
  4812. for_each_node_state(nid, N_MEMORY) {
  4813. if (!nr_to_skip--)
  4814. return get_lruvec(memcg, nid);
  4815. }
  4816. } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
  4817. return NULL;
  4818. }
  4819. static void lru_gen_seq_stop(struct seq_file *m, void *v)
  4820. {
  4821. if (!IS_ERR_OR_NULL(v))
  4822. mem_cgroup_iter_break(NULL, lruvec_memcg(v));
  4823. kvfree(m->private);
  4824. m->private = NULL;
  4825. }
  4826. static void *lru_gen_seq_next(struct seq_file *m, void *v, loff_t *pos)
  4827. {
  4828. int nid = lruvec_pgdat(v)->node_id;
  4829. struct mem_cgroup *memcg = lruvec_memcg(v);
  4830. ++*pos;
  4831. nid = next_memory_node(nid);
  4832. if (nid == MAX_NUMNODES) {
  4833. memcg = mem_cgroup_iter(NULL, memcg, NULL);
  4834. if (!memcg)
  4835. return NULL;
  4836. nid = first_memory_node;
  4837. }
  4838. return get_lruvec(memcg, nid);
  4839. }
  4840. static void lru_gen_seq_show_full(struct seq_file *m, struct lruvec *lruvec,
  4841. unsigned long max_seq, unsigned long *min_seq,
  4842. unsigned long seq)
  4843. {
  4844. int i;
  4845. int type, tier;
  4846. int hist = lru_hist_from_seq(seq);
  4847. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  4848. for (tier = 0; tier < MAX_NR_TIERS; tier++) {
  4849. seq_printf(m, " %10d", tier);
  4850. for (type = 0; type < ANON_AND_FILE; type++) {
  4851. const char *s = " ";
  4852. unsigned long n[3] = {};
  4853. if (seq == max_seq) {
  4854. s = "RT ";
  4855. n[0] = READ_ONCE(lrugen->avg_refaulted[type][tier]);
  4856. n[1] = READ_ONCE(lrugen->avg_total[type][tier]);
  4857. } else if (seq == min_seq[type] || NR_HIST_GENS > 1) {
  4858. s = "rep";
  4859. n[0] = atomic_long_read(&lrugen->refaulted[hist][type][tier]);
  4860. n[1] = atomic_long_read(&lrugen->evicted[hist][type][tier]);
  4861. if (tier)
  4862. n[2] = READ_ONCE(lrugen->protected[hist][type][tier - 1]);
  4863. }
  4864. for (i = 0; i < 3; i++)
  4865. seq_printf(m, " %10lu%c", n[i], s[i]);
  4866. }
  4867. seq_putc(m, '\n');
  4868. }
  4869. seq_puts(m, " ");
  4870. for (i = 0; i < NR_MM_STATS; i++) {
  4871. const char *s = " ";
  4872. unsigned long n = 0;
  4873. if (seq == max_seq && NR_HIST_GENS == 1) {
  4874. s = "LOYNFA";
  4875. n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
  4876. } else if (seq != max_seq && NR_HIST_GENS > 1) {
  4877. s = "loynfa";
  4878. n = READ_ONCE(lruvec->mm_state.stats[hist][i]);
  4879. }
  4880. seq_printf(m, " %10lu%c", n, s[i]);
  4881. }
  4882. seq_putc(m, '\n');
  4883. }
  4884. /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
  4885. static int lru_gen_seq_show(struct seq_file *m, void *v)
  4886. {
  4887. unsigned long seq;
  4888. bool full = !debugfs_real_fops(m->file)->write;
  4889. struct lruvec *lruvec = v;
  4890. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  4891. int nid = lruvec_pgdat(lruvec)->node_id;
  4892. struct mem_cgroup *memcg = lruvec_memcg(lruvec);
  4893. DEFINE_MAX_SEQ(lruvec);
  4894. DEFINE_MIN_SEQ(lruvec);
  4895. if (nid == first_memory_node) {
  4896. const char *path = memcg ? m->private : "";
  4897. #ifdef CONFIG_MEMCG
  4898. if (memcg)
  4899. cgroup_path(memcg->css.cgroup, m->private, PATH_MAX);
  4900. #endif
  4901. seq_printf(m, "memcg %5hu %s\n", mem_cgroup_id(memcg), path);
  4902. }
  4903. seq_printf(m, " node %5d\n", nid);
  4904. if (!full)
  4905. seq = min_seq[LRU_GEN_ANON];
  4906. else if (max_seq >= MAX_NR_GENS)
  4907. seq = max_seq - MAX_NR_GENS + 1;
  4908. else
  4909. seq = 0;
  4910. for (; seq <= max_seq; seq++) {
  4911. int type, zone;
  4912. int gen = lru_gen_from_seq(seq);
  4913. unsigned long birth = READ_ONCE(lruvec->lrugen.timestamps[gen]);
  4914. seq_printf(m, " %10lu %10u", seq, jiffies_to_msecs(jiffies - birth));
  4915. for (type = 0; type < ANON_AND_FILE; type++) {
  4916. unsigned long size = 0;
  4917. char mark = full && seq < min_seq[type] ? 'x' : ' ';
  4918. for (zone = 0; zone < MAX_NR_ZONES; zone++)
  4919. size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L);
  4920. seq_printf(m, " %10lu%c", size, mark);
  4921. }
  4922. seq_putc(m, '\n');
  4923. if (full)
  4924. lru_gen_seq_show_full(m, lruvec, max_seq, min_seq, seq);
  4925. }
  4926. return 0;
  4927. }
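/*
* A sketch of the output produced by the seq_printf() calls above (field
* widths simplified; the per-tier and mm-stats blocks only appear in the
* full, read-only interface):
*
*   memcg  memcg_id  memcg_path
*    node  node_id
*      min_gen_nr  age_in_ms  nr_anon_pages  nr_file_pages
*      ...
*      max_gen_nr  age_in_ms  nr_anon_pages  nr_file_pages
*/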
  4928. static const struct seq_operations lru_gen_seq_ops = {
  4929. .start = lru_gen_seq_start,
  4930. .stop = lru_gen_seq_stop,
  4931. .next = lru_gen_seq_next,
  4932. .show = lru_gen_seq_show,
  4933. };
  4934. static int run_aging(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
  4935. bool can_swap, bool force_scan)
  4936. {
  4937. DEFINE_MAX_SEQ(lruvec);
  4938. DEFINE_MIN_SEQ(lruvec);
  4939. if (seq < max_seq)
  4940. return 0;
  4941. if (seq > max_seq)
  4942. return -EINVAL;
  4943. if (!force_scan && min_seq[!can_swap] + MAX_NR_GENS - 1 <= max_seq)
  4944. return -ERANGE;
  4945. try_to_inc_max_seq(lruvec, max_seq, sc, can_swap, force_scan);
  4946. return 0;
  4947. }
  4948. static int run_eviction(struct lruvec *lruvec, unsigned long seq, struct scan_control *sc,
  4949. int swappiness, unsigned long nr_to_reclaim)
  4950. {
  4951. DEFINE_MAX_SEQ(lruvec);
  4952. if (seq + MIN_NR_GENS > max_seq)
  4953. return -EINVAL;
  4954. sc->nr_reclaimed = 0;
  4955. while (!signal_pending(current)) {
  4956. DEFINE_MIN_SEQ(lruvec);
  4957. if (seq < min_seq[!swappiness])
  4958. return 0;
  4959. if (sc->nr_reclaimed >= nr_to_reclaim)
  4960. return 0;
  4961. if (!evict_folios(lruvec, sc, swappiness))
  4962. return 0;
  4963. cond_resched();
  4964. }
  4965. return -EINTR;
  4966. }
  4967. static int run_cmd(char cmd, int memcg_id, int nid, unsigned long seq,
  4968. struct scan_control *sc, int swappiness, unsigned long opt)
  4969. {
  4970. struct lruvec *lruvec;
  4971. int err = -EINVAL;
  4972. struct mem_cgroup *memcg = NULL;
  4973. if (nid < 0 || nid >= MAX_NUMNODES || !node_state(nid, N_MEMORY))
  4974. return -EINVAL;
  4975. if (!mem_cgroup_disabled()) {
  4976. rcu_read_lock();
  4977. memcg = mem_cgroup_from_id(memcg_id);
  4978. if (!mem_cgroup_tryget(memcg))
  4979. memcg = NULL;
  4980. rcu_read_unlock();
  4981. if (!memcg)
  4982. return -EINVAL;
  4983. }
  4984. if (memcg_id != mem_cgroup_id(memcg))
  4985. goto done;
  4986. lruvec = get_lruvec(memcg, nid);
  4987. if (swappiness < 0)
  4988. swappiness = get_swappiness(lruvec, sc);
  4989. else if (swappiness > 200)
  4990. goto done;
  4991. switch (cmd) {
  4992. case '+':
  4993. err = run_aging(lruvec, seq, sc, swappiness, opt);
  4994. break;
  4995. case '-':
  4996. err = run_eviction(lruvec, seq, sc, swappiness, opt);
  4997. break;
  4998. }
  4999. done:
  5000. mem_cgroup_put(memcg);
  5001. return err;
  5002. }
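/*
* Command format accepted by the debugfs write path below, per
* Documentation/admin-guide/mm/multigen_lru.rst; bracketed fields are
* optional (the parser leaves them at -1, meaning "kernel default"):
*
*   + memcg_id node_id max_gen_nr [can_swap [force_scan]]
*   - memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]
*
* memcg_id and node_id are the values reported by reading the same file.
*/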
  5003. /* see Documentation/admin-guide/mm/multigen_lru.rst for details */
  5004. static ssize_t lru_gen_seq_write(struct file *file, const char __user *src,
  5005. size_t len, loff_t *pos)
  5006. {
  5007. void *buf;
  5008. char *cur, *next;
  5009. unsigned int flags;
  5010. struct blk_plug plug;
  5011. int err = -EINVAL;
  5012. struct scan_control sc = {
  5013. .may_writepage = true,
  5014. .may_unmap = true,
  5015. .may_swap = true,
  5016. .reclaim_idx = MAX_NR_ZONES - 1,
  5017. .gfp_mask = GFP_KERNEL,
  5018. };
  5019. buf = kvmalloc(len + 1, GFP_KERNEL);
  5020. if (!buf)
  5021. return -ENOMEM;
  5022. if (copy_from_user(buf, src, len)) {
  5023. kvfree(buf);
  5024. return -EFAULT;
  5025. }
  5026. set_task_reclaim_state(current, &sc.reclaim_state);
  5027. flags = memalloc_noreclaim_save();
  5028. blk_start_plug(&plug);
  5029. if (!set_mm_walk(NULL, true)) {
  5030. err = -ENOMEM;
  5031. goto done;
  5032. }
  5033. next = buf;
  5034. next[len] = '\0';
  5035. while ((cur = strsep(&next, ",;\n"))) {
  5036. int n;
  5037. int end;
  5038. char cmd;
  5039. unsigned int memcg_id;
  5040. unsigned int nid;
  5041. unsigned long seq;
  5042. unsigned int swappiness = -1;
  5043. unsigned long opt = -1;
  5044. cur = skip_spaces(cur);
  5045. if (!*cur)
  5046. continue;
  5047. n = sscanf(cur, "%c %u %u %lu %n %u %n %lu %n", &cmd, &memcg_id, &nid,
  5048. &seq, &end, &swappiness, &end, &opt, &end);
  5049. if (n < 4 || cur[end]) {
  5050. err = -EINVAL;
  5051. break;
  5052. }
  5053. err = run_cmd(cmd, memcg_id, nid, seq, &sc, swappiness, opt);
  5054. if (err)
  5055. break;
  5056. }
  5057. done:
  5058. clear_mm_walk();
  5059. blk_finish_plug(&plug);
  5060. memalloc_noreclaim_restore(flags);
  5061. set_task_reclaim_state(current, NULL);
  5062. kvfree(buf);
  5063. return err ? : len;
  5064. }
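/*
 * Example write, assuming debugfs is mounted at /sys/kernel/debug
 * (hypothetical values): several commands may share one write, separated
 * by ',', ';' or '\n', and the optional fields can be omitted:
 *
 *   echo '+ 0 0 7; - 0 0 5 200 64' > /sys/kernel/debug/lru_gen
 *
 * Each command is parsed by the sscanf() above: %n records how far the
 * match reached, so trailing garbage (cur[end] != '\0') or fewer than
 * four fields yields -EINVAL, and the first failing command aborts the
 * rest of the batch.
 */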
  5065. static int lru_gen_seq_open(struct inode *inode, struct file *file)
  5066. {
  5067. return seq_open(file, &lru_gen_seq_ops);
  5068. }
  5069. static const struct file_operations lru_gen_rw_fops = {
  5070. .open = lru_gen_seq_open,
  5071. .read = seq_read,
  5072. .write = lru_gen_seq_write,
  5073. .llseek = seq_lseek,
  5074. .release = seq_release,
  5075. };
  5076. static const struct file_operations lru_gen_ro_fops = {
  5077. .open = lru_gen_seq_open,
  5078. .read = seq_read,
  5079. .llseek = seq_lseek,
  5080. .release = seq_release,
  5081. };
  5082. /******************************************************************************
  5083. * initialization
  5084. ******************************************************************************/
  5085. void lru_gen_init_lruvec(struct lruvec *lruvec)
  5086. {
  5087. int i;
  5088. int gen, type, zone;
  5089. struct lru_gen_folio *lrugen = &lruvec->lrugen;
  5090. lrugen->max_seq = MIN_NR_GENS + 1;
  5091. lrugen->enabled = lru_gen_enabled();
  5092. for (i = 0; i <= MIN_NR_GENS + 1; i++)
  5093. lrugen->timestamps[i] = jiffies;
  5094. for_each_gen_type_zone(gen, type, zone)
  5095. INIT_LIST_HEAD(&lrugen->folios[gen][type][zone]);
  5096. lruvec->mm_state.seq = MIN_NR_GENS;
  5097. }
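/*
 * Illustrative note, assuming the default MIN_NR_GENS == 2 and
 * MAX_NR_GENS == 4: min_seq[] starts at 0 because the lruvec is zeroed,
 * so setting max_seq to MIN_NR_GENS + 1 == 3 gives each type the full
 * MAX_NR_GENS generations from the start, and the loop above stamps all
 * of their birth times with the current jiffies.
 */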
  5098. #ifdef CONFIG_MEMCG
  5099. void lru_gen_init_pgdat(struct pglist_data *pgdat)
  5100. {
  5101. int i, j;
  5102. spin_lock_init(&pgdat->memcg_lru.lock);
  5103. for (i = 0; i < MEMCG_NR_GENS; i++) {
  5104. for (j = 0; j < MEMCG_NR_BINS; j++)
  5105. INIT_HLIST_NULLS_HEAD(&pgdat->memcg_lru.fifo[i][j], i);
  5106. }
  5107. }
  5108. void lru_gen_init_memcg(struct mem_cgroup *memcg)
  5109. {
  5110. INIT_LIST_HEAD(&memcg->mm_list.fifo);
  5111. spin_lock_init(&memcg->mm_list.lock);
  5112. }
  5113. void lru_gen_exit_memcg(struct mem_cgroup *memcg)
  5114. {
  5115. int i;
  5116. int nid;
  5117. VM_WARN_ON_ONCE(!list_empty(&memcg->mm_list.fifo));
  5118. for_each_node(nid) {
  5119. struct lruvec *lruvec = get_lruvec(memcg, nid);
  5120. VM_WARN_ON_ONCE(memchr_inv(lruvec->lrugen.nr_pages, 0,
  5121. sizeof(lruvec->lrugen.nr_pages)));
  5122. lruvec->lrugen.list.next = LIST_POISON1;
  5123. for (i = 0; i < NR_BLOOM_FILTERS; i++) {
  5124. bitmap_free(lruvec->mm_state.filters[i]);
  5125. lruvec->mm_state.filters[i] = NULL;
  5126. }
  5127. }
  5128. }
  5129. #endif /* CONFIG_MEMCG */
  5130. static int __init init_lru_gen(void)
  5131. {
  5132. BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
  5133. BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
  5134. if (sysfs_create_group(mm_kobj, &lru_gen_attr_group))
  5135. pr_err("lru_gen: failed to create sysfs group\n");
  5136. debugfs_create_file("lru_gen", 0644, NULL, NULL, &lru_gen_rw_fops);
  5137. debugfs_create_file("lru_gen_full", 0444, NULL, NULL, &lru_gen_ro_fops);
  5138. return 0;
  5139. };
  5140. late_initcall(init_lru_gen);
  5141. #else /* !CONFIG_LRU_GEN */
  5142. static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  5143. {
  5144. }
  5145. static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
  5146. {
  5147. }
  5148. static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control *sc)
  5149. {
  5150. }
  5151. #endif /* CONFIG_LRU_GEN */
  5152. static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
  5153. {
  5154. unsigned long nr[NR_LRU_LISTS];
  5155. unsigned long targets[NR_LRU_LISTS];
  5156. unsigned long nr_to_scan;
  5157. enum lru_list lru;
  5158. unsigned long nr_reclaimed = 0;
  5159. unsigned long nr_to_reclaim = sc->nr_to_reclaim;
  5160. bool proportional_reclaim;
  5161. struct blk_plug plug;
  5162. if (lru_gen_enabled() && !global_reclaim(sc)) {
  5163. lru_gen_shrink_lruvec(lruvec, sc);
  5164. return;
  5165. }
  5166. get_scan_count(lruvec, sc, nr);
  5167. /* Record the original scan target for proportional adjustments later */
  5168. memcpy(targets, nr, sizeof(nr));
  5169. /*
  5170. * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal
  5171. * event that can occur when there is little memory pressure e.g.
  5172. * multiple streaming readers/writers. Hence, we do not abort scanning
  5173. * when the requested number of pages are reclaimed when scanning at
  5174. * DEF_PRIORITY on the assumption that the fact we are direct
  5175. * reclaiming implies that kswapd is not keeping up and it is best to
  5176. * do a batch of work at once. For memcg reclaim one check is made to
  5177. * abort proportional reclaim if either the file or anon lru has already
  5178. * dropped to zero at the first pass.
  5179. */
  5180. proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
  5181. sc->priority == DEF_PRIORITY);
  5182. blk_start_plug(&plug);
  5183. while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
  5184. nr[LRU_INACTIVE_FILE]) {
  5185. unsigned long nr_anon, nr_file, percentage;
  5186. unsigned long nr_scanned;
  5187. for_each_evictable_lru(lru) {
  5188. if (nr[lru]) {
  5189. nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX);
  5190. nr[lru] -= nr_to_scan;
  5191. nr_reclaimed += shrink_list(lru, nr_to_scan,
  5192. lruvec, sc);
  5193. }
  5194. }
  5195. cond_resched();
  5196. if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
  5197. continue;
		/*
		 * For kswapd and memcg, reclaim at least the number of pages
		 * requested. Ensure that the anon and file LRUs are scanned
		 * proportionally to what was requested by get_scan_count().
		 * We stop reclaiming one LRU and reduce the amount of scanning
		 * in proportion to the original scan target.
		 */
  5205. nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE];
  5206. nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON];
  5207. /*
  5208. * It's just vindictive to attack the larger once the smaller
  5209. * has gone to zero. And given the way we stop scanning the
  5210. * smaller below, this makes sure that we only make one nudge
  5211. * towards proportionality once we've got nr_to_reclaim.
  5212. */
  5213. if (!nr_file || !nr_anon)
  5214. break;
  5215. if (nr_file > nr_anon) {
  5216. unsigned long scan_target = targets[LRU_INACTIVE_ANON] +
  5217. targets[LRU_ACTIVE_ANON] + 1;
  5218. lru = LRU_BASE;
  5219. percentage = nr_anon * 100 / scan_target;
  5220. } else {
  5221. unsigned long scan_target = targets[LRU_INACTIVE_FILE] +
  5222. targets[LRU_ACTIVE_FILE] + 1;
  5223. lru = LRU_FILE;
  5224. percentage = nr_file * 100 / scan_target;
  5225. }
  5226. /* Stop scanning the smaller of the LRU */
  5227. nr[lru] = 0;
  5228. nr[lru + LRU_ACTIVE] = 0;
  5229. /*
  5230. * Recalculate the other LRU scan count based on its original
  5231. * scan target and the percentage scanning already complete
  5232. */
  5233. lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE;
  5234. nr_scanned = targets[lru] - nr[lru];
  5235. nr[lru] = targets[lru] * (100 - percentage) / 100;
  5236. nr[lru] -= min(nr[lru], nr_scanned);
  5237. lru += LRU_ACTIVE;
  5238. nr_scanned = targets[lru] - nr[lru];
  5239. nr[lru] = targets[lru] * (100 - percentage) / 100;
  5240. nr[lru] -= min(nr[lru], nr_scanned);
  5241. }
  5242. blk_finish_plug(&plug);
  5243. sc->nr_reclaimed += nr_reclaimed;
  5244. /*
  5245. * Even if we did not try to evict anon pages at all, we want to
  5246. * rebalance the anon lru active/inactive ratio.
  5247. */
  5248. if (can_age_anon_pages(lruvec_pgdat(lruvec), sc) &&
  5249. inactive_is_low(lruvec, LRU_INACTIVE_ANON))
  5250. shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
  5251. sc, LRU_ACTIVE_ANON);
  5252. }
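/*
 * Worked example of the proportional adjustment above, with hypothetical
 * numbers: suppose get_scan_count() set targets of 200 anon and 1000 file
 * pages, and nr_to_reclaim is met while 150 anon and 900 file pages are
 * still left to scan. Since nr_file > nr_anon, anon scanning stops and
 * percentage = 150 * 100 / 201 = 74, i.e. roughly 26% of the anon target
 * was scanned. Each file list is then cut to targets[lru] * 26 / 100
 * minus what it has already scanned, so both types end up scanned by
 * about the same fraction of their original targets.
 */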
  5253. /* Use reclaim/compaction for costly allocs or under memory pressure */
  5254. static bool in_reclaim_compaction(struct scan_control *sc)
  5255. {
  5256. if (IS_ENABLED(CONFIG_COMPACTION) && sc->order &&
  5257. (sc->order > PAGE_ALLOC_COSTLY_ORDER ||
  5258. sc->priority < DEF_PRIORITY - 2))
  5259. return true;
  5260. return false;
  5261. }
  5262. /*
  5263. * Reclaim/compaction is used for high-order allocation requests. It reclaims
  5264. * order-0 pages before compacting the zone. should_continue_reclaim() returns
  5265. * true if more pages should be reclaimed such that when the page allocator
  5266. * calls try_to_compact_pages() that it will have enough free pages to succeed.
  5267. * It will give up earlier than that if there is difficulty reclaiming pages.
  5268. */
  5269. static inline bool should_continue_reclaim(struct pglist_data *pgdat,
  5270. unsigned long nr_reclaimed,
  5271. struct scan_control *sc)
  5272. {
  5273. unsigned long pages_for_compaction;
  5274. unsigned long inactive_lru_pages;
  5275. int z;
  5276. bool continue_reclaim = true;
  5277. /* If not in reclaim/compaction mode, stop */
  5278. if (!in_reclaim_compaction(sc))
  5279. return false;
	/*
	 * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
	 * pages that were scanned. Returning to the caller here risks that
	 * reclaim/compaction and the resulting allocation attempt both fail.
	 * In the past we tried harder for __GFP_RETRY_MAYFAIL allocations by
	 * requiring that the full LRU list had been scanned first, assuming
	 * that a zero delta of sc->nr_scanned meant a full LRU scan; that
	 * approximation was wrong, and there were corner cases where a
	 * non-zero number of pages was always scanned.
	 */
  5290. if (!nr_reclaimed)
  5291. return false;
  5292. /* If compaction would go ahead or the allocation would succeed, stop */
  5293. for (z = 0; z <= sc->reclaim_idx; z++) {
  5294. struct zone *zone = &pgdat->node_zones[z];
  5295. if (!managed_zone(zone))
  5296. continue;
  5297. switch (compaction_suitable(zone, sc->order, 0, sc->reclaim_idx)) {
  5298. case COMPACT_SUCCESS:
  5299. case COMPACT_CONTINUE:
  5300. return false;
  5301. default:
  5302. /* check next zone */
  5303. ;
  5304. }
  5305. }
  5306. /*
  5307. * If we have not reclaimed enough pages for compaction and the
  5308. * inactive lists are large enough, continue reclaiming
  5309. */
  5310. pages_for_compaction = compact_gap(sc->order);
  5311. inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
  5312. if (can_reclaim_anon_pages(NULL, pgdat->node_id, sc))
  5313. inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
  5314. #ifdef CONFIG_ANDROID_VENDOR_OEM_DATA
  5315. trace_android_vh_should_continue_reclaim(&sc->android_vendor_data1,
  5316. &sc->nr_to_reclaim, &sc->nr_reclaimed, &continue_reclaim);
  5317. #endif
  5318. if (!continue_reclaim)
  5319. return false;
  5320. return inactive_lru_pages > pages_for_compaction;
  5321. }
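/*
 * Example, assuming compact_gap(order) is still 2UL << order: for an
 * order-9 request (a 2MB THP with 4KB pages), pages_for_compaction is
 * 1024 pages, so reclaim continues only while the eligible inactive
 * lists hold more than 4MB worth of pages and no zone is already
 * compaction-ready.
 */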
  5322. static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
  5323. {
  5324. struct mem_cgroup *target_memcg = sc->target_mem_cgroup;
  5325. struct mem_cgroup *memcg;
  5326. memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
  5327. do {
  5328. struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
  5329. unsigned long reclaimed;
  5330. unsigned long scanned;
  5331. bool skip = false;
  5332. /*
  5333. * This loop can become CPU-bound when target memcgs
  5334. * aren't eligible for reclaim - either because they
  5335. * don't have any reclaimable pages, or because their
  5336. * memory is explicitly protected. Avoid soft lockups.
  5337. */
  5338. cond_resched();
  5339. trace_android_vh_shrink_node_memcgs(memcg, &skip);
  5340. if (skip)
  5341. continue;
  5342. mem_cgroup_calculate_protection(target_memcg, memcg);
  5343. if (mem_cgroup_below_min(memcg)) {
  5344. /*
  5345. * Hard protection.
  5346. * If there is no reclaimable memory, OOM.
  5347. */
  5348. continue;
  5349. } else if (mem_cgroup_below_low(memcg)) {
  5350. /*
  5351. * Soft protection.
  5352. * Respect the protection only as long as
  5353. * there is an unprotected supply
  5354. * of reclaimable memory from other cgroups.
  5355. */
  5356. if (!sc->memcg_low_reclaim) {
  5357. sc->memcg_low_skipped = 1;
  5358. continue;
  5359. }
  5360. memcg_memory_event(memcg, MEMCG_LOW);
  5361. }
  5362. reclaimed = sc->nr_reclaimed;
  5363. scanned = sc->nr_scanned;
  5364. shrink_lruvec(lruvec, sc);
  5365. shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
  5366. sc->priority);
  5367. /* Record the group's reclaim efficiency */
  5368. if (!sc->proactive)
  5369. vmpressure(sc->gfp_mask, memcg, false,
  5370. sc->nr_scanned - scanned,
  5371. sc->nr_reclaimed - reclaimed);
  5372. } while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
  5373. }
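/*
 * Example of the protection checks above, with hypothetical limits: a
 * memcg using 800M under a 1G memory.min is skipped outright (hard
 * protection), while one below its memory.low is skipped only on the
 * first pass; if the reclaim run then comes up short,
 * do_try_to_free_pages() retries with sc->memcg_low_reclaim set, and the
 * MEMCG_LOW event is recorded before its pages are reclaimed.
 */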
  5374. static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
  5375. {
  5376. struct reclaim_state *reclaim_state = current->reclaim_state;
  5377. unsigned long nr_reclaimed, nr_scanned;
  5378. struct lruvec *target_lruvec;
  5379. bool reclaimable = false;
  5380. if (lru_gen_enabled() && global_reclaim(sc)) {
  5381. lru_gen_shrink_node(pgdat, sc);
  5382. return;
  5383. }
  5384. target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
  5385. again:
  5386. memset(&sc->nr, 0, sizeof(sc->nr));
  5387. nr_reclaimed = sc->nr_reclaimed;
  5388. nr_scanned = sc->nr_scanned;
  5389. prepare_scan_count(pgdat, sc);
  5390. shrink_node_memcgs(pgdat, sc);
  5391. if (reclaim_state) {
  5392. sc->nr_reclaimed += reclaim_state->reclaimed_slab;
  5393. reclaim_state->reclaimed_slab = 0;
  5394. }
  5395. /* Record the subtree's reclaim efficiency */
  5396. if (!sc->proactive)
  5397. vmpressure(sc->gfp_mask, sc->target_mem_cgroup, true,
  5398. sc->nr_scanned - nr_scanned,
  5399. sc->nr_reclaimed - nr_reclaimed);
  5400. if (sc->nr_reclaimed - nr_reclaimed)
  5401. reclaimable = true;
  5402. if (current_is_kswapd()) {
		/*
		 * If reclaim is isolating dirty pages under writeback,
		 * it implies that the long-lived page allocation rate
		 * is exceeding the page laundering rate. Either the
		 * global limits are not being effective at throttling
		 * processes due to the page distribution throughout
		 * zones or there is heavy usage of a slow backing
		 * device. The only option is to throttle from reclaim
		 * context which is not ideal as there is no guarantee
		 * the dirtying process is throttled in the same way
		 * balance_dirty_pages() manages.
		 *
		 * Once a node is flagged PGDAT_WRITEBACK, kswapd will
		 * count the number of pages under writeback that are
		 * flagged for immediate reclaim and stall if any are
		 * encountered in the nr_immediate check below.
		 */
  5420. if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken)
  5421. set_bit(PGDAT_WRITEBACK, &pgdat->flags);
  5422. /* Allow kswapd to start writing pages during reclaim.*/
  5423. if (sc->nr.unqueued_dirty == sc->nr.file_taken)
  5424. set_bit(PGDAT_DIRTY, &pgdat->flags);
  5425. /*
  5426. * If kswapd scans pages marked for immediate
  5427. * reclaim and under writeback (nr_immediate), it
  5428. * implies that pages are cycling through the LRU
  5429. * faster than they are written so forcibly stall
  5430. * until some pages complete writeback.
  5431. */
  5432. if (sc->nr.immediate)
  5433. reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
  5434. }
  5435. /*
  5436. * Tag a node/memcg as congested if all the dirty pages were marked
  5437. * for writeback and immediate reclaim (counted in nr.congested).
  5438. *
  5439. * Legacy memcg will stall in page writeback so avoid forcibly
  5440. * stalling in reclaim_throttle().
  5441. */
  5442. if ((current_is_kswapd() ||
  5443. (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
  5444. sc->nr.dirty && sc->nr.dirty == sc->nr.congested)
  5445. set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
	/*
	 * Stall direct reclaim for IO completions if the lruvec is
	 * congested. Allow kswapd to continue until it starts
	 * encountering unqueued dirty pages or cycling through
	 * the LRU too quickly.
	 */
  5452. if (!current_is_kswapd() && current_may_throttle() &&
  5453. !sc->hibernation_mode &&
  5454. test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
  5455. reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);
  5456. if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
  5457. sc))
  5458. goto again;
  5459. /*
  5460. * Kswapd gives up on balancing particular nodes after too
  5461. * many failures to reclaim anything from them and goes to
  5462. * sleep. On reclaim progress, reset the failure counter. A
  5463. * successful direct reclaim run will revive a dormant kswapd.
  5464. */
  5465. if (reclaimable)
  5466. pgdat->kswapd_failures = 0;
  5467. }
  5468. /*
  5469. * Returns true if compaction should go ahead for a costly-order request, or
  5470. * the allocation would already succeed without compaction. Return false if we
  5471. * should reclaim first.
  5472. */
  5473. static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
  5474. {
  5475. unsigned long watermark;
  5476. enum compact_result suitable;
  5477. suitable = compaction_suitable(zone, sc->order, 0, sc->reclaim_idx);
  5478. if (suitable == COMPACT_SUCCESS)
  5479. /* Allocation should succeed already. Don't reclaim. */
  5480. return true;
  5481. if (suitable == COMPACT_SKIPPED)
  5482. /* Compaction cannot yet proceed. Do reclaim. */
  5483. return false;
  5484. /*
  5485. * Compaction is already possible, but it takes time to run and there
  5486. * are potentially other callers using the pages just freed. So proceed
  5487. * with reclaim to make a buffer of free pages available to give
  5488. * compaction a reasonable chance of completing and allocating the page.
  5489. * Note that we won't actually reclaim the whole buffer in one attempt
  5490. * as the target watermark in should_continue_reclaim() is lower. But if
  5491. * we are already above the high+gap watermark, don't reclaim at all.
  5492. */
  5493. watermark = high_wmark_pages(zone) + compact_gap(sc->order);
  5494. return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
  5495. }
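/*
 * Example with hypothetical numbers, assuming compact_gap(9) == 1024 as
 * above: for an order-9 request against a zone whose high watermark is
 * 12288 pages, the threshold is 12288 + 1024 = 13312 free pages. Above
 * that, compaction_ready() returns true and shrink_zones() skips the
 * zone; below it, reclaim runs first to build up the free-page buffer.
 */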
  5496. static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
  5497. {
  5498. /*
  5499. * If reclaim is making progress greater than 12% efficiency then
  5500. * wake all the NOPROGRESS throttled tasks.
  5501. */
  5502. if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
  5503. wait_queue_head_t *wqh;
  5504. wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
  5505. if (waitqueue_active(wqh))
  5506. wake_up(wqh);
  5507. return;
  5508. }
  5509. /*
  5510. * Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
  5511. * throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
  5512. * under writeback and marked for immediate reclaim at the tail of the
  5513. * LRU.
  5514. */
  5515. if (current_is_kswapd() || cgroup_reclaim(sc))
  5516. return;
	/* Throttle if making no progress at high priorities. */
  5518. if (sc->priority == 1 && !sc->nr_reclaimed)
  5519. reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
  5520. }
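/*
 * Note that "12%" above is really nr_scanned >> 3, i.e. 12.5%. For
 * example, after scanning 4096 pages, reclaiming more than 512 of them
 * counts as progress and wakes any VMSCAN_THROTTLE_NOPROGRESS waiters;
 * otherwise a direct reclaimer that reaches priority 1 with nothing
 * reclaimed is throttled instead.
 */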
  5521. /*
  5522. * This is the direct reclaim path, for page-allocating processes. We only
  5523. * try to reclaim pages from zones which will satisfy the caller's allocation
  5524. * request.
  5525. *
  5526. * If a zone is deemed to be full of pinned pages then just give it a light
  5527. * scan then give up on it.
  5528. */
  5529. static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
  5530. {
  5531. struct zoneref *z;
  5532. struct zone *zone;
  5533. unsigned long nr_soft_reclaimed;
  5534. unsigned long nr_soft_scanned;
  5535. gfp_t orig_mask;
  5536. pg_data_t *last_pgdat = NULL;
  5537. pg_data_t *first_pgdat = NULL;
  5538. /*
  5539. * If the number of buffer_heads in the machine exceeds the maximum
  5540. * allowed level, force direct reclaim to scan the highmem zone as
  5541. * highmem pages could be pinning lowmem pages storing buffer_heads
  5542. */
  5543. orig_mask = sc->gfp_mask;
  5544. if (buffer_heads_over_limit) {
  5545. sc->gfp_mask |= __GFP_HIGHMEM;
  5546. sc->reclaim_idx = gfp_zone(sc->gfp_mask);
  5547. }
  5548. for_each_zone_zonelist_nodemask(zone, z, zonelist,
  5549. sc->reclaim_idx, sc->nodemask) {
		/*
		 * Take care that memory controller reclaim has only a small
		 * influence on the global LRU.
		 */
  5554. if (!cgroup_reclaim(sc)) {
  5555. if (!cpuset_zone_allowed(zone,
  5556. GFP_KERNEL | __GFP_HARDWALL))
  5557. continue;
  5558. /*
  5559. * If we already have plenty of memory free for
  5560. * compaction in this zone, don't free any more.
  5561. * Even though compaction is invoked for any
  5562. * non-zero order, only frequent costly order
  5563. * reclamation is disruptive enough to become a
  5564. * noticeable problem, like transparent huge
  5565. * page allocations.
  5566. */
  5567. if (IS_ENABLED(CONFIG_COMPACTION) &&
  5568. sc->order > PAGE_ALLOC_COSTLY_ORDER &&
  5569. compaction_ready(zone, sc)) {
  5570. sc->compaction_ready = true;
  5571. continue;
  5572. }
  5573. /*
  5574. * Shrink each node in the zonelist once. If the
  5575. * zonelist is ordered by zone (not the default) then a
  5576. * node may be shrunk multiple times but in that case
  5577. * the user prefers lower zones being preserved.
  5578. */
  5579. if (zone->zone_pgdat == last_pgdat)
  5580. continue;
  5581. /*
  5582. * This steals pages from memory cgroups over softlimit
  5583. * and returns the number of reclaimed pages and
  5584. * scanned pages. This works for global memory pressure
  5585. * and balancing, not for a memcg's limit.
  5586. */
  5587. nr_soft_scanned = 0;
  5588. nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(zone->zone_pgdat,
  5589. sc->order, sc->gfp_mask,
  5590. &nr_soft_scanned);
  5591. sc->nr_reclaimed += nr_soft_reclaimed;
  5592. sc->nr_scanned += nr_soft_scanned;
			/* need some check to avoid more shrink_zone() */
  5594. }
  5595. if (!first_pgdat)
  5596. first_pgdat = zone->zone_pgdat;
  5597. /* See comment about same check for global reclaim above */
  5598. if (zone->zone_pgdat == last_pgdat)
  5599. continue;
  5600. last_pgdat = zone->zone_pgdat;
  5601. shrink_node(zone->zone_pgdat, sc);
  5602. }
  5603. if (first_pgdat)
  5604. consider_reclaim_throttle(first_pgdat, sc);
  5605. /*
  5606. * Restore to original mask to avoid the impact on the caller if we
  5607. * promoted it to __GFP_HIGHMEM.
  5608. */
  5609. sc->gfp_mask = orig_mask;
  5610. }
  5611. static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
  5612. {
  5613. struct lruvec *target_lruvec;
  5614. unsigned long refaults;
  5615. if (lru_gen_enabled())
  5616. return;
  5617. target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
  5618. refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
  5619. target_lruvec->refaults[WORKINGSET_ANON] = refaults;
  5620. refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
  5621. target_lruvec->refaults[WORKINGSET_FILE] = refaults;
  5622. }
  5623. static void modify_scan_control(struct scan_control *sc)
  5624. {
  5625. bool file_is_tiny = false, may_writepage = true;
  5626. #ifdef CONFIG_ANDROID_VENDOR_OEM_DATA
  5627. trace_android_vh_modify_scan_control(&sc->android_vendor_data1,
  5628. &sc->nr_to_reclaim, sc->target_mem_cgroup, &file_is_tiny,
  5629. &may_writepage);
  5630. #endif
  5631. if (file_is_tiny)
  5632. sc->file_is_tiny = true;
  5633. if (!may_writepage)
  5634. sc->may_writepage = false;
  5635. }
  5636. /*
  5637. * This is the main entry point to direct page reclaim.
  5638. *
  5639. * If a full scan of the inactive list fails to free enough memory then we
  5640. * are "out of memory" and something needs to be killed.
  5641. *
  5642. * If the caller is !__GFP_FS then the probability of a failure is reasonably
  5643. * high - the zone may be full of dirty or under-writeback pages, which this
  5644. * caller can't do much about. We kick the writeback threads and take explicit
  5645. * naps in the hope that some of these pages can be written. But if the
  5646. * allocating task holds filesystem locks which prevent writeout this might not
  5647. * work, and the allocation attempt will fail.
  5648. *
  5649. * returns: 0, if no pages reclaimed
  5650. * else, the number of pages reclaimed
  5651. */
  5652. static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
  5653. struct scan_control *sc)
  5654. {
  5655. int initial_priority = sc->priority;
  5656. pg_data_t *last_pgdat;
  5657. struct zoneref *z;
  5658. struct zone *zone;
  5659. modify_scan_control(sc);
  5660. retry:
  5661. delayacct_freepages_start();
  5662. if (!cgroup_reclaim(sc))
  5663. __count_zid_vm_events(ALLOCSTALL, sc->reclaim_idx, 1);
  5664. do {
  5665. if (!sc->proactive)
  5666. vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup,
  5667. sc->priority);
  5668. sc->nr_scanned = 0;
  5669. shrink_zones(zonelist, sc);
  5670. if (sc->nr_reclaimed >= sc->nr_to_reclaim)
  5671. break;
  5672. if (sc->compaction_ready)
  5673. break;
  5674. /*
  5675. * If we're getting trouble reclaiming, start doing
  5676. * writepage even in laptop mode.
  5677. */
  5678. if (sc->priority < DEF_PRIORITY - 2)
  5679. sc->may_writepage = 1;
  5680. } while (--sc->priority >= 0);
  5681. last_pgdat = NULL;
  5682. for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx,
  5683. sc->nodemask) {
  5684. if (zone->zone_pgdat == last_pgdat)
  5685. continue;
  5686. last_pgdat = zone->zone_pgdat;
  5687. snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat);
  5688. if (cgroup_reclaim(sc)) {
  5689. struct lruvec *lruvec;
  5690. lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup,
  5691. zone->zone_pgdat);
  5692. clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
  5693. }
  5694. }
  5695. delayacct_freepages_end();
  5696. if (sc->nr_reclaimed)
  5697. return sc->nr_reclaimed;
  5698. /* Aborted reclaim to try compaction? don't OOM, then */
  5699. if (sc->compaction_ready)
  5700. return 1;
  5701. /*
  5702. * We make inactive:active ratio decisions based on the node's
  5703. * composition of memory, but a restrictive reclaim_idx or a
  5704. * memory.low cgroup setting can exempt large amounts of
  5705. * memory from reclaim. Neither of which are very common, so
  5706. * instead of doing costly eligibility calculations of the
  5707. * entire cgroup subtree up front, we assume the estimates are
  5708. * good, and retry with forcible deactivation if that fails.
  5709. */
  5710. if (sc->skipped_deactivate) {
  5711. sc->priority = initial_priority;
  5712. sc->force_deactivate = 1;
  5713. sc->skipped_deactivate = 0;
  5714. goto retry;
  5715. }
  5716. /* Untapped cgroup reserves? Don't OOM, retry. */
  5717. if (sc->memcg_low_skipped) {
  5718. sc->priority = initial_priority;
  5719. sc->force_deactivate = 0;
  5720. sc->memcg_low_reclaim = 1;
  5721. sc->memcg_low_skipped = 0;
  5722. goto retry;
  5723. }
  5724. return 0;
  5725. }
  5726. static bool allow_direct_reclaim(pg_data_t *pgdat)
  5727. {
  5728. struct zone *zone;
  5729. unsigned long pfmemalloc_reserve = 0;
  5730. unsigned long free_pages = 0;
  5731. int i;
  5732. bool wmark_ok;
  5733. if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
  5734. return true;
  5735. for (i = 0; i <= ZONE_NORMAL; i++) {
  5736. zone = &pgdat->node_zones[i];
  5737. if (!managed_zone(zone))
  5738. continue;
  5739. if (!zone_reclaimable_pages(zone))
  5740. continue;
  5741. pfmemalloc_reserve += min_wmark_pages(zone);
  5742. free_pages += zone_page_state(zone, NR_FREE_PAGES);
  5743. }
  5744. /* If there are no reserves (unexpected config) then do not throttle */
  5745. if (!pfmemalloc_reserve)
  5746. return true;
  5747. wmark_ok = free_pages > pfmemalloc_reserve / 2;
  5748. /* kswapd must be awake if processes are being throttled */
  5749. if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
  5750. if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
  5751. WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
  5752. wake_up_interruptible(&pgdat->kswapd_wait);
  5753. }
  5754. return wmark_ok;
  5755. }
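/*
 * Example with hypothetical watermarks: if the ZONE_NORMAL-and-below
 * zones of this node have min watermarks summing to 8000 pages, direct
 * reclaimers are throttled once their combined free pages drop to 4000
 * or fewer (free_pages > pfmemalloc_reserve / 2 fails), and kswapd is
 * woken with its highest zone index capped at ZONE_NORMAL so it refills
 * the reserves that matter here.
 */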
  5756. /*
  5757. * Throttle direct reclaimers if backing storage is backed by the network
  5758. * and the PFMEMALLOC reserve for the preferred node is getting dangerously
  5759. * depleted. kswapd will continue to make progress and wake the processes
  5760. * when the low watermark is reached.
  5761. *
  5762. * Returns true if a fatal signal was delivered during throttling. If this
  5763. * happens, the page allocator should not consider triggering the OOM killer.
  5764. */
  5765. static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
  5766. nodemask_t *nodemask)
  5767. {
  5768. struct zoneref *z;
  5769. struct zone *zone;
  5770. pg_data_t *pgdat = NULL;
	/*
	 * Kernel threads should not be throttled as they may be indirectly
	 * responsible for cleaning pages necessary for reclaim to make forward
	 * progress. kjournald for example may enter direct reclaim while
	 * committing a transaction, where throttling it could force other
	 * processes to block on log_wait_commit().
	 */
  5778. if (current->flags & PF_KTHREAD)
  5779. goto out;
  5780. /*
  5781. * If a fatal signal is pending, this process should not throttle.
  5782. * It should return quickly so it can exit and free its memory
  5783. */
  5784. if (fatal_signal_pending(current))
  5785. goto out;
  5786. /*
  5787. * Check if the pfmemalloc reserves are ok by finding the first node
  5788. * with a usable ZONE_NORMAL or lower zone. The expectation is that
  5789. * GFP_KERNEL will be required for allocating network buffers when
  5790. * swapping over the network so ZONE_HIGHMEM is unusable.
  5791. *
  5792. * Throttling is based on the first usable node and throttled processes
  5793. * wait on a queue until kswapd makes progress and wakes them. There
  5794. * is an affinity then between processes waking up and where reclaim
  5795. * progress has been made assuming the process wakes on the same node.
  5796. * More importantly, processes running on remote nodes will not compete
  5797. * for remote pfmemalloc reserves and processes on different nodes
  5798. * should make reasonable progress.
  5799. */
  5800. for_each_zone_zonelist_nodemask(zone, z, zonelist,
  5801. gfp_zone(gfp_mask), nodemask) {
  5802. if (zone_idx(zone) > ZONE_NORMAL)
  5803. continue;
  5804. /* Throttle based on the first usable node */
  5805. pgdat = zone->zone_pgdat;
  5806. if (allow_direct_reclaim(pgdat))
  5807. goto out;
  5808. break;
  5809. }
  5810. /* If no zone was usable by the allocation flags then do not throttle */
  5811. if (!pgdat)
  5812. goto out;
  5813. /* Account for the throttling */
  5814. count_vm_event(PGSCAN_DIRECT_THROTTLE);
  5815. /*
  5816. * If the caller cannot enter the filesystem, it's possible that it
  5817. * is due to the caller holding an FS lock or performing a journal
  5818. * transaction in the case of a filesystem like ext[3|4]. In this case,
  5819. * it is not safe to block on pfmemalloc_wait as kswapd could be
  5820. * blocked waiting on the same lock. Instead, throttle for up to a
  5821. * second before continuing.
  5822. */
  5823. if (!(gfp_mask & __GFP_FS))
  5824. wait_event_interruptible_timeout(pgdat->pfmemalloc_wait,
  5825. allow_direct_reclaim(pgdat), HZ);
  5826. else
  5827. /* Throttle until kswapd wakes the process */
  5828. wait_event_killable(zone->zone_pgdat->pfmemalloc_wait,
  5829. allow_direct_reclaim(pgdat));
  5830. if (fatal_signal_pending(current))
  5831. return true;
  5832. out:
  5833. return false;
  5834. }
  5835. unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
  5836. gfp_t gfp_mask, nodemask_t *nodemask)
  5837. {
  5838. unsigned long nr_reclaimed;
  5839. struct scan_control sc = {
  5840. .nr_to_reclaim = SWAP_CLUSTER_MAX,
  5841. .gfp_mask = current_gfp_context(gfp_mask),
  5842. .reclaim_idx = gfp_zone(gfp_mask),
  5843. .order = order,
  5844. .nodemask = nodemask,
  5845. .priority = DEF_PRIORITY,
  5846. .may_writepage = !laptop_mode,
  5847. .may_unmap = 1,
  5848. .may_swap = 1,
  5849. };
  5850. /*
  5851. * scan_control uses s8 fields for order, priority, and reclaim_idx.
  5852. * Confirm they are large enough for max values.
  5853. */
  5854. BUILD_BUG_ON(MAX_ORDER > S8_MAX);
  5855. BUILD_BUG_ON(DEF_PRIORITY > S8_MAX);
  5856. BUILD_BUG_ON(MAX_NR_ZONES > S8_MAX);
  5857. /*
  5858. * Do not enter reclaim if fatal signal was delivered while throttled.
  5859. * 1 is returned so that the page allocator does not OOM kill at this
  5860. * point.
  5861. */
  5862. if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
  5863. return 1;
  5864. set_task_reclaim_state(current, &sc.reclaim_state);
  5865. trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
  5866. nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
  5867. trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
  5868. set_task_reclaim_state(current, NULL);
  5869. return nr_reclaimed;
  5870. }
  5871. #ifdef CONFIG_MEMCG
  5872. /* Only used by soft limit reclaim. Do not reuse for anything else. */
  5873. unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
  5874. gfp_t gfp_mask, bool noswap,
  5875. pg_data_t *pgdat,
  5876. unsigned long *nr_scanned)
  5877. {
  5878. struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
  5879. struct scan_control sc = {
  5880. .nr_to_reclaim = SWAP_CLUSTER_MAX,
  5881. .target_mem_cgroup = memcg,
  5882. .may_writepage = !laptop_mode,
  5883. .may_unmap = 1,
  5884. .reclaim_idx = MAX_NR_ZONES - 1,
  5885. .may_swap = !noswap,
  5886. };
  5887. WARN_ON_ONCE(!current->reclaim_state);
  5888. sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
  5889. (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
  5890. trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
  5891. sc.gfp_mask);
	/*
	 * NOTE: Although we can get the priority field, using it
	 * here is not a good idea, since it limits the pages we can scan.
	 * If we don't reclaim here, the shrink_node from balance_pgdat
	 * will pick up pages from other mem cgroups as well. We hack
	 * the priority and make it zero.
	 */
  5899. shrink_lruvec(lruvec, &sc);
  5900. trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
  5901. *nr_scanned = sc.nr_scanned;
  5902. return sc.nr_reclaimed;
  5903. }
  5904. unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
  5905. unsigned long nr_pages,
  5906. gfp_t gfp_mask,
  5907. unsigned int reclaim_options)
  5908. {
  5909. unsigned long nr_reclaimed;
  5910. unsigned int noreclaim_flag;
  5911. struct scan_control sc = {
  5912. .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
  5913. .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) |
  5914. (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
  5915. .reclaim_idx = MAX_NR_ZONES - 1,
  5916. .target_mem_cgroup = memcg,
  5917. .priority = DEF_PRIORITY,
  5918. .may_writepage = !laptop_mode,
  5919. .may_unmap = 1,
  5920. .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
  5921. .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
  5922. };
  5923. /*
  5924. * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
  5925. * equal pressure on all the nodes. This is based on the assumption that
  5926. * the reclaim does not bail out early.
  5927. */
  5928. struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
  5929. set_task_reclaim_state(current, &sc.reclaim_state);
  5930. trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
  5931. noreclaim_flag = memalloc_noreclaim_save();
  5932. nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
  5933. memalloc_noreclaim_restore(noreclaim_flag);
  5934. trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
  5935. set_task_reclaim_state(current, NULL);
  5936. return nr_reclaimed;
  5937. }
  5938. EXPORT_SYMBOL_GPL(try_to_free_mem_cgroup_pages);
  5939. #endif
  5940. static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc)
  5941. {
  5942. struct mem_cgroup *memcg;
  5943. struct lruvec *lruvec;
  5944. if (lru_gen_enabled()) {
  5945. lru_gen_age_node(pgdat, sc);
  5946. return;
  5947. }
  5948. if (!can_age_anon_pages(pgdat, sc))
  5949. return;
  5950. lruvec = mem_cgroup_lruvec(NULL, pgdat);
  5951. if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON))
  5952. return;
  5953. memcg = mem_cgroup_iter(NULL, NULL, NULL);
  5954. do {
  5955. lruvec = mem_cgroup_lruvec(memcg, pgdat);
  5956. shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
  5957. sc, LRU_ACTIVE_ANON);
  5958. memcg = mem_cgroup_iter(NULL, memcg, NULL);
  5959. } while (memcg);
  5960. }
  5961. static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
  5962. {
  5963. int i;
  5964. struct zone *zone;
  5965. /*
  5966. * Check for watermark boosts top-down as the higher zones
  5967. * are more likely to be boosted. Both watermarks and boosts
  5968. * should not be checked at the same time as reclaim would
  5969. * start prematurely when there is no boosting and a lower
  5970. * zone is balanced.
  5971. */
  5972. for (i = highest_zoneidx; i >= 0; i--) {
  5973. zone = pgdat->node_zones + i;
  5974. if (!managed_zone(zone))
  5975. continue;
  5976. if (zone->watermark_boost)
  5977. return true;
  5978. }
  5979. return false;
  5980. }
  5981. /*
  5982. * Returns true if there is an eligible zone balanced for the request order
  5983. * and highest_zoneidx
  5984. */
  5985. static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
  5986. {
  5987. int i;
  5988. unsigned long mark = -1;
  5989. struct zone *zone;
  5990. /*
  5991. * Check watermarks bottom-up as lower zones are more likely to
  5992. * meet watermarks.
  5993. */
  5994. for (i = 0; i <= highest_zoneidx; i++) {
  5995. zone = pgdat->node_zones + i;
  5996. if (!managed_zone(zone))
  5997. continue;
  5998. if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
  5999. mark = wmark_pages(zone, WMARK_PROMO);
  6000. else
  6001. mark = high_wmark_pages(zone);
  6002. if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
  6003. return true;
  6004. }
  6005. /*
  6006. * If a node has no managed zone within highest_zoneidx, it does not
  6007. * need balancing by definition. This can happen if a zone-restricted
  6008. * allocation tries to wake a remote kswapd.
  6009. */
  6010. if (mark == -1)
  6011. return true;
  6012. return false;
  6013. }
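/*
 * Example: for an order-0 request with highest_zoneidx == ZONE_MOVABLE,
 * the node counts as balanced as soon as any managed zone at or below
 * ZONE_MOVABLE clears its high (or, with memory tiering, promo)
 * watermark; only when every eligible zone fails the check does kswapd
 * keep reclaiming.
 */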
  6014. /* Clear pgdat state for congested, dirty or under writeback. */
  6015. static void clear_pgdat_congested(pg_data_t *pgdat)
  6016. {
  6017. struct lruvec *lruvec = mem_cgroup_lruvec(NULL, pgdat);
  6018. clear_bit(LRUVEC_CONGESTED, &lruvec->flags);
  6019. clear_bit(PGDAT_DIRTY, &pgdat->flags);
  6020. clear_bit(PGDAT_WRITEBACK, &pgdat->flags);
  6021. }
  6022. /*
  6023. * Prepare kswapd for sleeping. This verifies that there are no processes
  6024. * waiting in throttle_direct_reclaim() and that watermarks have been met.
  6025. *
  6026. * Returns true if kswapd is ready to sleep
  6027. */
  6028. static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
  6029. int highest_zoneidx)
  6030. {
  6031. /*
  6032. * The throttled processes are normally woken up in balance_pgdat() as
  6033. * soon as allow_direct_reclaim() is true. But there is a potential
  6034. * race between when kswapd checks the watermarks and a process gets
  6035. * throttled. There is also a potential race if processes get
  6036. * throttled, kswapd wakes, a large process exits thereby balancing the
  6037. * zones, which causes kswapd to exit balance_pgdat() before reaching
  6038. * the wake up checks. If kswapd is going to sleep, no process should
  6039. * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
  6040. * the wake up is premature, processes will wake kswapd and get
  6041. * throttled again. The difference from wake ups in balance_pgdat() is
  6042. * that here we are under prepare_to_wait().
  6043. */
  6044. if (waitqueue_active(&pgdat->pfmemalloc_wait))
  6045. wake_up_all(&pgdat->pfmemalloc_wait);
  6046. /* Hopeless node, leave it to direct reclaim */
  6047. if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
  6048. return true;
  6049. if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
  6050. clear_pgdat_congested(pgdat);
  6051. return true;
  6052. }
  6053. return false;
  6054. }
  6055. /*
  6056. * kswapd shrinks a node of pages that are at or below the highest usable
  6057. * zone that is currently unbalanced.
  6058. *
  6059. * Returns true if kswapd scanned at least the requested number of pages to
  6060. * reclaim or if the lack of progress was due to pages under writeback.
  6061. * This is used to determine if the scanning priority needs to be raised.
  6062. */
  6063. static bool kswapd_shrink_node(pg_data_t *pgdat,
  6064. struct scan_control *sc)
  6065. {
  6066. struct zone *zone;
  6067. int z;
  6068. /* Reclaim a number of pages proportional to the number of zones */
  6069. sc->nr_to_reclaim = 0;
  6070. for (z = 0; z <= sc->reclaim_idx; z++) {
  6071. zone = pgdat->node_zones + z;
  6072. if (!managed_zone(zone))
  6073. continue;
  6074. sc->nr_to_reclaim += max(high_wmark_pages(zone), SWAP_CLUSTER_MAX);
  6075. }
  6076. /*
  6077. * Historically care was taken to put equal pressure on all zones but
  6078. * now pressure is applied based on node LRU order.
  6079. */
  6080. shrink_node(pgdat, sc);
	/*
	 * Fragmentation may mean that the system cannot be rebalanced for
	 * high-order allocations. If twice the allocation size has been
	 * reclaimed then recheck watermarks only at order-0 to prevent
	 * excessive reclaim. Assume that a process that requested a
	 * high-order allocation can itself direct reclaim/compact.
	 */
  6088. if (sc->order && sc->nr_reclaimed >= compact_gap(sc->order))
  6089. sc->order = 0;
  6090. return sc->nr_scanned >= sc->nr_to_reclaim;
  6091. }
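/*
 * Example with hypothetical watermarks, assuming SWAP_CLUSTER_MAX == 32
 * and compact_gap(order) == 2UL << order: if the eligible zones' high
 * watermarks are 12288 and 192 pages, nr_to_reclaim becomes
 * 12288 + max(192, 32) = 12480 for this pass, and an order-3 request
 * that has already reclaimed compact_gap(3) == 16 pages drops back to
 * rechecking watermarks at order-0 only.
 */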
  6092. /* Page allocator PCP high watermark is lowered if reclaim is active. */
  6093. static inline void
  6094. update_reclaim_active(pg_data_t *pgdat, int highest_zoneidx, bool active)
  6095. {
  6096. int i;
  6097. struct zone *zone;
  6098. for (i = 0; i <= highest_zoneidx; i++) {
  6099. zone = pgdat->node_zones + i;
  6100. if (!managed_zone(zone))
  6101. continue;
  6102. if (active)
  6103. set_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
  6104. else
  6105. clear_bit(ZONE_RECLAIM_ACTIVE, &zone->flags);
  6106. }
  6107. }
  6108. static inline void
  6109. set_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
  6110. {
  6111. update_reclaim_active(pgdat, highest_zoneidx, true);
  6112. }
  6113. static inline void
  6114. clear_reclaim_active(pg_data_t *pgdat, int highest_zoneidx)
  6115. {
  6116. update_reclaim_active(pgdat, highest_zoneidx, false);
  6117. }
  6118. /*
  6119. * For kswapd, balance_pgdat() will reclaim pages across a node from zones
  6120. * that are eligible for use by the caller until at least one zone is
  6121. * balanced.
  6122. *
  6123. * Returns the order kswapd finished reclaiming at.
  6124. *
  6125. * kswapd scans the zones in the highmem->normal->dma direction. It skips
  6126. * zones which have free_pages > high_wmark_pages(zone), but once a zone is
  6127. * found to have free_pages <= high_wmark_pages(zone), any page in that zone
  6128. * or lower is eligible for reclaim until at least one usable zone is
  6129. * balanced.
  6130. */
  6131. static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
  6132. {
  6133. int i;
  6134. unsigned long nr_soft_reclaimed;
  6135. unsigned long nr_soft_scanned;
  6136. unsigned long pflags;
  6137. unsigned long nr_boost_reclaim;
  6138. unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
  6139. bool boosted;
  6140. struct zone *zone;
  6141. struct scan_control sc = {
  6142. .gfp_mask = GFP_KERNEL,
  6143. .order = order,
  6144. .may_unmap = 1,
  6145. };
  6146. set_task_reclaim_state(current, &sc.reclaim_state);
  6147. psi_memstall_enter(&pflags);
  6148. __fs_reclaim_acquire(_THIS_IP_);
  6149. count_vm_event(PAGEOUTRUN);
  6150. /*
  6151. * Account for the reclaim boost. Note that the zone boost is left in
  6152. * place so that parallel allocations that are near the watermark will
  6153. * stall or direct reclaim until kswapd is finished.
  6154. */
  6155. nr_boost_reclaim = 0;
  6156. for (i = 0; i <= highest_zoneidx; i++) {
  6157. zone = pgdat->node_zones + i;
  6158. if (!managed_zone(zone))
  6159. continue;
  6160. nr_boost_reclaim += zone->watermark_boost;
  6161. zone_boosts[i] = zone->watermark_boost;
  6162. }
  6163. boosted = nr_boost_reclaim;
  6164. restart:
  6165. set_reclaim_active(pgdat, highest_zoneidx);
  6166. sc.priority = DEF_PRIORITY;
  6167. do {
  6168. unsigned long nr_reclaimed = sc.nr_reclaimed;
  6169. bool raise_priority = true;
  6170. bool balanced;
  6171. bool ret;
  6172. sc.reclaim_idx = highest_zoneidx;
  6173. /*
  6174. * If the number of buffer_heads exceeds the maximum allowed
  6175. * then consider reclaiming from all zones. This has a dual
  6176. * purpose -- on 64-bit systems it is expected that
  6177. * buffer_heads are stripped during active rotation. On 32-bit
  6178. * systems, highmem pages can pin lowmem memory and shrinking
  6179. * buffers can relieve lowmem pressure. Reclaim may still not
  6180. * go ahead if all eligible zones for the original allocation
  6181. * request are balanced to avoid excessive reclaim from kswapd.
  6182. */
  6183. if (buffer_heads_over_limit) {
  6184. for (i = MAX_NR_ZONES - 1; i >= 0; i--) {
  6185. zone = pgdat->node_zones + i;
  6186. if (!managed_zone(zone))
  6187. continue;
  6188. sc.reclaim_idx = i;
  6189. break;
  6190. }
  6191. }
  6192. /*
  6193. * If the pgdat is imbalanced then ignore boosting and preserve
  6194. * the watermarks for a later time and restart. Note that the
  6195. * zone watermarks will be still reset at the end of balancing
  6196. * on the grounds that the normal reclaim should be enough to
  6197. * re-evaluate if boosting is required when kswapd next wakes.
  6198. */
  6199. balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
  6200. if (!balanced && nr_boost_reclaim) {
  6201. nr_boost_reclaim = 0;
  6202. goto restart;
  6203. }
  6204. /*
  6205. * If boosting is not active then only reclaim if there are no
  6206. * eligible zones. Note that sc.reclaim_idx is not used as
  6207. * buffer_heads_over_limit may have adjusted it.
  6208. */
  6209. if (!nr_boost_reclaim && balanced)
  6210. goto out;
  6211. /* Limit the priority of boosting to avoid reclaim writeback */
  6212. if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
  6213. raise_priority = false;
  6214. /*
  6215. * Do not writeback or swap pages for boosted reclaim. The
  6216. * intent is to relieve pressure not issue sub-optimal IO
  6217. * from reclaim context. If no pages are reclaimed, the
  6218. * reclaim will be aborted.
  6219. */
  6220. sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
  6221. sc.may_swap = !nr_boost_reclaim;
  6222. /*
  6223. * Do some background aging, to give pages a chance to be
  6224. * referenced before reclaiming. All pages are rotated
  6225. * regardless of classzone as this is about consistent aging.
  6226. */
  6227. kswapd_age_node(pgdat, &sc);
  6228. /*
  6229. * If we're getting trouble reclaiming, start doing writepage
  6230. * even in laptop mode.
  6231. */
  6232. if (sc.priority < DEF_PRIORITY - 2)
  6233. sc.may_writepage = 1;
  6234. /* Call soft limit reclaim before calling shrink_node. */
  6235. sc.nr_scanned = 0;
  6236. nr_soft_scanned = 0;
  6237. nr_soft_reclaimed = mem_cgroup_soft_limit_reclaim(pgdat, sc.order,
  6238. sc.gfp_mask, &nr_soft_scanned);
  6239. sc.nr_reclaimed += nr_soft_reclaimed;
		/*
		 * There should be no need to raise the scanning priority if
		 * enough pages are already being scanned that the high
		 * watermark would be met at 100% efficiency.
		 */
  6245. if (kswapd_shrink_node(pgdat, &sc))
  6246. raise_priority = false;
		/*
		 * If the low watermark is met there is no need for processes
		 * to be throttled on pfmemalloc_wait as they should now be
		 * able to safely make forward progress. Wake them.
		 */
  6252. if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
  6253. allow_direct_reclaim(pgdat))
  6254. wake_up_all(&pgdat->pfmemalloc_wait);
  6255. /* Check if kswapd should be suspending */
  6256. __fs_reclaim_release(_THIS_IP_);
  6257. ret = try_to_freeze();
  6258. __fs_reclaim_acquire(_THIS_IP_);
  6259. if (ret || kthread_should_stop())
  6260. break;
  6261. /*
  6262. * Raise priority if scanning rate is too low or there was no
  6263. * progress in reclaiming pages
  6264. */
  6265. nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
  6266. nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
  6267. /*
  6268. * If reclaim made no progress for a boost, stop reclaim as
  6269. * IO cannot be queued and it could be an infinite loop in
  6270. * extreme circumstances.
  6271. */
  6272. if (nr_boost_reclaim && !nr_reclaimed)
  6273. break;
  6274. if (raise_priority || !nr_reclaimed)
  6275. sc.priority--;
  6276. } while (sc.priority >= 1);
  6277. if (!sc.nr_reclaimed)
  6278. pgdat->kswapd_failures++;
  6279. out:
  6280. clear_reclaim_active(pgdat, highest_zoneidx);
  6281. /* If reclaim was boosted, account for the reclaim done in this pass */
  6282. if (boosted) {
  6283. unsigned long flags;
  6284. for (i = 0; i <= highest_zoneidx; i++) {
  6285. if (!zone_boosts[i])
  6286. continue;
  6287. /* Increments are under the zone lock */
  6288. zone = pgdat->node_zones + i;
  6289. spin_lock_irqsave(&zone->lock, flags);
  6290. zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
  6291. spin_unlock_irqrestore(&zone->lock, flags);
  6292. }
  6293. /*
  6294. * As there is now likely space, wakeup kcompact to defragment
  6295. * pageblocks.
  6296. */
  6297. wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
  6298. }
  6299. snapshot_refaults(NULL, pgdat);
  6300. __fs_reclaim_release(_THIS_IP_);
  6301. psi_memstall_leave(&pflags);
  6302. set_task_reclaim_state(current, NULL);
  6303. /*
  6304. * Return the order kswapd stopped reclaiming at as
  6305. * prepare_kswapd_sleep() takes it into account. If another caller
  6306. * entered the allocator slow path while kswapd was awake, order will
  6307. * remain at the higher level.
  6308. */
  6309. return sc.order;
  6310. }
/*
 * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
 * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
 * not a valid index then either kswapd is running for the first time or kswapd
 * couldn't sleep after the previous reclaim attempt (the node is still
 * unbalanced). In that case return the zone index of the previous kswapd
 * reclaim cycle.
 */
  6318. static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
  6319. enum zone_type prev_highest_zoneidx)
  6320. {
  6321. enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
  6322. return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
  6323. }
  6324. static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
  6325. unsigned int highest_zoneidx)
  6326. {
  6327. long remaining = 0;
  6328. DEFINE_WAIT(wait);
  6329. if (freezing(current) || kthread_should_stop())
  6330. return;
  6331. prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
  6332. /*
  6333. * Try to sleep for a short interval. Note that kcompactd will only be
  6334. * woken if it is possible to sleep for a short interval. This is
  6335. * deliberate on the assumption that if reclaim cannot keep an
  6336. * eligible zone balanced that it's also unlikely that compaction will
  6337. * succeed.
  6338. */
  6339. if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
		/*
		 * Compaction records which pageblocks it recently failed to
		 * isolate pages from and skips them in future scans.
		 * When kswapd is going to sleep, it is reasonable to assume
		 * that pages have been freed and compaction may succeed, so
		 * reset the cache.
		 */
  6346. reset_isolation_suitable(pgdat);
  6347. /*
  6348. * We have freed the memory, now we should compact it to make
  6349. * allocation of the requested order possible.
  6350. */
  6351. wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
  6352. remaining = schedule_timeout(HZ/10);
  6353. /*
  6354. * If woken prematurely then reset kswapd_highest_zoneidx and
  6355. * order. The values will either be from a wakeup request or
  6356. * the previous request that slept prematurely.
  6357. */
  6358. if (remaining) {
  6359. WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
  6360. kswapd_highest_zoneidx(pgdat,
  6361. highest_zoneidx));
  6362. if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
  6363. WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
  6364. }
  6365. finish_wait(&pgdat->kswapd_wait, &wait);
  6366. prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
  6367. }
  6368. /*
  6369. * After a short sleep, check if it was a premature sleep. If not, then
  6370. * go fully to sleep until explicitly woken up.
  6371. */
  6372. if (!remaining &&
  6373. prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
  6374. trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
  6375. /*
  6376. * vmstat counters are not perfectly accurate and the estimated
  6377. * value for counters such as NR_FREE_PAGES can deviate from the
  6378. * true value by nr_online_cpus * threshold. To avoid the zone
  6379. * watermarks being breached while under pressure, we reduce the
  6380. * per-cpu vmstat threshold while kswapd is awake and restore
  6381. * them before going back to sleep.
  6382. */
  6383. set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
  6384. if (!kthread_should_stop())
  6385. schedule();
  6386. set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);
  6387. } else {
  6388. if (remaining)
  6389. count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY);
  6390. else
  6391. count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY);
  6392. }
  6393. finish_wait(&pgdat->kswapd_wait, &wait);
  6394. }
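/*
 * Timing note: the provisional nap above is HZ/10, roughly 100ms. Being
 * woken inside that window bumps KSWAPD_LOW_WMARK_HIT_QUICKLY; finishing
 * the nap but finding the node unbalanced again bumps
 * KSWAPD_HIGH_WMARK_HIT_QUICKLY. Only when the second check passes does
 * kswapd restore the normal per-cpu vmstat thresholds and sleep until the
 * next wakeup_kswapd() call.
 */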

/*
 * The background pageout daemon, started as a kernel thread
 * from the init process.
 *
 * This basically trickles out pages so that we have _some_
 * free memory available even if there is no other activity
 * that frees anything up. This is needed for things like routing
 * etc, where we otherwise might have all activity going on in
 * asynchronous contexts that cannot page things out.
 *
 * If there are applications that are active memory-allocators
 * (most normal use), this basically shouldn't matter.
 */
int kswapd(void *p)
{
        unsigned int alloc_order, reclaim_order;
        unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
        pg_data_t *pgdat = (pg_data_t *)p;
        struct task_struct *tsk = current;
        const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);

        if (!cpumask_empty(cpumask))
                set_cpus_allowed_ptr(tsk, cpumask);

        /*
         * Tell the memory management that we're a "memory allocator",
         * and that if we need more memory we should get access to it
         * regardless (see "__alloc_pages()"). "kswapd" should
         * never get caught in the normal page freeing logic.
         *
         * (Kswapd normally doesn't need memory anyway, but sometimes
         * you need a small amount of memory in order to be able to
         * page out something else, and this flag essentially protects
         * us from recursively trying to free more memory as we're
         * trying to free the first piece of memory in the first place).
         */
        tsk->flags |= PF_MEMALLOC | PF_KSWAPD;
        set_freezable();

        WRITE_ONCE(pgdat->kswapd_order, 0);
        WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
        atomic_set(&pgdat->nr_writeback_throttled, 0);
        for ( ; ; ) {
                bool ret;

                alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
                highest_zoneidx = kswapd_highest_zoneidx(pgdat,
                                                         highest_zoneidx);

kswapd_try_sleep:
                kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
                                    highest_zoneidx);

                /* Read the new order and highest_zoneidx */
                alloc_order = READ_ONCE(pgdat->kswapd_order);
                highest_zoneidx = kswapd_highest_zoneidx(pgdat,
                                                         highest_zoneidx);
                WRITE_ONCE(pgdat->kswapd_order, 0);
                WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);

                ret = try_to_freeze();
                if (kthread_should_stop())
                        break;

                /*
                 * We can speed up thawing tasks if we don't call balance_pgdat
                 * after returning from the refrigerator
                 */
                if (ret)
                        continue;

                /*
                 * Reclaim begins at the requested order but if a high-order
                 * reclaim fails then kswapd falls back to reclaiming for
                 * order-0. If that happens, kswapd will consider sleeping
                 * for the order it finished reclaiming at (reclaim_order)
                 * but kcompactd is woken to compact for the original
                 * request (alloc_order).
                 */
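                /*
                 * Illustrative example: for a THP-sized request alloc_order
                 * is 9. If balance_pgdat() gives up on order-9 and comes back
                 * with reclaim_order == 0, the check below sends kswapd back
                 * to sleep for order-0 while kcompactd keeps working on the
                 * original order-9 request.
                 */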
                trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
                                            alloc_order);
                reclaim_order = balance_pgdat(pgdat, alloc_order,
                                              highest_zoneidx);
                trace_android_vh_vmscan_kswapd_done(pgdat->node_id, highest_zoneidx,
                                                    alloc_order, reclaim_order);
                if (reclaim_order < alloc_order)
                        goto kswapd_try_sleep;
        }

        tsk->flags &= ~(PF_MEMALLOC | PF_KSWAPD);

        return 0;
}
EXPORT_SYMBOL_GPL(kswapd);

/*
 * A zone is low on free memory or too fragmented for high-order memory. If
 * kswapd should reclaim (direct reclaim is deferred), wake it up for the zone's
 * pgdat. It will wake up kcompactd after reclaiming memory. If kswapd reclaim
 * has failed or is not needed, still wake up kcompactd if only compaction is
 * needed.
 */
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
                   enum zone_type highest_zoneidx)
{
        pg_data_t *pgdat;
        enum zone_type curr_idx;

        if (!managed_zone(zone))
                return;

        if (!cpuset_zone_allowed(zone, gfp_flags))
                return;

        pgdat = zone->zone_pgdat;
        curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);

        if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
                WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);

        if (READ_ONCE(pgdat->kswapd_order) < order)
                WRITE_ONCE(pgdat->kswapd_order, order);

        if (!waitqueue_active(&pgdat->kswapd_wait))
                return;

        /* Hopeless node, leave it to direct reclaim if possible */
        if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
            (pgdat_balanced(pgdat, order, highest_zoneidx) &&
             !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {

                /*
                 * There may be plenty of free memory available, but it's too
                 * fragmented for high-order allocations. Wake up kcompactd
                 * and rely on compaction_suitable() to determine if it's
                 * needed. If it fails, it will defer subsequent attempts to
                 * ratelimit its work.
                 */
                if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
                        wakeup_kcompactd(pgdat, order, highest_zoneidx);
                return;
        }

        trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
                                      gfp_flags);
        wake_up_interruptible(&pgdat->kswapd_wait);
}
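
/*
 * Usage sketch (illustrative only; the real call sites live in the page
 * allocator slow path, which walks the zonelist):
 *
 *      for_each_zone_zonelist_nodemask(zone, z, zonelist, highest_zoneidx, nodemask)
 *              wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
 *
 * Note that the pgdat order/zoneidx hints are only ever raised here and are
 * reset by kswapd itself once it has picked them up.
 */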

#ifdef CONFIG_HIBERNATION
/*
 * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of
 * freed pages.
 *
 * Rather than trying to age LRUs the aim is to preserve the overall
 * LRU order by reclaiming preferentially
 * inactive > active > active referenced > active mapped
 */
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
        struct scan_control sc = {
                .nr_to_reclaim = nr_to_reclaim,
                .gfp_mask = GFP_HIGHUSER_MOVABLE,
                .reclaim_idx = MAX_NR_ZONES - 1,
                .priority = DEF_PRIORITY,
                .may_writepage = 1,
                .may_unmap = 1,
                .may_swap = 1,
                .hibernation_mode = 1,
        };
        struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
        unsigned long nr_reclaimed;
        unsigned int noreclaim_flag;

        fs_reclaim_acquire(sc.gfp_mask);
        noreclaim_flag = memalloc_noreclaim_save();
        set_task_reclaim_state(current, &sc.reclaim_state);
        nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
        set_task_reclaim_state(current, NULL);
        memalloc_noreclaim_restore(noreclaim_flag);
        fs_reclaim_release(sc.gfp_mask);

        return nr_reclaimed;
}
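
/*
 * Illustrative caller (names are placeholders): the hibernation core uses
 * this to make room for the snapshot image, roughly along the lines of
 *
 *      to_free = saveable_pages - target_image_size;
 *      freed = shrink_all_memory(to_free);
 *
 * See kernel/power/snapshot.c for the real preallocation logic.
 */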
#endif /* CONFIG_HIBERNATION */

/*
 * This kswapd start function will be called by init and node-hot-add.
 */
void kswapd_run(int nid)
{
        pg_data_t *pgdat = NODE_DATA(nid);
        bool skip = false;

        pgdat_kswapd_lock(pgdat);
        if (!pgdat->kswapd) {
                trace_android_vh_kswapd_per_node(nid, &skip, true);
                if (skip) {
                        pgdat_kswapd_unlock(pgdat);
                        return;
                }
                pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
                if (IS_ERR(pgdat->kswapd)) {
                        /* failure at boot is fatal */
                        BUG_ON(system_state < SYSTEM_RUNNING);
                        pr_err("Failed to start kswapd on node %d\n", nid);
                        pgdat->kswapd = NULL;
                }
        }
        pgdat_kswapd_unlock(pgdat);
}

/*
 * Called by memory hotplug when all memory in a node is offlined. Caller must
 * be holding mem_hotplug_begin/done().
 */
void kswapd_stop(int nid)
{
        pg_data_t *pgdat = NODE_DATA(nid);
        struct task_struct *kswapd;
        bool skip = false;

        pgdat_kswapd_lock(pgdat);
        kswapd = pgdat->kswapd;
        trace_android_vh_kswapd_per_node(nid, &skip, false);
        if (skip) {
                pgdat_kswapd_unlock(pgdat);
                return;
        }
        if (kswapd) {
                kthread_stop(kswapd);
                pgdat->kswapd = NULL;
        }
        pgdat_kswapd_unlock(pgdat);
}

static int __init kswapd_init(void)
{
        int nid;

        swap_setup();
        for_each_node_state(nid, N_MEMORY)
                kswapd_run(nid);
        return 0;
}

module_init(kswapd_init)

#ifdef CONFIG_NUMA
/*
 * Node reclaim mode
 *
 * If non-zero call node_reclaim when the number of free pages falls below
 * the watermarks.
 */
int node_reclaim_mode __read_mostly;
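
/*
 * Example (illustrative): the mode is set via /proc/sys/vm/zone_reclaim_mode.
 * Writing 1 enables plain node reclaim, while additionally setting bit 1
 * (RECLAIM_WRITE) allows writing out dirty pages and bit 2 (RECLAIM_UNMAP)
 * allows unmapping mapped pages during node reclaim.
 */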

/*
 * Priority for NODE_RECLAIM. This determines the fraction of pages
 * of a node considered for each zone_reclaim. 4 scans 1/16th of
 * a zone.
 */
#define NODE_RECLAIM_PRIORITY 4
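
/*
 * Roughly speaking, each pass considers on the order of (LRU size) >> priority
 * pages, so a starting priority of 4 looks at about 1/16th of the LRUs and the
 * window doubles each time the priority drops towards 0 (see __node_reclaim()
 * below).
 */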

/*
 * Percentage of pages in a zone that must be unmapped for node_reclaim to
 * occur.
 */
int sysctl_min_unmapped_ratio = 1;

/*
 * If the number of slab pages in a zone grows beyond this percentage then
 * slab reclaim needs to occur.
 */
int sysctl_min_slab_ratio = 5;

static inline unsigned long node_unmapped_file_pages(struct pglist_data *pgdat)
{
        unsigned long file_mapped = node_page_state(pgdat, NR_FILE_MAPPED);
        unsigned long file_lru = node_page_state(pgdat, NR_INACTIVE_FILE) +
                node_page_state(pgdat, NR_ACTIVE_FILE);

        /*
         * It's possible for there to be more file mapped pages than
         * accounted for by the pages on the file LRU lists because
         * tmpfs pages accounted for as ANON can also be FILE_MAPPED
         */
        return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
}
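
/*
 * Worked example (illustrative numbers): with 10000 pages on the file LRUs
 * and 3000 NR_FILE_MAPPED pages, 7000 pages count as unmapped file cache.
 * If mapped tmpfs pages push NR_FILE_MAPPED above the file LRU total, the
 * result clamps to 0 rather than underflowing.
 */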

/* Work out how many page cache pages we can reclaim in this reclaim_mode */
static unsigned long node_pagecache_reclaimable(struct pglist_data *pgdat)
{
        unsigned long nr_pagecache_reclaimable;
        unsigned long delta = 0;

        /*
         * If RECLAIM_UNMAP is set, then all file pages are considered
         * potentially reclaimable. Otherwise, we have to worry about
         * pages like swapcache and node_unmapped_file_pages() provides
         * a better estimate
         */
        if (node_reclaim_mode & RECLAIM_UNMAP)
                nr_pagecache_reclaimable = node_page_state(pgdat, NR_FILE_PAGES);
        else
                nr_pagecache_reclaimable = node_unmapped_file_pages(pgdat);

        /* If we can't clean pages, remove dirty pages from consideration */
        if (!(node_reclaim_mode & RECLAIM_WRITE))
                delta += node_page_state(pgdat, NR_FILE_DIRTY);

        /* Watch for any possible underflows due to delta */
        if (unlikely(delta > nr_pagecache_reclaimable))
                delta = nr_pagecache_reclaimable;

        return nr_pagecache_reclaimable - delta;
}
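
/*
 * Worked example (illustrative numbers): without RECLAIM_UNMAP the estimate
 * starts from the unmapped file pages above, say 7000; without RECLAIM_WRITE
 * a further 2000 dirty pages are subtracted, leaving 5000 pages that node
 * reclaim treats as reclaimable.
 */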

/*
 * Try to free up some pages from this node through reclaim.
 */
static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
        /* Minimum pages needed in order to stay on node */
        const unsigned long nr_pages = 1 << order;
        struct task_struct *p = current;
        unsigned int noreclaim_flag;
        struct scan_control sc = {
                .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
                .gfp_mask = current_gfp_context(gfp_mask),
                .order = order,
                .priority = NODE_RECLAIM_PRIORITY,
                .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE),
                .may_unmap = !!(node_reclaim_mode & RECLAIM_UNMAP),
                .may_swap = 1,
                .reclaim_idx = gfp_zone(gfp_mask),
        };
        unsigned long pflags;

        trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
                                           sc.gfp_mask);

        cond_resched();
        psi_memstall_enter(&pflags);
        fs_reclaim_acquire(sc.gfp_mask);
        /*
         * We need to be able to allocate from the reserves for RECLAIM_UNMAP
         */
        noreclaim_flag = memalloc_noreclaim_save();
        set_task_reclaim_state(p, &sc.reclaim_state);

        if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages ||
            node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) > pgdat->min_slab_pages) {
                /*
                 * Free memory by calling shrink node with increasing
                 * priorities until we have enough memory freed.
                 */
                do {
                        shrink_node(pgdat, &sc);
                } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
        }

        set_task_reclaim_state(p, NULL);
        memalloc_noreclaim_restore(noreclaim_flag);
        fs_reclaim_release(sc.gfp_mask);
        psi_memstall_leave(&pflags);

        trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);

        return sc.nr_reclaimed >= nr_pages;
}
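
/*
 * Note: with NODE_RECLAIM_PRIORITY == 4, __node_reclaim() above makes at most
 * five shrink_node() passes (priorities 4 down to 0) before giving up, and it
 * reports success only if at least 1 << order pages were reclaimed.
 */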

int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
{
        int ret;

        /*
         * Node reclaim reclaims unmapped file backed pages and
         * slab pages if we are over the defined limits.
         *
         * A small portion of unmapped file backed pages is needed for
         * file I/O otherwise pages read by file I/O will be immediately
         * thrown out if the node is overallocated. So we do not reclaim
         * if less than a specified percentage of the node is used by
         * unmapped file backed pages.
         */
        if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
            node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
            pgdat->min_slab_pages)
                return NODE_RECLAIM_FULL;

        /*
         * Do not scan if the allocation should not be delayed.
         */
        if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC))
                return NODE_RECLAIM_NOSCAN;

        /*
         * Only run node reclaim on the local node or on nodes that do not
         * have associated processors. This will favor the local processor
         * over remote processors and spread off node memory allocations
         * as wide as possible.
         */
        if (node_state(pgdat->node_id, N_CPU) && pgdat->node_id != numa_node_id())
                return NODE_RECLAIM_NOSCAN;

        if (test_and_set_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags))
                return NODE_RECLAIM_NOSCAN;

        ret = __node_reclaim(pgdat, gfp_mask, order);
        clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);

        if (!ret)
                count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);

        return ret;
}
#endif

void check_move_unevictable_pages(struct pagevec *pvec)
{
        struct folio_batch fbatch;
        unsigned i;

        folio_batch_init(&fbatch);
        for (i = 0; i < pvec->nr; i++) {
                struct page *page = pvec->pages[i];

                if (PageTransTail(page))
                        continue;
                folio_batch_add(&fbatch, page_folio(page));
        }
        check_move_unevictable_folios(&fbatch);
}
EXPORT_SYMBOL_GPL(check_move_unevictable_pages);

/**
 * check_move_unevictable_folios - Move evictable folios to appropriate zone
 * lru list
 * @fbatch: Batch of lru folios to check.
 *
 * Checks folios for evictability, if an evictable folio is in the unevictable
 * lru list, moves it to the appropriate evictable lru list. This function
 * should be only used for lru folios.
 */
void check_move_unevictable_folios(struct folio_batch *fbatch)
{
        struct lruvec *lruvec = NULL;
        int pgscanned = 0;
        int pgrescued = 0;
        int i;

        for (i = 0; i < fbatch->nr; i++) {
                struct folio *folio = fbatch->folios[i];
                int nr_pages = folio_nr_pages(folio);

                pgscanned += nr_pages;

                /* block memcg migration while the folio moves between lrus */
                if (!folio_test_clear_lru(folio))
                        continue;

                lruvec = folio_lruvec_relock_irq(folio, lruvec);
                if (folio_evictable(folio) && folio_test_unevictable(folio)) {
                        lruvec_del_folio(lruvec, folio);
                        folio_clear_unevictable(folio);
                        lruvec_add_folio(lruvec, folio);
                        pgrescued += nr_pages;
                }
                folio_set_lru(folio);
        }

        if (lruvec) {
                __count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
                __count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
                unlock_page_lruvec_irq(lruvec);
        } else if (pgscanned) {
                count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
        }
}
EXPORT_SYMBOL_GPL(check_move_unevictable_folios);
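
/*
 * Usage sketch (illustrative; not a call site in this file): a caller that has
 * just made a range of page cache evictable again, e.g. after an munlock-style
 * operation, can rescue the folios in batches:
 *
 *      struct folio_batch fbatch;
 *      pgoff_t index = 0;
 *
 *      folio_batch_init(&fbatch);
 *      while (filemap_get_folios(mapping, &index, ULONG_MAX, &fbatch)) {
 *              check_move_unevictable_folios(&fbatch);
 *              folio_batch_release(&fbatch);
 *              cond_resched();
 *      }
 */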