// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/mm/swapfile.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 */

#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
#include <linux/hugetlb.h>
#include <linux/mman.h>
#include <linux/slab.h>
#include <linux/kernel_stat.h>
#include <linux/swap.h>
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blkdev.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/backing-dev.h>
#include <linux/mutex.h>
#include <linux/capability.h>
#include <linux/syscalls.h>
#include <linux/memcontrol.h>
#include <linux/poll.h>
#include <linux/oom.h>
#include <linux/frontswap.h>
#include <linux/swapfile.h>
#include <linux/export.h>
#include <linux/swap_slots.h>
#include <linux/sort.h>

#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>

static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
				 unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
static sector_t map_swap_entry(swp_entry_t, struct block_device **);

DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
 * Some modules use swappable objects and may try to swap them out under
 * memory pressure (via the shrinker). Before doing so, they may wish to
 * check to see if any swap space is available.
 */
EXPORT_SYMBOL_GPL(nr_swap_pages);
/* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
long total_swap_pages;
static int least_priority = -1;

static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
static const char Unused_offset[] = "Unused swap offset entry ";

/*
 * all active swap_info_structs
 * protected with swap_lock, and ordered by priority.
 */
PLIST_HEAD(swap_active_head);

/*
 * all available (active, not full) swap_info_structs
 * protected with swap_avail_lock, ordered by priority.
 * This is used by get_swap_page() instead of swap_active_head
 * because swap_active_head includes all swap_info_structs,
 * but get_swap_page() doesn't need to look at full ones.
 * This uses its own lock instead of swap_lock because when a
 * swap_info_struct changes between not-full/full, it needs to
 * add/remove itself to/from this list, but the swap_info_struct->lock
 * is held and the locking order requires swap_lock to be taken
 * before any swap_info_struct->lock.
 */
static struct plist_head *swap_avail_heads;
static DEFINE_SPINLOCK(swap_avail_lock);

struct swap_info_struct *swap_info[MAX_SWAPFILES];

static DEFINE_MUTEX(swapon_mutex);

static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait);
/* Activity counter to indicate that a swapon or swapoff has occurred */
static atomic_t proc_poll_event = ATOMIC_INIT(0);

atomic_t nr_rotate_swap = ATOMIC_INIT(0);

static struct swap_info_struct *swap_type_to_swap_info(int type)
{
	if (type >= READ_ONCE(nr_swapfiles))
		return NULL;

	smp_rmb();	/* Pairs with smp_wmb in alloc_swap_info. */
	return READ_ONCE(swap_info[type]);
}

static inline unsigned char swap_count(unsigned char ent)
{
	return ent & ~SWAP_HAS_CACHE;	/* may include COUNT_CONTINUED flag */
}
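
/*
 * Each swap_map byte packs a use count in its low bits together with the
 * SWAP_HAS_CACHE flag (the entry is also present in the swap cache);
 * counts that overflow are chained via COUNT_CONTINUED into continuation
 * pages, see swap_count_continued() later in this file.
 */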

/* Reclaim the swap entry anyway if possible */
#define TTRS_ANYWAY		0x1
/*
 * Reclaim the swap entry if there are no more mappings of the
 * corresponding page
 */
#define TTRS_UNMAPPED		0x2
/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL		0x4

/* returns 1 if swap entry is freed */
static int __try_to_reclaim_swap(struct swap_info_struct *si,
				 unsigned long offset, unsigned long flags)
{
	swp_entry_t entry = swp_entry(si->type, offset);
	struct page *page;
	int ret = 0;

	page = find_get_page(swap_address_space(entry), offset);
	if (!page)
		return 0;
	/*
	 * This function is called from scan_swap_map_slots(), which in turn
	 * is called by vmscan.c while reclaiming pages, so a page lock may
	 * already be held here. Use trylock to avoid deadlock: this is a
	 * special case, and normal callers should use try_to_free_swap()
	 * with an explicit lock_page() instead.
	 */
	if (trylock_page(page)) {
		if ((flags & TTRS_ANYWAY) ||
		    ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
		    ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
			ret = try_to_free_swap(page);
		unlock_page(page);
	}
	put_page(page);
	return ret;
}

static inline struct swap_extent *first_se(struct swap_info_struct *sis)
{
	struct rb_node *rb = rb_first(&sis->swap_extent_root);
	return rb_entry(rb, struct swap_extent, rb_node);
}

static inline struct swap_extent *next_se(struct swap_extent *se)
{
	struct rb_node *rb = rb_next(&se->rb_node);
	return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
}

/*
 * swapon tells the device that all the old swap contents can be discarded,
 * to allow the swap device to optimize its wear-levelling.
 */
static int discard_swap(struct swap_info_struct *si)
{
	struct swap_extent *se;
	sector_t start_block;
	sector_t nr_blocks;
	int err = 0;

	/* Do not discard the swap header page! */
	se = first_se(si);
	start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
	nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
	if (nr_blocks) {
		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			return err;
		cond_resched();
	}

	for (se = next_se(se); se; se = next_se(se)) {
		start_block = se->start_block << (PAGE_SHIFT - 9);
		nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);

		err = blkdev_issue_discard(si->bdev, start_block,
				nr_blocks, GFP_KERNEL, 0);
		if (err)
			break;

		cond_resched();
	}
	return err;		/* That will often be -EOPNOTSUPP */
}

static struct swap_extent *
offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
{
	struct swap_extent *se;
	struct rb_node *rb;

	rb = sis->swap_extent_root.rb_node;
	while (rb) {
		se = rb_entry(rb, struct swap_extent, rb_node);
		if (offset < se->start_page)
			rb = rb->rb_left;
		else if (offset >= se->start_page + se->nr_pages)
			rb = rb->rb_right;
		else
			return se;
	}
	/* It *must* be present */
	BUG();
}
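
/*
 * A swap_extent maps a contiguous range of swap page offsets onto a
 * contiguous range of disk blocks; the extents of a device are kept in an
 * rbtree keyed by start_page, which is what the lookup above walks.
 */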

/*
 * swap allocation tells the device that a cluster of swap can now be
 * discarded, to allow the swap device to optimize its wear-levelling.
 */
static void discard_swap_cluster(struct swap_info_struct *si,
				 pgoff_t start_page, pgoff_t nr_pages)
{
	struct swap_extent *se = offset_to_swap_extent(si, start_page);

	while (nr_pages) {
		pgoff_t offset = start_page - se->start_page;
		sector_t start_block = se->start_block + offset;
		sector_t nr_blocks = se->nr_pages - offset;

		if (nr_blocks > nr_pages)
			nr_blocks = nr_pages;
		start_page += nr_blocks;
		nr_pages -= nr_blocks;

		start_block <<= PAGE_SHIFT - 9;
		nr_blocks <<= PAGE_SHIFT - 9;
		if (blkdev_issue_discard(si->bdev, start_block,
					nr_blocks, GFP_NOIO, 0))
			break;

		se = next_se(se);
	}
}

#ifdef CONFIG_THP_SWAP
#define SWAPFILE_CLUSTER	HPAGE_PMD_NR

#define swap_entry_size(size)	(size)
#else
#define SWAPFILE_CLUSTER	256

/*
 * Define swap_entry_size() as constant to let the compiler optimize
 * out some code if !CONFIG_THP_SWAP
 */
#define swap_entry_size(size)	1
#endif
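
/*
 * With CONFIG_THP_SWAP the cluster size matches HPAGE_PMD_NR, so one
 * PMD-sized THP occupies exactly one swap cluster and can be allocated and
 * freed as a unit (see swap_alloc_cluster() and swap_free_cluster() below).
 */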

#define LATENCY_LIMIT		256

static inline void cluster_set_flag(struct swap_cluster_info *info,
	unsigned int flag)
{
	info->flags = flag;
}

static inline unsigned int cluster_count(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_count(struct swap_cluster_info *info,
				     unsigned int c)
{
	info->data = c;
}

static inline void cluster_set_count_flag(struct swap_cluster_info *info,
					  unsigned int c, unsigned int f)
{
	info->flags = f;
	info->data = c;
}

static inline unsigned int cluster_next(struct swap_cluster_info *info)
{
	return info->data;
}

static inline void cluster_set_next(struct swap_cluster_info *info,
				    unsigned int n)
{
	info->data = n;
}

static inline void cluster_set_next_flag(struct swap_cluster_info *info,
					 unsigned int n, unsigned int f)
{
	info->flags = f;
	info->data = n;
}

static inline bool cluster_is_free(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_FREE;
}

static inline bool cluster_is_null(struct swap_cluster_info *info)
{
	return info->flags & CLUSTER_FLAG_NEXT_NULL;
}

static inline void cluster_set_null(struct swap_cluster_info *info)
{
	info->flags = CLUSTER_FLAG_NEXT_NULL;
	info->data = 0;
}

static inline bool cluster_is_huge(struct swap_cluster_info *info)
{
	if (IS_ENABLED(CONFIG_THP_SWAP))
		return info->flags & CLUSTER_FLAG_HUGE;
	return false;
}

static inline void cluster_clear_huge(struct swap_cluster_info *info)
{
	info->flags &= ~CLUSTER_FLAG_HUGE;
}
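
/*
 * The accessors above all operate on the same swap_cluster_info 'data'
 * field: for an allocated cluster it holds the number of in-use entries
 * (cluster_count()), while for a cluster sitting on the free or discard
 * list it holds the index of the next cluster (cluster_next()).
 */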

static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
						     unsigned long offset)
{
	struct swap_cluster_info *ci;

	ci = si->cluster_info;
	if (ci) {
		ci += offset / SWAPFILE_CLUSTER;
		spin_lock(&ci->lock);
	}
	return ci;
}

static inline void unlock_cluster(struct swap_cluster_info *ci)
{
	if (ci)
		spin_unlock(&ci->lock);
}

/*
 * Determine the locking method in use for this device. Return
 * swap_cluster_info if SSD-style cluster-based locking is in place.
 */
static inline struct swap_cluster_info *lock_cluster_or_swap_info(
		struct swap_info_struct *si, unsigned long offset)
{
	struct swap_cluster_info *ci;

	/* Try to use fine-grained SSD-style locking if available: */
	ci = lock_cluster(si, offset);
	/* Otherwise, fall back to traditional, coarse locking: */
	if (!ci)
		spin_lock(&si->lock);

	return ci;
}

static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
					       struct swap_cluster_info *ci)
{
	if (ci)
		unlock_cluster(ci);
	else
		spin_unlock(&si->lock);
}
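
/*
 * The two helpers above must always be paired, e.g.:
 *
 *	ci = lock_cluster_or_swap_info(si, offset);
 *	... read or update si->swap_map[offset] ...
 *	unlock_cluster_or_swap_info(si, ci);
 *
 * so that the same lock (per-cluster lock or si->lock) is released.
 */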

static inline bool cluster_list_empty(struct swap_cluster_list *list)
{
	return cluster_is_null(&list->head);
}

static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
{
	return cluster_next(&list->head);
}

static void cluster_list_init(struct swap_cluster_list *list)
{
	cluster_set_null(&list->head);
	cluster_set_null(&list->tail);
}

static void cluster_list_add_tail(struct swap_cluster_list *list,
				  struct swap_cluster_info *ci,
				  unsigned int idx)
{
	if (cluster_list_empty(list)) {
		cluster_set_next_flag(&list->head, idx, 0);
		cluster_set_next_flag(&list->tail, idx, 0);
	} else {
		struct swap_cluster_info *ci_tail;
		unsigned int tail = cluster_next(&list->tail);

		/*
		 * Nested cluster lock, but both cluster locks are
		 * only acquired when we held swap_info_struct->lock
		 */
		ci_tail = ci + tail;
		spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
		cluster_set_next(ci_tail, idx);
		spin_unlock(&ci_tail->lock);
		cluster_set_next_flag(&list->tail, idx, 0);
	}
}

static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
					   struct swap_cluster_info *ci)
{
	unsigned int idx;

	idx = cluster_next(&list->head);
	if (cluster_next(&list->tail) == idx) {
		cluster_set_null(&list->head);
		cluster_set_null(&list->tail);
	} else
		cluster_set_next_flag(&list->head,
				      cluster_next(&ci[idx]), 0);

	return idx;
}
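
/*
 * The free and discard cluster lists are singly linked through the
 * cluster_info array itself: each element stores the array index of the
 * next cluster rather than a pointer, which keeps the lists compact and
 * lets them be manipulated with only the index operations above.
 */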

/* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
		unsigned int idx)
{
	/*
	 * If scan_swap_map() can't find a free cluster, it will check
	 * si->swap_map directly. To make sure the discarding cluster isn't
	 * taken by scan_swap_map(), mark the swap entries bad (occupied). It
	 * will be cleared after discard
	 */
	memset(si->swap_map + idx * SWAPFILE_CLUSTER,
			SWAP_MAP_BAD, SWAPFILE_CLUSTER);

	cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);

	schedule_work(&si->discard_work);
}

static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info;

	cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
	cluster_list_add_tail(&si->free_clusters, ci, idx);
}

/*
 * Actually do the discards. After a cluster discard is finished, the
 * cluster will be added to the free cluster list. The caller should hold
 * si->lock.
 */
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
	struct swap_cluster_info *info, *ci;
	unsigned int idx;

	info = si->cluster_info;

	while (!cluster_list_empty(&si->discard_clusters)) {
		idx = cluster_list_del_first(&si->discard_clusters, info);
		spin_unlock(&si->lock);

		discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
				SWAPFILE_CLUSTER);

		spin_lock(&si->lock);
		ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
		__free_cluster(si, idx);
		memset(si->swap_map + idx * SWAPFILE_CLUSTER,
				0, SWAPFILE_CLUSTER);
		unlock_cluster(ci);
	}
}

static void swap_discard_work(struct work_struct *work)
{
	struct swap_info_struct *si;

	si = container_of(work, struct swap_info_struct, discard_work);

	spin_lock(&si->lock);
	swap_do_scheduled_discard(si);
	spin_unlock(&si->lock);
}
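
/*
 * Cluster discard thus runs in three steps: swap_cluster_schedule_discard()
 * parks a freed cluster on the discard list and marks its entries
 * SWAP_MAP_BAD, the workqueue then runs swap_discard_work(), and
 * swap_do_scheduled_discard() issues the discard, clears the entries and
 * moves the cluster onto the free list.
 */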

static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info;

	VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
	cluster_list_del_first(&si->free_clusters, ci);
	cluster_set_count_flag(ci + idx, 0, 0);
}

static void free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	struct swap_cluster_info *ci = si->cluster_info + idx;

	VM_BUG_ON(cluster_count(ci) != 0);
	/*
	 * If the swap is discardable, prepare to discard the cluster
	 * instead of freeing it immediately. The cluster will be freed
	 * after discard.
	 */
	if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
	    (SWP_WRITEOK | SWP_PAGE_DISCARD)) {
		swap_cluster_schedule_discard(si, idx);
		return;
	}

	__free_cluster(si, idx);
}

/*
 * The cluster corresponding to page_nr will be used. The cluster will be
 * removed from free cluster list and its usage counter will be increased.
 */
static void inc_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;
	if (cluster_is_free(&cluster_info[idx]))
		alloc_cluster(p, idx);

	VM_BUG_ON(cluster_count(&cluster_info[idx]) >= SWAPFILE_CLUSTER);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) + 1);
}

/*
 * The cluster corresponding to page_nr decreases one usage. If the usage
 * counter becomes 0, which means no page in the cluster is in use, we can
 * optionally discard the cluster and add it to free cluster list.
 */
static void dec_cluster_info_page(struct swap_info_struct *p,
	struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
	unsigned long idx = page_nr / SWAPFILE_CLUSTER;

	if (!cluster_info)
		return;

	VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
	cluster_set_count(&cluster_info[idx],
		cluster_count(&cluster_info[idx]) - 1);
	if (cluster_count(&cluster_info[idx]) == 0)
		free_cluster(p, idx);
}

/*
 * It's possible scan_swap_map() uses a free cluster in the middle of the
 * free cluster list. Avoid such abuse to prevent list corruption.
 */
static bool
scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
	unsigned long offset)
{
	struct percpu_cluster *percpu_cluster;
	bool conflict;

	offset /= SWAPFILE_CLUSTER;
	conflict = !cluster_list_empty(&si->free_clusters) &&
		offset != cluster_list_first(&si->free_clusters) &&
		cluster_is_free(&si->cluster_info[offset]);

	if (!conflict)
		return false;

	percpu_cluster = this_cpu_ptr(si->percpu_cluster);
	cluster_set_null(&percpu_cluster->index);
	return true;
}

/*
 * Try to get a swap entry from current cpu's swap entry pool (a cluster). This
 * might involve allocating a new cluster for current CPU too.
 */
static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
	unsigned long *offset, unsigned long *scan_base)
{
	struct percpu_cluster *cluster;
	struct swap_cluster_info *ci;
	unsigned long tmp, max;

new_cluster:
	cluster = this_cpu_ptr(si->percpu_cluster);
	if (cluster_is_null(&cluster->index)) {
		if (!cluster_list_empty(&si->free_clusters)) {
			cluster->index = si->free_clusters.head;
			cluster->next = cluster_next(&cluster->index) *
					SWAPFILE_CLUSTER;
		} else if (!cluster_list_empty(&si->discard_clusters)) {
			/*
			 * we don't have free cluster but have some clusters in
			 * discarding, do discard now and reclaim them, then
			 * reread cluster_next_cpu since we dropped si->lock
			 */
			swap_do_scheduled_discard(si);
			*scan_base = this_cpu_read(*si->cluster_next_cpu);
			*offset = *scan_base;
			goto new_cluster;
		} else
			return false;
	}

	/*
	 * Other CPUs can use our cluster if they can't find a free cluster,
	 * check if there is still free entry in the cluster
	 */
	tmp = cluster->next;
	max = min_t(unsigned long, si->max,
		    (cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
	if (tmp < max) {
		ci = lock_cluster(si, tmp);
		while (tmp < max) {
			if (!si->swap_map[tmp])
				break;
			tmp++;
		}
		unlock_cluster(ci);
	}
	if (tmp >= max) {
		cluster_set_null(&cluster->index);
		goto new_cluster;
	}
	cluster->next = tmp + 1;
	*offset = tmp;
	*scan_base = tmp;
	return true;
}
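
/*
 * Each CPU therefore caches one cluster (an index plus a 'next' offset
 * within it) in si->percpu_cluster, so concurrent allocators normally fill
 * different clusters and only touch the shared lists when a CPU needs a
 * fresh cluster.
 */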

static void __del_from_avail_list(struct swap_info_struct *p)
{
	int nid;

	for_each_node(nid)
		plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
}

static void del_from_avail_list(struct swap_info_struct *p)
{
	spin_lock(&swap_avail_lock);
	__del_from_avail_list(p);
	spin_unlock(&swap_avail_lock);
}

static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
			     unsigned int nr_entries)
{
	unsigned int end = offset + nr_entries - 1;

	if (offset == si->lowest_bit)
		si->lowest_bit += nr_entries;
	if (end == si->highest_bit)
		WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
	si->inuse_pages += nr_entries;
	if (si->inuse_pages == si->pages) {
		si->lowest_bit = si->max;
		si->highest_bit = 0;
		del_from_avail_list(si);
	}
}

static void add_to_avail_list(struct swap_info_struct *p)
{
	int nid;

	spin_lock(&swap_avail_lock);
	for_each_node(nid) {
		WARN_ON(!plist_node_empty(&p->avail_lists[nid]));
		plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
	}
	spin_unlock(&swap_avail_lock);
}
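
/*
 * swap_avail_heads holds one priority-sorted plist per NUMA node, so a
 * device's effective priority can differ per node. A device is dropped
 * from every node's list once it fills up (swap_range_alloc() above) and
 * put back when space is freed again (swap_range_free() below).
 */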

static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
			    unsigned int nr_entries)
{
	unsigned long begin = offset;
	unsigned long end = offset + nr_entries - 1;
	void (*swap_slot_free_notify)(struct block_device *, unsigned long);

	if (offset < si->lowest_bit)
		si->lowest_bit = offset;
	if (end > si->highest_bit) {
		bool was_full = !si->highest_bit;

		WRITE_ONCE(si->highest_bit, end);
		if (was_full && (si->flags & SWP_WRITEOK))
			add_to_avail_list(si);
	}
	atomic_long_add(nr_entries, &nr_swap_pages);
	si->inuse_pages -= nr_entries;
	if (si->flags & SWP_BLKDEV)
		swap_slot_free_notify =
			si->bdev->bd_disk->fops->swap_slot_free_notify;
	else
		swap_slot_free_notify = NULL;
	while (offset <= end) {
		arch_swap_invalidate_page(si->type, offset);
		frontswap_invalidate_page(si->type, offset);
		if (swap_slot_free_notify)
			swap_slot_free_notify(si->bdev, offset);
		offset++;
	}
	clear_shadow_from_swap_cache(si->type, begin, end);
}

static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
{
	unsigned long prev;

	if (!(si->flags & SWP_SOLIDSTATE)) {
		si->cluster_next = next;
		return;
	}

	prev = this_cpu_read(*si->cluster_next_cpu);
	/*
	 * If we have crossed into another swap-address-space sized chunk,
	 * choose another chunk at random to avoid lock contention on the
	 * swap address space if possible.
	 */
	if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
	    (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
		/* No free swap slots available */
		if (si->highest_bit <= si->lowest_bit)
			return;
		next = si->lowest_bit +
			prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
		next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
		next = max_t(unsigned int, next, si->lowest_bit);
	}
	this_cpu_write(*si->cluster_next_cpu, next);
}
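
/*
 * The swap cache for one device is split into multiple address_spaces of
 * SWAP_ADDRESS_SPACE_PAGES slots each (see swap_address_space()), so
 * spreading per-CPU allocation positions across different chunks also
 * spreads contention on the individual swap cache locks.
 */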

static int scan_swap_map_slots(struct swap_info_struct *si,
			       unsigned char usage, int nr,
			       swp_entry_t slots[])
{
	struct swap_cluster_info *ci;
	unsigned long offset;
	unsigned long scan_base;
	unsigned long last_in_cluster = 0;
	int latency_ration = LATENCY_LIMIT;
	int n_ret = 0;
	bool scanned_many = false;

	/*
	 * We try to cluster swap pages by allocating them sequentially
	 * in swap.  Once we've allocated SWAPFILE_CLUSTER pages this
	 * way, however, we resort to first-free allocation, starting
	 * a new cluster.  This prevents us from scattering swap pages
	 * all over the entire swap partition, so that we reduce
	 * overall disk seek times between swap pages.  -- sct
	 * But we do now try to find an empty cluster.  -Andrea
	 * And we let swap pages go all over an SSD partition.  Hugh
	 */

	si->flags += SWP_SCANNING;
	/*
	 * Use percpu scan base for SSD to reduce lock contention on
	 * cluster and swap cache.  For HDD, sequential access is more
	 * important.
	 */
	if (si->flags & SWP_SOLIDSTATE)
		scan_base = this_cpu_read(*si->cluster_next_cpu);
	else
		scan_base = si->cluster_next;
	offset = scan_base;

	/* SSD algorithm */
	if (si->cluster_info) {
		if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto scan;
	} else if (unlikely(!si->cluster_nr--)) {
		if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
			si->cluster_nr = SWAPFILE_CLUSTER - 1;
			goto checks;
		}

		spin_unlock(&si->lock);

		/*
		 * If seek is expensive, start searching for new cluster from
		 * start of partition, to minimize the span of allocated swap.
		 * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
		 * case, just handled by scan_swap_map_try_ssd_cluster() above.
		 */
		scan_base = offset = si->lowest_bit;
		last_in_cluster = offset + SWAPFILE_CLUSTER - 1;

		/* Locate the first empty (unaligned) cluster */
		for (; last_in_cluster <= si->highest_bit; offset++) {
			if (si->swap_map[offset])
				last_in_cluster = offset + SWAPFILE_CLUSTER;
			else if (offset == last_in_cluster) {
				spin_lock(&si->lock);
				offset -= SWAPFILE_CLUSTER - 1;
				si->cluster_next = offset;
				si->cluster_nr = SWAPFILE_CLUSTER - 1;
				goto checks;
			}
			if (unlikely(--latency_ration < 0)) {
				cond_resched();
				latency_ration = LATENCY_LIMIT;
			}
		}

		offset = scan_base;
		spin_lock(&si->lock);
		si->cluster_nr = SWAPFILE_CLUSTER - 1;
	}

checks:
	if (si->cluster_info) {
		while (scan_swap_map_ssd_cluster_conflict(si, offset)) {
			/* take a break if we already got some slots */
			if (n_ret)
				goto done;
			if (!scan_swap_map_try_ssd_cluster(si, &offset,
							   &scan_base))
				goto scan;
		}
	}
	if (!(si->flags & SWP_WRITEOK))
		goto no_page;
	if (!si->highest_bit)
		goto no_page;
	if (offset > si->highest_bit)
		scan_base = offset = si->lowest_bit;

	ci = lock_cluster(si, offset);
	/* reuse swap entry of cache-only swap if not busy. */
	if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
		int swap_was_freed;

		unlock_cluster(ci);
		spin_unlock(&si->lock);
		swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
		spin_lock(&si->lock);
		/* entry was freed successfully, try to use this again */
		if (swap_was_freed)
			goto checks;
		goto scan; /* check next one */
	}

	if (si->swap_map[offset]) {
		unlock_cluster(ci);
		if (!n_ret)
			goto scan;
		else
			goto done;
	}
	WRITE_ONCE(si->swap_map[offset], usage);
	inc_cluster_info_page(si, si->cluster_info, offset);
	unlock_cluster(ci);

	swap_range_alloc(si, offset, 1);
	slots[n_ret++] = swp_entry(si->type, offset);

	/* got enough slots or reach max slots? */
	if ((n_ret == nr) || (offset >= si->highest_bit))
		goto done;

	/* search for next available slot */

	/* time to take a break? */
	if (unlikely(--latency_ration < 0)) {
		if (n_ret)
			goto done;
		spin_unlock(&si->lock);
		cond_resched();
		spin_lock(&si->lock);
		latency_ration = LATENCY_LIMIT;
	}

	/* try to get more slots in cluster */
	if (si->cluster_info) {
		if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
			goto checks;
	} else if (si->cluster_nr && !si->swap_map[++offset]) {
		/* non-ssd case, still more slots in cluster? */
		--si->cluster_nr;
		goto checks;
	}

	/*
	 * Even if there's no free clusters available (fragmented),
	 * try to scan a little more quickly with lock held unless we
	 * have scanned too many slots already.
	 */
	if (!scanned_many) {
		unsigned long scan_limit;

		if (offset < scan_base)
			scan_limit = scan_base;
		else
			scan_limit = si->highest_bit;
		for (; offset <= scan_limit && --latency_ration > 0;
		     offset++) {
			if (!si->swap_map[offset])
				goto checks;
		}
	}

done:
	set_cluster_next(si, offset + 1);
	si->flags -= SWP_SCANNING;
	return n_ret;

scan:
	spin_unlock(&si->lock);
	while (++offset <= READ_ONCE(si->highest_bit)) {
		if (data_race(!si->swap_map[offset])) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() &&
		    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
			scanned_many = true;
		}
	}
	offset = si->lowest_bit;
	while (offset < scan_base) {
		if (data_race(!si->swap_map[offset])) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (vm_swap_full() &&
		    READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
			spin_lock(&si->lock);
			goto checks;
		}
		if (unlikely(--latency_ration < 0)) {
			cond_resched();
			latency_ration = LATENCY_LIMIT;
			scanned_many = true;
		}
		offset++;
	}
	spin_lock(&si->lock);

no_page:
	si->flags -= SWP_SCANNING;
	return n_ret;
}
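
/*
 * In short: on SSDs the function pulls slots from the current CPU's
 * per-cpu cluster, while on rotational devices it allocates sequentially
 * within a SWAPFILE_CLUSTER-sized run before hunting for a new empty
 * cluster; the 'scan' fallback walks the whole map when no clustered slot
 * is free, and latency_ration bounds how long si->lock is held between
 * breaks.
 */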

static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
{
	unsigned long idx;
	struct swap_cluster_info *ci;
	unsigned long offset, i;
	unsigned char *map;

	/*
	 * Should not even be attempting cluster allocations when huge
	 * page swap is disabled.  Warn and fail the allocation.
	 */
	if (!IS_ENABLED(CONFIG_THP_SWAP)) {
		VM_WARN_ON_ONCE(1);
		return 0;
	}

	if (cluster_list_empty(&si->free_clusters))
		return 0;

	idx = cluster_list_first(&si->free_clusters);
	offset = idx * SWAPFILE_CLUSTER;
	ci = lock_cluster(si, offset);
	alloc_cluster(si, idx);
	cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);

	map = si->swap_map + offset;
	for (i = 0; i < SWAPFILE_CLUSTER; i++)
		map[i] = SWAP_HAS_CACHE;
	unlock_cluster(ci);
	swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
	*slot = swp_entry(si->type, offset);

	return 1;
}

static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
{
	unsigned long offset = idx * SWAPFILE_CLUSTER;
	struct swap_cluster_info *ci;

	ci = lock_cluster(si, offset);
	memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
	cluster_set_count_flag(ci, 0, 0);
	free_cluster(si, idx);
	unlock_cluster(ci);
	swap_range_free(si, offset, SWAPFILE_CLUSTER);
}

static unsigned long scan_swap_map(struct swap_info_struct *si,
				   unsigned char usage)
{
	swp_entry_t entry;
	int n_ret;

	n_ret = scan_swap_map_slots(si, usage, 1, &entry);

	if (n_ret)
		return swp_offset(entry);
	else
		return 0;
}
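
/*
 * Returning 0 is unambiguous here: offset 0 of every swap device holds the
 * swap header and is never handed out, so a zero offset always means the
 * allocation failed.
 */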

int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
{
	unsigned long size = swap_entry_size(entry_size);
	struct swap_info_struct *si, *next;
	long avail_pgs;
	int n_ret = 0;
	int node;

	/* Only single cluster request supported */
	WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);

	avail_pgs = atomic_long_read(&nr_swap_pages) / size;
	if (avail_pgs <= 0)
		goto noswap;

	n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);

	atomic_long_sub(n_goal * size, &nr_swap_pages);

	spin_lock(&swap_avail_lock);

start_over:
	node = numa_node_id();
	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
		/* requeue si to after same-priority siblings */
		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
		spin_unlock(&swap_avail_lock);
		spin_lock(&si->lock);
		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
			spin_lock(&swap_avail_lock);
			if (plist_node_empty(&si->avail_lists[node])) {
				spin_unlock(&si->lock);
				goto nextsi;
			}
			WARN(!si->highest_bit,
			     "swap_info %d in list but !highest_bit\n",
			     si->type);
			WARN(!(si->flags & SWP_WRITEOK),
			     "swap_info %d in list but !SWP_WRITEOK\n",
			     si->type);
			__del_from_avail_list(si);
			spin_unlock(&si->lock);
			goto nextsi;
		}
		if (size == SWAPFILE_CLUSTER) {
			if (si->flags & SWP_BLKDEV)
				n_ret = swap_alloc_cluster(si, swp_entries);
		} else
			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
						    n_goal, swp_entries);
		spin_unlock(&si->lock);
		if (n_ret || size == SWAPFILE_CLUSTER)
			goto check_out;
		pr_debug("scan_swap_map of si %d failed to find offset\n",
			si->type);

		spin_lock(&swap_avail_lock);
nextsi:
		/*
		 * if we got here, it's likely that si was almost full before,
		 * and since scan_swap_map() can drop the si->lock, multiple
		 * callers probably all tried to get a page from the same si
		 * and it filled up before we could get one; or, the si filled
		 * up between us dropping swap_avail_lock and taking si->lock.
		 * Since we dropped the swap_avail_lock, the swap_avail_head
		 * list may have been modified; so if next is still in the
		 * swap_avail_head list then try it, otherwise start over
		 * if we have not gotten any slots.
		 */
		if (plist_node_empty(&next->avail_lists[node]))
			goto start_over;
	}

	spin_unlock(&swap_avail_lock);

check_out:
	if (n_ret < n_goal)
		atomic_long_add((long)(n_goal - n_ret) * size,
				&nr_swap_pages);
noswap:
	return n_ret;
}
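
/*
 * Note the accounting pattern above: nr_swap_pages is debited up front for
 * the whole n_goal batch and credited back for whatever could not be
 * allocated, which keeps the global counter consistent even though several
 * devices may be tried under different locks.
 */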

/* The only caller of this function is now the suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
	struct swap_info_struct *si = swap_type_to_swap_info(type);
	pgoff_t offset;

	if (!si)
		goto fail;

	spin_lock(&si->lock);
	if (si->flags & SWP_WRITEOK) {
		atomic_long_dec(&nr_swap_pages);
		/* This is called for allocating swap entry, not cache */
		offset = scan_swap_map(si, 1);
		if (offset) {
			spin_unlock(&si->lock);
			return swp_entry(type, offset);
		}
		atomic_long_inc(&nr_swap_pages);
	}
	spin_unlock(&si->lock);
fail:
	return (swp_entry_t) {0};
}

static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;
	unsigned long offset;

	if (!entry.val)
		goto out;
	p = swp_swap_info(entry);
	if (!p)
		goto bad_nofile;
	if (data_race(!(p->flags & SWP_USED)))
		goto bad_device;
	offset = swp_offset(entry);
	if (offset >= p->max)
		goto bad_offset;
	return p;

bad_offset:
	pr_err("swap_info_get: %s%08lx\n", Bad_offset, entry.val);
	goto out;
bad_device:
	pr_err("swap_info_get: %s%08lx\n", Unused_file, entry.val);
	goto out;
bad_nofile:
	pr_err("swap_info_get: %s%08lx\n", Bad_file, entry.val);
out:
	return NULL;
}

static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = __swap_info_get(entry);
	if (!p)
		goto out;
	if (data_race(!p->swap_map[swp_offset(entry)]))
		goto bad_free;
	return p;

bad_free:
	pr_err("swap_info_get: %s%08lx\n", Unused_offset, entry.val);
out:
	return NULL;
}

static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);
	if (p)
		spin_lock(&p->lock);
	return p;
}

static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
					struct swap_info_struct *q)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);

	if (p != q) {
		if (q != NULL)
			spin_unlock(&q->lock);
		if (p != NULL)
			spin_lock(&p->lock);
	}
	return p;
}
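
/*
 * The helpers above form a small hierarchy: __swap_info_get() only
 * validates the entry, _swap_info_get() additionally requires the slot to
 * be in use, swap_info_get() also takes si->lock, and swap_info_get_cont()
 * hands the lock over between consecutive entries so batched callers such
 * as swapcache_free_entries() lock each device only once.
 */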

static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
					      unsigned long offset,
					      unsigned char usage)
{
	unsigned char count;
	unsigned char has_cache;

	count = p->swap_map[offset];

	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;

	if (usage == SWAP_HAS_CACHE) {
		VM_BUG_ON(!has_cache);
		has_cache = 0;
	} else if (count == SWAP_MAP_SHMEM) {
		/*
		 * Or we could insist on shmem.c using a special
		 * swap_shmem_free() and free_shmem_swap_and_cache()...
		 */
		count = 0;
	} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
		if (count == COUNT_CONTINUED) {
			if (swap_count_continued(p, offset, count))
				count = SWAP_MAP_MAX | COUNT_CONTINUED;
			else
				count = SWAP_MAP_MAX;
		} else
			count--;
	}

	usage = count | has_cache;
	if (usage)
		WRITE_ONCE(p->swap_map[offset], usage);
	else
		WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);

	return usage;
}
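
/*
 * The return value is the new swap_map state: non-zero means the entry
 * still has users (or a cached copy), zero means the caller dropped the
 * last reference. In the zero case the slot is parked as SWAP_HAS_CACHE so
 * it cannot be reallocated before the swap-slots cache finally releases it
 * via swap_entry_free().
 */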

/*
 * Check whether swap entry is valid in the swap device.  If so,
 * return pointer to swap_info_struct, and keep the swap entry valid
 * via preventing the swap device from being swapoff, until
 * put_swap_device() is called.  Otherwise return NULL.
 *
 * The entirety of the RCU read critical section must come before the
 * return from or after the call to synchronize_rcu() in
 * enable_swap_info() or swapoff().  So if "si->flags & SWP_VALID" is
 * true, the si->swap_map, si->cluster_info, etc. must be valid in the
 * critical section.
 *
 * Notice that swapoff or swapoff+swapon can still happen before the
 * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
 * in put_swap_device() if there isn't any other way to prevent
 * swapoff, such as page lock, page table lock, etc.  The caller must
 * be prepared for that.  For example, the following situation is
 * possible.
 *
 *   CPU1				CPU2
 *   do_swap_page()
 *     ...				swapoff+swapon
 *     __read_swap_cache_async()
 *       swapcache_prepare()
 *         __swap_duplicate()
 *           // check swap_map
 *     // verify PTE not changed
 *
 * In __swap_duplicate(), the swap_map needs to be checked before
 * changing, partly because the specified swap entry may be for another
 * swap device which has been swapoff.  And in do_swap_page(), after
 * the page is read from the swap device, the PTE is verified not
 * changed with the page table locked to check whether the swap device
 * has been swapoff or swapoff+swapon.
 */
struct swap_info_struct *get_swap_device(swp_entry_t entry)
{
	struct swap_info_struct *si;
	unsigned long offset;

	if (!entry.val)
		goto out;
	si = swp_swap_info(entry);
	if (!si)
		goto bad_nofile;

	rcu_read_lock();
	if (data_race(!(si->flags & SWP_VALID)))
		goto unlock_out;
	offset = swp_offset(entry);
	if (offset >= si->max)
		goto unlock_out;

	return si;
bad_nofile:
	pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
out:
	return NULL;
unlock_out:
	rcu_read_unlock();
	return NULL;
}
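
/*
 * Typical usage, with put_swap_device() ending the RCU-protected section:
 *
 *	si = get_swap_device(entry);
 *	if (si) {
 *		... safely dereference si->swap_map, the swap cache, etc. ...
 *		put_swap_device(si);
 *	}
 */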

static unsigned char __swap_entry_free(struct swap_info_struct *p,
				       swp_entry_t entry)
{
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);
	unsigned char usage;

	ci = lock_cluster_or_swap_info(p, offset);
	usage = __swap_entry_free_locked(p, offset, 1);
	unlock_cluster_or_swap_info(p, ci);
	if (!usage)
		free_swap_slot(entry);

	return usage;
}

static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
{
	struct swap_cluster_info *ci;
	unsigned long offset = swp_offset(entry);
	unsigned char count;

	ci = lock_cluster(p, offset);
	count = p->swap_map[offset];
	VM_BUG_ON(count != SWAP_HAS_CACHE);
	p->swap_map[offset] = 0;
	dec_cluster_info_page(p, p->cluster_info, offset);
	unlock_cluster(ci);

	mem_cgroup_uncharge_swap(entry, 1);
	swap_range_free(p, offset, 1);
}

/*
 * Caller has made sure that the swap device corresponding to entry
 * is still around or has not been recycled.
 */
void swap_free(swp_entry_t entry)
{
	struct swap_info_struct *p;

	p = _swap_info_get(entry);
	if (p)
		__swap_entry_free(p, entry);
}

/*
 * Called after dropping swapcache to decrease refcnt to swap entries.
 */
void put_swap_page(struct page *page, swp_entry_t entry)
{
	unsigned long offset = swp_offset(entry);
	unsigned long idx = offset / SWAPFILE_CLUSTER;
	struct swap_cluster_info *ci;
	struct swap_info_struct *si;
	unsigned char *map;
	unsigned int i, free_entries = 0;
	unsigned char val;
	int size = swap_entry_size(thp_nr_pages(page));

	si = _swap_info_get(entry);
	if (!si)
		return;

	ci = lock_cluster_or_swap_info(si, offset);
	if (size == SWAPFILE_CLUSTER) {
		VM_BUG_ON(!cluster_is_huge(ci));
		map = si->swap_map + offset;
		for (i = 0; i < SWAPFILE_CLUSTER; i++) {
			val = map[i];
			VM_BUG_ON(!(val & SWAP_HAS_CACHE));
			if (val == SWAP_HAS_CACHE)
				free_entries++;
		}
		cluster_clear_huge(ci);
		if (free_entries == SWAPFILE_CLUSTER) {
			unlock_cluster_or_swap_info(si, ci);
			spin_lock(&si->lock);
			mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
			swap_free_cluster(si, idx);
			spin_unlock(&si->lock);
			return;
		}
	}
	for (i = 0; i < size; i++, entry.val++) {
		if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
			unlock_cluster_or_swap_info(si, ci);
			free_swap_slot(entry);
			if (i == size - 1)
				return;
			lock_cluster_or_swap_info(si, offset);
		}
	}
	unlock_cluster_or_swap_info(si, ci);
}
  1231. #ifdef CONFIG_THP_SWAP
  1232. int split_swap_cluster(swp_entry_t entry)
  1233. {
  1234. struct swap_info_struct *si;
  1235. struct swap_cluster_info *ci;
  1236. unsigned long offset = swp_offset(entry);
  1237. si = _swap_info_get(entry);
  1238. if (!si)
  1239. return -EBUSY;
  1240. ci = lock_cluster(si, offset);
  1241. cluster_clear_huge(ci);
  1242. unlock_cluster(ci);
  1243. return 0;
  1244. }
  1245. #endif
  1246. static int swp_entry_cmp(const void *ent1, const void *ent2)
  1247. {
  1248. const swp_entry_t *e1 = ent1, *e2 = ent2;
  1249. return (int)swp_type(*e1) - (int)swp_type(*e2);
  1250. }
  1251. void swapcache_free_entries(swp_entry_t *entries, int n)
  1252. {
  1253. struct swap_info_struct *p, *prev;
  1254. int i;
  1255. if (n <= 0)
  1256. return;
  1257. prev = NULL;
  1258. p = NULL;
  1259. /*
  1260. * Sort swap entries by swap device, so each lock is only taken once.
1261. * The nr_swapfiles check is only approximate, but the overhead of sort()
1262. * is so low that it isn't worth optimizing further.
  1263. */
  1264. if (nr_swapfiles > 1)
  1265. sort(entries, n, sizeof(entries[0]), swp_entry_cmp, NULL);
  1266. for (i = 0; i < n; ++i) {
  1267. p = swap_info_get_cont(entries[i], prev);
  1268. if (p)
  1269. swap_entry_free(p, entries[i]);
  1270. prev = p;
  1271. }
  1272. if (p)
  1273. spin_unlock(&p->lock);
  1274. }
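/*
 * Illustration (not from the original source): if swapcache_free_entries()
 * above is passed entries on swap types (1, 0, 1, 0), the sort reorders
 * them to (0, 0, 1, 1), so swap_info_get_cont() can keep holding the same
 * device lock across consecutive entries and each si->lock is taken once.
 */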
  1275. /*
  1276. * How many references to page are currently swapped out?
  1277. * This does not give an exact answer when swap count is continued,
  1278. * but does include the high COUNT_CONTINUED flag to allow for that.
  1279. */
  1280. int page_swapcount(struct page *page)
  1281. {
  1282. int count = 0;
  1283. struct swap_info_struct *p;
  1284. struct swap_cluster_info *ci;
  1285. swp_entry_t entry;
  1286. unsigned long offset;
  1287. entry.val = page_private(page);
  1288. p = _swap_info_get(entry);
  1289. if (p) {
  1290. offset = swp_offset(entry);
  1291. ci = lock_cluster_or_swap_info(p, offset);
  1292. count = swap_count(p->swap_map[offset]);
  1293. unlock_cluster_or_swap_info(p, ci);
  1294. }
  1295. return count;
  1296. }
  1297. int __swap_count(swp_entry_t entry)
  1298. {
  1299. struct swap_info_struct *si;
  1300. pgoff_t offset = swp_offset(entry);
  1301. int count = 0;
  1302. si = get_swap_device(entry);
  1303. if (si) {
  1304. count = swap_count(si->swap_map[offset]);
  1305. put_swap_device(si);
  1306. }
  1307. return count;
  1308. }
  1309. static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
  1310. {
  1311. int count = 0;
  1312. pgoff_t offset = swp_offset(entry);
  1313. struct swap_cluster_info *ci;
  1314. ci = lock_cluster_or_swap_info(si, offset);
  1315. count = swap_count(si->swap_map[offset]);
  1316. unlock_cluster_or_swap_info(si, ci);
  1317. return count;
  1318. }
  1319. /*
  1320. * How many references to @entry are currently swapped out?
  1321. * This does not give an exact answer when swap count is continued,
  1322. * but does include the high COUNT_CONTINUED flag to allow for that.
  1323. */
  1324. int __swp_swapcount(swp_entry_t entry)
  1325. {
  1326. int count = 0;
  1327. struct swap_info_struct *si;
  1328. si = get_swap_device(entry);
  1329. if (si) {
  1330. count = swap_swapcount(si, entry);
  1331. put_swap_device(si);
  1332. }
  1333. return count;
  1334. }
  1335. /*
  1336. * How many references to @entry are currently swapped out?
1337. * This considers COUNT_CONTINUED so it returns an exact answer.
  1338. */
  1339. int swp_swapcount(swp_entry_t entry)
  1340. {
  1341. int count, tmp_count, n;
  1342. struct swap_info_struct *p;
  1343. struct swap_cluster_info *ci;
  1344. struct page *page;
  1345. pgoff_t offset;
  1346. unsigned char *map;
  1347. p = _swap_info_get(entry);
  1348. if (!p)
  1349. return 0;
  1350. offset = swp_offset(entry);
  1351. ci = lock_cluster_or_swap_info(p, offset);
  1352. count = swap_count(p->swap_map[offset]);
  1353. if (!(count & COUNT_CONTINUED))
  1354. goto out;
  1355. count &= ~COUNT_CONTINUED;
  1356. n = SWAP_MAP_MAX + 1;
  1357. page = vmalloc_to_page(p->swap_map + offset);
  1358. offset &= ~PAGE_MASK;
  1359. VM_BUG_ON(page_private(page) != SWP_CONTINUED);
  1360. do {
  1361. page = list_next_entry(page, lru);
  1362. map = kmap_atomic(page);
  1363. tmp_count = map[offset];
  1364. kunmap_atomic(map);
  1365. count += (tmp_count & ~COUNT_CONTINUED) * n;
  1366. n *= (SWAP_CONT_MAX + 1);
  1367. } while (tmp_count & COUNT_CONTINUED);
  1368. out:
  1369. unlock_cluster_or_swap_info(p, ci);
  1370. return count;
  1371. }
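/*
 * Worked example (not from the original source): with continuation pages,
 * swp_swapcount() above effectively computes
 *
 *   count = base
 *         + c0 * (SWAP_MAP_MAX + 1)
 *         + c1 * (SWAP_MAP_MAX + 1) * (SWAP_CONT_MAX + 1)
 *         + c2 * (SWAP_MAP_MAX + 1) * (SWAP_CONT_MAX + 1)^2
 *         + ...
 *
 * where "base" is the count stored in swap_map[offset] (with
 * COUNT_CONTINUED masked off) and c0, c1, ... are the per-offset values
 * stored in the successive continuation pages, each with its own
 * COUNT_CONTINUED bit masked off.
 */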
  1372. static bool swap_page_trans_huge_swapped(struct swap_info_struct *si,
  1373. swp_entry_t entry)
  1374. {
  1375. struct swap_cluster_info *ci;
  1376. unsigned char *map = si->swap_map;
  1377. unsigned long roffset = swp_offset(entry);
  1378. unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER);
  1379. int i;
  1380. bool ret = false;
  1381. ci = lock_cluster_or_swap_info(si, offset);
  1382. if (!ci || !cluster_is_huge(ci)) {
  1383. if (swap_count(map[roffset]))
  1384. ret = true;
  1385. goto unlock_out;
  1386. }
  1387. for (i = 0; i < SWAPFILE_CLUSTER; i++) {
  1388. if (swap_count(map[offset + i])) {
  1389. ret = true;
  1390. break;
  1391. }
  1392. }
  1393. unlock_out:
  1394. unlock_cluster_or_swap_info(si, ci);
  1395. return ret;
  1396. }
  1397. static bool page_swapped(struct page *page)
  1398. {
  1399. swp_entry_t entry;
  1400. struct swap_info_struct *si;
  1401. if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page)))
  1402. return page_swapcount(page) != 0;
  1403. page = compound_head(page);
  1404. entry.val = page_private(page);
  1405. si = _swap_info_get(entry);
  1406. if (si)
  1407. return swap_page_trans_huge_swapped(si, entry);
  1408. return false;
  1409. }
  1410. static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
  1411. int *total_swapcount)
  1412. {
  1413. int i, map_swapcount, _total_mapcount, _total_swapcount;
  1414. unsigned long offset = 0;
  1415. struct swap_info_struct *si;
  1416. struct swap_cluster_info *ci = NULL;
  1417. unsigned char *map = NULL;
  1418. int mapcount, swapcount = 0;
  1419. /* hugetlbfs shouldn't call it */
  1420. VM_BUG_ON_PAGE(PageHuge(page), page);
  1421. if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
  1422. mapcount = page_trans_huge_mapcount(page, total_mapcount);
  1423. if (PageSwapCache(page))
  1424. swapcount = page_swapcount(page);
  1425. if (total_swapcount)
  1426. *total_swapcount = swapcount;
  1427. return mapcount + swapcount;
  1428. }
  1429. page = compound_head(page);
  1430. _total_mapcount = _total_swapcount = map_swapcount = 0;
  1431. if (PageSwapCache(page)) {
  1432. swp_entry_t entry;
  1433. entry.val = page_private(page);
  1434. si = _swap_info_get(entry);
  1435. if (si) {
  1436. map = si->swap_map;
  1437. offset = swp_offset(entry);
  1438. }
  1439. }
  1440. if (map)
  1441. ci = lock_cluster(si, offset);
  1442. for (i = 0; i < HPAGE_PMD_NR; i++) {
  1443. mapcount = atomic_read(&page[i]._mapcount) + 1;
  1444. _total_mapcount += mapcount;
  1445. if (map) {
  1446. swapcount = swap_count(map[offset + i]);
  1447. _total_swapcount += swapcount;
  1448. }
  1449. map_swapcount = max(map_swapcount, mapcount + swapcount);
  1450. }
  1451. unlock_cluster(ci);
  1452. if (PageDoubleMap(page)) {
  1453. map_swapcount -= 1;
  1454. _total_mapcount -= HPAGE_PMD_NR;
  1455. }
  1456. mapcount = compound_mapcount(page);
  1457. map_swapcount += mapcount;
  1458. _total_mapcount += mapcount;
  1459. if (total_mapcount)
  1460. *total_mapcount = _total_mapcount;
  1461. if (total_swapcount)
  1462. *total_swapcount = _total_swapcount;
  1463. return map_swapcount;
  1464. }
  1465. /*
  1466. * We can write to an anon page without COW if there are no other references
  1467. * to it. And as a side-effect, free up its swap: because the old content
  1468. * on disk will never be read, and seeking back there to write new content
  1469. * later would only waste time away from clustering.
  1470. *
  1471. * NOTE: total_map_swapcount should not be relied upon by the caller if
1472. * reuse_swap_page() returns false, but it may always be overwritten
  1473. * (see the other implementation for CONFIG_SWAP=n).
  1474. */
  1475. bool reuse_swap_page(struct page *page, int *total_map_swapcount)
  1476. {
  1477. int count, total_mapcount, total_swapcount;
  1478. VM_BUG_ON_PAGE(!PageLocked(page), page);
  1479. if (unlikely(PageKsm(page)))
  1480. return false;
  1481. count = page_trans_huge_map_swapcount(page, &total_mapcount,
  1482. &total_swapcount);
  1483. if (total_map_swapcount)
  1484. *total_map_swapcount = total_mapcount + total_swapcount;
  1485. if (count == 1 && PageSwapCache(page) &&
  1486. (likely(!PageTransCompound(page)) ||
  1487. /* The remaining swap count will be freed soon */
  1488. total_swapcount == page_swapcount(page))) {
  1489. if (!PageWriteback(page)) {
  1490. page = compound_head(page);
  1491. delete_from_swap_cache(page);
  1492. SetPageDirty(page);
  1493. } else {
  1494. swp_entry_t entry;
  1495. struct swap_info_struct *p;
  1496. entry.val = page_private(page);
  1497. p = swap_info_get(entry);
  1498. if (p->flags & SWP_STABLE_WRITES) {
  1499. spin_unlock(&p->lock);
  1500. return false;
  1501. }
  1502. spin_unlock(&p->lock);
  1503. }
  1504. }
  1505. return count <= 1;
  1506. }
  1507. /*
  1508. * If swap is getting full, or if there are no more mappings of this page,
  1509. * then try_to_free_swap is called to free its swap space.
  1510. */
  1511. int try_to_free_swap(struct page *page)
  1512. {
  1513. VM_BUG_ON_PAGE(!PageLocked(page), page);
  1514. if (!PageSwapCache(page))
  1515. return 0;
  1516. if (PageWriteback(page))
  1517. return 0;
  1518. if (page_swapped(page))
  1519. return 0;
  1520. /*
  1521. * Once hibernation has begun to create its image of memory,
  1522. * there's a danger that one of the calls to try_to_free_swap()
  1523. * - most probably a call from __try_to_reclaim_swap() while
  1524. * hibernation is allocating its own swap pages for the image,
  1525. * but conceivably even a call from memory reclaim - will free
  1526. * the swap from a page which has already been recorded in the
  1527. * image as a clean swapcache page, and then reuse its swap for
  1528. * another page of the image. On waking from hibernation, the
  1529. * original page might be freed under memory pressure, then
  1530. * later read back in from swap, now with the wrong data.
  1531. *
  1532. * Hibernation suspends storage while it is writing the image
  1533. * to disk so check that here.
  1534. */
  1535. if (pm_suspended_storage())
  1536. return 0;
  1537. page = compound_head(page);
  1538. delete_from_swap_cache(page);
  1539. SetPageDirty(page);
  1540. return 1;
  1541. }
  1542. /*
  1543. * Free the swap entry like above, but also try to
  1544. * free the page cache entry if it is the last user.
  1545. */
  1546. int free_swap_and_cache(swp_entry_t entry)
  1547. {
  1548. struct swap_info_struct *p;
  1549. unsigned char count;
  1550. if (non_swap_entry(entry))
  1551. return 1;
  1552. p = _swap_info_get(entry);
  1553. if (p) {
  1554. count = __swap_entry_free(p, entry);
  1555. if (count == SWAP_HAS_CACHE &&
  1556. !swap_page_trans_huge_swapped(p, entry))
  1557. __try_to_reclaim_swap(p, swp_offset(entry),
  1558. TTRS_UNMAPPED | TTRS_FULL);
  1559. }
  1560. return p != NULL;
  1561. }
  1562. #ifdef CONFIG_HIBERNATION
  1563. /*
  1564. * Find the swap type that corresponds to given device (if any).
  1565. *
  1566. * @offset - number of the PAGE_SIZE-sized block of the device, starting
  1567. * from 0, in which the swap header is expected to be located.
  1568. *
  1569. * This is needed for the suspend to disk (aka swsusp).
  1570. */
  1571. int swap_type_of(dev_t device, sector_t offset)
  1572. {
  1573. int type;
  1574. if (!device)
  1575. return -1;
  1576. spin_lock(&swap_lock);
  1577. for (type = 0; type < nr_swapfiles; type++) {
  1578. struct swap_info_struct *sis = swap_info[type];
  1579. if (!(sis->flags & SWP_WRITEOK))
  1580. continue;
  1581. if (device == sis->bdev->bd_dev) {
  1582. struct swap_extent *se = first_se(sis);
  1583. if (se->start_block == offset) {
  1584. spin_unlock(&swap_lock);
  1585. return type;
  1586. }
  1587. }
  1588. }
  1589. spin_unlock(&swap_lock);
  1590. return -ENODEV;
  1591. }
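/*
 * Usage sketch (not from the original source): during resume from
 * hibernation, a call along the lines of
 *
 *	type = swap_type_of(resume_dev, resume_offset);
 *
 * (both names are placeholders) yields the swap type whose backing device
 * matches resume_dev and whose first extent starts at the given
 * PAGE_SIZE-sized block, or -ENODEV if no active swap area matches.
 */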
  1592. int find_first_swap(dev_t *device)
  1593. {
  1594. int type;
  1595. spin_lock(&swap_lock);
  1596. for (type = 0; type < nr_swapfiles; type++) {
  1597. struct swap_info_struct *sis = swap_info[type];
  1598. if (!(sis->flags & SWP_WRITEOK))
  1599. continue;
  1600. *device = sis->bdev->bd_dev;
  1601. spin_unlock(&swap_lock);
  1602. return type;
  1603. }
  1604. spin_unlock(&swap_lock);
  1605. return -ENODEV;
  1606. }
  1607. /*
  1608. * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev
  1609. * corresponding to given index in swap_info (swap type).
  1610. */
  1611. sector_t swapdev_block(int type, pgoff_t offset)
  1612. {
  1613. struct block_device *bdev;
  1614. struct swap_info_struct *si = swap_type_to_swap_info(type);
  1615. if (!si || !(si->flags & SWP_WRITEOK))
  1616. return 0;
  1617. return map_swap_entry(swp_entry(type, offset), &bdev);
  1618. }
  1619. /*
  1620. * Return either the total number of swap pages of given type, or the number
  1621. * of free pages of that type (depending on @free)
  1622. *
  1623. * This is needed for software suspend
  1624. */
  1625. unsigned int count_swap_pages(int type, int free)
  1626. {
  1627. unsigned int n = 0;
  1628. spin_lock(&swap_lock);
  1629. if ((unsigned int)type < nr_swapfiles) {
  1630. struct swap_info_struct *sis = swap_info[type];
  1631. spin_lock(&sis->lock);
  1632. if (sis->flags & SWP_WRITEOK) {
  1633. n = sis->pages;
  1634. if (free)
  1635. n -= sis->inuse_pages;
  1636. }
  1637. spin_unlock(&sis->lock);
  1638. }
  1639. spin_unlock(&swap_lock);
  1640. return n;
  1641. }
  1642. #endif /* CONFIG_HIBERNATION */
  1643. static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte)
  1644. {
  1645. return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte);
  1646. }
  1647. /*
  1648. * No need to decide whether this PTE shares the swap entry with others,
  1649. * just let do_wp_page work it out if a write is requested later - to
  1650. * force COW, vm_page_prot omits write permission from any private vma.
  1651. */
  1652. static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
  1653. unsigned long addr, swp_entry_t entry, struct page *page)
  1654. {
  1655. struct page *swapcache;
  1656. spinlock_t *ptl;
  1657. pte_t *pte;
  1658. int ret = 1;
  1659. swapcache = page;
  1660. page = ksm_might_need_to_copy(page, vma, addr);
  1661. if (unlikely(!page))
  1662. return -ENOMEM;
  1663. pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
  1664. if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
  1665. ret = 0;
  1666. goto out;
  1667. }
  1668. dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
  1669. inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
  1670. get_page(page);
  1671. set_pte_at(vma->vm_mm, addr, pte,
  1672. pte_mkold(mk_pte(page, vma->vm_page_prot)));
  1673. if (page == swapcache) {
  1674. page_add_anon_rmap(page, vma, addr, false);
  1675. } else { /* ksm created a completely new copy */
  1676. page_add_new_anon_rmap(page, vma, addr, false);
  1677. lru_cache_add_inactive_or_unevictable(page, vma);
  1678. }
  1679. swap_free(entry);
  1680. out:
  1681. pte_unmap_unlock(pte, ptl);
  1682. if (page != swapcache) {
  1683. unlock_page(page);
  1684. put_page(page);
  1685. }
  1686. return ret;
  1687. }
  1688. static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
  1689. unsigned long addr, unsigned long end,
  1690. unsigned int type, bool frontswap,
  1691. unsigned long *fs_pages_to_unuse)
  1692. {
  1693. struct page *page;
  1694. swp_entry_t entry;
  1695. pte_t *pte;
  1696. struct swap_info_struct *si;
  1697. unsigned long offset;
  1698. int ret = 0;
  1699. volatile unsigned char *swap_map;
  1700. si = swap_info[type];
  1701. pte = pte_offset_map(pmd, addr);
  1702. do {
  1703. struct vm_fault vmf;
  1704. if (!is_swap_pte(*pte))
  1705. continue;
  1706. entry = pte_to_swp_entry(*pte);
  1707. if (swp_type(entry) != type)
  1708. continue;
  1709. offset = swp_offset(entry);
  1710. if (frontswap && !frontswap_test(si, offset))
  1711. continue;
  1712. pte_unmap(pte);
  1713. swap_map = &si->swap_map[offset];
  1714. page = lookup_swap_cache(entry, vma, addr);
  1715. if (!page) {
  1716. vmf.vma = vma;
  1717. vmf.address = addr;
  1718. vmf.pmd = pmd;
  1719. page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
  1720. &vmf);
  1721. }
  1722. if (!page) {
  1723. if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
  1724. goto try_next;
  1725. return -ENOMEM;
  1726. }
  1727. lock_page(page);
  1728. wait_on_page_writeback(page);
  1729. ret = unuse_pte(vma, pmd, addr, entry, page);
  1730. if (ret < 0) {
  1731. unlock_page(page);
  1732. put_page(page);
  1733. goto out;
  1734. }
  1735. try_to_free_swap(page);
  1736. unlock_page(page);
  1737. put_page(page);
  1738. if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
  1739. ret = FRONTSWAP_PAGES_UNUSED;
  1740. goto out;
  1741. }
  1742. try_next:
  1743. pte = pte_offset_map(pmd, addr);
  1744. } while (pte++, addr += PAGE_SIZE, addr != end);
  1745. pte_unmap(pte - 1);
  1746. ret = 0;
  1747. out:
  1748. return ret;
  1749. }
  1750. static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
  1751. unsigned long addr, unsigned long end,
  1752. unsigned int type, bool frontswap,
  1753. unsigned long *fs_pages_to_unuse)
  1754. {
  1755. pmd_t *pmd;
  1756. unsigned long next;
  1757. int ret;
  1758. pmd = pmd_offset(pud, addr);
  1759. do {
  1760. cond_resched();
  1761. next = pmd_addr_end(addr, end);
  1762. if (pmd_none_or_trans_huge_or_clear_bad(pmd))
  1763. continue;
  1764. ret = unuse_pte_range(vma, pmd, addr, next, type,
  1765. frontswap, fs_pages_to_unuse);
  1766. if (ret)
  1767. return ret;
  1768. } while (pmd++, addr = next, addr != end);
  1769. return 0;
  1770. }
  1771. static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
  1772. unsigned long addr, unsigned long end,
  1773. unsigned int type, bool frontswap,
  1774. unsigned long *fs_pages_to_unuse)
  1775. {
  1776. pud_t *pud;
  1777. unsigned long next;
  1778. int ret;
  1779. pud = pud_offset(p4d, addr);
  1780. do {
  1781. next = pud_addr_end(addr, end);
  1782. if (pud_none_or_clear_bad(pud))
  1783. continue;
  1784. ret = unuse_pmd_range(vma, pud, addr, next, type,
  1785. frontswap, fs_pages_to_unuse);
  1786. if (ret)
  1787. return ret;
  1788. } while (pud++, addr = next, addr != end);
  1789. return 0;
  1790. }
  1791. static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
  1792. unsigned long addr, unsigned long end,
  1793. unsigned int type, bool frontswap,
  1794. unsigned long *fs_pages_to_unuse)
  1795. {
  1796. p4d_t *p4d;
  1797. unsigned long next;
  1798. int ret;
  1799. p4d = p4d_offset(pgd, addr);
  1800. do {
  1801. next = p4d_addr_end(addr, end);
  1802. if (p4d_none_or_clear_bad(p4d))
  1803. continue;
  1804. ret = unuse_pud_range(vma, p4d, addr, next, type,
  1805. frontswap, fs_pages_to_unuse);
  1806. if (ret)
  1807. return ret;
  1808. } while (p4d++, addr = next, addr != end);
  1809. return 0;
  1810. }
  1811. static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
  1812. bool frontswap, unsigned long *fs_pages_to_unuse)
  1813. {
  1814. pgd_t *pgd;
  1815. unsigned long addr, end, next;
  1816. int ret;
  1817. addr = vma->vm_start;
  1818. end = vma->vm_end;
  1819. pgd = pgd_offset(vma->vm_mm, addr);
  1820. do {
  1821. next = pgd_addr_end(addr, end);
  1822. if (pgd_none_or_clear_bad(pgd))
  1823. continue;
  1824. ret = unuse_p4d_range(vma, pgd, addr, next, type,
  1825. frontswap, fs_pages_to_unuse);
  1826. if (ret)
  1827. return ret;
  1828. } while (pgd++, addr = next, addr != end);
  1829. return 0;
  1830. }
  1831. static int unuse_mm(struct mm_struct *mm, unsigned int type,
  1832. bool frontswap, unsigned long *fs_pages_to_unuse)
  1833. {
  1834. struct vm_area_struct *vma;
  1835. int ret = 0;
  1836. mmap_read_lock(mm);
  1837. for (vma = mm->mmap; vma; vma = vma->vm_next) {
  1838. if (vma->anon_vma) {
  1839. ret = unuse_vma(vma, type, frontswap,
  1840. fs_pages_to_unuse);
  1841. if (ret)
  1842. break;
  1843. }
  1844. cond_resched();
  1845. }
  1846. mmap_read_unlock(mm);
  1847. return ret;
  1848. }
  1849. /*
  1850. * Scan swap_map (or frontswap_map if frontswap parameter is true)
  1851. * from current position to next entry still in use. Return 0
1852. * if there are no in-use entries after prev through the end of the map.
  1853. */
  1854. static unsigned int find_next_to_unuse(struct swap_info_struct *si,
  1855. unsigned int prev, bool frontswap)
  1856. {
  1857. unsigned int i;
  1858. unsigned char count;
  1859. /*
  1860. * No need for swap_lock here: we're just looking
  1861. * for whether an entry is in use, not modifying it; false
  1862. * hits are okay, and sys_swapoff() has already prevented new
  1863. * allocations from this area (while holding swap_lock).
  1864. */
  1865. for (i = prev + 1; i < si->max; i++) {
  1866. count = READ_ONCE(si->swap_map[i]);
  1867. if (count && swap_count(count) != SWAP_MAP_BAD)
  1868. if (!frontswap || frontswap_test(si, i))
  1869. break;
  1870. if ((i % LATENCY_LIMIT) == 0)
  1871. cond_resched();
  1872. }
  1873. if (i == si->max)
  1874. i = 0;
  1875. return i;
  1876. }
  1877. /*
  1878. * If the boolean frontswap is true, only unuse pages_to_unuse pages;
  1879. * pages_to_unuse==0 means all pages; ignored if frontswap is false
  1880. */
  1881. int try_to_unuse(unsigned int type, bool frontswap,
  1882. unsigned long pages_to_unuse)
  1883. {
  1884. struct mm_struct *prev_mm;
  1885. struct mm_struct *mm;
  1886. struct list_head *p;
  1887. int retval = 0;
  1888. struct swap_info_struct *si = swap_info[type];
  1889. struct page *page;
  1890. swp_entry_t entry;
  1891. unsigned int i;
  1892. if (!READ_ONCE(si->inuse_pages))
  1893. return 0;
  1894. if (!frontswap)
  1895. pages_to_unuse = 0;
  1896. retry:
  1897. retval = shmem_unuse(type, frontswap, &pages_to_unuse);
  1898. if (retval)
  1899. goto out;
  1900. prev_mm = &init_mm;
  1901. mmget(prev_mm);
  1902. spin_lock(&mmlist_lock);
  1903. p = &init_mm.mmlist;
  1904. while (READ_ONCE(si->inuse_pages) &&
  1905. !signal_pending(current) &&
  1906. (p = p->next) != &init_mm.mmlist) {
  1907. mm = list_entry(p, struct mm_struct, mmlist);
  1908. if (!mmget_not_zero(mm))
  1909. continue;
  1910. spin_unlock(&mmlist_lock);
  1911. mmput(prev_mm);
  1912. prev_mm = mm;
  1913. retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
  1914. if (retval) {
  1915. mmput(prev_mm);
  1916. goto out;
  1917. }
  1918. /*
  1919. * Make sure that we aren't completely killing
  1920. * interactive performance.
  1921. */
  1922. cond_resched();
  1923. spin_lock(&mmlist_lock);
  1924. }
  1925. spin_unlock(&mmlist_lock);
  1926. mmput(prev_mm);
  1927. i = 0;
  1928. while (READ_ONCE(si->inuse_pages) &&
  1929. !signal_pending(current) &&
  1930. (i = find_next_to_unuse(si, i, frontswap)) != 0) {
  1931. entry = swp_entry(type, i);
  1932. page = find_get_page(swap_address_space(entry), i);
  1933. if (!page)
  1934. continue;
  1935. /*
  1936. * It is conceivable that a racing task removed this page from
  1937. * swap cache just before we acquired the page lock. The page
  1938. * might even be back in swap cache on another swap area. But
  1939. * that is okay, try_to_free_swap() only removes stale pages.
  1940. */
  1941. lock_page(page);
  1942. wait_on_page_writeback(page);
  1943. try_to_free_swap(page);
  1944. unlock_page(page);
  1945. put_page(page);
  1946. /*
  1947. * For frontswap, we just need to unuse pages_to_unuse, if
  1948. * it was specified. Need not check frontswap again here as
  1949. * we already zeroed out pages_to_unuse if not frontswap.
  1950. */
  1951. if (pages_to_unuse && --pages_to_unuse == 0)
  1952. goto out;
  1953. }
  1954. /*
  1955. * Lets check again to see if there are still swap entries in the map.
  1956. * If yes, we would need to do retry the unuse logic again.
  1957. * Under global memory pressure, swap entries can be reinserted back
  1958. * into process space after the mmlist loop above passes over them.
  1959. *
  1960. * Limit the number of retries? No: when mmget_not_zero() above fails,
  1961. * that mm is likely to be freeing swap from exit_mmap(), which proceeds
  1962. * at its own independent pace; and even shmem_writepage() could have
  1963. * been preempted after get_swap_page(), temporarily hiding that swap.
  1964. * It's easy and robust (though cpu-intensive) just to keep retrying.
  1965. */
  1966. if (READ_ONCE(si->inuse_pages)) {
  1967. if (!signal_pending(current))
  1968. goto retry;
  1969. retval = -EINTR;
  1970. }
  1971. out:
  1972. return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
  1973. }
  1974. /*
  1975. * After a successful try_to_unuse, if no swap is now in use, we know
  1976. * we can empty the mmlist. swap_lock must be held on entry and exit.
  1977. * Note that mmlist_lock nests inside swap_lock, and an mm must be
  1978. * added to the mmlist just after page_duplicate - before would be racy.
  1979. */
  1980. static void drain_mmlist(void)
  1981. {
  1982. struct list_head *p, *next;
  1983. unsigned int type;
  1984. for (type = 0; type < nr_swapfiles; type++)
  1985. if (swap_info[type]->inuse_pages)
  1986. return;
  1987. spin_lock(&mmlist_lock);
  1988. list_for_each_safe(p, next, &init_mm.mmlist)
  1989. list_del_init(p);
  1990. spin_unlock(&mmlist_lock);
  1991. }
  1992. /*
  1993. * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
  1994. * corresponds to page offset for the specified swap entry.
1995. * Note that the return type of this function is sector_t, but it returns
1996. * the page offset into the bdev, not the sector offset.
  1997. */
  1998. static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
  1999. {
  2000. struct swap_info_struct *sis;
  2001. struct swap_extent *se;
  2002. pgoff_t offset;
  2003. sis = swp_swap_info(entry);
  2004. *bdev = sis->bdev;
  2005. offset = swp_offset(entry);
  2006. se = offset_to_swap_extent(sis, offset);
  2007. return se->start_block + (offset - se->start_page);
  2008. }
  2009. /*
  2010. * Returns the page offset into bdev for the specified page's swap entry.
  2011. */
  2012. sector_t map_swap_page(struct page *page, struct block_device **bdev)
  2013. {
  2014. swp_entry_t entry;
  2015. entry.val = page_private(page);
  2016. return map_swap_entry(entry, bdev);
  2017. }
  2018. /*
  2019. * Free all of a swapdev's extent information
  2020. */
  2021. static void destroy_swap_extents(struct swap_info_struct *sis)
  2022. {
  2023. while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
  2024. struct rb_node *rb = sis->swap_extent_root.rb_node;
  2025. struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
  2026. rb_erase(rb, &sis->swap_extent_root);
  2027. kfree(se);
  2028. }
  2029. if (sis->flags & SWP_ACTIVATED) {
  2030. struct file *swap_file = sis->swap_file;
  2031. struct address_space *mapping = swap_file->f_mapping;
  2032. sis->flags &= ~SWP_ACTIVATED;
  2033. if (mapping->a_ops->swap_deactivate)
  2034. mapping->a_ops->swap_deactivate(swap_file);
  2035. }
  2036. }
  2037. /*
  2038. * Add a block range (and the corresponding page range) into this swapdev's
  2039. * extent tree.
  2040. *
  2041. * This function rather assumes that it is called in ascending page order.
  2042. */
  2043. int
  2044. add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
  2045. unsigned long nr_pages, sector_t start_block)
  2046. {
  2047. struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
  2048. struct swap_extent *se;
  2049. struct swap_extent *new_se;
  2050. /*
2051. * Place the new node at the rightmost position, since this
2052. * function is called in ascending page order.
  2053. */
  2054. while (*link) {
  2055. parent = *link;
  2056. link = &parent->rb_right;
  2057. }
  2058. if (parent) {
  2059. se = rb_entry(parent, struct swap_extent, rb_node);
  2060. BUG_ON(se->start_page + se->nr_pages != start_page);
  2061. if (se->start_block + se->nr_pages == start_block) {
  2062. /* Merge it */
  2063. se->nr_pages += nr_pages;
  2064. return 0;
  2065. }
  2066. }
  2067. /* No merge, insert a new extent. */
  2068. new_se = kmalloc(sizeof(*se), GFP_KERNEL);
  2069. if (new_se == NULL)
  2070. return -ENOMEM;
  2071. new_se->start_page = start_page;
  2072. new_se->nr_pages = nr_pages;
  2073. new_se->start_block = start_block;
  2074. rb_link_node(&new_se->rb_node, parent, link);
  2075. rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
  2076. return 1;
  2077. }
  2078. EXPORT_SYMBOL_GPL(add_swap_extent);
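/*
 * Illustrative sketch (not from the original source): how successive
 * add_swap_extent() calls behave.  example_build_extents() and the page
 * and block ranges are made up; only the merge rule comes from the code
 * above.  Error handling is omitted for brevity.
 */
static inline void example_build_extents(struct swap_info_struct *sis)
{
	/* pages 0..15 at disk blocks 100..115: creates the first extent */
	add_swap_extent(sis, 0, 16, 100);
	/* pages 16..31 at blocks 116..131: contiguous on disk, so merged */
	add_swap_extent(sis, 16, 16, 116);
	/* pages 32..47 at blocks 200..215: gap on disk, a new extent is added */
	add_swap_extent(sis, 32, 16, 200);
}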
  2079. /*
  2080. * A `swap extent' is a simple thing which maps a contiguous range of pages
  2081. * onto a contiguous range of disk blocks. An ordered list of swap extents
  2082. * is built at swapon time and is then used at swap_writepage/swap_readpage
  2083. * time for locating where on disk a page belongs.
  2084. *
  2085. * If the swapfile is an S_ISBLK block device, a single extent is installed.
  2086. * This is done so that the main operating code can treat S_ISBLK and S_ISREG
  2087. * swap files identically.
  2088. *
  2089. * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
  2090. * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
  2091. * swapfiles are handled *identically* after swapon time.
  2092. *
  2093. * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
  2094. * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
  2095. * some stray blocks are found which do not fall within the PAGE_SIZE alignment
  2096. * requirements, they are simply tossed out - we will never use those blocks
  2097. * for swapping.
  2098. *
  2099. * For all swap devices we set S_SWAPFILE across the life of the swapon. This
  2100. * prevents users from writing to the swap device, which will corrupt memory.
  2101. *
  2102. * The amount of disk space which a single swap extent represents varies.
  2103. * Typically it is in the 1-4 megabyte range. So we can have hundreds of
2104. * extents in the list. To avoid much list walking, the extents are
2105. * kept in an rbtree (swap_extent_root) ordered by page offset, so
2106. * map_swap_entry() can locate the extent for any offset with a cheap
2107. * tree lookup.
  2108. */
  2109. static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
  2110. {
  2111. struct file *swap_file = sis->swap_file;
  2112. struct address_space *mapping = swap_file->f_mapping;
  2113. struct inode *inode = mapping->host;
  2114. int ret;
  2115. if (S_ISBLK(inode->i_mode)) {
  2116. ret = add_swap_extent(sis, 0, sis->max, 0);
  2117. *span = sis->pages;
  2118. return ret;
  2119. }
  2120. if (mapping->a_ops->swap_activate) {
  2121. ret = mapping->a_ops->swap_activate(sis, swap_file, span);
  2122. if (ret >= 0)
  2123. sis->flags |= SWP_ACTIVATED;
  2124. if (!ret) {
  2125. sis->flags |= SWP_FS_OPS;
  2126. ret = add_swap_extent(sis, 0, sis->max, 0);
  2127. *span = sis->pages;
  2128. }
  2129. return ret;
  2130. }
  2131. return generic_swapfile_activate(sis, swap_file, span);
  2132. }
  2133. static int swap_node(struct swap_info_struct *p)
  2134. {
  2135. struct block_device *bdev;
  2136. if (p->bdev)
  2137. bdev = p->bdev;
  2138. else
  2139. bdev = p->swap_file->f_inode->i_sb->s_bdev;
  2140. return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
  2141. }
  2142. static void setup_swap_info(struct swap_info_struct *p, int prio,
  2143. unsigned char *swap_map,
  2144. struct swap_cluster_info *cluster_info)
  2145. {
  2146. int i;
  2147. if (prio >= 0)
  2148. p->prio = prio;
  2149. else
  2150. p->prio = --least_priority;
  2151. /*
  2152. * the plist prio is negated because plist ordering is
  2153. * low-to-high, while swap ordering is high-to-low
  2154. */
  2155. p->list.prio = -p->prio;
  2156. for_each_node(i) {
  2157. if (p->prio >= 0)
  2158. p->avail_lists[i].prio = -p->prio;
  2159. else {
  2160. if (swap_node(p) == i)
  2161. p->avail_lists[i].prio = 1;
  2162. else
  2163. p->avail_lists[i].prio = -p->prio;
  2164. }
  2165. }
  2166. p->swap_map = swap_map;
  2167. p->cluster_info = cluster_info;
  2168. }
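/*
 * Illustration (not from the original source): for two swap areas with
 * prio 5 and prio -2, setup_swap_info() sets list.prio to -5 and 2.
 * Because plists sort low-to-high, the prio-5 area comes first and is
 * preferred, giving the high-to-low ordering that swap wants.
 */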
  2169. static void _enable_swap_info(struct swap_info_struct *p)
  2170. {
  2171. p->flags |= SWP_WRITEOK | SWP_VALID;
  2172. atomic_long_add(p->pages, &nr_swap_pages);
  2173. total_swap_pages += p->pages;
  2174. assert_spin_locked(&swap_lock);
  2175. /*
  2176. * both lists are plists, and thus priority ordered.
  2177. * swap_active_head needs to be priority ordered for swapoff(),
  2178. * which on removal of any swap_info_struct with an auto-assigned
  2179. * (i.e. negative) priority increments the auto-assigned priority
  2180. * of any lower-priority swap_info_structs.
  2181. * swap_avail_head needs to be priority ordered for get_swap_page(),
  2182. * which allocates swap pages from the highest available priority
  2183. * swap_info_struct.
  2184. */
  2185. plist_add(&p->list, &swap_active_head);
  2186. add_to_avail_list(p);
  2187. }
  2188. static void enable_swap_info(struct swap_info_struct *p, int prio,
  2189. unsigned char *swap_map,
  2190. struct swap_cluster_info *cluster_info,
  2191. unsigned long *frontswap_map)
  2192. {
  2193. frontswap_init(p->type, frontswap_map);
  2194. spin_lock(&swap_lock);
  2195. spin_lock(&p->lock);
  2196. setup_swap_info(p, prio, swap_map, cluster_info);
  2197. spin_unlock(&p->lock);
  2198. spin_unlock(&swap_lock);
  2199. /*
  2200. * Guarantee swap_map, cluster_info, etc. fields are valid
  2201. * between get/put_swap_device() if SWP_VALID bit is set
  2202. */
  2203. synchronize_rcu();
  2204. spin_lock(&swap_lock);
  2205. spin_lock(&p->lock);
  2206. _enable_swap_info(p);
  2207. spin_unlock(&p->lock);
  2208. spin_unlock(&swap_lock);
  2209. }
  2210. static void reinsert_swap_info(struct swap_info_struct *p)
  2211. {
  2212. spin_lock(&swap_lock);
  2213. spin_lock(&p->lock);
  2214. setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
  2215. _enable_swap_info(p);
  2216. spin_unlock(&p->lock);
  2217. spin_unlock(&swap_lock);
  2218. }
  2219. bool has_usable_swap(void)
  2220. {
  2221. bool ret = true;
  2222. spin_lock(&swap_lock);
  2223. if (plist_head_empty(&swap_active_head))
  2224. ret = false;
  2225. spin_unlock(&swap_lock);
  2226. return ret;
  2227. }
  2228. SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
  2229. {
  2230. struct swap_info_struct *p = NULL;
  2231. unsigned char *swap_map;
  2232. struct swap_cluster_info *cluster_info;
  2233. unsigned long *frontswap_map;
  2234. struct file *swap_file, *victim;
  2235. struct address_space *mapping;
  2236. struct inode *inode;
  2237. struct filename *pathname;
  2238. int err, found = 0;
  2239. unsigned int old_block_size;
  2240. if (!capable(CAP_SYS_ADMIN))
  2241. return -EPERM;
  2242. BUG_ON(!current->mm);
  2243. pathname = getname(specialfile);
  2244. if (IS_ERR(pathname))
  2245. return PTR_ERR(pathname);
  2246. victim = file_open_name(pathname, O_RDWR|O_LARGEFILE, 0);
  2247. err = PTR_ERR(victim);
  2248. if (IS_ERR(victim))
  2249. goto out;
  2250. mapping = victim->f_mapping;
  2251. spin_lock(&swap_lock);
  2252. plist_for_each_entry(p, &swap_active_head, list) {
  2253. if (p->flags & SWP_WRITEOK) {
  2254. if (p->swap_file->f_mapping == mapping) {
  2255. found = 1;
  2256. break;
  2257. }
  2258. }
  2259. }
  2260. if (!found) {
  2261. err = -EINVAL;
  2262. spin_unlock(&swap_lock);
  2263. goto out_dput;
  2264. }
  2265. if (!security_vm_enough_memory_mm(current->mm, p->pages))
  2266. vm_unacct_memory(p->pages);
  2267. else {
  2268. err = -ENOMEM;
  2269. spin_unlock(&swap_lock);
  2270. goto out_dput;
  2271. }
  2272. del_from_avail_list(p);
  2273. spin_lock(&p->lock);
  2274. if (p->prio < 0) {
  2275. struct swap_info_struct *si = p;
  2276. int nid;
  2277. plist_for_each_entry_continue(si, &swap_active_head, list) {
  2278. si->prio++;
  2279. si->list.prio--;
  2280. for_each_node(nid) {
  2281. if (si->avail_lists[nid].prio != 1)
  2282. si->avail_lists[nid].prio--;
  2283. }
  2284. }
  2285. least_priority++;
  2286. }
  2287. plist_del(&p->list, &swap_active_head);
  2288. atomic_long_sub(p->pages, &nr_swap_pages);
  2289. total_swap_pages -= p->pages;
  2290. p->flags &= ~SWP_WRITEOK;
  2291. spin_unlock(&p->lock);
  2292. spin_unlock(&swap_lock);
  2293. disable_swap_slots_cache_lock();
  2294. set_current_oom_origin();
  2295. err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
  2296. clear_current_oom_origin();
  2297. if (err) {
  2298. /* re-insert swap space back into swap_list */
  2299. reinsert_swap_info(p);
  2300. reenable_swap_slots_cache_unlock();
  2301. goto out_dput;
  2302. }
  2303. reenable_swap_slots_cache_unlock();
  2304. spin_lock(&swap_lock);
  2305. spin_lock(&p->lock);
  2306. p->flags &= ~SWP_VALID; /* mark swap device as invalid */
  2307. spin_unlock(&p->lock);
  2308. spin_unlock(&swap_lock);
  2309. /*
  2310. * wait for swap operations protected by get/put_swap_device()
  2311. * to complete
  2312. */
  2313. synchronize_rcu();
  2314. flush_work(&p->discard_work);
  2315. destroy_swap_extents(p);
  2316. if (p->flags & SWP_CONTINUED)
  2317. free_swap_count_continuations(p);
  2318. if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
  2319. atomic_dec(&nr_rotate_swap);
  2320. mutex_lock(&swapon_mutex);
  2321. spin_lock(&swap_lock);
  2322. spin_lock(&p->lock);
  2323. drain_mmlist();
  2324. /* wait for anyone still in scan_swap_map */
  2325. p->highest_bit = 0; /* cuts scans short */
  2326. while (p->flags >= SWP_SCANNING) {
  2327. spin_unlock(&p->lock);
  2328. spin_unlock(&swap_lock);
  2329. schedule_timeout_uninterruptible(1);
  2330. spin_lock(&swap_lock);
  2331. spin_lock(&p->lock);
  2332. }
  2333. swap_file = p->swap_file;
  2334. old_block_size = p->old_block_size;
  2335. p->swap_file = NULL;
  2336. p->max = 0;
  2337. swap_map = p->swap_map;
  2338. p->swap_map = NULL;
  2339. cluster_info = p->cluster_info;
  2340. p->cluster_info = NULL;
  2341. frontswap_map = frontswap_map_get(p);
  2342. spin_unlock(&p->lock);
  2343. spin_unlock(&swap_lock);
  2344. arch_swap_invalidate_area(p->type);
  2345. frontswap_invalidate_area(p->type);
  2346. frontswap_map_set(p, NULL);
  2347. mutex_unlock(&swapon_mutex);
  2348. free_percpu(p->percpu_cluster);
  2349. p->percpu_cluster = NULL;
  2350. free_percpu(p->cluster_next_cpu);
  2351. p->cluster_next_cpu = NULL;
  2352. vfree(swap_map);
  2353. kvfree(cluster_info);
  2354. kvfree(frontswap_map);
  2355. /* Destroy swap account information */
  2356. swap_cgroup_swapoff(p->type);
  2357. exit_swap_address_space(p->type);
  2358. inode = mapping->host;
  2359. if (S_ISBLK(inode->i_mode)) {
  2360. struct block_device *bdev = I_BDEV(inode);
  2361. set_blocksize(bdev, old_block_size);
  2362. blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
  2363. }
  2364. inode_lock(inode);
  2365. inode->i_flags &= ~S_SWAPFILE;
  2366. inode_unlock(inode);
  2367. filp_close(swap_file, NULL);
  2368. /*
  2369. * Clear the SWP_USED flag after all resources are freed so that swapon
  2370. * can reuse this swap_info in alloc_swap_info() safely. It is ok to
  2371. * not hold p->lock after we cleared its SWP_WRITEOK.
  2372. */
  2373. spin_lock(&swap_lock);
  2374. p->flags = 0;
  2375. spin_unlock(&swap_lock);
  2376. err = 0;
  2377. atomic_inc(&proc_poll_event);
  2378. wake_up_interruptible(&proc_poll_wait);
  2379. out_dput:
  2380. filp_close(victim, NULL);
  2381. out:
  2382. putname(pathname);
  2383. return err;
  2384. }
  2385. #ifdef CONFIG_PROC_FS
  2386. static __poll_t swaps_poll(struct file *file, poll_table *wait)
  2387. {
  2388. struct seq_file *seq = file->private_data;
  2389. poll_wait(file, &proc_poll_wait, wait);
  2390. if (seq->poll_event != atomic_read(&proc_poll_event)) {
  2391. seq->poll_event = atomic_read(&proc_poll_event);
  2392. return EPOLLIN | EPOLLRDNORM | EPOLLERR | EPOLLPRI;
  2393. }
  2394. return EPOLLIN | EPOLLRDNORM;
  2395. }
  2396. /* iterator */
  2397. static void *swap_start(struct seq_file *swap, loff_t *pos)
  2398. {
  2399. struct swap_info_struct *si;
  2400. int type;
  2401. loff_t l = *pos;
  2402. mutex_lock(&swapon_mutex);
  2403. if (!l)
  2404. return SEQ_START_TOKEN;
  2405. for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
  2406. if (!(si->flags & SWP_USED) || !si->swap_map)
  2407. continue;
  2408. if (!--l)
  2409. return si;
  2410. }
  2411. return NULL;
  2412. }
  2413. static void *swap_next(struct seq_file *swap, void *v, loff_t *pos)
  2414. {
  2415. struct swap_info_struct *si = v;
  2416. int type;
  2417. if (v == SEQ_START_TOKEN)
  2418. type = 0;
  2419. else
  2420. type = si->type + 1;
  2421. ++(*pos);
  2422. for (; (si = swap_type_to_swap_info(type)); type++) {
  2423. if (!(si->flags & SWP_USED) || !si->swap_map)
  2424. continue;
  2425. return si;
  2426. }
  2427. return NULL;
  2428. }
  2429. static void swap_stop(struct seq_file *swap, void *v)
  2430. {
  2431. mutex_unlock(&swapon_mutex);
  2432. }
  2433. static int swap_show(struct seq_file *swap, void *v)
  2434. {
  2435. struct swap_info_struct *si = v;
  2436. struct file *file;
  2437. int len;
  2438. unsigned int bytes, inuse;
  2439. if (si == SEQ_START_TOKEN) {
  2440. seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
  2441. return 0;
  2442. }
  2443. bytes = si->pages << (PAGE_SHIFT - 10);
  2444. inuse = si->inuse_pages << (PAGE_SHIFT - 10);
  2445. file = si->swap_file;
  2446. len = seq_file_path(swap, file, " \t\n\\");
  2447. seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
  2448. len < 40 ? 40 - len : 1, " ",
  2449. S_ISBLK(file_inode(file)->i_mode) ?
  2450. "partition" : "file\t",
  2451. bytes, bytes < 10000000 ? "\t" : "",
  2452. inuse, inuse < 10000000 ? "\t" : "",
  2453. si->prio);
  2454. return 0;
  2455. }
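/*
 * Example of the resulting /proc/swaps layout (values are illustrative,
 * not taken from a real system); sizes are in KiB, i.e.
 * pages << (PAGE_SHIFT - 10):
 *
 *	Filename                                Type            Size            Used            Priority
 *	/dev/sda2                               partition       8388604         1024            -2
 *	/swapfile                               file            2097148         0               -3
 */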
  2456. static const struct seq_operations swaps_op = {
  2457. .start = swap_start,
  2458. .next = swap_next,
  2459. .stop = swap_stop,
  2460. .show = swap_show
  2461. };
  2462. static int swaps_open(struct inode *inode, struct file *file)
  2463. {
  2464. struct seq_file *seq;
  2465. int ret;
  2466. ret = seq_open(file, &swaps_op);
  2467. if (ret)
  2468. return ret;
  2469. seq = file->private_data;
  2470. seq->poll_event = atomic_read(&proc_poll_event);
  2471. return 0;
  2472. }
  2473. static const struct proc_ops swaps_proc_ops = {
  2474. .proc_flags = PROC_ENTRY_PERMANENT,
  2475. .proc_open = swaps_open,
  2476. .proc_read = seq_read,
  2477. .proc_lseek = seq_lseek,
  2478. .proc_release = seq_release,
  2479. .proc_poll = swaps_poll,
  2480. };
  2481. static int __init procswaps_init(void)
  2482. {
  2483. proc_create("swaps", 0, NULL, &swaps_proc_ops);
  2484. return 0;
  2485. }
  2486. __initcall(procswaps_init);
  2487. #endif /* CONFIG_PROC_FS */
  2488. #ifdef MAX_SWAPFILES_CHECK
  2489. static int __init max_swapfiles_check(void)
  2490. {
  2491. MAX_SWAPFILES_CHECK();
  2492. return 0;
  2493. }
  2494. late_initcall(max_swapfiles_check);
  2495. #endif
  2496. static struct swap_info_struct *alloc_swap_info(void)
  2497. {
  2498. struct swap_info_struct *p;
  2499. struct swap_info_struct *defer = NULL;
  2500. unsigned int type;
  2501. int i;
  2502. p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
  2503. if (!p)
  2504. return ERR_PTR(-ENOMEM);
  2505. spin_lock(&swap_lock);
  2506. for (type = 0; type < nr_swapfiles; type++) {
  2507. if (!(swap_info[type]->flags & SWP_USED))
  2508. break;
  2509. }
  2510. if (type >= MAX_SWAPFILES) {
  2511. spin_unlock(&swap_lock);
  2512. kvfree(p);
  2513. return ERR_PTR(-EPERM);
  2514. }
  2515. if (type >= nr_swapfiles) {
  2516. p->type = type;
  2517. WRITE_ONCE(swap_info[type], p);
  2518. /*
  2519. * Write swap_info[type] before nr_swapfiles, in case a
  2520. * racing procfs swap_start() or swap_next() is reading them.
  2521. * (We never shrink nr_swapfiles, we never free this entry.)
  2522. */
  2523. smp_wmb();
  2524. WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
  2525. } else {
  2526. defer = p;
  2527. p = swap_info[type];
  2528. /*
  2529. * Do not memset this entry: a racing procfs swap_next()
  2530. * would be relying on p->type to remain valid.
  2531. */
  2532. }
  2533. p->swap_extent_root = RB_ROOT;
  2534. plist_node_init(&p->list, 0);
  2535. for_each_node(i)
  2536. plist_node_init(&p->avail_lists[i], 0);
  2537. p->flags = SWP_USED;
  2538. spin_unlock(&swap_lock);
  2539. kvfree(defer);
  2540. spin_lock_init(&p->lock);
  2541. spin_lock_init(&p->cont_lock);
  2542. return p;
  2543. }
  2544. static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
  2545. {
  2546. int error;
  2547. if (S_ISBLK(inode->i_mode)) {
  2548. p->bdev = blkdev_get_by_dev(inode->i_rdev,
  2549. FMODE_READ | FMODE_WRITE | FMODE_EXCL, p);
  2550. if (IS_ERR(p->bdev)) {
  2551. error = PTR_ERR(p->bdev);
  2552. p->bdev = NULL;
  2553. return error;
  2554. }
  2555. p->old_block_size = block_size(p->bdev);
  2556. error = set_blocksize(p->bdev, PAGE_SIZE);
  2557. if (error < 0)
  2558. return error;
  2559. /*
  2560. * Zoned block devices contain zones that have a sequential
  2561. * write only restriction. Hence zoned block devices are not
  2562. * suitable for swapping. Disallow them here.
  2563. */
  2564. if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
  2565. return -EINVAL;
  2566. p->flags |= SWP_BLKDEV;
  2567. } else if (S_ISREG(inode->i_mode)) {
  2568. p->bdev = inode->i_sb->s_bdev;
  2569. }
  2570. return 0;
  2571. }
  2572. /*
  2573. * Find out how many pages are allowed for a single swap device. There
  2574. * are two limiting factors:
  2575. * 1) the number of bits for the swap offset in the swp_entry_t type, and
  2576. * 2) the number of bits in the swap pte, as defined by the different
  2577. * architectures.
  2578. *
  2579. * In order to find the largest possible bit mask, a swap entry with
  2580. * swap type 0 and swap offset ~0UL is created, encoded to a swap pte,
  2581. * decoded to a swp_entry_t again, and finally the swap offset is
  2582. * extracted.
  2583. *
  2584. * This will mask all the bits from the initial ~0UL mask that can't
  2585. * be encoded in either the swp_entry_t or the architecture definition
  2586. * of a swap pte.
  2587. */
  2588. unsigned long generic_max_swapfile_size(void)
  2589. {
  2590. return swp_offset(pte_to_swp_entry(
  2591. swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
  2592. }
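/*
 * Illustration (not from the original source): if an architecture's swap
 * pte can encode, say, a 50-bit swap offset, the entry->pte->entry round
 * trip above leaves offset bits 0..49 set, so the function returns
 * (2^50 - 1) + 1 = 2^50 pages.
 */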
  2593. /* Can be overridden by an architecture for additional checks. */
  2594. __weak unsigned long max_swapfile_size(void)
  2595. {
  2596. return generic_max_swapfile_size();
  2597. }
  2598. static unsigned long read_swap_header(struct swap_info_struct *p,
  2599. union swap_header *swap_header,
  2600. struct inode *inode)
  2601. {
  2602. int i;
  2603. unsigned long maxpages;
  2604. unsigned long swapfilepages;
  2605. unsigned long last_page;
  2606. if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
  2607. pr_err("Unable to find swap-space signature\n");
  2608. return 0;
  2609. }
2610. /* swap partition endianness hack... */
  2611. if (swab32(swap_header->info.version) == 1) {
  2612. swab32s(&swap_header->info.version);
  2613. swab32s(&swap_header->info.last_page);
  2614. swab32s(&swap_header->info.nr_badpages);
  2615. if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
  2616. return 0;
  2617. for (i = 0; i < swap_header->info.nr_badpages; i++)
  2618. swab32s(&swap_header->info.badpages[i]);
  2619. }
  2620. /* Check the swap header's sub-version */
  2621. if (swap_header->info.version != 1) {
  2622. pr_warn("Unable to handle swap header version %d\n",
  2623. swap_header->info.version);
  2624. return 0;
  2625. }
  2626. p->lowest_bit = 1;
  2627. p->cluster_next = 1;
  2628. p->cluster_nr = 0;
  2629. maxpages = max_swapfile_size();
  2630. last_page = swap_header->info.last_page;
  2631. if (!last_page) {
  2632. pr_warn("Empty swap-file\n");
  2633. return 0;
  2634. }
  2635. if (last_page > maxpages) {
  2636. pr_warn("Truncating oversized swap area, only using %luk out of %luk\n",
  2637. maxpages << (PAGE_SHIFT - 10),
  2638. last_page << (PAGE_SHIFT - 10));
  2639. }
  2640. if (maxpages > last_page) {
  2641. maxpages = last_page + 1;
  2642. /* p->max is an unsigned int: don't overflow it */
  2643. if ((unsigned int)maxpages == 0)
  2644. maxpages = UINT_MAX;
  2645. }
  2646. p->highest_bit = maxpages - 1;
  2647. if (!maxpages)
  2648. return 0;
  2649. swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
  2650. if (swapfilepages && maxpages > swapfilepages) {
  2651. pr_warn("Swap area shorter than signature indicates\n");
  2652. return 0;
  2653. }
  2654. if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
  2655. return 0;
  2656. if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
  2657. return 0;
  2658. return maxpages;
  2659. }
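/*
 * Worked example (not from the original source): with 4 KiB pages, a swap
 * header recording last_page = 262143 describes a 262144-page (1 GiB)
 * area.  maxpages becomes last_page + 1 = 262144 here (unless
 * max_swapfile_size() is smaller), and page 0 is later reserved for the
 * header itself in setup_swap_map_and_extents().
 */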
  2660. #define SWAP_CLUSTER_INFO_COLS \
  2661. DIV_ROUND_UP(L1_CACHE_BYTES, sizeof(struct swap_cluster_info))
  2662. #define SWAP_CLUSTER_SPACE_COLS \
  2663. DIV_ROUND_UP(SWAP_ADDRESS_SPACE_PAGES, SWAPFILE_CLUSTER)
  2664. #define SWAP_CLUSTER_COLS \
  2665. max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
  2666. static int setup_swap_map_and_extents(struct swap_info_struct *p,
  2667. union swap_header *swap_header,
  2668. unsigned char *swap_map,
  2669. struct swap_cluster_info *cluster_info,
  2670. unsigned long maxpages,
  2671. sector_t *span)
  2672. {
  2673. unsigned int j, k;
  2674. unsigned int nr_good_pages;
  2675. int nr_extents;
  2676. unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
  2677. unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
  2678. unsigned long i, idx;
  2679. nr_good_pages = maxpages - 1; /* omit header page */
  2680. cluster_list_init(&p->free_clusters);
  2681. cluster_list_init(&p->discard_clusters);
  2682. for (i = 0; i < swap_header->info.nr_badpages; i++) {
  2683. unsigned int page_nr = swap_header->info.badpages[i];
  2684. if (page_nr == 0 || page_nr > swap_header->info.last_page)
  2685. return -EINVAL;
  2686. if (page_nr < maxpages) {
  2687. swap_map[page_nr] = SWAP_MAP_BAD;
  2688. nr_good_pages--;
  2689. /*
  2690. * Haven't marked the cluster free yet, no list
  2691. * operation involved
  2692. */
  2693. inc_cluster_info_page(p, cluster_info, page_nr);
  2694. }
  2695. }
  2696. /* Haven't marked the cluster free yet, no list operation involved */
  2697. for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
  2698. inc_cluster_info_page(p, cluster_info, i);
  2699. if (nr_good_pages) {
  2700. swap_map[0] = SWAP_MAP_BAD;
  2701. /*
2702. * Haven't marked the cluster free yet, no list
2703. * operation involved
  2704. */
  2705. inc_cluster_info_page(p, cluster_info, 0);
  2706. p->max = maxpages;
  2707. p->pages = nr_good_pages;
  2708. nr_extents = setup_swap_extents(p, span);
  2709. if (nr_extents < 0)
  2710. return nr_extents;
  2711. nr_good_pages = p->pages;
  2712. }
  2713. if (!nr_good_pages) {
  2714. pr_warn("Empty swap-file\n");
  2715. return -EINVAL;
  2716. }
  2717. if (!cluster_info)
  2718. return nr_extents;
  2719. /*
2720. * Interleave the free cluster list to reduce false cache line sharing
2721. * of cluster_info and to spread clusters within one swap address space.
  2722. */
  2723. for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
  2724. j = (k + col) % SWAP_CLUSTER_COLS;
  2725. for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
  2726. idx = i * SWAP_CLUSTER_COLS + j;
  2727. if (idx >= nr_clusters)
  2728. continue;
  2729. if (cluster_count(&cluster_info[idx]))
  2730. continue;
  2731. cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
  2732. cluster_list_add_tail(&p->free_clusters, cluster_info,
  2733. idx);
  2734. }
  2735. }
  2736. return nr_extents;
  2737. }
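/*
 * Illustration (not from the original source): with SWAP_CLUSTER_COLS == 4
 * and col == 1, the final loop above adds free clusters in the order
 * 1, 5, 9, ..., then 2, 6, 10, ..., then 3, 7, ..., then 0, 4, 8, ...,
 * spreading apart clusters that are adjacent in cluster_info or that back
 * the same swap address space.
 */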
  2738. /*
  2739. * Helper to sys_swapon determining if a given swap
  2740. * backing device queue supports DISCARD operations.
  2741. */
  2742. static bool swap_discardable(struct swap_info_struct *si)
  2743. {
  2744. struct request_queue *q = bdev_get_queue(si->bdev);
  2745. if (!q || !blk_queue_discard(q))
  2746. return false;
  2747. return true;
  2748. }

SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
	struct swap_info_struct *p;
	struct filename *name;
	struct file *swap_file = NULL;
	struct address_space *mapping;
	int prio;
	int error;
	union swap_header *swap_header;
	int nr_extents;
	sector_t span;
	unsigned long maxpages;
	unsigned char *swap_map = NULL;
	struct swap_cluster_info *cluster_info = NULL;
	unsigned long *frontswap_map = NULL;
	struct page *page = NULL;
	struct inode *inode = NULL;
	bool inced_nr_rotate_swap = false;

	if (swap_flags & ~SWAP_FLAGS_VALID)
		return -EINVAL;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!swap_avail_heads)
		return -ENOMEM;

	p = alloc_swap_info();
	if (IS_ERR(p))
		return PTR_ERR(p);

	INIT_WORK(&p->discard_work, swap_discard_work);

	name = getname(specialfile);
	if (IS_ERR(name)) {
		error = PTR_ERR(name);
		name = NULL;
		goto bad_swap;
	}
	swap_file = file_open_name(name, O_RDWR|O_LARGEFILE, 0);
	if (IS_ERR(swap_file)) {
		error = PTR_ERR(swap_file);
		swap_file = NULL;
		goto bad_swap;
	}

	p->swap_file = swap_file;
	mapping = swap_file->f_mapping;
	inode = mapping->host;

	error = claim_swapfile(p, inode);
	if (unlikely(error))
		goto bad_swap;

	inode_lock(inode);
	if (IS_SWAPFILE(inode)) {
		error = -EBUSY;
		goto bad_swap_unlock_inode;
	}

	/*
	 * Read the swap header.
	 */
	if (!mapping->a_ops->readpage) {
		error = -EINVAL;
		goto bad_swap_unlock_inode;
	}
	page = read_mapping_page(mapping, 0, swap_file);
	if (IS_ERR(page)) {
		error = PTR_ERR(page);
		goto bad_swap_unlock_inode;
	}
	swap_header = kmap(page);

	maxpages = read_swap_header(p, swap_header, inode);
	if (unlikely(!maxpages)) {
		error = -EINVAL;
		goto bad_swap_unlock_inode;
	}

	/* OK, set up the swap map and apply the bad block list */
	swap_map = vzalloc(maxpages);
	if (!swap_map) {
		error = -ENOMEM;
		goto bad_swap_unlock_inode;
	}

	if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
		p->flags |= SWP_STABLE_WRITES;

	if (p->bdev && p->bdev->bd_disk->fops->rw_page)
		p->flags |= SWP_SYNCHRONOUS_IO;
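	/*
	 * Note: SWP_STABLE_WRITES records that the backing device requires
	 * pages to stay unmodified while under writeback; SWP_SYNCHRONOUS_IO
	 * marks devices that provide an rw_page method (such as zram or brd),
	 * allowing do_swap_page() to read swapped pages synchronously and
	 * skip the swap cache when an entry has a single user.
	 */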

	if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
		int cpu;
		unsigned long ci, nr_cluster;

		p->flags |= SWP_SOLIDSTATE;
		p->cluster_next_cpu = alloc_percpu(unsigned int);
		if (!p->cluster_next_cpu) {
			error = -ENOMEM;
			goto bad_swap_unlock_inode;
		}
		/*
		 * Select a random starting position to help with wear
		 * leveling of the SSD.
		 */
		for_each_possible_cpu(cpu) {
			per_cpu(*p->cluster_next_cpu, cpu) =
				1 + prandom_u32_max(p->highest_bit);
		}
		nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
		cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
					GFP_KERNEL);
		if (!cluster_info) {
			error = -ENOMEM;
			goto bad_swap_unlock_inode;
		}

		for (ci = 0; ci < nr_cluster; ci++)
			spin_lock_init(&((cluster_info + ci)->lock));

		p->percpu_cluster = alloc_percpu(struct percpu_cluster);
		if (!p->percpu_cluster) {
			error = -ENOMEM;
			goto bad_swap_unlock_inode;
		}
		for_each_possible_cpu(cpu) {
			struct percpu_cluster *cluster;

			cluster = per_cpu_ptr(p->percpu_cluster, cpu);
			cluster_set_null(&cluster->index);
		}
	} else {
		atomic_inc(&nr_rotate_swap);
		inced_nr_rotate_swap = true;
	}
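	/*
	 * Note: nr_rotate_swap counts the active rotational swap devices;
	 * while it is nonzero, VMA-based swap readahead is not used (see
	 * swap_use_vma_readahead() in mm/swap_state.c).
	 */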

	error = swap_cgroup_swapon(p->type, maxpages);
	if (error)
		goto bad_swap_unlock_inode;

	nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
		cluster_info, maxpages, &span);
	if (unlikely(nr_extents < 0)) {
		error = nr_extents;
		goto bad_swap_unlock_inode;
	}
	/* frontswap enabled? set up bit-per-page map for frontswap */
	if (IS_ENABLED(CONFIG_FRONTSWAP))
		frontswap_map = kvcalloc(BITS_TO_LONGS(maxpages),
					 sizeof(long),
					 GFP_KERNEL);

	if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
		/*
		 * When discard is enabled for swap with no particular
		 * policy flagged, we set all swap discard flags here in
		 * order to sustain backward compatibility with older
		 * swapon(8) releases.
		 */
		p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
			     SWP_PAGE_DISCARD);

		/*
		 * By flagging sys_swapon, a sysadmin can tell us to
		 * either do single-time area discards only, or to just
		 * perform discards for released swap page-clusters.
		 * Now it's time to adjust the p->flags accordingly.
		 */
		if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
			p->flags &= ~SWP_PAGE_DISCARD;
		else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
			p->flags &= ~SWP_AREA_DISCARD;

		/* issue a swapon-time discard if it's still required */
		if (p->flags & SWP_AREA_DISCARD) {
			int err = discard_swap(p);
			if (unlikely(err))
				pr_err("swapon: discard_swap(%p): %d\n",
					p, err);
		}
	}

	error = init_swap_address_space(p->type, maxpages);
	if (error)
		goto bad_swap_unlock_inode;

	/*
	 * Flush any pending IO and dirty mappings before we start using this
	 * swap device.
	 */
	inode->i_flags |= S_SWAPFILE;
	error = inode_drain_writes(inode);
	if (error) {
		inode->i_flags &= ~S_SWAPFILE;
		goto free_swap_address_space;
	}

	mutex_lock(&swapon_mutex);
	prio = -1;
	if (swap_flags & SWAP_FLAG_PREFER)
		prio =
		  (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
	enable_swap_info(p, prio, swap_map, cluster_info, frontswap_map);

	pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s%s\n",
		p->pages<<(PAGE_SHIFT-10), name->name, p->prio,
		nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
		(p->flags & SWP_SOLIDSTATE) ? "SS" : "",
		(p->flags & SWP_DISCARDABLE) ? "D" : "",
		(p->flags & SWP_AREA_DISCARD) ? "s" : "",
		(p->flags & SWP_PAGE_DISCARD) ? "c" : "",
		(frontswap_map) ? "FS" : "");

	mutex_unlock(&swapon_mutex);
	atomic_inc(&proc_poll_event);
	wake_up_interruptible(&proc_poll_wait);

	error = 0;
	goto out;
free_swap_address_space:
	exit_swap_address_space(p->type);
bad_swap_unlock_inode:
	inode_unlock(inode);
bad_swap:
	free_percpu(p->percpu_cluster);
	p->percpu_cluster = NULL;
	free_percpu(p->cluster_next_cpu);
	p->cluster_next_cpu = NULL;
	if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
		set_blocksize(p->bdev, p->old_block_size);
		blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
	}
	inode = NULL;
	destroy_swap_extents(p);
	swap_cgroup_swapoff(p->type);
	spin_lock(&swap_lock);
	p->swap_file = NULL;
	p->flags = 0;
	spin_unlock(&swap_lock);
	vfree(swap_map);
	kvfree(cluster_info);
	kvfree(frontswap_map);
	if (inced_nr_rotate_swap)
		atomic_dec(&nr_rotate_swap);
	if (swap_file)
		filp_close(swap_file, NULL);
out:
	if (page && !IS_ERR(page)) {
		kunmap(page);
		put_page(page);
	}
	if (name)
		putname(name);
	if (inode)
		inode_unlock(inode);
	if (!error)
		enable_swap_slots_cache();
	return error;
}
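/*
 * Illustrative userspace usage (not part of this file): after the area has
 * been prepared with mkswap(8), a task with CAP_SYS_ADMIN can enable it and
 * request a priority, e.g.
 *
 *	swapon("/dev/sdb2", SWAP_FLAG_PREFER |
 *			    (5 << SWAP_FLAG_PRIO_SHIFT) |
 *			    SWAP_FLAG_DISCARD);
 *
 * The path and priority value above are made-up example parameters.
 */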

void si_swapinfo(struct sysinfo *val)
{
	unsigned int type;
	unsigned long nr_to_be_unused = 0;

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		struct swap_info_struct *si = swap_info[type];

		if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK))
			nr_to_be_unused += si->inuse_pages;
	}
	val->freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused;
	val->totalswap = total_swap_pages + nr_to_be_unused;
	spin_unlock(&swap_lock);
}
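/*
 * Note: si_swapinfo() backs the SwapTotal/SwapFree fields of /proc/meminfo
 * and the swap figures reported by the sysinfo(2) syscall.
 */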

/*
 * Verify that a swap entry is valid and increment its swap map count.
 *
 * Returns 0 on success, or an error code:
 * - swp_entry is invalid -> EINVAL
 * - swp_entry is migration entry -> EINVAL
 * - swap-cache reference is requested but there is already one. -> EEXIST
 * - swap-cache reference is requested but the entry is not used. -> ENOENT
 * - swap-mapped reference requested but needs continued swap count. -> ENOMEM
 */
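/*
 * Each swap_map byte packs a use count together with the SWAP_HAS_CACHE bit;
 * counts that grow past SWAP_MAP_MAX set COUNT_CONTINUED and spill into the
 * continuation pages managed by add_swap_count_continuation() below (see the
 * SWAP_MAP_* and SWAP_HAS_CACHE definitions in include/linux/swap.h).
 */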
static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
{
	struct swap_info_struct *p;
	struct swap_cluster_info *ci;
	unsigned long offset;
	unsigned char count;
	unsigned char has_cache;
	int err = -EINVAL;

	p = get_swap_device(entry);
	if (!p)
		goto out;

	offset = swp_offset(entry);
	ci = lock_cluster_or_swap_info(p, offset);

	count = p->swap_map[offset];

	/*
	 * swapin_readahead() doesn't check if a swap entry is valid, so the
	 * swap entry could be SWAP_MAP_BAD. Check here with lock held.
	 */
	if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
		err = -ENOENT;
		goto unlock_out;
	}

	has_cache = count & SWAP_HAS_CACHE;
	count &= ~SWAP_HAS_CACHE;
	err = 0;

	if (usage == SWAP_HAS_CACHE) {
		/* set SWAP_HAS_CACHE if there is no cache and entry is used */
		if (!has_cache && count)
			has_cache = SWAP_HAS_CACHE;
		else if (has_cache)		/* someone else added cache */
			err = -EEXIST;
		else				/* no users remaining */
			err = -ENOENT;
	} else if (count || has_cache) {
		if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
			count += usage;
		else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
			err = -EINVAL;
		else if (swap_count_continued(p, offset, count))
			count = COUNT_CONTINUED;
		else
			err = -ENOMEM;
	} else
		err = -ENOENT;			/* unused swap entry */

	WRITE_ONCE(p->swap_map[offset], count | has_cache);

unlock_out:
	unlock_cluster_or_swap_info(p, ci);
out:
	if (p)
		put_swap_device(p);
	return err;
}

/*
 * Help swapoff by noting that swap entry belongs to shmem/tmpfs
 * (in which case its reference count is never incremented).
 */
void swap_shmem_alloc(swp_entry_t entry)
{
	__swap_duplicate(entry, SWAP_MAP_SHMEM);
}

/*
 * Increase reference count of swap entry by 1.
 * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required
 * but could not be atomically allocated.  Returns 0, just as if it succeeded,
 * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which
 * might occur if a page table entry has got corrupted.
 */
int swap_duplicate(swp_entry_t entry)
{
	int err = 0;

	while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
		err = add_swap_count_continuation(entry, GFP_ATOMIC);
	return err;
}
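/*
 * Note: callers that may sleep typically handle the -ENOMEM case by dropping
 * their page table lock and retrying add_swap_count_continuation() with
 * GFP_KERNEL, as the fork path in mm/memory.c does.
 */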

/*
 * @entry: swap entry for which we allocate swap cache.
 *
 * Called when allocating swap cache for an existing swap entry.
 * This can return error codes; returns 0 on success.
 * -EEXIST means there is already a swap cache.
 * Note: return code is different from swap_duplicate().
 */
int swapcache_prepare(swp_entry_t entry)
{
	return __swap_duplicate(entry, SWAP_HAS_CACHE);
}
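/*
 * Note: the swapin path (__read_swap_cache_async() in mm/swap_state.c) uses
 * swapcache_prepare() to claim an entry for the swap cache before inserting
 * the newly allocated page.
 */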

struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
	return swap_type_to_swap_info(swp_type(entry));
}

struct swap_info_struct *page_swap_info(struct page *page)
{
	swp_entry_t entry = { .val = page_private(page) };
	return swp_swap_info(entry);
}

/*
 * out-of-line __page_file_ methods to avoid include hell.
 */
struct address_space *__page_file_mapping(struct page *page)
{
	return page_swap_info(page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);

pgoff_t __page_file_index(struct page *page)
{
	swp_entry_t swap = { .val = page_private(page) };
	return swp_offset(swap);
}
EXPORT_SYMBOL_GPL(__page_file_index);

/*
 * add_swap_count_continuation - called when a swap count is duplicated
 * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
 * page of the original vmalloc'ed swap_map, to hold the continuation count
 * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called
 * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc.
 *
 * These continuation pages are seldom referenced: the common paths all work
 * on the original swap_map, only referring to a continuation page when the
 * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX.
 *
 * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding
 * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL)
 * can be called after dropping locks.
 */
int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
{
	struct swap_info_struct *si;
	struct swap_cluster_info *ci;
	struct page *head;
	struct page *page;
	struct page *list_page;
	pgoff_t offset;
	unsigned char count;
	int ret = 0;

	/*
	 * When debugging, it's easier to use __GFP_ZERO here; but it's better
	 * for latency not to zero a page while GFP_ATOMIC and holding locks.
	 */
	page = alloc_page(gfp_mask | __GFP_HIGHMEM);

	si = get_swap_device(entry);
	if (!si) {
		/*
		 * An acceptable race has occurred since the failing
		 * __swap_duplicate(): the swap device may have been
		 * swapped off.
		 */
		goto outer;
	}
	spin_lock(&si->lock);

	offset = swp_offset(entry);

	ci = lock_cluster(si, offset);

	count = si->swap_map[offset] & ~SWAP_HAS_CACHE;

	if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
		/*
		 * The higher the swap count, the more likely it is that tasks
		 * will race to add swap count continuation: we need to avoid
		 * over-provisioning.
		 */
		goto out;
	}

	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	/*
	 * We are fortunate that although vmalloc_to_page uses pte_offset_map,
	 * no architecture is using highmem pages for kernel page tables: so it
	 * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
	 */
	head = vmalloc_to_page(si->swap_map + offset);
	offset &= ~PAGE_MASK;

	spin_lock(&si->cont_lock);
	/*
	 * Page allocation does not initialize the page's lru field,
	 * but it does always reset its private field.
	 */
	if (!page_private(head)) {
		BUG_ON(count & COUNT_CONTINUED);
		INIT_LIST_HEAD(&head->lru);
		set_page_private(head, SWP_CONTINUED);
		si->flags |= SWP_CONTINUED;
	}

	list_for_each_entry(list_page, &head->lru, lru) {
		unsigned char *map;

		/*
		 * If the previous map said no continuation, but we've found
		 * a continuation page, free our allocation and use this one.
		 */
		if (!(count & COUNT_CONTINUED))
			goto out_unlock_cont;

		map = kmap_atomic(list_page) + offset;
		count = *map;
		kunmap_atomic(map);

		/*
		 * If this continuation count now has some space in it,
		 * free our allocation and use this one.
		 */
		if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX)
			goto out_unlock_cont;
	}

	list_add_tail(&page->lru, &head->lru);
	page = NULL;			/* now it's attached, don't free it */
out_unlock_cont:
	spin_unlock(&si->cont_lock);
out:
	unlock_cluster(ci);
	spin_unlock(&si->lock);
	put_swap_device(si);
outer:
	if (page)
		__free_page(page);
	return ret;
}

/*
 * swap_count_continued - when the original swap_map count is incremented
 * from SWAP_MAP_MAX, check if there is already a continuation page to carry
 * into, carry if so, or else fail until a new continuation page is allocated;
 * when the original swap_map count is decremented from 0 with continuation,
 * borrow from the continuation and report whether it still holds more.
 * Called while __swap_duplicate() or swap_entry_free() holds swap or cluster
 * lock.
 */
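/*
 * Worked example (values from include/linux/swap.h, where SWAP_MAP_MAX is
 * 0x3e and SWAP_CONT_MAX is 0x7f): the swap_map byte is the least
 * significant "digit" and each continuation page holds one higher-order
 * digit for the same offset.  Incrementing an entry whose digits are all at
 * their maximum carries a 1 into the next continuation page, much like
 * 999 + 1 = 1000; decrementing from a zero low digit borrows from the
 * continuation, like 1000 - 1 = 999.
 */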
static bool swap_count_continued(struct swap_info_struct *si,
				 pgoff_t offset, unsigned char count)
{
	struct page *head;
	struct page *page;
	unsigned char *map;
	bool ret;

	head = vmalloc_to_page(si->swap_map + offset);
	if (page_private(head) != SWP_CONTINUED) {
		BUG_ON(count & COUNT_CONTINUED);
		return false;		/* need to add count continuation */
	}

	spin_lock(&si->cont_lock);
	offset &= ~PAGE_MASK;
	page = list_next_entry(head, lru);
	map = kmap_atomic(page) + offset;

	if (count == SWAP_MAP_MAX)	/* initial increment from swap_map */
		goto init_map;		/* jump over SWAP_CONT_MAX checks */

	if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */
		/*
		 * Think of how you add 1 to 999
		 */
		while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
			kunmap_atomic(map);
			page = list_next_entry(page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}
		if (*map == SWAP_CONT_MAX) {
			kunmap_atomic(map);
			page = list_next_entry(page, lru);
			if (page == head) {
				ret = false;	/* add count continuation */
				goto out;
			}
			map = kmap_atomic(page) + offset;
init_map:		*map = 0;		/* we didn't zero the page */
		}
		*map += 1;
		kunmap_atomic(map);
		while ((page = list_prev_entry(page, lru)) != head) {
			map = kmap_atomic(page) + offset;
			*map = COUNT_CONTINUED;
			kunmap_atomic(map);
		}
		ret = true;			/* incremented */
	} else {				/* decrementing */
		/*
		 * Think of how you subtract 1 from 1000
		 */
		BUG_ON(count != COUNT_CONTINUED);
		while (*map == COUNT_CONTINUED) {
			kunmap_atomic(map);
			page = list_next_entry(page, lru);
			BUG_ON(page == head);
			map = kmap_atomic(page) + offset;
		}

		BUG_ON(*map == 0);
		*map -= 1;
		if (*map == 0)
			count = 0;
		kunmap_atomic(map);
		while ((page = list_prev_entry(page, lru)) != head) {
			map = kmap_atomic(page) + offset;
			*map = SWAP_CONT_MAX | count;
			count = COUNT_CONTINUED;
			kunmap_atomic(map);
		}
		ret = count == COUNT_CONTINUED;
	}
out:
	spin_unlock(&si->cont_lock);
	return ret;
}

/*
 * free_swap_count_continuations - called from swapoff to free all the
 * continuation pages appended to the swap_map, after the swap_map has been
 * quiesced and before vfree'ing it.
 */
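/*
 * Note: the loop below steps offset by PAGE_SIZE because each page of the
 * vmalloc'ed swap_map covers PAGE_SIZE one-byte entries, and continuation
 * pages are linked off the head of each such swap_map page.
 */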
static void free_swap_count_continuations(struct swap_info_struct *si)
{
	pgoff_t offset;

	for (offset = 0; offset < si->max; offset += PAGE_SIZE) {
		struct page *head;

		head = vmalloc_to_page(si->swap_map + offset);
		if (page_private(head)) {
			struct page *page, *next;

			list_for_each_entry_safe(page, next, &head->lru, lru) {
				list_del(&page->lru);
				__free_page(page);
			}
		}
	}
}

#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
	struct swap_info_struct *si, *next;
	int nid = page_to_nid(page);

	if (!(gfp_mask & __GFP_IO))
		return;

	if (!blk_cgroup_congested())
		return;

	/*
	 * We've already scheduled a throttle, avoid taking the global swap
	 * lock.
	 */
	if (current->throttle_queue)
		return;

	spin_lock(&swap_avail_lock);
	plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
				  avail_lists[nid]) {
		if (si->bdev) {
			blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
			break;
		}
	}
	spin_unlock(&swap_avail_lock);
}
#endif

static int __init swapfile_init(void)
{
	int nid;

	swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
					 GFP_KERNEL);
	if (!swap_avail_heads) {
		pr_emerg("Not enough memory for swap heads, swap is disabled\n");
		return -ENOMEM;
	}

	for_each_node(nid)
		plist_head_init(&swap_avail_heads[nid]);

	return 0;
}
subsys_initcall(swapfile_init);