/* khugepaged.c (71 KB) */
  1. // SPDX-License-Identifier: GPL-2.0
  2. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  3. #include <linux/mm.h>
  4. #include <linux/sched.h>
  5. #include <linux/sched/mm.h>
  6. #include <linux/sched/coredump.h>
  7. #include <linux/mmu_notifier.h>
  8. #include <linux/rmap.h>
  9. #include <linux/swap.h>
  10. #include <linux/mm_inline.h>
  11. #include <linux/kthread.h>
  12. #include <linux/khugepaged.h>
  13. #include <linux/freezer.h>
  14. #include <linux/mman.h>
  15. #include <linux/hashtable.h>
  16. #include <linux/userfaultfd_k.h>
  17. #include <linux/page_idle.h>
  18. #include <linux/page_table_check.h>
  19. #include <linux/swapops.h>
  20. #include <linux/shmem_fs.h>
  21. #include <asm/tlb.h>
  22. #include <asm/pgalloc.h>
  23. #include "internal.h"
  24. #include "mm_slot.h"
  /*
   * Result codes produced by the scan and collapse helpers in this file.
   * Values are assigned in declaration order, starting at SCAN_FAIL == 0.
   */
  enum scan_result {
  	SCAN_FAIL = 0,
  	SCAN_SUCCEED,
  	SCAN_PMD_NULL,
  	SCAN_PMD_NONE,
  	SCAN_PMD_MAPPED,
  	SCAN_EXCEED_NONE_PTE,
  	SCAN_EXCEED_SWAP_PTE,
  	SCAN_EXCEED_SHARED_PTE,
  	SCAN_PTE_NON_PRESENT,
  	SCAN_PTE_UFFD_WP,
  	SCAN_PTE_MAPPED_HUGEPAGE,
  	SCAN_PAGE_RO,
  	SCAN_LACK_REFERENCED_PAGE,
  	SCAN_PAGE_NULL,
  	SCAN_SCAN_ABORT,
  	SCAN_PAGE_COUNT,
  	SCAN_PAGE_LRU,
  	SCAN_PAGE_LOCK,
  	SCAN_PAGE_ANON,
  	SCAN_PAGE_COMPOUND,
  	SCAN_ANY_PROCESS,
  	SCAN_VMA_NULL,
  	SCAN_VMA_CHECK,
  	SCAN_ADDRESS_RANGE,
  	SCAN_DEL_PAGE_LRU,
  	SCAN_ALLOC_HUGE_PAGE_FAIL,
  	SCAN_CGROUP_CHARGE_FAIL,
  	SCAN_TRUNCATED,
  	SCAN_PAGE_HAS_PRIVATE,
  };
  56. #define CREATE_TRACE_POINTS
  57. #include <trace/events/huge_memory.h>
  58. static struct task_struct *khugepaged_thread __read_mostly;
  59. static DEFINE_MUTEX(khugepaged_mutex);
  60. /* default scan 8*512 pte (or vmas) every 30 second */
  61. static unsigned int khugepaged_pages_to_scan __read_mostly;
  62. static unsigned int khugepaged_pages_collapsed;
  63. static unsigned int khugepaged_full_scans;
  64. static unsigned int khugepaged_scan_sleep_millisecs __read_mostly = 10000;
  65. /* during fragmentation poll the hugepage allocator once every minute */
  66. static unsigned int khugepaged_alloc_sleep_millisecs __read_mostly = 60000;
  67. static unsigned long khugepaged_sleep_expire;
  68. static DEFINE_SPINLOCK(khugepaged_mm_lock);
  69. static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
  70. /*
  71. * default collapse hugepages if there is at least one pte mapped like
  72. * it would have happened if the vma was large enough during page
  73. * fault.
  74. *
  75. * Note that these are only respected if collapse was initiated by khugepaged.
  76. */
  77. static unsigned int khugepaged_max_ptes_none __read_mostly;
  78. static unsigned int khugepaged_max_ptes_swap __read_mostly;
  79. static unsigned int khugepaged_max_ptes_shared __read_mostly;
  80. #define MM_SLOTS_HASH_BITS 10
  81. static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
  82. static struct kmem_cache *mm_slot_cache __read_mostly;
  83. #define MAX_PTE_MAPPED_THP 8
  84. struct collapse_control {
  85. bool is_khugepaged;
  86. /* Num pages scanned per node */
  87. u32 node_load[MAX_NUMNODES];
  88. /* nodemask for allocation fallback */
  89. nodemask_t alloc_nmask;
  90. };
  91. /**
  92. * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned
  93. * @slot: hash lookup from mm to mm_slot
  94. * @nr_pte_mapped_thp: number of pte mapped THP
  95. * @pte_mapped_thp: address array corresponding pte mapped THP
  96. */
  97. struct khugepaged_mm_slot {
  98. struct mm_slot slot;
  99. /* pte-mapped THP in this mm */
  100. int nr_pte_mapped_thp;
  101. unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
  102. };
  103. /**
  104. * struct khugepaged_scan - cursor for scanning
  105. * @mm_head: the head of the mm list to scan
  106. * @mm_slot: the current mm_slot we are scanning
  107. * @address: the next address inside that to be scanned
  108. *
  109. * There is only the one khugepaged_scan instance of this cursor structure.
  110. */
  111. struct khugepaged_scan {
  112. struct list_head mm_head;
  113. struct khugepaged_mm_slot *mm_slot;
  114. unsigned long address;
  115. };
  116. static struct khugepaged_scan khugepaged_scan = {
  117. .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
  118. };
  119. #ifdef CONFIG_SYSFS
  120. static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
  121. struct kobj_attribute *attr,
  122. char *buf)
  123. {
  124. return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
  125. }
  126. static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
  127. struct kobj_attribute *attr,
  128. const char *buf, size_t count)
  129. {
  130. unsigned int msecs;
  131. int err;
  132. err = kstrtouint(buf, 10, &msecs);
  133. if (err)
  134. return -EINVAL;
  135. khugepaged_scan_sleep_millisecs = msecs;
  136. khugepaged_sleep_expire = 0;
  137. wake_up_interruptible(&khugepaged_wait);
  138. return count;
  139. }
  140. static struct kobj_attribute scan_sleep_millisecs_attr =
  141. __ATTR_RW(scan_sleep_millisecs);
  142. static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
  143. struct kobj_attribute *attr,
  144. char *buf)
  145. {
  146. return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
  147. }
  148. static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
  149. struct kobj_attribute *attr,
  150. const char *buf, size_t count)
  151. {
  152. unsigned int msecs;
  153. int err;
  154. err = kstrtouint(buf, 10, &msecs);
  155. if (err)
  156. return -EINVAL;
  157. khugepaged_alloc_sleep_millisecs = msecs;
  158. khugepaged_sleep_expire = 0;
  159. wake_up_interruptible(&khugepaged_wait);
  160. return count;
  161. }
  162. static struct kobj_attribute alloc_sleep_millisecs_attr =
  163. __ATTR_RW(alloc_sleep_millisecs);
  164. static ssize_t pages_to_scan_show(struct kobject *kobj,
  165. struct kobj_attribute *attr,
  166. char *buf)
  167. {
  168. return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
  169. }
  170. static ssize_t pages_to_scan_store(struct kobject *kobj,
  171. struct kobj_attribute *attr,
  172. const char *buf, size_t count)
  173. {
  174. unsigned int pages;
  175. int err;
  176. err = kstrtouint(buf, 10, &pages);
  177. if (err || !pages)
  178. return -EINVAL;
  179. khugepaged_pages_to_scan = pages;
  180. return count;
  181. }
  182. static struct kobj_attribute pages_to_scan_attr =
  183. __ATTR_RW(pages_to_scan);
  184. static ssize_t pages_collapsed_show(struct kobject *kobj,
  185. struct kobj_attribute *attr,
  186. char *buf)
  187. {
  188. return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
  189. }
  190. static struct kobj_attribute pages_collapsed_attr =
  191. __ATTR_RO(pages_collapsed);
  192. static ssize_t full_scans_show(struct kobject *kobj,
  193. struct kobj_attribute *attr,
  194. char *buf)
  195. {
  196. return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
  197. }
  198. static struct kobj_attribute full_scans_attr =
  199. __ATTR_RO(full_scans);
  200. static ssize_t defrag_show(struct kobject *kobj,
  201. struct kobj_attribute *attr, char *buf)
  202. {
  203. return single_hugepage_flag_show(kobj, attr, buf,
  204. TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
  205. }
  206. static ssize_t defrag_store(struct kobject *kobj,
  207. struct kobj_attribute *attr,
  208. const char *buf, size_t count)
  209. {
  210. return single_hugepage_flag_store(kobj, attr, buf, count,
  211. TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
  212. }
  213. static struct kobj_attribute khugepaged_defrag_attr =
  214. __ATTR_RW(defrag);
  215. /*
  216. * max_ptes_none controls if khugepaged should collapse hugepages over
  217. * any unmapped ptes in turn potentially increasing the memory
  218. * footprint of the vmas. When max_ptes_none is 0 khugepaged will not
  219. * reduce the available free memory in the system as it
  220. * runs. Increasing max_ptes_none will instead potentially reduce the
  221. * free memory in the system during the khugepaged scan.
  222. */
  223. static ssize_t max_ptes_none_show(struct kobject *kobj,
  224. struct kobj_attribute *attr,
  225. char *buf)
  226. {
  227. return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
  228. }
  229. static ssize_t max_ptes_none_store(struct kobject *kobj,
  230. struct kobj_attribute *attr,
  231. const char *buf, size_t count)
  232. {
  233. int err;
  234. unsigned long max_ptes_none;
  235. err = kstrtoul(buf, 10, &max_ptes_none);
  236. if (err || max_ptes_none > HPAGE_PMD_NR - 1)
  237. return -EINVAL;
  238. khugepaged_max_ptes_none = max_ptes_none;
  239. return count;
  240. }
  241. static struct kobj_attribute khugepaged_max_ptes_none_attr =
  242. __ATTR_RW(max_ptes_none);
  243. static ssize_t max_ptes_swap_show(struct kobject *kobj,
  244. struct kobj_attribute *attr,
  245. char *buf)
  246. {
  247. return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
  248. }
  249. static ssize_t max_ptes_swap_store(struct kobject *kobj,
  250. struct kobj_attribute *attr,
  251. const char *buf, size_t count)
  252. {
  253. int err;
  254. unsigned long max_ptes_swap;
  255. err = kstrtoul(buf, 10, &max_ptes_swap);
  256. if (err || max_ptes_swap > HPAGE_PMD_NR - 1)
  257. return -EINVAL;
  258. khugepaged_max_ptes_swap = max_ptes_swap;
  259. return count;
  260. }
  261. static struct kobj_attribute khugepaged_max_ptes_swap_attr =
  262. __ATTR_RW(max_ptes_swap);
  263. static ssize_t max_ptes_shared_show(struct kobject *kobj,
  264. struct kobj_attribute *attr,
  265. char *buf)
  266. {
  267. return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
  268. }
  269. static ssize_t max_ptes_shared_store(struct kobject *kobj,
  270. struct kobj_attribute *attr,
  271. const char *buf, size_t count)
  272. {
  273. int err;
  274. unsigned long max_ptes_shared;
  275. err = kstrtoul(buf, 10, &max_ptes_shared);
  276. if (err || max_ptes_shared > HPAGE_PMD_NR - 1)
  277. return -EINVAL;
  278. khugepaged_max_ptes_shared = max_ptes_shared;
  279. return count;
  280. }
  281. static struct kobj_attribute khugepaged_max_ptes_shared_attr =
  282. __ATTR_RW(max_ptes_shared);
  283. static struct attribute *khugepaged_attr[] = {
  284. &khugepaged_defrag_attr.attr,
  285. &khugepaged_max_ptes_none_attr.attr,
  286. &khugepaged_max_ptes_swap_attr.attr,
  287. &khugepaged_max_ptes_shared_attr.attr,
  288. &pages_to_scan_attr.attr,
  289. &pages_collapsed_attr.attr,
  290. &full_scans_attr.attr,
  291. &scan_sleep_millisecs_attr.attr,
  292. &alloc_sleep_millisecs_attr.attr,
  293. NULL,
  294. };
  295. struct attribute_group khugepaged_attr_group = {
  296. .attrs = khugepaged_attr,
  297. .name = "khugepaged",
  298. };
  299. #endif /* CONFIG_SYSFS */
  300. int hugepage_madvise(struct vm_area_struct *vma,
  301. unsigned long *vm_flags, int advice)
  302. {
  303. switch (advice) {
  304. case MADV_HUGEPAGE:
  305. #ifdef CONFIG_S390
  306. /*
  307. * qemu blindly sets MADV_HUGEPAGE on all allocations, but s390
  308. * can't handle this properly after s390_enable_sie, so we simply
  309. * ignore the madvise to prevent qemu from causing a SIGSEGV.
  310. */
  311. if (mm_has_pgste(vma->vm_mm))
  312. return 0;
  313. #endif
  314. *vm_flags &= ~VM_NOHUGEPAGE;
  315. *vm_flags |= VM_HUGEPAGE;
  316. /*
  317. * If the vma become good for khugepaged to scan,
  318. * register it here without waiting a page fault that
  319. * may not happen any time soon.
  320. */
  321. khugepaged_enter_vma(vma, *vm_flags);
  322. break;
  323. case MADV_NOHUGEPAGE:
  324. *vm_flags &= ~VM_HUGEPAGE;
  325. *vm_flags |= VM_NOHUGEPAGE;
  326. /*
  327. * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning
  328. * this vma even if we leave the mm registered in khugepaged if
  329. * it got registered before VM_NOHUGEPAGE was set.
  330. */
  331. break;
  332. }
  333. return 0;
  334. }
  335. int __init khugepaged_init(void)
  336. {
  337. mm_slot_cache = kmem_cache_create("khugepaged_mm_slot",
  338. sizeof(struct khugepaged_mm_slot),
  339. __alignof__(struct khugepaged_mm_slot),
  340. 0, NULL);
  341. if (!mm_slot_cache)
  342. return -ENOMEM;
  343. khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
  344. khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
  345. khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
  346. khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
  347. return 0;
  348. }
  349. void __init khugepaged_destroy(void)
  350. {
  351. kmem_cache_destroy(mm_slot_cache);
  352. }
  353. static inline int hpage_collapse_test_exit(struct mm_struct *mm)
  354. {
  355. return atomic_read(&mm->mm_users) == 0;
  356. }
  357. void __khugepaged_enter(struct mm_struct *mm)
  358. {
  359. struct khugepaged_mm_slot *mm_slot;
  360. struct mm_slot *slot;
  361. int wakeup;
  362. mm_slot = mm_slot_alloc(mm_slot_cache);
  363. if (!mm_slot)
  364. return;
  365. slot = &mm_slot->slot;
  366. /* __khugepaged_exit() must not run from under us */
  367. VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm);
  368. if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
  369. mm_slot_free(mm_slot_cache, mm_slot);
  370. return;
  371. }
  372. spin_lock(&khugepaged_mm_lock);
  373. mm_slot_insert(mm_slots_hash, mm, slot);
  374. /*
  375. * Insert just behind the scanning cursor, to let the area settle
  376. * down a little.
  377. */
  378. wakeup = list_empty(&khugepaged_scan.mm_head);
  379. list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head);
  380. spin_unlock(&khugepaged_mm_lock);
  381. mmgrab(mm);
  382. if (wakeup)
  383. wake_up_interruptible(&khugepaged_wait);
  384. }
  385. void khugepaged_enter_vma(struct vm_area_struct *vma,
  386. unsigned long vm_flags)
  387. {
  388. if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) &&
  389. hugepage_flags_enabled()) {
  390. if (hugepage_vma_check(vma, vm_flags, false, false, true))
  391. __khugepaged_enter(vma->vm_mm);
  392. }
  393. }
  394. void __khugepaged_exit(struct mm_struct *mm)
  395. {
  396. struct khugepaged_mm_slot *mm_slot;
  397. struct mm_slot *slot;
  398. int free = 0;
  399. spin_lock(&khugepaged_mm_lock);
  400. slot = mm_slot_lookup(mm_slots_hash, mm);
  401. mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
  402. if (mm_slot && khugepaged_scan.mm_slot != mm_slot) {
  403. hash_del(&slot->hash);
  404. list_del(&slot->mm_node);
  405. free = 1;
  406. }
  407. spin_unlock(&khugepaged_mm_lock);
  408. if (free) {
  409. clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
  410. mm_slot_free(mm_slot_cache, mm_slot);
  411. mmdrop(mm);
  412. } else if (mm_slot) {
  413. /*
  414. * This is required to serialize against
  415. * hpage_collapse_test_exit() (which is guaranteed to run
  416. * under mmap sem read mode). Stop here (after we return all
  417. * pagetables will be destroyed) until khugepaged has finished
  418. * working on the pagetables under the mmap_lock.
  419. */
  420. mmap_write_lock(mm);
  421. mmap_write_unlock(mm);
  422. }
  423. }
  424. static void release_pte_page(struct page *page)
  425. {
  426. mod_node_page_state(page_pgdat(page),
  427. NR_ISOLATED_ANON + page_is_file_lru(page),
  428. -compound_nr(page));
  429. unlock_page(page);
  430. putback_lru_page(page);
  431. }
  432. static void release_pte_pages(pte_t *pte, pte_t *_pte,
  433. struct list_head *compound_pagelist)
  434. {
  435. struct page *page, *tmp;
  436. while (--_pte >= pte) {
  437. pte_t pteval = *_pte;
  438. page = pte_page(pteval);
  439. if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
  440. !PageCompound(page))
  441. release_pte_page(page);
  442. }
  443. list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
  444. list_del(&page->lru);
  445. release_pte_page(page);
  446. }
  447. }
  448. static bool is_refcount_suitable(struct page *page)
  449. {
  450. int expected_refcount;
  451. expected_refcount = total_mapcount(page);
  452. if (PageSwapCache(page))
  453. expected_refcount += compound_nr(page);
  454. return page_count(page) == expected_refcount;
  455. }
  456. static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
  457. unsigned long address,
  458. pte_t *pte,
  459. struct collapse_control *cc,
  460. struct list_head *compound_pagelist)
  461. {
  462. struct page *page = NULL;
  463. pte_t *_pte;
  464. int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0;
  465. bool writable = false;
  466. for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
  467. _pte++, address += PAGE_SIZE) {
  468. pte_t pteval = *_pte;
  469. if (pte_none(pteval) || (pte_present(pteval) &&
  470. is_zero_pfn(pte_pfn(pteval)))) {
  471. ++none_or_zero;
  472. if (!userfaultfd_armed(vma) &&
  473. (!cc->is_khugepaged ||
  474. none_or_zero <= khugepaged_max_ptes_none)) {
  475. continue;
  476. } else {
  477. result = SCAN_EXCEED_NONE_PTE;
  478. count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
  479. goto out;
  480. }
  481. }
  482. if (!pte_present(pteval)) {
  483. result = SCAN_PTE_NON_PRESENT;
  484. goto out;
  485. }
  486. if (pte_uffd_wp(pteval)) {
  487. result = SCAN_PTE_UFFD_WP;
  488. goto out;
  489. }
  490. page = vm_normal_page(vma, address, pteval);
  491. if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
  492. result = SCAN_PAGE_NULL;
  493. goto out;
  494. }
  495. VM_BUG_ON_PAGE(!PageAnon(page), page);
  496. if (page_mapcount(page) > 1) {
  497. ++shared;
  498. if (cc->is_khugepaged &&
  499. shared > khugepaged_max_ptes_shared) {
  500. result = SCAN_EXCEED_SHARED_PTE;
  501. count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
  502. goto out;
  503. }
  504. }
  505. if (PageCompound(page)) {
  506. struct page *p;
  507. page = compound_head(page);
  508. /*
  509. * Check if we have dealt with the compound page
  510. * already
  511. */
  512. list_for_each_entry(p, compound_pagelist, lru) {
  513. if (page == p)
  514. goto next;
  515. }
  516. }
  517. /*
  518. * We can do it before isolate_lru_page because the
  519. * page can't be freed from under us. NOTE: PG_lock
  520. * is needed to serialize against split_huge_page
  521. * when invoked from the VM.
  522. */
  523. if (!trylock_page(page)) {
  524. result = SCAN_PAGE_LOCK;
  525. goto out;
  526. }
  527. /*
  528. * Check if the page has any GUP (or other external) pins.
  529. *
  530. * The page table that maps the page has been already unlinked
  531. * from the page table tree and this process cannot get
  532. * an additional pin on the page.
  533. *
  534. * New pins can come later if the page is shared across fork,
  535. * but not from this process. The other process cannot write to
  536. * the page, only trigger CoW.
  537. */
  538. if (!is_refcount_suitable(page)) {
  539. unlock_page(page);
  540. result = SCAN_PAGE_COUNT;
  541. goto out;
  542. }
  543. /*
  544. * Isolate the page to avoid collapsing an hugepage
  545. * currently in use by the VM.
  546. */
  547. if (isolate_lru_page(page)) {
  548. unlock_page(page);
  549. result = SCAN_DEL_PAGE_LRU;
  550. goto out;
  551. }
  552. mod_node_page_state(page_pgdat(page),
  553. NR_ISOLATED_ANON + page_is_file_lru(page),
  554. compound_nr(page));
  555. VM_BUG_ON_PAGE(!PageLocked(page), page);
  556. VM_BUG_ON_PAGE(PageLRU(page), page);
  557. if (PageCompound(page))
  558. list_add_tail(&page->lru, compound_pagelist);
  559. next:
  560. /*
  561. * If collapse was initiated by khugepaged, check that there is
  562. * enough young pte to justify collapsing the page
  563. */
  564. if (cc->is_khugepaged &&
  565. (pte_young(pteval) || page_is_young(page) ||
  566. PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
  567. address)))
  568. referenced++;
  569. if (pte_write(pteval))
  570. writable = true;
  571. }
  572. if (unlikely(!writable)) {
  573. result = SCAN_PAGE_RO;
  574. } else if (unlikely(cc->is_khugepaged && !referenced)) {
  575. result = SCAN_LACK_REFERENCED_PAGE;
  576. } else {
  577. result = SCAN_SUCCEED;
  578. trace_mm_collapse_huge_page_isolate(page, none_or_zero,
  579. referenced, writable, result);
  580. return result;
  581. }
  582. out:
  583. release_pte_pages(pte, _pte, compound_pagelist);
  584. trace_mm_collapse_huge_page_isolate(page, none_or_zero,
  585. referenced, writable, result);
  586. return result;
  587. }
  588. static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
  589. struct vm_area_struct *vma,
  590. unsigned long address,
  591. spinlock_t *ptl,
  592. struct list_head *compound_pagelist)
  593. {
  594. struct page *src_page, *tmp;
  595. pte_t *_pte;
  596. for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
  597. _pte++, page++, address += PAGE_SIZE) {
  598. pte_t pteval = *_pte;
  599. if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
  600. clear_user_highpage(page, address);
  601. add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1);
  602. if (is_zero_pfn(pte_pfn(pteval))) {
  603. /*
  604. * ptl mostly unnecessary.
  605. */
  606. spin_lock(ptl);
  607. ptep_clear(vma->vm_mm, address, _pte);
  608. spin_unlock(ptl);
  609. }
  610. } else {
  611. src_page = pte_page(pteval);
  612. copy_user_highpage(page, src_page, address, vma);
  613. if (!PageCompound(src_page))
  614. release_pte_page(src_page);
  615. /*
  616. * ptl mostly unnecessary, but preempt has to
  617. * be disabled to update the per-cpu stats
  618. * inside page_remove_rmap().
  619. */
  620. spin_lock(ptl);
  621. ptep_clear(vma->vm_mm, address, _pte);
  622. page_remove_rmap(src_page, vma, false);
  623. spin_unlock(ptl);
  624. free_page_and_swap_cache(src_page);
  625. }
  626. }
  627. list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
  628. list_del(&src_page->lru);
  629. mod_node_page_state(page_pgdat(src_page),
  630. NR_ISOLATED_ANON + page_is_file_lru(src_page),
  631. -compound_nr(src_page));
  632. unlock_page(src_page);
  633. free_swap_cache(src_page);
  634. putback_lru_page(src_page);
  635. }
  636. }
  637. static void khugepaged_alloc_sleep(void)
  638. {
  639. DEFINE_WAIT(wait);
  640. add_wait_queue(&khugepaged_wait, &wait);
  641. __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE);
  642. schedule_timeout(msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
  643. remove_wait_queue(&khugepaged_wait, &wait);
  644. }
/*
 * Collapse context with ->is_khugepaged set; the scan and collapse paths in
 * this file test that flag before applying the khugepaged_max_ptes_* limits
 * and the khugepaged gfp mask.
 */
struct collapse_control khugepaged_collapse_control = {
	.is_khugepaged = true,
};
  648. static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc)
  649. {
  650. int i;
  651. /*
  652. * If node_reclaim_mode is disabled, then no extra effort is made to
  653. * allocate memory locally.
  654. */
  655. if (!node_reclaim_enabled())
  656. return false;
  657. /* If there is a count for this node already, it must be acceptable */
  658. if (cc->node_load[nid])
  659. return false;
  660. for (i = 0; i < MAX_NUMNODES; i++) {
  661. if (!cc->node_load[i])
  662. continue;
  663. if (node_distance(nid, i) > node_reclaim_distance)
  664. return true;
  665. }
  666. return false;
  667. }
  668. #define khugepaged_defrag() \
  669. (transparent_hugepage_flags & \
  670. (1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG))
  671. /* Defrag for khugepaged will enter direct reclaim/compaction if necessary */
  672. static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void)
  673. {
  674. return khugepaged_defrag() ? GFP_TRANSHUGE : GFP_TRANSHUGE_LIGHT;
  675. }
  676. #ifdef CONFIG_NUMA
  677. static int hpage_collapse_find_target_node(struct collapse_control *cc)
  678. {
  679. int nid, target_node = 0, max_value = 0;
  680. /* find first node with max normal pages hit */
  681. for (nid = 0; nid < MAX_NUMNODES; nid++)
  682. if (cc->node_load[nid] > max_value) {
  683. max_value = cc->node_load[nid];
  684. target_node = nid;
  685. }
  686. for_each_online_node(nid) {
  687. if (max_value == cc->node_load[nid])
  688. node_set(nid, cc->alloc_nmask);
  689. }
  690. return target_node;
  691. }
  692. #else
  693. static int hpage_collapse_find_target_node(struct collapse_control *cc)
  694. {
  695. return 0;
  696. }
  697. #endif
  698. static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node,
  699. nodemask_t *nmask)
  700. {
  701. *hpage = __alloc_pages(gfp, HPAGE_PMD_ORDER, node, nmask);
  702. if (unlikely(!*hpage)) {
  703. count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
  704. return false;
  705. }
  706. prep_transhuge_page(*hpage);
  707. count_vm_event(THP_COLLAPSE_ALLOC);
  708. return true;
  709. }
  710. /*
  711. * If mmap_lock temporarily dropped, revalidate vma
  712. * before taking mmap_lock.
  713. * Returns enum scan_result value.
  714. */
  715. static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
  716. bool expect_anon,
  717. struct vm_area_struct **vmap,
  718. struct collapse_control *cc)
  719. {
  720. struct vm_area_struct *vma;
  721. if (unlikely(hpage_collapse_test_exit(mm)))
  722. return SCAN_ANY_PROCESS;
  723. *vmap = vma = find_vma(mm, address);
  724. if (!vma)
  725. return SCAN_VMA_NULL;
  726. if (!transhuge_vma_suitable(vma, address))
  727. return SCAN_ADDRESS_RANGE;
  728. if (!hugepage_vma_check(vma, vma->vm_flags, false, false,
  729. cc->is_khugepaged))
  730. return SCAN_VMA_CHECK;
  731. /*
  732. * Anon VMA expected, the address may be unmapped then
  733. * remapped to file after khugepaged reaquired the mmap_lock.
  734. *
  735. * hugepage_vma_check may return true for qualified file
  736. * vmas.
  737. */
  738. if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap)))
  739. return SCAN_PAGE_ANON;
  740. return SCAN_SUCCEED;
  741. }
  742. /*
  743. * See pmd_trans_unstable() for how the result may change out from
  744. * underneath us, even if we hold mmap_lock in read.
  745. */
  746. static int find_pmd_or_thp_or_none(struct mm_struct *mm,
  747. unsigned long address,
  748. pmd_t **pmd)
  749. {
  750. pmd_t pmde;
  751. *pmd = mm_find_pmd(mm, address);
  752. if (!*pmd)
  753. return SCAN_PMD_NULL;
  754. pmde = pmd_read_atomic(*pmd);
  755. #ifdef CONFIG_TRANSPARENT_HUGEPAGE
  756. /* See comments in pmd_none_or_trans_huge_or_clear_bad() */
  757. barrier();
  758. #endif
  759. if (pmd_none(pmde))
  760. return SCAN_PMD_NONE;
  761. if (!pmd_present(pmde))
  762. return SCAN_PMD_NULL;
  763. if (pmd_trans_huge(pmde))
  764. return SCAN_PMD_MAPPED;
  765. if (pmd_devmap(pmde))
  766. return SCAN_PMD_NULL;
  767. if (pmd_bad(pmde))
  768. return SCAN_PMD_NULL;
  769. return SCAN_SUCCEED;
  770. }
  771. static int check_pmd_still_valid(struct mm_struct *mm,
  772. unsigned long address,
  773. pmd_t *pmd)
  774. {
  775. pmd_t *new_pmd;
  776. int result = find_pmd_or_thp_or_none(mm, address, &new_pmd);
  777. if (result != SCAN_SUCCEED)
  778. return result;
  779. if (new_pmd != pmd)
  780. return SCAN_FAIL;
  781. return SCAN_SUCCEED;
  782. }
  783. /*
  784. * Bring missing pages in from swap, to complete THP collapse.
  785. * Only done if hpage_collapse_scan_pmd believes it is worthwhile.
  786. *
  787. * Called and returns without pte mapped or spinlocks held.
  788. * Note that if false is returned, mmap_lock will be released.
  789. */
  790. static int __collapse_huge_page_swapin(struct mm_struct *mm,
  791. struct vm_area_struct *vma,
  792. unsigned long haddr, pmd_t *pmd,
  793. int referenced)
  794. {
  795. int swapped_in = 0;
  796. vm_fault_t ret = 0;
  797. unsigned long address, end = haddr + (HPAGE_PMD_NR * PAGE_SIZE);
  798. for (address = haddr; address < end; address += PAGE_SIZE) {
  799. struct vm_fault vmf = {
  800. .vma = vma,
  801. .address = address,
  802. .pgoff = linear_page_index(vma, haddr),
  803. .flags = FAULT_FLAG_ALLOW_RETRY,
  804. .pmd = pmd,
  805. };
  806. vmf.pte = pte_offset_map(pmd, address);
  807. vmf.orig_pte = *vmf.pte;
  808. if (!is_swap_pte(vmf.orig_pte)) {
  809. pte_unmap(vmf.pte);
  810. continue;
  811. }
  812. ret = do_swap_page(&vmf);
  813. /*
  814. * do_swap_page returns VM_FAULT_RETRY with released mmap_lock.
  815. * Note we treat VM_FAULT_RETRY as VM_FAULT_ERROR here because
  816. * we do not retry here and swap entry will remain in pagetable
  817. * resulting in later failure.
  818. */
  819. if (ret & VM_FAULT_RETRY) {
  820. trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
  821. /* Likely, but not guaranteed, that page lock failed */
  822. return SCAN_PAGE_LOCK;
  823. }
  824. if (ret & VM_FAULT_ERROR) {
  825. mmap_read_unlock(mm);
  826. trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
  827. return SCAN_FAIL;
  828. }
  829. swapped_in++;
  830. }
  831. /* Drain LRU add pagevec to remove extra pin on the swapped in pages */
  832. if (swapped_in)
  833. lru_add_drain();
  834. trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
  835. return SCAN_SUCCEED;
  836. }
  837. static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm,
  838. struct collapse_control *cc)
  839. {
  840. gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
  841. GFP_TRANSHUGE);
  842. int node = hpage_collapse_find_target_node(cc);
  843. if (!hpage_collapse_alloc_page(hpage, gfp, node, &cc->alloc_nmask))
  844. return SCAN_ALLOC_HUGE_PAGE_FAIL;
  845. if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp)))
  846. return SCAN_CGROUP_CHARGE_FAIL;
  847. count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC);
  848. return SCAN_SUCCEED;
  849. }
  850. static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
  851. int referenced, int unmapped,
  852. struct collapse_control *cc)
  853. {
  854. LIST_HEAD(compound_pagelist);
  855. pmd_t *pmd, _pmd;
  856. pte_t *pte;
  857. pgtable_t pgtable;
  858. struct page *hpage;
  859. spinlock_t *pmd_ptl, *pte_ptl;
  860. int result = SCAN_FAIL;
  861. struct vm_area_struct *vma;
  862. struct mmu_notifier_range range;
  863. VM_BUG_ON(address & ~HPAGE_PMD_MASK);
  864. /*
  865. * Before allocating the hugepage, release the mmap_lock read lock.
  866. * The allocation can take potentially a long time if it involves
  867. * sync compaction, and we do not need to hold the mmap_lock during
  868. * that. We will recheck the vma after taking it again in write mode.
  869. */
  870. mmap_read_unlock(mm);
  871. result = alloc_charge_hpage(&hpage, mm, cc);
  872. if (result != SCAN_SUCCEED)
  873. goto out_nolock;
  874. mmap_read_lock(mm);
  875. result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
  876. if (result != SCAN_SUCCEED) {
  877. mmap_read_unlock(mm);
  878. goto out_nolock;
  879. }
  880. result = find_pmd_or_thp_or_none(mm, address, &pmd);
  881. if (result != SCAN_SUCCEED) {
  882. mmap_read_unlock(mm);
  883. goto out_nolock;
  884. }
  885. if (unmapped) {
  886. /*
  887. * __collapse_huge_page_swapin will return with mmap_lock
  888. * released when it fails. So we jump out_nolock directly in
  889. * that case. Continuing to collapse causes inconsistency.
  890. */
  891. result = __collapse_huge_page_swapin(mm, vma, address, pmd,
  892. referenced);
  893. if (result != SCAN_SUCCEED)
  894. goto out_nolock;
  895. }
  896. mmap_read_unlock(mm);
  897. /*
  898. * Prevent all access to pagetables with the exception of
  899. * gup_fast later handled by the ptep_clear_flush and the VM
  900. * handled by the anon_vma lock + PG_lock.
  901. */
  902. mmap_write_lock(mm);
  903. result = hugepage_vma_revalidate(mm, address, true, &vma, cc);
  904. if (result != SCAN_SUCCEED)
  905. goto out_up_write;
  906. /* check if the pmd is still valid */
  907. result = check_pmd_still_valid(mm, address, pmd);
  908. if (result != SCAN_SUCCEED)
  909. goto out_up_write;
  910. vma_start_write(vma);
  911. anon_vma_lock_write(vma->anon_vma);
  912. mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
  913. address, address + HPAGE_PMD_SIZE);
  914. mmu_notifier_invalidate_range_start(&range);
  915. pte = pte_offset_map(pmd, address);
  916. pte_ptl = pte_lockptr(mm, pmd);
  917. pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
  918. /*
  919. * This removes any huge TLB entry from the CPU so we won't allow
  920. * huge and small TLB entries for the same virtual address to
  921. * avoid the risk of CPU bugs in that area.
  922. *
  923. * Parallel fast GUP is fine since fast GUP will back off when
  924. * it detects PMD is changed.
  925. */
  926. _pmd = pmdp_collapse_flush(vma, address, pmd);
  927. spin_unlock(pmd_ptl);
  928. mmu_notifier_invalidate_range_end(&range);
  929. tlb_remove_table_sync_one();
  930. spin_lock(pte_ptl);
  931. result = __collapse_huge_page_isolate(vma, address, pte, cc,
  932. &compound_pagelist);
  933. spin_unlock(pte_ptl);
  934. if (unlikely(result != SCAN_SUCCEED)) {
  935. pte_unmap(pte);
  936. spin_lock(pmd_ptl);
  937. BUG_ON(!pmd_none(*pmd));
  938. /*
  939. * We can only use set_pmd_at when establishing
  940. * hugepmds and never for establishing regular pmds that
  941. * points to regular pagetables. Use pmd_populate for that
  942. */
  943. pmd_populate(mm, pmd, pmd_pgtable(_pmd));
  944. spin_unlock(pmd_ptl);
  945. anon_vma_unlock_write(vma->anon_vma);
  946. goto out_up_write;
  947. }
  948. /*
  949. * All pages are isolated and locked so anon_vma rmap
  950. * can't run anymore.
  951. */
  952. anon_vma_unlock_write(vma->anon_vma);
  953. __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl,
  954. &compound_pagelist);
  955. pte_unmap(pte);
  956. /*
  957. * spin_lock() below is not the equivalent of smp_wmb(), but
  958. * the smp_wmb() inside __SetPageUptodate() can be reused to
  959. * avoid the copy_huge_page writes to become visible after
  960. * the set_pmd_at() write.
  961. */
  962. __SetPageUptodate(hpage);
  963. pgtable = pmd_pgtable(_pmd);
  964. _pmd = mk_huge_pmd(hpage, vma->vm_page_prot);
  965. _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma);
  966. spin_lock(pmd_ptl);
  967. BUG_ON(!pmd_none(*pmd));
  968. page_add_new_anon_rmap(hpage, vma, address);
  969. lru_cache_add_inactive_or_unevictable(hpage, vma);
  970. pgtable_trans_huge_deposit(mm, pmd, pgtable);
  971. set_pmd_at(mm, address, pmd, _pmd);
  972. update_mmu_cache_pmd(vma, address, pmd);
  973. spin_unlock(pmd_ptl);
  974. hpage = NULL;
  975. result = SCAN_SUCCEED;
  976. out_up_write:
  977. mmap_write_unlock(mm);
  978. out_nolock:
  979. if (hpage) {
  980. mem_cgroup_uncharge(page_folio(hpage));
  981. put_page(hpage);
  982. }
  983. trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result);
  984. return result;
  985. }
  986. static int hpage_collapse_scan_pmd(struct mm_struct *mm,
  987. struct vm_area_struct *vma,
  988. unsigned long address, bool *mmap_locked,
  989. struct collapse_control *cc)
  990. {
  991. pmd_t *pmd;
  992. pte_t *pte, *_pte;
  993. int result = SCAN_FAIL, referenced = 0;
  994. int none_or_zero = 0, shared = 0;
  995. struct page *page = NULL;
  996. unsigned long _address;
  997. spinlock_t *ptl;
  998. int node = NUMA_NO_NODE, unmapped = 0;
  999. bool writable = false;
  1000. VM_BUG_ON(address & ~HPAGE_PMD_MASK);
  1001. result = find_pmd_or_thp_or_none(mm, address, &pmd);
  1002. if (result != SCAN_SUCCEED)
  1003. goto out;
  1004. memset(cc->node_load, 0, sizeof(cc->node_load));
  1005. nodes_clear(cc->alloc_nmask);
  1006. pte = pte_offset_map_lock(mm, pmd, address, &ptl);
  1007. for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR;
  1008. _pte++, _address += PAGE_SIZE) {
  1009. pte_t pteval = *_pte;
  1010. if (is_swap_pte(pteval)) {
  1011. ++unmapped;
  1012. if (!cc->is_khugepaged ||
  1013. unmapped <= khugepaged_max_ptes_swap) {
  1014. /*
  1015. * Always be strict with uffd-wp
  1016. * enabled swap entries. Please see
  1017. * comment below for pte_uffd_wp().
  1018. */
  1019. if (pte_swp_uffd_wp(pteval)) {
  1020. result = SCAN_PTE_UFFD_WP;
  1021. goto out_unmap;
  1022. }
  1023. continue;
  1024. } else {
  1025. result = SCAN_EXCEED_SWAP_PTE;
  1026. count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
  1027. goto out_unmap;
  1028. }
  1029. }
  1030. if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
  1031. ++none_or_zero;
  1032. if (!userfaultfd_armed(vma) &&
  1033. (!cc->is_khugepaged ||
  1034. none_or_zero <= khugepaged_max_ptes_none)) {
  1035. continue;
  1036. } else {
  1037. result = SCAN_EXCEED_NONE_PTE;
  1038. count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
  1039. goto out_unmap;
  1040. }
  1041. }
  1042. if (pte_uffd_wp(pteval)) {
  1043. /*
  1044. * Don't collapse the page if any of the small
  1045. * PTEs are armed with uffd write protection.
  1046. * Here we can also mark the new huge pmd as
  1047. * write protected if any of the small ones is
  1048. * marked but that could bring unknown
  1049. * userfault messages that falls outside of
  1050. * the registered range. So, just be simple.
  1051. */
  1052. result = SCAN_PTE_UFFD_WP;
  1053. goto out_unmap;
  1054. }
  1055. if (pte_write(pteval))
  1056. writable = true;
  1057. page = vm_normal_page(vma, _address, pteval);
  1058. if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
  1059. result = SCAN_PAGE_NULL;
  1060. goto out_unmap;
  1061. }
  1062. if (page_mapcount(page) > 1) {
  1063. ++shared;
  1064. if (cc->is_khugepaged &&
  1065. shared > khugepaged_max_ptes_shared) {
  1066. result = SCAN_EXCEED_SHARED_PTE;
  1067. count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
  1068. goto out_unmap;
  1069. }
  1070. }
  1071. page = compound_head(page);
  1072. /*
  1073. * Record which node the original page is from and save this
  1074. * information to cc->node_load[].
  1075. * Khugepaged will allocate hugepage from the node has the max
  1076. * hit record.
  1077. */
  1078. node = page_to_nid(page);
  1079. if (hpage_collapse_scan_abort(node, cc)) {
  1080. result = SCAN_SCAN_ABORT;
  1081. goto out_unmap;
  1082. }
  1083. cc->node_load[node]++;
  1084. if (!PageLRU(page)) {
  1085. result = SCAN_PAGE_LRU;
  1086. goto out_unmap;
  1087. }
  1088. if (PageLocked(page)) {
  1089. result = SCAN_PAGE_LOCK;
  1090. goto out_unmap;
  1091. }
  1092. if (!PageAnon(page)) {
  1093. result = SCAN_PAGE_ANON;
  1094. goto out_unmap;
  1095. }
  1096. /*
  1097. * Check if the page has any GUP (or other external) pins.
  1098. *
  1099. * Here the check is racy it may see total_mapcount > refcount
  1100. * in some cases.
  1101. * For example, one process with one forked child process.
  1102. * The parent has the PMD split due to MADV_DONTNEED, then
  1103. * the child is trying unmap the whole PMD, but khugepaged
  1104. * may be scanning the parent between the child has
  1105. * PageDoubleMap flag cleared and dec the mapcount. So
  1106. * khugepaged may see total_mapcount > refcount.
  1107. *
  1108. * But such case is ephemeral we could always retry collapse
  1109. * later. However it may report false positive if the page
  1110. * has excessive GUP pins (i.e. 512). Anyway the same check
  1111. * will be done again later the risk seems low.
  1112. */
  1113. if (!is_refcount_suitable(page)) {
  1114. result = SCAN_PAGE_COUNT;
  1115. goto out_unmap;
  1116. }
  1117. /*
  1118. * If collapse was initiated by khugepaged, check that there is
  1119. * enough young pte to justify collapsing the page
  1120. */
  1121. if (cc->is_khugepaged &&
  1122. (pte_young(pteval) || page_is_young(page) ||
  1123. PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm,
  1124. address)))
  1125. referenced++;
  1126. }
  1127. if (!writable) {
  1128. result = SCAN_PAGE_RO;
  1129. } else if (cc->is_khugepaged &&
  1130. (!referenced ||
  1131. (unmapped && referenced < HPAGE_PMD_NR / 2))) {
  1132. result = SCAN_LACK_REFERENCED_PAGE;
  1133. } else {
  1134. result = SCAN_SUCCEED;
  1135. }
  1136. out_unmap:
  1137. pte_unmap_unlock(pte, ptl);
  1138. if (result == SCAN_SUCCEED) {
  1139. result = collapse_huge_page(mm, address, referenced,
  1140. unmapped, cc);
  1141. /* collapse_huge_page will return with the mmap_lock released */
  1142. *mmap_locked = false;
  1143. }
  1144. out:
  1145. trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
  1146. none_or_zero, result, unmapped);
  1147. return result;
  1148. }
  1149. static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot)
  1150. {
  1151. struct mm_slot *slot = &mm_slot->slot;
  1152. struct mm_struct *mm = slot->mm;
  1153. lockdep_assert_held(&khugepaged_mm_lock);
  1154. if (hpage_collapse_test_exit(mm)) {
  1155. /* free mm_slot */
  1156. hash_del(&slot->hash);
  1157. list_del(&slot->mm_node);
  1158. /*
  1159. * Not strictly needed because the mm exited already.
  1160. *
  1161. * clear_bit(MMF_VM_HUGEPAGE, &mm->flags);
  1162. */
  1163. /* khugepaged_mm_lock actually not necessary for the below */
  1164. mm_slot_free(mm_slot_cache, mm_slot);
  1165. mmdrop(mm);
  1166. }
  1167. }
  1168. #ifdef CONFIG_SHMEM
  1169. /*
  1170. * Notify khugepaged that given addr of the mm is pte-mapped THP. Then
  1171. * khugepaged should try to collapse the page table.
  1172. *
  1173. * Note that following race exists:
  1174. * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A,
  1175. * emptying the A's ->pte_mapped_thp[] array.
  1176. * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and
  1177. * retract_page_tables() finds a VMA in mm_struct A mapping the same extent
  1178. * (at virtual address X) and adds an entry (for X) into mm_struct A's
  1179. * ->pte-mapped_thp[] array.
  1180. * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X,
  1181. * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry
  1182. * (for X) into mm_struct A's ->pte-mapped_thp[] array.
  1183. * Thus, it's possible the same address is added multiple times for the same
  1184. * mm_struct. Should this happen, we'll simply attempt
  1185. * collapse_pte_mapped_thp() multiple times for the same address, under the same
  1186. * exclusive mmap_lock, and assuming the first call is successful, subsequent
  1187. * attempts will return quickly (without grabbing any additional locks) when
  1188. * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap
  1189. * check, and since this is a rare occurrence, the cost of preventing this
  1190. * "multiple-add" is thought to be more expensive than just handling it, should
  1191. * it occur.
  1192. */
  1193. static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
  1194. unsigned long addr)
  1195. {
  1196. struct khugepaged_mm_slot *mm_slot;
  1197. struct mm_slot *slot;
  1198. bool ret = false;
  1199. VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
  1200. spin_lock(&khugepaged_mm_lock);
  1201. slot = mm_slot_lookup(mm_slots_hash, mm);
  1202. mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
  1203. if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) {
  1204. mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr;
  1205. ret = true;
  1206. }
  1207. spin_unlock(&khugepaged_mm_lock);
  1208. return ret;
  1209. }
  1210. /* hpage must be locked, and mmap_lock must be held in write */
  1211. static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr,
  1212. pmd_t *pmdp, struct page *hpage)
  1213. {
  1214. struct vm_fault vmf = {
  1215. .vma = vma,
  1216. .address = addr,
  1217. .flags = 0,
  1218. .pmd = pmdp,
  1219. };
  1220. VM_BUG_ON(!PageTransHuge(hpage));
  1221. mmap_assert_write_locked(vma->vm_mm);
  1222. if (do_set_pmd(&vmf, hpage))
  1223. return SCAN_FAIL;
  1224. get_page(hpage);
  1225. return SCAN_SUCCEED;
  1226. }
  1227. /*
  1228. * A note about locking:
  1229. * Trying to take the page table spinlocks would be useless here because those
  1230. * are only used to synchronize:
  1231. *
  1232. * - modifying terminal entries (ones that point to a data page, not to another
  1233. * page table)
  1234. * - installing *new* non-terminal entries
  1235. *
  1236. * Instead, we need roughly the same kind of protection as free_pgtables() or
  1237. * mm_take_all_locks() (but only for a single VMA):
  1238. * The mmap lock together with this VMA's rmap locks covers all paths towards
  1239. * the page table entries we're messing with here, except for hardware page
  1240. * table walks and lockless_pages_from_mm().
  1241. */
  1242. static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
  1243. unsigned long addr, pmd_t *pmdp)
  1244. {
  1245. pmd_t pmd;
  1246. struct mmu_notifier_range range;
  1247. mmap_assert_write_locked(mm);
  1248. if (vma->vm_file)
  1249. lockdep_assert_held_write(&vma->vm_file->f_mapping->i_mmap_rwsem);
  1250. /*
  1251. * All anon_vmas attached to the VMA have the same root and are
  1252. * therefore locked by the same lock.
  1253. */
  1254. if (vma->anon_vma)
  1255. lockdep_assert_held_write(&vma->anon_vma->root->rwsem);
  1256. mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm, addr,
  1257. addr + HPAGE_PMD_SIZE);
  1258. mmu_notifier_invalidate_range_start(&range);
  1259. pmd = pmdp_collapse_flush(vma, addr, pmdp);
  1260. tlb_remove_table_sync_one();
  1261. mmu_notifier_invalidate_range_end(&range);
  1262. mm_dec_nr_ptes(mm);
  1263. page_table_check_pte_clear_range(mm, addr, pmd);
  1264. pte_free(mm, pmd_pgtable(pmd));
  1265. }
  1266. /**
  1267. * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
  1268. * address haddr.
  1269. *
  1270. * @mm: process address space where collapse happens
  1271. * @addr: THP collapse address
  1272. * @install_pmd: If a huge PMD should be installed
  1273. *
  1274. * This function checks whether all the PTEs in the PMD are pointing to the
  1275. * right THP. If so, retract the page table so the THP can refault in with
  1276. * as pmd-mapped. Possibly install a huge PMD mapping the THP.
  1277. */
  1278. int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
  1279. bool install_pmd)
  1280. {
  1281. unsigned long haddr = addr & HPAGE_PMD_MASK;
  1282. struct vm_area_struct *vma = vma_lookup(mm, haddr);
  1283. struct page *hpage;
  1284. pte_t *start_pte, *pte;
  1285. pmd_t *pmd;
  1286. spinlock_t *ptl;
  1287. int count = 0, result = SCAN_FAIL;
  1288. int i;
  1289. mmap_assert_write_locked(mm);
  1290. /* Fast check before locking page if already PMD-mapped */
  1291. result = find_pmd_or_thp_or_none(mm, haddr, &pmd);
  1292. if (result == SCAN_PMD_MAPPED)
  1293. return result;
  1294. if (!vma || !vma->vm_file ||
  1295. !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE))
  1296. return SCAN_VMA_CHECK;
  1297. /*
  1298. * If we are here, we've succeeded in replacing all the native pages
  1299. * in the page cache with a single hugepage. If a mm were to fault-in
  1300. * this memory (mapped by a suitably aligned VMA), we'd get the hugepage
  1301. * and map it by a PMD, regardless of sysfs THP settings. As such, let's
  1302. * analogously elide sysfs THP settings here.
  1303. */
  1304. if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
  1305. return SCAN_VMA_CHECK;
  1306. /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
  1307. if (userfaultfd_wp(vma))
  1308. return SCAN_PTE_UFFD_WP;
  1309. hpage = find_lock_page(vma->vm_file->f_mapping,
  1310. linear_page_index(vma, haddr));
  1311. if (!hpage)
  1312. return SCAN_PAGE_NULL;
  1313. if (!PageHead(hpage)) {
  1314. result = SCAN_FAIL;
  1315. goto drop_hpage;
  1316. }
  1317. if (compound_order(hpage) != HPAGE_PMD_ORDER) {
  1318. result = SCAN_PAGE_COMPOUND;
  1319. goto drop_hpage;
  1320. }
  1321. switch (result) {
  1322. case SCAN_SUCCEED:
  1323. break;
  1324. case SCAN_PMD_NONE:
  1325. /*
  1326. * In MADV_COLLAPSE path, possible race with khugepaged where
  1327. * all pte entries have been removed and pmd cleared. If so,
  1328. * skip all the pte checks and just update the pmd mapping.
  1329. */
  1330. goto maybe_install_pmd;
  1331. default:
  1332. goto drop_hpage;
  1333. }
  1334. /* Lock the vma before taking i_mmap and page table locks */
  1335. vma_start_write(vma);
  1336. /*
  1337. * We need to lock the mapping so that from here on, only GUP-fast and
  1338. * hardware page walks can access the parts of the page tables that
  1339. * we're operating on.
  1340. * See collapse_and_free_pmd().
  1341. */
  1342. i_mmap_lock_write(vma->vm_file->f_mapping);
  1343. /*
  1344. * This spinlock should be unnecessary: Nobody else should be accessing
  1345. * the page tables under spinlock protection here, only
  1346. * lockless_pages_from_mm() and the hardware page walker can access page
  1347. * tables while all the high-level locks are held in write mode.
  1348. */
  1349. start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
  1350. result = SCAN_FAIL;
  1351. /* step 1: check all mapped PTEs are to the right huge page */
  1352. for (i = 0, addr = haddr, pte = start_pte;
  1353. i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
  1354. struct page *page;
  1355. /* empty pte, skip */
  1356. if (pte_none(*pte))
  1357. continue;
  1358. /* page swapped out, abort */
  1359. if (!pte_present(*pte)) {
  1360. result = SCAN_PTE_NON_PRESENT;
  1361. goto abort;
  1362. }
  1363. page = vm_normal_page(vma, addr, *pte);
  1364. if (WARN_ON_ONCE(page && is_zone_device_page(page)))
  1365. page = NULL;
  1366. /*
  1367. * Note that uprobe, debugger, or MAP_PRIVATE may change the
  1368. * page table, but the new page will not be a subpage of hpage.
  1369. */
  1370. if (hpage + i != page)
  1371. goto abort;
  1372. count++;
  1373. }
  1374. /* step 2: adjust rmap */
  1375. for (i = 0, addr = haddr, pte = start_pte;
  1376. i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
  1377. struct page *page;
  1378. if (pte_none(*pte))
  1379. continue;
  1380. page = vm_normal_page(vma, addr, *pte);
  1381. if (WARN_ON_ONCE(page && is_zone_device_page(page)))
  1382. goto abort;
  1383. page_remove_rmap(page, vma, false);
  1384. }
  1385. pte_unmap_unlock(start_pte, ptl);
  1386. /* step 3: set proper refcount and mm_counters. */
  1387. if (count) {
  1388. page_ref_sub(hpage, count);
  1389. add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
  1390. }
  1391. /* step 4: remove pte entries */
  1392. /* we make no change to anon, but protect concurrent anon page lookup */
  1393. if (vma->anon_vma)
  1394. anon_vma_lock_write(vma->anon_vma);
  1395. collapse_and_free_pmd(mm, vma, haddr, pmd);
  1396. if (vma->anon_vma)
  1397. anon_vma_unlock_write(vma->anon_vma);
  1398. i_mmap_unlock_write(vma->vm_file->f_mapping);
  1399. maybe_install_pmd:
  1400. /* step 5: install pmd entry */
  1401. result = install_pmd
  1402. ? set_huge_pmd(vma, haddr, pmd, hpage)
  1403. : SCAN_SUCCEED;
  1404. drop_hpage:
  1405. unlock_page(hpage);
  1406. put_page(hpage);
  1407. return result;
  1408. abort:
  1409. pte_unmap_unlock(start_pte, ptl);
  1410. i_mmap_unlock_write(vma->vm_file->f_mapping);
  1411. goto drop_hpage;
  1412. }
  1413. static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot)
  1414. {
  1415. struct mm_slot *slot = &mm_slot->slot;
  1416. struct mm_struct *mm = slot->mm;
  1417. int i;
  1418. if (likely(mm_slot->nr_pte_mapped_thp == 0))
  1419. return;
  1420. if (!mmap_write_trylock(mm))
  1421. return;
  1422. if (unlikely(hpage_collapse_test_exit(mm)))
  1423. goto out;
  1424. for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++)
  1425. collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false);
  1426. out:
  1427. mm_slot->nr_pte_mapped_thp = 0;
  1428. mmap_write_unlock(mm);
  1429. }
  1430. static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
  1431. struct mm_struct *target_mm,
  1432. unsigned long target_addr, struct page *hpage,
  1433. struct collapse_control *cc)
  1434. {
  1435. struct vm_area_struct *vma;
  1436. int target_result = SCAN_FAIL;
  1437. i_mmap_lock_write(mapping);
  1438. vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
  1439. int result = SCAN_FAIL;
  1440. struct mm_struct *mm = NULL;
  1441. unsigned long addr = 0;
  1442. pmd_t *pmd;
  1443. bool is_target = false;
  1444. /*
  1445. * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
  1446. * got written to. These VMAs are likely not worth investing
  1447. * mmap_write_lock(mm) as PMD-mapping is likely to be split
  1448. * later.
  1449. *
  1450. * Note that vma->anon_vma check is racy: it can be set up after
  1451. * the check but before we took mmap_lock by the fault path.
  1452. * But page lock would prevent establishing any new ptes of the
  1453. * page, so we are safe.
  1454. *
  1455. * An alternative would be drop the check, but check that page
  1456. * table is clear before calling pmdp_collapse_flush() under
  1457. * ptl. It has higher chance to recover THP for the VMA, but
  1458. * has higher cost too. It would also probably require locking
  1459. * the anon_vma.
  1460. */
  1461. if (READ_ONCE(vma->anon_vma)) {
  1462. result = SCAN_PAGE_ANON;
  1463. goto next;
  1464. }
  1465. addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
  1466. if (addr & ~HPAGE_PMD_MASK ||
  1467. vma->vm_end < addr + HPAGE_PMD_SIZE) {
  1468. result = SCAN_VMA_CHECK;
  1469. goto next;
  1470. }
  1471. mm = vma->vm_mm;
  1472. is_target = mm == target_mm && addr == target_addr;
  1473. result = find_pmd_or_thp_or_none(mm, addr, &pmd);
  1474. if (result != SCAN_SUCCEED)
  1475. goto next;
  1476. /*
  1477. * We need exclusive mmap_lock to retract page table.
  1478. *
  1479. * We use trylock due to lock inversion: we need to acquire
  1480. * mmap_lock while holding page lock. Fault path does it in
  1481. * reverse order. Trylock is a way to avoid deadlock.
  1482. *
  1483. * Also, it's not MADV_COLLAPSE's job to collapse other
  1484. * mappings - let khugepaged take care of them later.
  1485. */
  1486. result = SCAN_PTE_MAPPED_HUGEPAGE;
  1487. if ((cc->is_khugepaged || is_target) &&
  1488. mmap_write_trylock(mm)) {
  1489. /* trylock for the same lock inversion as above */
  1490. if (!vma_try_start_write(vma))
  1491. goto unlock_next;
  1492. /*
  1493. * Re-check whether we have an ->anon_vma, because
  1494. * collapse_and_free_pmd() requires that either no
  1495. * ->anon_vma exists or the anon_vma is locked.
  1496. * We already checked ->anon_vma above, but that check
  1497. * is racy because ->anon_vma can be populated under the
  1498. * mmap lock in read mode.
  1499. */
  1500. if (vma->anon_vma) {
  1501. result = SCAN_PAGE_ANON;
  1502. goto unlock_next;
  1503. }
  1504. /*
  1505. * When a vma is registered with uffd-wp, we can't
  1506. * recycle the pmd pgtable because there can be pte
  1507. * markers installed. Skip it only, so the rest mm/vma
  1508. * can still have the same file mapped hugely, however
  1509. * it'll always mapped in small page size for uffd-wp
  1510. * registered ranges.
  1511. */
  1512. if (hpage_collapse_test_exit(mm)) {
  1513. result = SCAN_ANY_PROCESS;
  1514. goto unlock_next;
  1515. }
  1516. if (userfaultfd_wp(vma)) {
  1517. result = SCAN_PTE_UFFD_WP;
  1518. goto unlock_next;
  1519. }
  1520. collapse_and_free_pmd(mm, vma, addr, pmd);
  1521. if (!cc->is_khugepaged && is_target)
  1522. result = set_huge_pmd(vma, addr, pmd, hpage);
  1523. else
  1524. result = SCAN_SUCCEED;
  1525. unlock_next:
  1526. mmap_write_unlock(mm);
  1527. goto next;
  1528. }
  1529. /*
  1530. * Calling context will handle target mm/addr. Otherwise, let
  1531. * khugepaged try again later.
  1532. */
  1533. if (!is_target) {
  1534. khugepaged_add_pte_mapped_thp(mm, addr);
  1535. continue;
  1536. }
  1537. next:
  1538. if (is_target)
  1539. target_result = result;
  1540. }
  1541. i_mmap_unlock_write(mapping);
  1542. return target_result;
  1543. }
  1544. /**
  1545. * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
  1546. *
  1547. * @mm: process address space where collapse happens
  1548. * @addr: virtual collapse start address
  1549. * @file: file whose page cache is collapsed
  1550. * @start: page-cache index where the collapse starts (asserted below to be
  * HPAGE_PMD_NR-aligned)
  1551. * @cc: collapse context and scratchpad
  1552. *
  1553. * Basic scheme is simple, details are more complex:
  1554. * - allocate and lock a new huge page;
  1555. * - scan page cache replacing old pages with the new one
  1556. * + swap/gup in pages if necessary;
  1557. * + fill in gaps;
  1558. * + keep old pages around in case rollback is required;
  1559. * - if replacing succeeds:
  1560. * + copy data over;
  1561. * + free old pages;
  1562. * + unlock huge page;
  1563. * - if replacing failed:
  1564. * + put all pages back and unfreeze them;
  1565. * + restore gaps in the page cache;
  1566. * + unlock and free huge page;
  *
  * Return: a SCAN_* result code. When the page cache replacement succeeds,
  * the returned value is whatever retract_page_tables() reports; otherwise
  * it is the code recorded at the point of failure.
  1567. */
  1568. static int collapse_file(struct mm_struct *mm, unsigned long addr,
  1569. struct file *file, pgoff_t start,
  1570. struct collapse_control *cc)
  1571. {
  1572. struct address_space *mapping = file->f_mapping;
  1573. struct page *hpage;
  1574. pgoff_t index, end = start + HPAGE_PMD_NR;
  1575. LIST_HEAD(pagelist);
  1576. XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
  1577. int nr_none = 0, result = SCAN_SUCCEED;
  1578. bool is_shmem = shmem_file(file);
  1579. int nr;
  /* Non-shmem files are only expected with CONFIG_READ_ONLY_THP_FOR_FS. */
  1580. VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
  1581. VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
  /* On failure nothing else has been touched yet, so skip all rollback. */
  1582. result = alloc_charge_hpage(&hpage, mm, cc);
  1583. if (result != SCAN_SUCCEED)
  1584. goto out;
  1585. /*
  1586. * Ensure we have slots for all the pages in the range. This is
  1587. * almost certainly a no-op because most of the pages must be present.
  *
  * The loop is left via "break" with the i_pages lock still held; on
  * error the lock is dropped before xas_nomem() is asked to allocate.
  1588. */
  1589. do {
  1590. xas_lock_irq(&xas);
  1591. xas_create_range(&xas);
  1592. if (!xas_error(&xas))
  1593. break;
  1594. xas_unlock_irq(&xas);
  1595. if (!xas_nomem(&xas, GFP_KERNEL)) {
  1596. result = SCAN_FAIL;
  1597. goto out;
  1598. }
  1599. } while (1);
  1600. __SetPageLocked(hpage);
  1601. if (is_shmem)
  1602. __SetPageSwapBacked(hpage);
  1603. hpage->index = start;
  1604. hpage->mapping = mapping;
  1605. /*
  1606. * At this point the hpage is locked and not up-to-date.
  1607. * It's safe to insert it into the page cache, because nobody would
  1608. * be able to map it or use it in another way until we unlock it.
  1609. */
  1610. xas_set(&xas, start);
  /*
  * Each iteration starts with the i_pages lock held. Every path that
  * reaches the common per-page checks below has dropped it and holds
  * the page lock instead.
  */
  1611. for (index = start; index < end; index++) {
  1612. struct page *page = xas_next(&xas);
  1613. VM_BUG_ON(index != xas.xa_index);
  1614. if (is_shmem) {
  1615. if (!page) {
  1616. /*
  1617. * Stop if extent has been truncated or
  1618. * hole-punched, and is now completely
  1619. * empty.
  1620. */
  1621. if (index == start) {
  1622. if (!xas_next_entry(&xas, end - 1)) {
  1623. result = SCAN_TRUNCATED;
  1624. goto xa_locked;
  1625. }
  1626. xas_set(&xas, index);
  1627. }
  /* Hole: charge one page and let hpage fill the slot. */
  1628. if (!shmem_charge(mapping->host, 1)) {
  1629. result = SCAN_FAIL;
  1630. goto xa_locked;
  1631. }
  1632. xas_store(&xas, hpage);
  1633. nr_none++;
  1634. continue;
  1635. }
  1636. if (xa_is_value(page) || !PageUptodate(page)) {
  1637. struct folio *folio;
  1638. xas_unlock_irq(&xas);
  1639. /* swap in or instantiate fallocated page */
  1640. if (shmem_get_folio(mapping->host, index,
  1641. &folio, SGP_NOALLOC)) {
  1642. result = SCAN_FAIL;
  1643. goto xa_unlocked;
  1644. }
  1645. page = folio_file_page(folio, index);
  1646. } else if (trylock_page(page)) {
  1647. get_page(page);
  1648. xas_unlock_irq(&xas);
  1649. } else {
  1650. result = SCAN_PAGE_LOCK;
  1651. goto xa_locked;
  1652. }
  1653. } else { /* !is_shmem */
  1654. if (!page || xa_is_value(page)) {
  1655. xas_unlock_irq(&xas);
  1656. page_cache_sync_readahead(mapping, &file->f_ra,
  1657. file, index,
  1658. end - index);
  1659. /* drain pagevecs to help isolate_lru_page() */
  1660. lru_add_drain();
  1661. page = find_lock_page(mapping, index);
  1662. if (unlikely(page == NULL)) {
  1663. result = SCAN_FAIL;
  1664. goto xa_unlocked;
  1665. }
  1666. } else if (PageDirty(page)) {
  1667. /*
  1668. * khugepaged only works on read-only fd,
  1669. * so this page is dirty because it hasn't
  1670. * been flushed since first write. There
  1671. * won't be new dirty pages.
  1672. *
  1673. * Trigger async flush here and hope the
  1674. * writeback is done when khugepaged
  1675. * revisits this page.
  1676. *
  1677. * This is a one-off situation. We are not
  1678. * forcing writeback in loop.
  1679. */
  1680. xas_unlock_irq(&xas);
  1681. filemap_flush(mapping);
  1682. result = SCAN_FAIL;
  1683. goto xa_unlocked;
  1684. } else if (PageWriteback(page)) {
  1685. xas_unlock_irq(&xas);
  1686. result = SCAN_FAIL;
  1687. goto xa_unlocked;
  1688. } else if (trylock_page(page)) {
  1689. get_page(page);
  1690. xas_unlock_irq(&xas);
  1691. } else {
  1692. result = SCAN_PAGE_LOCK;
  1693. goto xa_locked;
  1694. }
  1695. }
  1696. /*
  1697. * The page must be locked, so we can drop the i_pages lock
  1698. * without racing with truncate.
  1699. */
  1700. VM_BUG_ON_PAGE(!PageLocked(page), page);
  1701. /* make sure the page is up to date */
  1702. if (unlikely(!PageUptodate(page))) {
  1703. result = SCAN_FAIL;
  1704. goto out_unlock;
  1705. }
  1706. /*
  1707. * If file was truncated then extended, or hole-punched, before
  1708. * we locked the first page, then a THP might be there already.
  1709. * This will be discovered on the first iteration.
  1710. */
  1711. if (PageTransCompound(page)) {
  1712. struct page *head = compound_head(page);
  1713. result = compound_order(head) == HPAGE_PMD_ORDER &&
  1714. head->index == start
  1715. /* Maybe PMD-mapped */
  1716. ? SCAN_PTE_MAPPED_HUGEPAGE
  1717. : SCAN_PAGE_COMPOUND;
  1718. goto out_unlock;
  1719. }
  1720. if (page_mapping(page) != mapping) {
  1721. result = SCAN_TRUNCATED;
  1722. goto out_unlock;
  1723. }
  1724. if (!is_shmem && (PageDirty(page) ||
  1725. PageWriteback(page))) {
  1726. /*
  1727. * khugepaged only works on read-only fd, so this
  1728. * page is dirty because it hasn't been flushed
  1729. * since first write.
  1730. */
  1731. result = SCAN_FAIL;
  1732. goto out_unlock;
  1733. }
  1734. if (isolate_lru_page(page)) {
  1735. result = SCAN_DEL_PAGE_LRU;
  1736. goto out_unlock;
  1737. }
  /* From here on a failure must also undo the LRU isolation. */
  1738. if (page_has_private(page) &&
  1739. !try_to_release_page(page, GFP_KERNEL)) {
  1740. result = SCAN_PAGE_HAS_PRIVATE;
  1741. putback_lru_page(page);
  1742. goto out_unlock;
  1743. }
  1744. if (page_mapped(page))
  1745. try_to_unmap(page_folio(page),
  1746. TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH);
  /* Retake the i_pages lock and reposition the cursor on this index. */
  1747. xas_lock_irq(&xas);
  1748. xas_set(&xas, index);
  1749. VM_BUG_ON_PAGE(page != xas_load(&xas), page);
  1750. /*
  1751. * The page is expected to have page_count() == 3:
  1752. * - we hold a pin on it;
  1753. * - one reference from page cache;
  1754. * - one from isolate_lru_page;
  1755. */
  1756. if (!page_ref_freeze(page, 3)) {
  1757. result = SCAN_PAGE_COUNT;
  1758. xas_unlock_irq(&xas);
  1759. putback_lru_page(page);
  1760. goto out_unlock;
  1761. }
  1762. /*
  1763. * Add the page to the list to be able to undo the collapse if
  1764. * something goes wrong.
  1765. */
  1766. list_add_tail(&page->lru, &pagelist);
  1767. /* Finally, replace with the new page. */
  1768. xas_store(&xas, hpage);
  1769. continue;
  /*
  * Failure on the current page: every jump here has already set result
  * and dropped the i_pages lock. Release the page lock and our
  * reference, then go to the rollback.
  */
  1770. out_unlock:
  1771. unlock_page(page);
  1772. put_page(page);
  1773. goto xa_unlocked;
  1774. }
  /* All slots now point at hpage; the i_pages lock is still held. */
  1775. nr = thp_nr_pages(hpage);
  1776. if (is_shmem)
  1777. __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr);
  1778. else {
  1779. __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr);
  1780. filemap_nr_thps_inc(mapping);
  1781. /*
  1782. * Paired with smp_mb() in do_dentry_open() to ensure
  1783. * i_writecount is up to date and the update to nr_thps is
  1784. * visible. Ensures the page cache will be truncated if the
  1785. * file is opened writable.
  1786. */
  1787. smp_mb();
  1788. if (inode_is_open_for_write(mapping->host)) {
  /* Undo the two counter updates made just above. */
  1789. result = SCAN_FAIL;
  1790. __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr);
  1791. filemap_nr_thps_dec(mapping);
  1792. goto xa_locked;
  1793. }
  1794. }
  1795. if (nr_none) {
  1796. __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none);
  1797. /* nr_none is always 0 for non-shmem. */
  1798. __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none);
  1799. }
  1800. /* Join all the small entries into a single multi-index entry */
  1801. xas_set_order(&xas, start, HPAGE_PMD_ORDER);
  1802. xas_store(&xas, hpage);
  /* Entered with the i_pages lock held (success falls through to here). */
  1803. xa_locked:
  1804. xas_unlock_irq(&xas);
  1805. xa_unlocked:
  1806. /*
  1807. * If collapse is successful, flush must be done now before copying.
  1808. * If collapse is unsuccessful, does flush actually need to be done?
  1809. * Do it anyway, to clear the state.
  1810. */
  1811. try_to_unmap_flush();
  1812. if (result == SCAN_SUCCEED) {
  1813. struct page *page, *tmp;
  1814. /*
  1815. * Replacing old pages with new one has succeeded, now we
  1816. * need to copy the content and free the old pages.
  *
  * pagelist was filled in ascending index order by the loop above;
  * indices with no old page (the holes) are zero-filled.
  1817. */
  1818. index = start;
  1819. list_for_each_entry_safe(page, tmp, &pagelist, lru) {
  1820. while (index < page->index) {
  1821. clear_highpage(hpage + (index % HPAGE_PMD_NR));
  1822. index++;
  1823. }
  1824. copy_highpage(hpage + (page->index % HPAGE_PMD_NR),
  1825. page);
  1826. list_del(&page->lru);
  1827. page->mapping = NULL;
  /* Leave a single reference, which put_page() below drops. */
  1828. page_ref_unfreeze(page, 1);
  1829. ClearPageActive(page);
  1830. ClearPageUnevictable(page);
  1831. unlock_page(page);
  1832. put_page(page);
  1833. index++;
  1834. }
  1835. while (index < end) {
  1836. clear_highpage(hpage + (index % HPAGE_PMD_NR));
  1837. index++;
  1838. }
  1839. SetPageUptodate(hpage);
  1840. page_ref_add(hpage, HPAGE_PMD_NR - 1);
  1841. if (is_shmem)
  1842. set_page_dirty(hpage);
  1843. lru_cache_add(hpage);
  1844. /*
  1845. * Remove pte page tables, so we can re-fault the page as huge.
  1846. */
  1847. result = retract_page_tables(mapping, start, mm, addr, hpage,
  1848. cc);
  1849. unlock_page(hpage);
  /* Clearing hpage makes the cleanup below skip unlock/uncharge/put. */
  1850. hpage = NULL;
  1851. } else {
  1852. struct page *page;
  1853. /* Something went wrong: roll back page cache changes */
  1854. xas_lock_irq(&xas);
  1855. if (nr_none) {
  1856. mapping->nrpages -= nr_none;
  1857. shmem_uncharge(mapping->host, nr_none);
  1858. }
  1859. xas_set(&xas, start);
  /*
  * Walk the slots in the range. The iterator's own entry is not used:
  * "page" is immediately overwritten with the head of pagelist, i.e.
  * the next old page still waiting to be restored.
  */
  1860. xas_for_each(&xas, page, end - 1) {
  1861. page = list_first_entry_or_null(&pagelist,
  1862. struct page, lru);
  1863. if (!page || xas.xa_index < page->index) {
  1864. if (!nr_none)
  1865. break;
  1866. nr_none--;
  1867. /* Put holes back where they were */
  1868. xas_store(&xas, NULL);
  1869. continue;
  1870. }
  1871. VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
  1872. /* Unfreeze the page. */
  /*
  * NOTE(review): the count of 2 presumably covers the page cache and
  * LRU-isolation references, with our own pin dropped implicitly;
  * putback_lru_page() below then releases the isolation -- confirm.
  */
  1873. list_del(&page->lru);
  1874. page_ref_unfreeze(page, 2);
  1875. xas_store(&xas, page);
  /* Pause the walk so the i_pages lock can be dropped for the putback. */
  1876. xas_pause(&xas);
  1877. xas_unlock_irq(&xas);
  1878. unlock_page(page);
  1879. putback_lru_page(page);
  1880. xas_lock_irq(&xas);
  1881. }
  1882. VM_BUG_ON(nr_none);
  1883. xas_unlock_irq(&xas);
  1884. hpage->mapping = NULL;
  1885. }
  /* hpage is non-NULL here only on the failure path. */
  1886. if (hpage)
  1887. unlock_page(hpage);
  1888. out:
  1889. VM_BUG_ON(!list_empty(&pagelist));
  1890. if (hpage) {
  1891. mem_cgroup_uncharge(page_folio(hpage));
  1892. put_page(hpage);
  1893. }
  1894. /* TODO: tracepoints */
  1895. return result;
  1896. }
  1897. static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
  1898. struct file *file, pgoff_t start,
  1899. struct collapse_control *cc)
  1900. {
  1901. struct page *page = NULL;
  1902. struct address_space *mapping = file->f_mapping;
  1903. XA_STATE(xas, &mapping->i_pages, start);
  1904. int present, swap;
  1905. int node = NUMA_NO_NODE;
  1906. int result = SCAN_SUCCEED;
  1907. present = 0;
  1908. swap = 0;
  1909. memset(cc->node_load, 0, sizeof(cc->node_load));
  1910. nodes_clear(cc->alloc_nmask);
  1911. rcu_read_lock();
  1912. xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
  1913. if (xas_retry(&xas, page))
  1914. continue;
  1915. if (xa_is_value(page)) {
  1916. ++swap;
  1917. if (cc->is_khugepaged &&
  1918. swap > khugepaged_max_ptes_swap) {
  1919. result = SCAN_EXCEED_SWAP_PTE;
  1920. count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
  1921. break;
  1922. }
  1923. continue;
  1924. }
  1925. /*
  1926. * TODO: khugepaged should compact smaller compound pages
  1927. * into a PMD sized page
  1928. */
  1929. if (PageTransCompound(page)) {
  1930. struct page *head = compound_head(page);
  1931. result = compound_order(head) == HPAGE_PMD_ORDER &&
  1932. head->index == start
  1933. /* Maybe PMD-mapped */
  1934. ? SCAN_PTE_MAPPED_HUGEPAGE
  1935. : SCAN_PAGE_COMPOUND;
  1936. /*
  1937. * For SCAN_PTE_MAPPED_HUGEPAGE, further processing
  1938. * by the caller won't touch the page cache, and so
  1939. * it's safe to skip LRU and refcount checks before
  1940. * returning.
  1941. */
  1942. break;
  1943. }
  1944. node = page_to_nid(page);
  1945. if (hpage_collapse_scan_abort(node, cc)) {
  1946. result = SCAN_SCAN_ABORT;
  1947. break;
  1948. }
  1949. cc->node_load[node]++;
  1950. if (!PageLRU(page)) {
  1951. result = SCAN_PAGE_LRU;
  1952. break;
  1953. }
  1954. if (page_count(page) !=
  1955. 1 + page_mapcount(page) + page_has_private(page)) {
  1956. result = SCAN_PAGE_COUNT;
  1957. break;
  1958. }
  1959. /*
  1960. * We probably should check if the page is referenced here, but
  1961. * nobody would transfer pte_young() to PageReferenced() for us.
  1962. * And rmap walk here is just too costly...
  1963. */
  1964. present++;
  1965. if (need_resched()) {
  1966. xas_pause(&xas);
  1967. cond_resched_rcu();
  1968. }
  1969. }
  1970. rcu_read_unlock();
  1971. if (result == SCAN_SUCCEED) {
  1972. if (cc->is_khugepaged &&
  1973. present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
  1974. result = SCAN_EXCEED_NONE_PTE;
  1975. count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
  1976. } else {
  1977. result = collapse_file(mm, addr, file, start, cc);
  1978. }
  1979. }
  1980. trace_mm_khugepaged_scan_file(mm, page, file, present, swap, result);
  1981. return result;
  1982. }
  1983. #else
  1984. static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
  1985. struct file *file, pgoff_t start,
  1986. struct collapse_control *cc)
  1987. {
  1988. BUILD_BUG();
  1989. }
  /*
   * No-op variant used in the #else configuration: there are no recorded
   * pte-mapped THPs to collapse for @mm_slot.
   */
  static void khugepaged_collapse_pte_mapped_thps(
          struct khugepaged_mm_slot *mm_slot)
  {
  }
  1993. static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
  1994. unsigned long addr)
  1995. {
  1996. return false;
  1997. }
  1998. #endif
  1999. static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result,
  2000. struct collapse_control *cc)
  2001. __releases(&khugepaged_mm_lock)
  2002. __acquires(&khugepaged_mm_lock)
  2003. {
  2004. struct vma_iterator vmi;
  2005. struct khugepaged_mm_slot *mm_slot;
  2006. struct mm_slot *slot;
  2007. struct mm_struct *mm;
  2008. struct vm_area_struct *vma;
  2009. int progress = 0;
  2010. VM_BUG_ON(!pages);
  2011. lockdep_assert_held(&khugepaged_mm_lock);
  2012. *result = SCAN_FAIL;
  2013. if (khugepaged_scan.mm_slot) {
  2014. mm_slot = khugepaged_scan.mm_slot;
  2015. slot = &mm_slot->slot;
  2016. } else {
  2017. slot = list_entry(khugepaged_scan.mm_head.next,
  2018. struct mm_slot, mm_node);
  2019. mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
  2020. khugepaged_scan.address = 0;
  2021. khugepaged_scan.mm_slot = mm_slot;
  2022. }
  2023. spin_unlock(&khugepaged_mm_lock);
  2024. khugepaged_collapse_pte_mapped_thps(mm_slot);
  2025. mm = slot->mm;
  2026. /*
  2027. * Don't wait for semaphore (to avoid long wait times). Just move to
  2028. * the next mm on the list.
  2029. */
  2030. vma = NULL;
  2031. if (unlikely(!mmap_read_trylock(mm)))
  2032. goto breakouterloop_mmap_lock;
  2033. progress++;
  2034. if (unlikely(hpage_collapse_test_exit(mm)))
  2035. goto breakouterloop;
  2036. vma_iter_init(&vmi, mm, khugepaged_scan.address);
  2037. for_each_vma(vmi, vma) {
  2038. unsigned long hstart, hend;
  2039. cond_resched();
  2040. if (unlikely(hpage_collapse_test_exit(mm))) {
  2041. progress++;
  2042. break;
  2043. }
  2044. if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) {
  2045. skip:
  2046. progress++;
  2047. continue;
  2048. }
  2049. hstart = round_up(vma->vm_start, HPAGE_PMD_SIZE);
  2050. hend = round_down(vma->vm_end, HPAGE_PMD_SIZE);
  2051. if (khugepaged_scan.address > hend)
  2052. goto skip;
  2053. if (khugepaged_scan.address < hstart)
  2054. khugepaged_scan.address = hstart;
  2055. VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK);
  2056. while (khugepaged_scan.address < hend) {
  2057. bool mmap_locked = true;
  2058. cond_resched();
  2059. if (unlikely(hpage_collapse_test_exit(mm)))
  2060. goto breakouterloop;
  2061. VM_BUG_ON(khugepaged_scan.address < hstart ||
  2062. khugepaged_scan.address + HPAGE_PMD_SIZE >
  2063. hend);
  2064. if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
  2065. struct file *file = get_file(vma->vm_file);
  2066. pgoff_t pgoff = linear_page_index(vma,
  2067. khugepaged_scan.address);
  2068. mmap_read_unlock(mm);
  2069. *result = hpage_collapse_scan_file(mm,
  2070. khugepaged_scan.address,
  2071. file, pgoff, cc);
  2072. mmap_locked = false;
  2073. fput(file);
  2074. } else {
  2075. *result = hpage_collapse_scan_pmd(mm, vma,
  2076. khugepaged_scan.address,
  2077. &mmap_locked,
  2078. cc);
  2079. }
  2080. switch (*result) {
  2081. case SCAN_PTE_MAPPED_HUGEPAGE: {
  2082. pmd_t *pmd;
  2083. *result = find_pmd_or_thp_or_none(mm,
  2084. khugepaged_scan.address,
  2085. &pmd);
  2086. if (*result != SCAN_SUCCEED)
  2087. break;
  2088. if (!khugepaged_add_pte_mapped_thp(mm,
  2089. khugepaged_scan.address))
  2090. break;
  2091. } fallthrough;
  2092. case SCAN_SUCCEED:
  2093. ++khugepaged_pages_collapsed;
  2094. break;
  2095. default:
  2096. break;
  2097. }
  2098. /* move to next address */
  2099. khugepaged_scan.address += HPAGE_PMD_SIZE;
  2100. progress += HPAGE_PMD_NR;
  2101. if (!mmap_locked)
  2102. /*
  2103. * We released mmap_lock so break loop. Note
  2104. * that we drop mmap_lock before all hugepage
  2105. * allocations, so if allocation fails, we are
  2106. * guaranteed to break here and report the
  2107. * correct result back to caller.
  2108. */
  2109. goto breakouterloop_mmap_lock;
  2110. if (progress >= pages)
  2111. goto breakouterloop;
  2112. }
  2113. }
  2114. breakouterloop:
  2115. mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
  2116. breakouterloop_mmap_lock:
  2117. spin_lock(&khugepaged_mm_lock);
  2118. VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
  2119. /*
  2120. * Release the current mm_slot if this mm is about to die, or
  2121. * if we scanned all vmas of this mm.
  2122. */
  2123. if (hpage_collapse_test_exit(mm) || !vma) {
  2124. /*
  2125. * Make sure that if mm_users is reaching zero while
  2126. * khugepaged runs here, khugepaged_exit will find
  2127. * mm_slot not pointing to the exiting mm.
  2128. */
  2129. if (slot->mm_node.next != &khugepaged_scan.mm_head) {
  2130. slot = list_entry(slot->mm_node.next,
  2131. struct mm_slot, mm_node);
  2132. khugepaged_scan.mm_slot =
  2133. mm_slot_entry(slot, struct khugepaged_mm_slot, slot);
  2134. khugepaged_scan.address = 0;
  2135. } else {
  2136. khugepaged_scan.mm_slot = NULL;
  2137. khugepaged_full_scans++;
  2138. }
  2139. collect_mm_slot(mm_slot);
  2140. }
  2141. return progress;
  2142. }
  2143. static int khugepaged_has_work(void)
  2144. {
  2145. return !list_empty(&khugepaged_scan.mm_head) &&
  2146. hugepage_flags_enabled();
  2147. }
  2148. static int khugepaged_wait_event(void)
  2149. {
  2150. return !list_empty(&khugepaged_scan.mm_head) ||
  2151. kthread_should_stop();
  2152. }
  2153. static void khugepaged_do_scan(struct collapse_control *cc)
  2154. {
  2155. unsigned int progress = 0, pass_through_head = 0;
  2156. unsigned int pages = READ_ONCE(khugepaged_pages_to_scan);
  2157. bool wait = true;
  2158. int result = SCAN_SUCCEED;
  2159. lru_add_drain_all();
  2160. while (true) {
  2161. cond_resched();
  2162. if (unlikely(kthread_should_stop() || try_to_freeze()))
  2163. break;
  2164. spin_lock(&khugepaged_mm_lock);
  2165. if (!khugepaged_scan.mm_slot)
  2166. pass_through_head++;
  2167. if (khugepaged_has_work() &&
  2168. pass_through_head < 2)
  2169. progress += khugepaged_scan_mm_slot(pages - progress,
  2170. &result, cc);
  2171. else
  2172. progress = pages;
  2173. spin_unlock(&khugepaged_mm_lock);
  2174. if (progress >= pages)
  2175. break;
  2176. if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) {
  2177. /*
  2178. * If fail to allocate the first time, try to sleep for
  2179. * a while. When hit again, cancel the scan.
  2180. */
  2181. if (!wait)
  2182. break;
  2183. wait = false;
  2184. khugepaged_alloc_sleep();
  2185. }
  2186. }
  2187. }
  2188. static bool khugepaged_should_wakeup(void)
  2189. {
  2190. return kthread_should_stop() ||
  2191. time_after_eq(jiffies, khugepaged_sleep_expire);
  2192. }
  2193. static void khugepaged_wait_work(void)
  2194. {
  2195. if (khugepaged_has_work()) {
  2196. const unsigned long scan_sleep_jiffies =
  2197. msecs_to_jiffies(khugepaged_scan_sleep_millisecs);
  2198. if (!scan_sleep_jiffies)
  2199. return;
  2200. khugepaged_sleep_expire = jiffies + scan_sleep_jiffies;
  2201. wait_event_freezable_timeout(khugepaged_wait,
  2202. khugepaged_should_wakeup(),
  2203. scan_sleep_jiffies);
  2204. return;
  2205. }
  2206. if (hugepage_flags_enabled())
  2207. wait_event_freezable(khugepaged_wait, khugepaged_wait_event());
  2208. }
  2209. static int khugepaged(void *none)
  2210. {
  2211. struct khugepaged_mm_slot *mm_slot;
  2212. set_freezable();
  2213. set_user_nice(current, MAX_NICE);
  2214. while (!kthread_should_stop()) {
  2215. khugepaged_do_scan(&khugepaged_collapse_control);
  2216. khugepaged_wait_work();
  2217. }
  2218. spin_lock(&khugepaged_mm_lock);
  2219. mm_slot = khugepaged_scan.mm_slot;
  2220. khugepaged_scan.mm_slot = NULL;
  2221. if (mm_slot)
  2222. collect_mm_slot(mm_slot);
  2223. spin_unlock(&khugepaged_mm_lock);
  2224. return 0;
  2225. }
  2226. static void set_recommended_min_free_kbytes(void)
  2227. {
  2228. struct zone *zone;
  2229. int nr_zones = 0;
  2230. unsigned long recommended_min;
  2231. if (!hugepage_flags_enabled()) {
  2232. calculate_min_free_kbytes();
  2233. goto update_wmarks;
  2234. }
  2235. for_each_populated_zone(zone) {
  2236. /*
  2237. * We don't need to worry about fragmentation of
  2238. * ZONE_MOVABLE since it only has movable pages.
  2239. */
  2240. if (zone_idx(zone) > gfp_zone(GFP_USER))
  2241. continue;
  2242. nr_zones++;
  2243. }
  2244. /* Ensure 2 pageblocks are free to assist fragmentation avoidance */
  2245. recommended_min = pageblock_nr_pages * nr_zones * 2;
  2246. /*
  2247. * Make sure that on average at least two pageblocks are almost free
  2248. * of another type, one for a migratetype to fall back to and a
  2249. * second to avoid subsequent fallbacks of other types There are 3
  2250. * MIGRATE_TYPES we care about.
  2251. */
  2252. recommended_min += pageblock_nr_pages * nr_zones *
  2253. MIGRATE_PCPTYPES * MIGRATE_PCPTYPES;
  2254. /* don't ever allow to reserve more than 5% of the lowmem */
  2255. recommended_min = min(recommended_min,
  2256. (unsigned long) nr_free_buffer_pages() / 20);
  2257. recommended_min <<= (PAGE_SHIFT-10);
  2258. if (recommended_min > min_free_kbytes) {
  2259. if (user_min_free_kbytes >= 0)
  2260. pr_info("raising min_free_kbytes from %d to %lu to help transparent hugepage allocations\n",
  2261. min_free_kbytes, recommended_min);
  2262. min_free_kbytes = recommended_min;
  2263. }
  2264. update_wmarks:
  2265. setup_per_zone_wmarks();
  2266. }
  2267. int start_stop_khugepaged(void)
  2268. {
  2269. int err = 0;
  2270. mutex_lock(&khugepaged_mutex);
  2271. if (hugepage_flags_enabled()) {
  2272. if (!khugepaged_thread)
  2273. khugepaged_thread = kthread_run(khugepaged, NULL,
  2274. "khugepaged");
  2275. if (IS_ERR(khugepaged_thread)) {
  2276. pr_err("khugepaged: kthread_run(khugepaged) failed\n");
  2277. err = PTR_ERR(khugepaged_thread);
  2278. khugepaged_thread = NULL;
  2279. goto fail;
  2280. }
  2281. if (!list_empty(&khugepaged_scan.mm_head))
  2282. wake_up_interruptible(&khugepaged_wait);
  2283. } else if (khugepaged_thread) {
  2284. kthread_stop(khugepaged_thread);
  2285. khugepaged_thread = NULL;
  2286. }
  2287. set_recommended_min_free_kbytes();
  2288. fail:
  2289. mutex_unlock(&khugepaged_mutex);
  2290. return err;
  2291. }
  2292. void khugepaged_min_free_kbytes_update(void)
  2293. {
  2294. mutex_lock(&khugepaged_mutex);
  2295. if (hugepage_flags_enabled() && khugepaged_thread)
  2296. set_recommended_min_free_kbytes();
  2297. mutex_unlock(&khugepaged_mutex);
  2298. }
  2299. static int madvise_collapse_errno(enum scan_result r)
  2300. {
  2301. /*
  2302. * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide
  2303. * actionable feedback to caller, so they may take an appropriate
  2304. * fallback measure depending on the nature of the failure.
  2305. */
  2306. switch (r) {
  2307. case SCAN_ALLOC_HUGE_PAGE_FAIL:
  2308. return -ENOMEM;
  2309. case SCAN_CGROUP_CHARGE_FAIL:
  2310. return -EBUSY;
  2311. /* Resource temporary unavailable - trying again might succeed */
  2312. case SCAN_PAGE_COUNT:
  2313. case SCAN_PAGE_LOCK:
  2314. case SCAN_PAGE_LRU:
  2315. case SCAN_DEL_PAGE_LRU:
  2316. return -EAGAIN;
  2317. /*
  2318. * Other: Trying again likely not to succeed / error intrinsic to
  2319. * specified memory range. khugepaged likely won't be able to collapse
  2320. * either.
  2321. */
  2322. default:
  2323. return -EINVAL;
  2324. }
  2325. }
  2326. int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev,
  2327. unsigned long start, unsigned long end)
  2328. {
  2329. struct collapse_control *cc;
  2330. struct mm_struct *mm = vma->vm_mm;
  2331. unsigned long hstart, hend, addr;
  2332. int thps = 0, last_fail = SCAN_FAIL;
  2333. bool mmap_locked = true;
  2334. BUG_ON(vma->vm_start > start);
  2335. BUG_ON(vma->vm_end < end);
  2336. *prev = vma;
  2337. if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false))
  2338. return -EINVAL;
  2339. cc = kmalloc(sizeof(*cc), GFP_KERNEL);
  2340. if (!cc)
  2341. return -ENOMEM;
  2342. cc->is_khugepaged = false;
  2343. mmgrab(mm);
  2344. lru_add_drain_all();
  2345. hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
  2346. hend = end & HPAGE_PMD_MASK;
  2347. for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) {
  2348. int result = SCAN_FAIL;
  2349. if (!mmap_locked) {
  2350. cond_resched();
  2351. mmap_read_lock(mm);
  2352. mmap_locked = true;
  2353. result = hugepage_vma_revalidate(mm, addr, false, &vma,
  2354. cc);
  2355. if (result != SCAN_SUCCEED) {
  2356. last_fail = result;
  2357. goto out_nolock;
  2358. }
  2359. hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
  2360. }
  2361. mmap_assert_locked(mm);
  2362. memset(cc->node_load, 0, sizeof(cc->node_load));
  2363. nodes_clear(cc->alloc_nmask);
  2364. if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
  2365. struct file *file = get_file(vma->vm_file);
  2366. pgoff_t pgoff = linear_page_index(vma, addr);
  2367. mmap_read_unlock(mm);
  2368. mmap_locked = false;
  2369. result = hpage_collapse_scan_file(mm, addr, file, pgoff,
  2370. cc);
  2371. fput(file);
  2372. } else {
  2373. result = hpage_collapse_scan_pmd(mm, vma, addr,
  2374. &mmap_locked, cc);
  2375. }
  2376. if (!mmap_locked)
  2377. *prev = NULL; /* Tell caller we dropped mmap_lock */
  2378. handle_result:
  2379. switch (result) {
  2380. case SCAN_SUCCEED:
  2381. case SCAN_PMD_MAPPED:
  2382. ++thps;
  2383. break;
  2384. case SCAN_PTE_MAPPED_HUGEPAGE:
  2385. BUG_ON(mmap_locked);
  2386. BUG_ON(*prev);
  2387. mmap_write_lock(mm);
  2388. result = collapse_pte_mapped_thp(mm, addr, true);
  2389. mmap_write_unlock(mm);
  2390. goto handle_result;
  2391. /* Whitelisted set of results where continuing OK */
  2392. case SCAN_PMD_NULL:
  2393. case SCAN_PTE_NON_PRESENT:
  2394. case SCAN_PTE_UFFD_WP:
  2395. case SCAN_PAGE_RO:
  2396. case SCAN_LACK_REFERENCED_PAGE:
  2397. case SCAN_PAGE_NULL:
  2398. case SCAN_PAGE_COUNT:
  2399. case SCAN_PAGE_LOCK:
  2400. case SCAN_PAGE_COMPOUND:
  2401. case SCAN_PAGE_LRU:
  2402. case SCAN_DEL_PAGE_LRU:
  2403. last_fail = result;
  2404. break;
  2405. default:
  2406. last_fail = result;
  2407. /* Other error, exit */
  2408. goto out_maybelock;
  2409. }
  2410. }
  2411. out_maybelock:
  2412. /* Caller expects us to hold mmap_lock on return */
  2413. if (!mmap_locked)
  2414. mmap_read_lock(mm);
  2415. out_nolock:
  2416. mmap_assert_locked(mm);
  2417. mmdrop(mm);
  2418. kfree(cc);
  2419. return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0
  2420. : madvise_collapse_errno(last_fail);
  2421. }