// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/memremap.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/secretmem.h>
#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
#include <linux/migrate.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>

#include "internal.h"

struct follow_page_context {
	struct dev_pagemap *pgmap;
	unsigned int page_mask;
};

static inline void sanity_check_pinned_pages(struct page **pages,
					     unsigned long npages)
{
	if (!IS_ENABLED(CONFIG_DEBUG_VM))
		return;

	/*
	 * We only pin anonymous pages if they are exclusive. Once pinned, we
	 * can no longer turn them possibly shared and PageAnonExclusive() will
	 * stick around until the page is freed.
	 *
	 * We'd like to verify that our pinned anonymous pages are still mapped
	 * exclusively. The issue with anon THP is that we don't know how
	 * they are/were mapped when pinning them. However, for anon
	 * THP we can assume that either the given page (PTE-mapped THP) or
	 * the head page (PMD-mapped THP) should be PageAnonExclusive(). If
	 * neither is the case, there is certainly something wrong.
	 */
	for (; npages; npages--, pages++) {
		struct page *page = *pages;
		struct folio *folio = page_folio(page);

		if (!folio_test_anon(folio))
			continue;
		if (!folio_test_large(folio) || folio_test_hugetlb(folio))
			VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page), page);
		else
			/* Either a PTE-mapped or a PMD-mapped THP. */
			VM_BUG_ON_PAGE(!PageAnonExclusive(&folio->page) &&
				       !PageAnonExclusive(page), page);
	}
}

/*
 * Return the folio with ref appropriately incremented,
 * or NULL if that failed.
 */
static inline struct folio *try_get_folio(struct page *page, int refs)
{
	struct folio *folio;

retry:
	folio = page_folio(page);
	if (WARN_ON_ONCE(folio_ref_count(folio) < 0))
		return NULL;
	if (unlikely(!folio_ref_try_add_rcu(folio, refs)))
		return NULL;

	/*
	 * At this point we have a stable reference to the folio; but it
	 * could be that between calling page_folio() and the refcount
	 * increment, the folio was split, in which case we'd end up
	 * holding a reference on a folio that has nothing to do with the page
	 * we were given anymore.
	 * So now that the folio is stable, recheck that the page still
	 * belongs to this folio.
	 */
	if (unlikely(page_folio(page) != folio)) {
		if (!put_devmap_managed_page_refs(&folio->page, refs))
			folio_put_refs(folio, refs);
		goto retry;
	}

	return folio;
}

/**
 * try_grab_folio() - Attempt to get or pin a folio.
 * @page: pointer to page to be grabbed
 * @refs: the value to (effectively) add to the folio's refcount
 * @flags: gup flags: these are the FOLL_* flag values.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the folio's refcount.
 *
 * Either FOLL_PIN or FOLL_GET (or neither) must be set, but not both at the
 * same time. (That's true throughout the get_user_pages*() and
 * pin_user_pages*() APIs.) Cases:
 *
 *    FOLL_GET: folio's refcount will be incremented by @refs.
 *
 *    FOLL_PIN on large folios: folio's refcount will be incremented by
 *    @refs, and its compound_pincount will be incremented by @refs.
 *
 *    FOLL_PIN on single-page folios: folio's refcount will be incremented by
 *    @refs * GUP_PIN_COUNTING_BIAS.
 *
 * Return: The folio containing @page (with refcount appropriately
 * incremented) for success, or NULL upon failure. If neither FOLL_GET
 * nor FOLL_PIN was set, that's considered failure, and furthermore,
 * a likely bug in the caller, so a warning is also emitted.
 */
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
{
	if (flags & FOLL_GET)
		return try_get_folio(page, refs);
	else if (flags & FOLL_PIN) {
		struct folio *folio;

		/*
		 * Can't do FOLL_LONGTERM + FOLL_PIN gup fast path if not in a
		 * right zone, so fail and let the caller fall back to the slow
		 * path.
		 */
		if (unlikely((flags & FOLL_LONGTERM) &&
			     !is_longterm_pinnable_page(page)))
			return NULL;

		/*
		 * CAUTION: Don't use compound_head() on the page before this
		 * point, the result won't be stable.
		 */
		folio = try_get_folio(page, refs);
		if (!folio)
			return NULL;

		/*
		 * When pinning a large folio, use an exact count to track it.
		 *
		 * However, be sure to *also* increment the normal folio
		 * refcount field at least once, so that the folio really
		 * is pinned. That's why the refcount from the earlier
		 * try_get_folio() is left intact.
		 */
		if (folio_test_large(folio))
			atomic_add(refs, folio_pincount_ptr(folio));
		else
			folio_ref_add(folio,
				      refs * (GUP_PIN_COUNTING_BIAS - 1));
		/*
		 * Adjust the pincount before re-checking the PTE for changes.
		 * This is essentially a smp_mb() and is paired with a memory
		 * barrier in page_try_share_anon_rmap().
		 */
		smp_mb__after_atomic();

		node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, refs);

		return folio;
	}

	WARN_ON_ONCE(1);
	return NULL;
}
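
/*
 * Editor's sketch, not part of upstream gup.c: a minimal illustration of the
 * try_grab_folio() contract documented above, assuming the caller already has
 * a stable struct page pointer. The function name is hypothetical; a FOLL_PIN
 * grab taken here is dropped through the public unpin_user_page() API.
 */
static __maybe_unused int try_grab_folio_example_sketch(struct page *page)
{
	struct folio *folio;

	/* Pin the folio that currently contains @page (FOLL_PIN semantics). */
	folio = try_grab_folio(page, 1, FOLL_PIN);
	if (!folio)
		return -ENOMEM;

	/* ... access the page contents here ... */

	/* A FOLL_PIN reference is released via the unpin_user_*() family. */
	unpin_user_page(page);
	return 0;
}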

static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
{
	if (flags & FOLL_PIN) {
		node_stat_mod_folio(folio, NR_FOLL_PIN_RELEASED, refs);
		if (folio_test_large(folio))
			atomic_sub(refs, folio_pincount_ptr(folio));
		else
			refs *= GUP_PIN_COUNTING_BIAS;
	}

	if (!put_devmap_managed_page_refs(&folio->page, refs))
		folio_put_refs(folio, refs);
}

/**
 * try_grab_page() - elevate a page's refcount by a flag-dependent amount
 * @page: pointer to page to be grabbed
 * @flags: gup flags: these are the FOLL_* flag values.
 *
 * This might not do anything at all, depending on the flags argument.
 *
 * "grab" names in this file mean, "look at flags to decide whether to use
 * FOLL_PIN or FOLL_GET behavior, when incrementing the page's refcount.
 *
 * Either FOLL_PIN or FOLL_GET (or neither) may be set, but not both at the same
 * time. Cases: please see the try_grab_folio() documentation, with
 * "refs=1".
 *
 * Return: true for success, or if no action was required (if neither FOLL_PIN
 * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
 * FOLL_PIN was set, but the page could not be grabbed.
 */
bool __must_check try_grab_page(struct page *page, unsigned int flags)
{
	struct folio *folio = page_folio(page);

	WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
	if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
		return false;

	if (flags & FOLL_GET)
		folio_ref_inc(folio);
	else if (flags & FOLL_PIN) {
		/*
		 * Similar to try_grab_folio(): be sure to *also*
		 * increment the normal page refcount field at least once,
		 * so that the page really is pinned.
		 */
		if (folio_test_large(folio)) {
			folio_ref_add(folio, 1);
			atomic_add(1, folio_pincount_ptr(folio));
		} else {
			folio_ref_add(folio, GUP_PIN_COUNTING_BIAS);
		}

		node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
	}

	return true;
}

/**
 * unpin_user_page() - release a dma-pinned page
 * @page: pointer to page to be released
 *
 * Pages that were pinned via pin_user_pages*() must be released via either
 * unpin_user_page(), or one of the unpin_user_pages*() routines. This is so
 * that such pages can be separately tracked and uniquely handled. In
 * particular, interactions with RDMA and filesystems need special handling.
 */
void unpin_user_page(struct page *page)
{
	sanity_check_pinned_pages(&page, 1);
	gup_put_folio(page_folio(page), 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);
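
/*
 * Editor's sketch, not part of upstream gup.c: the typical pairing of
 * pin_user_pages_fast() with unpin_user_page(), e.g. around a short-lived
 * device access. The function name and single-page batch are illustrative.
 */
static __maybe_unused int pin_one_page_example_sketch(unsigned long uaddr)
{
	struct page *page;
	int ret;

	/* Pin exactly one user page for writing. */
	ret = pin_user_pages_fast(uaddr, 1, FOLL_WRITE, &page);
	if (ret != 1)
		return ret < 0 ? ret : -EFAULT;

	/* ... hand the page to hardware or access it here ... */

	/* Every page obtained via pin_user_pages*() is released like this. */
	unpin_user_page(page);
	return 0;
}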

static inline struct folio *gup_folio_range_next(struct page *start,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct page *next = nth_page(start, i);
	struct folio *folio = page_folio(next);
	unsigned int nr = 1;

	if (folio_test_large(folio))
		nr = min_t(unsigned int, npages - i,
			   folio_nr_pages(folio) - folio_page_idx(folio, next));

	*ntails = nr;
	return folio;
}

static inline struct folio *gup_folio_next(struct page **list,
		unsigned long npages, unsigned long i, unsigned int *ntails)
{
	struct folio *folio = page_folio(list[i]);
	unsigned int nr;

	for (nr = i + 1; nr < npages; nr++) {
		if (page_folio(list[nr]) != folio)
			break;
	}

	*ntails = nr - i;
	return folio;
}

/**
 * unpin_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
 * @pages: array of pages to be maybe marked dirty, and definitely released.
 * @npages: number of pages in the @pages array.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page" refers to a page that has had one of the get_user_pages()
 * variants called on that page.
 *
 * For each page in the @pages array, make that page (or its head page, if a
 * compound page) dirty, if @make_dirty is true, and if the page was previously
 * listed as clean. In any case, releases all pages using unpin_user_page(),
 * possibly via unpin_user_pages(), for the non-dirty case.
 *
 * Please see the unpin_user_page() documentation for details.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty_lock(), unpin_user_page().
 *
 */
void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages,
				 bool make_dirty)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	if (!make_dirty) {
		unpin_user_pages(pages, npages);
		return;
	}

	sanity_check_pinned_pages(pages, npages);
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		/*
		 * Checking PageDirty at this point may race with
		 * clear_page_dirty_for_io(), but that's OK. Two key
		 * cases:
		 *
		 * 1) This code sees the page as already dirty, so it
		 * skips the call to set_page_dirty(). That could happen
		 * because clear_page_dirty_for_io() called
		 * page_mkclean(), followed by set_page_dirty().
		 * However, now the page is going to get written back,
		 * which meets the original intention of setting it
		 * dirty, so all is well: clear_page_dirty_for_io() goes
		 * on to call TestClearPageDirty(), and write the page
		 * back.
		 *
		 * 2) This code sees the page as clean, so it calls
		 * set_page_dirty(). The page stays dirty, despite being
		 * written back, so it gets written back again in the
		 * next writeback cycle. This is harmless.
		 */
		if (!folio_test_dirty(folio)) {
			folio_lock(folio);
			folio_mark_dirty(folio);
			folio_unlock(folio);
		}
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_pages_dirty_lock);
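
/*
 * Editor's sketch, not part of upstream gup.c: a caller that let a device
 * write into pinned pages releases them and marks them dirty in one call, as
 * described above. The name and the fixed batch size are illustrative only.
 */
static __maybe_unused int pin_dirty_unpin_example_sketch(unsigned long uaddr)
{
	struct page *pages[4];
	int nr;

	/* Pin a small batch of pages that the device will write into. */
	nr = pin_user_pages_fast(uaddr, ARRAY_SIZE(pages), FOLL_WRITE, pages);
	if (nr <= 0)
		return nr ? nr : -EFAULT;

	/* ... device writes into the pinned pages here ... */

	/* Drop the pins and mark the written pages dirty in one pass. */
	unpin_user_pages_dirty_lock(pages, nr, true);
	return 0;
}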

/**
 * unpin_user_page_range_dirty_lock() - release and optionally dirty
 * gup-pinned page range
 *
 * @page: the starting page of a range maybe marked dirty, and definitely released.
 * @npages: number of consecutive pages to release.
 * @make_dirty: whether to mark the pages dirty
 *
 * "gup-pinned page range" refers to a range of pages that has had one of the
 * pin_user_pages() variants called on that page.
 *
 * For the page ranges defined by [page .. page+npages], make that range (or
 * its head pages, if a compound page) dirty, if @make_dirty is true, and if the
 * page range was previously listed as clean.
 *
 * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
 * required, then the caller should a) verify that this is really correct,
 * because _lock() is usually required, and b) hand code it:
 * set_page_dirty_lock(), unpin_user_page().
 *
 */
void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
				      bool make_dirty)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_range_next(page, npages, i, &nr);
		if (make_dirty && !folio_test_dirty(folio)) {
			folio_lock(folio);
			folio_mark_dirty(folio);
			folio_unlock(folio);
		}
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_page_range_dirty_lock);
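
/*
 * Editor's sketch, not part of upstream gup.c: the range variant above suits
 * callers that pinned physically consecutive pages and kept only the first
 * struct page pointer (e.g. part of a compound page). Illustrative name only.
 */
static __maybe_unused void unpin_range_example_sketch(struct page *first_page,
						      unsigned long npages,
						      bool wrote_to_pages)
{
	/* One call covers first_page, first_page + 1, ..., + npages - 1. */
	unpin_user_page_range_dirty_lock(first_page, npages, wrote_to_pages);
}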

static void unpin_user_pages_lockless(struct page **pages, unsigned long npages)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	/*
	 * Don't perform any sanity checks because we might have raced with
	 * fork() and some anonymous pages might now actually be shared --
	 * which is why we're unpinning after all.
	 */
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}

/**
 * unpin_user_pages() - release an array of gup-pinned pages.
 * @pages: array of pages to be marked dirty and released.
 * @npages: number of pages in the @pages array.
 *
 * For each page in the @pages array, release the page using unpin_user_page().
 *
 * Please see the unpin_user_page() documentation for details.
 */
void unpin_user_pages(struct page **pages, unsigned long npages)
{
	unsigned long i;
	struct folio *folio;
	unsigned int nr;

	/*
	 * If this WARN_ON() fires, then the system *might* be leaking pages (by
	 * leaving them pinned), but probably not. More likely, gup/pup returned
	 * a hard -ERRNO error to the caller, who erroneously passed it here.
	 */
	if (WARN_ON(IS_ERR_VALUE(npages)))
		return;

	sanity_check_pinned_pages(pages, npages);
	for (i = 0; i < npages; i += nr) {
		folio = gup_folio_next(pages, npages, i, &nr);
		gup_put_folio(folio, nr, FOLL_PIN);
	}
}
EXPORT_SYMBOL(unpin_user_pages);
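
/*
 * Editor's sketch, not part of upstream gup.c: a batch pin with the slow-path
 * pin_user_pages() (which needs the mmap_lock), released in one call with
 * unpin_user_pages() when nothing was written. Names are illustrative.
 */
static __maybe_unused long pin_batch_example_sketch(unsigned long uaddr,
						    struct page **pages,
						    unsigned long nr_pages)
{
	long pinned;

	mmap_read_lock(current->mm);
	pinned = pin_user_pages(uaddr, nr_pages, FOLL_WRITE, pages, NULL);
	mmap_read_unlock(current->mm);
	if (pinned <= 0)
		return pinned;

	/* ... read the pinned pages here (no dirtying needed) ... */

	unpin_user_pages(pages, pinned);
	return 0;
}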

/*
 * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's
 * lifecycle. Avoid setting the bit unless necessary, or it might cause write
 * cache bouncing on large SMP machines for concurrent pinned gups.
 */
static inline void mm_set_has_pinned_flag(unsigned long *mm_flags)
{
	if (!test_bit(MMF_HAS_PINNED, mm_flags))
		set_bit(MMF_HAS_PINNED, mm_flags);
}

#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
		unsigned int flags)
{
	/*
	 * When core dumping an enormous anonymous area that nobody
	 * has touched so far, we don't want to allocate unnecessary pages or
	 * page tables. Return error instead of NULL to skip handle_mm_fault,
	 * then get_dump_page() will return NULL to leave a hole in the dump.
	 * But we can only make this optimization where a hole would surely
	 * be zero-filled if handle_mm_fault() actually did handle it.
	 */
	if ((flags & FOLL_DUMP) &&
	    (vma_is_anonymous(vma) || !vma->vm_ops->fault))
		return ERR_PTR(-EFAULT);
	return NULL;
}

static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
		pte_t *pte, unsigned int flags)
{
	if (flags & FOLL_TOUCH) {
		pte_t entry = *pte;

		if (flags & FOLL_WRITE)
			entry = pte_mkdirty(entry);
		entry = pte_mkyoung(entry);

		if (!pte_same(*pte, entry)) {
			set_pte_at(vma->vm_mm, address, pte, entry);
			update_mmu_cache(vma, address, pte);
		}
	}

	/* Proper page table entry exists, but no corresponding struct page */
	return -EEXIST;
}

/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */
static inline bool can_follow_write_pte(pte_t pte, struct page *page,
					struct vm_area_struct *vma,
					unsigned int flags)
{
	/* If the pte is writable, we can write to the page. */
	if (pte_write(pte))
		return true;

	/* Maybe FOLL_FORCE is set to override it? */
	if (!(flags & FOLL_FORCE))
		return false;

	/* But FOLL_FORCE has no effect on shared mappings */
	if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
		return false;

	/* ... or read-only private ones */
	if (!(vma->vm_flags & VM_MAYWRITE))
		return false;

	/* ... or already writable ones that just need to take a write fault */
	if (vma->vm_flags & VM_WRITE)
		return false;

	/*
	 * See can_change_pte_writable(): we broke COW and could map the page
	 * writable if we have an exclusive anonymous page ...
	 */
	if (!page || !PageAnon(page) || !PageAnonExclusive(page))
		return false;

	/* ... and a write-fault isn't required for other reasons. */
	if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte))
		return false;
	return !userfaultfd_pte_wp(vma, pte);
}

static struct page *follow_page_pte(struct vm_area_struct *vma,
		unsigned long address, pmd_t *pmd, unsigned int flags,
		struct dev_pagemap **pgmap)
{
	struct mm_struct *mm = vma->vm_mm;
	struct page *page;
	spinlock_t *ptl;
	pte_t *ptep, pte;
	int ret;

	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
			 (FOLL_PIN | FOLL_GET)))
		return ERR_PTR(-EINVAL);

	/*
	 * Considering PTE level hugetlb, like continuous-PTE hugetlb on
	 * ARM64 architecture.
	 */
	if (is_vm_hugetlb_page(vma)) {
		page = follow_huge_pmd_pte(vma, address, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}

retry:
	if (unlikely(pmd_bad(*pmd)))
		return no_page_table(vma, flags);

	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
	pte = *ptep;
	if (!pte_present(pte)) {
		swp_entry_t entry;
		/*
		 * KSM's break_ksm() relies upon recognizing a ksm page
		 * even while it is being migrated, so for that case we
		 * need migration_entry_wait().
		 */
		if (likely(!(flags & FOLL_MIGRATION)))
			goto no_page;
		if (pte_none(pte))
			goto no_page;
		entry = pte_to_swp_entry(pte);
		if (!is_migration_entry(entry))
			goto no_page;
		pte_unmap_unlock(ptep, ptl);
		migration_entry_wait(mm, pmd, address);
		goto retry;
	}
	if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
		goto no_page;

	page = vm_normal_page(vma, address, pte);

	/*
	 * We only care about anon pages in can_follow_write_pte() and don't
	 * have to worry about pte_devmap() because they are never anon.
	 */
	if ((flags & FOLL_WRITE) &&
	    !can_follow_write_pte(pte, page, vma, flags)) {
		page = NULL;
		goto out;
	}

	if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) {
		/*
		 * Only return device mapping pages in the FOLL_GET or FOLL_PIN
		 * case since they are only valid while holding the pgmap
		 * reference.
		 */
		*pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
		if (*pgmap)
			page = pte_page(pte);
		else
			goto no_page;
	} else if (unlikely(!page)) {
		if (flags & FOLL_DUMP) {
			/* Avoid special (like zero) pages in core dumps */
			page = ERR_PTR(-EFAULT);
			goto out;
		}

		if (is_zero_pfn(pte_pfn(pte))) {
			page = pte_page(pte);
		} else {
			ret = follow_pfn_pte(vma, address, ptep, flags);
			page = ERR_PTR(ret);
			goto out;
		}
	}

	if (!pte_write(pte) && gup_must_unshare(flags, page)) {
		page = ERR_PTR(-EMLINK);
		goto out;
	}

	VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
		       !PageAnonExclusive(page), page);

	/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
	if (unlikely(!try_grab_page(page, flags))) {
		page = ERR_PTR(-ENOMEM);
		goto out;
	}

	/*
	 * We need to make the page accessible if and only if we are going
	 * to access its content (the FOLL_PIN case). Please see
	 * Documentation/core-api/pin_user_pages.rst for details.
	 */
	if (flags & FOLL_PIN) {
		ret = arch_make_page_accessible(page);
		if (ret) {
			unpin_user_page(page);
			page = ERR_PTR(ret);
			goto out;
		}
	}
	if (flags & FOLL_TOUCH) {
		if ((flags & FOLL_WRITE) &&
		    !pte_dirty(pte) && !PageDirty(page))
			set_page_dirty(page);
		/*
		 * pte_mkyoung() would be more correct here, but atomic care
		 * is needed to avoid losing the dirty bit: it is easier to use
		 * mark_page_accessed().
		 */
		mark_page_accessed(page);
	}
out:
	pte_unmap_unlock(ptep, ptl);
	return page;
no_page:
	pte_unmap_unlock(ptep, ptl);
	if (!pte_none(pte))
		return NULL;
	return no_page_table(vma, flags);
}

static struct page *follow_pmd_mask(struct vm_area_struct *vma,
				    unsigned long address, pud_t *pudp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pmd_t *pmd, pmdval;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pmd = pmd_offset(pudp, address);
	/*
	 * The READ_ONCE() will stabilize the pmdval in a register or
	 * on the stack so that it will stop changing under the code.
	 */
	pmdval = READ_ONCE(*pmd);
	if (pmd_none(pmdval))
		return no_page_table(vma, flags);
	if (pmd_huge(pmdval) && is_vm_hugetlb_page(vma)) {
		page = follow_huge_pmd_pte(vma, address, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pmd_val(pmdval)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pmd_val(pmdval)), flags,
				      PMD_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
retry:
	if (!pmd_present(pmdval)) {
		/*
		 * Should never reach here, if thp migration is not supported;
		 * Otherwise, it must be a thp migration entry.
		 */
		VM_BUG_ON(!thp_migration_supported() ||
			  !is_pmd_migration_entry(pmdval));
		if (likely(!(flags & FOLL_MIGRATION)))
			return no_page_table(vma, flags);
		pmd_migration_entry_wait(mm, pmd);
		pmdval = READ_ONCE(*pmd);
		/*
		 * MADV_DONTNEED may convert the pmd to null because
		 * mmap_lock is held in read mode
		 */
		if (pmd_none(pmdval))
			return no_page_table(vma, flags);
		goto retry;
	}
	if (pmd_devmap(pmdval)) {
		ptl = pmd_lock(mm, pmd);
		page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (likely(!pmd_trans_huge(pmdval)))
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);

	if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags))
		return no_page_table(vma, flags);

retry_locked:
	ptl = pmd_lock(mm, pmd);
	if (unlikely(pmd_none(*pmd))) {
		spin_unlock(ptl);
		return no_page_table(vma, flags);
	}
	if (unlikely(!pmd_present(*pmd))) {
		spin_unlock(ptl);
		if (likely(!(flags & FOLL_MIGRATION)))
			return no_page_table(vma, flags);
		pmd_migration_entry_wait(mm, pmd);
		goto retry_locked;
	}
	if (unlikely(!pmd_trans_huge(*pmd))) {
		spin_unlock(ptl);
		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	if (flags & FOLL_SPLIT_PMD) {
		int ret;

		page = pmd_page(*pmd);
		if (is_huge_zero_page(page)) {
			spin_unlock(ptl);
			ret = 0;
			split_huge_pmd(vma, pmd, address);
			if (pmd_trans_unstable(pmd))
				ret = -EBUSY;
		} else {
			spin_unlock(ptl);
			split_huge_pmd(vma, pmd, address);
			ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
		}

		return ret ? ERR_PTR(ret) :
			follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
	}
	page = follow_trans_huge_pmd(vma, address, pmd, flags);
	spin_unlock(ptl);
	ctx->page_mask = HPAGE_PMD_NR - 1;
	return page;
}

static struct page *follow_pud_mask(struct vm_area_struct *vma,
				    unsigned long address, p4d_t *p4dp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	pud_t *pud;
	spinlock_t *ptl;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	pud = pud_offset(p4dp, address);
	if (pud_none(*pud))
		return no_page_table(vma, flags);
	if (pud_huge(*pud) && is_vm_hugetlb_page(vma)) {
		page = follow_huge_pud(mm, address, pud, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pud_val(*pud)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pud_val(*pud)), flags,
				      PUD_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (pud_devmap(*pud)) {
		ptl = pud_lock(mm, pud);
		page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
		spin_unlock(ptl);
		if (page)
			return page;
	}
	if (unlikely(pud_bad(*pud)))
		return no_page_table(vma, flags);

	return follow_pmd_mask(vma, address, pud, flags, ctx);
}

static struct page *follow_p4d_mask(struct vm_area_struct *vma,
				    unsigned long address, pgd_t *pgdp,
				    unsigned int flags,
				    struct follow_page_context *ctx)
{
	p4d_t *p4d;
	struct page *page;

	p4d = p4d_offset(pgdp, address);
	if (p4d_none(*p4d))
		return no_page_table(vma, flags);
	BUILD_BUG_ON(p4d_huge(*p4d));
	if (unlikely(p4d_bad(*p4d)))
		return no_page_table(vma, flags);

	if (is_hugepd(__hugepd(p4d_val(*p4d)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(p4d_val(*p4d)), flags,
				      P4D_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}

	return follow_pud_mask(vma, address, p4d, flags, ctx);
}

/**
 * follow_page_mask - look up a page descriptor from a user-virtual address
 * @vma: vm_area_struct mapping @address
 * @address: virtual address to look up
 * @flags: flags modifying lookup behaviour
 * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
 *       pointer to output page_mask
 *
 * @flags can have FOLL_ flags set, defined in <linux/mm.h>
 *
 * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
 * the device's dev_pagemap metadata to avoid repeating expensive lookups.
 *
 * When getting an anonymous page and the caller has to trigger unsharing
 * of a shared anonymous page first, -EMLINK is returned. The caller should
 * trigger a fault with FAULT_FLAG_UNSHARE set. Note that unsharing is only
 * relevant with FOLL_PIN and !FOLL_WRITE.
 *
 * On output, the @ctx->page_mask is set according to the size of the page.
 *
 * Return: the mapped (struct page *), %NULL if no mapping exists, or
 * an error pointer if there is a mapping to something not represented
 * by a page descriptor (see also vm_normal_page()).
 */
static struct page *follow_page_mask(struct vm_area_struct *vma,
			      unsigned long address, unsigned int flags,
			      struct follow_page_context *ctx)
{
	pgd_t *pgd;
	struct page *page;
	struct mm_struct *mm = vma->vm_mm;

	ctx->page_mask = 0;

	/* make this handle hugepd */
	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
	if (!IS_ERR(page)) {
		WARN_ON_ONCE(flags & (FOLL_GET | FOLL_PIN));
		return page;
	}

	pgd = pgd_offset(mm, address);

	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
		return no_page_table(vma, flags);

	if (pgd_huge(*pgd)) {
		page = follow_huge_pgd(mm, address, pgd, flags);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}
	if (is_hugepd(__hugepd(pgd_val(*pgd)))) {
		page = follow_huge_pd(vma, address,
				      __hugepd(pgd_val(*pgd)), flags,
				      PGDIR_SHIFT);
		if (page)
			return page;
		return no_page_table(vma, flags);
	}

	return follow_p4d_mask(vma, address, pgd, flags, ctx);
}

struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
			 unsigned int foll_flags)
{
	struct follow_page_context ctx = { NULL };
	struct page *page;

	if (vma_is_secretmem(vma))
		return NULL;

	if (foll_flags & FOLL_PIN)
		return NULL;

	page = follow_page_mask(vma, address, foll_flags, &ctx);
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return page;
}
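
/*
 * Editor's sketch, not part of upstream gup.c: a typical follow_page() user
 * (KSM-style) looks up the page mapped at one address under the mmap_lock and
 * takes a reference with FOLL_GET. The function name is hypothetical.
 */
static __maybe_unused struct page *follow_one_page_example_sketch(struct mm_struct *mm,
								  unsigned long addr)
{
	struct vm_area_struct *vma;
	struct page *page = NULL;

	mmap_read_lock(mm);
	vma = vma_lookup(mm, addr);
	if (vma) {
		/* FOLL_GET keeps the page valid after the lock is dropped. */
		page = follow_page(vma, addr, FOLL_GET);
		if (IS_ERR_OR_NULL(page))
			page = NULL;
	}
	mmap_read_unlock(mm);
	return page;	/* the caller does put_page() when done */
}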

static int get_gate_page(struct mm_struct *mm, unsigned long address,
		unsigned int gup_flags, struct vm_area_struct **vma,
		struct page **page)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;
	int ret = -EFAULT;

	/* user gate pages are read-only */
	if (gup_flags & FOLL_WRITE)
		return -EFAULT;
	if (address > TASK_SIZE)
		pgd = pgd_offset_k(address);
	else
		pgd = pgd_offset_gate(mm, address);
	if (pgd_none(*pgd))
		return -EFAULT;
	p4d = p4d_offset(pgd, address);
	if (p4d_none(*p4d))
		return -EFAULT;
	pud = pud_offset(p4d, address);
	if (pud_none(*pud))
		return -EFAULT;
	pmd = pmd_offset(pud, address);
	if (!pmd_present(*pmd))
		return -EFAULT;
	VM_BUG_ON(pmd_trans_huge(*pmd));
	pte = pte_offset_map(pmd, address);
	if (pte_none(*pte))
		goto unmap;
	*vma = get_gate_vma(mm);
	if (!page)
		goto out;
	*page = vm_normal_page(*vma, address, *pte);
	if (!*page) {
		if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
			goto unmap;
		*page = pte_page(*pte);
	}
	if (unlikely(!try_grab_page(*page, gup_flags))) {
		ret = -ENOMEM;
		goto unmap;
	}
out:
	ret = 0;
unmap:
	pte_unmap(pte);
	return ret;
}

/*
 * mmap_lock must be held on entry. If @locked != NULL and *@flags
 * does not include FOLL_NOWAIT, the mmap_lock may be released. If it
 * is, *@locked will be set to 0 and -EBUSY returned.
 */
static int faultin_page(struct vm_area_struct *vma,
		unsigned long address, unsigned int *flags, bool unshare,
		int *locked)
{
	unsigned int fault_flags = 0;
	vm_fault_t ret;

	if (*flags & FOLL_NOFAULT)
		return -EFAULT;
	if (*flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (*flags & FOLL_REMOTE)
		fault_flags |= FAULT_FLAG_REMOTE;
	if (locked)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
	if (*flags & FOLL_NOWAIT)
		fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
	if (*flags & FOLL_TRIED) {
		/*
		 * Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
		 * can co-exist
		 */
		fault_flags |= FAULT_FLAG_TRIED;
	}
	if (unshare) {
		fault_flags |= FAULT_FLAG_UNSHARE;
		/* FAULT_FLAG_WRITE and FAULT_FLAG_UNSHARE are incompatible */
		VM_BUG_ON(fault_flags & FAULT_FLAG_WRITE);
	}

	ret = handle_mm_fault(vma, address, fault_flags, NULL);

	if (ret & VM_FAULT_COMPLETED) {
		/*
		 * With FAULT_FLAG_RETRY_NOWAIT we'll never release the
		 * mmap lock in the page fault handler. Sanity check this.
		 */
		WARN_ON_ONCE(fault_flags & FAULT_FLAG_RETRY_NOWAIT);
		if (locked)
			*locked = 0;
		/*
		 * We should do the same as VM_FAULT_RETRY, but let's not
		 * return -EBUSY since that's not reflecting the reality of
		 * what has happened - we've just fully completed a page
		 * fault, with the mmap lock released. Use -EAGAIN to show
		 * that we want to take the mmap lock _again_.
		 */
		return -EAGAIN;
	}

	if (ret & VM_FAULT_ERROR) {
		int err = vm_fault_to_errno(ret, *flags);

		if (err)
			return err;
		BUG();
	}

	if (ret & VM_FAULT_RETRY) {
		if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
			*locked = 0;
		return -EBUSY;
	}

	return 0;
}

static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
{
	vm_flags_t vm_flags = vma->vm_flags;
	int write = (gup_flags & FOLL_WRITE);
	int foreign = (gup_flags & FOLL_REMOTE);

	if (vm_flags & (VM_IO | VM_PFNMAP))
		return -EFAULT;

	if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
		return -EFAULT;

	if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
		return -EOPNOTSUPP;

	if (vma_is_secretmem(vma))
		return -EFAULT;

	if (write) {
		if (!(vm_flags & VM_WRITE)) {
			if (!(gup_flags & FOLL_FORCE))
				return -EFAULT;
			/* hugetlb does not support FOLL_FORCE|FOLL_WRITE. */
			if (is_vm_hugetlb_page(vma))
				return -EFAULT;
			/*
			 * We used to let the write,force case do COW in a
			 * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could
			 * set a breakpoint in a read-only mapping of an
			 * executable, without corrupting the file (yet only
			 * when that file had been opened for writing!).
			 * Anon pages in shared mappings are surprising: now
			 * just reject it.
			 */
			if (!is_cow_mapping(vm_flags))
				return -EFAULT;
		}
	} else if (!(vm_flags & VM_READ)) {
		if (!(gup_flags & FOLL_FORCE))
			return -EFAULT;
		/*
		 * Is there actually any vma we can reach here which does not
		 * have VM_MAYREAD set?
		 */
		if (!(vm_flags & VM_MAYREAD))
			return -EFAULT;
	}
	/*
	 * gups are always data accesses, not instruction
	 * fetches, so execute=false here
	 */
	if (!arch_vma_access_permitted(vma, write, false, foreign))
		return -EFAULT;
	return 0;
}

/**
 * __get_user_pages() - pin user pages in memory
 * @mm: mm_struct of target mm
 * @start: starting user address
 * @nr_pages: number of pages from start to pin
 * @gup_flags: flags modifying pin behaviour
 * @pages: array that receives pointers to the pages pinned.
 *	Should be at least nr_pages long. Or NULL, if caller
 *	only intends to ensure the pages are faulted in.
 * @vmas: array of pointers to vmas corresponding to each page.
 *	Or NULL if the caller does not require them.
 * @locked: whether we're still with the mmap_lock held
 *
 * Returns either number of pages pinned (which may be less than the
 * number requested), or an error. Details about the return value:
 *
 * -- If nr_pages is 0, returns 0.
 * -- If nr_pages is >0, but no pages were pinned, returns -errno.
 * -- If nr_pages is >0, and some pages were pinned, returns the number of
 *    pages pinned. Again, this may be less than nr_pages.
 * -- 0 return value is possible when the fault would need to be retried.
 *
 * The caller is responsible for releasing returned @pages, via put_page().
 *
 * @vmas are valid only as long as mmap_lock is held.
 *
 * Must be called with mmap_lock held. It may be released. See below.
 *
 * __get_user_pages walks a process's page tables and takes a reference to
 * each struct page that each user address corresponds to at a given
 * instant. That is, it takes the page that would be accessed if a user
 * thread accesses the given user virtual address at that instant.
 *
 * This does not guarantee that the page exists in the user mappings when
 * __get_user_pages returns, and there may even be a completely different
 * page there in some cases (eg. if mmapped pagecache has been invalidated
 * and subsequently re faulted). However it does guarantee that the page
 * won't be freed completely. And mostly callers simply care that the page
 * contains data that was valid *at some point in time*. Typically, an IO
 * or similar operation cannot guarantee anything stronger anyway because
 * locks can't be held over the syscall boundary.
 *
 * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
 * the page is written to, set_page_dirty (or set_page_dirty_lock, as
 * appropriate) must be called after the page is finished with, and
 * before put_page is called.
 *
 * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
 * released by an up_read(). That can happen if @gup_flags does not
 * have FOLL_NOWAIT.
 *
 * A caller using such a combination of @locked and @gup_flags
 * must therefore hold the mmap_lock for reading only, and recognize
 * when it's been released. Otherwise, it must be held for either
 * reading or writing and will not be released.
 *
 * In most cases, get_user_pages or get_user_pages_fast should be used
 * instead of __get_user_pages. __get_user_pages should be used only if
 * you need some special @gup_flags.
 */
static long __get_user_pages(struct mm_struct *mm,
		unsigned long start, unsigned long nr_pages,
		unsigned int gup_flags, struct page **pages,
		struct vm_area_struct **vmas, int *locked)
{
	long ret = 0, i = 0;
	struct vm_area_struct *vma = NULL;
	struct follow_page_context ctx = { NULL };

	if (!nr_pages)
		return 0;

	start = untagged_addr(start);

	VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN)));

	do {
		struct page *page;
		unsigned int foll_flags = gup_flags;
		unsigned int page_increm;

		/* first iteration or cross vma bound */
		if (!vma || start >= vma->vm_end) {
			vma = vma_lookup(mm, start);
			if (!vma && in_gate_area(mm, start)) {
				ret = get_gate_page(mm, start & PAGE_MASK,
						gup_flags, &vma,
						pages ? &pages[i] : NULL);
				if (ret)
					goto out;
				ctx.page_mask = 0;
				goto next_page;
			}

			if (!vma) {
				ret = -EFAULT;
				goto out;
			}
			ret = check_vma_flags(vma, gup_flags);
			if (ret)
				goto out;

			if (is_vm_hugetlb_page(vma)) {
				i = follow_hugetlb_page(mm, vma, pages, vmas,
						&start, &nr_pages, i,
						gup_flags, locked);
				if (locked && *locked == 0) {
					/*
					 * We've got a VM_FAULT_RETRY
					 * and we've lost mmap_lock.
					 * We must stop here.
					 */
					BUG_ON(gup_flags & FOLL_NOWAIT);
					goto out;
				}
				continue;
			}
		}
retry:
		/*
		 * If we have a pending SIGKILL, don't keep faulting pages and
		 * potentially allocating memory.
		 */
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
		cond_resched();

		page = follow_page_mask(vma, start, foll_flags, &ctx);
		if (!page || PTR_ERR(page) == -EMLINK) {
			ret = faultin_page(vma, start, &foll_flags,
					   PTR_ERR(page) == -EMLINK, locked);
			switch (ret) {
			case 0:
				goto retry;
			case -EBUSY:
			case -EAGAIN:
				ret = 0;
				fallthrough;
			case -EFAULT:
			case -ENOMEM:
			case -EHWPOISON:
				goto out;
			}
			BUG();
		} else if (PTR_ERR(page) == -EEXIST) {
			/*
			 * Proper page table entry exists, but no corresponding
			 * struct page. If the caller expects **pages to be
			 * filled in, bail out now, because that can't be done
			 * for this page.
			 */
			if (pages) {
				ret = PTR_ERR(page);
				goto out;
			}

			goto next_page;
		} else if (IS_ERR(page)) {
			ret = PTR_ERR(page);
			goto out;
		}
		if (pages) {
			pages[i] = page;
			flush_anon_page(vma, page, start);
			flush_dcache_page(page);
			ctx.page_mask = 0;
		}
next_page:
		if (vmas) {
			vmas[i] = vma;
			ctx.page_mask = 0;
		}
		page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
		if (page_increm > nr_pages)
			page_increm = nr_pages;
		i += page_increm;
		start += page_increm * PAGE_SIZE;
		nr_pages -= page_increm;
	} while (nr_pages);
out:
	if (ctx.pgmap)
		put_dev_pagemap(ctx.pgmap);
	return i ? i : ret;
}
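
/*
 * Editor's sketch, not part of upstream gup.c: as the comment above notes,
 * callers normally use the public wrappers instead of __get_user_pages().
 * A minimal get_user_pages_fast() round trip, with illustrative names:
 */
static __maybe_unused int gup_fast_example_sketch(unsigned long uaddr)
{
	struct page *pages[2];
	int i, nr;

	/* The _fast() variant does not require the mmap_lock. */
	nr = get_user_pages_fast(uaddr, ARRAY_SIZE(pages), FOLL_WRITE, pages);
	if (nr <= 0)
		return nr ? nr : -EFAULT;

	/* ... write into the pages, then dirty them before release ... */
	for (i = 0; i < nr; i++) {
		set_page_dirty_lock(pages[i]);
		put_page(pages[i]);
	}
	return 0;
}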
  1170. static bool vma_permits_fault(struct vm_area_struct *vma,
  1171. unsigned int fault_flags)
  1172. {
  1173. bool write = !!(fault_flags & FAULT_FLAG_WRITE);
  1174. bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE);
  1175. vm_flags_t vm_flags = write ? VM_WRITE : VM_READ;
  1176. if (!(vm_flags & vma->vm_flags))
  1177. return false;
  1178. /*
  1179. * The architecture might have a hardware protection
  1180. * mechanism other than read/write that can deny access.
  1181. *
  1182. * gup always represents data access, not instruction
  1183. * fetches, so execute=false here:
  1184. */
  1185. if (!arch_vma_access_permitted(vma, write, false, foreign))
  1186. return false;
  1187. return true;
  1188. }
  1189. /**
  1190. * fixup_user_fault() - manually resolve a user page fault
  1191. * @mm: mm_struct of target mm
  1192. * @address: user address
  1193. * @fault_flags:flags to pass down to handle_mm_fault()
  1194. * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller
  1195. * does not allow retry. If NULL, the caller must guarantee
  1196. * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
  1197. *
  1198. * This is meant to be called in the specific scenario where for locking reasons
  1199. * we try to access user memory in atomic context (within a pagefault_disable()
  1200. * section), this returns -EFAULT, and we want to resolve the user fault before
  1201. * trying again.
  1202. *
  1203. * Typically this is meant to be used by the futex code.
  1204. *
  1205. * The main difference with get_user_pages() is that this function will
  1206. * unconditionally call handle_mm_fault() which will in turn perform all the
  1207. * necessary SW fixup of the dirty and young bits in the PTE, while
  1208. * get_user_pages() only guarantees to update these in the struct page.
  1209. *
  1210. * This is important for some architectures where those bits also gate the
  1211. * access permission to the page because they are maintained in software. On
  1212. * such architectures, gup() will not be enough to make a subsequent access
  1213. * succeed.
  1214. *
1215. * This function will not return with an unlocked mmap_lock, so it does not have
1216. * the same semantics w.r.t. the @mm->mmap_lock as filemap_fault() does.
  1217. */
  1218. int fixup_user_fault(struct mm_struct *mm,
  1219. unsigned long address, unsigned int fault_flags,
  1220. bool *unlocked)
  1221. {
  1222. struct vm_area_struct *vma;
  1223. vm_fault_t ret;
  1224. address = untagged_addr(address);
  1225. if (unlocked)
  1226. fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
  1227. retry:
  1228. vma = vma_lookup(mm, address);
  1229. if (!vma)
  1230. return -EFAULT;
  1231. if (!vma_permits_fault(vma, fault_flags))
  1232. return -EFAULT;
  1233. if ((fault_flags & FAULT_FLAG_KILLABLE) &&
  1234. fatal_signal_pending(current))
  1235. return -EINTR;
  1236. ret = handle_mm_fault(vma, address, fault_flags, NULL);
  1237. if (ret & VM_FAULT_COMPLETED) {
  1238. /*
  1239. * NOTE: it's a pity that we need to retake the lock here
  1240. * to pair with the unlock() in the callers. Ideally we
  1241. * could tell the callers so they do not need to unlock.
  1242. */
  1243. mmap_read_lock(mm);
  1244. *unlocked = true;
  1245. return 0;
  1246. }
  1247. if (ret & VM_FAULT_ERROR) {
  1248. int err = vm_fault_to_errno(ret, 0);
  1249. if (err)
  1250. return err;
  1251. BUG();
  1252. }
  1253. if (ret & VM_FAULT_RETRY) {
  1254. mmap_read_lock(mm);
  1255. *unlocked = true;
  1256. fault_flags |= FAULT_FLAG_TRIED;
  1257. goto retry;
  1258. }
  1259. return 0;
  1260. }
  1261. EXPORT_SYMBOL_GPL(fixup_user_fault);
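/*
 * Usage sketch (editorial addition, not part of the original file): the
 * futex-style pattern this helper exists for -- an atomic user access fails
 * under pagefault_disable(), the fault is resolved here, and the access is
 * retried. The names uaddr, val, mm and the labels are hypothetical; only
 * fixup_user_fault(), __get_user(), pagefault_disable()/enable() and the
 * mmap_lock helpers are real APIs.
 *
 *	unsigned long uaddr;	(user address of the value)
 *	bool unlocked = false;
 *	u32 val;
 *	int ret;
 *
 *	mmap_read_lock(mm);
 * again:
 *	pagefault_disable();
 *	ret = __get_user(val, (u32 __user *)uaddr);	(fails instead of faulting)
 *	pagefault_enable();
 *	if (ret) {
 *		ret = fixup_user_fault(mm, uaddr, 0, &unlocked);
 *		if (!ret)
 *			goto again;	(fault resolved, retry the access)
 *		(-EFAULT or -EINTR: give up)
 *	}
 *	mmap_read_unlock(mm);
 */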
  1262. /*
1263. * Please note that this function, unlike __get_user_pages(), will not
1264. * return 0 for nr_pages > 0 unless FOLL_NOWAIT is used.
  1265. */
  1266. static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
  1267. unsigned long start,
  1268. unsigned long nr_pages,
  1269. struct page **pages,
  1270. struct vm_area_struct **vmas,
  1271. int *locked,
  1272. unsigned int flags)
  1273. {
  1274. long ret, pages_done;
  1275. bool lock_dropped;
  1276. if (locked) {
  1277. /* if VM_FAULT_RETRY can be returned, vmas become invalid */
  1278. BUG_ON(vmas);
  1279. /* check caller initialized locked */
  1280. BUG_ON(*locked != 1);
  1281. }
  1282. if (flags & FOLL_PIN)
  1283. mm_set_has_pinned_flag(&mm->flags);
  1284. /*
  1285. * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
  1286. * is to set FOLL_GET if the caller wants pages[] filled in (but has
  1287. * carelessly failed to specify FOLL_GET), so keep doing that, but only
  1288. * for FOLL_GET, not for the newer FOLL_PIN.
  1289. *
  1290. * FOLL_PIN always expects pages to be non-null, but no need to assert
  1291. * that here, as any failures will be obvious enough.
  1292. */
  1293. if (pages && !(flags & FOLL_PIN))
  1294. flags |= FOLL_GET;
  1295. pages_done = 0;
  1296. lock_dropped = false;
  1297. for (;;) {
  1298. ret = __get_user_pages(mm, start, nr_pages, flags, pages,
  1299. vmas, locked);
  1300. if (!locked)
  1301. /* VM_FAULT_RETRY couldn't trigger, bypass */
  1302. return ret;
  1303. /* VM_FAULT_RETRY or VM_FAULT_COMPLETED cannot return errors */
  1304. if (!*locked) {
  1305. BUG_ON(ret < 0);
  1306. BUG_ON(ret >= nr_pages);
  1307. }
  1308. if (ret > 0) {
  1309. nr_pages -= ret;
  1310. pages_done += ret;
  1311. if (!nr_pages)
  1312. break;
  1313. }
  1314. if (*locked) {
  1315. /*
  1316. * VM_FAULT_RETRY didn't trigger or it was a
  1317. * FOLL_NOWAIT.
  1318. */
  1319. if (!pages_done)
  1320. pages_done = ret;
  1321. break;
  1322. }
  1323. /*
  1324. * VM_FAULT_RETRY triggered, so seek to the faulting offset.
  1325. * For the prefault case (!pages) we only update counts.
  1326. */
  1327. if (likely(pages))
  1328. pages += ret;
  1329. start += ret << PAGE_SHIFT;
  1330. lock_dropped = true;
  1331. retry:
  1332. /*
  1333. * Repeat on the address that fired VM_FAULT_RETRY
  1334. * with both FAULT_FLAG_ALLOW_RETRY and
  1335. * FAULT_FLAG_TRIED. Note that GUP can be interrupted
  1336. * by fatal signals, so we need to check it before we
  1337. * start trying again otherwise it can loop forever.
  1338. */
  1339. if (fatal_signal_pending(current)) {
  1340. if (!pages_done)
  1341. pages_done = -EINTR;
  1342. break;
  1343. }
  1344. ret = mmap_read_lock_killable(mm);
  1345. if (ret) {
  1346. BUG_ON(ret > 0);
  1347. if (!pages_done)
  1348. pages_done = ret;
  1349. break;
  1350. }
  1351. *locked = 1;
  1352. ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
  1353. pages, NULL, locked);
  1354. if (!*locked) {
1355. /* Continue to retry until we succeed */
  1356. BUG_ON(ret != 0);
  1357. goto retry;
  1358. }
  1359. if (ret != 1) {
  1360. BUG_ON(ret > 1);
  1361. if (!pages_done)
  1362. pages_done = ret;
  1363. break;
  1364. }
  1365. nr_pages--;
  1366. pages_done++;
  1367. if (!nr_pages)
  1368. break;
  1369. if (likely(pages))
  1370. pages++;
  1371. start += PAGE_SIZE;
  1372. }
  1373. if (lock_dropped && *locked) {
  1374. /*
  1375. * We must let the caller know we temporarily dropped the lock
  1376. * and so the critical section protected by it was lost.
  1377. */
  1378. mmap_read_unlock(mm);
  1379. *locked = 0;
  1380. }
  1381. return pages_done;
  1382. }
  1383. /**
  1384. * populate_vma_page_range() - populate a range of pages in the vma.
  1385. * @vma: target vma
  1386. * @start: start address
  1387. * @end: end address
  1388. * @locked: whether the mmap_lock is still held
  1389. *
  1390. * This takes care of mlocking the pages too if VM_LOCKED is set.
  1391. *
  1392. * Return either number of pages pinned in the vma, or a negative error
  1393. * code on error.
  1394. *
  1395. * vma->vm_mm->mmap_lock must be held.
  1396. *
1397. * If @locked is NULL, the mmap_lock may be held for read or write and will be
1398. * left unperturbed.
1399. *
1400. * If @locked is non-NULL, the mmap_lock must be held for read only and may be
1401. * released. If it is released, *@locked will be set to 0.
  1402. */
  1403. long populate_vma_page_range(struct vm_area_struct *vma,
  1404. unsigned long start, unsigned long end, int *locked)
  1405. {
  1406. struct mm_struct *mm = vma->vm_mm;
  1407. unsigned long nr_pages = (end - start) / PAGE_SIZE;
  1408. int gup_flags;
  1409. long ret;
  1410. VM_BUG_ON(!PAGE_ALIGNED(start));
  1411. VM_BUG_ON(!PAGE_ALIGNED(end));
  1412. VM_BUG_ON_VMA(start < vma->vm_start, vma);
  1413. VM_BUG_ON_VMA(end > vma->vm_end, vma);
  1414. mmap_assert_locked(mm);
  1415. /*
  1416. * Rightly or wrongly, the VM_LOCKONFAULT case has never used
  1417. * faultin_page() to break COW, so it has no work to do here.
  1418. */
  1419. if (vma->vm_flags & VM_LOCKONFAULT)
  1420. return nr_pages;
  1421. gup_flags = FOLL_TOUCH;
  1422. /*
  1423. * We want to touch writable mappings with a write fault in order
  1424. * to break COW, except for shared mappings because these don't COW
  1425. * and we would not want to dirty them for nothing.
  1426. */
  1427. if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
  1428. gup_flags |= FOLL_WRITE;
  1429. /*
  1430. * We want mlock to succeed for regions that have any permissions
  1431. * other than PROT_NONE.
  1432. */
  1433. if (vma_is_accessible(vma))
  1434. gup_flags |= FOLL_FORCE;
  1435. /*
  1436. * We made sure addr is within a VMA, so the following will
  1437. * not result in a stack expansion that recurses back here.
  1438. */
  1439. ret = __get_user_pages(mm, start, nr_pages, gup_flags,
  1440. NULL, NULL, locked);
  1441. lru_add_drain();
  1442. return ret;
  1443. }
  1444. /*
  1445. * faultin_vma_page_range() - populate (prefault) page tables inside the
  1446. * given VMA range readable/writable
  1447. *
  1448. * This takes care of mlocking the pages, too, if VM_LOCKED is set.
  1449. *
  1450. * @vma: target vma
  1451. * @start: start address
  1452. * @end: end address
1453. * @write: whether to prefault for write access (otherwise read access)
  1454. * @locked: whether the mmap_lock is still held
  1455. *
  1456. * Returns either number of processed pages in the vma, or a negative error
  1457. * code on error (see __get_user_pages()).
  1458. *
  1459. * vma->vm_mm->mmap_lock must be held. The range must be page-aligned and
  1460. * covered by the VMA.
  1461. *
1462. * If @locked is NULL, the mmap_lock may be held for read or write and will be left unperturbed.
1463. *
1464. * If @locked is non-NULL, the mmap_lock must be held for read only and may be
1465. * released. If it is released, *@locked will be set to 0.
  1466. */
  1467. long faultin_vma_page_range(struct vm_area_struct *vma, unsigned long start,
  1468. unsigned long end, bool write, int *locked)
  1469. {
  1470. struct mm_struct *mm = vma->vm_mm;
  1471. unsigned long nr_pages = (end - start) / PAGE_SIZE;
  1472. int gup_flags;
  1473. long ret;
  1474. VM_BUG_ON(!PAGE_ALIGNED(start));
  1475. VM_BUG_ON(!PAGE_ALIGNED(end));
  1476. VM_BUG_ON_VMA(start < vma->vm_start, vma);
  1477. VM_BUG_ON_VMA(end > vma->vm_end, vma);
  1478. mmap_assert_locked(mm);
  1479. /*
  1480. * FOLL_TOUCH: Mark page accessed and thereby young; will also mark
  1481. * the page dirty with FOLL_WRITE -- which doesn't make a
  1482. * difference with !FOLL_FORCE, because the page is writable
  1483. * in the page table.
  1484. * FOLL_HWPOISON: Return -EHWPOISON instead of -EFAULT when we hit
  1485. * a poisoned page.
  1486. * !FOLL_FORCE: Require proper access permissions.
  1487. */
  1488. gup_flags = FOLL_TOUCH | FOLL_HWPOISON;
  1489. if (write)
  1490. gup_flags |= FOLL_WRITE;
  1491. /*
  1492. * We want to report -EINVAL instead of -EFAULT for any permission
  1493. * problems or incompatible mappings.
  1494. */
  1495. if (check_vma_flags(vma, gup_flags))
  1496. return -EINVAL;
  1497. ret = __get_user_pages(mm, start, nr_pages, gup_flags,
  1498. NULL, NULL, locked);
  1499. lru_add_drain();
  1500. return ret;
  1501. }
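/*
 * Context sketch (editorial addition): one path that ends up in
 * faultin_vma_page_range() is madvise(MADV_POPULATE_READ/WRITE), which
 * prefaults a mapping without pinning it. A minimal, hypothetical userspace
 * trigger would be:
 *
 *	buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	madvise(buf, len, MADV_POPULATE_WRITE);	(prefault and break COW)
 *
 * The madvise path walks the VMAs covering [buf, buf + len) and calls this
 * helper once per page-aligned chunk covered by a single VMA.
 */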
  1502. /*
  1503. * __mm_populate - populate and/or mlock pages within a range of address space.
  1504. *
  1505. * This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
  1506. * flags. VMAs must be already marked with the desired vm_flags, and
  1507. * mmap_lock must not be held.
  1508. */
  1509. int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
  1510. {
  1511. struct mm_struct *mm = current->mm;
  1512. unsigned long end, nstart, nend;
  1513. struct vm_area_struct *vma = NULL;
  1514. int locked = 0;
  1515. long ret = 0;
  1516. end = start + len;
  1517. for (nstart = start; nstart < end; nstart = nend) {
  1518. /*
  1519. * We want to fault in pages for [nstart; end) address range.
  1520. * Find first corresponding VMA.
  1521. */
  1522. if (!locked) {
  1523. locked = 1;
  1524. mmap_read_lock(mm);
  1525. vma = find_vma_intersection(mm, nstart, end);
  1526. } else if (nstart >= vma->vm_end)
  1527. vma = find_vma_intersection(mm, vma->vm_end, end);
  1528. if (!vma)
  1529. break;
  1530. /*
  1531. * Set [nstart; nend) to intersection of desired address
  1532. * range with the first VMA. Also, skip undesirable VMA types.
  1533. */
  1534. nend = min(end, vma->vm_end);
  1535. if (vma->vm_flags & (VM_IO | VM_PFNMAP))
  1536. continue;
  1537. if (nstart < vma->vm_start)
  1538. nstart = vma->vm_start;
  1539. /*
  1540. * Now fault in a range of pages. populate_vma_page_range()
  1541. * double checks the vma flags, so that it won't mlock pages
  1542. * if the vma was already munlocked.
  1543. */
  1544. ret = populate_vma_page_range(vma, nstart, nend, &locked);
  1545. if (ret < 0) {
  1546. if (ignore_errors) {
  1547. ret = 0;
  1548. continue; /* continue at next VMA */
  1549. }
  1550. break;
  1551. }
  1552. nend = nstart + ret * PAGE_SIZE;
  1553. ret = 0;
  1554. }
  1555. if (locked)
  1556. mmap_read_unlock(mm);
  1557. return ret; /* 0 or negative error code */
  1558. }
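/*
 * Context sketch (editorial addition): the userspace operations that reach
 * __mm_populate(). The values are hypothetical; the mapping to the
 * ignore_errors argument is as described here:
 *
 *	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *		 MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0);
 *		(after the VMA is set up, the mmap path calls
 *		 __mm_populate(p, len, 1), ignoring per-VMA errors)
 *
 *	mlock(p, len);
 *		(the VMAs are first marked VM_LOCKED, then the mlock path
 *		 calls __mm_populate(p, len, 0) to fault the pages in)
 */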
  1559. #else /* CONFIG_MMU */
  1560. static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
  1561. unsigned long nr_pages, struct page **pages,
  1562. struct vm_area_struct **vmas, int *locked,
  1563. unsigned int foll_flags)
  1564. {
  1565. struct vm_area_struct *vma;
  1566. unsigned long vm_flags;
  1567. long i;
  1568. /* calculate required read or write permissions.
  1569. * If FOLL_FORCE is set, we only require the "MAY" flags.
  1570. */
  1571. vm_flags = (foll_flags & FOLL_WRITE) ?
  1572. (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
  1573. vm_flags &= (foll_flags & FOLL_FORCE) ?
  1574. (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
  1575. for (i = 0; i < nr_pages; i++) {
  1576. vma = find_vma(mm, start);
  1577. if (!vma)
  1578. goto finish_or_fault;
  1579. /* protect what we can, including chardevs */
  1580. if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
  1581. !(vm_flags & vma->vm_flags))
  1582. goto finish_or_fault;
  1583. if (pages) {
  1584. pages[i] = virt_to_page((void *)start);
  1585. if (pages[i])
  1586. get_page(pages[i]);
  1587. }
  1588. if (vmas)
  1589. vmas[i] = vma;
  1590. start = (start + PAGE_SIZE) & PAGE_MASK;
  1591. }
  1592. return i;
  1593. finish_or_fault:
  1594. return i ? : -EFAULT;
  1595. }
  1596. #endif /* !CONFIG_MMU */
  1597. /**
  1598. * fault_in_writeable - fault in userspace address range for writing
  1599. * @uaddr: start of address range
  1600. * @size: size of address range
  1601. *
  1602. * Returns the number of bytes not faulted in (like copy_to_user() and
  1603. * copy_from_user()).
  1604. */
  1605. size_t fault_in_writeable(char __user *uaddr, size_t size)
  1606. {
  1607. char __user *start = uaddr, *end;
  1608. if (unlikely(size == 0))
  1609. return 0;
  1610. if (!user_write_access_begin(uaddr, size))
  1611. return size;
  1612. if (!PAGE_ALIGNED(uaddr)) {
  1613. unsafe_put_user(0, uaddr, out);
  1614. uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
  1615. }
  1616. end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
  1617. if (unlikely(end < start))
  1618. end = NULL;
  1619. while (uaddr != end) {
  1620. unsafe_put_user(0, uaddr, out);
  1621. uaddr += PAGE_SIZE;
  1622. }
  1623. out:
  1624. user_write_access_end();
  1625. if (size > uaddr - start)
  1626. return size - (uaddr - start);
  1627. return 0;
  1628. }
  1629. EXPORT_SYMBOL(fault_in_writeable);
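/*
 * Usage sketch (editorial addition): the retry loop fault_in_writeable() is
 * meant for, where the copy itself runs in a context that must not take page
 * faults (locks held, pagefault_disable(), ...). dst, src, len and left are
 * hypothetical names:
 *
 *	size_t left = len;
 *
 *	do {
 *		pagefault_disable();
 *		left = copy_to_user(dst + (len - left), src + (len - left), left);
 *		pagefault_enable();
 *		if (!left)
 *			break;	(everything copied)
 *	} while (fault_in_writeable(dst + (len - left), left) != left);
 *
 * The loop ends either when the copy completes or when fault_in_writeable()
 * reports that none of the remaining bytes could be faulted in, i.e. the
 * range is genuinely not writable.
 */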
  1630. /**
  1631. * fault_in_subpage_writeable - fault in an address range for writing
  1632. * @uaddr: start of address range
  1633. * @size: size of address range
  1634. *
  1635. * Fault in a user address range for writing while checking for permissions at
  1636. * sub-page granularity (e.g. arm64 MTE). This function should be used when
  1637. * the caller cannot guarantee forward progress of a copy_to_user() loop.
  1638. *
  1639. * Returns the number of bytes not faulted in (like copy_to_user() and
  1640. * copy_from_user()).
  1641. */
  1642. size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
  1643. {
  1644. size_t faulted_in;
  1645. /*
  1646. * Attempt faulting in at page granularity first for page table
  1647. * permission checking. The arch-specific probe_subpage_writeable()
  1648. * functions may not check for this.
  1649. */
  1650. faulted_in = size - fault_in_writeable(uaddr, size);
  1651. if (faulted_in)
  1652. faulted_in -= probe_subpage_writeable(uaddr, faulted_in);
  1653. return size - faulted_in;
  1654. }
  1655. EXPORT_SYMBOL(fault_in_subpage_writeable);
  1656. /*
  1657. * fault_in_safe_writeable - fault in an address range for writing
  1658. * @uaddr: start of address range
  1659. * @size: length of address range
  1660. *
  1661. * Faults in an address range for writing. This is primarily useful when we
  1662. * already know that some or all of the pages in the address range aren't in
  1663. * memory.
  1664. *
  1665. * Unlike fault_in_writeable(), this function is non-destructive.
  1666. *
1667. * Note that we don't pin or otherwise hold the pages that we fault in;
1668. * there's no guarantee that they'll stay in memory for any length of
1669. * time.
  1670. *
  1671. * Returns the number of bytes not faulted in, like copy_to_user() and
  1672. * copy_from_user().
  1673. */
  1674. size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
  1675. {
  1676. unsigned long start = (unsigned long)uaddr, end;
  1677. struct mm_struct *mm = current->mm;
  1678. bool unlocked = false;
  1679. if (unlikely(size == 0))
  1680. return 0;
  1681. end = PAGE_ALIGN(start + size);
  1682. if (end < start)
  1683. end = 0;
  1684. mmap_read_lock(mm);
  1685. do {
  1686. if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
  1687. break;
  1688. start = (start + PAGE_SIZE) & PAGE_MASK;
  1689. } while (start != end);
  1690. mmap_read_unlock(mm);
  1691. if (size > (unsigned long)uaddr - start)
  1692. return size - ((unsigned long)uaddr - start);
  1693. return 0;
  1694. }
  1695. EXPORT_SYMBOL(fault_in_safe_writeable);
  1696. /**
  1697. * fault_in_readable - fault in userspace address range for reading
  1698. * @uaddr: start of user address range
  1699. * @size: size of user address range
  1700. *
  1701. * Returns the number of bytes not faulted in (like copy_to_user() and
  1702. * copy_from_user()).
  1703. */
  1704. size_t fault_in_readable(const char __user *uaddr, size_t size)
  1705. {
  1706. const char __user *start = uaddr, *end;
  1707. volatile char c;
  1708. if (unlikely(size == 0))
  1709. return 0;
  1710. if (!user_read_access_begin(uaddr, size))
  1711. return size;
  1712. if (!PAGE_ALIGNED(uaddr)) {
  1713. unsafe_get_user(c, uaddr, out);
  1714. uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
  1715. }
  1716. end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
  1717. if (unlikely(end < start))
  1718. end = NULL;
  1719. while (uaddr != end) {
  1720. unsafe_get_user(c, uaddr, out);
  1721. uaddr += PAGE_SIZE;
  1722. }
  1723. out:
  1724. user_read_access_end();
  1725. (void)c;
  1726. if (size > uaddr - start)
  1727. return size - (uaddr - start);
  1728. return 0;
  1729. }
  1730. EXPORT_SYMBOL(fault_in_readable);
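/*
 * Usage sketch (editorial addition): the read-side counterpart -- fault the
 * source buffer in *before* entering a section that must not fault, then do
 * the copy with page faults disabled and loop if it still fails. The lock
 * and the names (ubuf, kbuf, len, left) are hypothetical stand-ins:
 *
 * retry:
 *	if (fault_in_readable(ubuf, len) == len)
 *		return -EFAULT;		(nothing could be faulted in)
 *
 *	spin_lock(&lock);		(faulting is not allowed while held)
 *	pagefault_disable();
 *	left = copy_from_user(kbuf, ubuf, len);
 *	pagefault_enable();
 *	spin_unlock(&lock);
 *
 *	if (left)
 *		goto retry;		(page was reclaimed again; rare)
 *
 * Nothing pins the pages, so the copy can still fail after a successful
 * fault-in and the caller must be prepared to loop.
 */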
  1731. /**
  1732. * get_dump_page() - pin user page in memory while writing it to core dump
  1733. * @addr: user address
  1734. *
  1735. * Returns struct page pointer of user page pinned for dump,
  1736. * to be freed afterwards by put_page().
  1737. *
  1738. * Returns NULL on any kind of failure - a hole must then be inserted into
  1739. * the corefile, to preserve alignment with its headers; and also returns
  1740. * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
  1741. * allowing a hole to be left in the corefile to save disk space.
  1742. *
  1743. * Called without mmap_lock (takes and releases the mmap_lock by itself).
  1744. */
  1745. #ifdef CONFIG_ELF_CORE
  1746. struct page *get_dump_page(unsigned long addr)
  1747. {
  1748. struct mm_struct *mm = current->mm;
  1749. struct page *page;
  1750. int locked = 1;
  1751. int ret;
  1752. if (mmap_read_lock_killable(mm))
  1753. return NULL;
  1754. ret = __get_user_pages_locked(mm, addr, 1, &page, NULL, &locked,
  1755. FOLL_FORCE | FOLL_DUMP | FOLL_GET);
  1756. if (locked)
  1757. mmap_read_unlock(mm);
  1758. return (ret == 1) ? page : NULL;
  1759. }
  1760. #endif /* CONFIG_ELF_CORE */
  1761. #ifdef CONFIG_MIGRATION
  1762. /*
  1763. * Returns the number of collected pages. Return value is always >= 0.
  1764. */
  1765. static unsigned long collect_longterm_unpinnable_pages(
  1766. struct list_head *movable_page_list,
  1767. unsigned long nr_pages,
  1768. struct page **pages)
  1769. {
  1770. unsigned long i, collected = 0;
  1771. struct folio *prev_folio = NULL;
  1772. bool drain_allow = true;
  1773. for (i = 0; i < nr_pages; i++) {
  1774. struct folio *folio = page_folio(pages[i]);
  1775. if (folio == prev_folio)
  1776. continue;
  1777. prev_folio = folio;
  1778. if (folio_is_longterm_pinnable(folio))
  1779. continue;
  1780. collected++;
  1781. if (folio_is_device_coherent(folio))
  1782. continue;
  1783. if (folio_test_hugetlb(folio)) {
  1784. isolate_hugetlb(&folio->page, movable_page_list);
  1785. continue;
  1786. }
  1787. if (!folio_test_lru(folio) && drain_allow) {
  1788. lru_add_drain_all();
  1789. drain_allow = false;
  1790. }
  1791. if (folio_isolate_lru(folio))
  1792. continue;
  1793. list_add_tail(&folio->lru, movable_page_list);
  1794. node_stat_mod_folio(folio,
  1795. NR_ISOLATED_ANON + folio_is_file_lru(folio),
  1796. folio_nr_pages(folio));
  1797. }
  1798. return collected;
  1799. }
  1800. /*
1801. * Unpins all pages, and migrates device coherent pages and the pages on
1802. * movable_page_list. Returns -EAGAIN if all pages were successfully migrated,
1803. * or -errno on failure (or partial success).
  1804. */
  1805. static int migrate_longterm_unpinnable_pages(
  1806. struct list_head *movable_page_list,
  1807. unsigned long nr_pages,
  1808. struct page **pages)
  1809. {
  1810. int ret;
  1811. unsigned long i;
  1812. for (i = 0; i < nr_pages; i++) {
  1813. struct folio *folio = page_folio(pages[i]);
  1814. if (folio_is_device_coherent(folio)) {
  1815. /*
  1816. * Migration will fail if the page is pinned, so convert
  1817. * the pin on the source page to a normal reference.
  1818. */
  1819. pages[i] = NULL;
  1820. folio_get(folio);
  1821. gup_put_folio(folio, 1, FOLL_PIN);
  1822. if (migrate_device_coherent_page(&folio->page)) {
  1823. ret = -EBUSY;
  1824. goto err;
  1825. }
  1826. continue;
  1827. }
  1828. /*
  1829. * We can't migrate pages with unexpected references, so drop
  1830. * the reference obtained by __get_user_pages_locked().
  1831. * Migrating pages have been added to movable_page_list after
  1832. * calling folio_isolate_lru() which takes a reference so the
  1833. * page won't be freed if it's migrating.
  1834. */
  1835. unpin_user_page(pages[i]);
  1836. pages[i] = NULL;
  1837. }
  1838. if (!list_empty(movable_page_list)) {
  1839. struct migration_target_control mtc = {
  1840. .nid = NUMA_NO_NODE,
  1841. .gfp_mask = GFP_USER | __GFP_NOWARN,
  1842. };
  1843. if (migrate_pages(movable_page_list, alloc_migration_target,
  1844. NULL, (unsigned long)&mtc, MIGRATE_SYNC,
  1845. MR_LONGTERM_PIN, NULL)) {
  1846. ret = -ENOMEM;
  1847. goto err;
  1848. }
  1849. }
  1850. putback_movable_pages(movable_page_list);
  1851. return -EAGAIN;
  1852. err:
  1853. for (i = 0; i < nr_pages; i++)
  1854. if (pages[i])
  1855. unpin_user_page(pages[i]);
  1856. putback_movable_pages(movable_page_list);
  1857. return ret;
  1858. }
  1859. /*
  1860. * Check whether all pages are *allowed* to be pinned. Rather confusingly, all
  1861. * pages in the range are required to be pinned via FOLL_PIN, before calling
  1862. * this routine.
  1863. *
  1864. * If any pages in the range are not allowed to be pinned, then this routine
  1865. * will migrate those pages away, unpin all the pages in the range and return
  1866. * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then
  1867. * call this routine again.
  1868. *
  1869. * If an error other than -EAGAIN occurs, this indicates a migration failure.
  1870. * The caller should give up, and propagate the error back up the call stack.
  1871. *
  1872. * If everything is OK and all pages in the range are allowed to be pinned, then
  1873. * this routine leaves all pages pinned and returns zero for success.
  1874. */
  1875. static long check_and_migrate_movable_pages(unsigned long nr_pages,
  1876. struct page **pages)
  1877. {
  1878. unsigned long collected;
  1879. LIST_HEAD(movable_page_list);
  1880. collected = collect_longterm_unpinnable_pages(&movable_page_list,
  1881. nr_pages, pages);
  1882. if (!collected)
  1883. return 0;
  1884. return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages,
  1885. pages);
  1886. }
  1887. #else
  1888. static long check_and_migrate_movable_pages(unsigned long nr_pages,
  1889. struct page **pages)
  1890. {
  1891. return 0;
  1892. }
  1893. #endif /* CONFIG_MIGRATION */
  1894. /*
  1895. * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
  1896. * allows us to process the FOLL_LONGTERM flag.
  1897. */
  1898. static long __gup_longterm_locked(struct mm_struct *mm,
  1899. unsigned long start,
  1900. unsigned long nr_pages,
  1901. struct page **pages,
  1902. struct vm_area_struct **vmas,
  1903. unsigned int gup_flags)
  1904. {
  1905. unsigned int flags;
  1906. long rc, nr_pinned_pages;
  1907. if (!(gup_flags & FOLL_LONGTERM))
  1908. return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
  1909. NULL, gup_flags);
  1910. /*
  1911. * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM
  1912. * implies FOLL_PIN (although the reverse is not true). Therefore it is
  1913. * correct to unconditionally call check_and_migrate_movable_pages()
  1914. * which assumes pages have been pinned via FOLL_PIN.
  1915. *
  1916. * Enforce the above reasoning by asserting that FOLL_PIN is set.
  1917. */
  1918. if (WARN_ON(!(gup_flags & FOLL_PIN)))
  1919. return -EINVAL;
  1920. flags = memalloc_pin_save();
  1921. do {
  1922. nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages,
  1923. pages, vmas, NULL,
  1924. gup_flags);
  1925. if (nr_pinned_pages <= 0) {
  1926. rc = nr_pinned_pages;
  1927. break;
  1928. }
  1929. rc = check_and_migrate_movable_pages(nr_pinned_pages, pages);
  1930. } while (rc == -EAGAIN);
  1931. memalloc_pin_restore(flags);
  1932. return rc ? rc : nr_pinned_pages;
  1933. }
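/*
 * Usage sketch (editorial addition): how a driver typically reaches
 * __gup_longterm_locked(), by long-term pinning a user buffer for DMA via
 * the FOLL_PIN wrappers (pin_user_pages() and friends, defined later in
 * this file) and releasing it when the I/O is done. All names below are
 * hypothetical:
 *
 *	npages = DIV_ROUND_UP(len + offset_in_page(uaddr), PAGE_SIZE);
 *	pages = kvmalloc_array(npages, sizeof(*pages), GFP_KERNEL);
 *
 *	mmap_read_lock(current->mm);
 *	pinned = pin_user_pages(uaddr & PAGE_MASK, npages,
 *				FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
 *	mmap_read_unlock(current->mm);
 *	if (pinned != npages)
 *		(unpin any partial result and bail out)
 *
 *	(program the device with the pinned pages, wait for completion)
 *
 *	unpin_user_pages_dirty_lock(pages, pinned, true);
 *	kvfree(pages);
 */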
  1934. static bool is_valid_gup_flags(unsigned int gup_flags)
  1935. {
  1936. /*
  1937. * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
  1938. * never directly by the caller, so enforce that with an assertion:
  1939. */
  1940. if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
  1941. return false;
  1942. /*
1943. * FOLL_PIN is a prerequisite to FOLL_LONGTERM. Another way of saying
1944. * that is that FOLL_LONGTERM is a specific, more restrictive case of
1945. * FOLL_PIN.
  1946. */
  1947. if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
  1948. return false;
  1949. return true;
  1950. }
  1951. #ifdef CONFIG_MMU
  1952. static long __get_user_pages_remote(struct mm_struct *mm,
  1953. unsigned long start, unsigned long nr_pages,
  1954. unsigned int gup_flags, struct page **pages,
  1955. struct vm_area_struct **vmas, int *locked)
  1956. {
  1957. /*
  1958. * Parts of FOLL_LONGTERM behavior are incompatible with
  1959. * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
  1960. * vmas. However, this only comes up if locked is set, and there are
  1961. * callers that do request FOLL_LONGTERM, but do not set locked. So,
  1962. * allow what we can.
  1963. */
  1964. if (gup_flags & FOLL_LONGTERM) {
  1965. if (WARN_ON_ONCE(locked))
  1966. return -EINVAL;
  1967. /*
  1968. * This will check the vmas (even if our vmas arg is NULL)
  1969. * and return -ENOTSUPP if DAX isn't allowed in this case:
  1970. */
  1971. return __gup_longterm_locked(mm, start, nr_pages, pages,
  1972. vmas, gup_flags | FOLL_TOUCH |
  1973. FOLL_REMOTE);
  1974. }
  1975. return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
  1976. locked,
  1977. gup_flags | FOLL_TOUCH | FOLL_REMOTE);
  1978. }
  1979. /**
  1980. * get_user_pages_remote() - pin user pages in memory
  1981. * @mm: mm_struct of target mm
  1982. * @start: starting user address
  1983. * @nr_pages: number of pages from start to pin
  1984. * @gup_flags: flags modifying lookup behaviour
  1985. * @pages: array that receives pointers to the pages pinned.
  1986. * Should be at least nr_pages long. Or NULL, if caller
  1987. * only intends to ensure the pages are faulted in.
  1988. * @vmas: array of pointers to vmas corresponding to each page.
  1989. * Or NULL if the caller does not require them.
  1990. * @locked: pointer to lock flag indicating whether lock is held and
  1991. * subsequently whether VM_FAULT_RETRY functionality can be
  1992. * utilised. Lock must initially be held.
  1993. *
  1994. * Returns either number of pages pinned (which may be less than the
  1995. * number requested), or an error. Details about the return value:
  1996. *
  1997. * -- If nr_pages is 0, returns 0.
  1998. * -- If nr_pages is >0, but no pages were pinned, returns -errno.
  1999. * -- If nr_pages is >0, and some pages were pinned, returns the number of
  2000. * pages pinned. Again, this may be less than nr_pages.
  2001. *
  2002. * The caller is responsible for releasing returned @pages, via put_page().
  2003. *
  2004. * @vmas are valid only as long as mmap_lock is held.
  2005. *
  2006. * Must be called with mmap_lock held for read or write.
  2007. *
  2008. * get_user_pages_remote walks a process's page tables and takes a reference
  2009. * to each struct page that each user address corresponds to at a given
  2010. * instant. That is, it takes the page that would be accessed if a user
  2011. * thread accesses the given user virtual address at that instant.
  2012. *
  2013. * This does not guarantee that the page exists in the user mappings when
  2014. * get_user_pages_remote returns, and there may even be a completely different
2015. * page there in some cases (e.g. if mmapped pagecache has been invalidated
2016. * and subsequently re-faulted). However, it does guarantee that the page
  2017. * won't be freed completely. And mostly callers simply care that the page
  2018. * contains data that was valid *at some point in time*. Typically, an IO
  2019. * or similar operation cannot guarantee anything stronger anyway because
  2020. * locks can't be held over the syscall boundary.
  2021. *
  2022. * If gup_flags & FOLL_WRITE == 0, the page must not be written to. If the page
  2023. * is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
  2024. * be called after the page is finished with, and before put_page is called.
  2025. *
  2026. * get_user_pages_remote is typically used for fewer-copy IO operations,
  2027. * to get a handle on the memory by some means other than accesses
  2028. * via the user virtual addresses. The pages may be submitted for
  2029. * DMA to devices or accessed via their kernel linear mapping (via the
  2030. * kmap APIs). Care should be taken to use the correct cache flushing APIs.
  2031. *
  2032. * See also get_user_pages_fast, for performance critical applications.
  2033. *
  2034. * get_user_pages_remote should be phased out in favor of
  2035. * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
  2036. * should use get_user_pages_remote because it cannot pass
  2037. * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
  2038. */
  2039. long get_user_pages_remote(struct mm_struct *mm,
  2040. unsigned long start, unsigned long nr_pages,
  2041. unsigned int gup_flags, struct page **pages,
  2042. struct vm_area_struct **vmas, int *locked)
  2043. {
  2044. if (!is_valid_gup_flags(gup_flags))
  2045. return -EINVAL;
  2046. return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
  2047. pages, vmas, locked);
  2048. }
  2049. EXPORT_SYMBOL(get_user_pages_remote);
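/*
 * Usage sketch (editorial addition): reading one page of another process's
 * memory in the style of access_remote_vm(). mm, addr, buf and count are
 * hypothetical; note the @locked protocol described above:
 *
 *	struct page *page;
 *	int locked = 1;
 *	long got;
 *
 *	mmap_read_lock(mm);
 *	got = get_user_pages_remote(mm, addr & PAGE_MASK, 1, 0,
 *				    &page, NULL, &locked);
 *	if (locked)
 *		mmap_read_unlock(mm);
 *	if (got == 1) {
 *		void *kaddr = kmap_local_page(page);
 *		size_t n = min(count, PAGE_SIZE - offset_in_page(addr));
 *
 *		memcpy(buf, kaddr + offset_in_page(addr), n);
 *		kunmap_local(kaddr);
 *		put_page(page);
 *	}
 */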
  2050. #else /* CONFIG_MMU */
  2051. long get_user_pages_remote(struct mm_struct *mm,
  2052. unsigned long start, unsigned long nr_pages,
  2053. unsigned int gup_flags, struct page **pages,
  2054. struct vm_area_struct **vmas, int *locked)
  2055. {
  2056. return 0;
  2057. }
  2058. static long __get_user_pages_remote(struct mm_struct *mm,
  2059. unsigned long start, unsigned long nr_pages,
  2060. unsigned int gup_flags, struct page **pages,
  2061. struct vm_area_struct **vmas, int *locked)
  2062. {
  2063. return 0;
  2064. }
  2065. #endif /* !CONFIG_MMU */
  2066. /**
  2067. * get_user_pages() - pin user pages in memory
  2068. * @start: starting user address
  2069. * @nr_pages: number of pages from start to pin
  2070. * @gup_flags: flags modifying lookup behaviour
  2071. * @pages: array that receives pointers to the pages pinned.
  2072. * Should be at least nr_pages long. Or NULL, if caller
  2073. * only intends to ensure the pages are faulted in.
  2074. * @vmas: array of pointers to vmas corresponding to each page.
  2075. * Or NULL if the caller does not require them.
  2076. *
  2077. * This is the same as get_user_pages_remote(), just with a less-flexible
  2078. * calling convention where we assume that the mm being operated on belongs to
  2079. * the current task, and doesn't allow passing of a locked parameter. We also
  2080. * obviously don't pass FOLL_REMOTE in here.
  2081. */
  2082. long get_user_pages(unsigned long start, unsigned long nr_pages,
  2083. unsigned int gup_flags, struct page **pages,
  2084. struct vm_area_struct **vmas)
  2085. {
  2086. if (!is_valid_gup_flags(gup_flags))
  2087. return -EINVAL;
  2088. return __gup_longterm_locked(current->mm, start, nr_pages,
  2089. pages, vmas, gup_flags | FOLL_TOUCH);
  2090. }
  2091. EXPORT_SYMBOL(get_user_pages);
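/*
 * Usage sketch (editorial addition): the write-and-release lifecycle spelled
 * out in the get_user_pages_remote() description above -- pages obtained
 * with FOLL_WRITE and actually written to must be dirtied before release.
 * uaddr, npages, pages, got and i are hypothetical:
 *
 *	mmap_read_lock(current->mm);
 *	got = get_user_pages(uaddr & PAGE_MASK, npages, FOLL_WRITE,
 *			     pages, NULL);
 *	mmap_read_unlock(current->mm);
 *
 *	for (i = 0; i < got; i++) {
 *		(write into pages[i], e.g. via kmap_local_page())
 *		set_page_dirty_lock(pages[i]);
 *		put_page(pages[i]);
 *	}
 */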
  2092. /*
  2093. * get_user_pages_unlocked() is suitable to replace the form:
  2094. *
  2095. * mmap_read_lock(mm);
  2096. * get_user_pages(mm, ..., pages, NULL);
  2097. * mmap_read_unlock(mm);
  2098. *
  2099. * with:
  2100. *
  2101. * get_user_pages_unlocked(mm, ..., pages);
  2102. *
2103. * It is functionally equivalent to get_user_pages_fast, so
  2104. * get_user_pages_fast should be used instead if specific gup_flags
  2105. * (e.g. FOLL_FORCE) are not required.
  2106. */
  2107. long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
  2108. struct page **pages, unsigned int gup_flags)
  2109. {
  2110. struct mm_struct *mm = current->mm;
  2111. int locked = 1;
  2112. long ret;
  2113. /*
  2114. * FIXME: Current FOLL_LONGTERM behavior is incompatible with
  2115. * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
  2116. * vmas. As there are no users of this flag in this call we simply
  2117. * disallow this option for now.
  2118. */
  2119. if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
  2120. return -EINVAL;
  2121. mmap_read_lock(mm);
  2122. ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
  2123. &locked, gup_flags | FOLL_TOUCH);
  2124. if (locked)
  2125. mmap_read_unlock(mm);
  2126. return ret;
  2127. }
  2128. EXPORT_SYMBOL(get_user_pages_unlocked);
  2129. /*
  2130. * Fast GUP
  2131. *
  2132. * get_user_pages_fast attempts to pin user pages by walking the page
  2133. * tables directly and avoids taking locks. Thus the walker needs to be
  2134. * protected from page table pages being freed from under it, and should
  2135. * block any THP splits.
  2136. *
  2137. * One way to achieve this is to have the walker disable interrupts, and
  2138. * rely on IPIs from the TLB flushing code blocking before the page table
  2139. * pages are freed. This is unsuitable for architectures that do not need
  2140. * to broadcast an IPI when invalidating TLBs.
  2141. *
2142. * Another way to achieve this is to batch up the pages containing page tables
2143. * belonging to more than one mm_user, then use rcu_sched to schedule a callback
2144. * that frees those pages. Disabling interrupts allows the fast_gup walker to
2145. * block both the rcu_sched callback and the IPI that we broadcast to split THPs
2146. * (which is a relatively rare event). The code below adopts this strategy.
  2147. *
  2148. * Before activating this code, please be aware that the following assumptions
  2149. * are currently made:
  2150. *
2151. * *) Either MMU_GATHER_RCU_TABLE_FREE is enabled and tlb_remove_table() is used
2152. * to free pages containing page tables, or TLB flushing requires an IPI broadcast.
  2153. *
  2154. * *) ptes can be read atomically by the architecture.
  2155. *
  2156. * *) access_ok is sufficient to validate userspace address ranges.
  2157. *
  2158. * The last two assumptions can be relaxed by the addition of helper functions.
  2159. *
  2160. * This code is based heavily on the PowerPC implementation by Nick Piggin.
  2161. */
  2162. #ifdef CONFIG_HAVE_FAST_GUP
  2163. static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
  2164. unsigned int flags,
  2165. struct page **pages)
  2166. {
  2167. while ((*nr) - nr_start) {
  2168. struct page *page = pages[--(*nr)];
  2169. ClearPageReferenced(page);
  2170. if (flags & FOLL_PIN)
  2171. unpin_user_page(page);
  2172. else
  2173. put_page(page);
  2174. }
  2175. }
  2176. #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
  2177. /*
  2178. * Fast-gup relies on pte change detection to avoid concurrent pgtable
  2179. * operations.
  2180. *
  2181. * To pin the page, fast-gup needs to do below in order:
  2182. * (1) pin the page (by prefetching pte), then (2) check pte not changed.
  2183. *
  2184. * For the rest of pgtable operations where pgtable updates can be racy
  2185. * with fast-gup, we need to do (1) clear pte, then (2) check whether page
  2186. * is pinned.
  2187. *
  2188. * Above will work for all pte-level operations, including THP split.
  2189. *
  2190. * For THP collapse, it's a bit more complicated because fast-gup may be
  2191. * walking a pgtable page that is being freed (pte is still valid but pmd
  2192. * can be cleared already). To avoid race in such condition, we need to
  2193. * also check pmd here to make sure pmd doesn't change (corresponds to
  2194. * pmdp_collapse_flush() in the THP collapse code path).
  2195. */
  2196. static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
  2197. unsigned long end, unsigned int flags,
  2198. struct page **pages, int *nr)
  2199. {
  2200. struct dev_pagemap *pgmap = NULL;
  2201. int nr_start = *nr, ret = 0;
  2202. pte_t *ptep, *ptem;
  2203. ptem = ptep = pte_offset_map(&pmd, addr);
  2204. do {
  2205. pte_t pte = ptep_get_lockless(ptep);
  2206. struct page *page;
  2207. struct folio *folio;
  2208. if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
  2209. goto pte_unmap;
  2210. if (!pte_access_permitted(pte, flags & FOLL_WRITE))
  2211. goto pte_unmap;
  2212. if (pte_devmap(pte)) {
  2213. if (unlikely(flags & FOLL_LONGTERM))
  2214. goto pte_unmap;
  2215. pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
  2216. if (unlikely(!pgmap)) {
  2217. undo_dev_pagemap(nr, nr_start, flags, pages);
  2218. goto pte_unmap;
  2219. }
  2220. } else if (pte_special(pte))
  2221. goto pte_unmap;
  2222. VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
  2223. page = pte_page(pte);
  2224. folio = try_grab_folio(page, 1, flags);
  2225. if (!folio)
  2226. goto pte_unmap;
  2227. if (unlikely(page_is_secretmem(page))) {
  2228. gup_put_folio(folio, 1, flags);
  2229. goto pte_unmap;
  2230. }
  2231. if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
  2232. unlikely(pte_val(pte) != pte_val(*ptep))) {
  2233. gup_put_folio(folio, 1, flags);
  2234. goto pte_unmap;
  2235. }
  2236. if (!pte_write(pte) && gup_must_unshare(flags, page)) {
  2237. gup_put_folio(folio, 1, flags);
  2238. goto pte_unmap;
  2239. }
  2240. /*
  2241. * We need to make the page accessible if and only if we are
  2242. * going to access its content (the FOLL_PIN case). Please
  2243. * see Documentation/core-api/pin_user_pages.rst for
  2244. * details.
  2245. */
  2246. if (flags & FOLL_PIN) {
  2247. ret = arch_make_page_accessible(page);
  2248. if (ret) {
  2249. gup_put_folio(folio, 1, flags);
  2250. goto pte_unmap;
  2251. }
  2252. }
  2253. folio_set_referenced(folio);
  2254. pages[*nr] = page;
  2255. (*nr)++;
  2256. } while (ptep++, addr += PAGE_SIZE, addr != end);
  2257. ret = 1;
  2258. pte_unmap:
  2259. if (pgmap)
  2260. put_dev_pagemap(pgmap);
  2261. pte_unmap(ptem);
  2262. return ret;
  2263. }
  2264. #else
  2265. /*
  2266. * If we can't determine whether or not a pte is special, then fail immediately
  2267. * for ptes. Note, we can still pin HugeTLB and THP as these are guaranteed not
  2268. * to be special.
  2269. *
  2270. * For a futex to be placed on a THP tail page, get_futex_key requires a
  2271. * get_user_pages_fast_only implementation that can pin pages. Thus it's still
  2272. * useful to have gup_huge_pmd even if we can't operate on ptes.
  2273. */
  2274. static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
  2275. unsigned long end, unsigned int flags,
  2276. struct page **pages, int *nr)
  2277. {
  2278. return 0;
  2279. }
  2280. #endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
  2281. #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
  2282. static int __gup_device_huge(unsigned long pfn, unsigned long addr,
  2283. unsigned long end, unsigned int flags,
  2284. struct page **pages, int *nr)
  2285. {
  2286. int nr_start = *nr;
  2287. struct dev_pagemap *pgmap = NULL;
  2288. do {
  2289. struct page *page = pfn_to_page(pfn);
  2290. pgmap = get_dev_pagemap(pfn, pgmap);
  2291. if (unlikely(!pgmap)) {
  2292. undo_dev_pagemap(nr, nr_start, flags, pages);
  2293. break;
  2294. }
  2295. SetPageReferenced(page);
  2296. pages[*nr] = page;
  2297. if (unlikely(!try_grab_page(page, flags))) {
  2298. undo_dev_pagemap(nr, nr_start, flags, pages);
  2299. break;
  2300. }
  2301. (*nr)++;
  2302. pfn++;
  2303. } while (addr += PAGE_SIZE, addr != end);
  2304. put_dev_pagemap(pgmap);
  2305. return addr == end;
  2306. }
  2307. static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
  2308. unsigned long end, unsigned int flags,
  2309. struct page **pages, int *nr)
  2310. {
  2311. unsigned long fault_pfn;
  2312. int nr_start = *nr;
  2313. fault_pfn = pmd_pfn(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
  2314. if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
  2315. return 0;
  2316. if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
  2317. undo_dev_pagemap(nr, nr_start, flags, pages);
  2318. return 0;
  2319. }
  2320. return 1;
  2321. }
  2322. static int __gup_device_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
  2323. unsigned long end, unsigned int flags,
  2324. struct page **pages, int *nr)
  2325. {
  2326. unsigned long fault_pfn;
  2327. int nr_start = *nr;
  2328. fault_pfn = pud_pfn(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
  2329. if (!__gup_device_huge(fault_pfn, addr, end, flags, pages, nr))
  2330. return 0;
  2331. if (unlikely(pud_val(orig) != pud_val(*pudp))) {
  2332. undo_dev_pagemap(nr, nr_start, flags, pages);
  2333. return 0;
  2334. }
  2335. return 1;
  2336. }
  2337. #else
  2338. static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
  2339. unsigned long end, unsigned int flags,
  2340. struct page **pages, int *nr)
  2341. {
  2342. BUILD_BUG();
  2343. return 0;
  2344. }
  2345. static int __gup_device_huge_pud(pud_t pud, pud_t *pudp, unsigned long addr,
  2346. unsigned long end, unsigned int flags,
  2347. struct page **pages, int *nr)
  2348. {
  2349. BUILD_BUG();
  2350. return 0;
  2351. }
  2352. #endif
  2353. static int record_subpages(struct page *page, unsigned long addr,
  2354. unsigned long end, struct page **pages)
  2355. {
  2356. int nr;
  2357. for (nr = 0; addr != end; nr++, addr += PAGE_SIZE)
  2358. pages[nr] = nth_page(page, nr);
  2359. return nr;
  2360. }
  2361. #ifdef CONFIG_ARCH_HAS_HUGEPD
  2362. static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
  2363. unsigned long sz)
  2364. {
  2365. unsigned long __boundary = (addr + sz) & ~(sz-1);
  2366. return (__boundary - 1 < end - 1) ? __boundary : end;
  2367. }
  2368. static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
  2369. unsigned long end, unsigned int flags,
  2370. struct page **pages, int *nr)
  2371. {
  2372. unsigned long pte_end;
  2373. struct page *page;
  2374. struct folio *folio;
  2375. pte_t pte;
  2376. int refs;
  2377. pte_end = (addr + sz) & ~(sz-1);
  2378. if (pte_end < end)
  2379. end = pte_end;
  2380. pte = huge_ptep_get(ptep);
  2381. if (!pte_access_permitted(pte, flags & FOLL_WRITE))
  2382. return 0;
  2383. /* hugepages are never "special" */
  2384. VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
  2385. page = nth_page(pte_page(pte), (addr & (sz - 1)) >> PAGE_SHIFT);
  2386. refs = record_subpages(page, addr, end, pages + *nr);
  2387. folio = try_grab_folio(page, refs, flags);
  2388. if (!folio)
  2389. return 0;
  2390. if (unlikely(pte_val(pte) != pte_val(*ptep))) {
  2391. gup_put_folio(folio, refs, flags);
  2392. return 0;
  2393. }
  2394. if (!pte_write(pte) && gup_must_unshare(flags, &folio->page)) {
  2395. gup_put_folio(folio, refs, flags);
  2396. return 0;
  2397. }
  2398. *nr += refs;
  2399. folio_set_referenced(folio);
  2400. return 1;
  2401. }
  2402. static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
  2403. unsigned int pdshift, unsigned long end, unsigned int flags,
  2404. struct page **pages, int *nr)
  2405. {
  2406. pte_t *ptep;
  2407. unsigned long sz = 1UL << hugepd_shift(hugepd);
  2408. unsigned long next;
  2409. ptep = hugepte_offset(hugepd, addr, pdshift);
  2410. do {
  2411. next = hugepte_addr_end(addr, end, sz);
  2412. if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
  2413. return 0;
  2414. } while (ptep++, addr = next, addr != end);
  2415. return 1;
  2416. }
  2417. #else
  2418. static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
  2419. unsigned int pdshift, unsigned long end, unsigned int flags,
  2420. struct page **pages, int *nr)
  2421. {
  2422. return 0;
  2423. }
  2424. #endif /* CONFIG_ARCH_HAS_HUGEPD */
  2425. static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
  2426. unsigned long end, unsigned int flags,
  2427. struct page **pages, int *nr)
  2428. {
  2429. struct page *page;
  2430. struct folio *folio;
  2431. int refs;
  2432. if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
  2433. return 0;
  2434. if (pmd_devmap(orig)) {
  2435. if (unlikely(flags & FOLL_LONGTERM))
  2436. return 0;
  2437. return __gup_device_huge_pmd(orig, pmdp, addr, end, flags,
  2438. pages, nr);
  2439. }
  2440. page = nth_page(pmd_page(orig), (addr & ~PMD_MASK) >> PAGE_SHIFT);
  2441. refs = record_subpages(page, addr, end, pages + *nr);
  2442. folio = try_grab_folio(page, refs, flags);
  2443. if (!folio)
  2444. return 0;
  2445. if (unlikely(pmd_val(orig) != pmd_val(*pmdp))) {
  2446. gup_put_folio(folio, refs, flags);
  2447. return 0;
  2448. }
  2449. if (!pmd_write(orig) && gup_must_unshare(flags, &folio->page)) {
  2450. gup_put_folio(folio, refs, flags);
  2451. return 0;
  2452. }
  2453. *nr += refs;
  2454. folio_set_referenced(folio);
  2455. return 1;
  2456. }
  2457. static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
  2458. unsigned long end, unsigned int flags,
  2459. struct page **pages, int *nr)
  2460. {
  2461. struct page *page;
  2462. struct folio *folio;
  2463. int refs;
  2464. if (!pud_access_permitted(orig, flags & FOLL_WRITE))
  2465. return 0;
  2466. if (pud_devmap(orig)) {
  2467. if (unlikely(flags & FOLL_LONGTERM))
  2468. return 0;
  2469. return __gup_device_huge_pud(orig, pudp, addr, end, flags,
  2470. pages, nr);
  2471. }
  2472. page = nth_page(pud_page(orig), (addr & ~PUD_MASK) >> PAGE_SHIFT);
  2473. refs = record_subpages(page, addr, end, pages + *nr);
  2474. folio = try_grab_folio(page, refs, flags);
  2475. if (!folio)
  2476. return 0;
  2477. if (unlikely(pud_val(orig) != pud_val(*pudp))) {
  2478. gup_put_folio(folio, refs, flags);
  2479. return 0;
  2480. }
  2481. if (!pud_write(orig) && gup_must_unshare(flags, &folio->page)) {
  2482. gup_put_folio(folio, refs, flags);
  2483. return 0;
  2484. }
  2485. *nr += refs;
  2486. folio_set_referenced(folio);
  2487. return 1;
  2488. }
  2489. static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
  2490. unsigned long end, unsigned int flags,
  2491. struct page **pages, int *nr)
  2492. {
  2493. int refs;
  2494. struct page *page;
  2495. struct folio *folio;
  2496. if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
  2497. return 0;
  2498. BUILD_BUG_ON(pgd_devmap(orig));
  2499. page = nth_page(pgd_page(orig), (addr & ~PGDIR_MASK) >> PAGE_SHIFT);
  2500. refs = record_subpages(page, addr, end, pages + *nr);
  2501. folio = try_grab_folio(page, refs, flags);
  2502. if (!folio)
  2503. return 0;
  2504. if (unlikely(pgd_val(orig) != pgd_val(*pgdp))) {
  2505. gup_put_folio(folio, refs, flags);
  2506. return 0;
  2507. }
  2508. *nr += refs;
  2509. folio_set_referenced(folio);
  2510. return 1;
  2511. }
  2512. static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
  2513. unsigned int flags, struct page **pages, int *nr)
  2514. {
  2515. unsigned long next;
  2516. pmd_t *pmdp;
  2517. pmdp = pmd_offset_lockless(pudp, pud, addr);
  2518. do {
  2519. pmd_t pmd = READ_ONCE(*pmdp);
  2520. next = pmd_addr_end(addr, end);
  2521. if (!pmd_present(pmd))
  2522. return 0;
  2523. if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
  2524. pmd_devmap(pmd))) {
  2525. if (pmd_protnone(pmd) &&
  2526. !gup_can_follow_protnone(flags))
  2527. return 0;
  2528. if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
  2529. pages, nr))
  2530. return 0;
  2531. } else if (unlikely(is_hugepd(__hugepd(pmd_val(pmd))))) {
  2532. /*
2533. * Architectures can use a different format for a hugetlbfs
2534. * pmd than for a THP pmd.
  2535. */
  2536. if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
  2537. PMD_SHIFT, next, flags, pages, nr))
  2538. return 0;
  2539. } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
  2540. return 0;
  2541. } while (pmdp++, addr = next, addr != end);
  2542. return 1;
  2543. }
  2544. static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
  2545. unsigned int flags, struct page **pages, int *nr)
  2546. {
  2547. unsigned long next;
  2548. pud_t *pudp;
  2549. pudp = pud_offset_lockless(p4dp, p4d, addr);
  2550. do {
  2551. pud_t pud = READ_ONCE(*pudp);
  2552. next = pud_addr_end(addr, end);
  2553. if (unlikely(!pud_present(pud)))
  2554. return 0;
  2555. if (unlikely(pud_huge(pud) || pud_devmap(pud))) {
  2556. if (!gup_huge_pud(pud, pudp, addr, next, flags,
  2557. pages, nr))
  2558. return 0;
  2559. } else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
  2560. if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
  2561. PUD_SHIFT, next, flags, pages, nr))
  2562. return 0;
  2563. } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
  2564. return 0;
  2565. } while (pudp++, addr = next, addr != end);
  2566. return 1;
  2567. }
  2568. static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
  2569. unsigned int flags, struct page **pages, int *nr)
  2570. {
  2571. unsigned long next;
  2572. p4d_t *p4dp;
  2573. p4dp = p4d_offset_lockless(pgdp, pgd, addr);
  2574. do {
  2575. p4d_t p4d = READ_ONCE(*p4dp);
  2576. next = p4d_addr_end(addr, end);
  2577. if (p4d_none(p4d))
  2578. return 0;
  2579. BUILD_BUG_ON(p4d_huge(p4d));
  2580. if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
  2581. if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
  2582. P4D_SHIFT, next, flags, pages, nr))
  2583. return 0;
  2584. } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
  2585. return 0;
  2586. } while (p4dp++, addr = next, addr != end);
  2587. return 1;
  2588. }
  2589. static void gup_pgd_range(unsigned long addr, unsigned long end,
  2590. unsigned int flags, struct page **pages, int *nr)
  2591. {
  2592. unsigned long next;
  2593. pgd_t *pgdp;
  2594. pgdp = pgd_offset(current->mm, addr);
  2595. do {
  2596. pgd_t pgd = READ_ONCE(*pgdp);
  2597. next = pgd_addr_end(addr, end);
  2598. if (pgd_none(pgd))
  2599. return;
  2600. if (unlikely(pgd_huge(pgd))) {
  2601. if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
  2602. pages, nr))
  2603. return;
  2604. } else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
  2605. if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
  2606. PGDIR_SHIFT, next, flags, pages, nr))
  2607. return;
  2608. } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
  2609. return;
  2610. } while (pgdp++, addr = next, addr != end);
  2611. }
  2612. #else
  2613. static inline void gup_pgd_range(unsigned long addr, unsigned long end,
  2614. unsigned int flags, struct page **pages, int *nr)
  2615. {
  2616. }
  2617. #endif /* CONFIG_HAVE_FAST_GUP */
  2618. #ifndef gup_fast_permitted
  2619. /*
  2620. * Check if it's allowed to use get_user_pages_fast_only() for the range, or
  2621. * we need to fall back to the slow version:
  2622. */
  2623. static bool gup_fast_permitted(unsigned long start, unsigned long end)
  2624. {
  2625. return true;
  2626. }
  2627. #endif
  2628. static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
  2629. unsigned int gup_flags, struct page **pages)
  2630. {
  2631. int ret;
  2632. /*
  2633. * FIXME: FOLL_LONGTERM does not work with
  2634. * get_user_pages_unlocked() (see comments in that function)
  2635. */
  2636. if (gup_flags & FOLL_LONGTERM) {
  2637. mmap_read_lock(current->mm);
  2638. ret = __gup_longterm_locked(current->mm,
  2639. start, nr_pages,
  2640. pages, NULL, gup_flags);
  2641. mmap_read_unlock(current->mm);
  2642. } else {
  2643. ret = get_user_pages_unlocked(start, nr_pages,
  2644. pages, gup_flags);
  2645. }
  2646. return ret;
  2647. }
  2648. static unsigned long lockless_pages_from_mm(unsigned long start,
  2649. unsigned long end,
  2650. unsigned int gup_flags,
  2651. struct page **pages)
  2652. {
  2653. unsigned long flags;
  2654. int nr_pinned = 0;
  2655. unsigned seq;
  2656. if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
  2657. !gup_fast_permitted(start, end))
  2658. return 0;
  2659. if (gup_flags & FOLL_PIN) {
  2660. seq = raw_read_seqcount(&current->mm->write_protect_seq);
  2661. if (seq & 1)
  2662. return 0;
  2663. }
  2664. /*
  2665. * Disable interrupts. The nested form is used, in order to allow full,
  2666. * general purpose use of this routine.
  2667. *
  2668. * With interrupts disabled, we block page table pages from being freed
  2669. * from under us. See struct mmu_table_batch comments in
  2670. * include/asm-generic/tlb.h for more details.
  2671. *
  2672. * We do not adopt an rcu_read_lock() here as we also want to block IPIs
  2673. * that come from THPs splitting.
  2674. */
  2675. local_irq_save(flags);
  2676. gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
  2677. local_irq_restore(flags);
  2678. /*
2679. * When pinning pages for DMA there could be a concurrent write protect
2680. * from fork() via copy_page_range(); in this case, always fail fast GUP.
  2681. */
  2682. if (gup_flags & FOLL_PIN) {
  2683. if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
  2684. unpin_user_pages_lockless(pages, nr_pinned);
  2685. return 0;
  2686. } else {
  2687. sanity_check_pinned_pages(pages, nr_pinned);
  2688. }
  2689. }
  2690. return nr_pinned;
  2691. }
static int internal_get_user_pages_fast(unsigned long start,
					unsigned long nr_pages,
					unsigned int gup_flags,
					struct page **pages)
{
	unsigned long len, end;
	unsigned long nr_pinned;
	int ret;

	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
				       FOLL_FORCE | FOLL_PIN | FOLL_GET |
				       FOLL_FAST_ONLY | FOLL_NOFAULT)))
		return -EINVAL;

	if (gup_flags & FOLL_PIN)
		mm_set_has_pinned_flag(&current->mm->flags);

	if (!(gup_flags & FOLL_FAST_ONLY))
		might_lock_read(&current->mm->mmap_lock);

	start = untagged_addr(start) & PAGE_MASK;
	len = nr_pages << PAGE_SHIFT;
	if (check_add_overflow(start, len, &end))
		return 0;
	if (unlikely(!access_ok((void __user *)start, len)))
		return -EFAULT;

	nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
	if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
		return nr_pinned;

	/* Slow path: try to get the remaining pages with get_user_pages */
	start += nr_pinned << PAGE_SHIFT;
	pages += nr_pinned;
	ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
				      pages);
	if (ret < 0) {
		/*
		 * The caller has to unpin the pages we already pinned so
		 * returning -errno is not an option
		 */
		if (nr_pinned)
			return nr_pinned;
		return ret;
	}
	return ret + nr_pinned;
}
/**
 * get_user_pages_fast_only() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
 * the regular GUP.
 * Note a difference with get_user_pages_fast: this always returns the
 * number of pages pinned, 0 if no pages were pinned.
 *
 * If the architecture does not support this function, simply return with no
 * pages pinned.
 *
 * Careful, careful! COW breaking can go either way, so a non-write
 * access can get ambiguous page results. If you call this function without
 * 'write' set, you'd better be sure that you're ok with that ambiguity.
 */
int get_user_pages_fast_only(unsigned long start, int nr_pages,
			     unsigned int gup_flags, struct page **pages)
{
	int nr_pinned;
	/*
	 * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
	 * because gup fast is always a "pin with a +1 page refcount" request.
	 *
	 * FOLL_FAST_ONLY is required in order to match the API description of
	 * this routine: no fall back to regular ("slow") GUP.
	 */
	gup_flags |= FOLL_GET | FOLL_FAST_ONLY;

	nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
						 pages);

	/*
	 * As specified in the API description above, this routine is not
	 * allowed to return negative values. However, the common core
	 * routine internal_get_user_pages_fast() *can* return -errno.
	 * Therefore, correct for that here:
	 */
	if (nr_pinned < 0)
		nr_pinned = 0;

	return nr_pinned;
}
EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
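
/*
 * Illustrative usage sketch only, not part of gup.c: a caller that must not
 * sleep (e.g. it holds a spinlock or runs with IRQs disabled) tries to grab
 * a single page and defers to its own sleepable path if that fails.
 * demo_resolve_page() is a hypothetical name.
 */
#if 0	/* example only, not built */
static struct page *demo_resolve_page(unsigned long uaddr, bool write)
{
	struct page *page;
	int ret;

	/* Never falls back to the sleeping slow path, never returns < 0. */
	ret = get_user_pages_fast_only(uaddr, 1, write ? FOLL_WRITE : 0,
				       &page);
	if (ret != 1)
		return NULL;	/* caller must retry from a sleepable context */

	return page;		/* release later with put_page(page) */
}
#endif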
/**
 * get_user_pages_fast() - pin user pages in memory
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Attempt to pin user pages in memory without taking mm->mmap_lock.
 * If not successful, it will fall back to taking the lock and
 * calling get_user_pages().
 *
 * Returns number of pages pinned. This may be fewer than the number requested.
 * If nr_pages is 0 or negative, returns 0. If no pages were pinned, returns
 * -errno.
 */
int get_user_pages_fast(unsigned long start, int nr_pages,
			unsigned int gup_flags, struct page **pages)
{
	if (!is_valid_gup_flags(gup_flags))
		return -EINVAL;

	/*
	 * The caller may or may not have explicitly set FOLL_GET; either way is
	 * OK. However, internally (within mm/gup.c), gup fast variants must set
	 * FOLL_GET, because gup fast is always a "pin with a +1 page refcount"
	 * request.
	 */
	gup_flags |= FOLL_GET;
	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
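
/*
 * Illustrative usage sketch only, not part of gup.c: pin a user buffer via
 * the fast path (falling back internally to the mmap_lock path as needed),
 * use it, and drop the references with put_page() since this is a
 * FOLL_GET-style pin. demo_with_user_buffer() is a hypothetical name.
 */
#if 0	/* example only, not built */
static int demo_with_user_buffer(unsigned long uaddr, int nr_pages)
{
	struct page **pages;
	int i, pinned;

	pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	pinned = get_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned < 0) {
		kfree(pages);
		return pinned;		/* nothing was pinned */
	}

	/* ... operate on pages[0..pinned-1]; may be fewer than requested ... */

	for (i = 0; i < pinned; i++)
		put_page(pages[i]);
	kfree(pages);
	return pinned == nr_pages ? 0 : -EFAULT;
}
#endif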
/**
 * pin_user_pages_fast() - pin user pages in memory without taking locks
 *
 * @start:      starting user address
 * @nr_pages:   number of pages from start to pin
 * @gup_flags:  flags modifying pin behaviour
 * @pages:      array that receives pointers to the pages pinned.
 *              Should be at least nr_pages long.
 *
 * Nearly the same as get_user_pages_fast(), except that FOLL_PIN is set. See
 * get_user_pages_fast() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for further details.
 */
int pin_user_pages_fast(unsigned long start, int nr_pages,
			unsigned int gup_flags, struct page **pages)
{
	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
		return -EINVAL;

	if (WARN_ON_ONCE(!pages))
		return -EINVAL;

	gup_flags |= FOLL_PIN;
	return internal_get_user_pages_fast(start, nr_pages, gup_flags, pages);
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
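
/*
 * Illustrative usage sketch only, not part of gup.c: a FOLL_PIN user, e.g.
 * setting pages up as a DMA target, must balance each successful pin with
 * unpin_user_page()/unpin_user_pages(), not put_page(). demo_pin_for_dma()
 * is a hypothetical name.
 */
#if 0	/* example only, not built */
static int demo_pin_for_dma(unsigned long uaddr, int nr_pages,
			    struct page **pages)
{
	int pinned;

	pinned = pin_user_pages_fast(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned < 0)
		return pinned;

	/* ... program the device with pages[0..pinned-1] ... */

	/* when the DMA has completed: mark dirty and release the pins */
	unpin_user_pages_dirty_lock(pages, pinned, true);
	return 0;
}
#endif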
/*
 * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
 * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
 *
 * The API rules are the same, too: no negative values may be returned.
 */
int pin_user_pages_fast_only(unsigned long start, int nr_pages,
			     unsigned int gup_flags, struct page **pages)
{
	int nr_pinned;

	/*
	 * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
	 * rules require returning 0, rather than -errno:
	 */
	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
		return 0;

	if (WARN_ON_ONCE(!pages))
		return 0;
	/*
	 * FOLL_FAST_ONLY is required in order to match the API description of
	 * this routine: no fall back to regular ("slow") GUP.
	 */
	gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
	nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
						 pages);
	/*
	 * This routine is not allowed to return negative values. However,
	 * internal_get_user_pages_fast() *can* return -errno. Therefore,
	 * correct for that here:
	 */
	if (nr_pinned < 0)
		nr_pinned = 0;

	return nr_pinned;
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
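
/*
 * Illustrative usage sketch only, not part of gup.c: like
 * get_user_pages_fast_only(), this may legitimately pin fewer pages than
 * requested (and never returns a negative value), so an all-or-nothing
 * caller has to undo a partial pin itself. demo_pin_all_or_nothing() is a
 * hypothetical name.
 */
#if 0	/* example only, not built */
static bool demo_pin_all_or_nothing(unsigned long uaddr, int nr_pages,
				    struct page **pages)
{
	int pinned;

	pinned = pin_user_pages_fast_only(uaddr, nr_pages, FOLL_WRITE, pages);
	if (pinned == nr_pages)
		return true;

	/* partial (or zero) result: release what was pinned and give up */
	unpin_user_pages(pages, pinned);
	return false;
}
#endif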
/**
 * pin_user_pages_remote() - pin pages of a remote process
 *
 * @mm:		mm_struct of target mm
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 * @locked:	pointer to lock flag indicating whether lock is held and
 *		subsequently whether VM_FAULT_RETRY functionality can be
 *		utilised. Lock must initially be held.
 *
 * Nearly the same as get_user_pages_remote(), except that FOLL_PIN is set. See
 * get_user_pages_remote() for documentation on the function arguments, because
 * the arguments here are identical.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 */
long pin_user_pages_remote(struct mm_struct *mm,
			   unsigned long start, unsigned long nr_pages,
			   unsigned int gup_flags, struct page **pages,
			   struct vm_area_struct **vmas, int *locked)
{
	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
		return -EINVAL;

	if (WARN_ON_ONCE(!pages))
		return -EINVAL;

	gup_flags |= FOLL_PIN;
	return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
				       pages, vmas, locked);
}
EXPORT_SYMBOL(pin_user_pages_remote);
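
/*
 * Illustrative usage sketch only, not part of gup.c: pinning a page in
 * another process's address space. The caller takes the target mm's
 * mmap_lock up front (as the kernel-doc above requires) and must check
 * *locked afterwards, because the call may drop the lock while faulting.
 * demo_pin_remote_page() is a hypothetical name.
 */
#if 0	/* example only, not built */
static long demo_pin_remote_page(struct mm_struct *mm, unsigned long uaddr,
				 struct page **page)
{
	int locked = 1;
	long ret;

	mmap_read_lock(mm);
	ret = pin_user_pages_remote(mm, uaddr, 1, FOLL_WRITE, page, NULL,
				    &locked);
	if (locked)
		mmap_read_unlock(mm);

	return ret;	/* on success, release with unpin_user_page(*page) */
}
#endif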
/**
 * pin_user_pages() - pin user pages in memory for use by other devices
 *
 * @start:	starting user address
 * @nr_pages:	number of pages from start to pin
 * @gup_flags:	flags modifying lookup behaviour
 * @pages:	array that receives pointers to the pages pinned.
 *		Should be at least nr_pages long.
 * @vmas:	array of pointers to vmas corresponding to each page.
 *		Or NULL if the caller does not require them.
 *
 * Nearly the same as get_user_pages(), except that FOLL_TOUCH is not set, and
 * FOLL_PIN is set.
 *
 * FOLL_PIN means that the pages must be released via unpin_user_page(). Please
 * see Documentation/core-api/pin_user_pages.rst for details.
 */
long pin_user_pages(unsigned long start, unsigned long nr_pages,
		    unsigned int gup_flags, struct page **pages,
		    struct vm_area_struct **vmas)
{
	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
		return -EINVAL;

	if (WARN_ON_ONCE(!pages))
		return -EINVAL;

	gup_flags |= FOLL_PIN;
	return __gup_longterm_locked(current->mm, start, nr_pages,
				     pages, vmas, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
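
/*
 * Illustrative usage sketch only, not part of gup.c: long-term pinning of a
 * user buffer for device access. In this variant the caller is assumed to
 * hold current->mm's mmap_lock across the call, matching get_user_pages()
 * (compare pin_user_pages_fast(), which takes no locks).
 * demo_register_buffer() is a hypothetical name.
 */
#if 0	/* example only, not built */
static long demo_register_buffer(unsigned long uaddr, unsigned long nr_pages,
				 struct page **pages)
{
	long pinned;

	mmap_read_lock(current->mm);
	pinned = pin_user_pages(uaddr, nr_pages,
				FOLL_WRITE | FOLL_LONGTERM, pages, NULL);
	mmap_read_unlock(current->mm);

	return pinned;	/* balance each pinned page with unpin_user_page() */
}
#endif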
/*
 * pin_user_pages_unlocked() is the FOLL_PIN variant of
 * get_user_pages_unlocked(). Behavior is the same, except that this one sets
 * FOLL_PIN and rejects FOLL_GET.
 */
long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
			     struct page **pages, unsigned int gup_flags)
{
	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
	if (WARN_ON_ONCE(gup_flags & FOLL_GET))
		return -EINVAL;

	if (WARN_ON_ONCE(!pages))
		return -EINVAL;

	gup_flags |= FOLL_PIN;
	return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages_unlocked);
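
/*
 * Illustrative usage sketch only, not part of gup.c: the unlocked variant
 * takes (and drops) mmap_lock internally, so the caller only supplies the
 * range and flags. Note the argument order: pages before gup_flags.
 * demo_pin_one_unlocked() is a hypothetical name.
 */
#if 0	/* example only, not built */
static long demo_pin_one_unlocked(unsigned long uaddr, struct page **page)
{
	long ret;

	ret = pin_user_pages_unlocked(uaddr, 1, page, FOLL_WRITE);
	if (ret == 1)
		unpin_user_page(*page);	/* released right away, demo only */

	return ret;
}
#endif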