// SPDX-License-Identifier: GPL-2.0-only
/*
 * mm/userfaultfd.c
 *
 * Copyright (C) 2015 Red Hat, Inc.
 */

#include <linux/mm.h>
#include <linux/sched/signal.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/userfaultfd_k.h>
#include <linux/mmu_notifier.h>
#include <linux/hugetlb.h>
#include <linux/shmem_fs.h>
#include <asm/tlbflush.h>
#include <asm/tlb.h>
#include "internal.h"

static __always_inline
struct vm_area_struct *find_dst_vma(struct mm_struct *dst_mm,
                                    unsigned long dst_start,
                                    unsigned long len)
{
        /*
         * Make sure that the dst range is both valid and fully within a
         * single existing vma.
         */
        struct vm_area_struct *dst_vma;

        dst_vma = find_vma(dst_mm, dst_start);
        if (!dst_vma)
                return NULL;
        if (dst_start < dst_vma->vm_start ||
            dst_start + len > dst_vma->vm_end)
                return NULL;

        /*
         * Check the vma is registered in uffd, this is required to
         * enforce the VM_MAYWRITE check done at uffd registration
         * time.
         */
        if (!dst_vma->vm_userfaultfd_ctx.ctx)
                return NULL;

        return dst_vma;
}

/*
 * Install PTEs, to map dst_addr (within dst_vma) to page.
 *
 * This function handles both MCOPY_ATOMIC_NORMAL and _CONTINUE for both shmem
 * and anon, and for both shared and private VMAs.
 */
int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
                             struct vm_area_struct *dst_vma,
                             unsigned long dst_addr, struct page *page,
                             bool newly_allocated, bool wp_copy)
{
        int ret;
        pte_t _dst_pte, *dst_pte;
        bool writable = dst_vma->vm_flags & VM_WRITE;
        bool vm_shared = dst_vma->vm_flags & VM_SHARED;
        bool page_in_cache = page_mapping(page);
        spinlock_t *ptl;
        struct inode *inode;
        pgoff_t offset, max_off;

        _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
        _dst_pte = pte_mkdirty(_dst_pte);
        if (page_in_cache && !vm_shared)
                writable = false;

        /*
         * Always mark a PTE as write-protected when needed, regardless of
         * VM_WRITE, which the user might change.
         */
        if (wp_copy) {
                _dst_pte = pte_mkuffd_wp(_dst_pte);
                writable = false;
        }

        if (writable)
                _dst_pte = pte_mkwrite(_dst_pte);
        else
                /*
                 * We need this to make sure the write bit is removed;
                 * mk_pte() could return a pte with the write bit set.
                 */
                _dst_pte = pte_wrprotect(_dst_pte);

        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);

        if (vma_is_shmem(dst_vma)) {
                /* serialize against truncate with the page table lock */
                inode = dst_vma->vm_file->f_inode;
                offset = linear_page_index(dst_vma, dst_addr);
                max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
                ret = -EFAULT;
                if (unlikely(offset >= max_off))
                        goto out_unlock;
        }

        ret = -EEXIST;
        /*
         * We allow overwriting a pte marker: consider the case where both
         * MISSING|WP are registered, we first wr-protect a none pte which
         * has no page cache page backing it, then access the page.
         */
        if (!pte_none_mostly(*dst_pte))
                goto out_unlock;

        if (page_in_cache) {
                /* Usually, cache pages are already added to LRU */
                if (newly_allocated)
                        lru_cache_add(page);
                page_add_file_rmap(page, dst_vma, false);
        } else {
                page_add_new_anon_rmap(page, dst_vma, dst_addr);
                lru_cache_add_inactive_or_unevictable(page, dst_vma);
        }

        /*
         * Must happen after rmap, as mm_counter() checks mapping (via
         * PageAnon()), which is set by __page_set_anon_rmap().
         */
        inc_mm_counter(dst_mm, mm_counter(page));

        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);

        /* No need to invalidate - it was non-present before */
        update_mmu_cache(dst_vma, dst_addr, dst_pte);
        ret = 0;
out_unlock:
        pte_unmap_unlock(dst_pte, ptl);
        return ret;
}

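/*
 * For reference, a minimal sketch of how the two call sites in this file use
 * mfill_atomic_install_pte() (the arguments mirror the real calls further
 * down; this is not a new interface):
 *
 *      // UFFDIO_COPY into a private/anon mapping: the page was just
 *      // allocated, so newly_allocated is true and it still has to be
 *      // put on the LRU.
 *      mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
 *                               page, true, wp_copy);
 *
 *      // UFFDIO_CONTINUE: the page already sits in the shmem page cache,
 *      // so newly_allocated is false and only the PTE is installed.
 *      mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
 *                               page, false, wp_copy);
 */
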
static int mcopy_atomic_pte(struct mm_struct *dst_mm,
                            pmd_t *dst_pmd,
                            struct vm_area_struct *dst_vma,
                            unsigned long dst_addr,
                            unsigned long src_addr,
                            struct page **pagep,
                            bool wp_copy)
{
        void *page_kaddr;
        int ret;
        struct page *page;

        if (!*pagep) {
                ret = -ENOMEM;
                page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, dst_vma, dst_addr);
                if (!page)
                        goto out;

                page_kaddr = kmap_local_page(page);
                /*
                 * The read mmap_lock is held here.  Despite the
                 * mmap_lock being read-recursive, a deadlock is still
                 * possible if a writer has taken a lock.  For example:
                 *
                 * process A thread 1 takes read lock on own mmap_lock
                 * process A thread 2 calls mmap, blocks taking write lock
                 * process B thread 1 takes page fault, read lock on own mmap lock
                 * process B thread 2 calls mmap, blocks taking write lock
                 * process A thread 1 blocks taking read lock on process B
                 * process B thread 1 blocks taking read lock on process A
                 *
                 * Disable page faults to prevent potential deadlock
                 * and retry the copy outside the mmap_lock.
                 */
                pagefault_disable();
                ret = copy_from_user(page_kaddr,
                                     (const void __user *) src_addr,
                                     PAGE_SIZE);
                pagefault_enable();
                kunmap_local(page_kaddr);

                /* fallback to copy_from_user outside mmap_lock */
                if (unlikely(ret)) {
                        ret = -ENOENT;
                        *pagep = page;
                        /* don't free the page */
                        goto out;
                }

                flush_dcache_page(page);
        } else {
                page = *pagep;
                *pagep = NULL;
        }

        /*
         * The memory barrier inside __SetPageUptodate makes sure that
         * preceding stores to the page contents become visible before
         * the set_pte_at() write.
         */
        __SetPageUptodate(page);

        ret = -ENOMEM;
        if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL))
                goto out_release;

        ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
                                       page, true, wp_copy);
        if (ret)
                goto out_release;
out:
        return ret;
out_release:
        put_page(page);
        goto out;
}

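/*
 * A rough sketch of the caller side of the -ENOENT contract above (the real
 * loop lives in __mcopy_atomic() further down; error handling trimmed):
 *
 *      err = mfill_atomic_pte(..., &page, mcopy_mode, wp_copy);
 *      if (err == -ENOENT) {
 *              mmap_read_unlock(dst_mm);
 *              page_kaddr = kmap_local_page(page);
 *              copy_from_user(page_kaddr, src, PAGE_SIZE);  // faults allowed here
 *              kunmap_local(page_kaddr);
 *              goto retry;     // re-take mmap_lock, pass the filled page back in
 *      }
 */
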
static int mfill_zeropage_pte(struct mm_struct *dst_mm,
                              pmd_t *dst_pmd,
                              struct vm_area_struct *dst_vma,
                              unsigned long dst_addr)
{
        pte_t _dst_pte, *dst_pte;
        spinlock_t *ptl;
        int ret;
        pgoff_t offset, max_off;
        struct inode *inode;

        _dst_pte = pte_mkspecial(pfn_pte(my_zero_pfn(dst_addr),
                                         dst_vma->vm_page_prot));
        dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
        if (dst_vma->vm_file) {
                /* the shmem MAP_PRIVATE case requires checking the i_size */
                inode = dst_vma->vm_file->f_inode;
                offset = linear_page_index(dst_vma, dst_addr);
                max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
                ret = -EFAULT;
                if (unlikely(offset >= max_off))
                        goto out_unlock;
        }
        ret = -EEXIST;
        if (!pte_none(*dst_pte))
                goto out_unlock;
        set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(dst_vma, dst_addr, dst_pte);
        ret = 0;
out_unlock:
        pte_unmap_unlock(dst_pte, ptl);
        return ret;
}

/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
                                pmd_t *dst_pmd,
                                struct vm_area_struct *dst_vma,
                                unsigned long dst_addr,
                                bool wp_copy)
{
        struct inode *inode = file_inode(dst_vma->vm_file);
        pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
        struct folio *folio;
        struct page *page;
        int ret;

        ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
        /* Our caller expects us to return -EFAULT if we failed to find the folio */
        if (ret == -ENOENT)
                ret = -EFAULT;
        if (ret)
                goto out;
        if (!folio) {
                ret = -EFAULT;
                goto out;
        }

        page = folio_file_page(folio, pgoff);
        if (PageHWPoison(page)) {
                ret = -EIO;
                goto out_release;
        }

        ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
                                       page, false, wp_copy);
        if (ret)
                goto out_release;

        folio_unlock(folio);
        ret = 0;
out:
        return ret;
out_release:
        folio_unlock(folio);
        folio_put(folio);
        goto out;
}

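/*
 * In other words, mcontinue_atomic_pte() performs minor-fault resolution:
 * the data is already present in the shmem page cache (hence SGP_NOALLOC and
 * no copy), only the page table entry is missing.
 */
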
static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
{
        pgd_t *pgd;
        p4d_t *p4d;
        pud_t *pud;

        pgd = pgd_offset(mm, address);
        p4d = p4d_alloc(mm, pgd, address);
        if (!p4d)
                return NULL;
        pud = pud_alloc(mm, p4d, address);
        if (!pud)
                return NULL;
        /*
         * Note that this does not run only because the pmd was missing;
         * *pmd may already be established and in turn it may also be a
         * trans_huge_pmd.
         */
        return pmd_alloc(mm, pud, address);
}

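/*
 * mm_alloc_pmd() only walks/allocates the upper levels (pgd -> p4d -> pud ->
 * pmd); on architectures with fewer paging levels some of these steps are
 * folded away.  The PTE level is allocated later by the caller via
 * __pte_alloc(), once the pmd is known not to be a transparent huge page.
 */
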
#ifdef CONFIG_HUGETLB_PAGE
/*
 * __mcopy_atomic processing for HUGETLB vmas.  Note that this routine is
 * called with mmap_lock held, it will release mmap_lock before returning.
 */
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
                                                      struct vm_area_struct *dst_vma,
                                                      unsigned long dst_start,
                                                      unsigned long src_start,
                                                      unsigned long len,
                                                      enum mcopy_atomic_mode mode,
                                                      bool wp_copy)
{
        int vm_shared = dst_vma->vm_flags & VM_SHARED;
        ssize_t err;
        pte_t *dst_pte;
        unsigned long src_addr, dst_addr;
        long copied;
        struct page *page;
        unsigned long vma_hpagesize;
        pgoff_t idx;
        u32 hash;
        struct address_space *mapping;

        /*
         * There is no default zero huge page for all huge page sizes as
         * supported by hugetlb.  A PMD_SIZE huge page may exist as used
         * by THP.  Since we cannot reliably insert a zero page, this
         * feature is not supported.
         */
        if (mode == MCOPY_ATOMIC_ZEROPAGE) {
                mmap_read_unlock(dst_mm);
                return -EINVAL;
        }

        src_addr = src_start;
        dst_addr = dst_start;
        copied = 0;
        page = NULL;
        vma_hpagesize = vma_kernel_pagesize(dst_vma);

        /*
         * Validate alignment based on huge page size
         */
        err = -EINVAL;
        if (dst_start & (vma_hpagesize - 1) || len & (vma_hpagesize - 1))
                goto out_unlock;

retry:
        /*
         * On routine entry dst_vma is set.  If we had to drop mmap_lock and
         * retry, dst_vma will be set to NULL and we must look it up again.
         */
        if (!dst_vma) {
                err = -ENOENT;
                dst_vma = find_dst_vma(dst_mm, dst_start, len);
                if (!dst_vma || !is_vm_hugetlb_page(dst_vma))
                        goto out_unlock;

                err = -EINVAL;
                if (vma_hpagesize != vma_kernel_pagesize(dst_vma))
                        goto out_unlock;

                vm_shared = dst_vma->vm_flags & VM_SHARED;
        }

        /*
         * If not shared, ensure the dst_vma has an anon_vma.
         */
        err = -ENOMEM;
        if (!vm_shared) {
                if (unlikely(anon_vma_prepare(dst_vma)))
                        goto out_unlock;
        }

        while (src_addr < src_start + len) {
                BUG_ON(dst_addr >= dst_start + len);

                /*
                 * Serialize via vma_lock and hugetlb_fault_mutex.
                 * vma_lock ensures the dst_pte remains valid even
                 * in the case of shared pmds.  The fault mutex
                 * prevents races with other faulting threads.
                 */
                idx = linear_page_index(dst_vma, dst_addr);
                mapping = dst_vma->vm_file->f_mapping;
                hash = hugetlb_fault_mutex_hash(mapping, idx);
                mutex_lock(&hugetlb_fault_mutex_table[hash]);
                hugetlb_vma_lock_read(dst_vma);

                err = -ENOMEM;
                dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize);
                if (!dst_pte) {
                        hugetlb_vma_unlock_read(dst_vma);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        goto out_unlock;
                }

                if (mode != MCOPY_ATOMIC_CONTINUE &&
                    !huge_pte_none_mostly(huge_ptep_get(dst_pte))) {
                        err = -EEXIST;
                        hugetlb_vma_unlock_read(dst_vma);
                        mutex_unlock(&hugetlb_fault_mutex_table[hash]);
                        goto out_unlock;
                }

                err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
                                               dst_addr, src_addr, mode, &page,
                                               wp_copy);

                hugetlb_vma_unlock_read(dst_vma);
                mutex_unlock(&hugetlb_fault_mutex_table[hash]);

                cond_resched();

                if (unlikely(err == -ENOENT)) {
                        mmap_read_unlock(dst_mm);
                        BUG_ON(!page);

                        err = copy_huge_page_from_user(page,
                                                       (const void __user *)src_addr,
                                                       vma_hpagesize / PAGE_SIZE,
                                                       true);
                        if (unlikely(err)) {
                                err = -EFAULT;
                                goto out;
                        }
                        mmap_read_lock(dst_mm);

                        dst_vma = NULL;
                        goto retry;
                } else
                        BUG_ON(page);

                if (!err) {
                        dst_addr += vma_hpagesize;
                        src_addr += vma_hpagesize;
                        copied += vma_hpagesize;

                        if (fatal_signal_pending(current))
                                err = -EINTR;
                }
                if (err)
                        break;
        }

out_unlock:
        mmap_read_unlock(dst_mm);
out:
        if (page)
                put_page(page);
        BUG_ON(copied < 0);
        BUG_ON(err > 0);
        BUG_ON(!copied && !err);
        return copied ? copied : err;
}

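/*
 * Summary of the restrictions enforced above for hugetlb destinations:
 * dst_start and len must both be multiples of the vma's huge page size, and
 * MCOPY_ATOMIC_ZEROPAGE (UFFDIO_ZEROPAGE) is rejected with -EINVAL because
 * there is no generic huge zero page to insert.
 */
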
#else /* !CONFIG_HUGETLB_PAGE */
/* fail at build time if gcc attempts to use this */
extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
                                      struct vm_area_struct *dst_vma,
                                      unsigned long dst_start,
                                      unsigned long src_start,
                                      unsigned long len,
                                      enum mcopy_atomic_mode mode,
                                      bool wp_copy);
#endif /* CONFIG_HUGETLB_PAGE */

static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
                                                pmd_t *dst_pmd,
                                                struct vm_area_struct *dst_vma,
                                                unsigned long dst_addr,
                                                unsigned long src_addr,
                                                struct page **page,
                                                enum mcopy_atomic_mode mode,
                                                bool wp_copy)
{
        ssize_t err;

        if (mode == MCOPY_ATOMIC_CONTINUE) {
                return mcontinue_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
                                            wp_copy);
        }

        /*
         * The normal page fault path for a shmem will invoke the
         * fault, fill the hole in the file and COW it right away.  The
         * result generates plain anonymous memory.  So when we are
         * asked to fill a hole in a MAP_PRIVATE shmem mapping, we'll
         * generate anonymous memory directly without actually filling
         * the hole.  For the MAP_PRIVATE case the robustness check
         * only happens in the pagetable (to verify it's still none)
         * and not in the radix tree.
         */
        if (!(dst_vma->vm_flags & VM_SHARED)) {
                if (mode == MCOPY_ATOMIC_NORMAL)
                        err = mcopy_atomic_pte(dst_mm, dst_pmd, dst_vma,
                                               dst_addr, src_addr, page,
                                               wp_copy);
                else
                        err = mfill_zeropage_pte(dst_mm, dst_pmd,
                                                 dst_vma, dst_addr);
        } else {
                err = shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
                                             dst_addr, src_addr,
                                             mode != MCOPY_ATOMIC_NORMAL,
                                             wp_copy, page);
        }

        return err;
}

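/*
 * Dispatch summary for mfill_atomic_pte(), derived from the branches above:
 *
 *      MCOPY_ATOMIC_CONTINUE                   -> mcontinue_atomic_pte()
 *      private vma + MCOPY_ATOMIC_NORMAL       -> mcopy_atomic_pte()
 *      private vma + MCOPY_ATOMIC_ZEROPAGE     -> mfill_zeropage_pte()
 *      shared shmem vma (COPY or ZEROPAGE)     -> shmem_mfill_atomic_pte()
 */
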
static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
                                              unsigned long dst_start,
                                              unsigned long src_start,
                                              unsigned long len,
                                              enum mcopy_atomic_mode mcopy_mode,
                                              atomic_t *mmap_changing,
                                              __u64 mode)
{
        struct vm_area_struct *dst_vma;
        ssize_t err;
        pmd_t *dst_pmd;
        unsigned long src_addr, dst_addr;
        long copied;
        struct page *page;
        bool wp_copy;

        /*
         * Sanitize the command parameters:
         */
        BUG_ON(dst_start & ~PAGE_MASK);
        BUG_ON(len & ~PAGE_MASK);

        /* Does the address range wrap, or is the span zero-sized? */
        BUG_ON(src_start + len <= src_start);
        BUG_ON(dst_start + len <= dst_start);

        src_addr = src_start;
        dst_addr = dst_start;
        copied = 0;
        page = NULL;
retry:
        mmap_read_lock(dst_mm);

        /*
         * If memory mappings are changing because of non-cooperative
         * operation (e.g. mremap) running in parallel, bail out and
         * request the user to retry later
         */
        err = -EAGAIN;
        if (mmap_changing && atomic_read(mmap_changing))
                goto out_unlock;

        /*
         * Make sure the vma is not shared, that the dst range is
         * both valid and fully within a single existing vma.
         */
        err = -ENOENT;
        dst_vma = find_dst_vma(dst_mm, dst_start, len);
        if (!dst_vma)
                goto out_unlock;

        err = -EINVAL;
        /*
         * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
         * it will overwrite vm_ops, so vma_is_anonymous must return false.
         */
        if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
            dst_vma->vm_flags & VM_SHARED))
                goto out_unlock;

        /*
         * validate 'mode' now that we know the dst_vma: don't allow
         * a wrprotect copy if the userfaultfd didn't register as WP.
         */
        wp_copy = mode & UFFDIO_COPY_MODE_WP;
        if (wp_copy && !(dst_vma->vm_flags & VM_UFFD_WP))
                goto out_unlock;

        /*
         * If this is a HUGETLB vma, pass off to appropriate routine
         */
        if (is_vm_hugetlb_page(dst_vma))
                return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
                                              src_start, len, mcopy_mode,
                                              wp_copy);

        if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
                goto out_unlock;
        if (!vma_is_shmem(dst_vma) && mcopy_mode == MCOPY_ATOMIC_CONTINUE)
                goto out_unlock;

        /*
         * Ensure the dst_vma has an anon_vma or this page
         * would get a NULL anon_vma when moved in the
         * dst_vma.
         */
        err = -ENOMEM;
        if (!(dst_vma->vm_flags & VM_SHARED) &&
            unlikely(anon_vma_prepare(dst_vma)))
                goto out_unlock;

        while (src_addr < src_start + len) {
                pmd_t dst_pmdval;

                BUG_ON(dst_addr >= dst_start + len);

                dst_pmd = mm_alloc_pmd(dst_mm, dst_addr);
                if (unlikely(!dst_pmd)) {
                        err = -ENOMEM;
                        break;
                }

                dst_pmdval = pmd_read_atomic(dst_pmd);
                /*
                 * If the dst_pmd is mapped as THP don't
                 * override it and just be strict.
                 */
                if (unlikely(pmd_trans_huge(dst_pmdval))) {
                        err = -EEXIST;
                        break;
                }
                if (unlikely(pmd_none(dst_pmdval)) &&
                    unlikely(__pte_alloc(dst_mm, dst_pmd))) {
                        err = -ENOMEM;
                        break;
                }
                /* If a huge pmd materialized from under us, fail */
                if (unlikely(pmd_trans_huge(*dst_pmd))) {
                        err = -EFAULT;
                        break;
                }

                BUG_ON(pmd_none(*dst_pmd));
                BUG_ON(pmd_trans_huge(*dst_pmd));

                err = mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
                                       src_addr, &page, mcopy_mode, wp_copy);
                cond_resched();

                if (unlikely(err == -ENOENT)) {
                        void *page_kaddr;

                        mmap_read_unlock(dst_mm);
                        BUG_ON(!page);

                        page_kaddr = kmap_local_page(page);
                        err = copy_from_user(page_kaddr,
                                             (const void __user *) src_addr,
                                             PAGE_SIZE);
                        kunmap_local(page_kaddr);
                        if (unlikely(err)) {
                                err = -EFAULT;
                                goto out;
                        }
                        flush_dcache_page(page);
                        goto retry;
                } else
                        BUG_ON(page);

                if (!err) {
                        dst_addr += PAGE_SIZE;
                        src_addr += PAGE_SIZE;
                        copied += PAGE_SIZE;

                        if (fatal_signal_pending(current))
                                err = -EINTR;
                }
                if (err)
                        break;
        }

out_unlock:
        mmap_read_unlock(dst_mm);
out:
        if (page)
                put_page(page);
        BUG_ON(copied < 0);
        BUG_ON(err > 0);
        BUG_ON(!copied && !err);
        return copied ? copied : err;
}

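/*
 * Illustrative userspace sketch (not part of this file) of how the path above
 * is reached via the UFFDIO_COPY ioctl; "uffd", "src_buf", "fault_addr" and
 * "page_size" are assumed to be set up by the fault-handling thread, and
 * error handling is trimmed:
 *
 *      struct uffdio_copy copy = {
 *              .dst  = fault_addr & ~(page_size - 1),
 *              .src  = (unsigned long)src_buf,
 *              .len  = page_size,
 *              .mode = 0,      // or UFFDIO_COPY_MODE_WP for a wr-protected copy
 *      };
 *      if (ioctl(uffd, UFFDIO_COPY, &copy) == -1 && errno == EAGAIN)
 *              ;       // e.g. mmap_changing was set: handle events, then retry
 *      // copy.copy reports progress: bytes copied, or a negative error
 */
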
ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
                     unsigned long src_start, unsigned long len,
                     atomic_t *mmap_changing, __u64 mode)
{
        return __mcopy_atomic(dst_mm, dst_start, src_start, len,
                              MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
}

ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
                       unsigned long len, atomic_t *mmap_changing)
{
        return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
                              mmap_changing, 0);
}

ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
                       unsigned long len, atomic_t *mmap_changing)
{
        return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
                              mmap_changing, 0);
}

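/*
 * The three wrappers above back the UFFDIO_COPY, UFFDIO_ZEROPAGE and
 * UFFDIO_CONTINUE ioctls respectively.  The latter two only take a range,
 * e.g. (illustrative, field names per <linux/userfaultfd.h>):
 *
 *      struct uffdio_zeropage zp = {
 *              .range = { .start = fault_addr & ~(page_size - 1),
 *                         .len   = page_size },
 *              .mode  = 0,
 *      };
 *      ioctl(uffd, UFFDIO_ZEROPAGE, &zp);      // zp.zeropage reports progress
 */
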
void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma,
                   unsigned long start, unsigned long len, bool enable_wp)
{
        struct mmu_gather tlb;
        pgprot_t newprot;

        if (enable_wp)
                newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE));
        else
                newprot = vm_get_page_prot(dst_vma->vm_flags);

        tlb_gather_mmu(&tlb, dst_mm);
        change_protection(&tlb, dst_vma, start, start + len, newprot,
                          enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE);
        tlb_finish_mmu(&tlb);
}

int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
                        unsigned long len, bool enable_wp,
                        atomic_t *mmap_changing)
{
        struct vm_area_struct *dst_vma;
        unsigned long page_mask;
        int err;

        /*
         * Sanitize the command parameters:
         */
        BUG_ON(start & ~PAGE_MASK);
        BUG_ON(len & ~PAGE_MASK);

        /* Does the address range wrap, or is the span zero-sized? */
        BUG_ON(start + len <= start);

        mmap_read_lock(dst_mm);

        /*
         * If memory mappings are changing because of non-cooperative
         * operation (e.g. mremap) running in parallel, bail out and
         * request the user to retry later
         */
        err = -EAGAIN;
        if (mmap_changing && atomic_read(mmap_changing))
                goto out_unlock;

        err = -ENOENT;
        dst_vma = find_dst_vma(dst_mm, start, len);

        if (!dst_vma)
                goto out_unlock;
        if (!userfaultfd_wp(dst_vma))
                goto out_unlock;
        if (!vma_can_userfault(dst_vma, dst_vma->vm_flags))
                goto out_unlock;

        if (is_vm_hugetlb_page(dst_vma)) {
                err = -EINVAL;
                page_mask = vma_kernel_pagesize(dst_vma) - 1;
                if ((start & page_mask) || (len & page_mask))
                        goto out_unlock;
        }

        uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp);

        err = 0;
out_unlock:
        mmap_read_unlock(dst_mm);
        return err;
}
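
/*
 * Illustrative userspace sketch (not part of this file) for the
 * UFFDIO_WRITEPROTECT ioctl served by mwriteprotect_range(); "uffd", "addr"
 * and "len" are assumed to be page-aligned and set up by the caller:
 *
 *      struct uffdio_writeprotect wp = {
 *              .range = { .start = addr, .len = len },
 *              .mode  = UFFDIO_WRITEPROTECT_MODE_WP,   // 0 here resolves/un-protects
 *      };
 *      ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
 */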