dax.c

  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * fs/dax.c - Direct Access filesystem code
  4. * Copyright (c) 2013-2014 Intel Corporation
  5. * Author: Matthew Wilcox <[email protected]>
  6. * Author: Ross Zwisler <[email protected]>
  7. */
  8. #include <linux/atomic.h>
  9. #include <linux/blkdev.h>
  10. #include <linux/buffer_head.h>
  11. #include <linux/dax.h>
  12. #include <linux/fs.h>
  13. #include <linux/highmem.h>
  14. #include <linux/memcontrol.h>
  15. #include <linux/mm.h>
  16. #include <linux/mutex.h>
  17. #include <linux/pagevec.h>
  18. #include <linux/sched.h>
  19. #include <linux/sched/signal.h>
  20. #include <linux/uio.h>
  21. #include <linux/vmstat.h>
  22. #include <linux/pfn_t.h>
  23. #include <linux/sizes.h>
  24. #include <linux/mmu_notifier.h>
  25. #include <linux/iomap.h>
  26. #include <linux/rmap.h>
  27. #include <asm/pgalloc.h>
  28. #define CREATE_TRACE_POINTS
  29. #include <trace/events/fs_dax.h>
  30. static inline unsigned int pe_order(enum page_entry_size pe_size)
  31. {
  32. if (pe_size == PE_SIZE_PTE)
  33. return PAGE_SHIFT - PAGE_SHIFT;
  34. if (pe_size == PE_SIZE_PMD)
  35. return PMD_SHIFT - PAGE_SHIFT;
  36. if (pe_size == PE_SIZE_PUD)
  37. return PUD_SHIFT - PAGE_SHIFT;
  38. return ~0;
  39. }
  40. /* We choose 4096 entries - same as per-zone page wait tables */
  41. #define DAX_WAIT_TABLE_BITS 12
  42. #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
  43. /* The 'colour' (ie low bits) within a PMD of a page offset. */
  44. #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
  45. #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT)
  46. /* The order of a PMD entry */
  47. #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT)
  48. static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
  49. static int __init init_dax_wait_table(void)
  50. {
  51. int i;
  52. for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
  53. init_waitqueue_head(wait_table + i);
  54. return 0;
  55. }
  56. fs_initcall(init_dax_wait_table);
  57. /*
  58. * DAX pagecache entries use XArray value entries so they can't be mistaken
  59. * for pages. We use one bit for locking, one bit for the entry size (PMD)
  60. * and two more to tell us if the entry is a zero page or an empty entry that
  61. * is just used for locking. In total four special bits.
  62. *
  63. * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
  64. * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
  65. * block allocation.
  66. */
  67. #define DAX_SHIFT (4)
  68. #define DAX_LOCKED (1UL << 0)
  69. #define DAX_PMD (1UL << 1)
  70. #define DAX_ZERO_PAGE (1UL << 2)
  71. #define DAX_EMPTY (1UL << 3)
  72. static unsigned long dax_to_pfn(void *entry)
  73. {
  74. return xa_to_value(entry) >> DAX_SHIFT;
  75. }
  76. static void *dax_make_entry(pfn_t pfn, unsigned long flags)
  77. {
  78. return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
  79. }
  80. static bool dax_is_locked(void *entry)
  81. {
  82. return xa_to_value(entry) & DAX_LOCKED;
  83. }
  84. static unsigned int dax_entry_order(void *entry)
  85. {
  86. if (xa_to_value(entry) & DAX_PMD)
  87. return PMD_ORDER;
  88. return 0;
  89. }
  90. static unsigned long dax_is_pmd_entry(void *entry)
  91. {
  92. return xa_to_value(entry) & DAX_PMD;
  93. }
  94. static bool dax_is_pte_entry(void *entry)
  95. {
  96. return !(xa_to_value(entry) & DAX_PMD);
  97. }
  98. static int dax_is_zero_entry(void *entry)
  99. {
  100. return xa_to_value(entry) & DAX_ZERO_PAGE;
  101. }
  102. static int dax_is_empty_entry(void *entry)
  103. {
  104. return xa_to_value(entry) & DAX_EMPTY;
  105. }
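Taken together, these helpers implement the flag-and-pfn packing described in the comment above DAX_SHIFT: four low bits for LOCKED/PMD/ZERO_PAGE/EMPTY, with the pfn stored in the bits above them. The standalone userspace sketch below (the xa_mk_value()/xa_to_value() wrapping is elided, and make_entry()/entry_to_pfn() are names invented for this demo) exercises the same layout:

#include <assert.h>
#include <stdio.h>

#define DAX_SHIFT	4
#define DAX_LOCKED	(1UL << 0)
#define DAX_PMD		(1UL << 1)
#define DAX_ZERO_PAGE	(1UL << 2)
#define DAX_EMPTY	(1UL << 3)

/* Pack a pfn plus flag bits the way dax_make_entry() does. */
static unsigned long make_entry(unsigned long pfn, unsigned long flags)
{
	return flags | (pfn << DAX_SHIFT);
}

/* Recover the pfn the way dax_to_pfn() does. */
static unsigned long entry_to_pfn(unsigned long v)
{
	return v >> DAX_SHIFT;
}

int main(void)
{
	unsigned long v = make_entry(0x12345, DAX_PMD);

	assert(entry_to_pfn(v) == 0x12345);
	assert(v & DAX_PMD);
	assert(!(v & (DAX_LOCKED | DAX_ZERO_PAGE | DAX_EMPTY)));
	printf("packed entry value: %#lx\n", v);
	return 0;
}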
  106. /*
  107. * true if the entry that was found is of a smaller order than the entry
  108. * we were looking for
  109. */
  110. static bool dax_is_conflict(void *entry)
  111. {
  112. return entry == XA_RETRY_ENTRY;
  113. }
  114. /*
  115. * DAX page cache entry locking
  116. */
  117. struct exceptional_entry_key {
  118. struct xarray *xa;
  119. pgoff_t entry_start;
  120. };
  121. struct wait_exceptional_entry_queue {
  122. wait_queue_entry_t wait;
  123. struct exceptional_entry_key key;
  124. };
  125. /**
  126. * enum dax_wake_mode: waitqueue wakeup behaviour
  127. * @WAKE_ALL: wake all waiters in the waitqueue
  128. * @WAKE_NEXT: wake only the first waiter in the waitqueue
  129. */
  130. enum dax_wake_mode {
  131. WAKE_ALL,
  132. WAKE_NEXT,
  133. };
  134. static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
  135. void *entry, struct exceptional_entry_key *key)
  136. {
  137. unsigned long hash;
  138. unsigned long index = xas->xa_index;
  139. /*
  140. * If 'entry' is a PMD, align the 'index' that we use for the wait
  141. * queue to the start of that PMD. This ensures that all offsets in
  142. * the range covered by the PMD map to the same bit lock.
  143. */
  144. if (dax_is_pmd_entry(entry))
  145. index &= ~PG_PMD_COLOUR;
  146. key->xa = xas->xa;
  147. key->entry_start = index;
  148. hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
  149. return wait_table + hash;
  150. }
  151. static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
  152. unsigned int mode, int sync, void *keyp)
  153. {
  154. struct exceptional_entry_key *key = keyp;
  155. struct wait_exceptional_entry_queue *ewait =
  156. container_of(wait, struct wait_exceptional_entry_queue, wait);
  157. if (key->xa != ewait->key.xa ||
  158. key->entry_start != ewait->key.entry_start)
  159. return 0;
  160. return autoremove_wake_function(wait, mode, sync, NULL);
  161. }
  162. /*
  163. * @entry may no longer be the entry at the index in the mapping.
  164. * The important information it's conveying is whether the entry at
  165. * this index used to be a PMD entry.
  166. */
  167. static void dax_wake_entry(struct xa_state *xas, void *entry,
  168. enum dax_wake_mode mode)
  169. {
  170. struct exceptional_entry_key key;
  171. wait_queue_head_t *wq;
  172. wq = dax_entry_waitqueue(xas, entry, &key);
  173. /*
  174. * Checking for locked entry and prepare_to_wait_exclusive() happens
  175. * under the i_pages lock, ditto for entry handling in our callers.
  176. * So at this point all tasks that could have seen our entry locked
  177. * must be in the waitqueue and the following check will see them.
  178. */
  179. if (waitqueue_active(wq))
  180. __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
  181. }
  182. /*
  183. * Look up entry in page cache, wait for it to become unlocked if it
  184. * is a DAX entry and return it. The caller must subsequently call
  185. * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
  186. * if it did. The entry returned may have a larger order than @order.
  187. * If @order is larger than the order of the entry found in i_pages, this
  188. * function returns a dax_is_conflict entry.
  189. *
  190. * Must be called with the i_pages lock held.
  191. */
  192. static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
  193. {
  194. void *entry;
  195. struct wait_exceptional_entry_queue ewait;
  196. wait_queue_head_t *wq;
  197. init_wait(&ewait.wait);
  198. ewait.wait.func = wake_exceptional_entry_func;
  199. for (;;) {
  200. entry = xas_find_conflict(xas);
  201. if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
  202. return entry;
  203. if (dax_entry_order(entry) < order)
  204. return XA_RETRY_ENTRY;
  205. if (!dax_is_locked(entry))
  206. return entry;
  207. wq = dax_entry_waitqueue(xas, entry, &ewait.key);
  208. prepare_to_wait_exclusive(wq, &ewait.wait,
  209. TASK_UNINTERRUPTIBLE);
  210. xas_unlock_irq(xas);
  211. xas_reset(xas);
  212. schedule();
  213. finish_wait(wq, &ewait.wait);
  214. xas_lock_irq(xas);
  215. }
  216. }
  217. /*
  218. * The only thing keeping the address space around is the i_pages lock
  219. * (it's cycled in clear_inode() after removing the entries from i_pages)
  220. * After we call xas_unlock_irq(), we cannot touch xas->xa.
  221. */
  222. static void wait_entry_unlocked(struct xa_state *xas, void *entry)
  223. {
  224. struct wait_exceptional_entry_queue ewait;
  225. wait_queue_head_t *wq;
  226. init_wait(&ewait.wait);
  227. ewait.wait.func = wake_exceptional_entry_func;
  228. wq = dax_entry_waitqueue(xas, entry, &ewait.key);
  229. /*
  230. * Unlike get_unlocked_entry() there is no guarantee that this
  231. * path ever successfully retrieves an unlocked entry before an
  232. * inode dies. Perform a non-exclusive wait in case this path
  233. * never successfully performs its own wake up.
  234. */
  235. prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
  236. xas_unlock_irq(xas);
  237. schedule();
  238. finish_wait(wq, &ewait.wait);
  239. }
  240. static void put_unlocked_entry(struct xa_state *xas, void *entry,
  241. enum dax_wake_mode mode)
  242. {
  243. if (entry && !dax_is_conflict(entry))
  244. dax_wake_entry(xas, entry, mode);
  245. }
  246. /*
  247. * We used the xa_state to get the entry, but then we locked the entry and
  248. * dropped the xa_lock, so we know the xa_state is stale and must be reset
  249. * before use.
  250. */
  251. static void dax_unlock_entry(struct xa_state *xas, void *entry)
  252. {
  253. void *old;
  254. BUG_ON(dax_is_locked(entry));
  255. xas_reset(xas);
  256. xas_lock_irq(xas);
  257. old = xas_store(xas, entry);
  258. xas_unlock_irq(xas);
  259. BUG_ON(!dax_is_locked(old));
  260. dax_wake_entry(xas, entry, WAKE_NEXT);
  261. }
  262. /*
  263. * Return: The entry stored at this location before it was locked.
  264. */
  265. static void *dax_lock_entry(struct xa_state *xas, void *entry)
  266. {
  267. unsigned long v = xa_to_value(entry);
  268. return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
  269. }
  270. static unsigned long dax_entry_size(void *entry)
  271. {
  272. if (dax_is_zero_entry(entry))
  273. return 0;
  274. else if (dax_is_empty_entry(entry))
  275. return 0;
  276. else if (dax_is_pmd_entry(entry))
  277. return PMD_SIZE;
  278. else
  279. return PAGE_SIZE;
  280. }
  281. static unsigned long dax_end_pfn(void *entry)
  282. {
  283. return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
  284. }
  285. /*
  286. * Iterate through all mapped pfns represented by an entry, i.e. skip
  287. * 'empty' and 'zero' entries.
  288. */
  289. #define for_each_mapped_pfn(entry, pfn) \
  290. for (pfn = dax_to_pfn(entry); \
  291. pfn < dax_end_pfn(entry); pfn++)
  292. static inline bool dax_mapping_is_cow(struct address_space *mapping)
  293. {
  294. return (unsigned long)mapping == PAGE_MAPPING_DAX_COW;
  295. }
  296. /*
  297. * Set page->mapping to the PAGE_MAPPING_DAX_COW sentinel and increase the refcount kept in page->index.
  298. */
  299. static inline void dax_mapping_set_cow(struct page *page)
  300. {
  301. if ((uintptr_t)page->mapping != PAGE_MAPPING_DAX_COW) {
  302. /*
  303. * Reset the index if the page was already mapped
  304. * regularly before.
  305. */
  306. if (page->mapping)
  307. page->index = 1;
  308. page->mapping = (void *)PAGE_MAPPING_DAX_COW;
  309. }
  310. page->index++;
  311. }
  312. /*
  313. * When called from dax_insert_entry(), the cow flag indicates whether this
  314. * entry is shared by multiple files. If so, set page->mapping to
  315. * PAGE_MAPPING_DAX_COW and use page->index as the refcount.
  316. */
  317. static void dax_associate_entry(void *entry, struct address_space *mapping,
  318. struct vm_area_struct *vma, unsigned long address, bool cow)
  319. {
  320. unsigned long size = dax_entry_size(entry), pfn, index;
  321. int i = 0;
  322. if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
  323. return;
  324. index = linear_page_index(vma, address & ~(size - 1));
  325. for_each_mapped_pfn(entry, pfn) {
  326. struct page *page = pfn_to_page(pfn);
  327. if (cow) {
  328. dax_mapping_set_cow(page);
  329. } else {
  330. WARN_ON_ONCE(page->mapping);
  331. page->mapping = mapping;
  332. page->index = index + i++;
  333. }
  334. }
  335. }
  336. static void dax_disassociate_entry(void *entry, struct address_space *mapping,
  337. bool trunc)
  338. {
  339. unsigned long pfn;
  340. if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
  341. return;
  342. for_each_mapped_pfn(entry, pfn) {
  343. struct page *page = pfn_to_page(pfn);
  344. WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
  345. if (dax_mapping_is_cow(page->mapping)) {
  346. /* keep the CoW flag if this page is still shared */
  347. if (page->index-- > 0)
  348. continue;
  349. } else
  350. WARN_ON_ONCE(page->mapping && page->mapping != mapping);
  351. page->mapping = NULL;
  352. page->index = 0;
  353. }
  354. }
  355. static struct page *dax_busy_page(void *entry)
  356. {
  357. unsigned long pfn;
  358. for_each_mapped_pfn(entry, pfn) {
  359. struct page *page = pfn_to_page(pfn);
  360. if (page_ref_count(page) > 1)
  361. return page;
  362. }
  363. return NULL;
  364. }
  365. /*
  366. * dax_lock_page - Lock the DAX entry corresponding to a page
  367. * @page: The page whose entry we want to lock
  368. *
  369. * Context: Process context.
  370. * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could
  371. * not be locked.
  372. */
  373. dax_entry_t dax_lock_page(struct page *page)
  374. {
  375. XA_STATE(xas, NULL, 0);
  376. void *entry;
  377. /* Ensure page->mapping isn't freed while we look at it */
  378. rcu_read_lock();
  379. for (;;) {
  380. struct address_space *mapping = READ_ONCE(page->mapping);
  381. entry = NULL;
  382. if (!mapping || !dax_mapping(mapping))
  383. break;
  384. /*
  385. * In the device-dax case there's no need to lock, a
  386. * struct dev_pagemap pin is sufficient to keep the
  387. * inode alive, and we assume we have dev_pagemap pin
  388. * otherwise we would not have a valid pfn_to_page()
  389. * translation.
  390. */
  391. entry = (void *)~0UL;
  392. if (S_ISCHR(mapping->host->i_mode))
  393. break;
  394. xas.xa = &mapping->i_pages;
  395. xas_lock_irq(&xas);
  396. if (mapping != page->mapping) {
  397. xas_unlock_irq(&xas);
  398. continue;
  399. }
  400. xas_set(&xas, page->index);
  401. entry = xas_load(&xas);
  402. if (dax_is_locked(entry)) {
  403. rcu_read_unlock();
  404. wait_entry_unlocked(&xas, entry);
  405. rcu_read_lock();
  406. continue;
  407. }
  408. dax_lock_entry(&xas, entry);
  409. xas_unlock_irq(&xas);
  410. break;
  411. }
  412. rcu_read_unlock();
  413. return (dax_entry_t)entry;
  414. }
  415. void dax_unlock_page(struct page *page, dax_entry_t cookie)
  416. {
  417. struct address_space *mapping = page->mapping;
  418. XA_STATE(xas, &mapping->i_pages, page->index);
  419. if (S_ISCHR(mapping->host->i_mode))
  420. return;
  421. dax_unlock_entry(&xas, (void *)cookie);
  422. }
  423. /*
  424. * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
  425. * @mapping: the file's mapping whose entry we want to lock
  426. * @index: the offset within this file
  427. * @page: output the dax page corresponding to this dax entry
  428. *
  429. * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
  430. * could not be locked.
  431. */
  432. dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
  433. struct page **page)
  434. {
  435. XA_STATE(xas, NULL, 0);
  436. void *entry;
  437. rcu_read_lock();
  438. for (;;) {
  439. entry = NULL;
  440. if (!dax_mapping(mapping))
  441. break;
  442. xas.xa = &mapping->i_pages;
  443. xas_lock_irq(&xas);
  444. xas_set(&xas, index);
  445. entry = xas_load(&xas);
  446. if (dax_is_locked(entry)) {
  447. rcu_read_unlock();
  448. wait_entry_unlocked(&xas, entry);
  449. rcu_read_lock();
  450. continue;
  451. }
  452. if (!entry ||
  453. dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
  454. /*
  455. * Because we looked this entry up by file mapping and
  456. * index, it may simply not have been inserted yet, or it
  457. * may be a zero/empty entry. Neither case is an error,
  458. * so return a special cookie and do
  459. * not output @page.
  460. */
  461. entry = (void *)~0UL;
  462. } else {
  463. *page = pfn_to_page(dax_to_pfn(entry));
  464. dax_lock_entry(&xas, entry);
  465. }
  466. xas_unlock_irq(&xas);
  467. break;
  468. }
  469. rcu_read_unlock();
  470. return (dax_entry_t)entry;
  471. }
  472. void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
  473. dax_entry_t cookie)
  474. {
  475. XA_STATE(xas, &mapping->i_pages, index);
  476. if (cookie == ~0UL)
  477. return;
  478. dax_unlock_entry(&xas, (void *)cookie);
  479. }
  480. /*
  481. * Find page cache entry at given index. If it is a DAX entry, return it
  482. * with the entry locked. If the page cache doesn't contain an entry at
  483. * that index, add a locked empty entry.
  484. *
  485. * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
  486. * either return that locked entry or will return VM_FAULT_FALLBACK.
  487. * This will happen if there are any PTE entries within the PMD range
  488. * that we are requesting.
  489. *
  490. * We always favor PTE entries over PMD entries. There isn't a flow where we
  491. * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD
  492. * insertion will fail if it finds any PTE entries already in the tree, and a
  493. * PTE insertion will cause an existing PMD entry to be unmapped and
  494. * downgraded to PTE entries. This happens for both PMD zero pages as
  495. * well as PMD empty entries.
  496. *
  497. * The exception to this downgrade path is for PMD entries that have
  498. * real storage backing them. We will leave these real PMD entries in
  499. * the tree, and PTE writes will simply dirty the entire PMD entry.
  500. *
  501. * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
  502. * persistent memory the benefit is doubtful. We can add that later if we can
  503. * show it helps.
  504. *
  505. * On error, this function does not return an ERR_PTR. Instead it returns
  506. * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values
  507. * overlap with xarray value entries.
  508. */
  509. static void *grab_mapping_entry(struct xa_state *xas,
  510. struct address_space *mapping, unsigned int order)
  511. {
  512. unsigned long index = xas->xa_index;
  513. bool pmd_downgrade; /* splitting PMD entry into PTE entries? */
  514. void *entry;
  515. retry:
  516. pmd_downgrade = false;
  517. xas_lock_irq(xas);
  518. entry = get_unlocked_entry(xas, order);
  519. if (entry) {
  520. if (dax_is_conflict(entry))
  521. goto fallback;
  522. if (!xa_is_value(entry)) {
  523. xas_set_err(xas, -EIO);
  524. goto out_unlock;
  525. }
  526. if (order == 0) {
  527. if (dax_is_pmd_entry(entry) &&
  528. (dax_is_zero_entry(entry) ||
  529. dax_is_empty_entry(entry))) {
  530. pmd_downgrade = true;
  531. }
  532. }
  533. }
  534. if (pmd_downgrade) {
  535. /*
  536. * Make sure 'entry' remains valid while we drop
  537. * the i_pages lock.
  538. */
  539. dax_lock_entry(xas, entry);
  540. /*
  541. * Besides huge zero pages the only other thing that gets
  542. * downgraded are empty entries which don't need to be
  543. * unmapped.
  544. */
  545. if (dax_is_zero_entry(entry)) {
  546. xas_unlock_irq(xas);
  547. unmap_mapping_pages(mapping,
  548. xas->xa_index & ~PG_PMD_COLOUR,
  549. PG_PMD_NR, false);
  550. xas_reset(xas);
  551. xas_lock_irq(xas);
  552. }
  553. dax_disassociate_entry(entry, mapping, false);
  554. xas_store(xas, NULL); /* undo the PMD join */
  555. dax_wake_entry(xas, entry, WAKE_ALL);
  556. mapping->nrpages -= PG_PMD_NR;
  557. entry = NULL;
  558. xas_set(xas, index);
  559. }
  560. if (entry) {
  561. dax_lock_entry(xas, entry);
  562. } else {
  563. unsigned long flags = DAX_EMPTY;
  564. if (order > 0)
  565. flags |= DAX_PMD;
  566. entry = dax_make_entry(pfn_to_pfn_t(0), flags);
  567. dax_lock_entry(xas, entry);
  568. if (xas_error(xas))
  569. goto out_unlock;
  570. mapping->nrpages += 1UL << order;
  571. }
  572. out_unlock:
  573. xas_unlock_irq(xas);
  574. if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
  575. goto retry;
  576. if (xas->xa_node == XA_ERROR(-ENOMEM))
  577. return xa_mk_internal(VM_FAULT_OOM);
  578. if (xas_error(xas))
  579. return xa_mk_internal(VM_FAULT_SIGBUS);
  580. return entry;
  581. fallback:
  582. xas_unlock_irq(xas);
  583. return xa_mk_internal(VM_FAULT_FALLBACK);
  584. }
  585. /**
  586. * dax_layout_busy_page_range - find first pinned page in @mapping
  587. * @mapping: address space to scan for a page with ref count > 1
  588. * @start: Starting offset. Page containing 'start' is included.
  589. * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
  590. * pages from 'start' till the end of file are included.
  591. *
  592. * DAX requires ZONE_DEVICE mapped pages. These pages are never
  593. * 'onlined' to the page allocator so they are considered idle when
  594. * page->count == 1. A filesystem uses this interface to determine if
  595. * any page in the mapping is busy, i.e. for DMA, or other
  596. * get_user_pages() usages.
  597. *
  598. * It is expected that the filesystem is holding locks to block the
  599. * establishment of new mappings in this address_space. I.e. it expects
  600. * to be able to run unmap_mapping_range() and subsequently not race
  601. * mapping_mapped() becoming true.
  602. */
  603. struct page *dax_layout_busy_page_range(struct address_space *mapping,
  604. loff_t start, loff_t end)
  605. {
  606. void *entry;
  607. unsigned int scanned = 0;
  608. struct page *page = NULL;
  609. pgoff_t start_idx = start >> PAGE_SHIFT;
  610. pgoff_t end_idx;
  611. XA_STATE(xas, &mapping->i_pages, start_idx);
  612. /*
  613. * In the 'limited' case get_user_pages() for dax is disabled.
  614. */
  615. if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
  616. return NULL;
  617. if (!dax_mapping(mapping) || !mapping_mapped(mapping))
  618. return NULL;
  619. /* If end == LLONG_MAX, include all pages from start to the end of the file */
  620. if (end == LLONG_MAX)
  621. end_idx = ULONG_MAX;
  622. else
  623. end_idx = end >> PAGE_SHIFT;
  624. /*
  625. * If we race get_user_pages_fast() here either we'll see the
  626. * elevated page count in the iteration and wait, or
  627. * get_user_pages_fast() will see that the page it took a reference
  628. * against is no longer mapped in the page tables and bail to the
  629. * get_user_pages() slow path. The slow path is protected by
  630. * pte_lock() and pmd_lock(). New references are not taken without
  631. * holding those locks, and unmap_mapping_pages() will not zero the
  632. * pte or pmd without holding the respective lock, so we are
  633. * guaranteed to either see new references or prevent new
  634. * references from being established.
  635. */
  636. unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
  637. xas_lock_irq(&xas);
  638. xas_for_each(&xas, entry, end_idx) {
  639. if (WARN_ON_ONCE(!xa_is_value(entry)))
  640. continue;
  641. if (unlikely(dax_is_locked(entry)))
  642. entry = get_unlocked_entry(&xas, 0);
  643. if (entry)
  644. page = dax_busy_page(entry);
  645. put_unlocked_entry(&xas, entry, WAKE_NEXT);
  646. if (page)
  647. break;
  648. if (++scanned % XA_CHECK_SCHED)
  649. continue;
  650. xas_pause(&xas);
  651. xas_unlock_irq(&xas);
  652. cond_resched();
  653. xas_lock_irq(&xas);
  654. }
  655. xas_unlock_irq(&xas);
  656. return page;
  657. }
  658. EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);
  659. struct page *dax_layout_busy_page(struct address_space *mapping)
  660. {
  661. return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
  662. }
  663. EXPORT_SYMBOL_GPL(dax_layout_busy_page);
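For illustration, a filesystem typically calls dax_layout_busy_page() from its truncate or hole-punch path while holding the lock that blocks new mappings, retrying until nothing is pinned. The sketch below mirrors the xfs_break_dax_layouts() pattern; myfs_break_dax_layouts() and myfs_wait_for_dax_page_idle() are hypothetical names, the latter standing in for a helper that drops the filesystem lock and sleeps until the page's refcount returns to one:

static int myfs_break_dax_layouts(struct inode *inode, bool *retry)
{
	struct page *page;

	page = dax_layout_busy_page(inode->i_mapping);
	if (!page)
		return 0;

	/* Someone still holds a reference (e.g. DMA); wait and retry. */
	*retry = true;
	return myfs_wait_for_dax_page_idle(inode, page);
}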
  664. static int __dax_invalidate_entry(struct address_space *mapping,
  665. pgoff_t index, bool trunc)
  666. {
  667. XA_STATE(xas, &mapping->i_pages, index);
  668. int ret = 0;
  669. void *entry;
  670. xas_lock_irq(&xas);
  671. entry = get_unlocked_entry(&xas, 0);
  672. if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
  673. goto out;
  674. if (!trunc &&
  675. (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
  676. xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
  677. goto out;
  678. dax_disassociate_entry(entry, mapping, trunc);
  679. xas_store(&xas, NULL);
  680. mapping->nrpages -= 1UL << dax_entry_order(entry);
  681. ret = 1;
  682. out:
  683. put_unlocked_entry(&xas, entry, WAKE_ALL);
  684. xas_unlock_irq(&xas);
  685. return ret;
  686. }
  687. /*
  688. * Delete DAX entry at @index from @mapping. Wait for it
  689. * to be unlocked before deleting it.
  690. */
  691. int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
  692. {
  693. int ret = __dax_invalidate_entry(mapping, index, true);
  694. /*
  695. * This gets called from truncate / punch_hole path. As such, the caller
  696. * must hold locks protecting against concurrent modifications of the
  697. * page cache (usually fs-private i_mmap_sem for writing). Since the
  698. * caller has seen a DAX entry for this index, we better find it
  699. * at that index as well...
  700. */
  701. WARN_ON_ONCE(!ret);
  702. return ret;
  703. }
  704. /*
  705. * Invalidate DAX entry if it is clean.
  706. */
  707. int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
  708. pgoff_t index)
  709. {
  710. return __dax_invalidate_entry(mapping, index, false);
  711. }
  712. static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
  713. {
  714. return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
  715. }
  716. static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
  717. {
  718. pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
  719. void *vto, *kaddr;
  720. long rc;
  721. int id;
  722. id = dax_read_lock();
  723. rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
  724. &kaddr, NULL);
  725. if (rc < 0) {
  726. dax_read_unlock(id);
  727. return rc;
  728. }
  729. vto = kmap_atomic(vmf->cow_page);
  730. copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
  731. kunmap_atomic(vto);
  732. dax_read_unlock(id);
  733. return 0;
  734. }
  735. /*
  736. * MAP_SYNC on a dax mapping guarantees dirty metadata is
  737. * flushed on write-faults (non-cow), but not read-faults.
  738. */
  739. static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
  740. struct vm_area_struct *vma)
  741. {
  742. return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
  743. (iter->iomap.flags & IOMAP_F_DIRTY);
  744. }
  745. static bool dax_fault_is_cow(const struct iomap_iter *iter)
  746. {
  747. return (iter->flags & IOMAP_WRITE) &&
  748. (iter->iomap.flags & IOMAP_F_SHARED);
  749. }
  750. /*
  751. * By this point grab_mapping_entry() has ensured that we have a locked entry
  752. * of the appropriate size so we don't have to worry about downgrading PMDs to
  753. * PTEs. If we happen to be trying to insert a PTE and there is a PMD
  754. * already in the tree, we will skip the insertion and just dirty the PMD as
  755. * appropriate.
  756. */
  757. static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
  758. const struct iomap_iter *iter, void *entry, pfn_t pfn,
  759. unsigned long flags)
  760. {
  761. struct address_space *mapping = vmf->vma->vm_file->f_mapping;
  762. void *new_entry = dax_make_entry(pfn, flags);
  763. bool dirty = !dax_fault_is_synchronous(iter, vmf->vma);
  764. bool cow = dax_fault_is_cow(iter);
  765. if (dirty)
  766. __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
  767. if (cow || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
  768. unsigned long index = xas->xa_index;
  769. /* we are replacing a zero page with block mapping */
  770. if (dax_is_pmd_entry(entry))
  771. unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
  772. PG_PMD_NR, false);
  773. else /* pte entry */
  774. unmap_mapping_pages(mapping, index, 1, false);
  775. }
  776. xas_reset(xas);
  777. xas_lock_irq(xas);
  778. if (cow || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
  779. void *old;
  780. dax_disassociate_entry(entry, mapping, false);
  781. dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
  782. cow);
  783. /*
  784. * Only swap our new entry into the page cache if the current
  785. * entry is a zero page or an empty entry. If a normal PTE or
  786. * PMD entry is already in the cache, we leave it alone. This
  787. * means that if we are trying to insert a PTE and the
  788. * existing entry is a PMD, we will just leave the PMD in the
  789. * tree and dirty it if necessary.
  790. */
  791. old = dax_lock_entry(xas, new_entry);
  792. WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
  793. DAX_LOCKED));
  794. entry = new_entry;
  795. } else {
  796. xas_load(xas); /* Walk the xa_state */
  797. }
  798. if (dirty)
  799. xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
  800. if (cow)
  801. xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
  802. xas_unlock_irq(xas);
  803. return entry;
  804. }
  805. static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
  806. struct address_space *mapping, void *entry)
  807. {
  808. unsigned long pfn, index, count, end;
  809. long ret = 0;
  810. struct vm_area_struct *vma;
  811. /*
  812. * A page got tagged dirty in DAX mapping? Something is seriously
  813. * wrong.
  814. */
  815. if (WARN_ON(!xa_is_value(entry)))
  816. return -EIO;
  817. if (unlikely(dax_is_locked(entry))) {
  818. void *old_entry = entry;
  819. entry = get_unlocked_entry(xas, 0);
  820. /* Entry got punched out / reallocated? */
  821. if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
  822. goto put_unlocked;
  823. /*
  824. * Entry got reallocated elsewhere? No need to writeback.
  825. * We have to compare pfns as we must not bail out due to
  826. * difference in lockbit or entry type.
  827. */
  828. if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
  829. goto put_unlocked;
  830. if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
  831. dax_is_zero_entry(entry))) {
  832. ret = -EIO;
  833. goto put_unlocked;
  834. }
  835. /* Another fsync thread may have already done this entry */
  836. if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
  837. goto put_unlocked;
  838. }
  839. /* Lock the entry to serialize with page faults */
  840. dax_lock_entry(xas, entry);
  841. /*
  842. * We can clear the tag now but we have to be careful so that concurrent
  843. * dax_writeback_one() calls for the same index cannot finish before we
  844. * actually flush the caches. This is achieved as the calls will look
  845. * at the entry only under the i_pages lock and once they do that
  846. * they will see the entry locked and wait for it to unlock.
  847. */
  848. xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
  849. xas_unlock_irq(xas);
  850. /*
  851. * If dax_writeback_mapping_range() was given a wbc->range_start
  852. * in the middle of a PMD, the 'index' we use needs to be
  853. * aligned to the start of the PMD.
  854. * This allows us to flush for PMD_SIZE and not have to worry about
  855. * partial PMD writebacks.
  856. */
  857. pfn = dax_to_pfn(entry);
  858. count = 1UL << dax_entry_order(entry);
  859. index = xas->xa_index & ~(count - 1);
  860. end = index + count - 1;
  861. /* Walk all mappings of a given index of a file and writeprotect them */
  862. i_mmap_lock_read(mapping);
  863. vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
  864. pfn_mkclean_range(pfn, count, index, vma);
  865. cond_resched();
  866. }
  867. i_mmap_unlock_read(mapping);
  868. dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
  869. /*
  870. * After we have flushed the cache, we can clear the dirty tag. There
  871. * cannot be new dirty data in the pfn after the flush has completed as
  872. * the pfn mappings are writeprotected and fault waits for mapping
  873. * entry lock.
  874. */
  875. xas_reset(xas);
  876. xas_lock_irq(xas);
  877. xas_store(xas, entry);
  878. xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
  879. dax_wake_entry(xas, entry, WAKE_NEXT);
  880. trace_dax_writeback_one(mapping->host, index, count);
  881. return ret;
  882. put_unlocked:
  883. put_unlocked_entry(xas, entry, WAKE_NEXT);
  884. return ret;
  885. }
  886. /*
  887. * Flush the mapping to the persistent domain within the byte range of [start,
  888. * end]. This is required by data integrity operations to ensure file data is
  889. * on persistent storage prior to completion of the operation.
  890. */
  891. int dax_writeback_mapping_range(struct address_space *mapping,
  892. struct dax_device *dax_dev, struct writeback_control *wbc)
  893. {
  894. XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
  895. struct inode *inode = mapping->host;
  896. pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
  897. void *entry;
  898. int ret = 0;
  899. unsigned int scanned = 0;
  900. if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
  901. return -EIO;
  902. if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
  903. return 0;
  904. trace_dax_writeback_range(inode, xas.xa_index, end_index);
  905. tag_pages_for_writeback(mapping, xas.xa_index, end_index);
  906. xas_lock_irq(&xas);
  907. xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
  908. ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
  909. if (ret < 0) {
  910. mapping_set_error(mapping, ret);
  911. break;
  912. }
  913. if (++scanned % XA_CHECK_SCHED)
  914. continue;
  915. xas_pause(&xas);
  916. xas_unlock_irq(&xas);
  917. cond_resched();
  918. xas_lock_irq(&xas);
  919. }
  920. xas_unlock_irq(&xas);
  921. trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
  922. return ret;
  923. }
  924. EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
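A DAX-enabled filesystem would normally expose this as its ->writepages method, passing in the dax_device it looked up at mount time. A minimal sketch follows; struct myfs_sb_info and its s_daxdev field are stand-ins for the filesystem's own bookkeeping:

static int myfs_dax_writepages(struct address_space *mapping,
			       struct writeback_control *wbc)
{
	struct myfs_sb_info *sbi = mapping->host->i_sb->s_fs_info;

	/* Flush all dirty DAX entries of this inode to the pmem device. */
	return dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc);
}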
  925. static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
  926. size_t size, void **kaddr, pfn_t *pfnp)
  927. {
  928. pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
  929. int id, rc = 0;
  930. long length;
  931. id = dax_read_lock();
  932. length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
  933. DAX_ACCESS, kaddr, pfnp);
  934. if (length < 0) {
  935. rc = length;
  936. goto out;
  937. }
  938. if (!pfnp)
  939. goto out_check_addr;
  940. rc = -EINVAL;
  941. if (PFN_PHYS(length) < size)
  942. goto out;
  943. if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
  944. goto out;
  945. /* For larger pages we need devmap */
  946. if (length > 1 && !pfn_t_devmap(*pfnp))
  947. goto out;
  948. rc = 0;
  949. out_check_addr:
  950. if (!kaddr)
  951. goto out;
  952. if (!*kaddr)
  953. rc = -EFAULT;
  954. out:
  955. dax_read_unlock(id);
  956. return rc;
  957. }
  958. /**
  959. * dax_iomap_cow_copy - Copy the data from source to destination before write
  960. * @pos: file offset to copy from
  961. * @length: number of bytes to copy
  962. * @align_size: alignment of the range (either PMD_SIZE or PAGE_SIZE)
  963. * @srcmap: iomap srcmap
  964. * @daddr: destination address to copy to
  965. *
  966. * This can be called from two places. Either during a DAX write fault (page
  967. * aligned), to copy @length bytes of data to @daddr. Or, during a normal DAX
  968. * write, dax_iomap_iter() may call this to copy the unaligned head or tail
  969. * of the range; in that case the aligned part of the copy is taken
  970. * care of by dax_iomap_iter() itself.
  971. */
  972. static int dax_iomap_cow_copy(loff_t pos, uint64_t length, size_t align_size,
  973. const struct iomap *srcmap, void *daddr)
  974. {
  975. loff_t head_off = pos & (align_size - 1);
  976. size_t size = ALIGN(head_off + length, align_size);
  977. loff_t end = pos + length;
  978. loff_t pg_end = round_up(end, align_size);
  979. bool copy_all = head_off == 0 && end == pg_end;
  980. void *saddr = NULL;
  981. int ret = 0;
  982. ret = dax_iomap_direct_access(srcmap, pos, size, &saddr, NULL);
  983. if (ret)
  984. return ret;
  985. if (copy_all) {
  986. ret = copy_mc_to_kernel(daddr, saddr, length);
  987. return ret ? -EIO : 0;
  988. }
  989. /* Copy the head part of the range */
  990. if (head_off) {
  991. ret = copy_mc_to_kernel(daddr, saddr, head_off);
  992. if (ret)
  993. return -EIO;
  994. }
  995. /* Copy the tail part of the range */
  996. if (end < pg_end) {
  997. loff_t tail_off = head_off + length;
  998. loff_t tail_len = pg_end - end;
  999. ret = copy_mc_to_kernel(daddr + tail_off, saddr + tail_off,
  1000. tail_len);
  1001. if (ret)
  1002. return -EIO;
  1003. }
  1004. return 0;
  1005. }
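As a worked example of the partial-copy case (illustrative numbers only): with align_size = 4096, pos = 0x1100 and length = 0x200, we get head_off = 0x100, end = 0x1300 and pg_end = 0x2000, so copy_all is false. The head copy moves 0x100 bytes to the start of the destination page, and the tail copy moves pg_end - end = 0xd00 bytes starting at offset head_off + length = 0x300, leaving only the 0x200-byte region about to be written untouched.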
  1006. /*
  1007. * The user has performed a load from a hole in the file. Allocating a new
  1008. * page in the file would cause excessive storage usage for workloads with
  1009. * sparse files. Instead we insert a read-only mapping of the 4k zero page.
  1010. * If this page is ever written to we will re-fault and change the mapping to
  1011. * point to real DAX storage instead.
  1012. */
  1013. static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
  1014. const struct iomap_iter *iter, void **entry)
  1015. {
  1016. struct inode *inode = iter->inode;
  1017. unsigned long vaddr = vmf->address;
  1018. pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
  1019. vm_fault_t ret;
  1020. *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
  1021. ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
  1022. trace_dax_load_hole(inode, vmf, ret);
  1023. return ret;
  1024. }
  1025. #ifdef CONFIG_FS_DAX_PMD
  1026. static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
  1027. const struct iomap_iter *iter, void **entry)
  1028. {
  1029. struct address_space *mapping = vmf->vma->vm_file->f_mapping;
  1030. unsigned long pmd_addr = vmf->address & PMD_MASK;
  1031. struct vm_area_struct *vma = vmf->vma;
  1032. struct inode *inode = mapping->host;
  1033. pgtable_t pgtable = NULL;
  1034. struct page *zero_page;
  1035. spinlock_t *ptl;
  1036. pmd_t pmd_entry;
  1037. pfn_t pfn;
  1038. zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
  1039. if (unlikely(!zero_page))
  1040. goto fallback;
  1041. pfn = page_to_pfn_t(zero_page);
  1042. *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn,
  1043. DAX_PMD | DAX_ZERO_PAGE);
  1044. if (arch_needs_pgtable_deposit()) {
  1045. pgtable = pte_alloc_one(vma->vm_mm);
  1046. if (!pgtable)
  1047. return VM_FAULT_OOM;
  1048. }
  1049. ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
  1050. if (!pmd_none(*(vmf->pmd))) {
  1051. spin_unlock(ptl);
  1052. goto fallback;
  1053. }
  1054. if (pgtable) {
  1055. pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
  1056. mm_inc_nr_ptes(vma->vm_mm);
  1057. }
  1058. pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot);
  1059. pmd_entry = pmd_mkhuge(pmd_entry);
  1060. set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry);
  1061. spin_unlock(ptl);
  1062. trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry);
  1063. return VM_FAULT_NOPAGE;
  1064. fallback:
  1065. if (pgtable)
  1066. pte_free(vma->vm_mm, pgtable);
  1067. trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry);
  1068. return VM_FAULT_FALLBACK;
  1069. }
  1070. #else
  1071. static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf,
  1072. const struct iomap_iter *iter, void **entry)
  1073. {
  1074. return VM_FAULT_FALLBACK;
  1075. }
  1076. #endif /* CONFIG_FS_DAX_PMD */
  1077. static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
  1078. {
  1079. const struct iomap *iomap = &iter->iomap;
  1080. const struct iomap *srcmap = iomap_iter_srcmap(iter);
  1081. unsigned offset = offset_in_page(pos);
  1082. pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
  1083. void *kaddr;
  1084. long ret;
  1085. ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
  1086. NULL);
  1087. if (ret < 0)
  1088. return ret;
  1089. memset(kaddr + offset, 0, size);
  1090. if (srcmap->addr != iomap->addr) {
  1091. ret = dax_iomap_cow_copy(pos, size, PAGE_SIZE, srcmap,
  1092. kaddr);
  1093. if (ret < 0)
  1094. return ret;
  1095. dax_flush(iomap->dax_dev, kaddr, PAGE_SIZE);
  1096. } else
  1097. dax_flush(iomap->dax_dev, kaddr + offset, size);
  1098. return ret;
  1099. }
  1100. static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero)
  1101. {
  1102. const struct iomap *iomap = &iter->iomap;
  1103. const struct iomap *srcmap = iomap_iter_srcmap(iter);
  1104. loff_t pos = iter->pos;
  1105. u64 length = iomap_length(iter);
  1106. s64 written = 0;
  1107. /* already zeroed? we're done. */
  1108. if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
  1109. return length;
  1110. do {
  1111. unsigned offset = offset_in_page(pos);
  1112. unsigned size = min_t(u64, PAGE_SIZE - offset, length);
  1113. pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
  1114. long rc;
  1115. int id;
  1116. id = dax_read_lock();
  1117. if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE)
  1118. rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1);
  1119. else
  1120. rc = dax_memzero(iter, pos, size);
  1121. dax_read_unlock(id);
  1122. if (rc < 0)
  1123. return rc;
  1124. pos += size;
  1125. length -= size;
  1126. written += size;
  1127. } while (length > 0);
  1128. if (did_zero)
  1129. *did_zero = true;
  1130. return written;
  1131. }
  1132. int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
  1133. const struct iomap_ops *ops)
  1134. {
  1135. struct iomap_iter iter = {
  1136. .inode = inode,
  1137. .pos = pos,
  1138. .len = len,
  1139. .flags = IOMAP_DAX | IOMAP_ZERO,
  1140. };
  1141. int ret;
  1142. while ((ret = iomap_iter(&iter, ops)) > 0)
  1143. iter.processed = dax_zero_iter(&iter, did_zero);
  1144. return ret;
  1145. }
  1146. EXPORT_SYMBOL_GPL(dax_zero_range);
  1147. int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
  1148. const struct iomap_ops *ops)
  1149. {
  1150. unsigned int blocksize = i_blocksize(inode);
  1151. unsigned int off = pos & (blocksize - 1);
  1152. /* Block boundary? Nothing to do */
  1153. if (!off)
  1154. return 0;
  1155. return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
  1156. }
  1157. EXPORT_SYMBOL_GPL(dax_truncate_page);
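For example, with a 4096-byte block size and pos = 0x1234, off is 0x234 and the call zeroes the remaining 4096 - 0x234 = 0xdcc bytes of that block; if pos is already block-aligned, off is 0 and the function returns without zeroing anything.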
  1158. static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
  1159. struct iov_iter *iter)
  1160. {
  1161. const struct iomap *iomap = &iomi->iomap;
  1162. const struct iomap *srcmap = &iomi->srcmap;
  1163. loff_t length = iomap_length(iomi);
  1164. loff_t pos = iomi->pos;
  1165. struct dax_device *dax_dev = iomap->dax_dev;
  1166. loff_t end = pos + length, done = 0;
  1167. bool write = iov_iter_rw(iter) == WRITE;
  1168. ssize_t ret = 0;
  1169. size_t xfer;
  1170. int id;
  1171. if (!write) {
  1172. end = min(end, i_size_read(iomi->inode));
  1173. if (pos >= end)
  1174. return 0;
  1175. if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN)
  1176. return iov_iter_zero(min(length, end - pos), iter);
  1177. }
  1178. /*
  1179. * In DAX mode, enforce either pure overwrites of written extents, or
  1180. * writes to unwritten extents as part of a copy-on-write operation.
  1181. */
  1182. if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED &&
  1183. !(iomap->flags & IOMAP_F_SHARED)))
  1184. return -EIO;
  1185. /*
  1186. * Write can allocate block for an area which has a hole page mapped
  1187. * into page tables. We have to tear down these mappings so that data
  1188. * written by write(2) is visible in mmap.
  1189. */
  1190. if (iomap->flags & IOMAP_F_NEW) {
  1191. invalidate_inode_pages2_range(iomi->inode->i_mapping,
  1192. pos >> PAGE_SHIFT,
  1193. (end - 1) >> PAGE_SHIFT);
  1194. }
  1195. id = dax_read_lock();
  1196. while (pos < end) {
  1197. unsigned offset = pos & (PAGE_SIZE - 1);
  1198. const size_t size = ALIGN(length + offset, PAGE_SIZE);
  1199. pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
  1200. ssize_t map_len;
  1201. bool recovery = false;
  1202. void *kaddr;
  1203. if (fatal_signal_pending(current)) {
  1204. ret = -EINTR;
  1205. break;
  1206. }
  1207. map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size),
  1208. DAX_ACCESS, &kaddr, NULL);
  1209. if (map_len == -EIO && iov_iter_rw(iter) == WRITE) {
  1210. map_len = dax_direct_access(dax_dev, pgoff,
  1211. PHYS_PFN(size), DAX_RECOVERY_WRITE,
  1212. &kaddr, NULL);
  1213. if (map_len > 0)
  1214. recovery = true;
  1215. }
  1216. if (map_len < 0) {
  1217. ret = map_len;
  1218. break;
  1219. }
  1220. if (write &&
  1221. srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
  1222. ret = dax_iomap_cow_copy(pos, length, PAGE_SIZE, srcmap,
  1223. kaddr);
  1224. if (ret)
  1225. break;
  1226. }
  1227. map_len = PFN_PHYS(map_len);
  1228. kaddr += offset;
  1229. map_len -= offset;
  1230. if (map_len > end - pos)
  1231. map_len = end - pos;
  1232. if (recovery)
  1233. xfer = dax_recovery_write(dax_dev, pgoff, kaddr,
  1234. map_len, iter);
  1235. else if (write)
  1236. xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr,
  1237. map_len, iter);
  1238. else
  1239. xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr,
  1240. map_len, iter);
  1241. pos += xfer;
  1242. length -= xfer;
  1243. done += xfer;
  1244. if (xfer == 0)
  1245. ret = -EFAULT;
  1246. if (xfer < map_len)
  1247. break;
  1248. }
  1249. dax_read_unlock(id);
  1250. return done ? done : ret;
  1251. }
  1252. /**
  1253. * dax_iomap_rw - Perform I/O to a DAX file
  1254. * @iocb: The control block for this I/O
  1255. * @iter: The addresses to do I/O from or to
  1256. * @ops: iomap ops passed from the file system
  1257. *
  1258. * This function performs read and write operations to directly mapped
  1259. * persistent memory. The caller needs to take care of read/write exclusion
  1260. * and evicting any page cache pages in the region under I/O.
  1261. */
  1262. ssize_t
  1263. dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
  1264. const struct iomap_ops *ops)
  1265. {
  1266. struct iomap_iter iomi = {
  1267. .inode = iocb->ki_filp->f_mapping->host,
  1268. .pos = iocb->ki_pos,
  1269. .len = iov_iter_count(iter),
  1270. .flags = IOMAP_DAX,
  1271. };
  1272. loff_t done = 0;
  1273. int ret;
  1274. if (!iomi.len)
  1275. return 0;
  1276. if (iov_iter_rw(iter) == WRITE) {
  1277. lockdep_assert_held_write(&iomi.inode->i_rwsem);
  1278. iomi.flags |= IOMAP_WRITE;
  1279. } else {
  1280. lockdep_assert_held(&iomi.inode->i_rwsem);
  1281. }
  1282. if (iocb->ki_flags & IOCB_NOWAIT)
  1283. iomi.flags |= IOMAP_NOWAIT;
  1284. while ((ret = iomap_iter(&iomi, ops)) > 0)
  1285. iomi.processed = dax_iomap_iter(&iomi, iter);
  1286. done = iomi.pos - iocb->ki_pos;
  1287. iocb->ki_pos = iomi.pos;
  1288. return done ? done : ret;
  1289. }
  1290. EXPORT_SYMBOL_GPL(dax_iomap_rw);
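The read/write exclusion mentioned above is the caller's responsibility. A minimal read-side caller, assuming a filesystem 'myfs' that supplies its own myfs_iomap_ops (modelled loosely on the ext2/xfs DAX read paths), might look like:

static ssize_t myfs_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!iov_iter_count(to))
		return 0;	/* skip atime update for zero-length reads */

	/* Shared i_rwsem satisfies the lockdep assertion in dax_iomap_rw(). */
	inode_lock_shared(inode);
	ret = dax_iomap_rw(iocb, to, &myfs_iomap_ops);
	inode_unlock_shared(inode);

	file_accessed(iocb->ki_filp);
	return ret;
}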
  1291. static vm_fault_t dax_fault_return(int error)
  1292. {
  1293. if (error == 0)
  1294. return VM_FAULT_NOPAGE;
  1295. return vmf_error(error);
  1296. }
  1297. /*
  1298. * When handling a synchronous page fault and the inode needs an fsync, we can
  1299. * insert the PTE/PMD into page tables only after that fsync happened. Skip
  1300. * insertion for now and return the pfn so that caller can insert it after the
  1301. * fsync is done.
  1302. */
  1303. static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
  1304. {
  1305. if (WARN_ON_ONCE(!pfnp))
  1306. return VM_FAULT_SIGBUS;
  1307. *pfnp = pfn;
  1308. return VM_FAULT_NEEDDSYNC;
  1309. }
  1310. static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
  1311. const struct iomap_iter *iter)
  1312. {
  1313. vm_fault_t ret;
  1314. int error = 0;
  1315. switch (iter->iomap.type) {
  1316. case IOMAP_HOLE:
  1317. case IOMAP_UNWRITTEN:
  1318. clear_user_highpage(vmf->cow_page, vmf->address);
  1319. break;
  1320. case IOMAP_MAPPED:
  1321. error = copy_cow_page_dax(vmf, iter);
  1322. break;
  1323. default:
  1324. WARN_ON_ONCE(1);
  1325. error = -EIO;
  1326. break;
  1327. }
  1328. if (error)
  1329. return dax_fault_return(error);
  1330. __SetPageUptodate(vmf->cow_page);
  1331. ret = finish_fault(vmf);
  1332. if (!ret)
  1333. return VM_FAULT_DONE_COW;
  1334. return ret;
  1335. }
  1336. /**
  1337. * dax_fault_iter - Common actor to handle pfn insertion in PTE/PMD fault.
  1338. * @vmf: vm fault instance
  1339. * @iter: iomap iter
  1340. * @pfnp: pfn to be returned
  1341. * @xas: the dax mapping tree of a file
  1342. * @entry: an unlocked dax entry to be inserted
  1343. * @pmd: distinguish whether it is a pmd fault
  1344. */
  1345. static vm_fault_t dax_fault_iter(struct vm_fault *vmf,
  1346. const struct iomap_iter *iter, pfn_t *pfnp,
  1347. struct xa_state *xas, void **entry, bool pmd)
  1348. {
  1349. const struct iomap *iomap = &iter->iomap;
  1350. const struct iomap *srcmap = &iter->srcmap;
  1351. size_t size = pmd ? PMD_SIZE : PAGE_SIZE;
  1352. loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT;
  1353. bool write = iter->flags & IOMAP_WRITE;
  1354. unsigned long entry_flags = pmd ? DAX_PMD : 0;
  1355. int err = 0;
  1356. pfn_t pfn;
  1357. void *kaddr;
  1358. if (!pmd && vmf->cow_page)
  1359. return dax_fault_cow_page(vmf, iter);
  1360. /* if we are reading UNWRITTEN or HOLE, return a hole. */
  1361. if (!write &&
  1362. (iomap->type == IOMAP_UNWRITTEN || iomap->type == IOMAP_HOLE)) {
  1363. if (!pmd)
  1364. return dax_load_hole(xas, vmf, iter, entry);
  1365. return dax_pmd_load_hole(xas, vmf, iter, entry);
  1366. }
  1367. if (iomap->type != IOMAP_MAPPED && !(iomap->flags & IOMAP_F_SHARED)) {
  1368. WARN_ON_ONCE(1);
  1369. return pmd ? VM_FAULT_FALLBACK : VM_FAULT_SIGBUS;
  1370. }
  1371. err = dax_iomap_direct_access(iomap, pos, size, &kaddr, &pfn);
  1372. if (err)
  1373. return pmd ? VM_FAULT_FALLBACK : dax_fault_return(err);
  1374. *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, entry_flags);
  1375. if (write &&
  1376. srcmap->type != IOMAP_HOLE && srcmap->addr != iomap->addr) {
  1377. err = dax_iomap_cow_copy(pos, size, size, srcmap, kaddr);
  1378. if (err)
  1379. return dax_fault_return(err);
  1380. }
  1381. if (dax_fault_is_synchronous(iter, vmf->vma))
  1382. return dax_fault_synchronous_pfnp(pfnp, pfn);
  1383. /* insert PMD pfn */
  1384. if (pmd)
  1385. return vmf_insert_pfn_pmd(vmf, pfn, write);
  1386. /* insert PTE pfn */
  1387. if (write)
  1388. return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
  1389. return vmf_insert_mixed(vmf->vma, vmf->address, pfn);
  1390. }

static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       int *iomap_errp, const struct iomap_ops *ops)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE(xas, &mapping->i_pages, vmf->pgoff);
	struct iomap_iter iter = {
		.inode = mapping->host,
		.pos = (loff_t)vmf->pgoff << PAGE_SHIFT,
		.len = PAGE_SIZE,
		.flags = IOMAP_DAX | IOMAP_FAULT,
	};
	vm_fault_t ret = 0;
	void *entry;
	int error;

	trace_dax_pte_fault(iter.inode, vmf, ret);
	/*
	 * Check whether offset isn't beyond end of file now. Caller is supposed
	 * to hold locks serializing us with truncate / punch hole so this is
	 * a reliable test.
	 */
	if (iter.pos >= i_size_read(iter.inode)) {
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
		iter.flags |= IOMAP_WRITE;

	entry = grab_mapping_entry(&xas, mapping, 0);
	if (xa_is_internal(entry)) {
		ret = xa_to_internal(entry);
		goto out;
	}

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PMD fault that overlaps with
	 * the PTE we need to set up. If so just return and the fault will be
	 * retried.
	 */
	if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) {
		ret = VM_FAULT_NOPAGE;
		goto unlock_entry;
	}

	while ((error = iomap_iter(&iter, ops)) > 0) {
		if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) {
			iter.processed = -EIO;	/* fs corruption? */
			continue;
		}

		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, false);
		if (ret != VM_FAULT_SIGBUS &&
		    (iter.iomap.flags & IOMAP_F_NEW)) {
			count_vm_event(PGMAJFAULT);
			count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
			ret |= VM_FAULT_MAJOR;
		}

		if (!(ret & VM_FAULT_ERROR))
			iter.processed = PAGE_SIZE;
	}

	if (iomap_errp)
		*iomap_errp = error;
	if (!ret && error)
		ret = dax_fault_return(error);

unlock_entry:
	dax_unlock_entry(&xas, entry);
out:
	trace_dax_pte_fault_done(iter.inode, vmf, ret);
	return ret;
}

#ifdef CONFIG_FS_DAX_PMD
static bool dax_fault_check_fallback(struct vm_fault *vmf, struct xa_state *xas,
		pgoff_t max_pgoff)
{
	unsigned long pmd_addr = vmf->address & PMD_MASK;
	bool write = vmf->flags & FAULT_FLAG_WRITE;

	/*
	 * Make sure that the faulting address's PMD offset (color) matches
	 * the PMD offset from the start of the file. This is necessary so
	 * that a PMD range in the page table overlaps exactly with a PMD
	 * range in the page cache.
	 */
	if ((vmf->pgoff & PG_PMD_COLOUR) !=
	    ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR))
		return true;
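
	/*
	 * Worked example (illustration only, assuming x86-64 with 4K pages
	 * and 2M PMDs, so PG_PMD_COLOUR == 511): a fault on file page offset
	 * 1024 has colour 0.  If the mmap() placed that offset at a virtual
	 * address whose page index (address >> PAGE_SHIFT) also has colour 0,
	 * the check above passes and a PMD mapping is possible.  If instead
	 * that address has colour 1, no 2M-aligned PMD in the page table can
	 * cover the corresponding 2M-aligned file range, so we fall back.
	 */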

	/* Fall back to PTEs if we're going to COW */
	if (write && !(vmf->vma->vm_flags & VM_SHARED))
		return true;

	/* If the PMD would extend outside the VMA */
	if (pmd_addr < vmf->vma->vm_start)
		return true;
	if ((pmd_addr + PMD_SIZE) > vmf->vma->vm_end)
		return true;

	/* If the PMD would extend beyond the file size */
	if ((xas->xa_index | PG_PMD_COLOUR) >= max_pgoff)
		return true;

	return false;
}

static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       const struct iomap_ops *ops)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER);
	struct iomap_iter iter = {
		.inode = mapping->host,
		.len = PMD_SIZE,
		.flags = IOMAP_DAX | IOMAP_FAULT,
	};
	vm_fault_t ret = VM_FAULT_FALLBACK;
	pgoff_t max_pgoff;
	void *entry;
	int error;

	if (vmf->flags & FAULT_FLAG_WRITE)
		iter.flags |= IOMAP_WRITE;

	/*
	 * Check whether offset isn't beyond end of file now. Caller is
	 * supposed to hold locks serializing us with truncate / punch hole so
	 * this is a reliable test.
	 */
	max_pgoff = DIV_ROUND_UP(i_size_read(iter.inode), PAGE_SIZE);

	trace_dax_pmd_fault(iter.inode, vmf, max_pgoff, 0);

	if (xas.xa_index >= max_pgoff) {
		ret = VM_FAULT_SIGBUS;
		goto out;
	}

	if (dax_fault_check_fallback(vmf, &xas, max_pgoff))
		goto fallback;

	/*
	 * grab_mapping_entry() will make sure we get an empty PMD entry,
	 * a zero PMD entry or a DAX PMD. If it can't (because a PTE
	 * entry is already in the array, for instance), it will return
	 * VM_FAULT_FALLBACK.
	 */
	entry = grab_mapping_entry(&xas, mapping, PMD_ORDER);
	if (xa_is_internal(entry)) {
		ret = xa_to_internal(entry);
		goto fallback;
	}

	/*
	 * It is possible, particularly with mixed reads & writes to private
	 * mappings, that we have raced with a PTE fault that overlaps with
	 * the PMD we need to set up. If so just return and the fault will be
	 * retried.
	 */
	if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) &&
	    !pmd_devmap(*vmf->pmd)) {
		ret = 0;
		goto unlock_entry;
	}

	iter.pos = (loff_t)xas.xa_index << PAGE_SHIFT;
	while ((error = iomap_iter(&iter, ops)) > 0) {
		if (iomap_length(&iter) < PMD_SIZE)
			continue; /* actually breaks out of the loop */

		ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true);
		if (ret != VM_FAULT_FALLBACK)
			iter.processed = PMD_SIZE;
	}

unlock_entry:
	dax_unlock_entry(&xas, entry);
fallback:
	if (ret == VM_FAULT_FALLBACK) {
		split_huge_pmd(vmf->vma, vmf->pmd, vmf->address);
		count_vm_event(THP_FAULT_FALLBACK);
	}
out:
	trace_dax_pmd_fault_done(iter.inode, vmf, max_pgoff, ret);
	return ret;
}
#else
static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
			       const struct iomap_ops *ops)
{
	return VM_FAULT_FALLBACK;
}
#endif /* CONFIG_FS_DAX_PMD */

/**
 * dax_iomap_fault - handle a page fault on a DAX file
 * @vmf: The description of the fault
 * @pe_size: Size of the page to fault in
 * @pfnp: PFN to insert for synchronous faults if fsync is required
 * @iomap_errp: Storage for detailed error code in case of error
 * @ops: Iomap ops passed from the file system
 *
 * When a page fault occurs, filesystems may call this helper in
 * their fault handler for DAX files. dax_iomap_fault() assumes the caller
 * has done all the necessary locking for the page fault to proceed
 * successfully.
 */
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
		    pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
{
	switch (pe_size) {
	case PE_SIZE_PTE:
		return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
	case PE_SIZE_PMD:
		return dax_iomap_pmd_fault(vmf, pfnp, ops);
	default:
		return VM_FAULT_FALLBACK;
	}
}
EXPORT_SYMBOL_GPL(dax_iomap_fault);
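
/*
 * Example (illustrative sketch, not part of this file): a minimal DAX-aware
 * ->huge_fault handler, modelled loosely on the ext4/XFS callers.  Names
 * prefixed with foo_ are hypothetical placeholders, and the exact locking is
 * filesystem specific.
 *
 *	static vm_fault_t foo_dax_huge_fault(struct vm_fault *vmf,
 *					     enum page_entry_size pe_size)
 *	{
 *		struct inode *inode = file_inode(vmf->vma->vm_file);
 *		bool write = vmf->flags & FAULT_FLAG_WRITE;
 *		pfn_t pfn;
 *		vm_fault_t ret;
 *
 *		if (write) {
 *			sb_start_pagefault(inode->i_sb);
 *			file_update_time(vmf->vma->vm_file);
 *		}
 *		filemap_invalidate_lock_shared(inode->i_mapping);
 *		ret = dax_iomap_fault(vmf, pe_size, &pfn, NULL, &foo_iomap_ops);
 *		filemap_invalidate_unlock_shared(inode->i_mapping);
 *		if (ret & VM_FAULT_NEEDDSYNC)
 *			ret = dax_finish_sync_fault(vmf, pe_size, pfn);
 *		if (write)
 *			sb_end_pagefault(inode->i_sb);
 *		return ret;
 *	}
 */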

/*
 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
 * @vmf: The description of the fault
 * @pfn: PFN to insert
 * @order: Order of entry to insert.
 *
 * This function inserts a writeable PTE or PMD entry into the page tables
 * for an mmapped DAX file. It also marks the page cache entry as dirty.
 */
static vm_fault_t
dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order)
{
	struct address_space *mapping = vmf->vma->vm_file->f_mapping;
	XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order);
	void *entry;
	vm_fault_t ret;

	xas_lock_irq(&xas);
	entry = get_unlocked_entry(&xas, order);
	/* Did we race with someone splitting the entry or similar? */
	if (!entry || dax_is_conflict(entry) ||
	    (order == 0 && !dax_is_pte_entry(entry))) {
		put_unlocked_entry(&xas, entry, WAKE_NEXT);
		xas_unlock_irq(&xas);
		trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
						      VM_FAULT_NOPAGE);
		return VM_FAULT_NOPAGE;
	}
	xas_set_mark(&xas, PAGECACHE_TAG_DIRTY);
	dax_lock_entry(&xas, entry);
	xas_unlock_irq(&xas);
	if (order == 0)
		ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
#ifdef CONFIG_FS_DAX_PMD
	else if (order == PMD_ORDER)
		ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE);
#endif
	else
		ret = VM_FAULT_FALLBACK;
	dax_unlock_entry(&xas, entry);
	trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret);
	return ret;
}

/**
 * dax_finish_sync_fault - finish synchronous page fault
 * @vmf: The description of the fault
 * @pe_size: Size of entry to be inserted
 * @pfn: PFN to insert
 *
 * This function ensures that the file range touched by the page fault is
 * stored persistently on the media and then inserts the appropriate page
 * table entry.
 */
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
		enum page_entry_size pe_size, pfn_t pfn)
{
	int err;
	loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
	unsigned int order = pe_order(pe_size);
	size_t len = PAGE_SIZE << order;

	err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
	if (err)
		return VM_FAULT_SIGBUS;
	return dax_insert_pfn_mkwrite(vmf, pfn, order);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);

static loff_t dax_range_compare_iter(struct iomap_iter *it_src,
		struct iomap_iter *it_dest, u64 len, bool *same)
{
	const struct iomap *smap = &it_src->iomap;
	const struct iomap *dmap = &it_dest->iomap;
	loff_t pos1 = it_src->pos, pos2 = it_dest->pos;
	void *saddr, *daddr;
	int id, ret;

	len = min(len, min(smap->length, dmap->length));

	if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) {
		*same = true;
		return len;
	}

	if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) {
		*same = false;
		return 0;
	}

	id = dax_read_lock();
	ret = dax_iomap_direct_access(smap, pos1, ALIGN(pos1 + len, PAGE_SIZE),
				      &saddr, NULL);
	if (ret < 0)
		goto out_unlock;

	ret = dax_iomap_direct_access(dmap, pos2, ALIGN(pos2 + len, PAGE_SIZE),
				      &daddr, NULL);
	if (ret < 0)
		goto out_unlock;

	*same = !memcmp(saddr, daddr, len);
	if (!*same)
		len = 0;
	dax_read_unlock(id);
	return len;

out_unlock:
	dax_read_unlock(id);
	return -EIO;
}

int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
		struct inode *dst, loff_t dstoff, loff_t len, bool *same,
		const struct iomap_ops *ops)
{
	struct iomap_iter src_iter = {
		.inode = src,
		.pos = srcoff,
		.len = len,
		.flags = IOMAP_DAX,
	};
	struct iomap_iter dst_iter = {
		.inode = dst,
		.pos = dstoff,
		.len = len,
		.flags = IOMAP_DAX,
	};
	int ret;

	while ((ret = iomap_iter(&src_iter, ops)) > 0) {
		while ((ret = iomap_iter(&dst_iter, ops)) > 0) {
			dst_iter.processed = dax_range_compare_iter(&src_iter,
						&dst_iter, len, same);
		}
		if (ret <= 0)
			src_iter.processed = ret;
	}
	return ret;
}

int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
			      struct file *file_out, loff_t pos_out,
			      loff_t *len, unsigned int remap_flags,
			      const struct iomap_ops *ops)
{
	return __generic_remap_file_range_prep(file_in, pos_in, file_out,
					       pos_out, len, remap_flags, ops);
}
EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
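
/*
 * Example (illustrative sketch, not part of this file): a filesystem that
 * supports reflink/dedupe on DAX files typically selects this helper instead
 * of generic_remap_file_range_prep() when the inodes are DAX, roughly as in
 * the XFS caller; foo_read_iomap_ops is a hypothetical name.  Passing iomap
 * ops here means a dedupe request compares the two ranges through
 * dax_dedupe_file_range_compare() above, i.e. by accessing the DAX device
 * directly rather than going through the page cache.
 *
 *	if (IS_DAX(file_inode(file_in)))
 *		ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
 *				pos_out, len, remap_flags,
 *				&foo_read_iomap_ops);
 *	else
 *		ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
 *				pos_out, len, remap_flags);
 */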