// SPDX-License-Identifier: GPL-2.0-only
/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations. Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
 *
 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <[email protected]>
 */

#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
#include <linux/pagevec.h>
#include <linux/sched/mm.h>
#include <trace/events/block.h>
#include <linux/fscrypt.h>
#include <linux/fsverity.h>
#include "internal.h"

static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
                          struct writeback_control *wbc);

#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

inline void touch_buffer(struct buffer_head *bh)
{
        trace_block_touch_buffer(bh);
        mark_page_accessed(bh->b_page);
}
EXPORT_SYMBOL(touch_buffer);

void __lock_buffer(struct buffer_head *bh)
{
        wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

void unlock_buffer(struct buffer_head *bh)
{
        clear_bit_unlock(BH_Lock, &bh->b_state);
        smp_mb__after_atomic();
        wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);

/*
 * Returns if the folio has dirty or writeback buffers. If all the buffers
 * are unlocked and clean then the folio_test_dirty information is stale. If
 * any of the buffers are locked, it is assumed they are locked for IO.
 */
void buffer_check_dirty_writeback(struct folio *folio,
                                  bool *dirty, bool *writeback)
{
        struct buffer_head *head, *bh;
        *dirty = false;
        *writeback = false;
        BUG_ON(!folio_test_locked(folio));
        head = folio_buffers(folio);
        if (!head)
                return;
        if (folio_test_writeback(folio))
                *writeback = true;
        bh = head;
        do {
                if (buffer_locked(bh))
                        *writeback = true;
                if (buffer_dirty(bh))
                        *dirty = true;
                bh = bh->b_this_page;
        } while (bh != head);
}
EXPORT_SYMBOL(buffer_check_dirty_writeback);

/*
 * Block until a buffer comes unlocked. This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 */
void __wait_on_buffer(struct buffer_head * bh)
{
        wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);

static void buffer_io_error(struct buffer_head *bh, char *msg)
{
        if (!test_bit(BH_Quiet, &bh->b_state))
                printk_ratelimited(KERN_ERR
                        "Buffer I/O error on dev %pg, logical block %llu%s\n",
                        bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
}

/*
 * End-of-IO handler helper function which does not touch the bh after
 * unlocking it.
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                /* This happens, due to failed read-ahead attempts. */
                clear_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
}

/*
 * Default synchronous end-of-IO handler.. Just mark it up-to-date and
 * unlock the buffer.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
        __end_buffer_read_notouch(bh, uptodate);
        put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);

void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                buffer_io_error(bh, ", lost sync page write");
                mark_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
        }
        unlock_buffer(bh);
        put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);

/*
 * Various filesystems appear to want __find_get_block to be non-blocking.
 * But it's the page lock which protects the buffers. To get around this,
 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 * private_lock.
 *
 * Hack idea: for the blockdev mapping, private_lock contention
 * may be quite high. This code could TryLock the page, and if that
 * succeeds, there is no need to take private_lock.
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
        struct inode *bd_inode = bdev->bd_inode;
        struct address_space *bd_mapping = bd_inode->i_mapping;
        struct buffer_head *ret = NULL;
        pgoff_t index;
        struct buffer_head *bh;
        struct buffer_head *head;
        struct page *page;
        int all_mapped = 1;
        static DEFINE_RATELIMIT_STATE(last_warned, HZ, 1);

        index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
        page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
        if (!page)
                goto out;

        spin_lock(&bd_mapping->private_lock);
        if (!page_has_buffers(page))
                goto out_unlock;
        head = page_buffers(page);
        bh = head;
        do {
                if (!buffer_mapped(bh))
                        all_mapped = 0;
                else if (bh->b_blocknr == block) {
                        ret = bh;
                        get_bh(bh);
                        goto out_unlock;
                }
                bh = bh->b_this_page;
        } while (bh != head);

        /* we might be here because some of the buffers on this page are
         * not mapped. This is due to various races between
         * file io on the block device and getblk. It gets dealt with
         * elsewhere, don't buffer_error if we had some unmapped buffers
         */
        ratelimit_set_flags(&last_warned, RATELIMIT_MSG_ON_RELEASE);
        if (all_mapped && __ratelimit(&last_warned)) {
                printk("__find_get_block_slow() failed. block=%llu, "
                       "b_blocknr=%llu, b_state=0x%08lx, b_size=%zu, "
                       "device %pg blocksize: %d\n",
                       (unsigned long long)block,
                       (unsigned long long)bh->b_blocknr,
                       bh->b_state, bh->b_size, bdev,
                       1 << bd_inode->i_blkbits);
        }
out_unlock:
        spin_unlock(&bd_mapping->private_lock);
        put_page(page);
out:
        return ret;
}

static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct page *page;
        int page_uptodate = 1;

        BUG_ON(!buffer_async_read(bh));

        page = bh->b_page;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                clear_buffer_uptodate(bh);
                buffer_io_error(bh, ", async page read");
                SetPageError(page);
        }

        /*
         * Be _very_ careful from here on. Bad things can happen if
         * two buffer heads end IO at almost the same time and both
         * decide that the page is now completely done.
         */
        first = page_buffers(page);
        spin_lock_irqsave(&first->b_uptodate_lock, flags);
        clear_buffer_async_read(bh);
        unlock_buffer(bh);
        tmp = bh;
        do {
                if (!buffer_uptodate(tmp))
                        page_uptodate = 0;
                if (buffer_async_read(tmp)) {
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        } while (tmp != bh);
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);

        /*
         * If all of the buffers are uptodate then we can set the page
         * uptodate.
         */
        if (page_uptodate)
                SetPageUptodate(page);
        unlock_page(page);
        return;

still_busy:
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        return;
}

struct postprocess_bh_ctx {
        struct work_struct work;
        struct buffer_head *bh;
};

static void verify_bh(struct work_struct *work)
{
        struct postprocess_bh_ctx *ctx =
                container_of(work, struct postprocess_bh_ctx, work);
        struct buffer_head *bh = ctx->bh;
        bool valid;

        valid = fsverity_verify_blocks(page_folio(bh->b_page), bh->b_size,
                                       bh_offset(bh));
        end_buffer_async_read(bh, valid);
        kfree(ctx);
}

static bool need_fsverity(struct buffer_head *bh)
{
        struct page *page = bh->b_page;
        struct inode *inode = page->mapping->host;

        return fsverity_active(inode) &&
                /* needed by ext4 */
                page->index < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
}

static void decrypt_bh(struct work_struct *work)
{
        struct postprocess_bh_ctx *ctx =
                container_of(work, struct postprocess_bh_ctx, work);
        struct buffer_head *bh = ctx->bh;
        int err;

        err = fscrypt_decrypt_pagecache_blocks(page_folio(bh->b_page),
                                               bh->b_size, bh_offset(bh));
        if (err == 0 && need_fsverity(bh)) {
                /*
                 * We use different work queues for decryption and for verity
                 * because verity may require reading metadata pages that need
                 * decryption, and we shouldn't recurse to the same workqueue.
                 */
                INIT_WORK(&ctx->work, verify_bh);
                fsverity_enqueue_verify_work(&ctx->work);
                return;
        }
        end_buffer_async_read(bh, err == 0);
        kfree(ctx);
}

/*
 * I/O completion handler for block_read_full_folio() - pages
 * which come unlocked at the end of I/O.
 */
static void end_buffer_async_read_io(struct buffer_head *bh, int uptodate)
{
        struct inode *inode = bh->b_page->mapping->host;
        bool decrypt = fscrypt_inode_uses_fs_layer_crypto(inode);
        bool verify = need_fsverity(bh);

        /* Decrypt (with fscrypt) and/or verify (with fsverity) if needed. */
        if (uptodate && (decrypt || verify)) {
                struct postprocess_bh_ctx *ctx =
                        kmalloc(sizeof(*ctx), GFP_ATOMIC);

                if (ctx) {
                        ctx->bh = bh;
                        if (decrypt) {
                                INIT_WORK(&ctx->work, decrypt_bh);
                                fscrypt_enqueue_decrypt_work(&ctx->work);
                        } else {
                                INIT_WORK(&ctx->work, verify_bh);
                                fsverity_enqueue_verify_work(&ctx->work);
                        }
                        return;
                }
                uptodate = 0;
        }
        end_buffer_async_read(bh, uptodate);
}

/*
 * Completion handler for block_write_full_page() - pages which are unlocked
 * during I/O, and which have PageWriteback cleared upon I/O completion.
 */
void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
        unsigned long flags;
        struct buffer_head *first;
        struct buffer_head *tmp;
        struct page *page;

        BUG_ON(!buffer_async_write(bh));

        page = bh->b_page;
        if (uptodate) {
                set_buffer_uptodate(bh);
        } else {
                buffer_io_error(bh, ", lost async page write");
                mark_buffer_write_io_error(bh);
                clear_buffer_uptodate(bh);
                SetPageError(page);
        }

        first = page_buffers(page);
        spin_lock_irqsave(&first->b_uptodate_lock, flags);
        clear_buffer_async_write(bh);
        unlock_buffer(bh);
        tmp = bh->b_this_page;
        while (tmp != bh) {
                if (buffer_async_write(tmp)) {
                        BUG_ON(!buffer_locked(tmp));
                        goto still_busy;
                }
                tmp = tmp->b_this_page;
        }
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        end_page_writeback(page);
        return;

still_busy:
        spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
        return;
}
EXPORT_SYMBOL(end_buffer_async_write);

/*
 * If a page's buffers are under async read-in (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed. This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async I/O reads against any of
 * the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
        bh->b_end_io = end_buffer_async_read_io;
        set_buffer_async_read(bh);
}

static void mark_buffer_async_write_endio(struct buffer_head *bh,
                                          bh_end_io_t *handler)
{
        bh->b_end_io = handler;
        set_buffer_async_write(bh);
}

void mark_buffer_async_write(struct buffer_head *bh)
{
        mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);

/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions. A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync(). For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed. But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for private_list is via the private_lock in the address_space
 * which backs the buffers. Which is different from the address_space
 * against which the buffers are listed. So for a particular address_space,
 * mapping->private_lock does *not* protect mapping->private_list! In fact,
 * mapping->private_list will always be protected by the backing blockdev's
 * ->private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->private_list via these
 * utility functions are free to use private_lock and private_list for
 * whatever they want. The only requirement is that list_empty(private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
 * filesystems should do that. invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
 * take an address_space, not an inode. And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list. Because if the buffer is on a list,
 * it *must* already be on the right one. If not, the filesystem is being
 * silly. This will save a ton of locking. But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate). That requires careful auditing of all
 * filesystems (do it inside bforget()). It could also be done by bringing
 * b_inode back.
 */

/*
 * The buffer's backing address_space's private_lock must be held
 */
static void __remove_assoc_queue(struct buffer_head *bh)
{
        list_del_init(&bh->b_assoc_buffers);
        WARN_ON(!bh->b_assoc_map);
        bh->b_assoc_map = NULL;
}

int inode_has_buffers(struct inode *inode)
{
        return !list_empty(&inode->i_data.private_list);
}

/*
 * osync is designed to support O_SYNC io. It waits synchronously for
 * all already-submitted IO to complete, but does not queue any new
 * writes to the disk.
 *
 * To do O_SYNC writes, just queue the buffer writes with write_dirty_buffer
 * as you dirty the buffers, and then use osync_inode_buffers to wait for
 * completion. Any other dirty buffers which are not yet queued for
 * write will not be flushed to disk by the osync.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
        struct buffer_head *bh;
        struct list_head *p;
        int err = 0;

        spin_lock(lock);
repeat:
        list_for_each_prev(p, list) {
                bh = BH_ENTRY(p);
                if (buffer_locked(bh)) {
                        get_bh(bh);
                        spin_unlock(lock);
                        wait_on_buffer(bh);
                        if (!buffer_uptodate(bh))
                                err = -EIO;
                        brelse(bh);
                        spin_lock(lock);
                        goto repeat;
                }
        }
        spin_unlock(lock);
        return err;
}

void emergency_thaw_bdev(struct super_block *sb)
{
        while (sb->s_bdev && !thaw_bdev(sb->s_bdev))
                printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
}

/**
 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 * @mapping: the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->private_list, and waits upon
 * that I/O.
 *
 * Basically, this is a convenience function for fsync().
 * @mapping is a file or directory which needs those buffers to be written for
 * a successful fsync().
 */
int sync_mapping_buffers(struct address_space *mapping)
{
        struct address_space *buffer_mapping = mapping->private_data;

        if (buffer_mapping == NULL || list_empty(&mapping->private_list))
                return 0;

        return fsync_buffers_list(&buffer_mapping->private_lock,
                                  &mapping->private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);
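
/*
 * Illustrative sketch (not part of buffer.c): how a simple buffer-backed
 * filesystem's ->fsync() method might use sync_mapping_buffers(). The helper
 * name example_fsync() is hypothetical; the overall shape follows the common
 * pattern of writing the file's own pagecache first, then the "associated"
 * metadata buffers queued with mark_buffer_dirty_inode(). Treat this as an
 * assumption-laden example, not a drop-in implementation.
 */
static int example_fsync(struct file *file, loff_t start, loff_t end,
                         int datasync)
{
        struct inode *inode = file->f_mapping->host;
        int err, ret;

        /* Write back and wait on the file's own pagecache pages first. */
        err = file_write_and_wait_range(file, start, end);
        if (err)
                return err;

        /*
         * Then push out the associated metadata buffers (e.g. ext2-style
         * indirect blocks) sitting on ->private_list.
         */
        ret = sync_mapping_buffers(inode->i_mapping);
        return ret ? ret : err;
}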

/*
 * Called when we've recently written block `bblock', and it is known that
 * `bblock' was for a buffer_boundary() buffer. This means that the block at
 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
 * dirty, schedule it for IO. So that indirects merge nicely with their data.
 */
void write_boundary_block(struct block_device *bdev,
                          sector_t bblock, unsigned blocksize)
{
        struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);

        if (bh) {
                if (buffer_dirty(bh))
                        write_dirty_buffer(bh, 0);
                put_bh(bh);
        }
}

void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
        struct address_space *mapping = inode->i_mapping;
        struct address_space *buffer_mapping = bh->b_page->mapping;

        mark_buffer_dirty(bh);
        if (!mapping->private_data) {
                mapping->private_data = buffer_mapping;
        } else {
                BUG_ON(mapping->private_data != buffer_mapping);
        }
        if (!bh->b_assoc_map) {
                spin_lock(&buffer_mapping->private_lock);
                list_move_tail(&bh->b_assoc_buffers,
                               &mapping->private_list);
                bh->b_assoc_map = mapping;
                spin_unlock(&buffer_mapping->private_lock);
        }
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);
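
/*
 * Illustrative sketch (hypothetical helper, not a buffer.c API): updating an
 * on-disk metadata block and attaching it to the owning inode's
 * ->private_list with mark_buffer_dirty_inode(), so that a later
 * sync_mapping_buffers()/fsync() writes and waits on it. The block number,
 * slot index and 32-bit value are made-up example parameters.
 */
static int example_update_indirect(struct inode *inode, sector_t blocknr,
                                   unsigned int slot, __le32 value)
{
        struct buffer_head *bh;

        bh = sb_bread(inode->i_sb, blocknr);
        if (!bh)
                return -EIO;

        ((__le32 *)bh->b_data)[slot] = value;   /* modify the metadata block */
        mark_buffer_dirty_inode(bh, inode);     /* queue it for this inode's fsync */
        brelse(bh);
        return 0;
}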

/*
 * Add a page to the dirty page list.
 *
 * It is a sad fact of life that this function is called from several places
 * deeply under spinlocking. It may not sleep.
 *
 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 * dirty-state coherency between the page and the buffers. If the page does
 * not have buffers then when they are later attached they will all be set
 * dirty.
 *
 * The buffers are dirtied before the page is dirtied. There's a small race
 * window in which a writepage caller may see the page cleanness but not the
 * buffer dirtiness. That's fine. If this code were to set the page dirty
 * before the buffers, a concurrent writepage caller could clear the page dirty
 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 * page on the dirty page list.
 *
 * We use private_lock to lock against try_to_free_buffers while using the
 * page's buffer list. Also use this to protect against clean buffers being
 * added to the page after it was set dirty.
 *
 * FIXME: may need to call ->reservepage here as well. That's rather up to the
 * address_space though.
 */
bool block_dirty_folio(struct address_space *mapping, struct folio *folio)
{
        struct buffer_head *head;
        bool newly_dirty;

        spin_lock(&mapping->private_lock);
        head = folio_buffers(folio);
        if (head) {
                struct buffer_head *bh = head;

                do {
                        set_buffer_dirty(bh);
                        bh = bh->b_this_page;
                } while (bh != head);
        }
        /*
         * Lock out page's memcg migration to keep PageDirty
         * synchronized with per-memcg dirty page counters.
         */
        folio_memcg_lock(folio);
        newly_dirty = !folio_test_set_dirty(folio);
        spin_unlock(&mapping->private_lock);

        if (newly_dirty)
                __folio_mark_dirty(folio, mapping, 1);

        folio_memcg_unlock(folio);

        if (newly_dirty)
                __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

        return newly_dirty;
}
EXPORT_SYMBOL(block_dirty_folio);

/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't. After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go. Then we clean
 * up, waiting for those writes to complete.
 *
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list. So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
        struct buffer_head *bh;
        struct list_head tmp;
        struct address_space *mapping;
        int err = 0, err2;
        struct blk_plug plug;

        INIT_LIST_HEAD(&tmp);
        blk_start_plug(&plug);

        spin_lock(lock);
        while (!list_empty(list)) {
                bh = BH_ENTRY(list->next);
                mapping = bh->b_assoc_map;
                __remove_assoc_queue(bh);
                /* Avoid race with mark_buffer_dirty_inode() which does
                 * a lockless check and we rely on seeing the dirty bit */
                smp_mb();
                if (buffer_dirty(bh) || buffer_locked(bh)) {
                        list_add(&bh->b_assoc_buffers, &tmp);
                        bh->b_assoc_map = mapping;
                        if (buffer_dirty(bh)) {
                                get_bh(bh);
                                spin_unlock(lock);
                                /*
                                 * Ensure any pending I/O completes so that
                                 * write_dirty_buffer() actually writes the
                                 * current contents - it is a noop if I/O is
                                 * still in flight on potentially older
                                 * contents.
                                 */
                                write_dirty_buffer(bh, REQ_SYNC);
                                /*
                                 * Kick off IO for the previous mapping. Note
                                 * that we will not run the very last mapping,
                                 * wait_on_buffer() will do that for us
                                 * through sync_buffer().
                                 */
                                brelse(bh);
                                spin_lock(lock);
                        }
                }
        }

        spin_unlock(lock);
        blk_finish_plug(&plug);
        spin_lock(lock);

        while (!list_empty(&tmp)) {
                bh = BH_ENTRY(tmp.prev);
                get_bh(bh);
                mapping = bh->b_assoc_map;
                __remove_assoc_queue(bh);
                /* Avoid race with mark_buffer_dirty_inode() which does
                 * a lockless check and we rely on seeing the dirty bit */
                smp_mb();
                if (buffer_dirty(bh)) {
                        list_add(&bh->b_assoc_buffers,
                                 &mapping->private_list);
                        bh->b_assoc_map = mapping;
                }
                spin_unlock(lock);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh))
                        err = -EIO;
                brelse(bh);
                spin_lock(lock);
        }

        spin_unlock(lock);
        err2 = osync_buffers_list(lock, list);
        if (err)
                return err;
        else
                return err2;
}

/*
 * Invalidate any and all dirty buffers on a given inode. We are
 * probably unmounting the fs, but that doesn't mean we have already
 * done a sync(). Just drop the buffers from the inode list.
 *
 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
 * assumes that all the buffers are against the blockdev. Not true
 * for reiserfs.
 */
void invalidate_inode_buffers(struct inode *inode)
{
        if (inode_has_buffers(inode)) {
                struct address_space *mapping = &inode->i_data;
                struct list_head *list = &mapping->private_list;
                struct address_space *buffer_mapping = mapping->private_data;

                spin_lock(&buffer_mapping->private_lock);
                while (!list_empty(list))
                        __remove_assoc_queue(BH_ENTRY(list->next));
                spin_unlock(&buffer_mapping->private_lock);
        }
}
EXPORT_SYMBOL(invalidate_inode_buffers);

/*
 * Remove any clean buffers from the inode's buffer list. This is called
 * when we're trying to free the inode itself. Those buffers can pin it.
 *
 * Returns true if all buffers were removed.
 */
int remove_inode_buffers(struct inode *inode)
{
        int ret = 1;

        if (inode_has_buffers(inode)) {
                struct address_space *mapping = &inode->i_data;
                struct list_head *list = &mapping->private_list;
                struct address_space *buffer_mapping = mapping->private_data;

                spin_lock(&buffer_mapping->private_lock);
                while (!list_empty(list)) {
                        struct buffer_head *bh = BH_ENTRY(list->next);

                        if (buffer_dirty(bh)) {
                                ret = 0;
                                break;
                        }
                        __remove_assoc_queue(bh);
                }
                spin_unlock(&buffer_mapping->private_lock);
        }
        return ret;
}

/*
 * Create the appropriate buffers when given a page for data area and
 * the size of each buffer.. Use the bh->b_this_page linked list to
 * follow the buffers created. Return NULL if unable to create more
 * buffers.
 *
 * The retry flag is used to differentiate async IO (paging, swapping)
 * which may not fail from ordinary buffer allocations.
 */
struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
                                       bool retry)
{
        struct buffer_head *bh, *head;
        gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT;
        long offset;
        struct mem_cgroup *memcg, *old_memcg;

        if (retry)
                gfp |= __GFP_NOFAIL;

        /* The page lock pins the memcg */
        memcg = page_memcg(page);
        old_memcg = set_active_memcg(memcg);

        head = NULL;
        offset = PAGE_SIZE;
        while ((offset -= size) >= 0) {
                bh = alloc_buffer_head(gfp);
                if (!bh)
                        goto no_grow;

                bh->b_this_page = head;
                bh->b_blocknr = -1;
                head = bh;

                bh->b_size = size;

                /* Link the buffer to its page */
                set_bh_page(bh, page, offset);
        }
out:
        set_active_memcg(old_memcg);
        return head;
/*
 * In case anything failed, we just free everything we got.
 */
no_grow:
        if (head) {
                do {
                        bh = head;
                        head = head->b_this_page;
                        free_buffer_head(bh);
                } while (head);
        }

        goto out;
}
EXPORT_SYMBOL_GPL(alloc_page_buffers);

static inline void
link_dev_buffers(struct page *page, struct buffer_head *head)
{
        struct buffer_head *bh, *tail;

        bh = head;
        do {
                tail = bh;
                bh = bh->b_this_page;
        } while (bh);
        tail->b_this_page = head;
        attach_page_private(page, head);
}

static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
{
        sector_t retval = ~((sector_t)0);
        loff_t sz = bdev_nr_bytes(bdev);

        if (sz) {
                unsigned int sizebits = blksize_bits(size);

                retval = (sz >> sizebits);
        }
        return retval;
}

/*
 * Initialise the state of a blockdev page's buffers.
 */
static sector_t
init_page_buffers(struct page *page, struct block_device *bdev,
                  sector_t block, int size)
{
        struct buffer_head *head = page_buffers(page);
        struct buffer_head *bh = head;
        int uptodate = PageUptodate(page);
        sector_t end_block = blkdev_max_block(bdev, size);

        do {
                if (!buffer_mapped(bh)) {
                        bh->b_end_io = NULL;
                        bh->b_private = NULL;
                        bh->b_bdev = bdev;
                        bh->b_blocknr = block;
                        if (uptodate)
                                set_buffer_uptodate(bh);
                        if (block < end_block)
                                set_buffer_mapped(bh);
                }
                block++;
                bh = bh->b_this_page;
        } while (bh != head);

        /*
         * Caller needs to validate requested block against end of device.
         */
        return end_block;
}

/*
 * Create the page-cache page that contains the requested block.
 *
 * This is used purely for blockdev mappings.
 */
static int
grow_dev_page(struct block_device *bdev, sector_t block,
              pgoff_t index, int size, int sizebits, gfp_t gfp)
{
        struct inode *inode = bdev->bd_inode;
        struct page *page;
        struct buffer_head *bh;
        sector_t end_block;
        int ret = 0;
        gfp_t gfp_mask;

        gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp;

        /*
         * XXX: __getblk_slow() can not really deal with failure and
         * will endlessly loop on improvised global reclaim. Prefer
         * looping in the allocator rather than here, at least that
         * code knows what it's doing.
         */
        gfp_mask |= __GFP_NOFAIL;

        page = find_or_create_page(inode->i_mapping, index, gfp_mask);

        BUG_ON(!PageLocked(page));

        if (page_has_buffers(page)) {
                bh = page_buffers(page);
                if (bh->b_size == size) {
                        end_block = init_page_buffers(page, bdev,
                                                (sector_t)index << sizebits,
                                                size);
                        goto done;
                }
                if (!try_to_free_buffers(page_folio(page)))
                        goto failed;
        }

        /*
         * Allocate some buffers for this page
         */
        bh = alloc_page_buffers(page, size, true);

        /*
         * Link the page to the buffers and initialise them. Take the
         * lock to be atomic wrt __find_get_block(), which does not
         * run under the page lock.
         */
        spin_lock(&inode->i_mapping->private_lock);
        link_dev_buffers(page, bh);
        end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
                                      size);
        spin_unlock(&inode->i_mapping->private_lock);
done:
        ret = (block < end_block) ? 1 : -ENXIO;
failed:
        unlock_page(page);
        put_page(page);
        return ret;
}

/*
 * Create buffers for the specified block device block's page. If
 * that page was dirty, the buffers are set dirty also.
 */
static int
grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
{
        pgoff_t index;
        int sizebits;

        sizebits = PAGE_SHIFT - __ffs(size);
        index = block >> sizebits;

        /*
         * Check for a block which wants to lie outside our maximum possible
         * pagecache index. (this comparison is done using sector_t types).
         */
        if (unlikely(index != block >> sizebits)) {
                printk(KERN_ERR "%s: requested out-of-range block %llu for "
                        "device %pg\n",
                        __func__, (unsigned long long)block,
                        bdev);
                return -EIO;
        }

        /* Create a page with the proper size buffers.. */
        return grow_dev_page(bdev, block, index, size, sizebits, gfp);
}

static struct buffer_head *
__getblk_slow(struct block_device *bdev, sector_t block,
              unsigned size, gfp_t gfp)
{
        /* Size must be multiple of hard sectorsize */
        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
                        (size < 512 || size > PAGE_SIZE))) {
                printk(KERN_ERR "getblk(): invalid block size %d requested\n",
                                        size);
                printk(KERN_ERR "logical block size: %d\n",
                                        bdev_logical_block_size(bdev));

                dump_stack();
                return NULL;
        }

        for (;;) {
                struct buffer_head *bh;
                int ret;

                bh = __find_get_block(bdev, block, size);
                if (bh)
                        return bh;

                ret = grow_buffers(bdev, block, size, gfp);
                if (ret < 0)
                        return NULL;
        }
}

/*
 * The relationship between dirty buffers and dirty pages:
 *
 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
 * the page is tagged dirty in the page cache.
 *
 * At all times, the dirtiness of the buffers represents the dirtiness of
 * subsections of the page. If the page has buffers, the page dirty bit is
 * merely a hint about the true dirty state.
 *
 * When a page is set dirty in its entirety, all its buffers are marked dirty
 * (if the page has buffers).
 *
 * When a buffer is marked dirty, its page is dirtied, but the page's other
 * buffers are not.
 *
 * Also. When blockdev buffers are explicitly read with bread(), they
 * individually become uptodate. But their backing page remains not
 * uptodate - even if all of its buffers are uptodate. A subsequent
 * block_read_full_folio() against that folio will discover all the uptodate
 * buffers, will set the folio uptodate and will perform no I/O.
 */

/**
 * mark_buffer_dirty - mark a buffer_head as needing writeout
 * @bh: the buffer_head to mark dirty
 *
 * mark_buffer_dirty() will set the dirty bit against the buffer, then set
 * its backing page dirty, then tag the page as dirty in the page cache
 * and then attach the address_space's inode to its superblock's dirty
 * inode list.
 *
 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
 * i_pages lock and mapping->host->i_lock.
 */
void mark_buffer_dirty(struct buffer_head *bh)
{
        WARN_ON_ONCE(!buffer_uptodate(bh));

        trace_block_dirty_buffer(bh);

        /*
         * Very *carefully* optimize the it-is-already-dirty case.
         *
         * Don't let the final "is it dirty" escape to before we
         * perhaps modified the buffer.
         */
        if (buffer_dirty(bh)) {
                smp_mb();
                if (buffer_dirty(bh))
                        return;
        }

        if (!test_set_buffer_dirty(bh)) {
                struct page *page = bh->b_page;
                struct address_space *mapping = NULL;

                lock_page_memcg(page);
                if (!TestSetPageDirty(page)) {
                        mapping = page_mapping(page);
                        if (mapping)
                                __set_page_dirty(page, mapping, 0);
                }
                unlock_page_memcg(page);
                if (mapping)
                        __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
        }
}
EXPORT_SYMBOL(mark_buffer_dirty);

void mark_buffer_write_io_error(struct buffer_head *bh)
{
        struct super_block *sb;

        set_buffer_write_io_error(bh);
        /* FIXME: do we need to set this in both places? */
        if (bh->b_page && bh->b_page->mapping)
                mapping_set_error(bh->b_page->mapping, -EIO);
        if (bh->b_assoc_map)
                mapping_set_error(bh->b_assoc_map, -EIO);
        rcu_read_lock();
        sb = READ_ONCE(bh->b_bdev->bd_super);
        if (sb)
                errseq_set(&sb->s_wb_err, -EIO);
        rcu_read_unlock();
}
EXPORT_SYMBOL(mark_buffer_write_io_error);

/*
 * Decrement a buffer_head's reference count. If all buffers against a page
 * have zero reference count, are clean and unlocked, and if the page is clean
 * and unlocked then try_to_free_buffers() may strip the buffers from the page
 * in preparation for freeing it (sometimes, rarely, buffers are removed from
 * a page but it ends up not being freed, and buffers may later be reattached).
 */
void __brelse(struct buffer_head * buf)
{
        if (atomic_read(&buf->b_count)) {
                put_bh(buf);
                return;
        }
        WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
}
EXPORT_SYMBOL(__brelse);

/*
 * bforget() is like brelse(), except it discards any
 * potentially dirty data.
 */
void __bforget(struct buffer_head *bh)
{
        clear_buffer_dirty(bh);
        if (bh->b_assoc_map) {
                struct address_space *buffer_mapping = bh->b_page->mapping;

                spin_lock(&buffer_mapping->private_lock);
                list_del_init(&bh->b_assoc_buffers);
                bh->b_assoc_map = NULL;
                spin_unlock(&buffer_mapping->private_lock);
        }
        __brelse(bh);
}
EXPORT_SYMBOL(__bforget);

static struct buffer_head *__bread_slow(struct buffer_head *bh)
{
        lock_buffer(bh);
        if (buffer_uptodate(bh)) {
                unlock_buffer(bh);
                return bh;
        } else {
                get_bh(bh);
                bh->b_end_io = end_buffer_read_sync;
                submit_bh(REQ_OP_READ, bh);
                wait_on_buffer(bh);
                if (buffer_uptodate(bh))
                        return bh;
        }
        brelse(bh);
        return NULL;
}

/*
 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
 * refcount elevated by one when they're in an LRU. A buffer can only appear
 * once in a particular CPU's LRU. A single buffer can be present in multiple
 * CPU's LRUs at the same time.
 *
 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
 * sb_find_get_block().
 *
 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
 * a local interrupt disable for that.
 */
#define BH_LRU_SIZE 16

struct bh_lru {
        struct buffer_head *bhs[BH_LRU_SIZE];
};

static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};

#ifdef CONFIG_SMP
#define bh_lru_lock()   local_irq_disable()
#define bh_lru_unlock() local_irq_enable()
#else
#define bh_lru_lock()   preempt_disable()
#define bh_lru_unlock() preempt_enable()
#endif

static inline void check_irqs_on(void)
{
#ifdef irqs_disabled
        BUG_ON(irqs_disabled());
#endif
}

/*
 * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is
 * inserted at the front, and the buffer_head at the back if any is evicted.
 * Or, if already in the LRU it is moved to the front.
 */
static void bh_lru_install(struct buffer_head *bh)
{
        struct buffer_head *evictee = bh;
        struct bh_lru *b;
        int i;

        check_irqs_on();
        bh_lru_lock();

        /*
         * the refcount of buffer_head in bh_lru prevents dropping the
         * attached page(i.e., try_to_free_buffers) so it could cause
         * failing page migration.
         * Skip putting upcoming bh into bh_lru until migration is done.
         */
        if (lru_cache_disabled()) {
                bh_lru_unlock();
                return;
        }

        b = this_cpu_ptr(&bh_lrus);
        for (i = 0; i < BH_LRU_SIZE; i++) {
                swap(evictee, b->bhs[i]);
                if (evictee == bh) {
                        bh_lru_unlock();
                        return;
                }
        }

        get_bh(bh);
        bh_lru_unlock();
        brelse(evictee);
}

/*
 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
 */
static struct buffer_head *
lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *ret = NULL;
        unsigned int i;

        check_irqs_on();
        bh_lru_lock();
        for (i = 0; i < BH_LRU_SIZE; i++) {
                struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]);

                if (bh && bh->b_blocknr == block && bh->b_bdev == bdev &&
                    bh->b_size == size) {
                        if (i) {
                                while (i) {
                                        __this_cpu_write(bh_lrus.bhs[i],
                                                __this_cpu_read(bh_lrus.bhs[i - 1]));
                                        i--;
                                }
                                __this_cpu_write(bh_lrus.bhs[0], bh);
                        }
                        get_bh(bh);
                        ret = bh;
                        break;
                }
        }
        bh_lru_unlock();
        return ret;
}

/*
 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
 * it in the LRU and mark it as accessed. If it is not present then return
 * NULL.
 */
struct buffer_head *
__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *bh = lookup_bh_lru(bdev, block, size);

        if (bh == NULL) {
                /* __find_get_block_slow will mark the page accessed */
                bh = __find_get_block_slow(bdev, block);
                if (bh)
                        bh_lru_install(bh);
        } else
                touch_buffer(bh);

        return bh;
}
EXPORT_SYMBOL(__find_get_block);

/*
 * __getblk_gfp() will locate (and, if necessary, create) the buffer_head
 * which corresponds to the passed block_device, block and size. The
 * returned buffer has its reference count incremented.
 *
 * __getblk_gfp() will lock up the machine if grow_dev_page's
 * try_to_free_buffers() attempt is failing. FIXME, perhaps?
 */
struct buffer_head *
__getblk_gfp(struct block_device *bdev, sector_t block,
             unsigned size, gfp_t gfp)
{
        struct buffer_head *bh = __find_get_block(bdev, block, size);

        might_sleep();
        if (bh == NULL)
                bh = __getblk_slow(bdev, block, size, gfp);
        return bh;
}
EXPORT_SYMBOL(__getblk_gfp);
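
/*
 * Illustrative sketch (hypothetical helper): the common "get a buffer without
 * reading it" pattern built on __getblk(). It is useful when the caller is
 * about to overwrite the whole block anyway, so no read from disk is needed.
 * The block size of 512 and the zero-fill are example values only.
 */
static int example_write_whole_block(struct block_device *bdev, sector_t block)
{
        struct buffer_head *bh = __getblk(bdev, block, 512);

        if (!bh)
                return -ENOMEM;

        lock_buffer(bh);
        memset(bh->b_data, 0, bh->b_size);      /* new contents for the block */
        set_buffer_uptodate(bh);                /* every byte written; no read needed */
        unlock_buffer(bh);

        mark_buffer_dirty(bh);                  /* let writeback push it out later */
        brelse(bh);
        return 0;
}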

/*
 * Do async read-ahead on a buffer..
 */
void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
{
        struct buffer_head *bh = __getblk(bdev, block, size);

        if (likely(bh)) {
                bh_readahead(bh, REQ_RAHEAD);
                brelse(bh);
        }
}
EXPORT_SYMBOL(__breadahead);

/**
 * __bread_gfp() - reads a specified block and returns the bh
 * @bdev: the block_device to read from
 * @block: number of block
 * @size: size (in bytes) to read
 * @gfp: page allocation flag
 *
 * Reads a specified block, and returns buffer head that contains it.
 * The page cache can be allocated from the non-movable area (so as not to
 * interfere with page migration) by passing zero for @gfp.
 * It returns NULL if the block was unreadable.
 */
struct buffer_head *
__bread_gfp(struct block_device *bdev, sector_t block,
            unsigned size, gfp_t gfp)
{
        struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp);

        if (likely(bh) && !buffer_uptodate(bh))
                bh = __bread_slow(bh);
        return bh;
}
EXPORT_SYMBOL(__bread_gfp);
  1270. static void __invalidate_bh_lrus(struct bh_lru *b)
  1271. {
  1272. int i;
  1273. for (i = 0; i < BH_LRU_SIZE; i++) {
  1274. brelse(b->bhs[i]);
  1275. b->bhs[i] = NULL;
  1276. }
  1277. }
  1278. /*
  1279. * invalidate_bh_lrus() is called rarely - but not only at unmount.
  1280. * This doesn't race because it runs in each cpu either in irq
  1281. * or with preempt disabled.
  1282. */
  1283. static void invalidate_bh_lru(void *arg)
  1284. {
  1285. struct bh_lru *b = &get_cpu_var(bh_lrus);
  1286. __invalidate_bh_lrus(b);
  1287. put_cpu_var(bh_lrus);
  1288. }
  1289. bool has_bh_in_lru(int cpu, void *dummy)
  1290. {
  1291. struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
  1292. int i;
  1293. for (i = 0; i < BH_LRU_SIZE; i++) {
  1294. if (b->bhs[i])
  1295. return true;
  1296. }
  1297. return false;
  1298. }
  1299. void invalidate_bh_lrus(void)
  1300. {
  1301. on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1);
  1302. }
  1303. EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
  1304. /*
  1305. * It's called from workqueue context so we need a bh_lru_lock to close
  1306. * the race with preemption/irq.
  1307. */
  1308. void invalidate_bh_lrus_cpu(void)
  1309. {
  1310. struct bh_lru *b;
  1311. bh_lru_lock();
  1312. b = this_cpu_ptr(&bh_lrus);
  1313. __invalidate_bh_lrus(b);
  1314. bh_lru_unlock();
  1315. }
  1316. void set_bh_page(struct buffer_head *bh,
  1317. struct page *page, unsigned long offset)
  1318. {
  1319. bh->b_page = page;
  1320. BUG_ON(offset >= PAGE_SIZE);
  1321. if (PageHighMem(page))
  1322. /*
  1323. * This catches illegal uses and preserves the offset:
  1324. */
  1325. bh->b_data = (char *)(0 + offset);
  1326. else
  1327. bh->b_data = page_address(page) + offset;
  1328. }
  1329. EXPORT_SYMBOL(set_bh_page);
  1330. /*
  1331. * Called when truncating a buffer on a page completely.
  1332. */
  1333. /* Bits that are cleared during an invalidate */
  1334. #define BUFFER_FLAGS_DISCARD \
  1335. (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \
  1336. 1 << BH_Delay | 1 << BH_Unwritten)
  1337. static void discard_buffer(struct buffer_head * bh)
  1338. {
  1339. unsigned long b_state;
  1340. lock_buffer(bh);
  1341. clear_buffer_dirty(bh);
  1342. bh->b_bdev = NULL;
  1343. b_state = READ_ONCE(bh->b_state);
  1344. do {
  1345. } while (!try_cmpxchg(&bh->b_state, &b_state,
  1346. b_state & ~BUFFER_FLAGS_DISCARD));
  1347. unlock_buffer(bh);
  1348. }
  1349. /**
  1350. * block_invalidate_folio - Invalidate part or all of a buffer-backed folio.
  1351. * @folio: The folio which is affected.
  1352. * @offset: start of the range to invalidate
  1353. * @length: length of the range to invalidate
  1354. *
  1355. * block_invalidate_folio() is called when all or part of the folio has been
  1356. * invalidated by a truncate operation.
  1357. *
  1358. * block_invalidate_folio() does not have to release all buffers, but it must
  1359. * ensure that no dirty buffer is left outside @offset and that no I/O
  1360. * is underway against any of the blocks which are outside the truncation
  1361. * point. Because the caller is about to free (and possibly reuse) those
  1362. * blocks on-disk.
  1363. */
  1364. void block_invalidate_folio(struct folio *folio, size_t offset, size_t length)
  1365. {
  1366. struct buffer_head *head, *bh, *next;
  1367. size_t curr_off = 0;
  1368. size_t stop = length + offset;
  1369. BUG_ON(!folio_test_locked(folio));
  1370. /*
  1371. * Check for overflow
  1372. */
  1373. BUG_ON(stop > folio_size(folio) || stop < length);
  1374. head = folio_buffers(folio);
  1375. if (!head)
  1376. return;
  1377. bh = head;
  1378. do {
  1379. size_t next_off = curr_off + bh->b_size;
  1380. next = bh->b_this_page;
  1381. /*
  1382. * Are we still fully in range ?
  1383. */
  1384. if (next_off > stop)
  1385. goto out;
  1386. /*
  1387. * is this block fully invalidated?
  1388. */
  1389. if (offset <= curr_off)
  1390. discard_buffer(bh);
  1391. curr_off = next_off;
  1392. bh = next;
  1393. } while (bh != head);
  1394. /*
  1395. * We release buffers only if the entire folio is being invalidated.
  1396. * The get_block cached value has been unconditionally invalidated,
  1397. * so real IO is not possible anymore.
  1398. */
  1399. if (length == folio_size(folio))
  1400. filemap_release_folio(folio, 0);
  1401. out:
  1402. return;
  1403. }
  1404. EXPORT_SYMBOL(block_invalidate_folio);
  1405. /*
  1406. * We attach and possibly dirty the buffers atomically wrt
  1407. * block_dirty_folio() via private_lock. try_to_free_buffers
  1408. * is already excluded via the page lock.
  1409. */
  1410. void create_empty_buffers(struct page *page,
  1411. unsigned long blocksize, unsigned long b_state)
  1412. {
  1413. struct buffer_head *bh, *head, *tail;
  1414. head = alloc_page_buffers(page, blocksize, true);
  1415. bh = head;
  1416. do {
  1417. bh->b_state |= b_state;
  1418. tail = bh;
  1419. bh = bh->b_this_page;
  1420. } while (bh);
  1421. tail->b_this_page = head;
  1422. spin_lock(&page->mapping->private_lock);
  1423. if (PageUptodate(page) || PageDirty(page)) {
  1424. bh = head;
  1425. do {
  1426. if (PageDirty(page))
  1427. set_buffer_dirty(bh);
  1428. if (PageUptodate(page))
  1429. set_buffer_uptodate(bh);
  1430. bh = bh->b_this_page;
  1431. } while (bh != head);
  1432. }
  1433. attach_page_private(page, head);
  1434. spin_unlock(&page->mapping->private_lock);
  1435. }
  1436. EXPORT_SYMBOL(create_empty_buffers);
  1437. /**
  1438. * clean_bdev_aliases: clean a range of buffers in block device
  1439. * @bdev: Block device to clean buffers in
  1440. * @block: Start of a range of blocks to clean
  1441. * @len: Number of blocks to clean
  1442. *
  1443. * We are taking a range of blocks for data and we don't want writeback of any
  1444. * buffer-cache aliases starting from return from this function and until the
  1445. * moment when something will explicitly mark the buffer dirty (hopefully that
  1446. * will not happen until we will free that block ;-) We don't even need to mark
  1447. * it not-uptodate - nobody can expect anything from a newly allocated buffer
  1448. * anyway. We used to use unmap_buffer() for such invalidation, but that was
  1449. * wrong. We definitely don't want to mark the alias unmapped, for example - it
  1450. * would confuse anyone who might pick it with bread() afterwards...
  1451. *
  1452. * Also.. Note that bforget() doesn't lock the buffer. So there can be
  1453. * writeout I/O going on against recently-freed buffers. We don't wait on that
  1454. * I/O in bforget() - it's more efficient to wait on the I/O only if we really
  1455. * need to. That happens here.
  1456. */
  1457. void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len)
  1458. {
  1459. struct inode *bd_inode = bdev->bd_inode;
  1460. struct address_space *bd_mapping = bd_inode->i_mapping;
  1461. struct folio_batch fbatch;
  1462. pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
  1463. pgoff_t end;
  1464. int i, count;
  1465. struct buffer_head *bh;
  1466. struct buffer_head *head;
  1467. end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits);
  1468. folio_batch_init(&fbatch);
  1469. while (filemap_get_folios(bd_mapping, &index, end, &fbatch)) {
  1470. count = folio_batch_count(&fbatch);
  1471. for (i = 0; i < count; i++) {
  1472. struct folio *folio = fbatch.folios[i];
  1473. if (!folio_buffers(folio))
  1474. continue;
  1475. /*
  1476. * We use folio lock instead of bd_mapping->private_lock
  1477. * to pin buffers here since we can afford to sleep and
  1478. * it scales better than a global spinlock lock.
  1479. */
  1480. folio_lock(folio);
  1481. /* Recheck when the folio is locked which pins bhs */
  1482. head = folio_buffers(folio);
  1483. if (!head)
  1484. goto unlock_page;
  1485. bh = head;
  1486. do {
  1487. if (!buffer_mapped(bh) || (bh->b_blocknr < block))
  1488. goto next;
  1489. if (bh->b_blocknr >= block + len)
  1490. break;
  1491. clear_buffer_dirty(bh);
  1492. wait_on_buffer(bh);
  1493. clear_buffer_req(bh);
  1494. next:
  1495. bh = bh->b_this_page;
  1496. } while (bh != head);
  1497. unlock_page:
  1498. folio_unlock(folio);
  1499. }
  1500. folio_batch_release(&fbatch);
  1501. cond_resched();
  1502. /* End of range already reached? */
  1503. if (index > end || !index)
  1504. break;
  1505. }
  1506. }
  1507. EXPORT_SYMBOL(clean_bdev_aliases);
  1508. /*
  1509. * Size is a power-of-two in the range 512..PAGE_SIZE,
  1510. * and the case we care about most is PAGE_SIZE.
  1511. *
  1512. * So this *could* possibly be written with those
  1513. * constraints in mind (relevant mostly if some
  1514. * architecture has a slow bit-scan instruction)
  1515. */
  1516. static inline int block_size_bits(unsigned int blocksize)
  1517. {
  1518. return ilog2(blocksize);
  1519. }
  1520. static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
  1521. {
  1522. BUG_ON(!PageLocked(page));
  1523. if (!page_has_buffers(page))
  1524. create_empty_buffers(page, 1 << READ_ONCE(inode->i_blkbits),
  1525. b_state);
  1526. return page_buffers(page);
  1527. }
  1528. /*
  1529. * NOTE! All mapped/uptodate combinations are valid:
  1530. *
  1531. * Mapped Uptodate Meaning
  1532. *
  1533. * No No "unknown" - must do get_block()
  1534. * No Yes "hole" - zero-filled
  1535. * Yes No "allocated" - allocated on disk, not read in
  1536. * Yes Yes "valid" - allocated and up-to-date in memory.
  1537. *
  1538. * "Dirty" is valid only with the last case (mapped+uptodate).
  1539. */
  1540. /*
  1541. * While block_write_full_page is writing back the dirty buffers under
  1542. * the page lock, whoever dirtied the buffers may decide to clean them
  1543. * again at any time. We handle that by only looking at the buffer
  1544. * state inside lock_buffer().
  1545. *
  1546. * If block_write_full_page() is called for regular writeback
  1547. * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
  1548. * locked buffer. This only can happen if someone has written the buffer
  1549. * directly, with submit_bh(). At the address_space level PageWriteback
  1550. * prevents this contention from occurring.
  1551. *
  1552. * If block_write_full_page() is called with wbc->sync_mode ==
  1553. * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this
  1554. * causes the writes to be flagged as synchronous writes.
  1555. */
  1556. int __block_write_full_page(struct inode *inode, struct page *page,
  1557. get_block_t *get_block, struct writeback_control *wbc,
  1558. bh_end_io_t *handler)
  1559. {
  1560. int err;
  1561. sector_t block;
  1562. sector_t last_block;
  1563. struct buffer_head *bh, *head;
  1564. unsigned int blocksize, bbits;
  1565. int nr_underway = 0;
  1566. blk_opf_t write_flags = wbc_to_write_flags(wbc);
  1567. head = create_page_buffers(page, inode,
  1568. (1 << BH_Dirty)|(1 << BH_Uptodate));
  1569. /*
  1570. * Be very careful. We have no exclusion from block_dirty_folio
  1571. * here, and the (potentially unmapped) buffers may become dirty at
  1572. * any time. If a buffer becomes dirty here after we've inspected it
  1573. * then we just miss that fact, and the page stays dirty.
  1574. *
  1575. * Buffers outside i_size may be dirtied by block_dirty_folio;
  1576. * handle that here by just cleaning them.
  1577. */
  1578. bh = head;
  1579. blocksize = bh->b_size;
  1580. bbits = block_size_bits(blocksize);
  1581. block = (sector_t)page->index << (PAGE_SHIFT - bbits);
  1582. last_block = (i_size_read(inode) - 1) >> bbits;
  1583. /*
  1584. * Get all the dirty buffers mapped to disk addresses and
  1585. * handle any aliases from the underlying blockdev's mapping.
  1586. */
  1587. do {
  1588. if (block > last_block) {
  1589. /*
  1590. * mapped buffers outside i_size will occur, because
  1591. * this page can be outside i_size when there is a
  1592. * truncate in progress.
  1593. */
  1594. /*
  1595. * The buffer was zeroed by block_write_full_page()
  1596. */
  1597. clear_buffer_dirty(bh);
  1598. set_buffer_uptodate(bh);
  1599. } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
  1600. buffer_dirty(bh)) {
  1601. WARN_ON(bh->b_size != blocksize);
  1602. err = get_block(inode, block, bh, 1);
  1603. if (err)
  1604. goto recover;
  1605. clear_buffer_delay(bh);
  1606. if (buffer_new(bh)) {
  1607. /* blockdev mappings never come here */
  1608. clear_buffer_new(bh);
  1609. clean_bdev_bh_alias(bh);
  1610. }
  1611. }
  1612. bh = bh->b_this_page;
  1613. block++;
  1614. } while (bh != head);
  1615. do {
  1616. if (!buffer_mapped(bh))
  1617. continue;
  1618. /*
  1619. * If it's a fully non-blocking write attempt and we cannot
  1620. * lock the buffer then redirty the page. Note that this can
  1621. * potentially cause a busy-wait loop from writeback threads
  1622. * and kswapd activity, but those code paths have their own
  1623. * higher-level throttling.
  1624. */
  1625. if (wbc->sync_mode != WB_SYNC_NONE) {
  1626. lock_buffer(bh);
  1627. } else if (!trylock_buffer(bh)) {
  1628. redirty_page_for_writepage(wbc, page);
  1629. continue;
  1630. }
  1631. if (test_clear_buffer_dirty(bh)) {
  1632. mark_buffer_async_write_endio(bh, handler);
  1633. } else {
  1634. unlock_buffer(bh);
  1635. }
  1636. } while ((bh = bh->b_this_page) != head);
  1637. /*
  1638. * The page and its buffers are protected by PageWriteback(), so we can
  1639. * drop the bh refcounts early.
  1640. */
  1641. BUG_ON(PageWriteback(page));
  1642. set_page_writeback(page);
  1643. do {
  1644. struct buffer_head *next = bh->b_this_page;
  1645. if (buffer_async_write(bh)) {
  1646. submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
  1647. nr_underway++;
  1648. }
  1649. bh = next;
  1650. } while (bh != head);
  1651. unlock_page(page);
  1652. err = 0;
  1653. done:
  1654. if (nr_underway == 0) {
  1655. /*
  1656. * The page was marked dirty, but the buffers were
  1657. * clean. Someone wrote them back by hand with
  1658. * write_dirty_buffer/submit_bh. A rare case.
  1659. */
  1660. end_page_writeback(page);
  1661. /*
  1662. * The page and buffer_heads can be released at any time from
  1663. * here on.
  1664. */
  1665. }
  1666. return err;
  1667. recover:
  1668. /*
  1669. * ENOSPC, or some other error. We may already have added some
  1670. * blocks to the file, so we need to write these out to avoid
  1671. * exposing stale data.
  1672. * The page is currently locked and not marked for writeback
  1673. */
  1674. bh = head;
  1675. /* Recovery: lock and submit the mapped buffers */
  1676. do {
  1677. if (buffer_mapped(bh) && buffer_dirty(bh) &&
  1678. !buffer_delay(bh)) {
  1679. lock_buffer(bh);
  1680. mark_buffer_async_write_endio(bh, handler);
  1681. } else {
  1682. /*
  1683. * The buffer may have been set dirty during
  1684. * attachment to a dirty page.
  1685. */
  1686. clear_buffer_dirty(bh);
  1687. }
  1688. } while ((bh = bh->b_this_page) != head);
  1689. SetPageError(page);
  1690. BUG_ON(PageWriteback(page));
  1691. mapping_set_error(page->mapping, err);
  1692. set_page_writeback(page);
  1693. do {
  1694. struct buffer_head *next = bh->b_this_page;
  1695. if (buffer_async_write(bh)) {
  1696. clear_buffer_dirty(bh);
  1697. submit_bh_wbc(REQ_OP_WRITE | write_flags, bh, wbc);
  1698. nr_underway++;
  1699. }
  1700. bh = next;
  1701. } while (bh != head);
  1702. unlock_page(page);
  1703. goto done;
  1704. }
  1705. EXPORT_SYMBOL(__block_write_full_page);
  1706. /*
  1707. * If a page has any new buffers, zero them out here, and mark them uptodate
  1708. * and dirty so they'll be written out (in order to prevent uninitialised
  1709. * block data from leaking). And clear the new bit.
  1710. */
  1711. void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
  1712. {
  1713. unsigned int block_start, block_end;
  1714. struct buffer_head *head, *bh;
  1715. BUG_ON(!PageLocked(page));
  1716. if (!page_has_buffers(page))
  1717. return;
  1718. bh = head = page_buffers(page);
  1719. block_start = 0;
  1720. do {
  1721. block_end = block_start + bh->b_size;
  1722. if (buffer_new(bh)) {
  1723. if (block_end > from && block_start < to) {
  1724. if (!PageUptodate(page)) {
  1725. unsigned start, size;
  1726. start = max(from, block_start);
  1727. size = min(to, block_end) - start;
  1728. zero_user(page, start, size);
  1729. set_buffer_uptodate(bh);
  1730. }
  1731. clear_buffer_new(bh);
  1732. mark_buffer_dirty(bh);
  1733. }
  1734. }
  1735. block_start = block_end;
  1736. bh = bh->b_this_page;
  1737. } while (bh != head);
  1738. }
  1739. EXPORT_SYMBOL(page_zero_new_buffers);
  1740. static void
  1741. iomap_to_bh(struct inode *inode, sector_t block, struct buffer_head *bh,
  1742. const struct iomap *iomap)
  1743. {
  1744. loff_t offset = block << inode->i_blkbits;
  1745. bh->b_bdev = iomap->bdev;
  1746. /*
  1747. * Block points to offset in file we need to map, iomap contains
  1748. * the offset at which the map starts. If the map ends before the
  1749. * current block, then do not map the buffer and let the caller
  1750. * handle it.
  1751. */
  1752. BUG_ON(offset >= iomap->offset + iomap->length);
  1753. switch (iomap->type) {
  1754. case IOMAP_HOLE:
  1755. /*
  1756. * If the buffer is not up to date or beyond the current EOF,
  1757. * we need to mark it as new to ensure sub-block zeroing is
  1758. * executed if necessary.
  1759. */
  1760. if (!buffer_uptodate(bh) ||
  1761. (offset >= i_size_read(inode)))
  1762. set_buffer_new(bh);
  1763. break;
  1764. case IOMAP_DELALLOC:
  1765. if (!buffer_uptodate(bh) ||
  1766. (offset >= i_size_read(inode)))
  1767. set_buffer_new(bh);
  1768. set_buffer_uptodate(bh);
  1769. set_buffer_mapped(bh);
  1770. set_buffer_delay(bh);
  1771. break;
  1772. case IOMAP_UNWRITTEN:
  1773. /*
  1774. * For unwritten regions, we always need to ensure that regions
  1775. * in the block we are not writing to are zeroed. Mark the
  1776. * buffer as new to ensure this.
  1777. */
  1778. set_buffer_new(bh);
  1779. set_buffer_unwritten(bh);
  1780. fallthrough;
  1781. case IOMAP_MAPPED:
  1782. if ((iomap->flags & IOMAP_F_NEW) ||
  1783. offset >= i_size_read(inode))
  1784. set_buffer_new(bh);
  1785. bh->b_blocknr = (iomap->addr + offset - iomap->offset) >>
  1786. inode->i_blkbits;
  1787. set_buffer_mapped(bh);
  1788. break;
  1789. }
  1790. }
  1791. int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
  1792. get_block_t *get_block, const struct iomap *iomap)
  1793. {
  1794. unsigned from = pos & (PAGE_SIZE - 1);
  1795. unsigned to = from + len;
  1796. struct inode *inode = folio->mapping->host;
  1797. unsigned block_start, block_end;
  1798. sector_t block;
  1799. int err = 0;
  1800. unsigned blocksize, bbits;
  1801. struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
  1802. BUG_ON(!folio_test_locked(folio));
  1803. BUG_ON(from > PAGE_SIZE);
  1804. BUG_ON(to > PAGE_SIZE);
  1805. BUG_ON(from > to);
  1806. head = create_page_buffers(&folio->page, inode, 0);
  1807. blocksize = head->b_size;
  1808. bbits = block_size_bits(blocksize);
  1809. block = (sector_t)folio->index << (PAGE_SHIFT - bbits);
  1810. for(bh = head, block_start = 0; bh != head || !block_start;
  1811. block++, block_start=block_end, bh = bh->b_this_page) {
  1812. block_end = block_start + blocksize;
  1813. if (block_end <= from || block_start >= to) {
  1814. if (folio_test_uptodate(folio)) {
  1815. if (!buffer_uptodate(bh))
  1816. set_buffer_uptodate(bh);
  1817. }
  1818. continue;
  1819. }
  1820. if (buffer_new(bh))
  1821. clear_buffer_new(bh);
  1822. if (!buffer_mapped(bh)) {
  1823. WARN_ON(bh->b_size != blocksize);
  1824. if (get_block) {
  1825. err = get_block(inode, block, bh, 1);
  1826. if (err)
  1827. break;
  1828. } else {
  1829. iomap_to_bh(inode, block, bh, iomap);
  1830. }
  1831. if (buffer_new(bh)) {
  1832. clean_bdev_bh_alias(bh);
  1833. if (folio_test_uptodate(folio)) {
  1834. clear_buffer_new(bh);
  1835. set_buffer_uptodate(bh);
  1836. mark_buffer_dirty(bh);
  1837. continue;
  1838. }
  1839. if (block_end > to || block_start < from)
  1840. folio_zero_segments(folio,
  1841. to, block_end,
  1842. block_start, from);
  1843. continue;
  1844. }
  1845. }
  1846. if (folio_test_uptodate(folio)) {
  1847. if (!buffer_uptodate(bh))
  1848. set_buffer_uptodate(bh);
  1849. continue;
  1850. }
  1851. if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
  1852. !buffer_unwritten(bh) &&
  1853. (block_start < from || block_end > to)) {
  1854. bh_read_nowait(bh, 0);
  1855. *wait_bh++=bh;
  1856. }
  1857. }
  1858. /*
  1859. * If we issued read requests - let them complete.
  1860. */
  1861. while(wait_bh > wait) {
  1862. wait_on_buffer(*--wait_bh);
  1863. if (!buffer_uptodate(*wait_bh))
  1864. err = -EIO;
  1865. }
  1866. if (unlikely(err))
  1867. page_zero_new_buffers(&folio->page, from, to);
  1868. return err;
  1869. }
  1870. int __block_write_begin(struct page *page, loff_t pos, unsigned len,
  1871. get_block_t *get_block)
  1872. {
  1873. return __block_write_begin_int(page_folio(page), pos, len, get_block,
  1874. NULL);
  1875. }
  1876. EXPORT_SYMBOL(__block_write_begin);
  1877. static int __block_commit_write(struct inode *inode, struct page *page,
  1878. unsigned from, unsigned to)
  1879. {
  1880. unsigned block_start, block_end;
  1881. int partial = 0;
  1882. unsigned blocksize;
  1883. struct buffer_head *bh, *head;
  1884. bh = head = page_buffers(page);
  1885. blocksize = bh->b_size;
  1886. block_start = 0;
  1887. do {
  1888. block_end = block_start + blocksize;
  1889. if (block_end <= from || block_start >= to) {
  1890. if (!buffer_uptodate(bh))
  1891. partial = 1;
  1892. } else {
  1893. set_buffer_uptodate(bh);
  1894. mark_buffer_dirty(bh);
  1895. }
  1896. if (buffer_new(bh))
  1897. clear_buffer_new(bh);
  1898. block_start = block_end;
  1899. bh = bh->b_this_page;
  1900. } while (bh != head);
  1901. /*
  1902. * If this is a partial write which happened to make all buffers
  1903. * uptodate then we can optimize away a bogus read_folio() for
  1904. * the next read(). Here we 'discover' whether the page went
  1905. * uptodate as a result of this (potentially partial) write.
  1906. */
  1907. if (!partial)
  1908. SetPageUptodate(page);
  1909. return 0;
  1910. }
  1911. /*
  1912. * block_write_begin takes care of the basic task of block allocation and
  1913. * bringing partial write blocks uptodate first.
  1914. *
  1915. * The filesystem needs to handle block truncation upon failure.
  1916. */
  1917. int block_write_begin(struct address_space *mapping, loff_t pos, unsigned len,
  1918. struct page **pagep, get_block_t *get_block)
  1919. {
  1920. pgoff_t index = pos >> PAGE_SHIFT;
  1921. struct page *page;
  1922. int status;
  1923. page = grab_cache_page_write_begin(mapping, index);
  1924. if (!page)
  1925. return -ENOMEM;
  1926. status = __block_write_begin(page, pos, len, get_block);
  1927. if (unlikely(status)) {
  1928. unlock_page(page);
  1929. put_page(page);
  1930. page = NULL;
  1931. }
  1932. *pagep = page;
  1933. return status;
  1934. }
  1935. EXPORT_SYMBOL(block_write_begin);
  1936. int block_write_end(struct file *file, struct address_space *mapping,
  1937. loff_t pos, unsigned len, unsigned copied,
  1938. struct page *page, void *fsdata)
  1939. {
  1940. struct inode *inode = mapping->host;
  1941. unsigned start;
  1942. start = pos & (PAGE_SIZE - 1);
  1943. if (unlikely(copied < len)) {
  1944. /*
  1945. * The buffers that were written will now be uptodate, so
  1946. * we don't have to worry about a read_folio reading them
  1947. * and overwriting a partial write. However if we have
  1948. * encountered a short write and only partially written
  1949. * into a buffer, it will not be marked uptodate, so a
  1950. * read_folio might come in and destroy our partial write.
  1951. *
  1952. * Do the simplest thing, and just treat any short write to a
  1953. * non uptodate page as a zero-length write, and force the
  1954. * caller to redo the whole thing.
  1955. */
  1956. if (!PageUptodate(page))
  1957. copied = 0;
  1958. page_zero_new_buffers(page, start+copied, start+len);
  1959. }
  1960. flush_dcache_page(page);
  1961. /* This could be a short (even 0-length) commit */
  1962. __block_commit_write(inode, page, start, start+copied);
  1963. return copied;
  1964. }
  1965. EXPORT_SYMBOL(block_write_end);
  1966. int generic_write_end(struct file *file, struct address_space *mapping,
  1967. loff_t pos, unsigned len, unsigned copied,
  1968. struct page *page, void *fsdata)
  1969. {
  1970. struct inode *inode = mapping->host;
  1971. loff_t old_size = inode->i_size;
  1972. bool i_size_changed = false;
  1973. copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
  1974. /*
  1975. * No need to use i_size_read() here, the i_size cannot change under us
  1976. * because we hold i_rwsem.
  1977. *
  1978. * But it's important to update i_size while still holding page lock:
  1979. * page writeout could otherwise come in and zero beyond i_size.
  1980. */
  1981. if (pos + copied > inode->i_size) {
  1982. i_size_write(inode, pos + copied);
  1983. i_size_changed = true;
  1984. }
  1985. unlock_page(page);
  1986. put_page(page);
  1987. if (old_size < pos)
  1988. pagecache_isize_extended(inode, old_size, pos);
  1989. /*
  1990. * Don't mark the inode dirty under page lock. First, it unnecessarily
  1991. * makes the holding time of page lock longer. Second, it forces lock
  1992. * ordering of page lock and transaction start for journaling
  1993. * filesystems.
  1994. */
  1995. if (i_size_changed)
  1996. mark_inode_dirty(inode);
  1997. return copied;
  1998. }
  1999. EXPORT_SYMBOL(generic_write_end);
  2000. /*
  2001. * block_is_partially_uptodate checks whether buffers within a folio are
  2002. * uptodate or not.
  2003. *
  2004. * Returns true if all buffers which correspond to the specified part
  2005. * of the folio are uptodate.
  2006. */
  2007. bool block_is_partially_uptodate(struct folio *folio, size_t from, size_t count)
  2008. {
  2009. unsigned block_start, block_end, blocksize;
  2010. unsigned to;
  2011. struct buffer_head *bh, *head;
  2012. bool ret = true;
  2013. head = folio_buffers(folio);
  2014. if (!head)
  2015. return false;
  2016. blocksize = head->b_size;
  2017. to = min_t(unsigned, folio_size(folio) - from, count);
  2018. to = from + to;
  2019. if (from < blocksize && to > folio_size(folio) - blocksize)
  2020. return false;
  2021. bh = head;
  2022. block_start = 0;
  2023. do {
  2024. block_end = block_start + blocksize;
  2025. if (block_end > from && block_start < to) {
  2026. if (!buffer_uptodate(bh)) {
  2027. ret = false;
  2028. break;
  2029. }
  2030. if (block_end >= to)
  2031. break;
  2032. }
  2033. block_start = block_end;
  2034. bh = bh->b_this_page;
  2035. } while (bh != head);
  2036. return ret;
  2037. }
  2038. EXPORT_SYMBOL(block_is_partially_uptodate);
  2039. /*
  2040. * Generic "read_folio" function for block devices that have the normal
  2041. * get_block functionality. This is most of the block device filesystems.
  2042. * Reads the folio asynchronously --- the unlock_buffer() and
  2043. * set/clear_buffer_uptodate() functions propagate buffer state into the
  2044. * folio once IO has completed.
  2045. */
  2046. int block_read_full_folio(struct folio *folio, get_block_t *get_block)
  2047. {
  2048. struct inode *inode = folio->mapping->host;
  2049. sector_t iblock, lblock;
  2050. struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
  2051. unsigned int blocksize, bbits;
  2052. int nr, i;
  2053. int fully_mapped = 1;
  2054. bool page_error = false;
  2055. loff_t limit = i_size_read(inode);
  2056. /* This is needed for ext4. */
  2057. if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
  2058. limit = inode->i_sb->s_maxbytes;
  2059. VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
  2060. head = create_page_buffers(&folio->page, inode, 0);
  2061. blocksize = head->b_size;
  2062. bbits = block_size_bits(blocksize);
  2063. iblock = (sector_t)folio->index << (PAGE_SHIFT - bbits);
  2064. lblock = (limit+blocksize-1) >> bbits;
  2065. bh = head;
  2066. nr = 0;
  2067. i = 0;
  2068. do {
  2069. if (buffer_uptodate(bh))
  2070. continue;
  2071. if (!buffer_mapped(bh)) {
  2072. int err = 0;
  2073. fully_mapped = 0;
  2074. if (iblock < lblock) {
  2075. WARN_ON(bh->b_size != blocksize);
  2076. err = get_block(inode, iblock, bh, 0);
  2077. if (err) {
  2078. folio_set_error(folio);
  2079. page_error = true;
  2080. }
  2081. }
  2082. if (!buffer_mapped(bh)) {
  2083. folio_zero_range(folio, i * blocksize,
  2084. blocksize);
  2085. if (!err)
  2086. set_buffer_uptodate(bh);
  2087. continue;
  2088. }
  2089. /*
  2090. * get_block() might have updated the buffer
  2091. * synchronously
  2092. */
  2093. if (buffer_uptodate(bh))
  2094. continue;
  2095. }
  2096. arr[nr++] = bh;
  2097. } while (i++, iblock++, (bh = bh->b_this_page) != head);
  2098. if (fully_mapped)
  2099. folio_set_mappedtodisk(folio);
  2100. if (!nr) {
  2101. /*
  2102. * All buffers are uptodate - we can set the folio uptodate
  2103. * as well. But not if get_block() returned an error.
  2104. */
  2105. if (!page_error)
  2106. folio_mark_uptodate(folio);
  2107. folio_unlock(folio);
  2108. return 0;
  2109. }
  2110. /* Stage two: lock the buffers */
  2111. for (i = 0; i < nr; i++) {
  2112. bh = arr[i];
  2113. lock_buffer(bh);
  2114. mark_buffer_async_read(bh);
  2115. }
  2116. /*
  2117. * Stage 3: start the IO. Check for uptodateness
  2118. * inside the buffer lock in case another process reading
  2119. * the underlying blockdev brought it uptodate (the sct fix).
  2120. */
  2121. for (i = 0; i < nr; i++) {
  2122. bh = arr[i];
  2123. if (buffer_uptodate(bh))
  2124. end_buffer_async_read(bh, 1);
  2125. else
  2126. submit_bh(REQ_OP_READ, bh);
  2127. }
  2128. return 0;
  2129. }
  2130. EXPORT_SYMBOL(block_read_full_folio);
  2131. /* utility function for filesystems that need to do work on expanding
  2132. * truncates. Uses filesystem pagecache writes to allow the filesystem to
  2133. * deal with the hole.
  2134. */
  2135. int generic_cont_expand_simple(struct inode *inode, loff_t size)
  2136. {
  2137. struct address_space *mapping = inode->i_mapping;
  2138. const struct address_space_operations *aops = mapping->a_ops;
  2139. struct page *page;
  2140. void *fsdata = NULL;
  2141. int err;
  2142. err = inode_newsize_ok(inode, size);
  2143. if (err)
  2144. goto out;
  2145. err = aops->write_begin(NULL, mapping, size, 0, &page, &fsdata);
  2146. if (err)
  2147. goto out;
  2148. err = aops->write_end(NULL, mapping, size, 0, 0, page, fsdata);
  2149. BUG_ON(err > 0);
  2150. out:
  2151. return err;
  2152. }
  2153. EXPORT_SYMBOL(generic_cont_expand_simple);
  2154. static int cont_expand_zero(struct file *file, struct address_space *mapping,
  2155. loff_t pos, loff_t *bytes)
  2156. {
  2157. struct inode *inode = mapping->host;
  2158. const struct address_space_operations *aops = mapping->a_ops;
  2159. unsigned int blocksize = i_blocksize(inode);
  2160. struct page *page;
  2161. void *fsdata = NULL;
  2162. pgoff_t index, curidx;
  2163. loff_t curpos;
  2164. unsigned zerofrom, offset, len;
  2165. int err = 0;
  2166. index = pos >> PAGE_SHIFT;
  2167. offset = pos & ~PAGE_MASK;
  2168. while (index > (curidx = (curpos = *bytes)>>PAGE_SHIFT)) {
  2169. zerofrom = curpos & ~PAGE_MASK;
  2170. if (zerofrom & (blocksize-1)) {
  2171. *bytes |= (blocksize-1);
  2172. (*bytes)++;
  2173. }
  2174. len = PAGE_SIZE - zerofrom;
  2175. err = aops->write_begin(file, mapping, curpos, len,
  2176. &page, &fsdata);
  2177. if (err)
  2178. goto out;
  2179. zero_user(page, zerofrom, len);
  2180. err = aops->write_end(file, mapping, curpos, len, len,
  2181. page, fsdata);
  2182. if (err < 0)
  2183. goto out;
  2184. BUG_ON(err != len);
  2185. err = 0;
  2186. balance_dirty_pages_ratelimited(mapping);
  2187. if (fatal_signal_pending(current)) {
  2188. err = -EINTR;
  2189. goto out;
  2190. }
  2191. }
  2192. /* page covers the boundary, find the boundary offset */
  2193. if (index == curidx) {
  2194. zerofrom = curpos & ~PAGE_MASK;
  2195. /* if we will expand the thing last block will be filled */
  2196. if (offset <= zerofrom) {
  2197. goto out;
  2198. }
  2199. if (zerofrom & (blocksize-1)) {
  2200. *bytes |= (blocksize-1);
  2201. (*bytes)++;
  2202. }
  2203. len = offset - zerofrom;
  2204. err = aops->write_begin(file, mapping, curpos, len,
  2205. &page, &fsdata);
  2206. if (err)
  2207. goto out;
  2208. zero_user(page, zerofrom, len);
  2209. err = aops->write_end(file, mapping, curpos, len, len,
  2210. page, fsdata);
  2211. if (err < 0)
  2212. goto out;
  2213. BUG_ON(err != len);
  2214. err = 0;
  2215. }
  2216. out:
  2217. return err;
  2218. }
  2219. /*
  2220. * For moronic filesystems that do not allow holes in file.
  2221. * We may have to extend the file.
  2222. */
  2223. int cont_write_begin(struct file *file, struct address_space *mapping,
  2224. loff_t pos, unsigned len,
  2225. struct page **pagep, void **fsdata,
  2226. get_block_t *get_block, loff_t *bytes)
  2227. {
  2228. struct inode *inode = mapping->host;
  2229. unsigned int blocksize = i_blocksize(inode);
  2230. unsigned int zerofrom;
  2231. int err;
  2232. err = cont_expand_zero(file, mapping, pos, bytes);
  2233. if (err)
  2234. return err;
  2235. zerofrom = *bytes & ~PAGE_MASK;
  2236. if (pos+len > *bytes && zerofrom & (blocksize-1)) {
  2237. *bytes |= (blocksize-1);
  2238. (*bytes)++;
  2239. }
  2240. return block_write_begin(mapping, pos, len, pagep, get_block);
  2241. }
  2242. EXPORT_SYMBOL(cont_write_begin);
  2243. int block_commit_write(struct page *page, unsigned from, unsigned to)
  2244. {
  2245. struct inode *inode = page->mapping->host;
  2246. __block_commit_write(inode,page,from,to);
  2247. return 0;
  2248. }
  2249. EXPORT_SYMBOL(block_commit_write);
  2250. /*
  2251. * block_page_mkwrite() is not allowed to change the file size as it gets
  2252. * called from a page fault handler when a page is first dirtied. Hence we must
  2253. * be careful to check for EOF conditions here. We set the page up correctly
  2254. * for a written page which means we get ENOSPC checking when writing into
  2255. * holes and correct delalloc and unwritten extent mapping on filesystems that
  2256. * support these features.
  2257. *
  2258. * We are not allowed to take the i_mutex here so we have to play games to
  2259. * protect against truncate races as the page could now be beyond EOF. Because
  2260. * truncate writes the inode size before removing pages, once we have the
  2261. * page lock we can determine safely if the page is beyond EOF. If it is not
  2262. * beyond EOF, then the page is guaranteed safe against truncation until we
  2263. * unlock the page.
  2264. *
  2265. * Direct callers of this function should protect against filesystem freezing
  2266. * using sb_start_pagefault() - sb_end_pagefault() functions.
  2267. */
  2268. int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
  2269. get_block_t get_block)
  2270. {
  2271. struct page *page = vmf->page;
  2272. struct inode *inode = file_inode(vma->vm_file);
  2273. unsigned long end;
  2274. loff_t size;
  2275. int ret;
  2276. lock_page(page);
  2277. size = i_size_read(inode);
  2278. if ((page->mapping != inode->i_mapping) ||
  2279. (page_offset(page) > size)) {
  2280. /* We overload EFAULT to mean page got truncated */
  2281. ret = -EFAULT;
  2282. goto out_unlock;
  2283. }
  2284. /* page is wholly or partially inside EOF */
  2285. if (((page->index + 1) << PAGE_SHIFT) > size)
  2286. end = size & ~PAGE_MASK;
  2287. else
  2288. end = PAGE_SIZE;
  2289. ret = __block_write_begin(page, 0, end, get_block);
  2290. if (!ret)
  2291. ret = block_commit_write(page, 0, end);
  2292. if (unlikely(ret < 0))
  2293. goto out_unlock;
  2294. set_page_dirty(page);
  2295. wait_for_stable_page(page);
  2296. return 0;
  2297. out_unlock:
  2298. unlock_page(page);
  2299. return ret;
  2300. }
  2301. EXPORT_SYMBOL(block_page_mkwrite);
  2302. int block_truncate_page(struct address_space *mapping,
  2303. loff_t from, get_block_t *get_block)
  2304. {
  2305. pgoff_t index = from >> PAGE_SHIFT;
  2306. unsigned offset = from & (PAGE_SIZE-1);
  2307. unsigned blocksize;
  2308. sector_t iblock;
  2309. unsigned length, pos;
  2310. struct inode *inode = mapping->host;
  2311. struct page *page;
  2312. struct buffer_head *bh;
  2313. int err;
  2314. blocksize = i_blocksize(inode);
  2315. length = offset & (blocksize - 1);
  2316. /* Block boundary? Nothing to do */
  2317. if (!length)
  2318. return 0;
  2319. length = blocksize - length;
  2320. iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);
  2321. page = grab_cache_page(mapping, index);
  2322. err = -ENOMEM;
  2323. if (!page)
  2324. goto out;
  2325. if (!page_has_buffers(page))
  2326. create_empty_buffers(page, blocksize, 0);
  2327. /* Find the buffer that contains "offset" */
  2328. bh = page_buffers(page);
  2329. pos = blocksize;
  2330. while (offset >= pos) {
  2331. bh = bh->b_this_page;
  2332. iblock++;
  2333. pos += blocksize;
  2334. }
  2335. err = 0;
  2336. if (!buffer_mapped(bh)) {
  2337. WARN_ON(bh->b_size != blocksize);
  2338. err = get_block(inode, iblock, bh, 0);
  2339. if (err)
  2340. goto unlock;
  2341. /* unmapped? It's a hole - nothing to do */
  2342. if (!buffer_mapped(bh))
  2343. goto unlock;
  2344. }
  2345. /* Ok, it's mapped. Make sure it's up-to-date */
  2346. if (PageUptodate(page))
  2347. set_buffer_uptodate(bh);
  2348. if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
  2349. err = bh_read(bh, 0);
  2350. /* Uhhuh. Read error. Complain and punt. */
  2351. if (err < 0)
  2352. goto unlock;
  2353. }
  2354. zero_user(page, offset, length);
  2355. mark_buffer_dirty(bh);
  2356. err = 0;
  2357. unlock:
  2358. unlock_page(page);
  2359. put_page(page);
  2360. out:
  2361. return err;
  2362. }
  2363. EXPORT_SYMBOL(block_truncate_page);
  2364. /*
  2365. * The generic ->writepage function for buffer-backed address_spaces
  2366. */
  2367. int block_write_full_page(struct page *page, get_block_t *get_block,
  2368. struct writeback_control *wbc)
  2369. {
  2370. struct inode * const inode = page->mapping->host;
  2371. loff_t i_size = i_size_read(inode);
  2372. const pgoff_t end_index = i_size >> PAGE_SHIFT;
  2373. unsigned offset;
  2374. /* Is the page fully inside i_size? */
  2375. if (page->index < end_index)
  2376. return __block_write_full_page(inode, page, get_block, wbc,
  2377. end_buffer_async_write);
  2378. /* Is the page fully outside i_size? (truncate in progress) */
  2379. offset = i_size & (PAGE_SIZE-1);
  2380. if (page->index >= end_index+1 || !offset) {
  2381. unlock_page(page);
  2382. return 0; /* don't care */
  2383. }
  2384. /*
  2385. * The page straddles i_size. It must be zeroed out on each and every
  2386. * writepage invocation because it may be mmapped. "A file is mapped
  2387. * in multiples of the page size. For a file that is not a multiple of
  2388. * the page size, the remaining memory is zeroed when mapped, and
  2389. * writes to that region are not written out to the file."
  2390. */
  2391. zero_user_segment(page, offset, PAGE_SIZE);
  2392. return __block_write_full_page(inode, page, get_block, wbc,
  2393. end_buffer_async_write);
  2394. }
  2395. EXPORT_SYMBOL(block_write_full_page);
  2396. sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
  2397. get_block_t *get_block)
  2398. {
  2399. struct inode *inode = mapping->host;
  2400. struct buffer_head tmp = {
  2401. .b_size = i_blocksize(inode),
  2402. };
  2403. get_block(inode, block, &tmp, 0);
  2404. return tmp.b_blocknr;
  2405. }
  2406. EXPORT_SYMBOL(generic_block_bmap);
  2407. static void end_bio_bh_io_sync(struct bio *bio)
  2408. {
  2409. struct buffer_head *bh = bio->bi_private;
  2410. if (unlikely(bio_flagged(bio, BIO_QUIET)))
  2411. set_bit(BH_Quiet, &bh->b_state);
  2412. bh->b_end_io(bh, !bio->bi_status);
  2413. bio_put(bio);
  2414. }
  2415. static void submit_bh_wbc(blk_opf_t opf, struct buffer_head *bh,
  2416. struct writeback_control *wbc)
  2417. {
  2418. const enum req_op op = opf & REQ_OP_MASK;
  2419. struct bio *bio;
  2420. BUG_ON(!buffer_locked(bh));
  2421. BUG_ON(!buffer_mapped(bh));
  2422. BUG_ON(!bh->b_end_io);
  2423. BUG_ON(buffer_delay(bh));
  2424. BUG_ON(buffer_unwritten(bh));
  2425. /*
  2426. * Only clear out a write error when rewriting
  2427. */
  2428. if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
  2429. clear_buffer_write_io_error(bh);
  2430. if (buffer_meta(bh))
  2431. opf |= REQ_META;
  2432. if (buffer_prio(bh))
  2433. opf |= REQ_PRIO;
  2434. bio = bio_alloc(bh->b_bdev, 1, opf, GFP_NOIO);
  2435. fscrypt_set_bio_crypt_ctx_bh(bio, bh, GFP_NOIO);
  2436. bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
  2437. bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
  2438. BUG_ON(bio->bi_iter.bi_size != bh->b_size);
  2439. bio->bi_end_io = end_bio_bh_io_sync;
  2440. bio->bi_private = bh;
  2441. /* Take care of bh's that straddle the end of the device */
  2442. guard_bio_eod(bio);
  2443. if (wbc) {
  2444. wbc_init_bio(wbc, bio);
  2445. wbc_account_cgroup_owner(wbc, bh->b_page, bh->b_size);
  2446. }
  2447. submit_bio(bio);
  2448. }
  2449. void submit_bh(blk_opf_t opf, struct buffer_head *bh)
  2450. {
  2451. submit_bh_wbc(opf, bh, NULL);
  2452. }
  2453. EXPORT_SYMBOL(submit_bh);
  2454. void write_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
  2455. {
  2456. lock_buffer(bh);
  2457. if (!test_clear_buffer_dirty(bh)) {
  2458. unlock_buffer(bh);
  2459. return;
  2460. }
  2461. bh->b_end_io = end_buffer_write_sync;
  2462. get_bh(bh);
  2463. submit_bh(REQ_OP_WRITE | op_flags, bh);
  2464. }
  2465. EXPORT_SYMBOL(write_dirty_buffer);
  2466. /*
  2467. * For a data-integrity writeout, we need to wait upon any in-progress I/O
  2468. * and then start new I/O and then wait upon it. The caller must have a ref on
  2469. * the buffer_head.
  2470. */
  2471. int __sync_dirty_buffer(struct buffer_head *bh, blk_opf_t op_flags)
  2472. {
  2473. WARN_ON(atomic_read(&bh->b_count) < 1);
  2474. lock_buffer(bh);
  2475. if (test_clear_buffer_dirty(bh)) {
  2476. /*
  2477. * The bh should be mapped, but it might not be if the
  2478. * device was hot-removed. Not much we can do but fail the I/O.
  2479. */
  2480. if (!buffer_mapped(bh)) {
  2481. unlock_buffer(bh);
  2482. return -EIO;
  2483. }
  2484. get_bh(bh);
  2485. bh->b_end_io = end_buffer_write_sync;
  2486. submit_bh(REQ_OP_WRITE | op_flags, bh);
  2487. wait_on_buffer(bh);
  2488. if (!buffer_uptodate(bh))
  2489. return -EIO;
  2490. } else {
  2491. unlock_buffer(bh);
  2492. }
  2493. return 0;
  2494. }
  2495. EXPORT_SYMBOL(__sync_dirty_buffer);
  2496. int sync_dirty_buffer(struct buffer_head *bh)
  2497. {
  2498. return __sync_dirty_buffer(bh, REQ_SYNC);
  2499. }
  2500. EXPORT_SYMBOL(sync_dirty_buffer);
  2501. /*
  2502. * try_to_free_buffers() checks if all the buffers on this particular folio
  2503. * are unused, and releases them if so.
  2504. *
  2505. * Exclusion against try_to_free_buffers may be obtained by either
  2506. * locking the folio or by holding its mapping's private_lock.
  2507. *
  2508. * If the folio is dirty but all the buffers are clean then we need to
  2509. * be sure to mark the folio clean as well. This is because the folio
  2510. * may be against a block device, and a later reattachment of buffers
  2511. * to a dirty folio will set *all* buffers dirty. Which would corrupt
  2512. * filesystem data on the same device.
  2513. *
  2514. * The same applies to regular filesystem folios: if all the buffers are
  2515. * clean then we set the folio clean and proceed. To do that, we require
  2516. * total exclusion from block_dirty_folio(). That is obtained with
  2517. * private_lock.
  2518. *
  2519. * try_to_free_buffers() is non-blocking.
  2520. */
  2521. static inline int buffer_busy(struct buffer_head *bh)
  2522. {
  2523. return atomic_read(&bh->b_count) |
  2524. (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
  2525. }
  2526. static bool
  2527. drop_buffers(struct folio *folio, struct buffer_head **buffers_to_free)
  2528. {
  2529. struct buffer_head *head = folio_buffers(folio);
  2530. struct buffer_head *bh;
  2531. bh = head;
  2532. do {
  2533. if (buffer_busy(bh))
  2534. goto failed;
  2535. bh = bh->b_this_page;
  2536. } while (bh != head);
  2537. do {
  2538. struct buffer_head *next = bh->b_this_page;
  2539. if (bh->b_assoc_map)
  2540. __remove_assoc_queue(bh);
  2541. bh = next;
  2542. } while (bh != head);
  2543. *buffers_to_free = head;
  2544. folio_detach_private(folio);
  2545. return true;
  2546. failed:
  2547. return false;
  2548. }
  2549. bool try_to_free_buffers(struct folio *folio)
  2550. {
  2551. struct address_space * const mapping = folio->mapping;
  2552. struct buffer_head *buffers_to_free = NULL;
  2553. bool ret = 0;
  2554. BUG_ON(!folio_test_locked(folio));
  2555. if (folio_test_writeback(folio))
  2556. return false;
  2557. if (mapping == NULL) { /* can this still happen? */
  2558. ret = drop_buffers(folio, &buffers_to_free);
  2559. goto out;
  2560. }
  2561. spin_lock(&mapping->private_lock);
  2562. ret = drop_buffers(folio, &buffers_to_free);
  2563. /*
  2564. * If the filesystem writes its buffers by hand (eg ext3)
  2565. * then we can have clean buffers against a dirty folio. We
  2566. * clean the folio here; otherwise the VM will never notice
  2567. * that the filesystem did any IO at all.
  2568. *
  2569. * Also, during truncate, discard_buffer will have marked all
  2570. * the folio's buffers clean. We discover that here and clean
  2571. * the folio also.
  2572. *
  2573. * private_lock must be held over this entire operation in order
  2574. * to synchronise against block_dirty_folio and prevent the
  2575. * dirty bit from being lost.
  2576. */
  2577. if (ret)
  2578. folio_cancel_dirty(folio);
  2579. spin_unlock(&mapping->private_lock);
  2580. out:
  2581. if (buffers_to_free) {
  2582. struct buffer_head *bh = buffers_to_free;
  2583. do {
  2584. struct buffer_head *next = bh->b_this_page;
  2585. free_buffer_head(bh);
  2586. bh = next;
  2587. } while (bh != buffers_to_free);
  2588. }
  2589. return ret;
  2590. }
  2591. EXPORT_SYMBOL(try_to_free_buffers);
  2592. /*
  2593. * Buffer-head allocation
  2594. */
  2595. static struct kmem_cache *bh_cachep __read_mostly;
  2596. /*
  2597. * Once the number of bh's in the machine exceeds this level, we start
  2598. * stripping them in writeback.
  2599. */
  2600. static unsigned long max_buffer_heads;
  2601. int buffer_heads_over_limit;
  2602. struct bh_accounting {
  2603. int nr; /* Number of live bh's */
  2604. int ratelimit; /* Limit cacheline bouncing */
  2605. };
  2606. static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
  2607. static void recalc_bh_state(void)
  2608. {
  2609. int i;
  2610. int tot = 0;
  2611. if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
  2612. return;
  2613. __this_cpu_write(bh_accounting.ratelimit, 0);
  2614. for_each_online_cpu(i)
  2615. tot += per_cpu(bh_accounting, i).nr;
  2616. buffer_heads_over_limit = (tot > max_buffer_heads);
  2617. }
  2618. struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
  2619. {
  2620. struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);
  2621. if (ret) {
  2622. INIT_LIST_HEAD(&ret->b_assoc_buffers);
  2623. spin_lock_init(&ret->b_uptodate_lock);
  2624. preempt_disable();
  2625. __this_cpu_inc(bh_accounting.nr);
  2626. recalc_bh_state();
  2627. preempt_enable();
  2628. }
  2629. return ret;
  2630. }
  2631. EXPORT_SYMBOL(alloc_buffer_head);
  2632. void free_buffer_head(struct buffer_head *bh)
  2633. {
  2634. BUG_ON(!list_empty(&bh->b_assoc_buffers));
  2635. kmem_cache_free(bh_cachep, bh);
  2636. preempt_disable();
  2637. __this_cpu_dec(bh_accounting.nr);
  2638. recalc_bh_state();
  2639. preempt_enable();
  2640. }
  2641. EXPORT_SYMBOL(free_buffer_head);
  2642. static int buffer_exit_cpu_dead(unsigned int cpu)
  2643. {
  2644. int i;
  2645. struct bh_lru *b = &per_cpu(bh_lrus, cpu);
  2646. for (i = 0; i < BH_LRU_SIZE; i++) {
  2647. brelse(b->bhs[i]);
  2648. b->bhs[i] = NULL;
  2649. }
  2650. this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
  2651. per_cpu(bh_accounting, cpu).nr = 0;
  2652. return 0;
  2653. }
  2654. /**
  2655. * bh_uptodate_or_lock - Test whether the buffer is uptodate
  2656. * @bh: struct buffer_head
  2657. *
  2658. * Return true if the buffer is up-to-date and false,
  2659. * with the buffer locked, if not.
  2660. */
  2661. int bh_uptodate_or_lock(struct buffer_head *bh)
  2662. {
  2663. if (!buffer_uptodate(bh)) {
  2664. lock_buffer(bh);
  2665. if (!buffer_uptodate(bh))
  2666. return 0;
  2667. unlock_buffer(bh);
  2668. }
  2669. return 1;
  2670. }
  2671. EXPORT_SYMBOL(bh_uptodate_or_lock);
  2672. /**
  2673. * __bh_read - Submit read for a locked buffer
  2674. * @bh: struct buffer_head
  2675. * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
  2676. * @wait: wait until reading finish
  2677. *
  2678. * Returns zero on success or don't wait, and -EIO on error.
  2679. */
  2680. int __bh_read(struct buffer_head *bh, blk_opf_t op_flags, bool wait)
  2681. {
  2682. int ret = 0;
  2683. BUG_ON(!buffer_locked(bh));
  2684. get_bh(bh);
  2685. bh->b_end_io = end_buffer_read_sync;
  2686. submit_bh(REQ_OP_READ | op_flags, bh);
  2687. if (wait) {
  2688. wait_on_buffer(bh);
  2689. if (!buffer_uptodate(bh))
  2690. ret = -EIO;
  2691. }
  2692. return ret;
  2693. }
  2694. EXPORT_SYMBOL(__bh_read);
  2695. /**
  2696. * __bh_read_batch - Submit read for a batch of unlocked buffers
  2697. * @nr: entry number of the buffer batch
  2698. * @bhs: a batch of struct buffer_head
  2699. * @op_flags: appending REQ_OP_* flags besides REQ_OP_READ
  2700. * @force_lock: force to get a lock on the buffer if set, otherwise drops any
  2701. * buffer that cannot lock.
  2702. *
  2703. * Returns zero on success or don't wait, and -EIO on error.
  2704. */
  2705. void __bh_read_batch(int nr, struct buffer_head *bhs[],
  2706. blk_opf_t op_flags, bool force_lock)
  2707. {
  2708. int i;
  2709. for (i = 0; i < nr; i++) {
  2710. struct buffer_head *bh = bhs[i];
  2711. if (buffer_uptodate(bh))
  2712. continue;
  2713. if (force_lock)
  2714. lock_buffer(bh);
  2715. else
  2716. if (!trylock_buffer(bh))
  2717. continue;
  2718. if (buffer_uptodate(bh)) {
  2719. unlock_buffer(bh);
  2720. continue;
  2721. }
  2722. bh->b_end_io = end_buffer_read_sync;
  2723. get_bh(bh);
  2724. submit_bh(REQ_OP_READ | op_flags, bh);
  2725. }
  2726. }
  2727. EXPORT_SYMBOL(__bh_read_batch);
  2728. void __init buffer_init(void)
  2729. {
  2730. unsigned long nrpages;
  2731. int ret;
  2732. bh_cachep = kmem_cache_create("buffer_head",
  2733. sizeof(struct buffer_head), 0,
  2734. (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
  2735. SLAB_MEM_SPREAD),
  2736. NULL);
  2737. /*
  2738. * Limit the bh occupancy to 10% of ZONE_NORMAL
  2739. */
  2740. nrpages = (nr_free_buffer_pages() * 10) / 100;
  2741. max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
  2742. ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
  2743. NULL, buffer_exit_cpu_dead);
  2744. WARN_ON(ret < 0);
  2745. }