xfs_buf.c

  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * Copyright (c) 2000-2006 Silicon Graphics, Inc.
  4. * All Rights Reserved.
  5. */
  6. #include "xfs.h"
  7. #include <linux/backing-dev.h>
  8. #include <linux/dax.h>
  9. #include "xfs_shared.h"
  10. #include "xfs_format.h"
  11. #include "xfs_log_format.h"
  12. #include "xfs_trans_resv.h"
  13. #include "xfs_mount.h"
  14. #include "xfs_trace.h"
  15. #include "xfs_log.h"
  16. #include "xfs_log_recover.h"
  17. #include "xfs_log_priv.h"
  18. #include "xfs_trans.h"
  19. #include "xfs_buf_item.h"
  20. #include "xfs_errortag.h"
  21. #include "xfs_error.h"
  22. #include "xfs_ag.h"
  23. struct kmem_cache *xfs_buf_cache;
  24. /*
  25. * Locking orders
  26. *
  27. * xfs_buf_ioacct_inc:
  28. * xfs_buf_ioacct_dec:
  29. * b_sema (caller holds)
  30. * b_lock
  31. *
  32. * xfs_buf_stale:
  33. * b_sema (caller holds)
  34. * b_lock
  35. * lru_lock
  36. *
  37. * xfs_buf_rele:
  38. * b_lock
  39. * pag_buf_lock
  40. * lru_lock
  41. *
  42. * xfs_buftarg_drain_rele
  43. * lru_lock
  44. * b_lock (trylock due to inversion)
  45. *
  46. * xfs_buftarg_isolate
  47. * lru_lock
  48. * b_lock (trylock due to inversion)
  49. */
  50. static int __xfs_buf_submit(struct xfs_buf *bp, bool wait);
  51. static inline int
  52. xfs_buf_submit(
  53. struct xfs_buf *bp)
  54. {
  55. return __xfs_buf_submit(bp, !(bp->b_flags & XBF_ASYNC));
  56. }
  57. static inline int
  58. xfs_buf_is_vmapped(
  59. struct xfs_buf *bp)
  60. {
  61. /*
  62. * Return true if the buffer is vmapped.
  63. *
  64. * b_addr is null if the buffer is not mapped, but a single-page buffer
  65. * gets its b_addr straight from the page without vm_map_ram(), so the
  66. * check has to test both b_addr and bp->b_page_count > 1.
  67. */
  68. return bp->b_addr && bp->b_page_count > 1;
  69. }
  70. static inline int
  71. xfs_buf_vmap_len(
  72. struct xfs_buf *bp)
  73. {
  74. return (bp->b_page_count * PAGE_SIZE);
  75. }
  76. /*
  77. * Bump the I/O in flight count on the buftarg if we haven't yet done so for
  78. * this buffer. The count is incremented once per buffer (per hold cycle)
  79. * because the corresponding decrement is deferred to buffer release. Buffers
  80. * can undergo I/O multiple times in a hold-release cycle and per buffer I/O
  81. * tracking adds unnecessary overhead. This is used for synchronization purposes
  82. * with unmount (see xfs_buftarg_drain()), so all we really need is a count of
  83. * in-flight buffers.
  84. *
  85. * Buffers that are never released (e.g., superblock, iclog buffers) must set
  86. * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count
  87. * never reaches zero and unmount hangs indefinitely.
  88. */
  89. static inline void
  90. xfs_buf_ioacct_inc(
  91. struct xfs_buf *bp)
  92. {
  93. if (bp->b_flags & XBF_NO_IOACCT)
  94. return;
  95. ASSERT(bp->b_flags & XBF_ASYNC);
  96. spin_lock(&bp->b_lock);
  97. if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) {
  98. bp->b_state |= XFS_BSTATE_IN_FLIGHT;
  99. percpu_counter_inc(&bp->b_target->bt_io_count);
  100. }
  101. spin_unlock(&bp->b_lock);
  102. }
  103. /*
  104. * Clear the in-flight state on a buffer about to be released to the LRU or
  105. * freed and unaccount from the buftarg.
  106. */
  107. static inline void
  108. __xfs_buf_ioacct_dec(
  109. struct xfs_buf *bp)
  110. {
  111. lockdep_assert_held(&bp->b_lock);
  112. if (bp->b_state & XFS_BSTATE_IN_FLIGHT) {
  113. bp->b_state &= ~XFS_BSTATE_IN_FLIGHT;
  114. percpu_counter_dec(&bp->b_target->bt_io_count);
  115. }
  116. }
  117. static inline void
  118. xfs_buf_ioacct_dec(
  119. struct xfs_buf *bp)
  120. {
  121. spin_lock(&bp->b_lock);
  122. __xfs_buf_ioacct_dec(bp);
  123. spin_unlock(&bp->b_lock);
  124. }
  125. /*
  126. * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  127. * b_lru_ref count so that the buffer is freed immediately when the buffer
  128. * reference count falls to zero. If the buffer is already on the LRU, we need
  129. * to remove the reference that LRU holds on the buffer.
  130. *
  131. * This prevents build-up of stale buffers on the LRU.
  132. */
  133. void
  134. xfs_buf_stale(
  135. struct xfs_buf *bp)
  136. {
  137. ASSERT(xfs_buf_islocked(bp));
  138. bp->b_flags |= XBF_STALE;
  139. /*
  140. * Clear the delwri status so that a delwri queue walker will not
  141. * flush this buffer to disk now that it is stale. The delwri queue has
  142. * a reference to the buffer, so this is safe to do.
  143. */
  144. bp->b_flags &= ~_XBF_DELWRI_Q;
  145. /*
  146. * Once the buffer is marked stale and unlocked, a subsequent lookup
  147. * could reset b_flags. There is no guarantee that the buffer is
  148. * unaccounted (released to LRU) before that occurs. Drop in-flight
  149. * status now to preserve accounting consistency.
  150. */
  151. spin_lock(&bp->b_lock);
  152. __xfs_buf_ioacct_dec(bp);
  153. atomic_set(&bp->b_lru_ref, 0);
  154. if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
  155. (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
  156. atomic_dec(&bp->b_hold);
  157. ASSERT(atomic_read(&bp->b_hold) >= 1);
  158. spin_unlock(&bp->b_lock);
  159. }
  160. static int
  161. xfs_buf_get_maps(
  162. struct xfs_buf *bp,
  163. int map_count)
  164. {
  165. ASSERT(bp->b_maps == NULL);
  166. bp->b_map_count = map_count;
  167. if (map_count == 1) {
  168. bp->b_maps = &bp->__b_map;
  169. return 0;
  170. }
  171. bp->b_maps = kmem_zalloc(map_count * sizeof(struct xfs_buf_map),
  172. KM_NOFS);
  173. if (!bp->b_maps)
  174. return -ENOMEM;
  175. return 0;
  176. }
  177. /*
  178. * Frees b_pages if it was allocated.
  179. */
  180. static void
  181. xfs_buf_free_maps(
  182. struct xfs_buf *bp)
  183. {
  184. if (bp->b_maps != &bp->__b_map) {
  185. kmem_free(bp->b_maps);
  186. bp->b_maps = NULL;
  187. }
  188. }
  189. static int
  190. _xfs_buf_alloc(
  191. struct xfs_buftarg *target,
  192. struct xfs_buf_map *map,
  193. int nmaps,
  194. xfs_buf_flags_t flags,
  195. struct xfs_buf **bpp)
  196. {
  197. struct xfs_buf *bp;
  198. int error;
  199. int i;
  200. *bpp = NULL;
  201. bp = kmem_cache_zalloc(xfs_buf_cache, GFP_NOFS | __GFP_NOFAIL);
  202. /*
  203. * We don't want certain flags to appear in b_flags unless they are
  204. * specifically set by later operations on the buffer.
  205. */
  206. flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD);
  207. atomic_set(&bp->b_hold, 1);
  208. atomic_set(&bp->b_lru_ref, 1);
  209. init_completion(&bp->b_iowait);
  210. INIT_LIST_HEAD(&bp->b_lru);
  211. INIT_LIST_HEAD(&bp->b_list);
  212. INIT_LIST_HEAD(&bp->b_li_list);
  213. sema_init(&bp->b_sema, 0); /* held, no waiters */
  214. spin_lock_init(&bp->b_lock);
  215. bp->b_target = target;
  216. bp->b_mount = target->bt_mount;
  217. bp->b_flags = flags;
  218. /*
  219. * Set length and io_length to the same value initially.
  220. * I/O routines should use io_length, which will be the same in
  221. * most cases but may be reset (e.g. XFS recovery).
  222. */
  223. error = xfs_buf_get_maps(bp, nmaps);
  224. if (error) {
  225. kmem_cache_free(xfs_buf_cache, bp);
  226. return error;
  227. }
  228. bp->b_rhash_key = map[0].bm_bn;
  229. bp->b_length = 0;
  230. for (i = 0; i < nmaps; i++) {
  231. bp->b_maps[i].bm_bn = map[i].bm_bn;
  232. bp->b_maps[i].bm_len = map[i].bm_len;
  233. bp->b_length += map[i].bm_len;
  234. }
  235. atomic_set(&bp->b_pin_count, 0);
  236. init_waitqueue_head(&bp->b_waiters);
  237. XFS_STATS_INC(bp->b_mount, xb_create);
  238. trace_xfs_buf_init(bp, _RET_IP_);
  239. *bpp = bp;
  240. return 0;
  241. }
  242. static void
  243. xfs_buf_free_pages(
  244. struct xfs_buf *bp)
  245. {
  246. uint i;
  247. ASSERT(bp->b_flags & _XBF_PAGES);
  248. if (xfs_buf_is_vmapped(bp))
  249. vm_unmap_ram(bp->b_addr, bp->b_page_count);
  250. for (i = 0; i < bp->b_page_count; i++) {
  251. if (bp->b_pages[i])
  252. __free_page(bp->b_pages[i]);
  253. }
  254. if (current->reclaim_state)
  255. current->reclaim_state->reclaimed_slab += bp->b_page_count;
  256. if (bp->b_pages != bp->b_page_array)
  257. kmem_free(bp->b_pages);
  258. bp->b_pages = NULL;
  259. bp->b_flags &= ~_XBF_PAGES;
  260. }
  261. static void
  262. xfs_buf_free_callback(
  263. struct callback_head *cb)
  264. {
  265. struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu);
  266. xfs_buf_free_maps(bp);
  267. kmem_cache_free(xfs_buf_cache, bp);
  268. }
  269. static void
  270. xfs_buf_free(
  271. struct xfs_buf *bp)
  272. {
  273. trace_xfs_buf_free(bp, _RET_IP_);
  274. ASSERT(list_empty(&bp->b_lru));
  275. if (bp->b_flags & _XBF_PAGES)
  276. xfs_buf_free_pages(bp);
  277. else if (bp->b_flags & _XBF_KMEM)
  278. kmem_free(bp->b_addr);
  279. call_rcu(&bp->b_rcu, xfs_buf_free_callback);
  280. }
  281. static int
  282. xfs_buf_alloc_kmem(
  283. struct xfs_buf *bp,
  284. xfs_buf_flags_t flags)
  285. {
  286. xfs_km_flags_t kmflag_mask = KM_NOFS;
  287. size_t size = BBTOB(bp->b_length);
  288. /* Assure zeroed buffer for non-read cases. */
  289. if (!(flags & XBF_READ))
  290. kmflag_mask |= KM_ZERO;
  291. bp->b_addr = kmem_alloc(size, kmflag_mask);
  292. if (!bp->b_addr)
  293. return -ENOMEM;
  294. if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) !=
  295. ((unsigned long)bp->b_addr & PAGE_MASK)) {
  296. /* b_addr spans two pages - use alloc_page instead */
  297. kmem_free(bp->b_addr);
  298. bp->b_addr = NULL;
  299. return -ENOMEM;
  300. }
  301. bp->b_offset = offset_in_page(bp->b_addr);
  302. bp->b_pages = bp->b_page_array;
  303. bp->b_pages[0] = kmem_to_page(bp->b_addr);
  304. bp->b_page_count = 1;
  305. bp->b_flags |= _XBF_KMEM;
  306. return 0;
  307. }
  308. static int
  309. xfs_buf_alloc_pages(
  310. struct xfs_buf *bp,
  311. xfs_buf_flags_t flags)
  312. {
  313. gfp_t gfp_mask = __GFP_NOWARN;
  314. long filled = 0;
  315. if (flags & XBF_READ_AHEAD)
  316. gfp_mask |= __GFP_NORETRY;
  317. else
  318. gfp_mask |= GFP_NOFS;
  319. /* Make sure that we have a page list */
  320. bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE);
  321. if (bp->b_page_count <= XB_PAGES) {
  322. bp->b_pages = bp->b_page_array;
  323. } else {
  324. bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count,
  325. gfp_mask);
  326. if (!bp->b_pages)
  327. return -ENOMEM;
  328. }
  329. bp->b_flags |= _XBF_PAGES;
  330. /* Assure zeroed buffer for non-read cases. */
  331. if (!(flags & XBF_READ))
  332. gfp_mask |= __GFP_ZERO;
  333. /*
  334. * Bulk filling of pages can take multiple calls. Not filling the entire
  335. * array is not an allocation failure, so don't back off if we get at
  336. * least one extra page.
  337. */
  338. for (;;) {
  339. long last = filled;
  340. filled = alloc_pages_bulk_array(gfp_mask, bp->b_page_count,
  341. bp->b_pages);
  342. if (filled == bp->b_page_count) {
  343. XFS_STATS_INC(bp->b_mount, xb_page_found);
  344. break;
  345. }
  346. if (filled != last)
  347. continue;
  348. if (flags & XBF_READ_AHEAD) {
  349. xfs_buf_free_pages(bp);
  350. return -ENOMEM;
  351. }
  352. XFS_STATS_INC(bp->b_mount, xb_page_retries);
  353. memalloc_retry_wait(gfp_mask);
  354. }
  355. return 0;
  356. }
  357. /*
  358. * Map buffer into kernel address-space if necessary.
  359. */
  360. STATIC int
  361. _xfs_buf_map_pages(
  362. struct xfs_buf *bp,
  363. xfs_buf_flags_t flags)
  364. {
  365. ASSERT(bp->b_flags & _XBF_PAGES);
  366. if (bp->b_page_count == 1) {
  367. /* A single page buffer is always mappable */
  368. bp->b_addr = page_address(bp->b_pages[0]);
  369. } else if (flags & XBF_UNMAPPED) {
  370. bp->b_addr = NULL;
  371. } else {
  372. int retried = 0;
  373. unsigned nofs_flag;
  374. /*
  375. * vm_map_ram() will allocate auxiliary structures (e.g.
  376. * pagetables) with GFP_KERNEL, yet we are likely to be under
  377. * GFP_NOFS context here. Hence we need to tell memory reclaim
  378. * that we are in such a context via PF_MEMALLOC_NOFS to prevent
  379. * memory reclaim re-entering the filesystem here and
  380. * potentially deadlocking.
  381. */
  382. nofs_flag = memalloc_nofs_save();
  383. do {
  384. bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
  385. -1);
  386. if (bp->b_addr)
  387. break;
  388. vm_unmap_aliases();
  389. } while (retried++ <= 1);
  390. memalloc_nofs_restore(nofs_flag);
  391. if (!bp->b_addr)
  392. return -ENOMEM;
  393. }
  394. return 0;
  395. }
  396. /*
  397. * Finding and Reading Buffers
  398. */
  399. static int
  400. _xfs_buf_obj_cmp(
  401. struct rhashtable_compare_arg *arg,
  402. const void *obj)
  403. {
  404. const struct xfs_buf_map *map = arg->key;
  405. const struct xfs_buf *bp = obj;
  406. /*
  407. * The key hashing in the lookup path depends on the key being the
  408. * first element of the compare_arg, make sure to assert this.
  409. */
  410. BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
  411. if (bp->b_rhash_key != map->bm_bn)
  412. return 1;
  413. if (unlikely(bp->b_length != map->bm_len)) {
  414. /*
  415. * found a block number match. If the range doesn't
  416. * match, the only way this is allowed is if the buffer
  417. * in the cache is stale and the transaction that made
  418. * it stale has not yet committed. i.e. we are
  419. * reallocating a busy extent. Skip this buffer and
  420. * continue searching for an exact match.
  421. */
  422. ASSERT(bp->b_flags & XBF_STALE);
  423. return 1;
  424. }
  425. return 0;
  426. }
  427. static const struct rhashtable_params xfs_buf_hash_params = {
  428. .min_size = 32, /* empty AGs have minimal footprint */
  429. .nelem_hint = 16,
  430. .key_len = sizeof(xfs_daddr_t),
  431. .key_offset = offsetof(struct xfs_buf, b_rhash_key),
  432. .head_offset = offsetof(struct xfs_buf, b_rhash_head),
  433. .automatic_shrinking = true,
  434. .obj_cmpfn = _xfs_buf_obj_cmp,
  435. };
  436. int
  437. xfs_buf_hash_init(
  438. struct xfs_perag *pag)
  439. {
  440. spin_lock_init(&pag->pag_buf_lock);
  441. return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
  442. }
  443. void
  444. xfs_buf_hash_destroy(
  445. struct xfs_perag *pag)
  446. {
  447. rhashtable_destroy(&pag->pag_buf_hash);
  448. }
  449. static int
  450. xfs_buf_map_verify(
  451. struct xfs_buftarg *btp,
  452. struct xfs_buf_map *map)
  453. {
  454. xfs_daddr_t eofs;
  455. /* Check for IOs smaller than the sector size / not sector aligned */
  456. ASSERT(!(BBTOB(map->bm_len) < btp->bt_meta_sectorsize));
  457. ASSERT(!(BBTOB(map->bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
  458. /*
  459. * Corrupted block numbers can get through to here, unfortunately, so we
  460. * have to check that the buffer falls within the filesystem bounds.
  461. */
  462. eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
  463. if (map->bm_bn < 0 || map->bm_bn >= eofs) {
  464. xfs_alert(btp->bt_mount,
  465. "%s: daddr 0x%llx out of range, EOFS 0x%llx",
  466. __func__, map->bm_bn, eofs);
  467. WARN_ON(1);
  468. return -EFSCORRUPTED;
  469. }
  470. return 0;
  471. }
  472. static int
  473. xfs_buf_find_lock(
  474. struct xfs_buf *bp,
  475. xfs_buf_flags_t flags)
  476. {
  477. if (flags & XBF_TRYLOCK) {
  478. if (!xfs_buf_trylock(bp)) {
  479. XFS_STATS_INC(bp->b_mount, xb_busy_locked);
  480. return -EAGAIN;
  481. }
  482. } else {
  483. xfs_buf_lock(bp);
  484. XFS_STATS_INC(bp->b_mount, xb_get_locked_waited);
  485. }
  486. /*
  487. * if the buffer is stale, clear all the external state associated with
  488. * it. We need to keep flags such as how we allocated the buffer memory
  489. * intact here.
  490. */
  491. if (bp->b_flags & XBF_STALE) {
  492. ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
  493. bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
  494. bp->b_ops = NULL;
  495. }
  496. return 0;
  497. }
  498. static inline int
  499. xfs_buf_lookup(
  500. struct xfs_perag *pag,
  501. struct xfs_buf_map *map,
  502. xfs_buf_flags_t flags,
  503. struct xfs_buf **bpp)
  504. {
  505. struct xfs_buf *bp;
  506. int error;
  507. rcu_read_lock();
  508. bp = rhashtable_lookup(&pag->pag_buf_hash, map, xfs_buf_hash_params);
  509. if (!bp || !atomic_inc_not_zero(&bp->b_hold)) {
  510. rcu_read_unlock();
  511. return -ENOENT;
  512. }
  513. rcu_read_unlock();
  514. error = xfs_buf_find_lock(bp, flags);
  515. if (error) {
  516. xfs_buf_rele(bp);
  517. return error;
  518. }
  519. trace_xfs_buf_find(bp, flags, _RET_IP_);
  520. *bpp = bp;
  521. return 0;
  522. }
  523. /*
  524. * Insert the new_bp into the hash table. This consumes the perag reference
  525. * taken for the lookup regardless of the result of the insert.
  526. */
  527. static int
  528. xfs_buf_find_insert(
  529. struct xfs_buftarg *btp,
  530. struct xfs_perag *pag,
  531. struct xfs_buf_map *cmap,
  532. struct xfs_buf_map *map,
  533. int nmaps,
  534. xfs_buf_flags_t flags,
  535. struct xfs_buf **bpp)
  536. {
  537. struct xfs_buf *new_bp;
  538. struct xfs_buf *bp;
  539. int error;
  540. error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp);
  541. if (error)
  542. goto out_drop_pag;
  543. /*
  544. * For buffers that fit entirely within a single page, first attempt to
  545. * allocate the memory from the heap to minimise memory usage. If we
  546. * can't get heap memory for these small buffers, we fall back to using
  547. * the page allocator.
  548. */
  549. if (BBTOB(new_bp->b_length) >= PAGE_SIZE ||
  550. xfs_buf_alloc_kmem(new_bp, flags) < 0) {
  551. error = xfs_buf_alloc_pages(new_bp, flags);
  552. if (error)
  553. goto out_free_buf;
  554. }
  555. spin_lock(&pag->pag_buf_lock);
  556. bp = rhashtable_lookup_get_insert_fast(&pag->pag_buf_hash,
  557. &new_bp->b_rhash_head, xfs_buf_hash_params);
  558. if (IS_ERR(bp)) {
  559. error = PTR_ERR(bp);
  560. spin_unlock(&pag->pag_buf_lock);
  561. goto out_free_buf;
  562. }
  563. if (bp) {
  564. /* found an existing buffer */
  565. atomic_inc(&bp->b_hold);
  566. spin_unlock(&pag->pag_buf_lock);
  567. error = xfs_buf_find_lock(bp, flags);
  568. if (error)
  569. xfs_buf_rele(bp);
  570. else
  571. *bpp = bp;
  572. goto out_free_buf;
  573. }
  574. /* The new buffer keeps the perag reference until it is freed. */
  575. new_bp->b_pag = pag;
  576. spin_unlock(&pag->pag_buf_lock);
  577. *bpp = new_bp;
  578. return 0;
  579. out_free_buf:
  580. xfs_buf_free(new_bp);
  581. out_drop_pag:
  582. xfs_perag_put(pag);
  583. return error;
  584. }
  585. /*
  586. * Assembles a buffer covering the specified range. The code is optimised for
  587. * cache hits, as metadata intensive workloads will see 3 orders of magnitude
  588. * more hits than misses.
  589. */
  590. int
  591. xfs_buf_get_map(
  592. struct xfs_buftarg *btp,
  593. struct xfs_buf_map *map,
  594. int nmaps,
  595. xfs_buf_flags_t flags,
  596. struct xfs_buf **bpp)
  597. {
  598. struct xfs_perag *pag;
  599. struct xfs_buf *bp = NULL;
  600. struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn };
  601. int error;
  602. int i;
  603. for (i = 0; i < nmaps; i++)
  604. cmap.bm_len += map[i].bm_len;
  605. error = xfs_buf_map_verify(btp, &cmap);
  606. if (error)
  607. return error;
  608. pag = xfs_perag_get(btp->bt_mount,
  609. xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
  610. error = xfs_buf_lookup(pag, &cmap, flags, &bp);
  611. if (error && error != -ENOENT)
  612. goto out_put_perag;
  613. /* cache hits always outnumber misses by at least 10:1 */
  614. if (unlikely(!bp)) {
  615. XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
  616. if (flags & XBF_INCORE)
  617. goto out_put_perag;
  618. /* xfs_buf_find_insert() consumes the perag reference. */
  619. error = xfs_buf_find_insert(btp, pag, &cmap, map, nmaps,
  620. flags, &bp);
  621. if (error)
  622. return error;
  623. } else {
  624. XFS_STATS_INC(btp->bt_mount, xb_get_locked);
  625. xfs_perag_put(pag);
  626. }
  627. /* We do not hold a perag reference anymore. */
  628. if (!bp->b_addr) {
  629. error = _xfs_buf_map_pages(bp, flags);
  630. if (unlikely(error)) {
  631. xfs_warn_ratelimited(btp->bt_mount,
  632. "%s: failed to map %u pages", __func__,
  633. bp->b_page_count);
  634. xfs_buf_relse(bp);
  635. return error;
  636. }
  637. }
  638. /*
  639. * Clear b_error if this is a lookup from a caller that doesn't expect
  640. * valid data to be found in the buffer.
  641. */
  642. if (!(flags & XBF_READ))
  643. xfs_buf_ioerror(bp, 0);
  644. XFS_STATS_INC(btp->bt_mount, xb_get);
  645. trace_xfs_buf_get(bp, flags, _RET_IP_);
  646. *bpp = bp;
  647. return 0;
  648. out_put_perag:
  649. xfs_perag_put(pag);
  650. return error;
  651. }
  652. int
  653. _xfs_buf_read(
  654. struct xfs_buf *bp,
  655. xfs_buf_flags_t flags)
  656. {
  657. ASSERT(!(flags & XBF_WRITE));
  658. ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL);
  659. bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE);
  660. bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
  661. return xfs_buf_submit(bp);
  662. }
  663. /*
  664. * Reverify a buffer found in cache without an attached ->b_ops.
  665. *
  666. * If the caller passed an ops structure and the buffer doesn't have ops
  667. * assigned, set the ops and use it to verify the contents. If verification
  668. * fails, clear XBF_DONE. We assume the buffer has no recorded errors and is
  669. * already in XBF_DONE state on entry.
  670. *
  671. * Under normal operations, every in-core buffer is verified on read I/O
  672. * completion. There are two scenarios that can lead to in-core buffers without
  673. * an assigned ->b_ops. The first is during log recovery of buffers on a V4
  674. * filesystem, though these buffers are purged at the end of recovery. The
  675. * other is online repair, which intentionally reads with a NULL buffer ops to
  676. * run several verifiers across an in-core buffer in order to establish buffer
  677. * type. If repair can't establish that, the buffer will be left in memory
  678. * with NULL buffer ops.
  679. */
  680. int
  681. xfs_buf_reverify(
  682. struct xfs_buf *bp,
  683. const struct xfs_buf_ops *ops)
  684. {
  685. ASSERT(bp->b_flags & XBF_DONE);
  686. ASSERT(bp->b_error == 0);
  687. if (!ops || bp->b_ops)
  688. return 0;
  689. bp->b_ops = ops;
  690. bp->b_ops->verify_read(bp);
  691. if (bp->b_error)
  692. bp->b_flags &= ~XBF_DONE;
  693. return bp->b_error;
  694. }
  695. int
  696. xfs_buf_read_map(
  697. struct xfs_buftarg *target,
  698. struct xfs_buf_map *map,
  699. int nmaps,
  700. xfs_buf_flags_t flags,
  701. struct xfs_buf **bpp,
  702. const struct xfs_buf_ops *ops,
  703. xfs_failaddr_t fa)
  704. {
  705. struct xfs_buf *bp;
  706. int error;
  707. flags |= XBF_READ;
  708. *bpp = NULL;
  709. error = xfs_buf_get_map(target, map, nmaps, flags, &bp);
  710. if (error)
  711. return error;
  712. trace_xfs_buf_read(bp, flags, _RET_IP_);
  713. if (!(bp->b_flags & XBF_DONE)) {
  714. /* Initiate the buffer read and wait. */
  715. XFS_STATS_INC(target->bt_mount, xb_get_read);
  716. bp->b_ops = ops;
  717. error = _xfs_buf_read(bp, flags);
  718. /* Readahead iodone already dropped the buffer, so exit. */
  719. if (flags & XBF_ASYNC)
  720. return 0;
  721. } else {
  722. /* Buffer already read; all we need to do is check it. */
  723. error = xfs_buf_reverify(bp, ops);
  724. /* Readahead already finished; drop the buffer and exit. */
  725. if (flags & XBF_ASYNC) {
  726. xfs_buf_relse(bp);
  727. return 0;
  728. }
  729. /* We do not want read in the flags */
  730. bp->b_flags &= ~XBF_READ;
  731. ASSERT(bp->b_ops != NULL || ops == NULL);
  732. }
  733. /*
  734. * If we've had a read error, then the contents of the buffer are
  735. * invalid and should not be used. To ensure that a followup read tries
  736. * to pull the buffer from disk again, we clear the XBF_DONE flag and
  737. * mark the buffer stale. This ensures that anyone who has a current
  738. * reference to the buffer will interpret its contents correctly and
  739. * future cache lookups will also treat it as an empty, uninitialised
  740. * buffer.
  741. */
  742. if (error) {
  743. /*
  744. * Check against log shutdown for error reporting because
  745. * metadata writeback may require a read first and we need to
  746. * report errors in metadata writeback until the log is shut
  747. * down. High level transaction read functions already check
  748. * against mount shutdown, anyway, so we only need to be
  749. * concerned about low level IO interactions here.
  750. */
  751. if (!xlog_is_shutdown(target->bt_mount->m_log))
  752. xfs_buf_ioerror_alert(bp, fa);
  753. bp->b_flags &= ~XBF_DONE;
  754. xfs_buf_stale(bp);
  755. xfs_buf_relse(bp);
  756. /* bad CRC means corrupted metadata */
  757. if (error == -EFSBADCRC)
  758. error = -EFSCORRUPTED;
  759. return error;
  760. }
  761. *bpp = bp;
  762. return 0;
  763. }
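/*
 * Example usage (illustrative sketch only): a caller reading a single-extent
 * metadata buffer through xfs_buf_read_map(). The helper name and its
 * arguments are hypothetical stand-ins; callers typically reach this path
 * through wrappers such as xfs_buf_read() or xfs_trans_read_buf().
 */
static inline int
example_read_one_buffer(
	struct xfs_buftarg	*btp,
	xfs_daddr_t		daddr,
	size_t			numblks,
	const struct xfs_buf_ops *ops,
	struct xfs_buf		**bpp)
{
	DEFINE_SINGLE_BUF_MAP(map, daddr, numblks);

	/* Blocking, synchronous read; the buffer is returned locked on success. */
	return xfs_buf_read_map(btp, &map, 1, 0, bpp, ops, __this_address);
}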
  764. /*
  765. * If we are not low on memory then do the readahead in a deadlock
  766. * safe manner.
  767. */
  768. void
  769. xfs_buf_readahead_map(
  770. struct xfs_buftarg *target,
  771. struct xfs_buf_map *map,
  772. int nmaps,
  773. const struct xfs_buf_ops *ops)
  774. {
  775. struct xfs_buf *bp;
  776. xfs_buf_read_map(target, map, nmaps,
  777. XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops,
  778. __this_address);
  779. }
  780. /*
  781. * Read an uncached buffer from disk. Allocates and returns a locked
  782. * buffer containing the disk contents or nothing. Uncached buffers always have
  783. * a cache index of XFS_BUF_DADDR_NULL so we can easily determine if the buffer
  784. * is cached or uncached during fault diagnosis.
  785. */
  786. int
  787. xfs_buf_read_uncached(
  788. struct xfs_buftarg *target,
  789. xfs_daddr_t daddr,
  790. size_t numblks,
  791. xfs_buf_flags_t flags,
  792. struct xfs_buf **bpp,
  793. const struct xfs_buf_ops *ops)
  794. {
  795. struct xfs_buf *bp;
  796. int error;
  797. *bpp = NULL;
  798. error = xfs_buf_get_uncached(target, numblks, flags, &bp);
  799. if (error)
  800. return error;
  801. /* set up the buffer for a read IO */
  802. ASSERT(bp->b_map_count == 1);
  803. bp->b_rhash_key = XFS_BUF_DADDR_NULL;
  804. bp->b_maps[0].bm_bn = daddr;
  805. bp->b_flags |= XBF_READ;
  806. bp->b_ops = ops;
  807. xfs_buf_submit(bp);
  808. if (bp->b_error) {
  809. error = bp->b_error;
  810. xfs_buf_relse(bp);
  811. return error;
  812. }
  813. *bpp = bp;
  814. return 0;
  815. }
  816. int
  817. xfs_buf_get_uncached(
  818. struct xfs_buftarg *target,
  819. size_t numblks,
  820. xfs_buf_flags_t flags,
  821. struct xfs_buf **bpp)
  822. {
  823. int error;
  824. struct xfs_buf *bp;
  825. DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks);
  826. *bpp = NULL;
  827. /* flags might contain irrelevant bits, pass only what we care about */
  828. error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp);
  829. if (error)
  830. return error;
  831. error = xfs_buf_alloc_pages(bp, flags);
  832. if (error)
  833. goto fail_free_buf;
  834. error = _xfs_buf_map_pages(bp, 0);
  835. if (unlikely(error)) {
  836. xfs_warn(target->bt_mount,
  837. "%s: failed to map pages", __func__);
  838. goto fail_free_buf;
  839. }
  840. trace_xfs_buf_get_uncached(bp, _RET_IP_);
  841. *bpp = bp;
  842. return 0;
  843. fail_free_buf:
  844. xfs_buf_free(bp);
  845. return error;
  846. }
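/*
 * Example usage (illustrative sketch only): build and write an uncached
 * buffer. The helper name and its arguments are hypothetical; the pattern is
 * similar to the AG header initialisation helpers, which set bm_bn on the
 * single map before issuing the write.
 */
static inline int
example_write_uncached(
	struct xfs_buftarg	*btp,
	xfs_daddr_t		daddr,
	size_t			numblks,
	const struct xfs_buf_ops *ops)
{
	struct xfs_buf		*bp;
	int			error;

	error = xfs_buf_get_uncached(btp, numblks, 0, &bp);
	if (error)
		return error;

	/* Uncached buffers start with a NULL daddr; point the map at the target. */
	bp->b_maps[0].bm_bn = daddr;
	bp->b_ops = ops;
	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));

	/* Synchronous write; the buffer stays locked across xfs_bwrite(). */
	error = xfs_bwrite(bp);
	xfs_buf_relse(bp);
	return error;
}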
  847. /*
  848. * Increment reference count on buffer, to hold the buffer concurrently
  849. * with another thread which may release (free) the buffer asynchronously.
  850. * Must hold the buffer already to call this function.
  851. */
  852. void
  853. xfs_buf_hold(
  854. struct xfs_buf *bp)
  855. {
  856. trace_xfs_buf_hold(bp, _RET_IP_);
  857. atomic_inc(&bp->b_hold);
  858. }
  859. /*
  860. * Release a hold on the specified buffer. If the hold count is 1, the buffer is
  861. * placed on LRU or freed (depending on b_lru_ref).
  862. */
  863. void
  864. xfs_buf_rele(
  865. struct xfs_buf *bp)
  866. {
  867. struct xfs_perag *pag = bp->b_pag;
  868. bool release;
  869. bool freebuf = false;
  870. trace_xfs_buf_rele(bp, _RET_IP_);
  871. if (!pag) {
  872. ASSERT(list_empty(&bp->b_lru));
  873. if (atomic_dec_and_test(&bp->b_hold)) {
  874. xfs_buf_ioacct_dec(bp);
  875. xfs_buf_free(bp);
  876. }
  877. return;
  878. }
  879. ASSERT(atomic_read(&bp->b_hold) > 0);
  880. /*
  881. * We grab the b_lock here first to serialise racing xfs_buf_rele()
  882. * calls. The pag_buf_lock being taken on the last reference only
  883. * serialises against racing lookups in xfs_buf_find(). IOWs, the second
  884. * to last reference we drop here is not serialised against the last
  885. * reference until we take bp->b_lock. Hence if we don't grab b_lock
  886. * first, the last "release" reference can win the race to the lock and
  887. * free the buffer before the second-to-last reference is processed,
  888. * leading to a use-after-free scenario.
  889. */
  890. spin_lock(&bp->b_lock);
  891. release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
  892. if (!release) {
  893. /*
  894. * Drop the in-flight state if the buffer is already on the LRU
  895. * and it holds the only reference. This is racy because we
  896. * haven't acquired the pag lock, but the use of XFS_BSTATE_IN_FLIGHT
  897. * ensures the decrement occurs only once per-buf.
  898. */
  899. if ((atomic_read(&bp->b_hold) == 1) && !list_empty(&bp->b_lru))
  900. __xfs_buf_ioacct_dec(bp);
  901. goto out_unlock;
  902. }
  903. /* the last reference has been dropped ... */
  904. __xfs_buf_ioacct_dec(bp);
  905. if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
  906. /*
  907. * If the buffer is added to the LRU take a new reference to the
  908. * buffer for the LRU and clear the (now stale) dispose list
  909. * state flag
  910. */
  911. if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
  912. bp->b_state &= ~XFS_BSTATE_DISPOSE;
  913. atomic_inc(&bp->b_hold);
  914. }
  915. spin_unlock(&pag->pag_buf_lock);
  916. } else {
  917. /*
  918. * most of the time buffers will already be removed from the
  919. * LRU, so optimise that case by checking for the
  920. * XFS_BSTATE_DISPOSE flag indicating the last list the buffer
  921. * was on was the disposal list
  922. */
  923. if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
  924. list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
  925. } else {
  926. ASSERT(list_empty(&bp->b_lru));
  927. }
  928. ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
  929. rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
  930. xfs_buf_hash_params);
  931. spin_unlock(&pag->pag_buf_lock);
  932. xfs_perag_put(pag);
  933. freebuf = true;
  934. }
  935. out_unlock:
  936. spin_unlock(&bp->b_lock);
  937. if (freebuf)
  938. xfs_buf_free(bp);
  939. }
  940. /*
  941. * Lock a buffer object, if it is not already locked.
  942. *
  943. * If we come across a stale, pinned, locked buffer, we know that we are
  944. * being asked to lock a buffer that has been reallocated. Because it is
  945. * pinned, we know that the log has not been pushed to disk and hence it
  946. * will still be locked. Rather than continuing to have trylock attempts
  947. * fail until someone else pushes the log, push it ourselves before
  948. * returning. This means that the xfsaild will not get stuck trying
  949. * to push on stale inode buffers.
  950. */
  951. int
  952. xfs_buf_trylock(
  953. struct xfs_buf *bp)
  954. {
  955. int locked;
  956. locked = down_trylock(&bp->b_sema) == 0;
  957. if (locked)
  958. trace_xfs_buf_trylock(bp, _RET_IP_);
  959. else
  960. trace_xfs_buf_trylock_fail(bp, _RET_IP_);
  961. return locked;
  962. }
  963. /*
  964. * Lock a buffer object.
  965. *
  966. * If we come across a stale, pinned, locked buffer, we know that we
  967. * are being asked to lock a buffer that has been reallocated. Because
  968. * it is pinned, we know that the log has not been pushed to disk and
  969. * hence it will still be locked. Rather than sleeping until someone
  970. * else pushes the log, push it ourselves before trying to get the lock.
  971. */
  972. void
  973. xfs_buf_lock(
  974. struct xfs_buf *bp)
  975. {
  976. trace_xfs_buf_lock(bp, _RET_IP_);
  977. if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
  978. xfs_log_force(bp->b_mount, 0);
  979. down(&bp->b_sema);
  980. trace_xfs_buf_lock_done(bp, _RET_IP_);
  981. }
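/*
 * Example usage (illustrative sketch): a non-blocking lock attempt with an
 * optional fall back to the blocking path. The helper name and "can_block"
 * flag are hypothetical; this mirrors how lookup callers honour XBF_TRYLOCK
 * in xfs_buf_find_lock() above.
 */
static inline bool
example_trylock_or_wait(
	struct xfs_buf	*bp,
	bool		can_block)
{
	if (xfs_buf_trylock(bp))
		return true;
	if (!can_block)
		return false;
	xfs_buf_lock(bp);
	return true;
}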
  982. void
  983. xfs_buf_unlock(
  984. struct xfs_buf *bp)
  985. {
  986. ASSERT(xfs_buf_islocked(bp));
  987. up(&bp->b_sema);
  988. trace_xfs_buf_unlock(bp, _RET_IP_);
  989. }
  990. STATIC void
  991. xfs_buf_wait_unpin(
  992. struct xfs_buf *bp)
  993. {
  994. DECLARE_WAITQUEUE (wait, current);
  995. if (atomic_read(&bp->b_pin_count) == 0)
  996. return;
  997. add_wait_queue(&bp->b_waiters, &wait);
  998. for (;;) {
  999. set_current_state(TASK_UNINTERRUPTIBLE);
  1000. if (atomic_read(&bp->b_pin_count) == 0)
  1001. break;
  1002. io_schedule();
  1003. }
  1004. remove_wait_queue(&bp->b_waiters, &wait);
  1005. set_current_state(TASK_RUNNING);
  1006. }
  1007. static void
  1008. xfs_buf_ioerror_alert_ratelimited(
  1009. struct xfs_buf *bp)
  1010. {
  1011. static unsigned long lasttime;
  1012. static struct xfs_buftarg *lasttarg;
  1013. if (bp->b_target != lasttarg ||
  1014. time_after(jiffies, (lasttime + 5*HZ))) {
  1015. lasttime = jiffies;
  1016. xfs_buf_ioerror_alert(bp, __this_address);
  1017. }
  1018. lasttarg = bp->b_target;
  1019. }
  1020. /*
  1021. * Account for this latest trip around the retry handler, and decide if
  1022. * we've failed enough times to constitute a permanent failure.
  1023. */
  1024. static bool
  1025. xfs_buf_ioerror_permanent(
  1026. struct xfs_buf *bp,
  1027. struct xfs_error_cfg *cfg)
  1028. {
  1029. struct xfs_mount *mp = bp->b_mount;
  1030. if (cfg->max_retries != XFS_ERR_RETRY_FOREVER &&
  1031. ++bp->b_retries > cfg->max_retries)
  1032. return true;
  1033. if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
  1034. time_after(jiffies, cfg->retry_timeout + bp->b_first_retry_time))
  1035. return true;
  1036. /* At unmount we may treat errors differently */
  1037. if (xfs_is_unmounting(mp) && mp->m_fail_unmount)
  1038. return true;
  1039. return false;
  1040. }
  1041. /*
  1042. * On a sync write or shutdown we just want to stale the buffer and let the
  1043. * caller handle the error in bp->b_error appropriately.
  1044. *
  1045. * If the write was asynchronous then no one will be looking for the error. If
  1046. * this is the first failure of this type, clear the error state and write the
  1047. * buffer out again. This means we always retry an async write failure at least
  1048. * once, but we also need to set the buffer up to behave correctly now for
  1049. * repeated failures.
  1050. *
  1051. * If we get repeated async write failures, then we take action according to the
  1052. * error configuration we have been set up to use.
  1053. *
  1054. * Returns true if this function took care of error handling and the caller must
  1055. * not touch the buffer again. Returns false if the caller should proceed with
  1056. * normal I/O completion handling.
  1057. */
  1058. static bool
  1059. xfs_buf_ioend_handle_error(
  1060. struct xfs_buf *bp)
  1061. {
  1062. struct xfs_mount *mp = bp->b_mount;
  1063. struct xfs_error_cfg *cfg;
  1064. /*
  1065. * If we've already shutdown the journal because of I/O errors, there's
  1066. * no point in giving this a retry.
  1067. */
  1068. if (xlog_is_shutdown(mp->m_log))
  1069. goto out_stale;
  1070. xfs_buf_ioerror_alert_ratelimited(bp);
  1071. /*
  1072. * We're not going to bother about retrying this during recovery.
  1073. * One strike!
  1074. */
  1075. if (bp->b_flags & _XBF_LOGRECOVERY) {
  1076. xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
  1077. return false;
  1078. }
  1079. /*
  1080. * Synchronous writes will have callers process the error.
  1081. */
  1082. if (!(bp->b_flags & XBF_ASYNC))
  1083. goto out_stale;
  1084. trace_xfs_buf_iodone_async(bp, _RET_IP_);
  1085. cfg = xfs_error_get_cfg(mp, XFS_ERR_METADATA, bp->b_error);
  1086. if (bp->b_last_error != bp->b_error ||
  1087. !(bp->b_flags & (XBF_STALE | XBF_WRITE_FAIL))) {
  1088. bp->b_last_error = bp->b_error;
  1089. if (cfg->retry_timeout != XFS_ERR_RETRY_FOREVER &&
  1090. !bp->b_first_retry_time)
  1091. bp->b_first_retry_time = jiffies;
  1092. goto resubmit;
  1093. }
  1094. /*
  1095. * Permanent error - we need to trigger a shutdown if we haven't already
  1096. * to indicate that inconsistency will result from this action.
  1097. */
  1098. if (xfs_buf_ioerror_permanent(bp, cfg)) {
  1099. xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
  1100. goto out_stale;
  1101. }
  1102. /* Still considered a transient error. Caller will schedule retries. */
  1103. if (bp->b_flags & _XBF_INODES)
  1104. xfs_buf_inode_io_fail(bp);
  1105. else if (bp->b_flags & _XBF_DQUOTS)
  1106. xfs_buf_dquot_io_fail(bp);
  1107. else
  1108. ASSERT(list_empty(&bp->b_li_list));
  1109. xfs_buf_ioerror(bp, 0);
  1110. xfs_buf_relse(bp);
  1111. return true;
  1112. resubmit:
  1113. xfs_buf_ioerror(bp, 0);
  1114. bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL);
  1115. xfs_buf_submit(bp);
  1116. return true;
  1117. out_stale:
  1118. xfs_buf_stale(bp);
  1119. bp->b_flags |= XBF_DONE;
  1120. bp->b_flags &= ~XBF_WRITE;
  1121. trace_xfs_buf_error_relse(bp, _RET_IP_);
  1122. return false;
  1123. }
  1124. static void
  1125. xfs_buf_ioend(
  1126. struct xfs_buf *bp)
  1127. {
  1128. trace_xfs_buf_iodone(bp, _RET_IP_);
  1129. /*
  1130. * Pull in IO completion errors now. We are guaranteed to be running
  1131. * single threaded, so we don't need the lock to read b_io_error.
  1132. */
  1133. if (!bp->b_error && bp->b_io_error)
  1134. xfs_buf_ioerror(bp, bp->b_io_error);
  1135. if (bp->b_flags & XBF_READ) {
  1136. if (!bp->b_error && bp->b_ops)
  1137. bp->b_ops->verify_read(bp);
  1138. if (!bp->b_error)
  1139. bp->b_flags |= XBF_DONE;
  1140. } else {
  1141. if (!bp->b_error) {
  1142. bp->b_flags &= ~XBF_WRITE_FAIL;
  1143. bp->b_flags |= XBF_DONE;
  1144. }
  1145. if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp))
  1146. return;
  1147. /* clear the retry state */
  1148. bp->b_last_error = 0;
  1149. bp->b_retries = 0;
  1150. bp->b_first_retry_time = 0;
  1151. /*
  1152. * Note that for things like remote attribute buffers, there may
  1153. * not be a buffer log item here, so processing the buffer log
  1154. * item must remain optional.
  1155. */
  1156. if (bp->b_log_item)
  1157. xfs_buf_item_done(bp);
  1158. if (bp->b_flags & _XBF_INODES)
  1159. xfs_buf_inode_iodone(bp);
  1160. else if (bp->b_flags & _XBF_DQUOTS)
  1161. xfs_buf_dquot_iodone(bp);
  1162. }
  1163. bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD |
  1164. _XBF_LOGRECOVERY);
  1165. if (bp->b_flags & XBF_ASYNC)
  1166. xfs_buf_relse(bp);
  1167. else
  1168. complete(&bp->b_iowait);
  1169. }
  1170. static void
  1171. xfs_buf_ioend_work(
  1172. struct work_struct *work)
  1173. {
  1174. struct xfs_buf *bp =
  1175. container_of(work, struct xfs_buf, b_ioend_work);
  1176. xfs_buf_ioend(bp);
  1177. }
  1178. static void
  1179. xfs_buf_ioend_async(
  1180. struct xfs_buf *bp)
  1181. {
  1182. INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work);
  1183. queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work);
  1184. }
  1185. void
  1186. __xfs_buf_ioerror(
  1187. struct xfs_buf *bp,
  1188. int error,
  1189. xfs_failaddr_t failaddr)
  1190. {
  1191. ASSERT(error <= 0 && error >= -1000);
  1192. bp->b_error = error;
  1193. trace_xfs_buf_ioerror(bp, error, failaddr);
  1194. }
  1195. void
  1196. xfs_buf_ioerror_alert(
  1197. struct xfs_buf *bp,
  1198. xfs_failaddr_t func)
  1199. {
  1200. xfs_buf_alert_ratelimited(bp, "XFS: metadata IO error",
  1201. "metadata I/O error in \"%pS\" at daddr 0x%llx len %d error %d",
  1202. func, (uint64_t)xfs_buf_daddr(bp),
  1203. bp->b_length, -bp->b_error);
  1204. }
  1205. /*
  1206. * To simulate an I/O failure, the buffer must be locked and held with at least
  1207. * three references. The LRU reference is dropped by the stale call. The buf
  1208. * item reference is dropped via ioend processing. The third reference is owned
  1209. * by the caller and is dropped on I/O completion if the buffer is XBF_ASYNC.
  1210. */
  1211. void
  1212. xfs_buf_ioend_fail(
  1213. struct xfs_buf *bp)
  1214. {
  1215. bp->b_flags &= ~XBF_DONE;
  1216. xfs_buf_stale(bp);
  1217. xfs_buf_ioerror(bp, -EIO);
  1218. xfs_buf_ioend(bp);
  1219. }
  1220. int
  1221. xfs_bwrite(
  1222. struct xfs_buf *bp)
  1223. {
  1224. int error;
  1225. ASSERT(xfs_buf_islocked(bp));
  1226. bp->b_flags |= XBF_WRITE;
  1227. bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q |
  1228. XBF_DONE);
  1229. error = xfs_buf_submit(bp);
  1230. if (error)
  1231. xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
  1232. return error;
  1233. }
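/*
 * Example usage (illustrative sketch): synchronously flush a locked, modified
 * buffer and drop it. Hypothetical helper; on error xfs_bwrite() has already
 * shut the filesystem down, so the caller only needs to release the buffer
 * and propagate the error.
 */
static inline int
example_flush_buffer(
	struct xfs_buf	*bp)
{
	int		error;

	ASSERT(xfs_buf_islocked(bp));

	error = xfs_bwrite(bp);
	xfs_buf_relse(bp);
	return error;
}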
  1234. static void
  1235. xfs_buf_bio_end_io(
  1236. struct bio *bio)
  1237. {
  1238. struct xfs_buf *bp = (struct xfs_buf *)bio->bi_private;
  1239. if (!bio->bi_status &&
  1240. (bp->b_flags & XBF_WRITE) && (bp->b_flags & XBF_ASYNC) &&
  1241. XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR))
  1242. bio->bi_status = BLK_STS_IOERR;
  1243. /*
  1244. * don't overwrite existing errors - otherwise we can lose errors on
  1245. * buffers that require multiple bios to complete.
  1246. */
  1247. if (bio->bi_status) {
  1248. int error = blk_status_to_errno(bio->bi_status);
  1249. cmpxchg(&bp->b_io_error, 0, error);
  1250. }
  1251. if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
  1252. invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
  1253. if (atomic_dec_and_test(&bp->b_io_remaining) == 1)
  1254. xfs_buf_ioend_async(bp);
  1255. bio_put(bio);
  1256. }
  1257. static void
  1258. xfs_buf_ioapply_map(
  1259. struct xfs_buf *bp,
  1260. int map,
  1261. int *buf_offset,
  1262. int *count,
  1263. blk_opf_t op)
  1264. {
  1265. int page_index;
  1266. unsigned int total_nr_pages = bp->b_page_count;
  1267. int nr_pages;
  1268. struct bio *bio;
  1269. sector_t sector = bp->b_maps[map].bm_bn;
  1270. int size;
  1271. int offset;
  1272. /* skip the pages in the buffer before the start offset */
  1273. page_index = 0;
  1274. offset = *buf_offset;
  1275. while (offset >= PAGE_SIZE) {
  1276. page_index++;
  1277. offset -= PAGE_SIZE;
  1278. }
  1279. /*
  1280. * Limit the IO size to the length of the current vector, and update the
  1281. * remaining IO count for the next time around.
  1282. */
  1283. size = min_t(int, BBTOB(bp->b_maps[map].bm_len), *count);
  1284. *count -= size;
  1285. *buf_offset += size;
  1286. next_chunk:
  1287. atomic_inc(&bp->b_io_remaining);
  1288. nr_pages = bio_max_segs(total_nr_pages);
  1289. bio = bio_alloc(bp->b_target->bt_bdev, nr_pages, op, GFP_NOIO);
  1290. bio->bi_iter.bi_sector = sector;
  1291. bio->bi_end_io = xfs_buf_bio_end_io;
  1292. bio->bi_private = bp;
  1293. for (; size && nr_pages; nr_pages--, page_index++) {
  1294. int rbytes, nbytes = PAGE_SIZE - offset;
  1295. if (nbytes > size)
  1296. nbytes = size;
  1297. rbytes = bio_add_page(bio, bp->b_pages[page_index], nbytes,
  1298. offset);
  1299. if (rbytes < nbytes)
  1300. break;
  1301. offset = 0;
  1302. sector += BTOBB(nbytes);
  1303. size -= nbytes;
  1304. total_nr_pages--;
  1305. }
  1306. if (likely(bio->bi_iter.bi_size)) {
  1307. if (xfs_buf_is_vmapped(bp)) {
  1308. flush_kernel_vmap_range(bp->b_addr,
  1309. xfs_buf_vmap_len(bp));
  1310. }
  1311. submit_bio(bio);
  1312. if (size)
  1313. goto next_chunk;
  1314. } else {
  1315. /*
  1316. * This is guaranteed not to be the last io reference count
  1317. * because the caller (xfs_buf_submit) holds a count itself.
  1318. */
  1319. atomic_dec(&bp->b_io_remaining);
  1320. xfs_buf_ioerror(bp, -EIO);
  1321. bio_put(bio);
  1322. }
  1323. }
  1324. STATIC void
  1325. _xfs_buf_ioapply(
  1326. struct xfs_buf *bp)
  1327. {
  1328. struct blk_plug plug;
  1329. blk_opf_t op;
  1330. int offset;
  1331. int size;
  1332. int i;
  1333. /*
  1334. * Make sure we capture only current IO errors rather than stale errors
  1335. * left over from previous use of the buffer (e.g. failed readahead).
  1336. */
  1337. bp->b_error = 0;
  1338. if (bp->b_flags & XBF_WRITE) {
  1339. op = REQ_OP_WRITE;
  1340. /*
  1341. * Run the write verifier callback function if it exists. If
  1342. * this function fails it will mark the buffer with an error and
  1343. * the IO should not be dispatched.
  1344. */
  1345. if (bp->b_ops) {
  1346. bp->b_ops->verify_write(bp);
  1347. if (bp->b_error) {
  1348. xfs_force_shutdown(bp->b_mount,
  1349. SHUTDOWN_CORRUPT_INCORE);
  1350. return;
  1351. }
  1352. } else if (bp->b_rhash_key != XFS_BUF_DADDR_NULL) {
  1353. struct xfs_mount *mp = bp->b_mount;
  1354. /*
  1355. * non-crc filesystems don't attach verifiers during
  1356. * log recovery, so don't warn for such filesystems.
  1357. */
  1358. if (xfs_has_crc(mp)) {
  1359. xfs_warn(mp,
  1360. "%s: no buf ops on daddr 0x%llx len %d",
  1361. __func__, xfs_buf_daddr(bp),
  1362. bp->b_length);
  1363. xfs_hex_dump(bp->b_addr,
  1364. XFS_CORRUPTION_DUMP_LEN);
  1365. dump_stack();
  1366. }
  1367. }
  1368. } else {
  1369. op = REQ_OP_READ;
  1370. if (bp->b_flags & XBF_READ_AHEAD)
  1371. op |= REQ_RAHEAD;
  1372. }
  1373. /* we only use the buffer cache for meta-data */
  1374. op |= REQ_META;
  1375. /*
  1376. * Walk all the vectors issuing IO on them. Set up the initial offset
  1377. * into the buffer and the desired IO size before we start -
  1378. * xfs_buf_ioapply_map() will modify them appropriately for each
  1379. * subsequent call.
  1380. */
  1381. offset = bp->b_offset;
  1382. size = BBTOB(bp->b_length);
  1383. blk_start_plug(&plug);
  1384. for (i = 0; i < bp->b_map_count; i++) {
  1385. xfs_buf_ioapply_map(bp, i, &offset, &size, op);
  1386. if (bp->b_error)
  1387. break;
  1388. if (size <= 0)
  1389. break; /* all done */
  1390. }
  1391. blk_finish_plug(&plug);
  1392. }
  1393. /*
  1394. * Wait for I/O completion of a sync buffer and return the I/O error code.
  1395. */
  1396. static int
  1397. xfs_buf_iowait(
  1398. struct xfs_buf *bp)
  1399. {
  1400. ASSERT(!(bp->b_flags & XBF_ASYNC));
  1401. trace_xfs_buf_iowait(bp, _RET_IP_);
  1402. wait_for_completion(&bp->b_iowait);
  1403. trace_xfs_buf_iowait_done(bp, _RET_IP_);
  1404. return bp->b_error;
  1405. }
  1406. /*
  1407. * Buffer I/O submission path, read or write. Asynchronous submission transfers
  1408. * the buffer lock ownership and the current reference to the IO. It is not
  1409. * safe to reference the buffer after a call to this function unless the caller
  1410. * holds an additional reference itself.
  1411. */
  1412. static int
  1413. __xfs_buf_submit(
  1414. struct xfs_buf *bp,
  1415. bool wait)
  1416. {
  1417. int error = 0;
  1418. trace_xfs_buf_submit(bp, _RET_IP_);
  1419. ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
  1420. /*
  1421. * On log shutdown we stale and complete the buffer immediately. We can
  1422. * be called to read the superblock before the log has been set up, so
  1423. * be careful checking the log state.
  1424. *
  1425. * Checking the mount shutdown state here can result in the log tail
  1426. * moving inappropriately on disk as the log may not yet be shut down.
  1427. * i.e. failing this buffer on mount shutdown can remove it from the AIL
  1428. * and move the tail of the log forwards without having written this
  1429. * buffer to disk. This corrupts the log tail state in memory, and
  1430. * because the log may not be shut down yet, it can then be propagated
  1431. * to disk before the log is shut down. Hence we check log shutdown
  1432. * state here rather than mount state to avoid corrupting the log tail
  1433. * on shutdown.
  1434. */
  1435. if (bp->b_mount->m_log &&
  1436. xlog_is_shutdown(bp->b_mount->m_log)) {
  1437. xfs_buf_ioend_fail(bp);
  1438. return -EIO;
  1439. }
  1440. /*
  1441. * Grab a reference so the buffer does not go away underneath us. For
  1442. * async buffers, I/O completion drops the callers reference, which
  1443. * could occur before submission returns.
  1444. */
  1445. xfs_buf_hold(bp);
  1446. if (bp->b_flags & XBF_WRITE)
  1447. xfs_buf_wait_unpin(bp);
  1448. /* clear the internal error state to avoid spurious errors */
  1449. bp->b_io_error = 0;
  1450. /*
  1451. * Set the count to 1 initially, this will stop an I/O completion
  1452. * callout which happens before we have started all the I/O from calling
  1453. * xfs_buf_ioend too early.
  1454. */
  1455. atomic_set(&bp->b_io_remaining, 1);
  1456. if (bp->b_flags & XBF_ASYNC)
  1457. xfs_buf_ioacct_inc(bp);
  1458. _xfs_buf_ioapply(bp);
  1459. /*
  1460. * If _xfs_buf_ioapply failed, we can get back here with only the IO
  1461. * reference we took above. If we drop it to zero, run completion so
  1462. * that we don't return to the caller with completion still pending.
  1463. */
  1464. if (atomic_dec_and_test(&bp->b_io_remaining) == 1) {
  1465. if (bp->b_error || !(bp->b_flags & XBF_ASYNC))
  1466. xfs_buf_ioend(bp);
  1467. else
  1468. xfs_buf_ioend_async(bp);
  1469. }
  1470. if (wait)
  1471. error = xfs_buf_iowait(bp);
  1472. /*
  1473. * Release the hold that keeps the buffer referenced for the entire
  1474. * I/O. Note that if the buffer is async, it is not safe to reference
  1475. * after this release.
  1476. */
  1477. xfs_buf_rele(bp);
  1478. return error;
  1479. }
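
/*
 * Illustrative sketch (not part of the original file): because asynchronous
 * submission hands the buffer lock and the submission reference over to the
 * I/O, a caller that still wants to touch the buffer afterwards must take its
 * own hold first. The helper name is hypothetical; bp is assumed to already be
 * set up for an async write (XBF_WRITE | XBF_ASYNC).
 */
#if 0	/* example only, not built */
static void
example_submit_async_and_keep(
	struct xfs_buf	*bp)
{
	xfs_buf_hold(bp);		/* keep bp valid across submission */
	__xfs_buf_submit(bp, false);	/* lock + IO reference now belong to the IO */

	/* bp remains a valid pointer here only because of our extra hold */

	xfs_buf_rele(bp);		/* drop our extra hold */
}
#endif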

/* Return the address of the given byte offset within the buffer. */
void *
xfs_buf_offset(
	struct xfs_buf		*bp,
	size_t			offset)
{
	struct page		*page;

	if (bp->b_addr)
		return bp->b_addr + offset;

	page = bp->b_pages[offset >> PAGE_SHIFT];
	return page_address(page) + (offset & (PAGE_SIZE - 1));
}

/* Zero bsize bytes of the buffer, starting at byte offset boff. */
void
xfs_buf_zero(
	struct xfs_buf		*bp,
	size_t			boff,
	size_t			bsize)
{
	size_t			bend;

	bend = boff + bsize;
	while (boff < bend) {
		struct page	*page;
		int		page_index, page_offset, csize;

		page_index = (boff + bp->b_offset) >> PAGE_SHIFT;
		page_offset = (boff + bp->b_offset) & ~PAGE_MASK;
		page = bp->b_pages[page_index];
		csize = min_t(size_t, PAGE_SIZE - page_offset,
				      BBTOB(bp->b_length) - boff);

		ASSERT((csize + page_offset) <= PAGE_SIZE);

		memset(page_address(page) + page_offset, 0, csize);

		boff += csize;
	}
}

/*
 * Log a message about and stale a buffer that a caller has decided is corrupt.
 *
 * This function should be called for the kinds of metadata corruption that
 * cannot be detected from a verifier, such as incorrect inter-block
 * relationship data. Do /not/ call this function from a verifier function.
 *
 * The buffer must be XBF_DONE prior to the call. Afterwards, the buffer will
 * be marked stale, but b_error will not be set. The caller is responsible for
 * releasing the buffer or fixing it.
 */
void
__xfs_buf_mark_corrupt(
	struct xfs_buf		*bp,
	xfs_failaddr_t		fa)
{
	ASSERT(bp->b_flags & XBF_DONE);

	xfs_buf_corruption_error(bp, fa);
	xfs_buf_stale(bp);
}
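
/*
 * Illustrative sketch (not part of the original file): a caller that finds an
 * inter-block inconsistency a verifier cannot see marks the buffer corrupt and
 * then releases it. The helper name and the consistency check argument are
 * hypothetical.
 */
#if 0	/* example only, not built */
static int
example_check_cross_block(
	struct xfs_buf	*bp,			/* XBF_DONE buffer */
	bool		links_are_consistent)	/* result of caller's check */
{
	if (!links_are_consistent) {
		__xfs_buf_mark_corrupt(bp, __this_address);
		xfs_buf_relse(bp);
		return -EFSCORRUPTED;
	}
	return 0;
}
#endif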

/*
 * Handling of buffer targets (buftargs).
 */

/*
 * Wait for any bufs with callbacks that have been submitted but have not yet
 * returned. These buffers will have an elevated hold count, so wait on those
 * while freeing all the buffers only held by the LRU.
 */
static enum lru_status
xfs_buftarg_drain_rele(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)
{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	if (atomic_read(&bp->b_hold) > 1) {
		/* need to wait, so skip it this pass */
		trace_xfs_buf_drain_buftarg(bp, _RET_IP_);
		return LRU_SKIP;
	}
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;

	/*
	 * clear the LRU reference count so the buffer doesn't get
	 * ignored in xfs_buf_rele().
	 */
	atomic_set(&bp->b_lru_ref, 0);
	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_lru_isolate_move(lru, item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}

/*
 * Wait for outstanding I/O on the buftarg to complete.
 */
void
xfs_buftarg_wait(
	struct xfs_buftarg	*btp)
{
	/*
	 * First wait on the buftarg I/O count for all in-flight buffers to be
	 * released. This is critical as new buffers do not make the LRU until
	 * they are released.
	 *
	 * Next, flush the buffer workqueue to ensure all completion processing
	 * has finished. Just waiting on buffer locks is not sufficient for
	 * async IO as the reference count held over IO is not released until
	 * after the buffer lock is dropped. Hence we need to ensure here that
	 * all reference counts have been dropped before we start walking the
	 * LRU list.
	 */
	while (percpu_counter_sum(&btp->bt_io_count))
		delay(100);
	flush_workqueue(btp->bt_mount->m_buf_workqueue);
}

void
xfs_buftarg_drain(
	struct xfs_buftarg	*btp)
{
	LIST_HEAD(dispose);
	int			loop = 0;
	bool			write_fail = false;

	xfs_buftarg_wait(btp);

	/* loop until there is nothing left on the lru list. */
	while (list_lru_count(&btp->bt_lru)) {
		list_lru_walk(&btp->bt_lru, xfs_buftarg_drain_rele,
			      &dispose, LONG_MAX);

		while (!list_empty(&dispose)) {
			struct xfs_buf *bp;
			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
			list_del_init(&bp->b_lru);
			if (bp->b_flags & XBF_WRITE_FAIL) {
				write_fail = true;
				xfs_buf_alert_ratelimited(bp,
					"XFS: Corruption Alert",
					"Corruption Alert: Buffer at daddr 0x%llx had permanent write failures!",
					(long long)xfs_buf_daddr(bp));
			}
			xfs_buf_rele(bp);
		}
		if (loop++ != 0)
			delay(100);
	}

	/*
	 * If one or more failed buffers were freed, that means dirty metadata
	 * was thrown away. This should only ever happen after I/O completion
	 * handling has elevated I/O error(s) to permanent failures and shut
	 * down the journal.
	 */
	if (write_fail) {
		ASSERT(xlog_is_shutdown(btp->bt_mount->m_log));
		xfs_alert(btp->bt_mount,
			"Please run xfs_repair to determine the extent of the problem.");
	}
}

static enum lru_status
xfs_buftarg_isolate(
	struct list_head	*item,
	struct list_lru_one	*lru,
	spinlock_t		*lru_lock,
	void			*arg)
{
	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
	struct list_head	*dispose = arg;

	/*
	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
	 * If we fail to get the lock, just skip it.
	 */
	if (!spin_trylock(&bp->b_lock))
		return LRU_SKIP;
	/*
	 * Decrement the b_lru_ref count unless the value is already
	 * zero. If the value is already zero, we need to reclaim the
	 * buffer, otherwise it gets another trip through the LRU.
	 */
	if (atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
		spin_unlock(&bp->b_lock);
		return LRU_ROTATE;
	}

	bp->b_state |= XFS_BSTATE_DISPOSE;
	list_lru_isolate_move(lru, item, dispose);
	spin_unlock(&bp->b_lock);
	return LRU_REMOVED;
}

static unsigned long
xfs_buftarg_shrink_scan(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	LIST_HEAD(dispose);
	unsigned long		freed;

	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
				     xfs_buftarg_isolate, &dispose);

	while (!list_empty(&dispose)) {
		struct xfs_buf *bp;
		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
		list_del_init(&bp->b_lru);
		xfs_buf_rele(bp);
	}

	return freed;
}

static unsigned long
xfs_buftarg_shrink_count(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_buftarg	*btp = container_of(shrink,
					struct xfs_buftarg, bt_shrinker);
	return list_lru_shrink_count(&btp->bt_lru, sc);
}

void
xfs_free_buftarg(
	struct xfs_buftarg	*btp)
{
	unregister_shrinker(&btp->bt_shrinker);
	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
	percpu_counter_destroy(&btp->bt_io_count);
	list_lru_destroy(&btp->bt_lru);

	blkdev_issue_flush(btp->bt_bdev);
	fs_put_dax(btp->bt_daxdev, btp->bt_mount);

	kmem_free(btp);
}

int
xfs_setsize_buftarg(
	xfs_buftarg_t		*btp,
	unsigned int		sectorsize)
{
	/* Set up metadata sector size info */
	btp->bt_meta_sectorsize = sectorsize;
	btp->bt_meta_sectormask = sectorsize - 1;

	if (set_blocksize(btp->bt_bdev, sectorsize)) {
		xfs_warn(btp->bt_mount,
			"Cannot set_blocksize to %u on device %pg",
			sectorsize, btp->bt_bdev);
		return -EINVAL;
	}

	/* Set up device logical sector size mask */
	btp->bt_logical_sectorsize = bdev_logical_block_size(btp->bt_bdev);
	btp->bt_logical_sectormask = bdev_logical_block_size(btp->bt_bdev) - 1;

	return 0;
}
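
/*
 * Illustrative sketch (not part of the original file): the logical sector mask
 * set up above is typically used to reject I/O that is not aligned to the
 * device's logical sector size, e.g. when validating direct I/O requests. The
 * helper and its arguments are hypothetical.
 */
#if 0	/* example only, not built */
static bool
example_io_is_sector_aligned(
	struct xfs_buftarg	*btp,
	loff_t			pos,
	size_t			count)
{
	/* both the offset and the length must be sector multiples */
	return ((pos | count) & btp->bt_logical_sectormask) == 0;
}
#endif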

/*
 * When allocating the initial buffer target we have not yet
 * read in the superblock, so we don't know what size sectors
 * are being used at this early stage. Play safe.
 */
STATIC int
xfs_setsize_buftarg_early(
	xfs_buftarg_t		*btp,
	struct block_device	*bdev)
{
	return xfs_setsize_buftarg(btp, bdev_logical_block_size(bdev));
}

struct xfs_buftarg *
xfs_alloc_buftarg(
	struct xfs_mount	*mp,
	struct block_device	*bdev)
{
	xfs_buftarg_t		*btp;
	const struct dax_holder_operations *ops = NULL;

#if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE)
	ops = &xfs_dax_holder_operations;
#endif
	btp = kmem_zalloc(sizeof(*btp), KM_NOFS);

	btp->bt_mount = mp;
	btp->bt_dev = bdev->bd_dev;
	btp->bt_bdev = bdev;
	btp->bt_daxdev = fs_dax_get_by_bdev(bdev, &btp->bt_dax_part_off,
					    mp, ops);

	/*
	 * Buffer IO error rate limiting. Limit it to no more than 10 messages
	 * per 30 seconds so as to not spam logs too much on repeated errors.
	 */
	ratelimit_state_init(&btp->bt_ioerror_rl, 30 * HZ,
			     DEFAULT_RATELIMIT_BURST);

	if (xfs_setsize_buftarg_early(btp, bdev))
		goto error_free;

	if (list_lru_init(&btp->bt_lru))
		goto error_free;

	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
		goto error_lru;

	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
	if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s",
			      mp->m_super->s_id))
		goto error_pcpu;
	return btp;

error_pcpu:
	percpu_counter_destroy(&btp->bt_io_count);
error_lru:
	list_lru_destroy(&btp->bt_lru);
error_free:
	kmem_free(btp);
	return NULL;
}

/*
 * Cancel a delayed write list.
 *
 * Remove each buffer from the list, clear the delwri queue flag and drop the
 * associated buffer reference.
 */
void
xfs_buf_delwri_cancel(
	struct list_head	*list)
{
	struct xfs_buf		*bp;

	while (!list_empty(list)) {
		bp = list_first_entry(list, struct xfs_buf, b_list);

		xfs_buf_lock(bp);
		bp->b_flags &= ~_XBF_DELWRI_Q;
		list_del_init(&bp->b_list);
		xfs_buf_relse(bp);
	}
}

/*
 * Add a buffer to the delayed write list.
 *
 * This queues a buffer for writeout if it hasn't already been. Note that
 * neither this routine nor the buffer list submission functions perform
 * any internal synchronization. It is expected that the lists are thread-local
 * to the callers.
 *
 * Returns true if we queued up the buffer, or false if it was already
 * on the buffer list.
 */
bool
xfs_buf_delwri_queue(
	struct xfs_buf		*bp,
	struct list_head	*list)
{
	ASSERT(xfs_buf_islocked(bp));
	ASSERT(!(bp->b_flags & XBF_READ));

	/*
	 * If the buffer is already marked delwri it already is queued up
	 * by someone else for immediate writeout. Just ignore it in that
	 * case.
	 */
	if (bp->b_flags & _XBF_DELWRI_Q) {
		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
		return false;
	}

	trace_xfs_buf_delwri_queue(bp, _RET_IP_);

	/*
	 * If a buffer gets written out synchronously or marked stale while it
	 * is on a delwri list we lazily remove it. To do this, the other party
	 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
	 * It remains referenced and on the list. In a rare corner case it
	 * might get readded to a delwri list after the synchronous writeout,
	 * in which case we just need to re-add the flag here.
	 */
	bp->b_flags |= _XBF_DELWRI_Q;
	if (list_empty(&bp->b_list)) {
		atomic_inc(&bp->b_hold);
		list_add_tail(&bp->b_list, list);
	}

	return true;
}
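
/*
 * Illustrative sketch (not part of the original file): the usual calling
 * pattern is to queue a locked, modified buffer on a caller-local list and
 * then release it; the delwri queue holds its own reference (taken above)
 * until the list is submitted or cancelled. The helper name is hypothetical.
 */
#if 0	/* example only, not built */
static void
example_queue_dirty_buffer(
	struct xfs_buf		*bp,		/* locked, dirty buffer */
	struct list_head	*buffer_list)	/* caller-local delwri list */
{
	xfs_buf_delwri_queue(bp, buffer_list);
	xfs_buf_relse(bp);	/* unlock and drop the caller's reference */
}
#endif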

/*
 * Compare function is more complex than it needs to be because
 * the return value is only 32 bits and we are doing comparisons
 * on 64 bit values
 */
static int
xfs_buf_cmp(
	void			*priv,
	const struct list_head	*a,
	const struct list_head	*b)
{
	struct xfs_buf		*ap = container_of(a, struct xfs_buf, b_list);
	struct xfs_buf		*bp = container_of(b, struct xfs_buf, b_list);
	xfs_daddr_t		diff;

	diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn;
	if (diff < 0)
		return -1;
	if (diff > 0)
		return 1;
	return 0;
}

/*
 * Submit buffers for write. If wait_list is specified, the buffers are
 * submitted using sync I/O and placed on the wait list such that the caller can
 * iowait each buffer. Otherwise async I/O is used and the buffers are released
 * at I/O completion time. In either case, buffers remain locked until I/O
 * completes and the buffer is released from the queue.
 */
static int
xfs_buf_delwri_submit_buffers(
	struct list_head	*buffer_list,
	struct list_head	*wait_list)
{
	struct xfs_buf		*bp, *n;
	int			pinned = 0;
	struct blk_plug		plug;

	list_sort(NULL, buffer_list, xfs_buf_cmp);

	blk_start_plug(&plug);
	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
		if (!wait_list) {
			if (!xfs_buf_trylock(bp))
				continue;
			if (xfs_buf_ispinned(bp)) {
				xfs_buf_unlock(bp);
				pinned++;
				continue;
			}
		} else {
			xfs_buf_lock(bp);
		}

		/*
		 * Someone else might have written the buffer synchronously or
		 * marked it stale in the meantime. In that case only the
		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
		 * reference and remove it from the list here.
		 */
		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
			list_del_init(&bp->b_list);
			xfs_buf_relse(bp);
			continue;
		}

		trace_xfs_buf_delwri_split(bp, _RET_IP_);

		/*
		 * If we have a wait list, each buffer (and associated delwri
		 * queue reference) transfers to it and is submitted
		 * synchronously. Otherwise, drop the buffer from the delwri
		 * queue and submit async.
		 */
		bp->b_flags &= ~_XBF_DELWRI_Q;
		bp->b_flags |= XBF_WRITE;
		if (wait_list) {
			bp->b_flags &= ~XBF_ASYNC;
			list_move_tail(&bp->b_list, wait_list);
		} else {
			bp->b_flags |= XBF_ASYNC;
			list_del_init(&bp->b_list);
		}
		__xfs_buf_submit(bp, false);
	}
	blk_finish_plug(&plug);

	return pinned;
}

/*
 * Write out a buffer list asynchronously.
 *
 * This will take the @buffer_list, write all non-locked and non-pinned buffers
 * out and not wait for I/O completion on any of the buffers. This interface
 * is only safely usable for callers that can track I/O completion by higher
 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
 * function.
 *
 * Note: this function will skip buffers it would block on, and in doing so
 * leaves them on @buffer_list so they can be retried on a later pass. As such,
 * it is up to the caller to ensure that the buffer list is fully submitted or
 * cancelled appropriately when they are finished with the list. Failure to
 * cancel or resubmit the list until it is empty will result in leaked buffers
 * at unmount time.
 */
int
xfs_buf_delwri_submit_nowait(
	struct list_head	*buffer_list)
{
	return xfs_buf_delwri_submit_buffers(buffer_list, NULL);
}
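
/*
 * Illustrative sketch (not part of the original file): since skipped buffers
 * stay on @buffer_list, async callers typically resubmit the list until it
 * drains, and must cancel whatever is left if they stop early. The helper and
 * its stop condition are hypothetical; a real caller (e.g. AIL pushing) would
 * also back off between passes rather than spin.
 */
#if 0	/* example only, not built */
static void
example_push_buffer_list(
	struct list_head	*buffer_list,
	bool			(*should_stop)(void))
{
	while (!list_empty(buffer_list)) {
		if (should_stop()) {
			/* drop the delwri references we still hold */
			xfs_buf_delwri_cancel(buffer_list);
			break;
		}
		xfs_buf_delwri_submit_nowait(buffer_list);
	}
}
#endif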

/*
 * Write out a buffer list synchronously.
 *
 * This will take the @buffer_list, write all buffers out and wait for I/O
 * completion on all of the buffers. @buffer_list is consumed by the function,
 * so callers must have some other way of tracking buffers if they require such
 * functionality.
 */
int
xfs_buf_delwri_submit(
	struct list_head	*buffer_list)
{
	LIST_HEAD		(wait_list);
	int			error = 0, error2;
	struct xfs_buf		*bp;

	xfs_buf_delwri_submit_buffers(buffer_list, &wait_list);

	/* Wait for IO to complete. */
	while (!list_empty(&wait_list)) {
		bp = list_first_entry(&wait_list, struct xfs_buf, b_list);

		list_del_init(&bp->b_list);

		/*
		 * Wait on the locked buffer, check for errors and unlock and
		 * release the delwri queue reference.
		 */
		error2 = xfs_buf_iowait(bp);
		xfs_buf_relse(bp);
		if (!error)
			error = error2;
	}

	return error;
}
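
/*
 * Illustrative sketch (not part of the original file): the full delwri
 * lifecycle for a synchronous flush - queue each locked, dirty buffer on a
 * local list, release it, then submit the whole list and collect the first
 * error. The helper name and the bps array are hypothetical.
 */
#if 0	/* example only, not built */
static int
example_write_dirty_buffers(
	struct xfs_buf		**bps,	/* locked, dirty buffers */
	int			nr)
{
	LIST_HEAD		(buffer_list);
	int			i;

	for (i = 0; i < nr; i++) {
		xfs_buf_delwri_queue(bps[i], &buffer_list);
		xfs_buf_relse(bps[i]);
	}

	/* writes everything out, waits, and leaves the list empty */
	return xfs_buf_delwri_submit(&buffer_list);
}
#endif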

/*
 * Push a single buffer on a delwri queue.
 *
 * The purpose of this function is to submit a single buffer of a delwri queue
 * and return with the buffer still on the original queue. The waiting delwri
 * buffer submission infrastructure guarantees transfer of the delwri queue
 * buffer reference to a temporary wait list. We reuse this infrastructure to
 * transfer the buffer back to the original queue.
 *
 * Note the buffer transitions from the queued state, to the submitted and wait
 * listed state and back to the queued state during this call. The buffer
 * locking and queue management logic between _delwri_pushbuf() and
 * _delwri_queue() guarantee that the buffer cannot be queued to another list
 * before returning.
 */
int
xfs_buf_delwri_pushbuf(
	struct xfs_buf		*bp,
	struct list_head	*buffer_list)
{
	LIST_HEAD		(submit_list);
	int			error;

	ASSERT(bp->b_flags & _XBF_DELWRI_Q);

	trace_xfs_buf_delwri_pushbuf(bp, _RET_IP_);

	/*
	 * Isolate the buffer to a new local list so we can submit it for I/O
	 * independently from the rest of the original list.
	 */
	xfs_buf_lock(bp);
	list_move(&bp->b_list, &submit_list);
	xfs_buf_unlock(bp);

	/*
	 * Delwri submission clears the DELWRI_Q buffer flag and returns with
	 * the buffer on the wait list with the original reference. Rather than
	 * bounce the buffer from a local wait list back to the original list
	 * after I/O completion, reuse the original list as the wait list.
	 */
	xfs_buf_delwri_submit_buffers(&submit_list, buffer_list);

	/*
	 * The buffer is now locked, under I/O and wait listed on the original
	 * delwri queue. Wait for I/O completion, restore the DELWRI_Q flag and
	 * return with the buffer unlocked and on the original queue.
	 */
	error = xfs_buf_iowait(bp);
	bp->b_flags |= _XBF_DELWRI_Q;
	xfs_buf_unlock(bp);

	return error;
}
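
/*
 * Illustrative sketch (not part of the original file): a caller that must get
 * one specific buffer to disk ahead of the rest of its delwri list can push it
 * individually; the buffer ends up back on @buffer_list afterwards. The helper
 * name is hypothetical.
 */
#if 0	/* example only, not built */
static int
example_push_one_buffer(
	struct xfs_buf		*bp,
	struct list_head	*buffer_list)
{
	if (!(bp->b_flags & _XBF_DELWRI_Q))
		return 0;	/* not queued, nothing to push */

	return xfs_buf_delwri_pushbuf(bp, buffer_list);
}
#endif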

void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref)
{
	/*
	 * Set the lru reference count to 0 based on the error injection tag.
	 * This allows userspace to disrupt buffer caching for debug/testing
	 * purposes.
	 */
	if (XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_LRU_REF))
		lru_ref = 0;

	atomic_set(&bp->b_lru_ref, lru_ref);
}
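
/*
 * Illustrative sketch (not part of the original file): callers that know a
 * buffer backs frequently reused metadata give it a higher LRU reference count
 * so it survives more shrinker passes before reclaim. The helper name is
 * hypothetical and the reference value is arbitrary; XFS defines per-type
 * reference constants for this purpose elsewhere.
 */
#if 0	/* example only, not built */
static void
example_keep_buffer_cached(
	struct xfs_buf	*bp)
{
	xfs_buf_set_ref(bp, 3);	/* survive roughly three LRU passes */
}
#endif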

/*
 * Verify an on-disk magic value against the magic value specified in the
 * verifier structure. The verifier magic is in disk byte order so the caller is
 * expected to pass the value directly from disk.
 */
bool
xfs_verify_magic(
	struct xfs_buf		*bp,
	__be32			dmagic)
{
	struct xfs_mount	*mp = bp->b_mount;
	int			idx;

	idx = xfs_has_crc(mp);
	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic[idx]))
		return false;
	return dmagic == bp->b_ops->magic[idx];
}
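
/*
 * Illustrative sketch (not part of the original file): a verifier helper
 * typically checks the on-disk magic (still in disk byte order) against the
 * per-format magic table in its xfs_buf_ops via xfs_verify_magic(). The
 * on-disk header layout and names below are hypothetical.
 */
#if 0	/* example only, not built */
struct example_ondisk_hdr {
	__be32			eh_magic;	/* magic number, disk order */
};

static xfs_failaddr_t
example_verify_struct(
	struct xfs_buf			*bp)
{
	struct example_ondisk_hdr	*hdr = bp->b_addr;

	if (!xfs_verify_magic(bp, hdr->eh_magic))
		return __this_address;
	return NULL;
}
#endif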

/*
 * Verify an on-disk magic value against the magic value specified in the
 * verifier structure. The verifier magic is in disk byte order so the caller is
 * expected to pass the value directly from disk.
 */
bool
xfs_verify_magic16(
	struct xfs_buf		*bp,
	__be16			dmagic)
{
	struct xfs_mount	*mp = bp->b_mount;
	int			idx;

	idx = xfs_has_crc(mp);
	if (WARN_ON(!bp->b_ops || !bp->b_ops->magic16[idx]))
		return false;
	return dmagic == bp->b_ops->magic16[idx];
}