repair.c 28 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963
  1. // SPDX-License-Identifier: GPL-2.0+
  2. /*
  3. * Copyright (C) 2018 Oracle. All Rights Reserved.
  4. * Author: Darrick J. Wong <[email protected]>
  5. */
  6. #include "xfs.h"
  7. #include "xfs_fs.h"
  8. #include "xfs_shared.h"
  9. #include "xfs_format.h"
  10. #include "xfs_trans_resv.h"
  11. #include "xfs_mount.h"
  12. #include "xfs_btree.h"
  13. #include "xfs_log_format.h"
  14. #include "xfs_trans.h"
  15. #include "xfs_sb.h"
  16. #include "xfs_inode.h"
  17. #include "xfs_alloc.h"
  18. #include "xfs_alloc_btree.h"
  19. #include "xfs_ialloc.h"
  20. #include "xfs_ialloc_btree.h"
  21. #include "xfs_rmap.h"
  22. #include "xfs_rmap_btree.h"
  23. #include "xfs_refcount_btree.h"
  24. #include "xfs_extent_busy.h"
  25. #include "xfs_ag.h"
  26. #include "xfs_ag_resv.h"
  27. #include "xfs_quota.h"
  28. #include "xfs_qm.h"
  29. #include "scrub/scrub.h"
  30. #include "scrub/common.h"
  31. #include "scrub/trace.h"
  32. #include "scrub/repair.h"
  33. #include "scrub/bitmap.h"
/*
 * Attempt to repair some metadata, if the metadata is corrupt and userspace
 * told us to fix it.  This function returns -EAGAIN to mean "re-run scrub",
 * and will set the XREP_ALREADY_FIXED flag if it thinks it repaired
 * anything.
 */
int
xrep_attempt(
	struct xfs_scrub	*sc)
{
	int			error = 0;

	trace_xrep_attempt(XFS_I(file_inode(sc->file)), sc->sm, error);

	/* Drop the AG btree cursors; the repair function rebuilds state. */
	xchk_ag_btcur_free(&sc->sa);

	/* Repair whatever's broken. */
	ASSERT(sc->ops->repair);
	error = sc->ops->repair(sc);
	trace_xrep_done(XFS_I(file_inode(sc->file)), sc->sm, error);
	switch (error) {
	case 0:
		/*
		 * Repair succeeded.  Commit the fixes and perform a second
		 * scrub so that we can tell userspace if we fixed the problem.
		 */
		sc->sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT;
		sc->flags |= XREP_ALREADY_FIXED;
		return -EAGAIN;
	case -EDEADLOCK:
	case -EAGAIN:
		/* Tell the caller to try again having grabbed all the locks. */
		if (!(sc->flags & XCHK_TRY_HARDER)) {
			sc->flags |= XCHK_TRY_HARDER;
			return -EAGAIN;
		}
		/*
		 * We tried harder but still couldn't grab all the resources
		 * we needed to fix it.  The corruption has not been fixed,
		 * so report back to userspace.
		 */
		return -EFSCORRUPTED;
	default:
		return error;
	}
}
/*
 * Complain about unfixable problems in the filesystem.  We don't log
 * corruptions when IFLAG_REPAIR wasn't set on the assumption that the driver
 * program is xfs_scrub, which will call back with IFLAG_REPAIR set if the
 * administrator isn't running xfs_scrub in no-repairs mode.
 *
 * Use this helper function because _ratelimited silently declares a static
 * structure to track rate limiting information.
 */
void
xrep_failure(
	struct xfs_mount	*mp)
{
	xfs_alert_ratelimited(mp,
"Corruption not fixed during online repair.  Unmount and run xfs_repair.");
}
  92. /*
  93. * Repair probe -- userspace uses this to probe if we're willing to repair a
  94. * given mountpoint.
  95. */
  96. int
  97. xrep_probe(
  98. struct xfs_scrub *sc)
  99. {
  100. int error = 0;
  101. if (xchk_should_terminate(sc, &error))
  102. return error;
  103. return 0;
  104. }
/*
 * Roll a transaction, keeping the AG headers locked and reinitializing
 * the btree cursors.
 */
int
xrep_roll_ag_trans(
	struct xfs_scrub	*sc)
{
	int			error;

	/* Keep the AG header buffers locked so we can keep going. */
	if (sc->sa.agi_bp)
		xfs_trans_bhold(sc->tp, sc->sa.agi_bp);
	if (sc->sa.agf_bp)
		xfs_trans_bhold(sc->tp, sc->sa.agf_bp);
	if (sc->sa.agfl_bp)
		xfs_trans_bhold(sc->tp, sc->sa.agfl_bp);

	/*
	 * Roll the transaction.  We still own the buffer and the buffer lock
	 * regardless of whether or not the roll succeeds.  If the roll fails,
	 * the buffers will be released during teardown on our way out of the
	 * kernel.  If it succeeds, we join them to the new transaction and
	 * move on.
	 */
	error = xfs_trans_roll(&sc->tp);
	if (error)
		return error;

	/* Join AG headers to the new transaction. */
	if (sc->sa.agi_bp)
		xfs_trans_bjoin(sc->tp, sc->sa.agi_bp);
	if (sc->sa.agf_bp)
		xfs_trans_bjoin(sc->tp, sc->sa.agf_bp);
	if (sc->sa.agfl_bp)
		xfs_trans_bjoin(sc->tp, sc->sa.agfl_bp);

	return 0;
}
  140. /*
  141. * Does the given AG have enough space to rebuild a btree? Neither AG
  142. * reservation can be critical, and we must have enough space (factoring
  143. * in AG reservations) to construct a whole btree.
  144. */
  145. bool
  146. xrep_ag_has_space(
  147. struct xfs_perag *pag,
  148. xfs_extlen_t nr_blocks,
  149. enum xfs_ag_resv_type type)
  150. {
  151. return !xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) &&
  152. !xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA) &&
  153. pag->pagf_freeblks > xfs_ag_resv_needed(pag, type) + nr_blocks;
  154. }
/*
 * Figure out how many blocks to reserve for an AG repair.  We calculate the
 * worst case estimate for the number of blocks we'd need to rebuild one of
 * any type of per-AG btree.
 */
xfs_extlen_t
xrep_calc_ag_resblks(
	struct xfs_scrub		*sc)
{
	struct xfs_mount		*mp = sc->mp;
	struct xfs_scrub_metadata	*sm = sc->sm;
	struct xfs_perag		*pag;
	struct xfs_buf			*bp;
	xfs_agino_t			icount = NULLAGINO;
	xfs_extlen_t			aglen = NULLAGBLOCK;
	xfs_extlen_t			usedlen;
	xfs_extlen_t			freelen;
	xfs_extlen_t			bnobt_sz;
	xfs_extlen_t			inobt_sz;
	xfs_extlen_t			rmapbt_sz;
	xfs_extlen_t			refcbt_sz;
	int				error;

	/* Userspace didn't ask for a repair, so no reservation is needed. */
	if (!(sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR))
		return 0;

	pag = xfs_perag_get(mp, sm->sm_agno);
	if (pag->pagi_init) {
		/* Use in-core icount if possible. */
		icount = pag->pagi_count;
	} else {
		/* Try to get the actual counters from disk. */
		error = xfs_ialloc_read_agi(pag, NULL, &bp);
		if (!error) {
			icount = pag->pagi_count;
			xfs_buf_relse(bp);
		}
	}

	/* Now grab the block counters from the AGF. */
	error = xfs_alloc_read_agf(pag, NULL, 0, &bp);
	if (error) {
		/* AGF unreadable; assume the whole AG is in use. */
		aglen = pag->block_count;
		freelen = aglen;
		usedlen = aglen;
	} else {
		struct xfs_agf	*agf = bp->b_addr;

		aglen = be32_to_cpu(agf->agf_length);
		freelen = be32_to_cpu(agf->agf_freeblks);
		usedlen = aglen - freelen;
		xfs_buf_relse(bp);
	}

	/* If the icount is impossible, make some worst-case assumptions. */
	if (icount == NULLAGINO ||
	    !xfs_verify_agino(pag, icount)) {
		icount = pag->agino_max - pag->agino_min + 1;
	}

	/* If the block counts are impossible, make worst-case assumptions. */
	if (aglen == NULLAGBLOCK ||
	    aglen != pag->block_count ||
	    freelen >= aglen) {
		aglen = pag->block_count;
		freelen = aglen;
		usedlen = aglen;
	}
	xfs_perag_put(pag);

	trace_xrep_calc_ag_resblks(mp, sm->sm_agno, icount, aglen,
			freelen, usedlen);

	/*
	 * Figure out how many blocks we'd need worst case to rebuild
	 * each type of btree.  Note that we can only rebuild the
	 * bnobt/cntbt or inobt/finobt as pairs.
	 */
	bnobt_sz = 2 * xfs_allocbt_calc_size(mp, freelen);
	if (xfs_has_sparseinodes(mp))
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_HOLEMASK_BIT);
	else
		inobt_sz = xfs_iallocbt_calc_size(mp, icount /
				XFS_INODES_PER_CHUNK);
	if (xfs_has_finobt(mp))
		inobt_sz *= 2;
	if (xfs_has_reflink(mp))
		refcbt_sz = xfs_refcountbt_calc_size(mp, usedlen);
	else
		refcbt_sz = 0;
	if (xfs_has_rmapbt(mp)) {
		/*
		 * Guess how many blocks we need to rebuild the rmapbt.
		 * For non-reflink filesystems we can't have more records than
		 * used blocks.  However, with reflink it's possible to have
		 * more than one rmap record per AG block.  We don't know how
		 * many rmaps there could be in the AG, so we start off with
		 * what we hope is a generous over-estimation.
		 */
		if (xfs_has_reflink(mp))
			rmapbt_sz = xfs_rmapbt_calc_size(mp,
					(unsigned long long)aglen * 2);
		else
			rmapbt_sz = xfs_rmapbt_calc_size(mp, usedlen);
	} else {
		rmapbt_sz = 0;
	}

	trace_xrep_calc_ag_resblks_btsize(mp, sm->sm_agno, bnobt_sz,
			inobt_sz, rmapbt_sz, refcbt_sz);

	/* Reserve enough for the largest of the four rebuild estimates. */
	return max(max(bnobt_sz, inobt_sz), max(rmapbt_sz, refcbt_sz));
}
/* Allocate a block in an AG. */
int
xrep_alloc_ag_block(
	struct xfs_scrub		*sc,
	const struct xfs_owner_info	*oinfo,
	xfs_fsblock_t			*fsbno,
	enum xfs_ag_resv_type		resv)
{
	struct xfs_alloc_arg		args = {0};
	xfs_agblock_t			bno;
	int				error;

	switch (resv) {
	case XFS_AG_RESV_AGFL:
	case XFS_AG_RESV_RMAPBT:
		/* These reservation types take their block from the AGFL. */
		error = xfs_alloc_get_freelist(sc->sa.pag, sc->tp,
				sc->sa.agf_bp, &bno, 1);
		if (error)
			return error;
		if (bno == NULLAGBLOCK)
			return -ENOSPC;
		xfs_extent_busy_reuse(sc->mp, sc->sa.pag, bno, 1, false);
		*fsbno = XFS_AGB_TO_FSB(sc->mp, sc->sa.pag->pag_agno, bno);
		if (resv == XFS_AG_RESV_RMAPBT)
			xfs_ag_resv_rmapbt_alloc(sc->mp, sc->sa.pag->pag_agno);
		return 0;
	default:
		break;
	}

	/* Otherwise, allocate exactly one block from this AG. */
	args.tp = sc->tp;
	args.mp = sc->mp;
	args.oinfo = *oinfo;
	args.fsbno = XFS_AGB_TO_FSB(args.mp, sc->sa.pag->pag_agno, 0);
	args.minlen = 1;
	args.maxlen = 1;
	args.prod = 1;
	args.type = XFS_ALLOCTYPE_THIS_AG;
	args.resv = resv;

	error = xfs_alloc_vextent(&args);
	if (error)
		return error;
	if (args.fsbno == NULLFSBLOCK)
		return -ENOSPC;
	ASSERT(args.len == 1);
	*fsbno = args.fsbno;

	return 0;
}
/*
 * Initialize a new AG btree root block with zero entries.
 *
 * The new block at @fsb is zeroed, stamped as a level-0 btree block of type
 * @btnum, typed and logged in the scrub transaction, and returned in @bpp
 * with @ops attached.
 */
int
xrep_init_btblock(
	struct xfs_scrub		*sc,
	xfs_fsblock_t			fsb,
	struct xfs_buf			**bpp,
	xfs_btnum_t			btnum,
	const struct xfs_buf_ops	*ops)
{
	struct xfs_trans		*tp = sc->tp;
	struct xfs_mount		*mp = sc->mp;
	struct xfs_buf			*bp;
	int				error;

	trace_xrep_init_btblock(mp, XFS_FSB_TO_AGNO(mp, fsb),
			XFS_FSB_TO_AGBNO(mp, fsb), btnum);

	/* The block must belong to the AG we're repairing. */
	ASSERT(XFS_FSB_TO_AGNO(mp, fsb) == sc->sa.pag->pag_agno);
	error = xfs_trans_get_buf(tp, mp->m_ddev_targp,
			XFS_FSB_TO_DADDR(mp, fsb), XFS_FSB_TO_BB(mp, 1), 0,
			&bp);
	if (error)
		return error;
	xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
	xfs_btree_init_block(mp, bp, btnum, 0, 0, sc->sa.pag->pag_agno);
	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_BTREE_BUF);
	xfs_trans_log_buf(tp, bp, 0, BBTOB(bp->b_length) - 1);
	bp->b_ops = ops;
	*bpp = bp;

	return 0;
}
  334. /*
  335. * Reconstructing per-AG Btrees
  336. *
  337. * When a space btree is corrupt, we don't bother trying to fix it. Instead,
  338. * we scan secondary space metadata to derive the records that should be in
  339. * the damaged btree, initialize a fresh btree root, and insert the records.
  340. * Note that for rebuilding the rmapbt we scan all the primary data to
  341. * generate the new records.
  342. *
  343. * However, that leaves the matter of removing all the metadata describing the
  344. * old broken structure. For primary metadata we use the rmap data to collect
  345. * every extent with a matching rmap owner (bitmap); we then iterate all other
  346. * metadata structures with the same rmap owner to collect the extents that
  347. * cannot be removed (sublist). We then subtract sublist from bitmap to
  348. * derive the blocks that were used by the old btree. These blocks can be
  349. * reaped.
  350. *
  351. * For rmapbt reconstructions we must use different tactics for extent
  352. * collection. First we iterate all primary metadata (this excludes the old
  353. * rmapbt, obviously) to generate new rmap records. The gaps in the rmap
  354. * records are collected as bitmap. The bnobt records are collected as
  355. * sublist. As with the other btrees we subtract sublist from bitmap, and the
  356. * result (since the rmapbt lives in the free space) are the blocks from the
  357. * old rmapbt.
  358. *
  359. * Disposal of Blocks from Old per-AG Btrees
  360. *
  361. * Now that we've constructed a new btree to replace the damaged one, we want
  362. * to dispose of the blocks that (we think) the old btree was using.
  363. * Previously, we used the rmapbt to collect the extents (bitmap) with the
  364. * rmap owner corresponding to the tree we rebuilt, collected extents for any
  365. * blocks with the same rmap owner that are owned by another data structure
  366. * (sublist), and subtracted sublist from bitmap. In theory the extents
  367. * remaining in bitmap are the old btree's blocks.
  368. *
  369. * Unfortunately, it's possible that the btree was crosslinked with other
  370. * blocks on disk. The rmap data can tell us if there are multiple owners, so
  371. * if the rmapbt says there is an owner of this block other than @oinfo, then
  372. * the block is crosslinked. Remove the reverse mapping and continue.
  373. *
  374. * If there is one rmap record, we can free the block, which removes the
  375. * reverse mapping but doesn't add the block to the free space. Our repair
  376. * strategy is to hope the other metadata objects crosslinked on this block
  377. * will be rebuilt (atop different blocks), thereby removing all the cross
  378. * links.
  379. *
  380. * If there are no rmap records at all, we also free the block. If the btree
  381. * being rebuilt lives in the free space (bnobt/cntbt/rmapbt) then there isn't
  382. * supposed to be a rmap record and everything is ok. For other btrees there
  383. * had to have been an rmap entry for the block to have ended up on @bitmap,
  384. * so if it's gone now there's something wrong and the fs will shut down.
  385. *
  386. * Note: If there are multiple rmap records with only the same rmap owner as
  387. * the btree we're trying to rebuild and the block is indeed owned by another
  388. * data structure with the same rmap owner, then the block will be in sublist
  389. * and therefore doesn't need disposal. If there are multiple rmap records
  390. * with only the same rmap owner but the block is not owned by something with
  391. * the same rmap owner, the block will be freed.
  392. *
  393. * The caller is responsible for locking the AG headers for the entire rebuild
  394. * operation so that nothing else can sneak in and change the AG state while
  395. * we're not looking. We also assume that the caller already invalidated any
  396. * buffers associated with @bitmap.
  397. */
/*
 * Invalidate buffers for per-AG btree blocks we're dumping.  This function
 * is not intended for use with file data repairs; we have bunmapi for that.
 */
int
xrep_invalidate_blocks(
	struct xfs_scrub	*sc,
	struct xbitmap		*bitmap)
{
	struct xbitmap_range	*bmr;
	struct xbitmap_range	*n;
	struct xfs_buf		*bp;
	xfs_fsblock_t		fsbno;

	/*
	 * For each block in each extent, see if there's an incore buffer for
	 * exactly that block; if so, invalidate it.  The buffer cache only
	 * lets us look for one buffer at a time, so we have to look one block
	 * at a time.  Avoid invalidating AG headers and post-EOFS blocks
	 * because we never own those; and if we can't TRYLOCK the buffer we
	 * assume it's owned by someone else.
	 */
	for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
		int		error;

		/* Skip AG headers and post-EOFS blocks */
		if (!xfs_verify_fsbno(sc->mp, fsbno))
			continue;
		error = xfs_buf_incore(sc->mp->m_ddev_targp,
				XFS_FSB_TO_DADDR(sc->mp, fsbno),
				XFS_FSB_TO_BB(sc->mp, 1), XBF_TRYLOCK, &bp);
		if (error)
			continue;

		/* Join to the transaction so the invalidation is logged. */
		xfs_trans_bjoin(sc->tp, bp);
		xfs_trans_binval(sc->tp, bp);
	}

	return 0;
}
  434. /* Ensure the freelist is the correct size. */
  435. int
  436. xrep_fix_freelist(
  437. struct xfs_scrub *sc,
  438. bool can_shrink)
  439. {
  440. struct xfs_alloc_arg args = {0};
  441. args.mp = sc->mp;
  442. args.tp = sc->tp;
  443. args.agno = sc->sa.pag->pag_agno;
  444. args.alignment = 1;
  445. args.pag = sc->sa.pag;
  446. return xfs_alloc_fix_freelist(&args,
  447. can_shrink ? 0 : XFS_ALLOC_FLAG_NOSHRINK);
  448. }
/*
 * Put a block back on the AGFL.
 */
STATIC int
xrep_put_freelist(
	struct xfs_scrub	*sc,
	xfs_agblock_t		agbno)
{
	int			error;

	/* Make sure there's space on the freelist. */
	error = xrep_fix_freelist(sc, true);
	if (error)
		return error;

	/*
	 * Since we're "freeing" a lost block onto the AGFL, we have to
	 * create an rmap for the block prior to merging it or else other
	 * parts will break.
	 */
	error = xfs_rmap_alloc(sc->tp, sc->sa.agf_bp, sc->sa.pag, agbno, 1,
			&XFS_RMAP_OINFO_AG);
	if (error)
		return error;

	/* Put the block on the AGFL. */
	error = xfs_alloc_put_freelist(sc->sa.pag, sc->tp, sc->sa.agf_bp,
			sc->sa.agfl_bp, agbno, 0);
	if (error)
		return error;

	/* Mark the block busy until the transaction commits; no discard. */
	xfs_extent_busy_insert(sc->tp, sc->sa.pag, agbno, 1,
			XFS_EXTENT_BUSY_SKIP_DISCARD);

	return 0;
}
/* Dispose of a single block. */
STATIC int
xrep_reap_block(
	struct xfs_scrub		*sc,
	xfs_fsblock_t			fsbno,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		resv)
{
	struct xfs_btree_cur		*cur;
	struct xfs_buf			*agf_bp = NULL;
	xfs_agblock_t			agbno;
	bool				has_other_rmap;
	int				error;

	agbno = XFS_FSB_TO_AGBNO(sc->mp, fsbno);
	ASSERT(XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);

	/*
	 * If we are repairing per-inode metadata, we need to read in the AGF
	 * buffer.  Otherwise, we're repairing a per-AG structure, so reuse
	 * the AGF buffer that the setup functions already grabbed.
	 */
	if (sc->ip) {
		error = xfs_alloc_read_agf(sc->sa.pag, sc->tp, 0, &agf_bp);
		if (error)
			return error;
	} else {
		agf_bp = sc->sa.agf_bp;
	}
	cur = xfs_rmapbt_init_cursor(sc->mp, sc->tp, agf_bp, sc->sa.pag);

	/* Can we find any other rmappings? */
	error = xfs_rmap_has_other_keys(cur, agbno, 1, oinfo, &has_other_rmap);
	xfs_btree_del_cursor(cur, error);
	if (error)
		goto out_free;

	/*
	 * If there are other rmappings, this block is cross linked and must
	 * not be freed.  Remove the reverse mapping and move on.  Otherwise,
	 * we were the only owner of the block, so free the extent, which will
	 * also remove the rmap.
	 *
	 * XXX: XFS doesn't support detecting the case where a single block
	 * metadata structure is crosslinked with a multi-block structure
	 * because the buffer cache doesn't detect aliasing problems, so we
	 * can't fix 100% of crosslinking problems (yet).  The verifiers will
	 * blow on writeout, the filesystem will shut down, and the admin gets
	 * to run xfs_repair.
	 */
	if (has_other_rmap)
		error = xfs_rmap_free(sc->tp, agf_bp, sc->sa.pag, agbno,
					1, oinfo);
	else if (resv == XFS_AG_RESV_AGFL)
		error = xrep_put_freelist(sc, agbno);
	else
		error = xfs_free_extent(sc->tp, fsbno, 1, oinfo, resv);

	/* Only release the AGF buffer if we read it in ourselves. */
	if (agf_bp != sc->sa.agf_bp)
		xfs_trans_brelse(sc->tp, agf_bp);
	if (error)
		return error;

	/* Roll the transaction so the next disposal starts fresh. */
	if (sc->ip)
		return xfs_trans_roll_inode(&sc->tp, sc->ip);
	return xrep_roll_ag_trans(sc);

out_free:
	if (agf_bp != sc->sa.agf_bp)
		xfs_trans_brelse(sc->tp, agf_bp);
	return error;
}
/* Dispose of every block of every extent in the bitmap. */
int
xrep_reap_extents(
	struct xfs_scrub		*sc,
	struct xbitmap			*bitmap,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type)
{
	struct xbitmap_range		*bmr;
	struct xbitmap_range		*n;
	xfs_fsblock_t			fsbno;
	int				error = 0;

	/* Reaping queries the rmapbt for crosslinks, so it must exist. */
	ASSERT(xfs_has_rmapbt(sc->mp));

	for_each_xbitmap_block(fsbno, bmr, n, bitmap) {
		ASSERT(sc->ip != NULL ||
		       XFS_FSB_TO_AGNO(sc->mp, fsbno) == sc->sa.pag->pag_agno);
		trace_xrep_dispose_btree_extent(sc->mp,
				XFS_FSB_TO_AGNO(sc->mp, fsbno),
				XFS_FSB_TO_AGBNO(sc->mp, fsbno), 1);

		error = xrep_reap_block(sc, fsbno, oinfo, type);
		if (error)
			break;
	}

	return error;
}
  570. /*
  571. * Finding per-AG Btree Roots for AGF/AGI Reconstruction
  572. *
  573. * If the AGF or AGI become slightly corrupted, it may be necessary to rebuild
  574. * the AG headers by using the rmap data to rummage through the AG looking for
  575. * btree roots. This is not guaranteed to work if the AG is heavily damaged
  576. * or the rmap data are corrupt.
  577. *
  578. * Callers of xrep_find_ag_btree_roots must lock the AGF and AGFL
  579. * buffers if the AGF is being rebuilt; or the AGF and AGI buffers if the
  580. * AGI is being rebuilt. It must maintain these locks until it's safe for
  581. * other threads to change the btrees' shapes. The caller provides
  582. * information about the btrees to look for by passing in an array of
  583. * xrep_find_ag_btree with the (rmap owner, buf_ops, magic) fields set.
  584. * The (root, height) fields will be set on return if anything is found. The
  585. * last element of the array should have a NULL buf_ops to mark the end of the
  586. * array.
  587. *
  588. * For every rmapbt record matching any of the rmap owners in btree_info,
  589. * read each block referenced by the rmap record. If the block is a btree
  590. * block from this filesystem matching any of the magic numbers and has a
  591. * level higher than what we've already seen, remember the block and the
  592. * height of the tree required to have such a block. When the call completes,
  593. * we return the highest block we've found for each btree description; those
  594. * should be the roots.
  595. */
/* Context for the rmapbt walk that hunts for btree root blocks. */
struct xrep_findroot {
	struct xfs_scrub		*sc;
	struct xfs_buf			*agfl_bp;	/* locked AGFL, for filtering freelist blocks */
	struct xfs_agf			*agf;		/* AGF contents, needed to walk the AGFL */
	struct xrep_find_ag_btree	*btree_info;	/* btrees to find; terminated by NULL buf_ops */
};
  602. /* See if our block is in the AGFL. */
  603. STATIC int
  604. xrep_findroot_agfl_walk(
  605. struct xfs_mount *mp,
  606. xfs_agblock_t bno,
  607. void *priv)
  608. {
  609. xfs_agblock_t *agbno = priv;
  610. return (*agbno == bno) ? -ECANCELED : 0;
  611. }
/* Does this block match the btree information passed in? */
STATIC int
xrep_findroot_block(
	struct xrep_findroot		*ri,
	struct xrep_find_ag_btree	*fab,
	uint64_t			owner,
	xfs_agblock_t			agbno,
	bool				*done_with_block)
{
	struct xfs_mount		*mp = ri->sc->mp;
	struct xfs_buf			*bp;
	struct xfs_btree_block		*btblock;
	xfs_daddr_t			daddr;
	int				block_level;
	int				error = 0;

	daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.pag->pag_agno, agbno);

	/*
	 * Blocks in the AGFL have stale contents that might just happen to
	 * have a matching magic and uuid.  We don't want to pull these blocks
	 * in as part of a tree root, so we have to filter out the AGFL stuff
	 * here.  If the AGFL looks insane we'll just refuse to repair.
	 */
	if (owner == XFS_RMAP_OWN_AG) {
		error = xfs_agfl_walk(mp, ri->agf, ri->agfl_bp,
				xrep_findroot_agfl_walk, &agbno);
		if (error == -ECANCELED)
			return 0;
		if (error)
			return error;
	}

	/*
	 * Read the buffer into memory so that we can see if it's a match for
	 * our btree type.  We have no clue if it is beforehand, and we want to
	 * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
	 * will cause needless disk reads in subsequent calls to this function)
	 * and logging metadata verifier failures.
	 *
	 * Therefore, pass in NULL buffer ops.  If the buffer was already in
	 * memory from some other caller it will already have b_ops assigned.
	 * If it was in memory from a previous unsuccessful findroot_block
	 * call, the buffer won't have b_ops but it should be clean and ready
	 * for us to try to verify if the read call succeeds.  The same applies
	 * if the buffer wasn't in memory at all.
	 *
	 * Note: If we never match a btree type with this buffer, it will be
	 * left in memory with NULL b_ops.  This shouldn't be a problem unless
	 * the buffer gets written.
	 */
	error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
			mp->m_bsize, 0, &bp, NULL);
	if (error)
		return error;

	/* Ensure the block magic matches the btree type we're looking for. */
	btblock = XFS_BUF_TO_BLOCK(bp);
	ASSERT(fab->buf_ops->magic[1] != 0);
	if (btblock->bb_magic != fab->buf_ops->magic[1])
		goto out;

	/*
	 * If the buffer already has ops applied and they're not the ones for
	 * this btree type, we know this block doesn't match the btree and we
	 * can bail out.
	 *
	 * If the buffer ops match ours, someone else has already validated
	 * the block for us, so we can move on to checking if this is a root
	 * block candidate.
	 *
	 * If the buffer does not have ops, nobody has successfully validated
	 * the contents and the buffer cannot be dirty.  If the magic, uuid,
	 * and structure match this btree type then we'll move on to checking
	 * if it's a root block candidate.  If there is no match, bail out.
	 */
	if (bp->b_ops) {
		if (bp->b_ops != fab->buf_ops)
			goto out;
	} else {
		ASSERT(!xfs_trans_buf_is_dirty(bp));
		if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
				&mp->m_sb.sb_meta_uuid))
			goto out;
		/*
		 * Read verifiers can reference b_ops, so we set the pointer
		 * here.  If the verifier fails we'll reset the buffer state
		 * to what it was before we touched the buffer.
		 */
		bp->b_ops = fab->buf_ops;
		fab->buf_ops->verify_read(bp);
		if (bp->b_error) {
			bp->b_ops = NULL;
			bp->b_error = 0;
			goto out;
		}

		/*
		 * Some read verifiers will (re)set b_ops, so we must be
		 * careful not to change b_ops after running the verifier.
		 */
	}

	/*
	 * This block passes the magic/uuid and verifier tests for this btree
	 * type.  We don't need the caller to try the other tree types.
	 */
	*done_with_block = true;

	/*
	 * Compare this btree block's level to the height of the current
	 * candidate root block.
	 *
	 * If the level matches the root we found previously, throw away both
	 * blocks because there can't be two candidate roots.
	 *
	 * If level is lower in the tree than the root we found previously,
	 * ignore this block.
	 */
	block_level = xfs_btree_get_level(btblock);
	if (block_level + 1 == fab->height) {
		fab->root = NULLAGBLOCK;
		goto out;
	} else if (block_level < fab->height) {
		goto out;
	}

	/*
	 * This is the highest block in the tree that we've found so far.
	 * Update the btree height to reflect what we've learned from this
	 * block.
	 */
	fab->height = block_level + 1;

	/*
	 * If this block doesn't have sibling pointers, then it's the new root
	 * block candidate.  Otherwise, the root will be found farther up the
	 * tree.
	 */
	if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
	    btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
		fab->root = agbno;
	else
		fab->root = NULLAGBLOCK;

	trace_xrep_findroot_block(mp, ri->sc->sa.pag->pag_agno, agbno,
			be32_to_cpu(btblock->bb_magic), fab->height - 1);
out:
	xfs_trans_brelse(ri->sc->tp, bp);
	return error;
}
  752. /*
  753. * Do any of the blocks in this rmap record match one of the btrees we're
  754. * looking for?
  755. */
  756. STATIC int
  757. xrep_findroot_rmap(
  758. struct xfs_btree_cur *cur,
  759. const struct xfs_rmap_irec *rec,
  760. void *priv)
  761. {
  762. struct xrep_findroot *ri = priv;
  763. struct xrep_find_ag_btree *fab;
  764. xfs_agblock_t b;
  765. bool done;
  766. int error = 0;
  767. /* Ignore anything that isn't AG metadata. */
  768. if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner))
  769. return 0;
  770. /* Otherwise scan each block + btree type. */
  771. for (b = 0; b < rec->rm_blockcount; b++) {
  772. done = false;
  773. for (fab = ri->btree_info; fab->buf_ops; fab++) {
  774. if (rec->rm_owner != fab->rmap_owner)
  775. continue;
  776. error = xrep_findroot_block(ri, fab,
  777. rec->rm_owner, rec->rm_startblock + b,
  778. &done);
  779. if (error)
  780. return error;
  781. if (done)
  782. break;
  783. }
  784. }
  785. return 0;
  786. }
/* Find the roots of the per-AG btrees described in btree_info. */
int
xrep_find_ag_btree_roots(
	struct xfs_scrub		*sc,
	struct xfs_buf			*agf_bp,
	struct xrep_find_ag_btree	*btree_info,
	struct xfs_buf			*agfl_bp)
{
	struct xfs_mount		*mp = sc->mp;
	struct xrep_findroot		ri;
	struct xrep_find_ag_btree	*fab;
	struct xfs_btree_cur		*cur;
	int				error;

	/* The caller must hold the AGF (and AGFL, if given) locked. */
	ASSERT(xfs_buf_islocked(agf_bp));
	ASSERT(agfl_bp == NULL || xfs_buf_islocked(agfl_bp));

	ri.sc = sc;
	ri.btree_info = btree_info;
	ri.agf = agf_bp->b_addr;
	ri.agfl_bp = agfl_bp;
	for (fab = btree_info; fab->buf_ops; fab++) {
		/* Searching for OWN_AG btrees requires the AGFL for filtering. */
		ASSERT(agfl_bp || fab->rmap_owner != XFS_RMAP_OWN_AG);
		ASSERT(XFS_RMAP_NON_INODE_OWNER(fab->rmap_owner));
		fab->root = NULLAGBLOCK;
		fab->height = 0;
	}

	/* Walk every rmap record, matching blocks against each btree. */
	cur = xfs_rmapbt_init_cursor(mp, sc->tp, agf_bp, sc->sa.pag);
	error = xfs_rmap_query_all(cur, xrep_findroot_rmap, &ri);
	xfs_btree_del_cursor(cur, error);

	return error;
}
/* Force a quotacheck the next time we mount. */
void
xrep_force_quotacheck(
	struct xfs_scrub	*sc,
	xfs_dqtype_t		type)
{
	uint			flag;

	flag = xfs_quota_chkd_flag(type);
	if (!(flag & sc->mp->m_qflags))
		return;

	/* Clear the CHKD flag both in-core and in the on-disk superblock. */
	mutex_lock(&sc->mp->m_quotainfo->qi_quotaofflock);
	sc->mp->m_qflags &= ~flag;
	spin_lock(&sc->mp->m_sb_lock);
	sc->mp->m_sb.sb_qflags &= ~flag;
	spin_unlock(&sc->mp->m_sb_lock);
	xfs_log_sb(sc->tp);
	mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock);
}
/*
 * Attach dquots to this inode, or schedule quotacheck to fix them.
 *
 * This function ensures that the appropriate dquots are attached to an inode.
 * We cannot allow the dquot code to allocate an on-disk dquot block here
 * because we're already in transaction context with the inode locked.  The
 * on-disk dquot should already exist anyway.  If the quota code signals
 * corruption or missing quota information, schedule quotacheck, which will
 * repair corruptions in the quota metadata.
 */
int
xrep_ino_dqattach(
	struct xfs_scrub	*sc)
{
	int			error;

	error = xfs_qm_dqattach_locked(sc->ip, false);
	switch (error) {
	case -EFSBADCRC:
	case -EFSCORRUPTED:
	case -ENOENT:
		/* Quota metadata is broken; force a quotacheck at next mount. */
		xfs_err_ratelimited(sc->mp,
"inode %llu repair encountered quota error %d, quotacheck forced.",
				(unsigned long long)sc->ip->i_ino, error);
		if (XFS_IS_UQUOTA_ON(sc->mp) && !sc->ip->i_udquot)
			xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
		if (XFS_IS_GQUOTA_ON(sc->mp) && !sc->ip->i_gdquot)
			xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
		if (XFS_IS_PQUOTA_ON(sc->mp) && !sc->ip->i_pdquot)
			xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
		fallthrough;
	case -ESRCH:
		/* Missing dquots are not an error for repair purposes. */
		error = 0;
		break;
	default:
		break;
	}

	return error;
}