recovery.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938
  1. // SPDX-License-Identifier: GPL-2.0+
  2. /*
  3. * linux/fs/jbd2/recovery.c
  4. *
  5. * Written by Stephen C. Tweedie <sct@redhat.com>, 1999
  6. *
  7. * Copyright 1999-2000 Red Hat Software --- All Rights Reserved
  8. *
  9. * Journal recovery routines for the generic filesystem journaling code;
  10. * part of the ext2fs journaling system.
  11. */
  12. #ifndef __KERNEL__
  13. #include "jfs_user.h"
  14. #else
  15. #include <linux/time.h>
  16. #include <linux/fs.h>
  17. #include <linux/jbd2.h>
  18. #include <linux/errno.h>
  19. #include <linux/crc32.h>
  20. #include <linux/blkdev.h>
  21. #endif
  22. /*
  23. * Maintain information about the progress of the recovery job, so that
  24. * the different passes can carry information between them.
  25. */
  26. struct recovery_info
  27. {
  28. tid_t start_transaction;
  29. tid_t end_transaction;
  30. int nr_replays;
  31. int nr_revokes;
  32. int nr_revoke_hits;
  33. };
  34. static int do_one_pass(journal_t *journal,
  35. struct recovery_info *info, enum passtype pass);
  36. static int scan_revoke_records(journal_t *, struct buffer_head *,
  37. tid_t, struct recovery_info *);
  38. #ifdef __KERNEL__
  /* Drop the reference held on each of the first @n buffers stored in @b. */
  static void journal_brelse_array(struct buffer_head *b[], int n)
  {
      int i;

      /* Release from the last slot down to slot 0. */
      for (i = n - 1; i >= 0; i--)
          brelse(b[i]);
  }
  45. /*
  46. * When reading from the journal, we are going through the block device
  47. * layer directly and so there is no readahead being done for us. We
  48. * need to implement any readahead ourselves if we want it to happen at
  49. * all. Recovery is basically one long sequential read, so make sure we
  50. * do the IO in reasonably large chunks.
  51. *
  52. * This is not so critical that we need to be enormously clever about
  53. * the readahead size, though. 128K is a purely arbitrary, good-enough
  54. * fixed value.
  55. */
  56. #define MAXBUF 8
  57. static int do_readahead(journal_t *journal, unsigned int start)
  58. {
  59. int err;
  60. unsigned int max, nbufs, next;
  61. unsigned long long blocknr;
  62. struct buffer_head *bh;
  63. struct buffer_head * bufs[MAXBUF];
  64. /* Do up to 128K of readahead */
  65. max = start + (128 * 1024 / journal->j_blocksize);
  66. if (max > journal->j_total_len)
  67. max = journal->j_total_len;
  68. /* Do the readahead itself. We'll submit MAXBUF buffer_heads at
  69. * a time to the block device IO layer. */
  70. nbufs = 0;
  71. for (next = start; next < max; next++) {
  72. err = jbd2_journal_bmap(journal, next, &blocknr);
  73. if (err) {
  74. printk(KERN_ERR "JBD2: bad block at offset %u\n",
  75. next);
  76. goto failed;
  77. }
  78. bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
  79. if (!bh) {
  80. err = -ENOMEM;
  81. goto failed;
  82. }
  83. if (!buffer_uptodate(bh) && !buffer_locked(bh)) {
  84. bufs[nbufs++] = bh;
  85. if (nbufs == MAXBUF) {
  86. bh_readahead_batch(nbufs, bufs, 0);
  87. journal_brelse_array(bufs, nbufs);
  88. nbufs = 0;
  89. }
  90. } else
  91. brelse(bh);
  92. }
  93. if (nbufs)
  94. bh_readahead_batch(nbufs, bufs, 0);
  95. err = 0;
  96. failed:
  97. if (nbufs)
  98. journal_brelse_array(bufs, nbufs);
  99. return err;
  100. }
  101. #endif /* __KERNEL__ */
  102. /*
  103. * Read a block from the journal
  104. */
  105. static int jread(struct buffer_head **bhp, journal_t *journal,
  106. unsigned int offset)
  107. {
  108. int err;
  109. unsigned long long blocknr;
  110. struct buffer_head *bh;
  111. *bhp = NULL;
  112. if (offset >= journal->j_total_len) {
  113. printk(KERN_ERR "JBD2: corrupted journal superblock\n");
  114. return -EFSCORRUPTED;
  115. }
  116. err = jbd2_journal_bmap(journal, offset, &blocknr);
  117. if (err) {
  118. printk(KERN_ERR "JBD2: bad block at offset %u\n",
  119. offset);
  120. return err;
  121. }
  122. bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize);
  123. if (!bh)
  124. return -ENOMEM;
  125. if (!buffer_uptodate(bh)) {
  126. /*
  127. * If this is a brand new buffer, start readahead.
  128. * Otherwise, we assume we are already reading it.
  129. */
  130. bool need_readahead = !buffer_req(bh);
  131. bh_read_nowait(bh, 0);
  132. if (need_readahead)
  133. do_readahead(journal, offset);
  134. wait_on_buffer(bh);
  135. }
  136. if (!buffer_uptodate(bh)) {
  137. printk(KERN_ERR "JBD2: Failed to read block at offset %u\n",
  138. offset);
  139. brelse(bh);
  140. return -EIO;
  141. }
  142. *bhp = bh;
  143. return 0;
  144. }
  145. static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf)
  146. {
  147. struct jbd2_journal_block_tail *tail;
  148. __be32 provided;
  149. __u32 calculated;
  150. if (!jbd2_journal_has_csum_v2or3(j))
  151. return 1;
  152. tail = (struct jbd2_journal_block_tail *)((char *)buf +
  153. j->j_blocksize - sizeof(struct jbd2_journal_block_tail));
  154. provided = tail->t_checksum;
  155. tail->t_checksum = 0;
  156. calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
  157. tail->t_checksum = provided;
  158. return provided == cpu_to_be32(calculated);
  159. }
  160. /*
  161. * Count the number of in-use tags in a journal descriptor block.
  162. */
  163. static int count_tags(journal_t *journal, struct buffer_head *bh)
  164. {
  165. char * tagp;
  166. journal_block_tag_t tag;
  167. int nr = 0, size = journal->j_blocksize;
  168. int tag_bytes = journal_tag_bytes(journal);
  169. if (jbd2_journal_has_csum_v2or3(journal))
  170. size -= sizeof(struct jbd2_journal_block_tail);
  171. tagp = &bh->b_data[sizeof(journal_header_t)];
  172. while ((tagp - bh->b_data + tag_bytes) <= size) {
  173. memcpy(&tag, tagp, sizeof(tag));
  174. nr++;
  175. tagp += tag_bytes;
  176. if (!(tag.t_flags & cpu_to_be16(JBD2_FLAG_SAME_UUID)))
  177. tagp += 16;
  178. if (tag.t_flags & cpu_to_be16(JBD2_FLAG_LAST_TAG))
  179. break;
  180. }
  181. return nr;
  182. }
  /*
   * Make sure we wrap around the log correctly!
   *
   * If @var has reached or passed (journal)->j_last, pull it back by the
   * length of the area between j_first and j_last.  @var must be a modifiable
   * lvalue expression without side effects: it is evaluated more than once.
   * Both arguments are parenthesized so that expressions such as *ptr expand
   * safely.
   */
  #define wrap(journal, var)                                          \
  do {                                                                \
      if ((var) >= (journal)->j_last)                                 \
          (var) -= ((journal)->j_last - (journal)->j_first);          \
  } while (0)
  189. static int fc_do_one_pass(journal_t *journal,
  190. struct recovery_info *info, enum passtype pass)
  191. {
  192. unsigned int expected_commit_id = info->end_transaction;
  193. unsigned long next_fc_block;
  194. struct buffer_head *bh;
  195. int err = 0;
  196. next_fc_block = journal->j_fc_first;
  197. if (!journal->j_fc_replay_callback)
  198. return 0;
  199. while (next_fc_block <= journal->j_fc_last) {
  200. jbd2_debug(3, "Fast commit replay: next block %ld\n",
  201. next_fc_block);
  202. err = jread(&bh, journal, next_fc_block);
  203. if (err) {
  204. jbd2_debug(3, "Fast commit replay: read error\n");
  205. break;
  206. }
  207. err = journal->j_fc_replay_callback(journal, bh, pass,
  208. next_fc_block - journal->j_fc_first,
  209. expected_commit_id);
  210. brelse(bh);
  211. next_fc_block++;
  212. if (err < 0 || err == JBD2_FC_REPLAY_STOP)
  213. break;
  214. err = 0;
  215. }
  216. if (err)
  217. jbd2_debug(3, "Fast commit replay failed, err = %d\n", err);
  218. return err;
  219. }
  220. /**
  221. * jbd2_journal_recover - recovers a on-disk journal
  222. * @journal: the journal to recover
  223. *
  224. * The primary function for recovering the log contents when mounting a
  225. * journaled device.
  226. *
  227. * Recovery is done in three passes. In the first pass, we look for the
  228. * end of the log. In the second, we assemble the list of revoke
  229. * blocks. In the third and final pass, we replay any un-revoked blocks
  230. * in the log.
  231. */
  232. int jbd2_journal_recover(journal_t *journal)
  233. {
  234. int err, err2;
  235. journal_superblock_t * sb;
  236. struct recovery_info info;
  237. errseq_t wb_err;
  238. struct address_space *mapping;
  239. memset(&info, 0, sizeof(info));
  240. sb = journal->j_superblock;
  241. /*
  242. * The journal superblock's s_start field (the current log head)
  243. * is always zero if, and only if, the journal was cleanly
  244. * unmounted.
  245. */
  246. if (!sb->s_start) {
  247. jbd2_debug(1, "No recovery required, last transaction %d\n",
  248. be32_to_cpu(sb->s_sequence));
  249. journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1;
  250. return 0;
  251. }
  252. wb_err = 0;
  253. mapping = journal->j_fs_dev->bd_inode->i_mapping;
  254. errseq_check_and_advance(&mapping->wb_err, &wb_err);
  255. err = do_one_pass(journal, &info, PASS_SCAN);
  256. if (!err)
  257. err = do_one_pass(journal, &info, PASS_REVOKE);
  258. if (!err)
  259. err = do_one_pass(journal, &info, PASS_REPLAY);
  260. jbd2_debug(1, "JBD2: recovery, exit status %d, "
  261. "recovered transactions %u to %u\n",
  262. err, info.start_transaction, info.end_transaction);
  263. jbd2_debug(1, "JBD2: Replayed %d and revoked %d/%d blocks\n",
  264. info.nr_replays, info.nr_revoke_hits, info.nr_revokes);
  265. /* Restart the log at the next transaction ID, thus invalidating
  266. * any existing commit records in the log. */
  267. journal->j_transaction_sequence = ++info.end_transaction;
  268. jbd2_journal_clear_revoke(journal);
  269. err2 = sync_blockdev(journal->j_fs_dev);
  270. if (!err)
  271. err = err2;
  272. err2 = errseq_check_and_advance(&mapping->wb_err, &wb_err);
  273. if (!err)
  274. err = err2;
  275. /* Make sure all replayed data is on permanent storage */
  276. if (journal->j_flags & JBD2_BARRIER) {
  277. err2 = blkdev_issue_flush(journal->j_fs_dev);
  278. if (!err)
  279. err = err2;
  280. }
  281. return err;
  282. }
  283. /**
  284. * jbd2_journal_skip_recovery - Start journal and wipe exiting records
  285. * @journal: journal to startup
  286. *
  287. * Locate any valid recovery information from the journal and set up the
  288. * journal structures in memory to ignore it (presumably because the
  289. * caller has evidence that it is out of date).
  290. * This function doesn't appear to be exported..
  291. *
  292. * We perform one pass over the journal to allow us to tell the user how
  293. * much recovery information is being erased, and to let us initialise
  294. * the journal transaction sequence numbers to the next unused ID.
  295. */
  296. int jbd2_journal_skip_recovery(journal_t *journal)
  297. {
  298. int err;
  299. struct recovery_info info;
  300. memset (&info, 0, sizeof(info));
  301. err = do_one_pass(journal, &info, PASS_SCAN);
  302. if (err) {
  303. printk(KERN_ERR "JBD2: error %d scanning journal\n", err);
  304. ++journal->j_transaction_sequence;
  305. } else {
  306. #ifdef CONFIG_JBD2_DEBUG
  307. int dropped = info.end_transaction -
  308. be32_to_cpu(journal->j_superblock->s_sequence);
  309. jbd2_debug(1,
  310. "JBD2: ignoring %d transaction%s from the journal.\n",
  311. dropped, (dropped == 1) ? "" : "s");
  312. #endif
  313. journal->j_transaction_sequence = ++info.end_transaction;
  314. }
  315. journal->j_tail = 0;
  316. return err;
  317. }
  318. static inline unsigned long long read_tag_block(journal_t *journal,
  319. journal_block_tag_t *tag)
  320. {
  321. unsigned long long block = be32_to_cpu(tag->t_blocknr);
  322. if (jbd2_has_feature_64bit(journal))
  323. block |= (u64)be32_to_cpu(tag->t_blocknr_high) << 32;
  324. return block;
  325. }
  326. /*
  327. * calc_chksums calculates the checksums for the blocks described in the
  328. * descriptor block.
  329. */
  330. static int calc_chksums(journal_t *journal, struct buffer_head *bh,
  331. unsigned long *next_log_block, __u32 *crc32_sum)
  332. {
  333. int i, num_blks, err;
  334. unsigned long io_block;
  335. struct buffer_head *obh;
  336. num_blks = count_tags(journal, bh);
  337. /* Calculate checksum of the descriptor block. */
  338. *crc32_sum = crc32_be(*crc32_sum, (void *)bh->b_data, bh->b_size);
  339. for (i = 0; i < num_blks; i++) {
  340. io_block = (*next_log_block)++;
  341. wrap(journal, *next_log_block);
  342. err = jread(&obh, journal, io_block);
  343. if (err) {
  344. printk(KERN_ERR "JBD2: IO error %d recovering block "
  345. "%lu in log\n", err, io_block);
  346. return 1;
  347. } else {
  348. *crc32_sum = crc32_be(*crc32_sum, (void *)obh->b_data,
  349. obh->b_size);
  350. }
  351. put_bh(obh);
  352. }
  353. return 0;
  354. }
  355. static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
  356. {
  357. struct commit_header *h;
  358. __be32 provided;
  359. __u32 calculated;
  360. if (!jbd2_journal_has_csum_v2or3(j))
  361. return 1;
  362. h = buf;
  363. provided = h->h_chksum[0];
  364. h->h_chksum[0] = 0;
  365. calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
  366. h->h_chksum[0] = provided;
  367. return provided == cpu_to_be32(calculated);
  368. }
  369. static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
  370. journal_block_tag3_t *tag3,
  371. void *buf, __u32 sequence)
  372. {
  373. __u32 csum32;
  374. __be32 seq;
  375. if (!jbd2_journal_has_csum_v2or3(j))
  376. return 1;
  377. seq = cpu_to_be32(sequence);
  378. csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
  379. csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
  380. if (jbd2_has_feature_csum3(j))
  381. return tag3->t_checksum == cpu_to_be32(csum32);
  382. else
  383. return tag->t_checksum == cpu_to_be16(csum32);
  384. }
  385. static int do_one_pass(journal_t *journal,
  386. struct recovery_info *info, enum passtype pass)
  387. {
  388. unsigned int first_commit_ID, next_commit_ID;
  389. unsigned long next_log_block;
  390. int err, success = 0;
  391. journal_superblock_t * sb;
  392. journal_header_t * tmp;
  393. struct buffer_head * bh;
  394. unsigned int sequence;
  395. int blocktype;
  396. int tag_bytes = journal_tag_bytes(journal);
  397. __u32 crc32_sum = ~0; /* Transactional Checksums */
  398. int descr_csum_size = 0;
  399. int block_error = 0;
  400. bool need_check_commit_time = false;
  401. __u64 last_trans_commit_time = 0, commit_time;
  402. /*
  403. * First thing is to establish what we expect to find in the log
  404. * (in terms of transaction IDs), and where (in terms of log
  405. * block offsets): query the superblock.
  406. */
  407. sb = journal->j_superblock;
  408. next_commit_ID = be32_to_cpu(sb->s_sequence);
  409. next_log_block = be32_to_cpu(sb->s_start);
  410. first_commit_ID = next_commit_ID;
  411. if (pass == PASS_SCAN)
  412. info->start_transaction = first_commit_ID;
  413. jbd2_debug(1, "Starting recovery pass %d\n", pass);
  414. /*
  415. * Now we walk through the log, transaction by transaction,
  416. * making sure that each transaction has a commit block in the
  417. * expected place. Each complete transaction gets replayed back
  418. * into the main filesystem.
  419. */
  420. while (1) {
  421. int flags;
  422. char * tagp;
  423. journal_block_tag_t tag;
  424. struct buffer_head * obh;
  425. struct buffer_head * nbh;
  426. cond_resched();
  427. /* If we already know where to stop the log traversal,
  428. * check right now that we haven't gone past the end of
  429. * the log. */
  430. if (pass != PASS_SCAN)
  431. if (tid_geq(next_commit_ID, info->end_transaction))
  432. break;
  433. jbd2_debug(2, "Scanning for sequence ID %u at %lu/%lu\n",
  434. next_commit_ID, next_log_block, journal->j_last);
  435. /* Skip over each chunk of the transaction looking
  436. * either the next descriptor block or the final commit
  437. * record. */
  438. jbd2_debug(3, "JBD2: checking block %ld\n", next_log_block);
  439. err = jread(&bh, journal, next_log_block);
  440. if (err)
  441. goto failed;
  442. next_log_block++;
  443. wrap(journal, next_log_block);
  444. /* What kind of buffer is it?
  445. *
  446. * If it is a descriptor block, check that it has the
  447. * expected sequence number. Otherwise, we're all done
  448. * here. */
  449. tmp = (journal_header_t *)bh->b_data;
  450. if (tmp->h_magic != cpu_to_be32(JBD2_MAGIC_NUMBER)) {
  451. brelse(bh);
  452. break;
  453. }
  454. blocktype = be32_to_cpu(tmp->h_blocktype);
  455. sequence = be32_to_cpu(tmp->h_sequence);
  456. jbd2_debug(3, "Found magic %d, sequence %d\n",
  457. blocktype, sequence);
  458. if (sequence != next_commit_ID) {
  459. brelse(bh);
  460. break;
  461. }
  462. /* OK, we have a valid descriptor block which matches
  463. * all of the sequence number checks. What are we going
  464. * to do with it? That depends on the pass... */
  465. switch(blocktype) {
  466. case JBD2_DESCRIPTOR_BLOCK:
  467. /* Verify checksum first */
  468. if (jbd2_journal_has_csum_v2or3(journal))
  469. descr_csum_size =
  470. sizeof(struct jbd2_journal_block_tail);
  471. if (descr_csum_size > 0 &&
  472. !jbd2_descriptor_block_csum_verify(journal,
  473. bh->b_data)) {
  474. /*
  475. * PASS_SCAN can see stale blocks due to lazy
  476. * journal init. Don't error out on those yet.
  477. */
  478. if (pass != PASS_SCAN) {
  479. pr_err("JBD2: Invalid checksum recovering block %lu in log\n",
  480. next_log_block);
  481. err = -EFSBADCRC;
  482. brelse(bh);
  483. goto failed;
  484. }
  485. need_check_commit_time = true;
  486. jbd2_debug(1,
  487. "invalid descriptor block found in %lu\n",
  488. next_log_block);
  489. }
  490. /* If it is a valid descriptor block, replay it
  491. * in pass REPLAY; if journal_checksums enabled, then
  492. * calculate checksums in PASS_SCAN, otherwise,
  493. * just skip over the blocks it describes. */
  494. if (pass != PASS_REPLAY) {
  495. if (pass == PASS_SCAN &&
  496. jbd2_has_feature_checksum(journal) &&
  497. !need_check_commit_time &&
  498. !info->end_transaction) {
  499. if (calc_chksums(journal, bh,
  500. &next_log_block,
  501. &crc32_sum)) {
  502. put_bh(bh);
  503. break;
  504. }
  505. put_bh(bh);
  506. continue;
  507. }
  508. next_log_block += count_tags(journal, bh);
  509. wrap(journal, next_log_block);
  510. put_bh(bh);
  511. continue;
  512. }
  513. /* A descriptor block: we can now write all of
  514. * the data blocks. Yay, useful work is finally
  515. * getting done here! */
  516. tagp = &bh->b_data[sizeof(journal_header_t)];
  517. while ((tagp - bh->b_data + tag_bytes)
  518. <= journal->j_blocksize - descr_csum_size) {
  519. unsigned long io_block;
  520. memcpy(&tag, tagp, sizeof(tag));
  521. flags = be16_to_cpu(tag.t_flags);
  522. io_block = next_log_block++;
  523. wrap(journal, next_log_block);
  524. err = jread(&obh, journal, io_block);
  525. if (err) {
  526. /* Recover what we can, but
  527. * report failure at the end. */
  528. success = err;
  529. printk(KERN_ERR
  530. "JBD2: IO error %d recovering "
  531. "block %ld in log\n",
  532. err, io_block);
  533. } else {
  534. unsigned long long blocknr;
  535. J_ASSERT(obh != NULL);
  536. blocknr = read_tag_block(journal,
  537. &tag);
  538. /* If the block has been
  539. * revoked, then we're all done
  540. * here. */
  541. if (jbd2_journal_test_revoke
  542. (journal, blocknr,
  543. next_commit_ID)) {
  544. brelse(obh);
  545. ++info->nr_revoke_hits;
  546. goto skip_write;
  547. }
  548. /* Look for block corruption */
  549. if (!jbd2_block_tag_csum_verify(
  550. journal, &tag, (journal_block_tag3_t *)tagp,
  551. obh->b_data, be32_to_cpu(tmp->h_sequence))) {
  552. brelse(obh);
  553. success = -EFSBADCRC;
  554. printk(KERN_ERR "JBD2: Invalid "
  555. "checksum recovering "
  556. "data block %llu in "
  557. "log\n", blocknr);
  558. block_error = 1;
  559. goto skip_write;
  560. }
  561. /* Find a buffer for the new
  562. * data being restored */
  563. nbh = __getblk(journal->j_fs_dev,
  564. blocknr,
  565. journal->j_blocksize);
  566. if (nbh == NULL) {
  567. printk(KERN_ERR
  568. "JBD2: Out of memory "
  569. "during recovery.\n");
  570. err = -ENOMEM;
  571. brelse(bh);
  572. brelse(obh);
  573. goto failed;
  574. }
  575. lock_buffer(nbh);
  576. memcpy(nbh->b_data, obh->b_data,
  577. journal->j_blocksize);
  578. if (flags & JBD2_FLAG_ESCAPE) {
  579. *((__be32 *)nbh->b_data) =
  580. cpu_to_be32(JBD2_MAGIC_NUMBER);
  581. }
  582. BUFFER_TRACE(nbh, "marking dirty");
  583. set_buffer_uptodate(nbh);
  584. mark_buffer_dirty(nbh);
  585. BUFFER_TRACE(nbh, "marking uptodate");
  586. ++info->nr_replays;
  587. unlock_buffer(nbh);
  588. brelse(obh);
  589. brelse(nbh);
  590. }
  591. skip_write:
  592. tagp += tag_bytes;
  593. if (!(flags & JBD2_FLAG_SAME_UUID))
  594. tagp += 16;
  595. if (flags & JBD2_FLAG_LAST_TAG)
  596. break;
  597. }
  598. brelse(bh);
  599. continue;
  600. case JBD2_COMMIT_BLOCK:
  601. /* How to differentiate between interrupted commit
  602. * and journal corruption ?
  603. *
  604. * {nth transaction}
  605. * Checksum Verification Failed
  606. * |
  607. * ____________________
  608. * | |
  609. * async_commit sync_commit
  610. * | |
  611. * | GO TO NEXT "Journal Corruption"
  612. * | TRANSACTION
  613. * |
  614. * {(n+1)th transanction}
  615. * |
  616. * _______|______________
  617. * | |
  618. * Commit block found Commit block not found
  619. * | |
  620. * "Journal Corruption" |
  621. * _____________|_________
  622. * | |
  623. * nth trans corrupt OR nth trans
  624. * and (n+1)th interrupted interrupted
  625. * before commit block
  626. * could reach the disk.
  627. * (Cannot find the difference in above
  628. * mentioned conditions. Hence assume
  629. * "Interrupted Commit".)
  630. */
  631. commit_time = be64_to_cpu(
  632. ((struct commit_header *)bh->b_data)->h_commit_sec);
  633. /*
  634. * If need_check_commit_time is set, it means we are in
  635. * PASS_SCAN and csum verify failed before. If
  636. * commit_time is increasing, it's the same journal,
  637. * otherwise it is stale journal block, just end this
  638. * recovery.
  639. */
  640. if (need_check_commit_time) {
  641. if (commit_time >= last_trans_commit_time) {
  642. pr_err("JBD2: Invalid checksum found in transaction %u\n",
  643. next_commit_ID);
  644. err = -EFSBADCRC;
  645. brelse(bh);
  646. goto failed;
  647. }
  648. ignore_crc_mismatch:
  649. /*
  650. * It likely does not belong to same journal,
  651. * just end this recovery with success.
  652. */
  653. jbd2_debug(1, "JBD2: Invalid checksum ignored in transaction %u, likely stale data\n",
  654. next_commit_ID);
  655. brelse(bh);
  656. goto done;
  657. }
  658. /*
  659. * Found an expected commit block: if checksums
  660. * are present, verify them in PASS_SCAN; else not
  661. * much to do other than move on to the next sequence
  662. * number.
  663. */
  664. if (pass == PASS_SCAN &&
  665. jbd2_has_feature_checksum(journal)) {
  666. struct commit_header *cbh =
  667. (struct commit_header *)bh->b_data;
  668. unsigned found_chksum =
  669. be32_to_cpu(cbh->h_chksum[0]);
  670. if (info->end_transaction) {
  671. journal->j_failed_commit =
  672. info->end_transaction;
  673. brelse(bh);
  674. break;
  675. }
  676. /* Neither checksum match nor unused? */
  677. if (!((crc32_sum == found_chksum &&
  678. cbh->h_chksum_type ==
  679. JBD2_CRC32_CHKSUM &&
  680. cbh->h_chksum_size ==
  681. JBD2_CRC32_CHKSUM_SIZE) ||
  682. (cbh->h_chksum_type == 0 &&
  683. cbh->h_chksum_size == 0 &&
  684. found_chksum == 0)))
  685. goto chksum_error;
  686. crc32_sum = ~0;
  687. }
  688. if (pass == PASS_SCAN &&
  689. !jbd2_commit_block_csum_verify(journal,
  690. bh->b_data)) {
  691. chksum_error:
  692. if (commit_time < last_trans_commit_time)
  693. goto ignore_crc_mismatch;
  694. info->end_transaction = next_commit_ID;
  695. if (!jbd2_has_feature_async_commit(journal)) {
  696. journal->j_failed_commit =
  697. next_commit_ID;
  698. brelse(bh);
  699. break;
  700. }
  701. }
  702. if (pass == PASS_SCAN)
  703. last_trans_commit_time = commit_time;
  704. brelse(bh);
  705. next_commit_ID++;
  706. continue;
  707. case JBD2_REVOKE_BLOCK:
  708. /*
  709. * Check revoke block crc in pass_scan, if csum verify
  710. * failed, check commit block time later.
  711. */
  712. if (pass == PASS_SCAN &&
  713. !jbd2_descriptor_block_csum_verify(journal,
  714. bh->b_data)) {
  715. jbd2_debug(1, "JBD2: invalid revoke block found in %lu\n",
  716. next_log_block);
  717. need_check_commit_time = true;
  718. }
  719. /* If we aren't in the REVOKE pass, then we can
  720. * just skip over this block. */
  721. if (pass != PASS_REVOKE) {
  722. brelse(bh);
  723. continue;
  724. }
  725. err = scan_revoke_records(journal, bh,
  726. next_commit_ID, info);
  727. brelse(bh);
  728. if (err)
  729. goto failed;
  730. continue;
  731. default:
  732. jbd2_debug(3, "Unrecognised magic %d, end of scan.\n",
  733. blocktype);
  734. brelse(bh);
  735. goto done;
  736. }
  737. }
  738. done:
  739. /*
  740. * We broke out of the log scan loop: either we came to the
  741. * known end of the log or we found an unexpected block in the
  742. * log. If the latter happened, then we know that the "current"
  743. * transaction marks the end of the valid log.
  744. */
  745. if (pass == PASS_SCAN) {
  746. if (!info->end_transaction)
  747. info->end_transaction = next_commit_ID;
  748. } else {
  749. /* It's really bad news if different passes end up at
  750. * different places (but possible due to IO errors). */
  751. if (info->end_transaction != next_commit_ID) {
  752. printk(KERN_ERR "JBD2: recovery pass %d ended at "
  753. "transaction %u, expected %u\n",
  754. pass, next_commit_ID, info->end_transaction);
  755. if (!success)
  756. success = -EIO;
  757. }
  758. }
  759. if (jbd2_has_feature_fast_commit(journal) && pass != PASS_REVOKE) {
  760. err = fc_do_one_pass(journal, info, pass);
  761. if (err)
  762. success = err;
  763. }
  764. if (block_error && success == 0)
  765. success = -EIO;
  766. return success;
  767. failed:
  768. return err;
  769. }
  770. /* Scan a revoke record, marking all blocks mentioned as revoked. */
  771. static int scan_revoke_records(journal_t *journal, struct buffer_head *bh,
  772. tid_t sequence, struct recovery_info *info)
  773. {
  774. jbd2_journal_revoke_header_t *header;
  775. int offset, max;
  776. unsigned csum_size = 0;
  777. __u32 rcount;
  778. int record_len = 4;
  779. header = (jbd2_journal_revoke_header_t *) bh->b_data;
  780. offset = sizeof(jbd2_journal_revoke_header_t);
  781. rcount = be32_to_cpu(header->r_count);
  782. if (jbd2_journal_has_csum_v2or3(journal))
  783. csum_size = sizeof(struct jbd2_journal_block_tail);
  784. if (rcount > journal->j_blocksize - csum_size)
  785. return -EINVAL;
  786. max = rcount;
  787. if (jbd2_has_feature_64bit(journal))
  788. record_len = 8;
  789. while (offset + record_len <= max) {
  790. unsigned long long blocknr;
  791. int err;
  792. if (record_len == 4)
  793. blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset)));
  794. else
  795. blocknr = be64_to_cpu(* ((__be64 *) (bh->b_data+offset)));
  796. offset += record_len;
  797. err = jbd2_journal_set_revoke(journal, blocknr, sequence);
  798. if (err)
  799. return err;
  800. ++info->nr_revokes;
  801. }
  802. return 0;
  803. }