xfs: add support for large btree blocks

Add support for larger btree blocks that contains a CRC32C checksum,
a filesystem uuid and block number for detecting filesystem
consistency and out of place writes.

[dchinner@redhat.com] Also include an owner field to allow reverse
mappings to be implemented for improved repairability and a LSN
field to so that log recovery can easily determine the last
modification that made it to disk for each buffer.

[dchinner@redhat.com] Add buffer log format flags to indicate the
type of buffer to recovery so that we don't have to do blind magic
number tests to determine what the buffer is.

[dchinner@redhat.com] Modified to fit into the verifier structure.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Ben Myers <bpm@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
This commit is contained in:
Christoph Hellwig
2013-04-21 14:53:46 -05:00
committed by Ben Myers
parent a2050646f6
commit ee1a47ab0e
17 changed files with 642 additions and 206 deletions

View File

@@ -30,9 +30,11 @@
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_buf_item.h"
#include "xfs_btree.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
/*
* Cursor allocation zone.
@@ -42,9 +44,13 @@ kmem_zone_t *xfs_btree_cur_zone;
/*
* Btree magic numbers.
*/
const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC },
{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC }
};
#define xfs_btree_magic(cur) \
xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
STATIC int /* error (0 or EFSCORRUPTED) */
@@ -54,30 +60,38 @@ xfs_btree_check_lblock(
int level, /* level of the btree block */
struct xfs_buf *bp) /* buffer for block, if any */
{
int lblock_ok; /* block passes checks */
int lblock_ok = 1; /* block passes checks */
struct xfs_mount *mp; /* file system mount point */
mp = cur->bc_mp;
lblock_ok =
be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
if (xfs_sb_version_hascrc(&mp->m_sb)) {
lblock_ok = lblock_ok &&
uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
block->bb_u.l.bb_blkno == cpu_to_be64(
bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
}
lblock_ok = lblock_ok &&
be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
be16_to_cpu(block->bb_level) == level &&
be16_to_cpu(block->bb_numrecs) <=
cur->bc_ops->get_maxrecs(cur, level) &&
block->bb_u.l.bb_leftsib &&
(block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
XFS_FSB_SANITY_CHECK(mp,
be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
block->bb_u.l.bb_rightsib &&
(block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
XFS_FSB_SANITY_CHECK(mp,
be64_to_cpu(block->bb_u.l.bb_rightsib)));
be64_to_cpu(block->bb_u.l.bb_rightsib)));
if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
XFS_ERRTAG_BTREE_CHECK_LBLOCK,
XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
XFS_ERROR_REPORT("xfs_btree_check_lblock", XFS_ERRLEVEL_LOW,
mp);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return XFS_ERROR(EFSCORRUPTED);
}
return 0;
@@ -90,16 +104,26 @@ xfs_btree_check_sblock(
int level, /* level of the btree block */
struct xfs_buf *bp) /* buffer containing block */
{
struct xfs_mount *mp; /* file system mount point */
struct xfs_buf *agbp; /* buffer for ag. freespace struct */
struct xfs_agf *agf; /* ag. freespace structure */
xfs_agblock_t agflen; /* native ag. freespace length */
int sblock_ok; /* block passes checks */
int sblock_ok = 1; /* block passes checks */
mp = cur->bc_mp;
agbp = cur->bc_private.a.agbp;
agf = XFS_BUF_TO_AGF(agbp);
agflen = be32_to_cpu(agf->agf_length);
sblock_ok =
be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
if (xfs_sb_version_hascrc(&mp->m_sb)) {
sblock_ok = sblock_ok &&
uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
block->bb_u.s.bb_blkno == cpu_to_be64(
bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
}
sblock_ok = sblock_ok &&
be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
be16_to_cpu(block->bb_level) == level &&
be16_to_cpu(block->bb_numrecs) <=
cur->bc_ops->get_maxrecs(cur, level) &&
@@ -109,13 +133,13 @@ xfs_btree_check_sblock(
(block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
block->bb_u.s.bb_rightsib;
if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
XFS_ERRTAG_BTREE_CHECK_SBLOCK,
XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
if (bp)
trace_xfs_btree_corrupt(bp, _RET_IP_);
XFS_CORRUPTION_ERROR("xfs_btree_check_sblock",
XFS_ERRLEVEL_LOW, cur->bc_mp, block);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
return XFS_ERROR(EFSCORRUPTED);
}
return 0;
@@ -193,6 +217,72 @@ xfs_btree_check_ptr(
}
#endif
/*
* Calculate CRC on the whole btree block and stuff it into the
* long-form btree header.
*
* Prior to calculting the CRC, pull the LSN out of the buffer log item and put
* it into the buffer so recovery knows what the last modifcation was that made
* it to disk.
*/
void
xfs_btree_lblock_calc_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_buf_log_item *bip = bp->b_fspriv;
if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
return;
if (bip)
block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
XFS_BTREE_LBLOCK_CRC_OFF);
}
bool
xfs_btree_lblock_verify_crc(
struct xfs_buf *bp)
{
if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
XFS_BTREE_LBLOCK_CRC_OFF);
return true;
}
/*
* Calculate CRC on the whole btree block and stuff it into the
* short-form btree header.
*
* Prior to calculting the CRC, pull the LSN out of the buffer log item and put
* it into the buffer so recovery knows what the last modifcation was that made
* it to disk.
*/
void
xfs_btree_sblock_calc_crc(
struct xfs_buf *bp)
{
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
struct xfs_buf_log_item *bip = bp->b_fspriv;
if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
return;
if (bip)
block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
xfs_update_cksum(bp->b_addr, BBTOB(bp->b_length),
XFS_BTREE_SBLOCK_CRC_OFF);
}
bool
xfs_btree_sblock_verify_crc(
struct xfs_buf *bp)
{
if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
return xfs_verify_cksum(bp->b_addr, BBTOB(bp->b_length),
XFS_BTREE_SBLOCK_CRC_OFF);
return true;
}
/*
* Delete the btree cursor.
*/
@@ -277,10 +367,8 @@ xfs_btree_dup_cursor(
*ncur = NULL;
return error;
}
new->bc_bufs[i] = bp;
ASSERT(!xfs_buf_geterror(bp));
} else
new->bc_bufs[i] = NULL;
}
new->bc_bufs[i] = bp;
}
*ncur = new;
return 0;
@@ -321,9 +409,14 @@ xfs_btree_dup_cursor(
*/
static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
{
return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
XFS_BTREE_LBLOCK_LEN :
XFS_BTREE_SBLOCK_LEN;
if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
return XFS_BTREE_LBLOCK_CRC_LEN;
return XFS_BTREE_LBLOCK_LEN;
}
if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
return XFS_BTREE_SBLOCK_CRC_LEN;
return XFS_BTREE_SBLOCK_LEN;
}
/*
@@ -862,6 +955,44 @@ xfs_btree_set_sibling(
}
}
void
xfs_btree_init_block_int(
struct xfs_mount *mp,
struct xfs_btree_block *buf,
xfs_daddr_t blkno,
__u32 magic,
__u16 level,
__u16 numrecs,
__u64 owner,
unsigned int flags)
{
buf->bb_magic = cpu_to_be32(magic);
buf->bb_level = cpu_to_be16(level);
buf->bb_numrecs = cpu_to_be16(numrecs);
if (flags & XFS_BTREE_LONG_PTRS) {
buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
if (flags & XFS_BTREE_CRC_BLOCKS) {
buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
buf->bb_u.l.bb_owner = cpu_to_be64(owner);
uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
buf->bb_u.l.bb_pad = 0;
}
} else {
/* owner is a 32 bit value on short blocks */
__u32 __owner = (__u32)owner;
buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
if (flags & XFS_BTREE_CRC_BLOCKS) {
buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
}
}
}
void
xfs_btree_init_block(
struct xfs_mount *mp,
@@ -869,37 +1000,41 @@ xfs_btree_init_block(
__u32 magic,
__u16 level,
__u16 numrecs,
__u64 owner,
unsigned int flags)
{
struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp);
new->bb_magic = cpu_to_be32(magic);
new->bb_level = cpu_to_be16(level);
new->bb_numrecs = cpu_to_be16(numrecs);
if (flags & XFS_BTREE_LONG_PTRS) {
new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
} else {
new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
}
xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
magic, level, numrecs, owner, flags);
}
STATIC void
xfs_btree_init_block_cur(
struct xfs_btree_cur *cur,
struct xfs_buf *bp,
int level,
int numrecs,
struct xfs_buf *bp)
int numrecs)
{
xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum],
level, numrecs, cur->bc_flags);
__u64 owner;
/*
* we can pull the owner from the cursor right now as the different
* owners align directly with the pointer size of the btree. This may
* change in future, but is safe for current users of the generic btree
* code.
*/
if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
owner = cur->bc_private.b.ip->i_ino;
else
owner = cur->bc_private.a.agno;
xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
xfs_btree_magic(cur), level, numrecs,
owner, cur->bc_flags);
}
/*
* Return true if ptr is the last record in the btree and
* we need to track updateѕ to this record. The decision
* we need to track updates to this record. The decision
* will be further refined in the update_lastrec method.
*/
STATIC int
@@ -1147,6 +1282,7 @@ xfs_btree_log_keys(
XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
if (bp) {
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLF_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp,
xfs_btree_key_offset(cur, first),
xfs_btree_key_offset(cur, last + 1) - 1);
@@ -1171,6 +1307,7 @@ xfs_btree_log_recs(
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLF_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp,
xfs_btree_rec_offset(cur, first),
xfs_btree_rec_offset(cur, last + 1) - 1);
@@ -1195,6 +1332,7 @@ xfs_btree_log_ptrs(
struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
int level = xfs_btree_get_level(block);
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLF_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp,
xfs_btree_ptr_offset(cur, first, level),
xfs_btree_ptr_offset(cur, last + 1, level) - 1);
@@ -1223,7 +1361,12 @@ xfs_btree_log_block(
offsetof(struct xfs_btree_block, bb_numrecs),
offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
XFS_BTREE_SBLOCK_LEN
offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
XFS_BTREE_SBLOCK_CRC_LEN
};
static const short loffsets[] = { /* table of offsets (long) */
offsetof(struct xfs_btree_block, bb_magic),
@@ -1231,17 +1374,40 @@ xfs_btree_log_block(
offsetof(struct xfs_btree_block, bb_numrecs),
offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
XFS_BTREE_LBLOCK_LEN
offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
XFS_BTREE_LBLOCK_CRC_LEN
};
XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
if (bp) {
int nbits;
if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
/*
* We don't log the CRC when updating a btree
* block but instead recreate it during log
* recovery. As the log buffers have checksums
* of their own this is safe and avoids logging a crc
* update in a lot of places.
*/
if (fields == XFS_BB_ALL_BITS)
fields = XFS_BB_ALL_BITS_CRC;
nbits = XFS_BB_NUM_BITS_CRC;
} else {
nbits = XFS_BB_NUM_BITS;
}
xfs_btree_offsets(fields,
(cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
loffsets : soffsets,
XFS_BB_NUM_BITS, &first, &last);
nbits, &first, &last);
xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLF_BTREE_BUF);
xfs_trans_log_buf(cur->bc_tp, bp, first, last);
} else {
xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
@@ -2204,7 +2370,7 @@ xfs_btree_split(
goto error0;
/* Fill in the btree header for the new right block. */
xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp);
xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
/*
* Split the entries between the old and the new block evenly.
@@ -2513,7 +2679,7 @@ xfs_btree_new_root(
nptr = 2;
}
/* Fill in the new block's btree header and log it. */
xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp);
xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
!xfs_btree_ptr_is_null(cur, &rptr));