Diffstat (limited to 'fs/xfs/xfs_log_recover.c')
 fs/xfs/xfs_log_recover.c | 2478 ++++++++++++++++++++++++++++++---------------
 1 file changed, 1547 insertions(+), 931 deletions(-)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 966d3f97458..981af0f6504 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -17,42 +17,65 @@
*/
#include "xfs.h"
#include "xfs_fs.h"
-#include "xfs_types.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
#include "xfs_bit.h"
-#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_error.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
+#include "xfs_da_format.h"
#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
+#include "xfs_trans.h"
+#include "xfs_log.h"
#include "xfs_log_priv.h"
-#include "xfs_buf_item.h"
#include "xfs_log_recover.h"
+#include "xfs_inode_item.h"
#include "xfs_extfree_item.h"
#include "xfs_trans_priv.h"
+#include "xfs_alloc.h"
+#include "xfs_ialloc.h"
#include "xfs_quota.h"
-#include "xfs_rw.h"
-#include "xfs_utils.h"
+#include "xfs_cksum.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
+#include "xfs_dir2.h"
+
+#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1)
-STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
-STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
+STATIC int
+xlog_find_zeroed(
+ struct xlog *,
+ xfs_daddr_t *);
+STATIC int
+xlog_clear_stale_blocks(
+ struct xlog *,
+ xfs_lsn_t);
#if defined(DEBUG)
-STATIC void xlog_recover_check_summary(xlog_t *);
+STATIC void
+xlog_recover_check_summary(
+ struct xlog *);
#else
#define xlog_recover_check_summary(log)
#endif
/*
+ * This structure is used during recovery to record the buf log items which
+ * have been canceled and should not be replayed.
+ */
+struct xfs_buf_cancel {
+ xfs_daddr_t bc_blkno;
+ uint bc_len;
+ int bc_refcount;
+ struct list_head bc_list;
+};
+
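The table itself is an array of list heads hashed by block number; a minimal sketch of the bucket macro used below, assuming it is defined in xfs_log_priv.h next to XLOG_BC_TABLE_SIZE:

#define XLOG_BUF_CANCEL_BUCKET(log, blkno) \
	((log)->l_buf_cancel_table + ((__uint64_t)(blkno) % XLOG_BC_TABLE_SIZE))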
+/*
* Sector aligned buffer routines for buffer create/read/write/access
*/
@@ -64,7 +87,7 @@ STATIC void xlog_recover_check_summary(xlog_t *);
static inline int
xlog_buf_bbcount_valid(
- xlog_t *log,
+ struct xlog *log,
int bbcount)
{
return bbcount > 0 && bbcount <= log->l_logBBsize;
@@ -77,11 +100,13 @@ xlog_buf_bbcount_valid(
*/
STATIC xfs_buf_t *
xlog_get_bp(
- xlog_t *log,
+ struct xlog *log,
int nbblks)
{
+ struct xfs_buf *bp;
+
if (!xlog_buf_bbcount_valid(log, nbblks)) {
- xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
+ xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
nbblks);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
return NULL;
@@ -90,7 +115,7 @@ xlog_get_bp(
/*
* We do log I/O in units of log sectors (a power-of-2
* multiple of the basic block size), so we round up the
- * requested size to acommodate the basic blocks required
+ * requested size to accommodate the basic blocks required
* for complete log sectors.
*
* In addition, the buffer may be used for a non-sector-
@@ -101,14 +126,16 @@ xlog_get_bp(
* an issue. Nor will this be a problem if the log I/O is
* done in basic blocks (sector size 1). But otherwise we
* extend the buffer by one extra log sector to ensure
- * there's space to accomodate this possiblility.
+ * there's space to accommodate this possibility.
*/
if (nbblks > 1 && log->l_sectBBsize > 1)
nbblks += log->l_sectBBsize;
nbblks = round_up(nbblks, log->l_sectBBsize);
- return xfs_buf_get_uncached(log->l_mp->m_logdev_targp,
- BBTOB(nbblks), 0);
+ bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0);
+ if (bp)
+ xfs_buf_unlock(bp);
+ return bp;
}
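A worked example of the sizing logic above, with hypothetical values: a 5-block request on a log with 8-basic-block sectors is padded by one sector for possible misalignment and then rounded up to a sector multiple.

	int nbblks = 5;				/* requested basic blocks */
	int sectBBsize = 8;			/* log sector size (l_sectBBsize) */

	if (nbblks > 1 && sectBBsize > 1)
		nbblks += sectBBsize;		/* 5 + 8 = 13 */
	nbblks = round_up(nbblks, sectBBsize);	/* allocate 16 basic blocks */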
STATIC void
@@ -124,15 +151,15 @@ xlog_put_bp(
*/
STATIC xfs_caddr_t
xlog_align(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
- ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
- return XFS_BUF_PTR(bp) + BBTOB(offset);
+ ASSERT(offset + nbblks <= bp->b_length);
+ return bp->b_addr + BBTOB(offset);
}
@@ -141,15 +168,15 @@ xlog_align(
*/
STATIC int
xlog_bread_noalign(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
int error;
if (!xlog_buf_bbcount_valid(log, nbblks)) {
- xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
+ xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
nbblks);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
return EFSCORRUPTED;
@@ -159,28 +186,29 @@ xlog_bread_noalign(
nbblks = round_up(nbblks, log->l_sectBBsize);
ASSERT(nbblks > 0);
- ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
+ ASSERT(nbblks <= bp->b_length);
XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
XFS_BUF_READ(bp);
- XFS_BUF_BUSY(bp);
- XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
- XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
+ bp->b_io_length = nbblks;
+ bp->b_error = 0;
- xfsbdstrat(log->l_mp, bp);
+ if (XFS_FORCED_SHUTDOWN(log->l_mp))
+ return XFS_ERROR(EIO);
+
+ xfs_buf_iorequest(bp);
error = xfs_buf_iowait(bp);
if (error)
- xfs_ioerror_alert("xlog_bread", log->l_mp,
- bp, XFS_BUF_ADDR(bp));
+ xfs_buf_ioerror_alert(bp, __func__);
return error;
}
STATIC int
xlog_bread(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
xfs_caddr_t *offset)
{
int error;
@@ -194,21 +222,50 @@ xlog_bread(
}
/*
+ * Read at an offset into the buffer. Returns with the buffer in its original
+ * state regardless of the result of the read.
+ */
+STATIC int
+xlog_bread_offset(
+ struct xlog *log,
+ xfs_daddr_t blk_no, /* block to read from */
+ int nbblks, /* blocks to read */
+ struct xfs_buf *bp,
+ xfs_caddr_t offset)
+{
+ xfs_caddr_t orig_offset = bp->b_addr;
+ int orig_len = BBTOB(bp->b_length);
+ int error, error2;
+
+ error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
+ if (error)
+ return error;
+
+ error = xlog_bread_noalign(log, blk_no, nbblks, bp);
+
+ /* must reset buffer pointer even on error */
+ error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
+ if (error)
+ return error;
+ return error2;
+}
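This helper exists so callers can read into the middle of an already-mapped buffer; the sketch below mirrors the call site in xlog_write_log_records() further down, where one sector is re-read over the tail of the write region:

	xfs_caddr_t	offset = bp->b_addr + BBTOB(ealign - start_block);
	int		error;

	error = xlog_bread_offset(log, ealign, sectbb, bp, offset);
	if (error)
		break;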
+
+/*
* Write out the buffer at the given block for the given number of blocks.
* The buffer is kept locked across the write and is returned locked.
* This can only be used for synchronous log writes.
*/
STATIC int
xlog_bwrite(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t blk_no,
int nbblks,
- xfs_buf_t *bp)
+ struct xfs_buf *bp)
{
int error;
if (!xlog_buf_bbcount_valid(log, nbblks)) {
- xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
+ xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
nbblks);
XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
return EFSCORRUPTED;
@@ -218,19 +275,19 @@ xlog_bwrite(
nbblks = round_up(nbblks, log->l_sectBBsize);
ASSERT(nbblks > 0);
- ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
+ ASSERT(nbblks <= bp->b_length);
XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
XFS_BUF_ZEROFLAGS(bp);
- XFS_BUF_BUSY(bp);
- XFS_BUF_HOLD(bp);
- XFS_BUF_PSEMA(bp, PRIBIO);
- XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
- XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
-
- if ((error = xfs_bwrite(log->l_mp, bp)))
- xfs_ioerror_alert("xlog_bwrite", log->l_mp,
- bp, XFS_BUF_ADDR(bp));
+ xfs_buf_hold(bp);
+ xfs_buf_lock(bp);
+ bp->b_io_length = nbblks;
+ bp->b_error = 0;
+
+ error = xfs_bwrite(bp);
+ if (error)
+ xfs_buf_ioerror_alert(bp, __func__);
+ xfs_buf_relse(bp);
return error;
}
@@ -243,9 +300,9 @@ xlog_header_check_dump(
xfs_mount_t *mp,
xlog_rec_header_t *head)
{
- cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n",
+ xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d",
__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
- cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n",
+ xfs_debug(mp, " log : uuid = %pU, fmt = %d",
&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
}
#else
@@ -260,23 +317,23 @@ xlog_header_check_recover(
xfs_mount_t *mp,
xlog_rec_header_t *head)
{
- ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
+ ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
/*
* IRIX doesn't write the h_fmt field and leaves it zeroed
* (XLOG_FMT_UNKNOWN). This stops us from trying to recover
* a dirty log created in IRIX.
*/
- if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
- xlog_warn(
- "XFS: dirty log written in incompatible format - can't recover");
+ if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
+ xfs_warn(mp,
+ "dirty log written in incompatible format - can't recover");
xlog_header_check_dump(mp, head);
XFS_ERROR_REPORT("xlog_header_check_recover(1)",
XFS_ERRLEVEL_HIGH, mp);
return XFS_ERROR(EFSCORRUPTED);
} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
- xlog_warn(
- "XFS: dirty log entry has mismatched uuid - can't recover");
+ xfs_warn(mp,
+ "dirty log entry has mismatched uuid - can't recover");
xlog_header_check_dump(mp, head);
XFS_ERROR_REPORT("xlog_header_check_recover(2)",
XFS_ERRLEVEL_HIGH, mp);
@@ -293,7 +350,7 @@ xlog_header_check_mount(
xfs_mount_t *mp,
xlog_rec_header_t *head)
{
- ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
+ ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
if (uuid_is_nil(&head->h_fs_uuid)) {
/*
@@ -301,9 +358,9 @@ xlog_header_check_mount(
* h_fs_uuid is nil, we assume this log was last mounted
* by IRIX and continue.
*/
- xlog_warn("XFS: nil uuid in log - IRIX style log");
+ xfs_warn(mp, "nil uuid in log - IRIX style log");
} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
- xlog_warn("XFS: log has mismatched uuid - can't recover");
+ xfs_warn(mp, "log has mismatched uuid - can't recover");
xlog_header_check_dump(mp, head);
XFS_ERROR_REPORT("xlog_header_check_mount",
XFS_ERRLEVEL_HIGH, mp);
@@ -316,18 +373,16 @@ STATIC void
xlog_recover_iodone(
struct xfs_buf *bp)
{
- if (XFS_BUF_GETERROR(bp)) {
+ if (bp->b_error) {
/*
* We're not going to bother about retrying
* this during recovery. One strike!
*/
- xfs_ioerror_alert("xlog_recover_iodone",
- bp->b_target->bt_mount, bp,
- XFS_BUF_ADDR(bp));
+ xfs_buf_ioerror_alert(bp, __func__);
xfs_force_shutdown(bp->b_target->bt_mount,
SHUTDOWN_META_IO_ERROR);
}
- XFS_BUF_CLR_IODONE_FUNC(bp);
+ bp->b_iodone = NULL;
xfs_buf_ioend(bp, 0);
}
@@ -339,8 +394,8 @@ xlog_recover_iodone(
*/
STATIC int
xlog_find_cycle_start(
- xlog_t *log,
- xfs_buf_t *bp,
+ struct xlog *log,
+ struct xfs_buf *bp,
xfs_daddr_t first_blk,
xfs_daddr_t *last_blk,
uint cycle)
@@ -382,7 +437,7 @@ xlog_find_cycle_start(
*/
STATIC int
xlog_find_verify_cycle(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t start_blk,
int nbblks,
uint stop_on_cycle_no,
@@ -402,6 +457,8 @@ xlog_find_verify_cycle(
* a log sector, or we're out of luck.
*/
bufblks = 1 << ffs(nbblks);
+ while (bufblks > log->l_logBBsize)
+ bufblks >>= 1;
while (!(bp = xlog_get_bp(log, bufblks))) {
bufblks >>= 1;
if (bufblks < log->l_sectBBsize)
@@ -449,7 +506,7 @@ out:
*/
STATIC int
xlog_find_verify_log_record(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t start_blk,
xfs_daddr_t *last_blk,
int extra_bblks)
@@ -479,8 +536,8 @@ xlog_find_verify_log_record(
for (i = (*last_blk) - 1; i >= 0; i--) {
if (i < start_blk) {
/* valid log record not found */
- xlog_warn(
- "XFS: Log inconsistent (didn't find previous header)");
+ xfs_warn(log->l_mp,
+ "Log inconsistent (didn't find previous header)");
ASSERT(0);
error = XFS_ERROR(EIO);
goto out;
@@ -494,7 +551,7 @@ xlog_find_verify_log_record(
head = (xlog_rec_header_t *)offset;
- if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno))
+ if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
break;
if (!smallmem)
@@ -546,7 +603,7 @@ out:
/*
* Head is defined to be the point of the log where the next log write
- * write could go. This means that incomplete LR writes at the end are
+ * could go. This means that incomplete LR writes at the end are
* eliminated when calculating the head. We aren't guaranteed that previous
* LR have complete transactions. We only know that a cycle number of
* current cycle number -1 won't be present in the log if we start writing
@@ -559,7 +616,7 @@ out:
*/
STATIC int
xlog_find_head(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t *return_head_blk)
{
xfs_buf_t *bp;
@@ -580,12 +637,12 @@ xlog_find_head(
* mkfs etc write a dummy unmount record to a fresh
* log so we can store the uuid in there
*/
- xlog_warn("XFS: totally zeroed log");
+ xfs_warn(log->l_mp, "totally zeroed log");
}
return 0;
} else if (error) {
- xlog_warn("XFS: empty log check failed");
+ xfs_warn(log->l_mp, "empty log check failed");
return error;
}
@@ -808,7 +865,7 @@ validate_head:
xlog_put_bp(bp);
if (error)
- xlog_warn("XFS: failed to find log head");
+ xfs_warn(log->l_mp, "failed to find log head");
return error;
}
@@ -830,7 +887,7 @@ validate_head:
*/
STATIC int
xlog_find_tail(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t *head_blk,
xfs_daddr_t *tail_blk)
{
@@ -876,7 +933,7 @@ xlog_find_tail(
if (error)
goto done;
- if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
+ if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
found = 1;
break;
}
@@ -893,15 +950,16 @@ xlog_find_tail(
if (error)
goto done;
- if (XLOG_HEADER_MAGIC_NUM ==
- be32_to_cpu(*(__be32 *)offset)) {
+ if (*(__be32 *)offset ==
+ cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
found = 2;
break;
}
}
}
if (!found) {
- xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
+ xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+ xlog_put_bp(bp);
ASSERT(0);
return XFS_ERROR(EIO);
}
@@ -925,12 +983,12 @@ xlog_find_tail(
log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
if (found == 2)
log->l_curr_cycle++;
- log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn);
- log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn);
- log->l_grant_reserve_cycle = log->l_curr_cycle;
- log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
- log->l_grant_write_cycle = log->l_curr_cycle;
- log->l_grant_write_bytes = BBTOB(log->l_curr_block);
+ atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
+ atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
+ xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
+ BBTOB(log->l_curr_block));
+ xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
+ BBTOB(log->l_curr_block));
/*
* Look for unmount record. If we find it, then we know there
@@ -960,7 +1018,7 @@ xlog_find_tail(
}
after_umount_blk = (i + hblks + (int)
BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
- tail_lsn = log->l_tail_lsn;
+ tail_lsn = atomic64_read(&log->l_tail_lsn);
if (*head_blk == after_umount_blk &&
be32_to_cpu(rhead->h_num_logops) == 1) {
umount_data_blk = (i + hblks) % log->l_logBBsize;
@@ -975,12 +1033,10 @@ xlog_find_tail(
* log records will point recovery to after the
* current unmount record.
*/
- log->l_tail_lsn =
- xlog_assign_lsn(log->l_curr_cycle,
- after_umount_blk);
- log->l_last_sync_lsn =
- xlog_assign_lsn(log->l_curr_cycle,
- after_umount_blk);
+ xlog_assign_atomic_lsn(&log->l_tail_lsn,
+ log->l_curr_cycle, after_umount_blk);
+ xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
+ log->l_curr_cycle, after_umount_blk);
*tail_blk = after_umount_blk;
/*
@@ -1019,7 +1075,7 @@ done:
xlog_put_bp(bp);
if (error)
- xlog_warn("XFS: failed to locate log tail");
+ xfs_warn(log->l_mp, "failed to locate log tail");
return error;
}
@@ -1041,7 +1097,7 @@ done:
*/
STATIC int
xlog_find_zeroed(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t *blk_no)
{
xfs_buf_t *bp;
@@ -1083,8 +1139,10 @@ xlog_find_zeroed(
* the first block must be 1. If it's not, maybe we're
* not looking at a log... Bail out.
*/
- xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
- return XFS_ERROR(EINVAL);
+ xfs_warn(log->l_mp,
+ "Log inconsistent or not a log (last==0, first!=1)");
+ error = XFS_ERROR(EINVAL);
+ goto bp_err;
}
/* we have a partially zeroed log */
@@ -1143,7 +1201,7 @@ bp_err:
*/
STATIC void
xlog_add_record(
- xlog_t *log,
+ struct xlog *log,
xfs_caddr_t buf,
int cycle,
int block,
@@ -1165,7 +1223,7 @@ xlog_add_record(
STATIC int
xlog_write_log_records(
- xlog_t *log,
+ struct xlog *log,
int cycle,
int start_block,
int blocks,
@@ -1188,6 +1246,8 @@ xlog_write_log_records(
* log sector, or we're out of luck.
*/
bufblks = 1 << ffs(blocks);
+ while (bufblks > log->l_logBBsize)
+ bufblks >>= 1;
while (!(bp = xlog_get_bp(log, bufblks))) {
bufblks >>= 1;
if (bufblks < sectbb)
@@ -1219,20 +1279,12 @@ xlog_write_log_records(
*/
ealign = round_down(end_block, sectbb);
if (j == 0 && (start_block + endcount > ealign)) {
- offset = XFS_BUF_PTR(bp);
- balign = BBTOB(ealign - start_block);
- error = XFS_BUF_SET_PTR(bp, offset + balign,
- BBTOB(sectbb));
+ offset = bp->b_addr + BBTOB(ealign - start_block);
+ error = xlog_bread_offset(log, ealign, sectbb,
+ bp, offset);
if (error)
break;
- error = xlog_bread_noalign(log, ealign, sectbb, bp);
- if (error)
- break;
-
- error = XFS_BUF_SET_PTR(bp, offset, bufblks);
- if (error)
- break;
}
offset = xlog_align(log, start_block, endcount, bp);
@@ -1271,7 +1323,7 @@ xlog_write_log_records(
*/
STATIC int
xlog_clear_stale_blocks(
- xlog_t *log,
+ struct xlog *log,
xfs_lsn_t tail_lsn)
{
int tail_cycle, head_cycle;
@@ -1398,9 +1450,8 @@ xlog_recover_find_tid(
xlog_tid_t tid)
{
xlog_recover_t *trans;
- struct hlist_node *n;
- hlist_for_each_entry(trans, n, head, r_list) {
+ hlist_for_each_entry(trans, head, r_list) {
if (trans->r_log_tid == tid)
return trans;
}
@@ -1437,8 +1488,8 @@ xlog_recover_add_item(
STATIC int
xlog_recover_add_to_cont_trans(
- struct log *log,
- xlog_recover_t *trans,
+ struct xlog *log,
+ struct xlog_recover *trans,
xfs_caddr_t dp,
int len)
{
@@ -1460,7 +1511,7 @@ xlog_recover_add_to_cont_trans(
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
+ ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
memcpy(&ptr[old_len], dp, len); /* d, s, l */
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -1483,8 +1534,8 @@ xlog_recover_add_to_cont_trans(
*/
STATIC int
xlog_recover_add_to_trans(
- struct log *log,
- xlog_recover_t *trans,
+ struct xlog *log,
+ struct xlog_recover *trans,
xfs_caddr_t dp,
int len)
{
@@ -1497,8 +1548,8 @@ xlog_recover_add_to_trans(
if (list_empty(&trans->r_itemq)) {
/* we need to catch log corruptions here */
if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
- xlog_warn("XFS: xlog_recover_add_to_trans: "
- "bad header magic number");
+ xfs_warn(log->l_mp, "%s: bad header magic number",
+ __func__);
ASSERT(0);
return XFS_ERROR(EIO);
}
@@ -1525,10 +1576,11 @@ xlog_recover_add_to_trans(
if (item->ri_total == 0) { /* first region to be added */
if (in_f->ilf_size == 0 ||
in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
- xlog_warn(
- "XFS: bad number of regions (%d) in inode log format",
+ xfs_warn(log->l_mp,
+ "bad number of regions (%d) in inode log format",
in_f->ilf_size);
ASSERT(0);
+ kmem_free(ptr);
return XFS_ERROR(EIO);
}
@@ -1547,32 +1599,89 @@ xlog_recover_add_to_trans(
}
/*
- * Sort the log items in the transaction. Cancelled buffers need
- * to be put first so they are processed before any items that might
- * modify the buffers. If they are cancelled, then the modifications
- * don't need to be replayed.
+ * Sort the log items in the transaction.
+ *
+ * The ordering constraints are defined by the inode allocation and unlink
+ * behaviour. The rules are:
+ *
+ * 1. Every item is only logged once in a given transaction. Hence it
+ * represents the last logged state of the item. Hence ordering is
+ * dependent on the order in which operations need to be performed so
+ * required initial conditions are always met.
+ *
+ * 2. Cancelled buffers are recorded in pass 1 in a separate table and
+ * there's nothing to replay from them so we can simply cull them
+ * from the transaction. However, we can't do that until after we've
+ * replayed all the other items because they may be dependent on the
+ * cancelled buffer and replaying the cancelled buffer can remove it
+ * from the cancelled buffer table. Hence they have to be done last.
+ *
+ * 3. Inode allocation buffers must be replayed before inode items that
+ * read the buffer and replay changes into it. For filesystems using the
+ * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
+ * treated the same as inode allocation buffers as they create and
+ * initialise the buffers directly.
+ *
+ * 4. Inode unlink buffers must be replayed after inode items are replayed.
+ * This ensures that inodes are completely flushed to the inode buffer
+ * in a "free" state before we remove the unlinked inode list pointer.
+ *
+ * Hence the ordering needs to be inode allocation buffers first, inode items
+ * second, inode unlink buffers third and cancelled buffers last.
+ *
+ * But there's a problem with that - we can't tell an inode allocation buffer
+ * apart from a regular buffer, so we can't separate them. We can, however,
+ * tell an inode unlink buffer from the others, and so we can separate them out
+ * from all the other buffers and move them to last.
+ *
+ * Hence, 4 lists, in order from head to tail:
+ * - buffer_list for all buffers except cancelled/inode unlink buffers
+ * - item_list for all non-buffer items
+ * - inode_buffer_list for inode unlink buffers
+ * - cancel_list for the cancelled buffers
+ *
+ * Note that we add objects to the tail of the lists so that first-to-last
+ * ordering is preserved within the lists. Adding objects to the head of the
+ * list means when we traverse from the head we walk them in last-to-first
+ * order. For cancelled buffers and inode unlink buffers this doesn't matter,
+ * but for all other items there may be specific ordering that we need to
+ * preserve.
*/
STATIC int
xlog_recover_reorder_trans(
- struct log *log,
- xlog_recover_t *trans,
+ struct xlog *log,
+ struct xlog_recover *trans,
int pass)
{
xlog_recover_item_t *item, *n;
+ int error = 0;
LIST_HEAD(sort_list);
+ LIST_HEAD(cancel_list);
+ LIST_HEAD(buffer_list);
+ LIST_HEAD(inode_buffer_list);
+ LIST_HEAD(inode_list);
list_splice_init(&trans->r_itemq, &sort_list);
list_for_each_entry_safe(item, n, &sort_list, ri_list) {
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
switch (ITEM_TYPE(item)) {
+ case XFS_LI_ICREATE:
+ list_move_tail(&item->ri_list, &buffer_list);
+ break;
case XFS_LI_BUF:
- if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
+ if (buf_f->blf_flags & XFS_BLF_CANCEL) {
trace_xfs_log_recover_item_reorder_head(log,
trans, item, pass);
- list_move(&item->ri_list, &trans->r_itemq);
+ list_move(&item->ri_list, &cancel_list);
break;
}
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
+ list_move(&item->ri_list, &inode_buffer_list);
+ break;
+ }
+ list_move_tail(&item->ri_list, &buffer_list);
+ break;
case XFS_LI_INODE:
case XFS_LI_DQUOT:
case XFS_LI_QUOTAOFF:
@@ -1580,17 +1689,34 @@ xlog_recover_reorder_trans(
case XFS_LI_EFI:
trace_xfs_log_recover_item_reorder_tail(log,
trans, item, pass);
- list_move_tail(&item->ri_list, &trans->r_itemq);
+ list_move_tail(&item->ri_list, &inode_list);
break;
default:
- xlog_warn(
- "XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
+ xfs_warn(log->l_mp,
+ "%s: unrecognized type of log operation",
+ __func__);
ASSERT(0);
- return XFS_ERROR(EIO);
+ /*
+ * return the remaining items back to the transaction
+ * item list so they can be freed in caller.
+ */
+ if (!list_empty(&sort_list))
+ list_splice_init(&sort_list, &trans->r_itemq);
+ error = XFS_ERROR(EIO);
+ goto out;
}
}
+out:
ASSERT(list_empty(&sort_list));
- return 0;
+ if (!list_empty(&buffer_list))
+ list_splice(&buffer_list, &trans->r_itemq);
+ if (!list_empty(&inode_list))
+ list_splice_tail(&inode_list, &trans->r_itemq);
+ if (!list_empty(&inode_buffer_list))
+ list_splice_tail(&inode_buffer_list, &trans->r_itemq);
+ if (!list_empty(&cancel_list))
+ list_splice_tail(&cancel_list, &trans->r_itemq);
+ return error;
}
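To make the final ordering concrete: list_splice() prepends to r_itemq while list_splice_tail() appends, so with one item of each class the reordered transaction reads, head to tail, as sketched here:

	/*
	 * r_itemq after reordering:
	 *
	 *   [reg buffer] -> [inode/dquot/EFI items] -> [unlink buffer] -> [cancel buffer]
	 *
	 * Replaying in list order therefore satisfies rules 3, 4 and 2
	 * from the comment above.
	 */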
/*
@@ -1605,240 +1731,160 @@ xlog_recover_reorder_trans(
* record in the table to tell us how many times we expect to see this
* record during the second pass.
*/
-STATIC void
-xlog_recover_do_buffer_pass1(
- xlog_t *log,
- xfs_buf_log_format_t *buf_f)
+STATIC int
+xlog_recover_buffer_pass1(
+ struct xlog *log,
+ struct xlog_recover_item *item)
{
- xfs_buf_cancel_t *bcp;
- xfs_buf_cancel_t *nextp;
- xfs_buf_cancel_t *prevp;
- xfs_buf_cancel_t **bucket;
- xfs_daddr_t blkno = 0;
- uint len = 0;
- ushort flags = 0;
-
- switch (buf_f->blf_type) {
- case XFS_LI_BUF:
- blkno = buf_f->blf_blkno;
- len = buf_f->blf_len;
- flags = buf_f->blf_flags;
- break;
- }
+ xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
+ struct list_head *bucket;
+ struct xfs_buf_cancel *bcp;
/*
* If this isn't a cancel buffer item, then just return.
*/
- if (!(flags & XFS_BLF_CANCEL)) {
+ if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
trace_xfs_log_recover_buf_not_cancel(log, buf_f);
- return;
- }
-
- /*
- * Insert an xfs_buf_cancel record into the hash table of
- * them. If there is already an identical record, bump
- * its reference count.
- */
- bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
- XLOG_BC_TABLE_SIZE];
- /*
- * If the hash bucket is empty then just insert a new record into
- * the bucket.
- */
- if (*bucket == NULL) {
- bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
- KM_SLEEP);
- bcp->bc_blkno = blkno;
- bcp->bc_len = len;
- bcp->bc_refcount = 1;
- bcp->bc_next = NULL;
- *bucket = bcp;
- return;
+ return 0;
}
/*
- * The hash bucket is not empty, so search for duplicates of our
- * record. If we find one them just bump its refcount. If not
- * then add us at the end of the list.
+ * Insert an xfs_buf_cancel record into the hash table of cancelled buffers.
+ * If there is already an identical record, bump its reference count.
*/
- prevp = NULL;
- nextp = *bucket;
- while (nextp != NULL) {
- if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
- nextp->bc_refcount++;
+ bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
+ list_for_each_entry(bcp, bucket, bc_list) {
+ if (bcp->bc_blkno == buf_f->blf_blkno &&
+ bcp->bc_len == buf_f->blf_len) {
+ bcp->bc_refcount++;
trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
- return;
+ return 0;
}
- prevp = nextp;
- nextp = nextp->bc_next;
}
- ASSERT(prevp != NULL);
- bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
- KM_SLEEP);
- bcp->bc_blkno = blkno;
- bcp->bc_len = len;
+
+ bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
+ bcp->bc_blkno = buf_f->blf_blkno;
+ bcp->bc_len = buf_f->blf_len;
bcp->bc_refcount = 1;
- bcp->bc_next = NULL;
- prevp->bc_next = bcp;
+ list_add_tail(&bcp->bc_list, bucket);
+
trace_xfs_log_recover_buf_cancel_add(log, buf_f);
+ return 0;
}
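For illustration, a hypothetical trace of the cancel table across both passes for a buffer at block B that is cancelled twice in the log:

	/* pass 1: cancel(B) -> insert { bc_blkno = B, bc_refcount = 1 }
	 * pass 1: cancel(B) -> match  { bc_refcount = 2 }
	 * pass 2: cancel(B) -> match  { bc_refcount = 1 }, replay skipped
	 * pass 2: cancel(B) -> match  { bc_refcount = 0 }, record freed
	 */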
/*
* Check to see whether the buffer being recovered has a corresponding
- * entry in the buffer cancel record table. If it does then return 1
- * so that it will be cancelled, otherwise return 0. If the buffer is
- * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
- * the refcount on the entry in the table and remove it from the table
- * if this is the last reference.
- *
- * We remove the cancel record from the table when we encounter its
- * last occurrence in the log so that if the same buffer is re-used
- * again after its last cancellation we actually replay the changes
- * made at that point.
+ * entry in the buffer cancel record table. If it is, return the cancel
+ * buffer structure to the caller.
*/
-STATIC int
-xlog_check_buffer_cancelled(
- xlog_t *log,
+STATIC struct xfs_buf_cancel *
+xlog_peek_buffer_cancelled(
+ struct xlog *log,
xfs_daddr_t blkno,
uint len,
ushort flags)
{
- xfs_buf_cancel_t *bcp;
- xfs_buf_cancel_t *prevp;
- xfs_buf_cancel_t **bucket;
+ struct list_head *bucket;
+ struct xfs_buf_cancel *bcp;
- if (log->l_buf_cancel_table == NULL) {
- /*
- * There is nothing in the table built in pass one,
- * so this buffer must not be cancelled.
- */
+ if (!log->l_buf_cancel_table) {
+ /* empty table means no cancelled buffers in the log */
ASSERT(!(flags & XFS_BLF_CANCEL));
- return 0;
+ return NULL;
}
- bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
- XLOG_BC_TABLE_SIZE];
- bcp = *bucket;
- if (bcp == NULL) {
- /*
- * There is no corresponding entry in the table built
- * in pass one, so this buffer has not been cancelled.
- */
- ASSERT(!(flags & XFS_BLF_CANCEL));
- return 0;
+ bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
+ list_for_each_entry(bcp, bucket, bc_list) {
+ if (bcp->bc_blkno == blkno && bcp->bc_len == len)
+ return bcp;
}
/*
- * Search for an entry in the buffer cancel table that
- * matches our buffer.
- */
- prevp = NULL;
- while (bcp != NULL) {
- if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
- /*
- * We've go a match, so return 1 so that the
- * recovery of this buffer is cancelled.
- * If this buffer is actually a buffer cancel
- * log item, then decrement the refcount on the
- * one in the table and remove it if this is the
- * last reference.
- */
- if (flags & XFS_BLF_CANCEL) {
- bcp->bc_refcount--;
- if (bcp->bc_refcount == 0) {
- if (prevp == NULL) {
- *bucket = bcp->bc_next;
- } else {
- prevp->bc_next = bcp->bc_next;
- }
- kmem_free(bcp);
- }
- }
- return 1;
- }
- prevp = bcp;
- bcp = bcp->bc_next;
- }
- /*
- * We didn't find a corresponding entry in the table, so
- * return 0 so that the buffer is NOT cancelled.
+ * We didn't find a corresponding entry in the table, so return NULL so
+ * that the buffer is NOT cancelled.
*/
ASSERT(!(flags & XFS_BLF_CANCEL));
- return 0;
+ return NULL;
}
+/*
+ * If the buffer is being cancelled then return 1 so that it will be cancelled,
+ * otherwise return 0. If the buffer is actually a buffer cancel item
+ * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the
+ * table and remove it from the table if this is the last reference.
+ *
+ * We remove the cancel record from the table when we encounter its last
+ * occurrence in the log so that if the same buffer is re-used again after its
+ * last cancellation we actually replay the changes made at that point.
+ */
STATIC int
-xlog_recover_do_buffer_pass2(
- xlog_t *log,
- xfs_buf_log_format_t *buf_f)
+xlog_check_buffer_cancelled(
+ struct xlog *log,
+ xfs_daddr_t blkno,
+ uint len,
+ ushort flags)
{
- xfs_daddr_t blkno = 0;
- ushort flags = 0;
- uint len = 0;
+ struct xfs_buf_cancel *bcp;
- switch (buf_f->blf_type) {
- case XFS_LI_BUF:
- blkno = buf_f->blf_blkno;
- flags = buf_f->blf_flags;
- len = buf_f->blf_len;
- break;
- }
+ bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags);
+ if (!bcp)
+ return 0;
- return xlog_check_buffer_cancelled(log, blkno, len, flags);
+ /*
+ * We've got a match, so return 1 so that the recovery of this buffer
+ * is cancelled. If this buffer is actually a buffer cancel log
+ * item, then decrement the refcount on the one in the table and
+ * remove it if this is the last reference.
+ */
+ if (flags & XFS_BLF_CANCEL) {
+ if (--bcp->bc_refcount == 0) {
+ list_del(&bcp->bc_list);
+ kmem_free(bcp);
+ }
+ }
+ return 1;
}
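The split between peek and check matters for callers that only want to ask the question without consuming a reference; a hypothetical query-only use:

	/* peek never modifies the table or the refcounts */
	if (xlog_peek_buffer_cancelled(log, blkno, len, 0))
		return 0;	/* a cancel record covers this range: skip replay */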
/*
- * Perform recovery for a buffer full of inodes. In these buffers,
- * the only data which should be recovered is that which corresponds
- * to the di_next_unlinked pointers in the on disk inode structures.
- * The rest of the data for the inodes is always logged through the
- * inodes themselves rather than the inode buffer and is recovered
- * in xlog_recover_do_inode_trans().
+ * Perform recovery for a buffer full of inodes. In these buffers, the only
+ * data which should be recovered is that which corresponds to the
+ * di_next_unlinked pointers in the on disk inode structures. The rest of the
+ * data for the inodes is always logged through the inodes themselves rather
+ * than the inode buffer and is recovered in xlog_recover_inode_pass2().
*
- * The only time when buffers full of inodes are fully recovered is
- * when the buffer is full of newly allocated inodes. In this case
- * the buffer will not be marked as an inode buffer and so will be
- * sent to xlog_recover_do_reg_buffer() below during recovery.
+ * The only time when buffers full of inodes are fully recovered is when the
+ * buffer is full of newly allocated inodes. In this case the buffer will
+ * not be marked as an inode buffer and so will be sent to
+ * xlog_recover_do_reg_buffer() below during recovery.
*/
STATIC int
xlog_recover_do_inode_buffer(
- xfs_mount_t *mp,
+ struct xfs_mount *mp,
xlog_recover_item_t *item,
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
xfs_buf_log_format_t *buf_f)
{
int i;
- int item_index;
- int bit;
- int nbits;
- int reg_buf_offset;
- int reg_buf_bytes;
+ int item_index = 0;
+ int bit = 0;
+ int nbits = 0;
+ int reg_buf_offset = 0;
+ int reg_buf_bytes = 0;
int next_unlinked_offset;
int inodes_per_buf;
xfs_agino_t *logged_nextp;
xfs_agino_t *buffer_nextp;
- unsigned int *data_map = NULL;
- unsigned int map_size = 0;
trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
- switch (buf_f->blf_type) {
- case XFS_LI_BUF:
- data_map = buf_f->blf_data_map;
- map_size = buf_f->blf_map_size;
- break;
- }
/*
- * Set the variables corresponding to the current region to
- * 0 so that we'll initialize them on the first pass through
- * the loop.
+ * Post recovery validation only works properly on CRC enabled
+ * filesystems.
*/
- reg_buf_offset = 0;
- reg_buf_bytes = 0;
- bit = 0;
- nbits = 0;
- item_index = 0;
- inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ bp->b_ops = &xfs_inode_buf_ops;
+
+ inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog;
for (i = 0; i < inodes_per_buf; i++) {
next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
offsetof(xfs_dinode_t, di_next_unlinked);
@@ -1852,18 +1898,18 @@ xlog_recover_do_inode_buffer(
* the current di_next_unlinked field.
*/
bit += nbits;
- bit = xfs_next_bit(data_map, map_size, bit);
+ bit = xfs_next_bit(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
/*
* If there are no more logged regions in the
* buffer, then we're done.
*/
- if (bit == -1) {
+ if (bit == -1)
return 0;
- }
- nbits = xfs_contig_bits(data_map, map_size,
- bit);
+ nbits = xfs_contig_bits(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
ASSERT(nbits > 0);
reg_buf_offset = bit << XFS_BLF_SHIFT;
reg_buf_bytes = nbits << XFS_BLF_SHIFT;
@@ -1875,13 +1921,13 @@ xlog_recover_do_inode_buffer(
* di_next_unlinked field, then move on to the next
* di_next_unlinked field.
*/
- if (next_unlinked_offset < reg_buf_offset) {
+ if (next_unlinked_offset < reg_buf_offset)
continue;
- }
ASSERT(item->ri_buf[item_index].i_addr != NULL);
ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
- ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
+ ASSERT((reg_buf_offset + reg_buf_bytes) <=
+ BBTOB(bp->b_io_length));
/*
* The current logged region contains a copy of the
@@ -1891,8 +1937,9 @@ xlog_recover_do_inode_buffer(
logged_nextp = item->ri_buf[item_index].i_addr +
next_unlinked_offset - reg_buf_offset;
if (unlikely(*logged_nextp == 0)) {
- xfs_fs_cmn_err(CE_ALERT, mp,
- "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field",
+ xfs_alert(mp,
+ "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
+ "Trying to replay bad (0) inode di_next_unlinked field.",
item, bp);
XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
XFS_ERRLEVEL_LOW, mp);
@@ -1902,52 +1949,405 @@ xlog_recover_do_inode_buffer(
buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
next_unlinked_offset);
*buffer_nextp = *logged_nextp;
+
+ /*
+ * If necessary, recalculate the CRC in the on-disk inode. We
+ * have to leave the inode in a consistent state for whoever
+ * reads it next....
+ */
+ xfs_dinode_calc_crc(mp, (struct xfs_dinode *)
+ xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize));
+
}
return 0;
}
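A worked example of the offset arithmetic, assuming a hypothetical geometry of 256-byte inodes (sb_inodelog == 8) in a 4096-byte buffer:

	inodes_per_buf = 4096 >> 8;		/* 16 inodes in the buffer */
	/* for inode i == 3: */
	next_unlinked_offset = (3 * 256) +
			offsetof(xfs_dinode_t, di_next_unlinked);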
/*
+ * V5 filesystems know the age of the buffer on disk being recovered. We can
+ * have newer objects on disk than we are replaying, and so for these cases we
+ * don't want to replay the current change as that will make the buffer contents
+ * temporarily invalid on disk.
+ *
+ * The magic number might not match the buffer type we are going to recover
+ * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence
+ * extract the LSN of the existing object in the buffer based on its current
+ * magic number. If we don't recognise the magic number in the buffer, then
+ * return an LSN of -1 so that the caller knows it was an unrecognised block and
+ * so can recover the buffer.
+ *
+ * Note: we cannot rely solely on magic number matches to determine that the
+ * buffer has a valid LSN - we also need to verify that it belongs to this
+ * filesystem, so we also need to extract the object's UUID and compare it to
+ * the one we read from the superblock. If the UUIDs don't match, then we've got a
+ * stale metadata block from an old filesystem instance that we need to recover
+ * over the top of.
+ */
+static xfs_lsn_t
+xlog_recover_get_buf_lsn(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp)
+{
+ __uint32_t magic32;
+ __uint16_t magic16;
+ __uint16_t magicda;
+ void *blk = bp->b_addr;
+ uuid_t *uuid;
+ xfs_lsn_t lsn = -1;
+
+ /* v4 filesystems always recover immediately */
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ goto recover_immediately;
+
+ magic32 = be32_to_cpu(*(__be32 *)blk);
+ switch (magic32) {
+ case XFS_ABTB_CRC_MAGIC:
+ case XFS_ABTC_CRC_MAGIC:
+ case XFS_ABTB_MAGIC:
+ case XFS_ABTC_MAGIC:
+ case XFS_IBT_CRC_MAGIC:
+ case XFS_IBT_MAGIC: {
+ struct xfs_btree_block *btb = blk;
+
+ lsn = be64_to_cpu(btb->bb_u.s.bb_lsn);
+ uuid = &btb->bb_u.s.bb_uuid;
+ break;
+ }
+ case XFS_BMAP_CRC_MAGIC:
+ case XFS_BMAP_MAGIC: {
+ struct xfs_btree_block *btb = blk;
+
+ lsn = be64_to_cpu(btb->bb_u.l.bb_lsn);
+ uuid = &btb->bb_u.l.bb_uuid;
+ break;
+ }
+ case XFS_AGF_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn);
+ uuid = &((struct xfs_agf *)blk)->agf_uuid;
+ break;
+ case XFS_AGFL_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn);
+ uuid = &((struct xfs_agfl *)blk)->agfl_uuid;
+ break;
+ case XFS_AGI_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn);
+ uuid = &((struct xfs_agi *)blk)->agi_uuid;
+ break;
+ case XFS_SYMLINK_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn);
+ uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid;
+ break;
+ case XFS_DIR3_BLOCK_MAGIC:
+ case XFS_DIR3_DATA_MAGIC:
+ case XFS_DIR3_FREE_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn);
+ uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid;
+ break;
+ case XFS_ATTR3_RMT_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
+ uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid;
+ break;
+ case XFS_SB_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
+ uuid = &((struct xfs_dsb *)blk)->sb_uuid;
+ break;
+ default:
+ break;
+ }
+
+ if (lsn != (xfs_lsn_t)-1) {
+ if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+ goto recover_immediately;
+ return lsn;
+ }
+
+ magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic);
+ switch (magicda) {
+ case XFS_DIR3_LEAF1_MAGIC:
+ case XFS_DIR3_LEAFN_MAGIC:
+ case XFS_DA3_NODE_MAGIC:
+ lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
+ uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
+ break;
+ default:
+ break;
+ }
+
+ if (lsn != (xfs_lsn_t)-1) {
+ if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+ goto recover_immediately;
+ return lsn;
+ }
+
+ /*
+ * We do individual object checks on dquot and inode buffers as they
+ * have their own individual LSN records. Also, we could have a stale
+ * buffer here, so we have to at least recognise these buffer types.
+ *
+ * A noted complexity here is inode unlinked list processing - it logs
+ * the inode directly in the buffer, but we don't know which inodes have
+ * been modified, and there is no global buffer LSN. Hence we need to
+ * recover all inode buffer types immediately. This problem will be
+ * fixed by logical logging of the unlinked list modifications.
+ */
+ magic16 = be16_to_cpu(*(__be16 *)blk);
+ switch (magic16) {
+ case XFS_DQUOT_MAGIC:
+ case XFS_DINODE_MAGIC:
+ goto recover_immediately;
+ default:
+ break;
+ }
+
+ /* unknown buffer contents, recover immediately */
+
+recover_immediately:
+ return (xfs_lsn_t)-1;
+
+}
+
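The consumer of this helper appears in xlog_recover_buffer_pass2() below; in short, replay is skipped whenever the on-disk object is already at or past the LSN of the transaction being replayed:

	lsn = xlog_recover_get_buf_lsn(mp, bp);
	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
		goto out_release;	/* on-disk copy is newer: don't replay */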
+/*
+ * Validate the recovered buffer is of the correct type and attach the
+ * appropriate buffer operations to them for writeback. Magic numbers are in a
+ * few places:
+ * the first 16 bits of the buffer (inode buffer, dquot buffer),
+ * the first 32 bits of the buffer (most blocks),
+ * inside a struct xfs_da_blkinfo at the start of the buffer.
+ */
+static void
+xlog_recover_validate_buf_type(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ xfs_buf_log_format_t *buf_f)
+{
+ struct xfs_da_blkinfo *info = bp->b_addr;
+ __uint32_t magic32;
+ __uint16_t magic16;
+ __uint16_t magicda;
+
+ magic32 = be32_to_cpu(*(__be32 *)bp->b_addr);
+ magic16 = be16_to_cpu(*(__be16*)bp->b_addr);
+ magicda = be16_to_cpu(info->magic);
+ switch (xfs_blft_from_flags(buf_f)) {
+ case XFS_BLFT_BTREE_BUF:
+ switch (magic32) {
+ case XFS_ABTB_CRC_MAGIC:
+ case XFS_ABTC_CRC_MAGIC:
+ case XFS_ABTB_MAGIC:
+ case XFS_ABTC_MAGIC:
+ bp->b_ops = &xfs_allocbt_buf_ops;
+ break;
+ case XFS_IBT_CRC_MAGIC:
+ case XFS_FIBT_CRC_MAGIC:
+ case XFS_IBT_MAGIC:
+ case XFS_FIBT_MAGIC:
+ bp->b_ops = &xfs_inobt_buf_ops;
+ break;
+ case XFS_BMAP_CRC_MAGIC:
+ case XFS_BMAP_MAGIC:
+ bp->b_ops = &xfs_bmbt_buf_ops;
+ break;
+ default:
+ xfs_warn(mp, "Bad btree block magic!");
+ ASSERT(0);
+ break;
+ }
+ break;
+ case XFS_BLFT_AGF_BUF:
+ if (magic32 != XFS_AGF_MAGIC) {
+ xfs_warn(mp, "Bad AGF block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_agf_buf_ops;
+ break;
+ case XFS_BLFT_AGFL_BUF:
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ break;
+ if (magic32 != XFS_AGFL_MAGIC) {
+ xfs_warn(mp, "Bad AGFL block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_agfl_buf_ops;
+ break;
+ case XFS_BLFT_AGI_BUF:
+ if (magic32 != XFS_AGI_MAGIC) {
+ xfs_warn(mp, "Bad AGI block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_agi_buf_ops;
+ break;
+ case XFS_BLFT_UDQUOT_BUF:
+ case XFS_BLFT_PDQUOT_BUF:
+ case XFS_BLFT_GDQUOT_BUF:
+#ifdef CONFIG_XFS_QUOTA
+ if (magic16 != XFS_DQUOT_MAGIC) {
+ xfs_warn(mp, "Bad DQUOT block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dquot_buf_ops;
+#else
+ xfs_alert(mp,
+ "Trying to recover dquots without QUOTA support built in!");
+ ASSERT(0);
+#endif
+ break;
+ case XFS_BLFT_DINO_BUF:
+ /*
+ * we get here with inode allocation buffers, not buffers that
+ * track unlinked list changes.
+ */
+ if (magic16 != XFS_DINODE_MAGIC) {
+ xfs_warn(mp, "Bad INODE block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_inode_buf_ops;
+ break;
+ case XFS_BLFT_SYMLINK_BUF:
+ if (magic32 != XFS_SYMLINK_MAGIC) {
+ xfs_warn(mp, "Bad symlink block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_symlink_buf_ops;
+ break;
+ case XFS_BLFT_DIR_BLOCK_BUF:
+ if (magic32 != XFS_DIR2_BLOCK_MAGIC &&
+ magic32 != XFS_DIR3_BLOCK_MAGIC) {
+ xfs_warn(mp, "Bad dir block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_block_buf_ops;
+ break;
+ case XFS_BLFT_DIR_DATA_BUF:
+ if (magic32 != XFS_DIR2_DATA_MAGIC &&
+ magic32 != XFS_DIR3_DATA_MAGIC) {
+ xfs_warn(mp, "Bad dir data magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_data_buf_ops;
+ break;
+ case XFS_BLFT_DIR_FREE_BUF:
+ if (magic32 != XFS_DIR2_FREE_MAGIC &&
+ magic32 != XFS_DIR3_FREE_MAGIC) {
+ xfs_warn(mp, "Bad dir3 free magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_free_buf_ops;
+ break;
+ case XFS_BLFT_DIR_LEAF1_BUF:
+ if (magicda != XFS_DIR2_LEAF1_MAGIC &&
+ magicda != XFS_DIR3_LEAF1_MAGIC) {
+ xfs_warn(mp, "Bad dir leaf1 magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_leaf1_buf_ops;
+ break;
+ case XFS_BLFT_DIR_LEAFN_BUF:
+ if (magicda != XFS_DIR2_LEAFN_MAGIC &&
+ magicda != XFS_DIR3_LEAFN_MAGIC) {
+ xfs_warn(mp, "Bad dir leafn magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_dir3_leafn_buf_ops;
+ break;
+ case XFS_BLFT_DA_NODE_BUF:
+ if (magicda != XFS_DA_NODE_MAGIC &&
+ magicda != XFS_DA3_NODE_MAGIC) {
+ xfs_warn(mp, "Bad da node magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_da3_node_buf_ops;
+ break;
+ case XFS_BLFT_ATTR_LEAF_BUF:
+ if (magicda != XFS_ATTR_LEAF_MAGIC &&
+ magicda != XFS_ATTR3_LEAF_MAGIC) {
+ xfs_warn(mp, "Bad attr leaf magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_attr3_leaf_buf_ops;
+ break;
+ case XFS_BLFT_ATTR_RMT_BUF:
+ if (!xfs_sb_version_hascrc(&mp->m_sb))
+ break;
+ if (magic32 != XFS_ATTR3_RMT_MAGIC) {
+ xfs_warn(mp, "Bad attr remote magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_attr3_rmt_buf_ops;
+ break;
+ case XFS_BLFT_SB_BUF:
+ if (magic32 != XFS_SB_MAGIC) {
+ xfs_warn(mp, "Bad SB block magic!");
+ ASSERT(0);
+ break;
+ }
+ bp->b_ops = &xfs_sb_buf_ops;
+ break;
+ default:
+ xfs_warn(mp, "Unknown buffer type %d!",
+ xfs_blft_from_flags(buf_f));
+ break;
+ }
+}
+
+/*
* Perform a 'normal' buffer recovery. Each logged region of the
* buffer should be copied over the corresponding region in the
* given buffer. The bitmap in the buf log format structure indicates
* where to place the logged data.
*/
-/*ARGSUSED*/
STATIC void
xlog_recover_do_reg_buffer(
struct xfs_mount *mp,
xlog_recover_item_t *item,
- xfs_buf_t *bp,
+ struct xfs_buf *bp,
xfs_buf_log_format_t *buf_f)
{
int i;
int bit;
int nbits;
- unsigned int *data_map = NULL;
- unsigned int map_size = 0;
int error;
trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
- switch (buf_f->blf_type) {
- case XFS_LI_BUF:
- data_map = buf_f->blf_data_map;
- map_size = buf_f->blf_map_size;
- break;
- }
bit = 0;
i = 1; /* 0 is the buf format structure */
while (1) {
- bit = xfs_next_bit(data_map, map_size, bit);
+ bit = xfs_next_bit(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
if (bit == -1)
break;
- nbits = xfs_contig_bits(data_map, map_size, bit);
+ nbits = xfs_contig_bits(buf_f->blf_data_map,
+ buf_f->blf_map_size, bit);
ASSERT(nbits > 0);
ASSERT(item->ri_buf[i].i_addr != NULL);
ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
- ASSERT(XFS_BUF_COUNT(bp) >=
- ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
+ ASSERT(BBTOB(bp->b_io_length) >=
+ ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT));
+
+ /*
+ * The dirty regions logged in the buffer, even though
+ * contiguous, may span multiple chunks. This is because the
+ * dirty region may span a physical page boundary in a buffer
+ * and hence be split into two separate vectors for writing into
+ * the log. Hence we need to trim nbits back to the length of
+ * the current region being copied out of the log.
+ */
+ if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT))
+ nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT;
/*
* Do a sanity check if this is a dquot buffer. Just checking
@@ -1958,17 +2358,17 @@ xlog_recover_do_reg_buffer(
if (buf_f->blf_flags &
(XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
if (item->ri_buf[i].i_addr == NULL) {
- cmn_err(CE_ALERT,
+ xfs_alert(mp,
"XFS: NULL dquot in %s.", __func__);
goto next;
}
if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
- cmn_err(CE_ALERT,
+ xfs_alert(mp,
"XFS: dquot too small (%d) in %s.",
item->ri_buf[i].i_len, __func__);
goto next;
}
- error = xfs_qm_dqcheck(item->ri_buf[i].i_addr,
+ error = xfs_dqcheck(mp, item->ri_buf[i].i_addr,
-1, 0, XFS_QMOPT_DOWARN,
"dquot_buf_recover");
if (error)
@@ -1986,144 +2386,32 @@ xlog_recover_do_reg_buffer(
/* Shouldn't be any more regions */
ASSERT(i == item->ri_total);
-}
-
-/*
- * Do some primitive error checking on ondisk dquot data structures.
- */
-int
-xfs_qm_dqcheck(
- xfs_disk_dquot_t *ddq,
- xfs_dqid_t id,
- uint type, /* used only when IO_dorepair is true */
- uint flags,
- char *str)
-{
- xfs_dqblk_t *d = (xfs_dqblk_t *)ddq;
- int errs = 0;
-
- /*
- * We can encounter an uninitialized dquot buffer for 2 reasons:
- * 1. If we crash while deleting the quotainode(s), and those blks got
- * used for user data. This is because we take the path of regular
- * file deletion; however, the size field of quotainodes is never
- * updated, so all the tricks that we play in itruncate_finish
- * don't quite matter.
- *
- * 2. We don't play the quota buffers when there's a quotaoff logitem.
- * But the allocation will be replayed so we'll end up with an
- * uninitialized quota block.
- *
- * This is all fine; things are still consistent, and we haven't lost
- * any quota information. Just don't complain about bad dquot blks.
- */
- if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
- if (flags & XFS_QMOPT_DOWARN)
- cmn_err(CE_ALERT,
- "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
- str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
- errs++;
- }
- if (ddq->d_version != XFS_DQUOT_VERSION) {
- if (flags & XFS_QMOPT_DOWARN)
- cmn_err(CE_ALERT,
- "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
- str, id, ddq->d_version, XFS_DQUOT_VERSION);
- errs++;
- }
-
- if (ddq->d_flags != XFS_DQ_USER &&
- ddq->d_flags != XFS_DQ_PROJ &&
- ddq->d_flags != XFS_DQ_GROUP) {
- if (flags & XFS_QMOPT_DOWARN)
- cmn_err(CE_ALERT,
- "%s : XFS dquot ID 0x%x, unknown flags 0x%x",
- str, id, ddq->d_flags);
- errs++;
- }
-
- if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
- if (flags & XFS_QMOPT_DOWARN)
- cmn_err(CE_ALERT,
- "%s : ondisk-dquot 0x%p, ID mismatch: "
- "0x%x expected, found id 0x%x",
- str, ddq, id, be32_to_cpu(ddq->d_id));
- errs++;
- }
-
- if (!errs && ddq->d_id) {
- if (ddq->d_blk_softlimit &&
- be64_to_cpu(ddq->d_bcount) >=
- be64_to_cpu(ddq->d_blk_softlimit)) {
- if (!ddq->d_btimer) {
- if (flags & XFS_QMOPT_DOWARN)
- cmn_err(CE_ALERT,
- "%s : Dquot ID 0x%x (0x%p) "
- "BLK TIMER NOT STARTED",
- str, (int)be32_to_cpu(ddq->d_id), ddq);
- errs++;
- }
- }
- if (ddq->d_ino_softlimit &&
- be64_to_cpu(ddq->d_icount) >=
- be64_to_cpu(ddq->d_ino_softlimit)) {
- if (!ddq->d_itimer) {
- if (flags & XFS_QMOPT_DOWARN)
- cmn_err(CE_ALERT,
- "%s : Dquot ID 0x%x (0x%p) "
- "INODE TIMER NOT STARTED",
- str, (int)be32_to_cpu(ddq->d_id), ddq);
- errs++;
- }
- }
- if (ddq->d_rtb_softlimit &&
- be64_to_cpu(ddq->d_rtbcount) >=
- be64_to_cpu(ddq->d_rtb_softlimit)) {
- if (!ddq->d_rtbtimer) {
- if (flags & XFS_QMOPT_DOWARN)
- cmn_err(CE_ALERT,
- "%s : Dquot ID 0x%x (0x%p) "
- "RTBLK TIMER NOT STARTED",
- str, (int)be32_to_cpu(ddq->d_id), ddq);
- errs++;
- }
- }
- }
-
- if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
- return errs;
-
- if (flags & XFS_QMOPT_DOWARN)
- cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
/*
- * Typically, a repair is only requested by quotacheck.
+ * We can only do post recovery validation on items on CRC enabled
+ * filesystems as we need to know when the buffer was written to be able
+ * to determine if we should have replayed the item. If we replay old
+ * metadata over a newer buffer, then it will enter a temporarily
+ * inconsistent state resulting in verification failures. Hence for now
+ * just avoid the verification stage for non-crc filesystems.
*/
- ASSERT(id != -1);
- ASSERT(flags & XFS_QMOPT_DQREPAIR);
- memset(d, 0, sizeof(xfs_dqblk_t));
-
- d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
- d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
- d->dd_diskdq.d_flags = type;
- d->dd_diskdq.d_id = cpu_to_be32(id);
-
- return errs;
+ if (xfs_sb_version_hascrc(&mp->m_sb))
+ xlog_recover_validate_buf_type(mp, bp, buf_f);
}
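To illustrate the bitmap walk: XFS_BLF_CHUNK is 128 bytes (XFS_BLF_SHIFT == 7), so a hypothetical data map with bits 0-2 and bit 5 set describes two dirty regions, each copied from its own log vector by the (elided) body of the loop, roughly:

	/* bit = 0, nbits = 3: copy 384 bytes to buffer offset 0
	 * bit = 5, nbits = 1: copy 128 bytes to buffer offset 640 */
	memcpy(xfs_buf_offset(bp, (uint)bit << XFS_BLF_SHIFT),
	       item->ri_buf[i].i_addr,
	       nbits << XFS_BLF_SHIFT);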
/*
* Perform a dquot buffer recovery.
- * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
+ * Simple algorithm: if we have found a QUOTAOFF log item of the same type
* (ie. USR or GRP), then just toss this buffer away; don't recover it.
* Else, treat it as a regular buffer and do recovery.
*/
STATIC void
xlog_recover_do_dquot_buffer(
- xfs_mount_t *mp,
- xlog_t *log,
- xlog_recover_item_t *item,
- xfs_buf_t *bp,
- xfs_buf_log_format_t *buf_f)
+ struct xfs_mount *mp,
+ struct xlog *log,
+ struct xlog_recover_item *item,
+ struct xfs_buf *bp,
+ struct xfs_buf_log_format *buf_f)
{
uint type;
@@ -2172,88 +2460,67 @@ xlog_recover_do_dquot_buffer(
* over the log during recovery. During the first we build a table of
* those buffers which have been cancelled, and during the second we
* only replay those buffers which do not have corresponding cancel
- * records in the table. See xlog_recover_do_buffer_pass[1,2] above
+ * records in the table. See xlog_recover_buffer_pass[1,2] above
* for more details on the implementation of the table of cancel records.
*/
STATIC int
-xlog_recover_do_buffer_trans(
- xlog_t *log,
- xlog_recover_item_t *item,
- int pass)
+xlog_recover_buffer_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
{
xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
- xfs_mount_t *mp;
+ xfs_mount_t *mp = log->l_mp;
xfs_buf_t *bp;
int error;
- int cancel;
- xfs_daddr_t blkno;
- int len;
- ushort flags;
uint buf_flags;
+ xfs_lsn_t lsn;
- if (pass == XLOG_RECOVER_PASS1) {
- /*
- * In this pass we're only looking for buf items
- * with the XFS_BLF_CANCEL bit set.
- */
- xlog_recover_do_buffer_pass1(log, buf_f);
+ /*
+ * In this pass we only want to recover all the buffers which have
+ * not been cancelled and are not cancellation buffers themselves.
+ */
+ if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
+ buf_f->blf_len, buf_f->blf_flags)) {
+ trace_xfs_log_recover_buf_cancel(log, buf_f);
return 0;
- } else {
- /*
- * In this pass we want to recover all the buffers
- * which have not been cancelled and are not
- * cancellation buffers themselves. The routine
- * we call here will tell us whether or not to
- * continue with the replay of this buffer.
- */
- cancel = xlog_recover_do_buffer_pass2(log, buf_f);
- if (cancel) {
- trace_xfs_log_recover_buf_cancel(log, buf_f);
- return 0;
- }
}
+
trace_xfs_log_recover_buf_recover(log, buf_f);
- switch (buf_f->blf_type) {
- case XFS_LI_BUF:
- blkno = buf_f->blf_blkno;
- len = buf_f->blf_len;
- flags = buf_f->blf_flags;
- break;
- default:
- xfs_fs_cmn_err(CE_ALERT, log->l_mp,
- "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
- buf_f->blf_type, log->l_mp->m_logname ?
- log->l_mp->m_logname : "internal");
- XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
- XFS_ERRLEVEL_LOW, log->l_mp);
- return XFS_ERROR(EFSCORRUPTED);
- }
- mp = log->l_mp;
- buf_flags = XBF_LOCK;
- if (!(flags & XFS_BLF_INODE_BUF))
- buf_flags |= XBF_MAPPED;
-
- bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
- if (XFS_BUF_ISERROR(bp)) {
- xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
- bp, blkno);
- error = XFS_BUF_GETERROR(bp);
- xfs_buf_relse(bp);
- return error;
+ buf_flags = 0;
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF)
+ buf_flags |= XBF_UNMAPPED;
+
+ bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
+ buf_flags, NULL);
+ if (!bp)
+ return XFS_ERROR(ENOMEM);
+ error = bp->b_error;
+ if (error) {
+ xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
+ goto out_release;
}
- error = 0;
- if (flags & XFS_BLF_INODE_BUF) {
+ /*
+ * recover the buffer only if we get an LSN from it and it's less than
+ * the LSN of the transaction we are replaying.
+ */
+ lsn = xlog_recover_get_buf_lsn(mp, bp);
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
+ goto out_release;
+
+ if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
- } else if (flags &
+ } else if (buf_f->blf_flags &
(XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
} else {
xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
}
if (error)
- return XFS_ERROR(error);
+ goto out_release;
/*
* Perform delayed write on the buffer. Asynchronous writes will be
@@ -2261,41 +2528,119 @@ xlog_recover_do_buffer_trans(
*
* Also make sure that only inode buffers with good sizes stay in
* the buffer cache. The kernel moves inodes in buffers of 1 block
- * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
+ * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode
* buffers in the log can be a different size if the log was generated
* by an older kernel using unclustered inode buffers or a newer kernel
* running with a different inode cluster size. Regardless, if the
- * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
- * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
+ * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size)
+ * for *our* value of mp->m_inode_cluster_size, then we need to keep
* the buffer out of the buffer cache so that the buffer won't
* overlap with future reads of those inodes.
*/
if (XFS_DINODE_MAGIC ==
be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
- (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
- (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
- XFS_BUF_STALE(bp);
- error = xfs_bwrite(mp, bp);
+ (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize,
+ (__uint32_t)log->l_mp->m_inode_cluster_size))) {
+ xfs_buf_stale(bp);
+ error = xfs_bwrite(bp);
} else {
ASSERT(bp->b_target->bt_mount == mp);
- XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
- xfs_bdwrite(mp, bp);
+ bp->b_iodone = xlog_recover_iodone;
+ xfs_buf_delwri_queue(bp, buffer_list);
}
- return (error);
+out_release:
+ xfs_buf_relse(bp);
+ return error;
}
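+
+/*
+ * Illustration only - the helper below is hypothetical and not part of
+ * this change. It captures the LSN gating rule shared by the buffer,
+ * inode and dquot replay paths: a metadata LSN of 0 or -1 means nothing
+ * was stamped (non-CRC filesystems), so replay proceeds; otherwise
+ * replay is skipped once the on-disk LSN is >= the LSN of the
+ * transaction being replayed, so newer metadata is never overwritten
+ * with older log contents.
+ */
+static inline bool
+xlog_should_replay(
+	xfs_lsn_t	metadata_lsn,
+	xfs_lsn_t	current_lsn)
+{
+	if (metadata_lsn == 0 || metadata_lsn == -1)
+		return true;	/* no LSN recorded: always replay */
+	return XFS_LSN_CMP(metadata_lsn, current_lsn) < 0;
+}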
+/*
+ * Inode fork owner changes
+ *
+ * If we have been told that we have to reparent the inode fork, it's because an
+ * extent swap operation on a CRC enabled filesystem has been done and we are
+ * replaying it. We need to walk the BMBT of the appropriate fork and change the
+ * owners of it.
+ *
+ * The complexity here is that we don't have an inode context to work with, so
+ * after we've replayed the inode we need to instantiate one. This is where the
+ * fun begins.
+ *
+ * We are in the middle of log recovery, so we can't run transactions. That
+ * means we cannot use cache coherent inode instantiation via xfs_iget(), as
+ * that will result in the corresponding iput() running the inode through
+ * xfs_inactive(). If we've just replayed an inode core that changes the link
+ * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
+ * transactions (bad!).
+ *
+ * So, to avoid this, we instantiate an inode directly from the inode core we've
+ * just recovered. We have the buffer still locked, and all we really need to
+ * instantiate is the inode core and the forks being modified. We can do this
+ * manually, then run the inode btree owner change, and then tear down the
+ * xfs_inode without having to run any transactions at all.
+ *
+ * Also, because we don't have a transaction context available here, but need
+ * to gather all the buffers we modify for writeback, we pass the buffer_list
+ * to the operation instead.
+ */
+
STATIC int
-xlog_recover_do_inode_trans(
- xlog_t *log,
- xlog_recover_item_t *item,
- int pass)
+xfs_recover_inode_owner_change(
+ struct xfs_mount *mp,
+ struct xfs_dinode *dip,
+ struct xfs_inode_log_format *in_f,
+ struct list_head *buffer_list)
+{
+ struct xfs_inode *ip;
+ int error;
+
+ ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
+
+ ip = xfs_inode_alloc(mp, in_f->ilf_ino);
+ if (!ip)
+ return ENOMEM;
+
+ /* instantiate the inode */
+ xfs_dinode_from_disk(&ip->i_d, dip);
+ ASSERT(ip->i_d.di_version >= 3);
+
+ error = xfs_iformat_fork(ip, dip);
+ if (error)
+ goto out_free_ip;
+
+ if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
+ ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
+ error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
+ ip->i_ino, buffer_list);
+ if (error)
+ goto out_free_ip;
+ }
+
+ if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
+ ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
+ error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
+ ip->i_ino, buffer_list);
+ if (error)
+ goto out_free_ip;
+ }
+
+out_free_ip:
+ xfs_inode_free(ip);
+ return error;
+}
+
+STATIC int
+xlog_recover_inode_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
{
xfs_inode_log_format_t *in_f;
- xfs_mount_t *mp;
+ xfs_mount_t *mp = log->l_mp;
xfs_buf_t *bp;
xfs_dinode_t *dip;
- xfs_ino_t ino;
int len;
xfs_caddr_t src;
xfs_caddr_t dest;
@@ -2303,12 +2648,9 @@ xlog_recover_do_inode_trans(
int attr_index;
uint fields;
xfs_icdinode_t *dicp;
+ uint isize;
int need_free = 0;
- if (pass == XLOG_RECOVER_PASS1) {
- return 0;
- }
-
if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
in_f = item->ri_buf[0].i_addr;
} else {
@@ -2318,8 +2660,6 @@ xlog_recover_do_inode_trans(
if (error)
goto error;
}
- ino = in_f->ilf_ino;
- mp = log->l_mp;
/*
* Inode buffers can be freed, look out for it,
@@ -2333,16 +2673,17 @@ xlog_recover_do_inode_trans(
}
trace_xfs_log_recover_inode_recover(log, in_f);
- bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
- XBF_LOCK);
- if (XFS_BUF_ISERROR(bp)) {
- xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
- bp, in_f->ilf_blkno);
- error = XFS_BUF_GETERROR(bp);
- xfs_buf_relse(bp);
+ bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
+ &xfs_inode_buf_ops);
+ if (!bp) {
+ error = ENOMEM;
goto error;
}
- error = 0;
+ error = bp->b_error;
+ if (error) {
+ xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
+ goto out_release;
+ }
ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
@@ -2350,30 +2691,53 @@ xlog_recover_do_inode_trans(
* Make sure the place we're flushing out to really looks
* like an inode!
*/
- if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
- xfs_buf_relse(bp);
- xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
- dip, bp, ino);
- XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
+ if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
+ xfs_alert(mp,
+ "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
+ __func__, dip, bp, in_f->ilf_ino);
+ XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
XFS_ERRLEVEL_LOW, mp);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
dicp = item->ri_buf[1].i_addr;
if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
- xfs_buf_relse(bp);
- xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
- item, ino);
- XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
+ xfs_alert(mp,
+ "%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
+ __func__, item, in_f->ilf_ino);
+ XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
XFS_ERRLEVEL_LOW, mp);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
- /* Skip replay when the on disk inode is newer than the log one */
- if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
+ /*
+ * If the inode has an LSN in it, recover the inode only if it's less
+ * than the lsn of the transaction we are replaying. Note: we still
+ * need to replay an owner change even though the inode is more recent
+ * than the transaction as there is no guarantee that all the btree
+ * blocks are more recent than this transaction, too.
+ */
+ if (dip->di_version >= 3) {
+ xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
+
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ trace_xfs_log_recover_inode_skip(log, in_f);
+ error = 0;
+ goto out_owner_change;
+ }
+ }
+
+ /*
+ * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
+ * are transactional and if ordering is necessary we can determine that
+	 * more accurately from the LSN field in the V3 inode core. Don't trust
+	 * the inode versions as we might be changing them here - use the
+	 * superblock flag to determine whether we need to look at di_flushiter
+	 * to skip replay when the on disk inode is newer than the log one.
+ */
+ if (!xfs_sb_version_hascrc(&mp->m_sb) &&
+ dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
/*
* Deal with the wrap case, DI_MAX_FLUSH is less
* than smaller numbers
@@ -2382,82 +2746,82 @@ xlog_recover_do_inode_trans(
dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
/* do nothing */
} else {
- xfs_buf_relse(bp);
trace_xfs_log_recover_inode_skip(log, in_f);
error = 0;
- goto error;
+ goto out_release;
}
}
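+	/*
+	 * Worked example of the wrap case above, assuming DI_MAX_FLUSH is
+	 * the 16-bit maximum (0xffff): if the on-disk inode shows flushiter
+	 * 0xffff while the log copy shows 3, the counter has wrapped, so the
+	 * log copy is actually newer and replay proceeds. Any other
+	 * "disk > log" combination means the on-disk inode is newer and the
+	 * replay is skipped above.
+	 */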
+
/* Take the opportunity to reset the flush iteration count */
dicp->di_flushiter = 0;
- if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
+ if (unlikely(S_ISREG(dicp->di_mode))) {
if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
(dicp->di_format != XFS_DINODE_FMT_BTREE)) {
- XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
- xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
- item, dip, bp, ino);
+ xfs_alert(mp,
+ "%s: Bad regular inode log record, rec ptr 0x%p, "
+ "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
+ __func__, item, dip, bp, in_f->ilf_ino);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
- } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
+ } else if (unlikely(S_ISDIR(dicp->di_mode))) {
if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
(dicp->di_format != XFS_DINODE_FMT_BTREE) &&
(dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
- XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
- xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
- item, dip, bp, ino);
+ xfs_alert(mp,
+ "%s: Bad dir inode log record, rec ptr 0x%p, "
+ "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
+ __func__, item, dip, bp, in_f->ilf_ino);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
}
if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
- XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
- xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
- item, dip, bp, ino,
+ xfs_alert(mp,
+ "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
+ "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
+ __func__, item, dip, bp, in_f->ilf_ino,
dicp->di_nextents + dicp->di_anextents,
dicp->di_nblocks);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
- XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
- xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
- item, dip, bp, ino, dicp->di_forkoff);
+ xfs_alert(mp,
+ "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
+ "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
+ item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
- if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
- XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
+ isize = xfs_icdinode_size(dicp->di_version);
+ if (unlikely(item->ri_buf[1].i_len > isize)) {
+ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
XFS_ERRLEVEL_LOW, mp, dicp);
- xfs_buf_relse(bp);
- xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
- item->ri_buf[1].i_len, item);
+ xfs_alert(mp,
+ "%s: Bad inode log record length %d, rec ptr 0x%p",
+ __func__, item->ri_buf[1].i_len, item);
error = EFSCORRUPTED;
- goto error;
+ goto out_release;
}
/* The core is in in-core format */
- xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
+ xfs_dinode_to_disk(dip, dicp);
/* the rest is in on-disk format */
- if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
- memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
- item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
- item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
+ if (item->ri_buf[1].i_len > isize) {
+ memcpy((char *)dip + isize,
+ item->ri_buf[1].i_addr + isize,
+ item->ri_buf[1].i_len - isize);
}
fields = in_f->ilf_fields;
@@ -2473,7 +2837,7 @@ xlog_recover_do_inode_trans(
}
if (in_f->ilf_size == 2)
- goto write_inode_buffer;
+ goto out_owner_change;
len = item->ri_buf[2].i_len;
src = item->ri_buf[2].i_addr;
ASSERT(in_f->ilf_size <= 4);
@@ -2532,18 +2896,26 @@ xlog_recover_do_inode_trans(
break;
default:
- xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
+ xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
ASSERT(0);
- xfs_buf_relse(bp);
error = EIO;
- goto error;
+ goto out_release;
}
}
-write_inode_buffer:
+out_owner_change:
+ if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
+ error = xfs_recover_inode_owner_change(mp, dip, in_f,
+ buffer_list);
+ /* re-generate the checksum. */
+ xfs_dinode_calc_crc(log->l_mp, dip);
+
ASSERT(bp->b_target->bt_mount == mp);
- XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
- xfs_bdwrite(mp, bp);
+ bp->b_iodone = xlog_recover_iodone;
+ xfs_buf_delwri_queue(bp, buffer_list);
+
+out_release:
+ xfs_buf_relse(bp);
error:
if (need_free)
kmem_free(in_f);
@@ -2551,23 +2923,16 @@ error:
}
/*
- * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
+ * Recover QUOTAOFF records. We simply make a note of it in the xlog
 * structure, so that we know not to do any dquot item or dquot buffer recovery
 * of that type.
*/
STATIC int
-xlog_recover_do_quotaoff_trans(
- xlog_t *log,
- xlog_recover_item_t *item,
- int pass)
+xlog_recover_quotaoff_pass1(
+ struct xlog *log,
+ struct xlog_recover_item *item)
{
- xfs_qoff_logformat_t *qoff_f;
-
- if (pass == XLOG_RECOVER_PASS2) {
- return (0);
- }
-
- qoff_f = item->ri_buf[0].i_addr;
+ xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr;
ASSERT(qoff_f);
/*
@@ -2588,22 +2953,19 @@ xlog_recover_do_quotaoff_trans(
* Recover a dquot record
*/
STATIC int
-xlog_recover_do_dquot_trans(
- xlog_t *log,
- xlog_recover_item_t *item,
- int pass)
+xlog_recover_dquot_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item,
+ xfs_lsn_t current_lsn)
{
- xfs_mount_t *mp;
+ xfs_mount_t *mp = log->l_mp;
xfs_buf_t *bp;
struct xfs_disk_dquot *ddq, *recddq;
int error;
xfs_dq_logformat_t *dq_f;
uint type;
- if (pass == XLOG_RECOVER_PASS1) {
- return 0;
- }
- mp = log->l_mp;
/*
* Filesystems are required to send in quota flags at mount time.
@@ -2613,13 +2975,11 @@ xlog_recover_do_dquot_trans(
recddq = item->ri_buf[1].i_addr;
if (recddq == NULL) {
- cmn_err(CE_ALERT,
- "XFS: NULL dquot in %s.", __func__);
+ xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
return XFS_ERROR(EIO);
}
if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
- cmn_err(CE_ALERT,
- "XFS: dquot too small (%d) in %s.",
+ xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
item->ri_buf[1].i_len, __func__);
return XFS_ERROR(EIO);
}
@@ -2644,23 +3004,18 @@ xlog_recover_do_dquot_trans(
*/
dq_f = item->ri_buf[0].i_addr;
ASSERT(dq_f);
- if ((error = xfs_qm_dqcheck(recddq,
- dq_f->qlf_id,
- 0, XFS_QMOPT_DOWARN,
- "xlog_recover_do_dquot_trans (log copy)"))) {
+ error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+ "xlog_recover_dquot_pass2 (log copy)");
+ if (error)
return XFS_ERROR(EIO);
- }
ASSERT(dq_f->qlf_len == 1);
- error = xfs_read_buf(mp, mp->m_ddev_targp,
- dq_f->qlf_blkno,
- XFS_FSB_TO_BB(mp, dq_f->qlf_len),
- 0, &bp);
- if (error) {
- xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
- bp, dq_f->qlf_blkno);
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
+ NULL);
+ if (error)
return error;
- }
+
ASSERT(bp);
ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
@@ -2669,20 +3024,40 @@ xlog_recover_do_dquot_trans(
* was among a chunk of dquots created earlier, and we did some
* minimal initialization then.
*/
- if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
- "xlog_recover_do_dquot_trans")) {
+ error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
+ "xlog_recover_dquot_pass2");
+ if (error) {
xfs_buf_relse(bp);
return XFS_ERROR(EIO);
}
+ /*
+ * If the dquot has an LSN in it, recover the dquot only if it's less
+ * than the lsn of the transaction we are replaying.
+ */
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq;
+ xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn);
+
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ goto out_release;
+ }
+ }
+
memcpy(ddq, recddq, item->ri_buf[1].i_len);
+ if (xfs_sb_version_hascrc(&mp->m_sb)) {
+ xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk),
+ XFS_DQUOT_CRC_OFF);
+ }
ASSERT(dq_f->qlf_size == 2);
ASSERT(bp->b_target->bt_mount == mp);
- XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
- xfs_bdwrite(mp, bp);
+ bp->b_iodone = xlog_recover_iodone;
+ xfs_buf_delwri_queue(bp, buffer_list);
- return (0);
+out_release:
+ xfs_buf_relse(bp);
+ return 0;
}
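+
+/*
+ * Sketch only - this helper is hypothetical, not the kernel code. It
+ * shows how the quotaoff state recorded by xlog_recover_quotaoff_pass1()
+ * suppresses dquot replay in pass 2; the flag usage mirrors the check
+ * visible in xlog_recover_dquot_ra_pass2() below.
+ */
+static inline bool
+xlog_dquot_replay_suppressed(
+	struct xlog		*log,
+	struct xfs_disk_dquot	*recddq)
+{
+	uint type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
+
+	/* a quotaoff of this quota type was logged earlier: do not replay */
+	return (log->l_quotaoffs_flag & type) != 0;
+}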
/*
@@ -2693,38 +3068,31 @@ xlog_recover_do_dquot_trans(
* LSN.
*/
STATIC int
-xlog_recover_do_efi_trans(
- xlog_t *log,
- xlog_recover_item_t *item,
- xfs_lsn_t lsn,
- int pass)
+xlog_recover_efi_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item,
+ xfs_lsn_t lsn)
{
int error;
- xfs_mount_t *mp;
+ xfs_mount_t *mp = log->l_mp;
xfs_efi_log_item_t *efip;
xfs_efi_log_format_t *efi_formatp;
- if (pass == XLOG_RECOVER_PASS1) {
- return 0;
- }
-
efi_formatp = item->ri_buf[0].i_addr;
- mp = log->l_mp;
efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
&(efip->efi_format)))) {
xfs_efi_item_free(efip);
return error;
}
- efip->efi_next_extent = efi_formatp->efi_nextents;
- efip->efi_flags |= XFS_EFI_COMMITTED;
+ atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
spin_lock(&log->l_ailp->xa_lock);
/*
* xfs_trans_ail_update() drops the AIL lock.
*/
- xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
+ xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
return 0;
}
@@ -2737,11 +3105,10 @@ xlog_recover_do_efi_trans(
* efd format structure. If we find it, we remove the efi from the
* AIL and free it.
*/
-STATIC void
-xlog_recover_do_efd_trans(
- xlog_t *log,
- xlog_recover_item_t *item,
- int pass)
+STATIC int
+xlog_recover_efd_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
{
xfs_efd_log_format_t *efd_formatp;
xfs_efi_log_item_t *efip = NULL;
@@ -2750,10 +3117,6 @@ xlog_recover_do_efd_trans(
struct xfs_ail_cursor cur;
struct xfs_ail *ailp = log->l_ailp;
- if (pass == XLOG_RECOVER_PASS1) {
- return;
- }
-
efd_formatp = item->ri_buf[0].i_addr;
ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
@@ -2775,7 +3138,8 @@ xlog_recover_do_efd_trans(
* xfs_trans_ail_delete() drops the
* AIL lock.
*/
- xfs_trans_ail_delete(ailp, lip);
+ xfs_trans_ail_delete(ailp, lip,
+ SHUTDOWN_CORRUPT_INCORE);
xfs_efi_item_free(efip);
spin_lock(&ailp->xa_lock);
break;
@@ -2783,65 +3147,96 @@ xlog_recover_do_efd_trans(
}
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
+
+ return 0;
}
/*
- * Perform the transaction
- *
- * If the transaction modifies a buffer or inode, do it now. Otherwise,
- * EFIs and EFDs get queued up by adding entries into the AIL for them.
+ * This routine is called when an inode create format structure is found in a
+ * committed transaction in the log. Its purpose is to initialise the inodes
+ * being allocated on disk. This requires us to get inode cluster buffers that
+ * match the range to be initialised, stamp them with inode templates, and
+ * write them by delayed write so that subsequent modifications will hit the
+ * cached buffer and only need writing out at the end of recovery.
*/
STATIC int
-xlog_recover_do_trans(
- xlog_t *log,
- xlog_recover_t *trans,
- int pass)
+xlog_recover_do_icreate_pass2(
+ struct xlog *log,
+ struct list_head *buffer_list,
+ xlog_recover_item_t *item)
{
- int error = 0;
- xlog_recover_item_t *item;
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_icreate_log *icl;
+ xfs_agnumber_t agno;
+ xfs_agblock_t agbno;
+ unsigned int count;
+ unsigned int isize;
+ xfs_agblock_t length;
+
+ icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+ if (icl->icl_type != XFS_LI_ICREATE) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
+ return EINVAL;
+ }
- error = xlog_recover_reorder_trans(log, trans, pass);
- if (error)
- return error;
+ if (icl->icl_size != 1) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
+ return EINVAL;
+ }
- list_for_each_entry(item, &trans->r_itemq, ri_list) {
- trace_xfs_log_recover_item_recover(log, trans, item, pass);
- switch (ITEM_TYPE(item)) {
- case XFS_LI_BUF:
- error = xlog_recover_do_buffer_trans(log, item, pass);
- break;
- case XFS_LI_INODE:
- error = xlog_recover_do_inode_trans(log, item, pass);
- break;
- case XFS_LI_EFI:
- error = xlog_recover_do_efi_trans(log, item,
- trans->r_lsn, pass);
- break;
- case XFS_LI_EFD:
- xlog_recover_do_efd_trans(log, item, pass);
- error = 0;
- break;
- case XFS_LI_DQUOT:
- error = xlog_recover_do_dquot_trans(log, item, pass);
- break;
- case XFS_LI_QUOTAOFF:
- error = xlog_recover_do_quotaoff_trans(log, item,
- pass);
- break;
- default:
- xlog_warn(
- "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
- ASSERT(0);
- error = XFS_ERROR(EIO);
- break;
- }
+ agno = be32_to_cpu(icl->icl_ag);
+ if (agno >= mp->m_sb.sb_agcount) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
+ return EINVAL;
+ }
+ agbno = be32_to_cpu(icl->icl_agbno);
+ if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
+ return EINVAL;
+ }
+ isize = be32_to_cpu(icl->icl_isize);
+ if (isize != mp->m_sb.sb_inodesize) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
+ return EINVAL;
+ }
+ count = be32_to_cpu(icl->icl_count);
+ if (!count) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
+ return EINVAL;
+ }
+ length = be32_to_cpu(icl->icl_length);
+ if (!length || length >= mp->m_sb.sb_agblocks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
+ return EINVAL;
+ }
- if (error)
- return error;
+ /* existing allocation is fixed value */
+ ASSERT(count == mp->m_ialloc_inos);
+ ASSERT(length == mp->m_ialloc_blks);
+ if (count != mp->m_ialloc_inos ||
+ length != mp->m_ialloc_blks) {
+ xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+ return EINVAL;
}
+ /*
+ * Inode buffers can be freed. Do not replay the inode initialisation as
+ * we could be overwriting something written after this inode buffer was
+ * cancelled.
+ *
+ * XXX: we need to iterate all buffers and only init those that are not
+ * cancelled. I think that a more fine grained factoring of
+ * xfs_ialloc_inode_init may be appropriate here to enable this to be
+ * done easily.
+ */
+ if (xlog_check_buffer_cancelled(log,
+ XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0))
+ return 0;
+
+ xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length,
+ be32_to_cpu(icl->icl_gen));
return 0;
}
@@ -2852,7 +3247,7 @@ xlog_recover_do_trans(
*/
STATIC void
xlog_recover_free_trans(
- xlog_recover_t *trans)
+ struct xlog_recover *trans)
{
xlog_recover_item_t *item, *n;
int i;
@@ -2870,27 +3265,267 @@ xlog_recover_free_trans(
kmem_free(trans);
}
+STATIC void
+xlog_recover_buffer_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr;
+ struct xfs_mount *mp = log->l_mp;
+
+ if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno,
+ buf_f->blf_len, buf_f->blf_flags)) {
+ return;
+ }
+
+ xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno,
+ buf_f->blf_len, NULL);
+}
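+
+/*
+ * Note on naming: the readahead paths use xlog_peek_buffer_cancelled()
+ * where the replay paths use xlog_check_buffer_cancelled(). As used
+ * here, the peek variant only tests whether a cancellation entry
+ * exists; readahead runs ahead of replay and must not consume or alter
+ * the cancellation state that replay still depends on.
+ */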
+
+STATIC void
+xlog_recover_inode_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_inode_log_format ilf_buf;
+ struct xfs_inode_log_format *ilfp;
+ struct xfs_mount *mp = log->l_mp;
+ int error;
+
+ if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
+ ilfp = item->ri_buf[0].i_addr;
+ } else {
+ ilfp = &ilf_buf;
+ memset(ilfp, 0, sizeof(*ilfp));
+ error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp);
+ if (error)
+ return;
+ }
+
+ if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0))
+ return;
+
+ xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno,
+ ilfp->ilf_len, &xfs_inode_buf_ra_ops);
+}
+
+STATIC void
+xlog_recover_dquot_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ struct xfs_mount *mp = log->l_mp;
+ struct xfs_disk_dquot *recddq;
+ struct xfs_dq_logformat *dq_f;
+ uint type;
+
+ if (mp->m_qflags == 0)
+ return;
+
+ recddq = item->ri_buf[1].i_addr;
+ if (recddq == NULL)
+ return;
+ if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot))
+ return;
+
+ type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
+ ASSERT(type);
+ if (log->l_quotaoffs_flag & type)
+ return;
+
+ dq_f = item->ri_buf[0].i_addr;
+ ASSERT(dq_f);
+ ASSERT(dq_f->qlf_len == 1);
+
+ xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
+}
+
+STATIC void
+xlog_recover_ra_pass2(
+ struct xlog *log,
+ struct xlog_recover_item *item)
+{
+ switch (ITEM_TYPE(item)) {
+ case XFS_LI_BUF:
+ xlog_recover_buffer_ra_pass2(log, item);
+ break;
+ case XFS_LI_INODE:
+ xlog_recover_inode_ra_pass2(log, item);
+ break;
+ case XFS_LI_DQUOT:
+ xlog_recover_dquot_ra_pass2(log, item);
+ break;
+ case XFS_LI_EFI:
+ case XFS_LI_EFD:
+ case XFS_LI_QUOTAOFF:
+ default:
+ break;
+ }
+}
+
+STATIC int
+xlog_recover_commit_pass1(
+ struct xlog *log,
+ struct xlog_recover *trans,
+ struct xlog_recover_item *item)
+{
+ trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
+
+ switch (ITEM_TYPE(item)) {
+ case XFS_LI_BUF:
+ return xlog_recover_buffer_pass1(log, item);
+ case XFS_LI_QUOTAOFF:
+ return xlog_recover_quotaoff_pass1(log, item);
+ case XFS_LI_INODE:
+ case XFS_LI_EFI:
+ case XFS_LI_EFD:
+ case XFS_LI_DQUOT:
+ case XFS_LI_ICREATE:
+ /* nothing to do in pass 1 */
+ return 0;
+ default:
+ xfs_warn(log->l_mp, "%s: invalid item type (%d)",
+ __func__, ITEM_TYPE(item));
+ ASSERT(0);
+ return XFS_ERROR(EIO);
+ }
+}
+
+STATIC int
+xlog_recover_commit_pass2(
+ struct xlog *log,
+ struct xlog_recover *trans,
+ struct list_head *buffer_list,
+ struct xlog_recover_item *item)
+{
+ trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
+
+ switch (ITEM_TYPE(item)) {
+ case XFS_LI_BUF:
+ return xlog_recover_buffer_pass2(log, buffer_list, item,
+ trans->r_lsn);
+ case XFS_LI_INODE:
+ return xlog_recover_inode_pass2(log, buffer_list, item,
+ trans->r_lsn);
+ case XFS_LI_EFI:
+ return xlog_recover_efi_pass2(log, item, trans->r_lsn);
+ case XFS_LI_EFD:
+ return xlog_recover_efd_pass2(log, item);
+ case XFS_LI_DQUOT:
+ return xlog_recover_dquot_pass2(log, buffer_list, item,
+ trans->r_lsn);
+ case XFS_LI_ICREATE:
+ return xlog_recover_do_icreate_pass2(log, buffer_list, item);
+ case XFS_LI_QUOTAOFF:
+ /* nothing to do in pass2 */
+ return 0;
+ default:
+ xfs_warn(log->l_mp, "%s: invalid item type (%d)",
+ __func__, ITEM_TYPE(item));
+ ASSERT(0);
+ return XFS_ERROR(EIO);
+ }
+}
+
+STATIC int
+xlog_recover_items_pass2(
+ struct xlog *log,
+ struct xlog_recover *trans,
+ struct list_head *buffer_list,
+ struct list_head *item_list)
+{
+ struct xlog_recover_item *item;
+ int error = 0;
+
+ list_for_each_entry(item, item_list, ri_list) {
+ error = xlog_recover_commit_pass2(log, trans,
+ buffer_list, item);
+ if (error)
+ return error;
+ }
+
+ return error;
+}
+
+/*
+ * Perform the transaction.
+ *
+ * If the transaction modifies a buffer or inode, do it now. Otherwise,
+ * EFIs and EFDs get queued up by adding entries into the AIL for them.
+ */
STATIC int
xlog_recover_commit_trans(
- xlog_t *log,
- xlog_recover_t *trans,
+ struct xlog *log,
+ struct xlog_recover *trans,
int pass)
{
- int error;
+ int error = 0;
+ int error2;
+ int items_queued = 0;
+ struct xlog_recover_item *item;
+ struct xlog_recover_item *next;
+ LIST_HEAD (buffer_list);
+ LIST_HEAD (ra_list);
+ LIST_HEAD (done_list);
+
+ #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
hlist_del(&trans->r_list);
- if ((error = xlog_recover_do_trans(log, trans, pass)))
+
+ error = xlog_recover_reorder_trans(log, trans, pass);
+ if (error)
return error;
- xlog_recover_free_trans(trans); /* no error */
- return 0;
+
+ list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
+ switch (pass) {
+ case XLOG_RECOVER_PASS1:
+ error = xlog_recover_commit_pass1(log, trans, item);
+ break;
+ case XLOG_RECOVER_PASS2:
+ xlog_recover_ra_pass2(log, item);
+ list_move_tail(&item->ri_list, &ra_list);
+ items_queued++;
+ if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
+ error = xlog_recover_items_pass2(log, trans,
+ &buffer_list, &ra_list);
+ list_splice_tail_init(&ra_list, &done_list);
+ items_queued = 0;
+ }
+
+ break;
+ default:
+ ASSERT(0);
+ }
+
+ if (error)
+ goto out;
+ }
+
+out:
+ if (!list_empty(&ra_list)) {
+ if (!error)
+ error = xlog_recover_items_pass2(log, trans,
+ &buffer_list, &ra_list);
+ list_splice_tail_init(&ra_list, &done_list);
+ }
+
+ if (!list_empty(&done_list))
+ list_splice_init(&done_list, &trans->r_itemq);
+
+ xlog_recover_free_trans(trans);
+
+ error2 = xfs_buf_delwri_submit(&buffer_list);
+ return error ? error : error2;
}
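+
+/*
+ * Stand-alone illustration of the batching scheme above; the helper and
+ * its callbacks are hypothetical, not kernel code. Each item's blocks
+ * are read ahead first, and replay is deferred until
+ * XLOG_RECOVER_COMMIT_QUEUE_MAX items are queued, giving the
+ * asynchronous readahead I/O time to complete before the items are
+ * processed against now-cached buffers.
+ */
+static int
+replay_batched(
+	struct xlog_recover_item *items[],
+	int		nitems,
+	void		(*ra)(struct xlog_recover_item *),
+	int		(*process)(struct xlog_recover_item *))
+{
+	int		i, j, start = 0, error = 0;
+
+	for (i = 0; i < nitems && !error; i++) {
+		ra(items[i]);		/* kick off I/O early */
+		if (i - start + 1 >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
+			for (j = start; j <= i && !error; j++)
+				error = process(items[j]);	/* drain batch */
+			start = i + 1;
+		}
+	}
+	for (j = start; j < nitems && !error; j++)	/* drain the tail */
+		error = process(items[j]);
+	return error;
+}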
STATIC int
xlog_recover_unmount_trans(
- xlog_recover_t *trans)
+ struct xlog *log)
{
/* Do nothing now */
- xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
+ xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
return 0;
}
@@ -2905,9 +3540,9 @@ xlog_recover_unmount_trans(
*/
STATIC int
xlog_recover_process_data(
- xlog_t *log,
+ struct xlog *log,
struct hlist_head rhash[],
- xlog_rec_header_t *rhead,
+ struct xlog_rec_header *rhead,
xfs_caddr_t dp,
int pass)
{
@@ -2933,8 +3568,8 @@ xlog_recover_process_data(
dp += sizeof(xlog_op_header_t);
if (ohead->oh_clientid != XFS_TRANSACTION &&
ohead->oh_clientid != XFS_LOG) {
- xlog_warn(
- "XFS: xlog_recover_process_data: bad clientid");
+ xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
+ __func__, ohead->oh_clientid);
ASSERT(0);
return (XFS_ERROR(EIO));
}
@@ -2947,8 +3582,8 @@ xlog_recover_process_data(
be64_to_cpu(rhead->h_lsn));
} else {
if (dp + be32_to_cpu(ohead->oh_len) > lp) {
- xlog_warn(
- "XFS: xlog_recover_process_data: bad length");
+ xfs_warn(log->l_mp, "%s: bad length 0x%x",
+ __func__, be32_to_cpu(ohead->oh_len));
WARN_ON(1);
return (XFS_ERROR(EIO));
}
@@ -2961,7 +3596,7 @@ xlog_recover_process_data(
trans, pass);
break;
case XLOG_UNMOUNT_TRANS:
- error = xlog_recover_unmount_trans(trans);
+ error = xlog_recover_unmount_trans(log);
break;
case XLOG_WAS_CONT_TRANS:
error = xlog_recover_add_to_cont_trans(log,
@@ -2969,8 +3604,8 @@ xlog_recover_process_data(
be32_to_cpu(ohead->oh_len));
break;
case XLOG_START_TRANS:
- xlog_warn(
- "XFS: xlog_recover_process_data: bad transaction");
+ xfs_warn(log->l_mp, "%s: bad transaction",
+ __func__);
ASSERT(0);
error = XFS_ERROR(EIO);
break;
@@ -2980,14 +3615,16 @@ xlog_recover_process_data(
dp, be32_to_cpu(ohead->oh_len));
break;
default:
- xlog_warn(
- "XFS: xlog_recover_process_data: bad flag");
+ xfs_warn(log->l_mp, "%s: bad flag 0x%x",
+ __func__, flags);
ASSERT(0);
error = XFS_ERROR(EIO);
break;
}
- if (error)
+ if (error) {
+ xlog_recover_free_trans(trans);
return error;
+ }
}
dp += be32_to_cpu(ohead->oh_len);
num_logops--;
@@ -3011,7 +3648,7 @@ xlog_recover_process_efi(
xfs_extent_t *extp;
xfs_fsblock_t startblock_fsb;
- ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));
+ ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
/*
* First check the validity of the extents described by the
@@ -3030,13 +3667,14 @@ xlog_recover_process_efi(
* This will pull the EFI from the AIL and
* free the memory associated with it.
*/
+ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
xfs_efi_release(efip, efip->efi_format.efi_nextents);
return XFS_ERROR(EIO);
}
}
tp = xfs_trans_alloc(mp, 0);
- error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
if (error)
goto abort_error;
efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
@@ -3050,7 +3688,7 @@ xlog_recover_process_efi(
extp->ext_len);
}
- efip->efi_flags |= XFS_EFI_RECOVERED;
+ set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
error = xfs_trans_commit(tp, 0);
return error;
@@ -3079,7 +3717,7 @@ abort_error:
*/
STATIC int
xlog_recover_process_efis(
- xlog_t *log)
+ struct xlog *log)
{
xfs_log_item_t *lip;
xfs_efi_log_item_t *efip;
@@ -3107,7 +3745,7 @@ xlog_recover_process_efis(
* Skip EFIs that we've already processed.
*/
efip = (xfs_efi_log_item_t *)lip;
- if (efip->efi_flags & XFS_EFI_RECOVERED) {
+ if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
lip = xfs_trans_ail_cursor_next(ailp, &cur);
continue;
}
@@ -3120,7 +3758,7 @@ xlog_recover_process_efis(
lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
out:
- xfs_trans_ail_cursor_done(ailp, &cur);
+ xfs_trans_ail_cursor_done(&cur);
spin_unlock(&ailp->xa_lock);
return error;
}
@@ -3142,8 +3780,7 @@ xlog_recover_clear_agi_bucket(
int error;
tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
- error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
- 0, 0, 0);
+ error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0);
if (error)
goto out_abort;
@@ -3166,8 +3803,7 @@ xlog_recover_clear_agi_bucket(
out_abort:
xfs_trans_cancel(tp, XFS_TRANS_ABORT);
out_error:
- xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: "
- "failed to clear agi %d. Continuing.", agno);
+ xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
return;
}
@@ -3192,7 +3828,7 @@ xlog_recover_process_one_iunlink(
/*
* Get the on disk inode to find the next inode in the bucket.
*/
- error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
+ error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
if (error)
goto fail_iput;
@@ -3241,7 +3877,7 @@ xlog_recover_process_one_iunlink(
*/
STATIC void
xlog_recover_process_iunlinks(
- xlog_t *log)
+ struct xlog *log)
{
xfs_mount_t *mp;
xfs_agnumber_t agno;
@@ -3274,116 +3910,83 @@ xlog_recover_process_iunlinks(
*/
continue;
}
+ /*
+ * Unlock the buffer so that it can be acquired in the normal
+ * course of the transaction to truncate and free each inode.
+ * Because we are not racing with anyone else here for the AGI
+ * buffer, we don't even need to hold it locked to read the
+	 * initial unlinked bucket entries out of the buffer. We keep a
+	 * buffer reference though, so that it stays pinned in memory
+ * while we need the buffer.
+ */
agi = XFS_BUF_TO_AGI(agibp);
+ xfs_buf_unlock(agibp);
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
agino = be32_to_cpu(agi->agi_unlinked[bucket]);
while (agino != NULLAGINO) {
- /*
- * Release the agi buffer so that it can
- * be acquired in the normal course of the
- * transaction to truncate and free the inode.
- */
- xfs_buf_relse(agibp);
-
agino = xlog_recover_process_one_iunlink(mp,
agno, agino, bucket);
-
- /*
- * Reacquire the agibuffer and continue around
- * the loop. This should never fail as we know
- * the buffer was good earlier on.
- */
- error = xfs_read_agi(mp, NULL, agno, &agibp);
- ASSERT(error == 0);
- agi = XFS_BUF_TO_AGI(agibp);
}
}
-
- /*
- * Release the buffer for the current agi so we can
- * go on to the next one.
- */
- xfs_buf_relse(agibp);
+ xfs_buf_rele(agibp);
}
mp->m_dmevmask = mp_dmevmask;
}
-
-#ifdef DEBUG
-STATIC void
-xlog_pack_data_checksum(
- xlog_t *log,
- xlog_in_core_t *iclog,
- int size)
-{
- int i;
- __be32 *up;
- uint chksum = 0;
-
- up = (__be32 *)iclog->ic_datap;
- /* divide length by 4 to get # words */
- for (i = 0; i < (size >> 2); i++) {
- chksum ^= be32_to_cpu(*up);
- up++;
- }
- iclog->ic_header.h_chksum = cpu_to_be32(chksum);
-}
-#else
-#define xlog_pack_data_checksum(log, iclog, size)
-#endif
-
/*
- * Stamp cycle number in every block
+ * Unpack the log buffer data and CRC check it. If the check fails, issue a
+ * warning if and only if the CRC in the header is non-zero. This makes the
+ * check an advisory warning, and the zero CRC check will prevent failure
+ * warnings from being emitted when upgrading the kernel from one that does not
+ * add CRCs by default.
+ *
+ * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
+ * corruption failure.
*/
-void
-xlog_pack_data(
- xlog_t *log,
- xlog_in_core_t *iclog,
- int roundoff)
+STATIC int
+xlog_unpack_data_crc(
+ struct xlog_rec_header *rhead,
+ xfs_caddr_t dp,
+ struct xlog *log)
{
- int i, j, k;
- int size = iclog->ic_offset + roundoff;
- __be32 cycle_lsn;
- xfs_caddr_t dp;
-
- xlog_pack_data_checksum(log, iclog, size);
-
- cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
-
- dp = iclog->ic_datap;
- for (i = 0; i < BTOBB(size) &&
- i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
- iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
- *(__be32 *)dp = cycle_lsn;
- dp += BBSIZE;
- }
-
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- xlog_in_core_2_t *xhdr = iclog->ic_data;
-
- for ( ; i < BTOBB(size); i++) {
- j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
- *(__be32 *)dp = cycle_lsn;
- dp += BBSIZE;
+ __le32 crc;
+
+ crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+ if (crc != rhead->h_crc) {
+ if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+ xfs_alert(log->l_mp,
+ "log record CRC mismatch: found 0x%x, expected 0x%x.",
+ le32_to_cpu(rhead->h_crc),
+ le32_to_cpu(crc));
+ xfs_hex_dump(dp, 32);
}
- for (i = 1; i < log->l_iclog_heads; i++) {
- xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
- }
+ /*
+ * If we've detected a log record corruption, then we can't
+ * recover past this point. Abort recovery if we are enforcing
+ * CRC protection by punting an error back up the stack.
+ */
+ if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
+ return EFSCORRUPTED;
}
+
+ return 0;
}
-STATIC void
+STATIC int
xlog_unpack_data(
- xlog_rec_header_t *rhead,
+ struct xlog_rec_header *rhead,
xfs_caddr_t dp,
- xlog_t *log)
+ struct xlog *log)
{
int i, j, k;
+ int error;
+
+ error = xlog_unpack_data_crc(rhead, dp, log);
+ if (error)
+ return error;
for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3400,17 +4003,19 @@ xlog_unpack_data(
dp += BBSIZE;
}
}
+
+ return 0;
}
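+
+/*
+ * The round trip being undone here, sketched for a single record header
+ * (BBSIZE is the 512-byte basic block size): at write time the first
+ * __be32 of every basic block in the record was saved into
+ * h_cycle_data[] and overwritten with the cycle/LSN stamp - see the
+ * removed xlog_pack_data() above. xlog_unpack_data() copies each saved
+ * word back so the payload matches what the transaction logged.
+ */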
STATIC int
xlog_valid_rec_header(
- xlog_t *log,
- xlog_rec_header_t *rhead,
+ struct xlog *log,
+ struct xlog_rec_header *rhead,
xfs_daddr_t blkno)
{
int hlen;
- if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) {
+ if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
XFS_ERRLEVEL_LOW, log->l_mp);
return XFS_ERROR(EFSCORRUPTED);
@@ -3418,7 +4023,7 @@ xlog_valid_rec_header(
if (unlikely(
(!rhead->h_version ||
(be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
- xlog_warn("XFS: %s: unrecognised log version (%d).",
+ xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
__func__, be32_to_cpu(rhead->h_version));
return XFS_ERROR(EIO);
}
@@ -3448,7 +4053,7 @@ xlog_valid_rec_header(
*/
STATIC int
xlog_do_recovery_pass(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk,
int pass)
@@ -3531,9 +4136,13 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- xlog_unpack_data(rhead, offset, log);
- if ((error = xlog_recover_process_data(log,
- rhash, rhead, offset, pass)))
+ error = xlog_unpack_data(rhead, offset, log);
+ if (error)
+ goto bread_err2;
+
+ error = xlog_recover_process_data(log,
+ rhash, rhead, offset, pass);
+ if (error)
goto bread_err2;
blk_no += bblks + hblks;
}
@@ -3548,7 +4157,7 @@ xlog_do_recovery_pass(
/*
* Check for header wrapping around physical end-of-log
*/
- offset = XFS_BUF_PTR(hbp);
+ offset = hbp->b_addr;
split_hblks = 0;
wrapped_hblks = 0;
if (blk_no + hblks <= log->l_logBBsize) {
@@ -3584,19 +4193,9 @@ xlog_do_recovery_pass(
* - order is important.
*/
wrapped_hblks = hblks - split_hblks;
- error = XFS_BUF_SET_PTR(hbp,
- offset + BBTOB(split_hblks),
- BBTOB(hblks - split_hblks));
- if (error)
- goto bread_err2;
-
- error = xlog_bread_noalign(log, 0,
- wrapped_hblks, hbp);
- if (error)
- goto bread_err2;
-
- error = XFS_BUF_SET_PTR(hbp, offset,
- BBTOB(hblks));
+ error = xlog_bread_offset(log, 0,
+ wrapped_hblks, hbp,
+ offset + BBTOB(split_hblks));
if (error)
goto bread_err2;
}
@@ -3618,7 +4217,7 @@ xlog_do_recovery_pass(
} else {
/* This log record is split across the
* physical end of log */
- offset = XFS_BUF_PTR(dbp);
+ offset = dbp->b_addr;
split_bblks = 0;
if (blk_no != log->l_logBBsize) {
/* some data is before the physical
@@ -3647,25 +4246,20 @@ xlog_do_recovery_pass(
* _first_, then the log start (LR header end)
* - order is important.
*/
- error = XFS_BUF_SET_PTR(dbp,
- offset + BBTOB(split_bblks),
- BBTOB(bblks - split_bblks));
+ error = xlog_bread_offset(log, 0,
+ bblks - split_bblks, dbp,
+ offset + BBTOB(split_bblks));
if (error)
goto bread_err2;
+ }
- error = xlog_bread_noalign(log, wrapped_hblks,
- bblks - split_bblks,
- dbp);
- if (error)
- goto bread_err2;
+ error = xlog_unpack_data(rhead, offset, log);
+ if (error)
+ goto bread_err2;
- error = XFS_BUF_SET_PTR(dbp, offset, h_size);
- if (error)
- goto bread_err2;
- }
- xlog_unpack_data(rhead, offset, log);
- if ((error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass)))
+ error = xlog_recover_process_data(log, rhash,
+ rhead, offset, pass);
+ if (error)
goto bread_err2;
blk_no += bblks;
}
@@ -3690,9 +4284,13 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- xlog_unpack_data(rhead, offset, log);
- if ((error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass)))
+ error = xlog_unpack_data(rhead, offset, log);
+ if (error)
+ goto bread_err2;
+
+ error = xlog_recover_process_data(log, rhash,
+ rhead, offset, pass);
+ if (error)
goto bread_err2;
blk_no += bblks + hblks;
}
@@ -3720,11 +4318,11 @@ xlog_do_recovery_pass(
*/
STATIC int
xlog_do_log_recovery(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
{
- int error;
+ int error, i;
ASSERT(head_blk != tail_blk);
@@ -3732,10 +4330,12 @@ xlog_do_log_recovery(
* First do a pass to find all of the cancelled buf log items.
* Store them in the buf_cancel_table for use in the second pass.
*/
- log->l_buf_cancel_table =
- (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
- sizeof(xfs_buf_cancel_t*),
+ log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
+ sizeof(struct list_head),
KM_SLEEP);
+ for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+ INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+
error = xlog_do_recovery_pass(log, head_blk, tail_blk,
XLOG_RECOVER_PASS1);
if (error != 0) {
@@ -3754,7 +4354,7 @@ xlog_do_log_recovery(
int i;
for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
- ASSERT(log->l_buf_cancel_table[i] == NULL);
+ ASSERT(list_empty(&log->l_buf_cancel_table[i]));
}
#endif /* DEBUG */
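+
+/*
+ * Hypothetical helper, shown only to illustrate the table's new shape:
+ * the cancel table is an array of list heads bucketed by block number.
+ * Pass 1 adds an entry for each cancelled buffer; pass 2 searches the
+ * bucket via xlog_check_buffer_cancelled() before replaying a buffer.
+ * The modulo bucketing below is an assumption about the kernel's
+ * scheme, not a quote of it.
+ */
+static inline struct list_head *
+xlog_buf_cancel_bucket(
+	struct xlog	*log,
+	xfs_daddr_t	blkno)
+{
+	return &log->l_buf_cancel_table[(__uint64_t)blkno % XLOG_BC_TABLE_SIZE];
+}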
@@ -3769,7 +4369,7 @@ xlog_do_log_recovery(
*/
STATIC int
xlog_do_recover(
- xlog_t *log,
+ struct xlog *log,
xfs_daddr_t head_blk,
xfs_daddr_t tail_blk)
{
@@ -3781,11 +4381,8 @@ xlog_do_recover(
* First replay the images in the log.
*/
error = xlog_do_log_recovery(log, head_blk, tail_blk);
- if (error) {
+ if (error)
return error;
- }
-
- XFS_bflush(log->l_mp->m_ddev_targp);
/*
* If IO errors happened during recovery, bail out.
@@ -3807,19 +4404,24 @@ xlog_do_recover(
/*
* Now that we've finished replaying all buffer and inode
- * updates, re-read in the superblock.
+ * updates, re-read in the superblock and reverify it.
*/
bp = xfs_getsb(log->l_mp, 0);
XFS_BUF_UNDONE(bp);
ASSERT(!(XFS_BUF_ISWRITE(bp)));
- ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
XFS_BUF_READ(bp);
XFS_BUF_UNASYNC(bp);
- xfsbdstrat(log->l_mp, bp);
+ bp->b_ops = &xfs_sb_buf_ops;
+
+ if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+ xfs_buf_relse(bp);
+ return XFS_ERROR(EIO);
+ }
+
+ xfs_buf_iorequest(bp);
error = xfs_buf_iowait(bp);
if (error) {
- xfs_ioerror_alert("xlog_do_recover",
- log->l_mp, bp, XFS_BUF_ADDR(bp));
+ xfs_buf_ioerror_alert(bp, __func__);
ASSERT(0);
xfs_buf_relse(bp);
return error;
@@ -3849,7 +4451,7 @@ xlog_do_recover(
*/
int
xlog_recover(
- xlog_t *log)
+ struct xlog *log)
{
xfs_daddr_t head_blk, tail_blk;
int error;
@@ -3874,10 +4476,28 @@ xlog_recover(
return error;
}
- cmn_err(CE_NOTE,
- "Starting XFS recovery on filesystem: %s (logdev: %s)",
- log->l_mp->m_fsname, log->l_mp->m_logname ?
- log->l_mp->m_logname : "internal");
+ /*
+ * Version 5 superblock log feature mask validation. We know the
+ * log is dirty so check if there are any unknown log features
+ * in what we need to recover. If there are unknown features
+	 * (e.g. unsupported transactions), then simply reject the
+ * attempt at recovery before touching anything.
+ */
+ if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
+ xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
+ XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
+ xfs_warn(log->l_mp,
+"Superblock has unknown incompatible log features (0x%x) enabled.\n"
+"The log can not be fully and/or safely recovered by this kernel.\n"
+"Please recover the log on a kernel that supports the unknown features.",
+ (log->l_mp->m_sb.sb_features_log_incompat &
+ XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+ return EINVAL;
+ }
+
+ xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
+ log->l_mp->m_logname ? log->l_mp->m_logname
+ : "internal");
error = xlog_do_recover(log, head_blk, tail_blk);
log->l_flags |= XLOG_RECOVERY_NEEDED;
@@ -3896,7 +4516,7 @@ xlog_recover(
*/
int
xlog_recover_finish(
- xlog_t *log)
+ struct xlog *log)
{
/*
* Now we're ready to do the transactions needed for the
@@ -3910,9 +4530,7 @@ xlog_recover_finish(
int error;
error = xlog_recover_process_efis(log);
if (error) {
- cmn_err(CE_ALERT,
- "Failed to recover EFIs on filesystem: %s",
- log->l_mp->m_fsname);
+ xfs_alert(log->l_mp, "Failed to recover EFIs");
return error;
}
/*
@@ -3927,15 +4545,12 @@ xlog_recover_finish(
xlog_recover_check_summary(log);
- cmn_err(CE_NOTE,
- "Ending XFS recovery on filesystem: %s (logdev: %s)",
- log->l_mp->m_fsname, log->l_mp->m_logname ?
- log->l_mp->m_logname : "internal");
+ xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
+ log->l_mp->m_logname ? log->l_mp->m_logname
+ : "internal");
log->l_flags &= ~XLOG_RECOVERY_NEEDED;
} else {
- cmn_err(CE_DEBUG,
- "!Ending clean XFS mount for filesystem: %s\n",
- log->l_mp->m_fsname);
+ xfs_info(log->l_mp, "Ending clean mount");
}
return 0;
}
@@ -3948,7 +4563,7 @@ xlog_recover_finish(
*/
void
xlog_recover_check_summary(
- xlog_t *log)
+ struct xlog *log)
{
xfs_mount_t *mp;
xfs_agf_t *agfp;
@@ -3968,10 +4583,8 @@ xlog_recover_check_summary(
for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
if (error) {
- xfs_fs_cmn_err(CE_ALERT, mp,
- "xlog_recover_check_summary(agf)"
- "agf read failed agno %d error %d",
- agno, error);
+ xfs_alert(mp, "%s agf read failed agno %d error %d",
+ __func__, agno, error);
} else {
agfp = XFS_BUF_TO_AGF(agfbp);
freeblks += be32_to_cpu(agfp->agf_freeblks) +
@@ -3980,7 +4593,10 @@ xlog_recover_check_summary(
}
error = xfs_read_agi(mp, NULL, agno, &agibp);
- if (!error) {
+ if (error) {
+ xfs_alert(mp, "%s agi read failed agno %d error %d",
+ __func__, agno, error);
+ } else {
struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
itotal += be32_to_cpu(agi->agi_count);