Diffstat (limited to 'fs/xfs/xfs_log_recover.c')
 fs/xfs/xfs_log_recover.c | 2478
 1 file changed, 1547 insertions(+), 931 deletions(-)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 966d3f97458..981af0f6504 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -17,42 +17,65 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_error.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" +#include "xfs_trans.h" +#include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_buf_item.h" #include "xfs_log_recover.h" +#include "xfs_inode_item.h" #include "xfs_extfree_item.h" #include "xfs_trans_priv.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" #include "xfs_quota.h" -#include "xfs_rw.h" -#include "xfs_utils.h" +#include "xfs_cksum.h" #include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_error.h" +#include "xfs_dir2.h" + +#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) -STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); -STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); +STATIC int +xlog_find_zeroed( + struct xlog *, + xfs_daddr_t *); +STATIC int +xlog_clear_stale_blocks( + struct xlog *, + xfs_lsn_t); #if defined(DEBUG) -STATIC void xlog_recover_check_summary(xlog_t *); +STATIC void +xlog_recover_check_summary( + struct xlog *); #else #define xlog_recover_check_summary(log) #endif /* + * This structure is used during recovery to record the buf log items which + * have been canceled and should not be replayed. + */ +struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct list_head bc_list; +}; + +/* * Sector aligned buffer routines for buffer create/read/write/access */ @@ -64,7 +87,7 @@ STATIC void xlog_recover_check_summary(xlog_t *); static inline int xlog_buf_bbcount_valid( - xlog_t *log, + struct xlog *log, int bbcount) { return bbcount > 0 && bbcount <= log->l_logBBsize; @@ -77,11 +100,13 @@ xlog_buf_bbcount_valid( */ STATIC xfs_buf_t * xlog_get_bp( - xlog_t *log, + struct xlog *log, int nbblks) { + struct xfs_buf *bp; + if (!xlog_buf_bbcount_valid(log, nbblks)) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", nbblks); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return NULL; @@ -90,7 +115,7 @@ xlog_get_bp( /* * We do log I/O in units of log sectors (a power-of-2 * multiple of the basic block size), so we round up the - * requested size to acommodate the basic blocks required + * requested size to accommodate the basic blocks required * for complete log sectors. * * In addition, the buffer may be used for a non-sector- @@ -101,14 +126,16 @@ xlog_get_bp( * an issue. Nor will this be a problem if the log I/O is * done in basic blocks (sector size 1). But otherwise we * extend the buffer by one extra log sector to ensure - * there's space to accomodate this possiblility. + * there's space to accommodate this possibility. 
*/ if (nbblks > 1 && log->l_sectBBsize > 1) nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - return xfs_buf_get_uncached(log->l_mp->m_logdev_targp, - BBTOB(nbblks), 0); + bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0); + if (bp) + xfs_buf_unlock(bp); + return bp; } STATIC void @@ -124,15 +151,15 @@ xlog_put_bp( */ STATIC xfs_caddr_t xlog_align( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); - ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); - return XFS_BUF_PTR(bp) + BBTOB(offset); + ASSERT(offset + nbblks <= bp->b_length); + return bp->b_addr + BBTOB(offset); } @@ -141,15 +168,15 @@ xlog_align( */ STATIC int xlog_bread_noalign( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { int error; if (!xlog_buf_bbcount_valid(log, nbblks)) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", nbblks); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return EFSCORRUPTED; @@ -159,28 +186,29 @@ xlog_bread_noalign( nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); - XFS_BUF_BUSY(bp); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); + bp->b_io_length = nbblks; + bp->b_error = 0; - xfsbdstrat(log->l_mp, bp); + if (XFS_FORCED_SHUTDOWN(log->l_mp)) + return XFS_ERROR(EIO); + + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) - xfs_ioerror_alert("xlog_bread", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); return error; } STATIC int xlog_bread( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_caddr_t *offset) { int error; @@ -194,21 +222,50 @@ xlog_bread( } /* + * Read at an offset into the buffer. Returns with the buffer in it's original + * state regardless of the result of the read. + */ +STATIC int +xlog_bread_offset( + struct xlog *log, + xfs_daddr_t blk_no, /* block to read from */ + int nbblks, /* blocks to read */ + struct xfs_buf *bp, + xfs_caddr_t offset) +{ + xfs_caddr_t orig_offset = bp->b_addr; + int orig_len = BBTOB(bp->b_length); + int error, error2; + + error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); + if (error) + return error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + + /* must reset buffer pointer even on error */ + error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len); + if (error) + return error; + return error2; +} + +/* * Write out the buffer at the given block for the given number of blocks. * The buffer is kept locked across the write and is returned locked. * This can only be used for synchronous log writes. 
*/ STATIC int xlog_bwrite( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { int error; if (!xlog_buf_bbcount_valid(log, nbblks)) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", nbblks); XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return EFSCORRUPTED; @@ -218,19 +275,19 @@ xlog_bwrite( nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_BUSY(bp); - XFS_BUF_HOLD(bp); - XFS_BUF_PSEMA(bp, PRIBIO); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); - - if ((error = xfs_bwrite(log->l_mp, bp))) - xfs_ioerror_alert("xlog_bwrite", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + xfs_buf_hold(bp); + xfs_buf_lock(bp); + bp->b_io_length = nbblks; + bp->b_error = 0; + + error = xfs_bwrite(bp); + if (error) + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); return error; } @@ -243,9 +300,9 @@ xlog_header_check_dump( xfs_mount_t *mp, xlog_rec_header_t *head) { - cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n", + xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", __func__, &mp->m_sb.sb_uuid, XLOG_FMT); - cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n", + xfs_debug(mp, " log : uuid = %pU, fmt = %d", &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); } #else @@ -260,23 +317,23 @@ xlog_header_check_recover( xfs_mount_t *mp, xlog_rec_header_t *head) { - ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); + ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); /* * IRIX doesn't write the h_fmt field and leaves it zeroed * (XLOG_FMT_UNKNOWN). This stops us from trying to recover * a dirty log created in IRIX. */ - if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { - xlog_warn( - "XFS: dirty log written in incompatible format - can't recover"); + if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) { + xfs_warn(mp, + "dirty log written in incompatible format - can't recover"); xlog_header_check_dump(mp, head); XFS_ERROR_REPORT("xlog_header_check_recover(1)", XFS_ERRLEVEL_HIGH, mp); return XFS_ERROR(EFSCORRUPTED); } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { - xlog_warn( - "XFS: dirty log entry has mismatched uuid - can't recover"); + xfs_warn(mp, + "dirty log entry has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); XFS_ERROR_REPORT("xlog_header_check_recover(2)", XFS_ERRLEVEL_HIGH, mp); @@ -293,7 +350,7 @@ xlog_header_check_mount( xfs_mount_t *mp, xlog_rec_header_t *head) { - ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); + ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); if (uuid_is_nil(&head->h_fs_uuid)) { /* @@ -301,9 +358,9 @@ xlog_header_check_mount( * h_fs_uuid is nil, we assume this log was last mounted * by IRIX and continue. 
*/ - xlog_warn("XFS: nil uuid in log - IRIX style log"); + xfs_warn(mp, "nil uuid in log - IRIX style log"); } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { - xlog_warn("XFS: log has mismatched uuid - can't recover"); + xfs_warn(mp, "log has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); XFS_ERROR_REPORT("xlog_header_check_mount", XFS_ERRLEVEL_HIGH, mp); @@ -316,18 +373,16 @@ STATIC void xlog_recover_iodone( struct xfs_buf *bp) { - if (XFS_BUF_GETERROR(bp)) { + if (bp->b_error) { /* * We're not going to bother about retrying * this during recovery. One strike! */ - xfs_ioerror_alert("xlog_recover_iodone", - bp->b_target->bt_mount, bp, - XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); xfs_force_shutdown(bp->b_target->bt_mount, SHUTDOWN_META_IO_ERROR); } - XFS_BUF_CLR_IODONE_FUNC(bp); + bp->b_iodone = NULL; xfs_buf_ioend(bp, 0); } @@ -339,8 +394,8 @@ xlog_recover_iodone( */ STATIC int xlog_find_cycle_start( - xlog_t *log, - xfs_buf_t *bp, + struct xlog *log, + struct xfs_buf *bp, xfs_daddr_t first_blk, xfs_daddr_t *last_blk, uint cycle) @@ -382,7 +437,7 @@ xlog_find_cycle_start( */ STATIC int xlog_find_verify_cycle( - xlog_t *log, + struct xlog *log, xfs_daddr_t start_blk, int nbblks, uint stop_on_cycle_no, @@ -402,6 +457,8 @@ xlog_find_verify_cycle( * a log sector, or we're out of luck. */ bufblks = 1 << ffs(nbblks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < log->l_sectBBsize) @@ -449,7 +506,7 @@ out: */ STATIC int xlog_find_verify_log_record( - xlog_t *log, + struct xlog *log, xfs_daddr_t start_blk, xfs_daddr_t *last_blk, int extra_bblks) @@ -479,8 +536,8 @@ xlog_find_verify_log_record( for (i = (*last_blk) - 1; i >= 0; i--) { if (i < start_blk) { /* valid log record not found */ - xlog_warn( - "XFS: Log inconsistent (didn't find previous header)"); + xfs_warn(log->l_mp, + "Log inconsistent (didn't find previous header)"); ASSERT(0); error = XFS_ERROR(EIO); goto out; @@ -494,7 +551,7 @@ xlog_find_verify_log_record( head = (xlog_rec_header_t *)offset; - if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno)) + if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) break; if (!smallmem) @@ -546,7 +603,7 @@ out: /* * Head is defined to be the point of the log where the next log write - * write could go. This means that incomplete LR writes at the end are + * could go. This means that incomplete LR writes at the end are * eliminated when calculating the head. We aren't guaranteed that previous * LR have complete transactions. 
We only know that a cycle number of * current cycle number -1 won't be present in the log if we start writing @@ -559,7 +616,7 @@ out: */ STATIC int xlog_find_head( - xlog_t *log, + struct xlog *log, xfs_daddr_t *return_head_blk) { xfs_buf_t *bp; @@ -580,12 +637,12 @@ xlog_find_head( * mkfs etc write a dummy unmount record to a fresh * log so we can store the uuid in there */ - xlog_warn("XFS: totally zeroed log"); + xfs_warn(log->l_mp, "totally zeroed log"); } return 0; } else if (error) { - xlog_warn("XFS: empty log check failed"); + xfs_warn(log->l_mp, "empty log check failed"); return error; } @@ -808,7 +865,7 @@ validate_head: xlog_put_bp(bp); if (error) - xlog_warn("XFS: failed to find log head"); + xfs_warn(log->l_mp, "failed to find log head"); return error; } @@ -830,7 +887,7 @@ validate_head: */ STATIC int xlog_find_tail( - xlog_t *log, + struct xlog *log, xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk) { @@ -876,7 +933,7 @@ xlog_find_tail( if (error) goto done; - if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { + if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { found = 1; break; } @@ -893,15 +950,16 @@ xlog_find_tail( if (error) goto done; - if (XLOG_HEADER_MAGIC_NUM == - be32_to_cpu(*(__be32 *)offset)) { + if (*(__be32 *)offset == + cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { found = 2; break; } } } if (!found) { - xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); + xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + xlog_put_bp(bp); ASSERT(0); return XFS_ERROR(EIO); } @@ -925,12 +983,12 @@ xlog_find_tail( log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); if (found == 2) log->l_curr_cycle++; - log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); - log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); - log->l_grant_reserve_cycle = log->l_curr_cycle; - log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); - log->l_grant_write_cycle = log->l_curr_cycle; - log->l_grant_write_bytes = BBTOB(log->l_curr_block); + atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); + atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); /* * Look for unmount record. If we find it, then we know there @@ -960,7 +1018,7 @@ xlog_find_tail( } after_umount_blk = (i + hblks + (int) BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; - tail_lsn = log->l_tail_lsn; + tail_lsn = atomic64_read(&log->l_tail_lsn); if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { umount_data_blk = (i + hblks) % log->l_logBBsize; @@ -975,12 +1033,10 @@ xlog_find_tail( * log records will point recovery to after the * current unmount record. 
*/ - log->l_tail_lsn = - xlog_assign_lsn(log->l_curr_cycle, - after_umount_blk); - log->l_last_sync_lsn = - xlog_assign_lsn(log->l_curr_cycle, - after_umount_blk); + xlog_assign_atomic_lsn(&log->l_tail_lsn, + log->l_curr_cycle, after_umount_blk); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, + log->l_curr_cycle, after_umount_blk); *tail_blk = after_umount_blk; /* @@ -1019,7 +1075,7 @@ done: xlog_put_bp(bp); if (error) - xlog_warn("XFS: failed to locate log tail"); + xfs_warn(log->l_mp, "failed to locate log tail"); return error; } @@ -1041,7 +1097,7 @@ done: */ STATIC int xlog_find_zeroed( - xlog_t *log, + struct xlog *log, xfs_daddr_t *blk_no) { xfs_buf_t *bp; @@ -1083,8 +1139,10 @@ xlog_find_zeroed( * the first block must be 1. If it's not, maybe we're * not looking at a log... Bail out. */ - xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); - return XFS_ERROR(EINVAL); + xfs_warn(log->l_mp, + "Log inconsistent or not a log (last==0, first!=1)"); + error = XFS_ERROR(EINVAL); + goto bp_err; } /* we have a partially zeroed log */ @@ -1143,7 +1201,7 @@ bp_err: */ STATIC void xlog_add_record( - xlog_t *log, + struct xlog *log, xfs_caddr_t buf, int cycle, int block, @@ -1165,7 +1223,7 @@ xlog_add_record( STATIC int xlog_write_log_records( - xlog_t *log, + struct xlog *log, int cycle, int start_block, int blocks, @@ -1188,6 +1246,8 @@ xlog_write_log_records( * log sector, or we're out of luck. */ bufblks = 1 << ffs(blocks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < sectbb) @@ -1219,20 +1279,12 @@ xlog_write_log_records( */ ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { - offset = XFS_BUF_PTR(bp); - balign = BBTOB(ealign - start_block); - error = XFS_BUF_SET_PTR(bp, offset + balign, - BBTOB(sectbb)); + offset = bp->b_addr + BBTOB(ealign - start_block); + error = xlog_bread_offset(log, ealign, sectbb, + bp, offset); if (error) break; - error = xlog_bread_noalign(log, ealign, sectbb, bp); - if (error) - break; - - error = XFS_BUF_SET_PTR(bp, offset, bufblks); - if (error) - break; } offset = xlog_align(log, start_block, endcount, bp); @@ -1271,7 +1323,7 @@ xlog_write_log_records( */ STATIC int xlog_clear_stale_blocks( - xlog_t *log, + struct xlog *log, xfs_lsn_t tail_lsn) { int tail_cycle, head_cycle; @@ -1398,9 +1450,8 @@ xlog_recover_find_tid( xlog_tid_t tid) { xlog_recover_t *trans; - struct hlist_node *n; - hlist_for_each_entry(trans, n, head, r_list) { + hlist_for_each_entry(trans, head, r_list) { if (trans->r_log_tid == tid) return trans; } @@ -1437,8 +1488,8 @@ xlog_recover_add_item( STATIC int xlog_recover_add_to_cont_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, xfs_caddr_t dp, int len) { @@ -1460,7 +1511,7 @@ xlog_recover_add_to_cont_trans( old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u); + ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); memcpy(&ptr[old_len], dp, len); /* d, s, l */ item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; @@ -1483,8 +1534,8 @@ xlog_recover_add_to_cont_trans( */ STATIC int xlog_recover_add_to_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, xfs_caddr_t dp, int len) { @@ -1497,8 +1548,8 @@ xlog_recover_add_to_trans( if (list_empty(&trans->r_itemq)) { /* we need to 
catch log corruptions here */ if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { - xlog_warn("XFS: xlog_recover_add_to_trans: " - "bad header magic number"); + xfs_warn(log->l_mp, "%s: bad header magic number", + __func__); ASSERT(0); return XFS_ERROR(EIO); } @@ -1525,10 +1576,11 @@ xlog_recover_add_to_trans( if (item->ri_total == 0) { /* first region to be added */ if (in_f->ilf_size == 0 || in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { - xlog_warn( - "XFS: bad number of regions (%d) in inode log format", + xfs_warn(log->l_mp, + "bad number of regions (%d) in inode log format", in_f->ilf_size); ASSERT(0); + kmem_free(ptr); return XFS_ERROR(EIO); } @@ -1547,32 +1599,89 @@ xlog_recover_add_to_trans( } /* - * Sort the log items in the transaction. Cancelled buffers need - * to be put first so they are processed before any items that might - * modify the buffers. If they are cancelled, then the modifications - * don't need to be replayed. + * Sort the log items in the transaction. + * + * The ordering constraints are defined by the inode allocation and unlink + * behaviour. The rules are: + * + * 1. Every item is only logged once in a given transaction. Hence it + * represents the last logged state of the item. Hence ordering is + * dependent on the order in which operations need to be performed so + * required initial conditions are always met. + * + * 2. Cancelled buffers are recorded in pass 1 in a separate table and + * there's nothing to replay from them so we can simply cull them + * from the transaction. However, we can't do that until after we've + * replayed all the other items because they may be dependent on the + * cancelled buffer and replaying the cancelled buffer can remove it + * form the cancelled buffer table. Hence they have tobe done last. + * + * 3. Inode allocation buffers must be replayed before inode items that + * read the buffer and replay changes into it. For filesystems using the + * ICREATE transactions, this means XFS_LI_ICREATE objects need to get + * treated the same as inode allocation buffers as they create and + * initialise the buffers directly. + * + * 4. Inode unlink buffers must be replayed after inode items are replayed. + * This ensures that inodes are completely flushed to the inode buffer + * in a "free" state before we remove the unlinked inode list pointer. + * + * Hence the ordering needs to be inode allocation buffers first, inode items + * second, inode unlink buffers third and cancelled buffers last. + * + * But there's a problem with that - we can't tell an inode allocation buffer + * apart from a regular buffer, so we can't separate them. We can, however, + * tell an inode unlink buffer from the others, and so we can separate them out + * from all the other buffers and move them to last. + * + * Hence, 4 lists, in order from head to tail: + * - buffer_list for all buffers except cancelled/inode unlink buffers + * - item_list for all non-buffer items + * - inode_buffer_list for inode unlink buffers + * - cancel_list for the cancelled buffers + * + * Note that we add objects to the tail of the lists so that first-to-last + * ordering is preserved within the lists. Adding objects to the head of the + * list means when we traverse from the head we walk them in last-to-first + * order. For cancelled buffers and inode unlink buffers this doesn't matter, + * but for all other items there may be specific ordering that we need to + * preserve. 
*/ STATIC int xlog_recover_reorder_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, int pass) { xlog_recover_item_t *item, *n; + int error = 0; LIST_HEAD(sort_list); + LIST_HEAD(cancel_list); + LIST_HEAD(buffer_list); + LIST_HEAD(inode_buffer_list); + LIST_HEAD(inode_list); list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; switch (ITEM_TYPE(item)) { + case XFS_LI_ICREATE: + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_BUF: - if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + if (buf_f->blf_flags & XFS_BLF_CANCEL) { trace_xfs_log_recover_item_reorder_head(log, trans, item, pass); - list_move(&item->ri_list, &trans->r_itemq); + list_move(&item->ri_list, &cancel_list); break; } + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + list_move(&item->ri_list, &inode_buffer_list); + break; + } + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_INODE: case XFS_LI_DQUOT: case XFS_LI_QUOTAOFF: @@ -1580,17 +1689,34 @@ xlog_recover_reorder_trans( case XFS_LI_EFI: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); - list_move_tail(&item->ri_list, &trans->r_itemq); + list_move_tail(&item->ri_list, &inode_list); break; default: - xlog_warn( - "XFS: xlog_recover_reorder_trans: unrecognized type of log operation"); + xfs_warn(log->l_mp, + "%s: unrecognized type of log operation", + __func__); ASSERT(0); - return XFS_ERROR(EIO); + /* + * return the remaining items back to the transaction + * item list so they can be freed in caller. + */ + if (!list_empty(&sort_list)) + list_splice_init(&sort_list, &trans->r_itemq); + error = XFS_ERROR(EIO); + goto out; } } +out: ASSERT(list_empty(&sort_list)); - return 0; + if (!list_empty(&buffer_list)) + list_splice(&buffer_list, &trans->r_itemq); + if (!list_empty(&inode_list)) + list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&inode_buffer_list)) + list_splice_tail(&inode_buffer_list, &trans->r_itemq); + if (!list_empty(&cancel_list)) + list_splice_tail(&cancel_list, &trans->r_itemq); + return error; } /* @@ -1605,240 +1731,160 @@ xlog_recover_reorder_trans( * record in the table to tell us how many times we expect to see this * record during the second pass. */ -STATIC void -xlog_recover_do_buffer_pass1( - xlog_t *log, - xfs_buf_log_format_t *buf_f) +STATIC int +xlog_recover_buffer_pass1( + struct xlog *log, + struct xlog_recover_item *item) { - xfs_buf_cancel_t *bcp; - xfs_buf_cancel_t *nextp; - xfs_buf_cancel_t *prevp; - xfs_buf_cancel_t **bucket; - xfs_daddr_t blkno = 0; - uint len = 0; - ushort flags = 0; - - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - len = buf_f->blf_len; - flags = buf_f->blf_flags; - break; - } + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; /* * If this isn't a cancel buffer item, then just return. */ - if (!(flags & XFS_BLF_CANCEL)) { + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { trace_xfs_log_recover_buf_not_cancel(log, buf_f); - return; - } - - /* - * Insert an xfs_buf_cancel record into the hash table of - * them. If there is already an identical record, bump - * its reference count. - */ - bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % - XLOG_BC_TABLE_SIZE]; - /* - * If the hash bucket is empty then just insert a new record into - * the bucket. 
- */ - if (*bucket == NULL) { - bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), - KM_SLEEP); - bcp->bc_blkno = blkno; - bcp->bc_len = len; - bcp->bc_refcount = 1; - bcp->bc_next = NULL; - *bucket = bcp; - return; + return 0; } /* - * The hash bucket is not empty, so search for duplicates of our - * record. If we find one them just bump its refcount. If not - * then add us at the end of the list. + * Insert an xfs_buf_cancel record into the hash table of them. + * If there is already an identical record, bump its reference count. */ - prevp = NULL; - nextp = *bucket; - while (nextp != NULL) { - if (nextp->bc_blkno == blkno && nextp->bc_len == len) { - nextp->bc_refcount++; + bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == buf_f->blf_blkno && + bcp->bc_len == buf_f->blf_len) { + bcp->bc_refcount++; trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); - return; + return 0; } - prevp = nextp; - nextp = nextp->bc_next; } - ASSERT(prevp != NULL); - bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), - KM_SLEEP); - bcp->bc_blkno = blkno; - bcp->bc_len = len; + + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); + bcp->bc_blkno = buf_f->blf_blkno; + bcp->bc_len = buf_f->blf_len; bcp->bc_refcount = 1; - bcp->bc_next = NULL; - prevp->bc_next = bcp; + list_add_tail(&bcp->bc_list, bucket); + trace_xfs_log_recover_buf_cancel_add(log, buf_f); + return 0; } /* * Check to see whether the buffer being recovered has a corresponding - * entry in the buffer cancel record table. If it does then return 1 - * so that it will be cancelled, otherwise return 0. If the buffer is - * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement - * the refcount on the entry in the table and remove it from the table - * if this is the last reference. - * - * We remove the cancel record from the table when we encounter its - * last occurrence in the log so that if the same buffer is re-used - * again after its last cancellation we actually replay the changes - * made at that point. + * entry in the buffer cancel record table. If it is, return the cancel + * buffer structure to the caller. */ -STATIC int -xlog_check_buffer_cancelled( - xlog_t *log, +STATIC struct xfs_buf_cancel * +xlog_peek_buffer_cancelled( + struct xlog *log, xfs_daddr_t blkno, uint len, ushort flags) { - xfs_buf_cancel_t *bcp; - xfs_buf_cancel_t *prevp; - xfs_buf_cancel_t **bucket; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; - if (log->l_buf_cancel_table == NULL) { - /* - * There is nothing in the table built in pass one, - * so this buffer must not be cancelled. - */ + if (!log->l_buf_cancel_table) { + /* empty table means no cancelled buffers in the log */ ASSERT(!(flags & XFS_BLF_CANCEL)); - return 0; + return NULL; } - bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % - XLOG_BC_TABLE_SIZE]; - bcp = *bucket; - if (bcp == NULL) { - /* - * There is no corresponding entry in the table built - * in pass one, so this buffer has not been cancelled. - */ - ASSERT(!(flags & XFS_BLF_CANCEL)); - return 0; + bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) + return bcp; } /* - * Search for an entry in the buffer cancel table that - * matches our buffer. 
- */ - prevp = NULL; - while (bcp != NULL) { - if (bcp->bc_blkno == blkno && bcp->bc_len == len) { - /* - * We've go a match, so return 1 so that the - * recovery of this buffer is cancelled. - * If this buffer is actually a buffer cancel - * log item, then decrement the refcount on the - * one in the table and remove it if this is the - * last reference. - */ - if (flags & XFS_BLF_CANCEL) { - bcp->bc_refcount--; - if (bcp->bc_refcount == 0) { - if (prevp == NULL) { - *bucket = bcp->bc_next; - } else { - prevp->bc_next = bcp->bc_next; - } - kmem_free(bcp); - } - } - return 1; - } - prevp = bcp; - bcp = bcp->bc_next; - } - /* - * We didn't find a corresponding entry in the table, so - * return 0 so that the buffer is NOT cancelled. + * We didn't find a corresponding entry in the table, so return 0 so + * that the buffer is NOT cancelled. */ ASSERT(!(flags & XFS_BLF_CANCEL)); - return 0; + return NULL; } +/* + * If the buffer is being cancelled then return 1 so that it will be cancelled, + * otherwise return 0. If the buffer is actually a buffer cancel item + * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the + * table and remove it from the table if this is the last reference. + * + * We remove the cancel record from the table when we encounter its last + * occurrence in the log so that if the same buffer is re-used again after its + * last cancellation we actually replay the changes made at that point. + */ STATIC int -xlog_recover_do_buffer_pass2( - xlog_t *log, - xfs_buf_log_format_t *buf_f) +xlog_check_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len, + ushort flags) { - xfs_daddr_t blkno = 0; - ushort flags = 0; - uint len = 0; + struct xfs_buf_cancel *bcp; - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - flags = buf_f->blf_flags; - len = buf_f->blf_len; - break; - } + bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags); + if (!bcp) + return 0; - return xlog_check_buffer_cancelled(log, blkno, len, flags); + /* + * We've go a match, so return 1 so that the recovery of this buffer + * is cancelled. If this buffer is actually a buffer cancel log + * item, then decrement the refcount on the one in the table and + * remove it if this is the last reference. + */ + if (flags & XFS_BLF_CANCEL) { + if (--bcp->bc_refcount == 0) { + list_del(&bcp->bc_list); + kmem_free(bcp); + } + } + return 1; } /* - * Perform recovery for a buffer full of inodes. In these buffers, - * the only data which should be recovered is that which corresponds - * to the di_next_unlinked pointers in the on disk inode structures. - * The rest of the data for the inodes is always logged through the - * inodes themselves rather than the inode buffer and is recovered - * in xlog_recover_do_inode_trans(). + * Perform recovery for a buffer full of inodes. In these buffers, the only + * data which should be recovered is that which corresponds to the + * di_next_unlinked pointers in the on disk inode structures. The rest of the + * data for the inodes is always logged through the inodes themselves rather + * than the inode buffer and is recovered in xlog_recover_inode_pass2(). * - * The only time when buffers full of inodes are fully recovered is - * when the buffer is full of newly allocated inodes. In this case - * the buffer will not be marked as an inode buffer and so will be - * sent to xlog_recover_do_reg_buffer() below during recovery. 
+ * The only time when buffers full of inodes are fully recovered is when the + * buffer is full of newly allocated inodes. In this case the buffer will + * not be marked as an inode buffer and so will be sent to + * xlog_recover_do_reg_buffer() below during recovery. */ STATIC int xlog_recover_do_inode_buffer( - xfs_mount_t *mp, + struct xfs_mount *mp, xlog_recover_item_t *item, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { int i; - int item_index; - int bit; - int nbits; - int reg_buf_offset; - int reg_buf_bytes; + int item_index = 0; + int bit = 0; + int nbits = 0; + int reg_buf_offset = 0; + int reg_buf_bytes = 0; int next_unlinked_offset; int inodes_per_buf; xfs_agino_t *logged_nextp; xfs_agino_t *buffer_nextp; - unsigned int *data_map = NULL; - unsigned int map_size = 0; trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - switch (buf_f->blf_type) { - case XFS_LI_BUF: - data_map = buf_f->blf_data_map; - map_size = buf_f->blf_map_size; - break; - } /* - * Set the variables corresponding to the current region to - * 0 so that we'll initialize them on the first pass through - * the loop. + * Post recovery validation only works properly on CRC enabled + * filesystems. */ - reg_buf_offset = 0; - reg_buf_bytes = 0; - bit = 0; - nbits = 0; - item_index = 0; - inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; + if (xfs_sb_version_hascrc(&mp->m_sb)) + bp->b_ops = &xfs_inode_buf_ops; + + inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + offsetof(xfs_dinode_t, di_next_unlinked); @@ -1852,18 +1898,18 @@ xlog_recover_do_inode_buffer( * the current di_next_unlinked field. */ bit += nbits; - bit = xfs_next_bit(data_map, map_size, bit); + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); /* * If there are no more logged regions in the * buffer, then we're done. */ - if (bit == -1) { + if (bit == -1) return 0; - } - nbits = xfs_contig_bits(data_map, map_size, - bit); + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); ASSERT(nbits > 0); reg_buf_offset = bit << XFS_BLF_SHIFT; reg_buf_bytes = nbits << XFS_BLF_SHIFT; @@ -1875,13 +1921,13 @@ xlog_recover_do_inode_buffer( * di_next_unlinked field, then move on to the next * di_next_unlinked field. */ - if (next_unlinked_offset < reg_buf_offset) { + if (next_unlinked_offset < reg_buf_offset) continue; - } ASSERT(item->ri_buf[item_index].i_addr != NULL); ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); - ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); + ASSERT((reg_buf_offset + reg_buf_bytes) <= + BBTOB(bp->b_io_length)); /* * The current logged region contains a copy of the @@ -1891,8 +1937,9 @@ xlog_recover_do_inode_buffer( logged_nextp = item->ri_buf[item_index].i_addr + next_unlinked_offset - reg_buf_offset; if (unlikely(*logged_nextp == 0)) { - xfs_fs_cmn_err(CE_ALERT, mp, - "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", + xfs_alert(mp, + "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). " + "Trying to replay bad (0) inode di_next_unlinked field.", item, bp); XFS_ERROR_REPORT("xlog_recover_do_inode_buf", XFS_ERRLEVEL_LOW, mp); @@ -1902,52 +1949,405 @@ xlog_recover_do_inode_buffer( buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; + + /* + * If necessary, recalculate the CRC in the on-disk inode. 
We + * have to leave the inode in a consistent state for whoever + * reads it next.... + */ + xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + } return 0; } /* + * V5 filesystems know the age of the buffer on disk being recovered. We can + * have newer objects on disk than we are replaying, and so for these cases we + * don't want to replay the current change as that will make the buffer contents + * temporarily invalid on disk. + * + * The magic number might not match the buffer type we are going to recover + * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence + * extract the LSN of the existing object in the buffer based on it's current + * magic number. If we don't recognise the magic number in the buffer, then + * return a LSN of -1 so that the caller knows it was an unrecognised block and + * so can recover the buffer. + * + * Note: we cannot rely solely on magic number matches to determine that the + * buffer has a valid LSN - we also need to verify that it belongs to this + * filesystem, so we need to extract the object's LSN and compare it to that + * which we read from the superblock. If the UUIDs don't match, then we've got a + * stale metadata block from an old filesystem instance that we need to recover + * over the top of. + */ +static xfs_lsn_t +xlog_recover_get_buf_lsn( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + void *blk = bp->b_addr; + uuid_t *uuid; + xfs_lsn_t lsn = -1; + + /* v4 filesystems always recover immediately */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + goto recover_immediately; + + magic32 = be32_to_cpu(*(__be32 *)blk); + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); + uuid = &btb->bb_u.s.bb_uuid; + break; + } + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); + uuid = &btb->bb_u.l.bb_uuid; + break; + } + case XFS_AGF_MAGIC: + lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); + uuid = &((struct xfs_agf *)blk)->agf_uuid; + break; + case XFS_AGFL_MAGIC: + lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); + uuid = &((struct xfs_agfl *)blk)->agfl_uuid; + break; + case XFS_AGI_MAGIC: + lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); + uuid = &((struct xfs_agi *)blk)->agi_uuid; + break; + case XFS_SYMLINK_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); + uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; + break; + case XFS_DIR3_BLOCK_MAGIC: + case XFS_DIR3_DATA_MAGIC: + case XFS_DIR3_FREE_MAGIC: + lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); + uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; + break; + case XFS_ATTR3_RMT_MAGIC: + lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); + uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; + break; + case XFS_SB_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); + uuid = &((struct xfs_dsb *)blk)->sb_uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); + switch (magicda) { + case XFS_DIR3_LEAF1_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + case XFS_DA3_NODE_MAGIC: + lsn 
= be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); + uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + /* + * We do individual object checks on dquot and inode buffers as they + * have their own individual LSN records. Also, we could have a stale + * buffer here, so we have to at least recognise these buffer types. + * + * A notd complexity here is inode unlinked list processing - it logs + * the inode directly in the buffer, but we don't know which inodes have + * been modified, and there is no global buffer LSN. Hence we need to + * recover all inode buffer types immediately. This problem will be + * fixed by logical logging of the unlinked list modifications. + */ + magic16 = be16_to_cpu(*(__be16 *)blk); + switch (magic16) { + case XFS_DQUOT_MAGIC: + case XFS_DINODE_MAGIC: + goto recover_immediately; + default: + break; + } + + /* unknown buffer contents, recover immediately */ + +recover_immediately: + return (xfs_lsn_t)-1; + +} + +/* + * Validate the recovered buffer is of the correct type and attach the + * appropriate buffer operations to them for writeback. Magic numbers are in a + * few places: + * the first 16 bits of the buffer (inode buffer, dquot buffer), + * the first 32 bits of the buffer (most blocks), + * inside a struct xfs_da_blkinfo at the start of the buffer. + */ +static void +xlog_recover_validate_buf_type( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); + magic16 = be16_to_cpu(*(__be16*)bp->b_addr); + magicda = be16_to_cpu(info->magic); + switch (xfs_blft_from_flags(buf_f)) { + case XFS_BLFT_BTREE_BUF: + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + bp->b_ops = &xfs_allocbt_buf_ops; + break; + case XFS_IBT_CRC_MAGIC: + case XFS_FIBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_inobt_buf_ops; + break; + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + bp->b_ops = &xfs_bmbt_buf_ops; + break; + default: + xfs_warn(mp, "Bad btree block magic!"); + ASSERT(0); + break; + } + break; + case XFS_BLFT_AGF_BUF: + if (magic32 != XFS_AGF_MAGIC) { + xfs_warn(mp, "Bad AGF block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agf_buf_ops; + break; + case XFS_BLFT_AGFL_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_AGFL_MAGIC) { + xfs_warn(mp, "Bad AGFL block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agfl_buf_ops; + break; + case XFS_BLFT_AGI_BUF: + if (magic32 != XFS_AGI_MAGIC) { + xfs_warn(mp, "Bad AGI block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agi_buf_ops; + break; + case XFS_BLFT_UDQUOT_BUF: + case XFS_BLFT_PDQUOT_BUF: + case XFS_BLFT_GDQUOT_BUF: +#ifdef CONFIG_XFS_QUOTA + if (magic16 != XFS_DQUOT_MAGIC) { + xfs_warn(mp, "Bad DQUOT block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dquot_buf_ops; +#else + xfs_alert(mp, + "Trying to recover dquots without QUOTA support built in!"); + ASSERT(0); +#endif + break; + case XFS_BLFT_DINO_BUF: + /* + * we get here with inode allocation buffers, not buffers that + * track unlinked list changes. 
+ */ + if (magic16 != XFS_DINODE_MAGIC) { + xfs_warn(mp, "Bad INODE block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_inode_buf_ops; + break; + case XFS_BLFT_SYMLINK_BUF: + if (magic32 != XFS_SYMLINK_MAGIC) { + xfs_warn(mp, "Bad symlink block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_symlink_buf_ops; + break; + case XFS_BLFT_DIR_BLOCK_BUF: + if (magic32 != XFS_DIR2_BLOCK_MAGIC && + magic32 != XFS_DIR3_BLOCK_MAGIC) { + xfs_warn(mp, "Bad dir block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_block_buf_ops; + break; + case XFS_BLFT_DIR_DATA_BUF: + if (magic32 != XFS_DIR2_DATA_MAGIC && + magic32 != XFS_DIR3_DATA_MAGIC) { + xfs_warn(mp, "Bad dir data magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_data_buf_ops; + break; + case XFS_BLFT_DIR_FREE_BUF: + if (magic32 != XFS_DIR2_FREE_MAGIC && + magic32 != XFS_DIR3_FREE_MAGIC) { + xfs_warn(mp, "Bad dir3 free magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_free_buf_ops; + break; + case XFS_BLFT_DIR_LEAF1_BUF: + if (magicda != XFS_DIR2_LEAF1_MAGIC && + magicda != XFS_DIR3_LEAF1_MAGIC) { + xfs_warn(mp, "Bad dir leaf1 magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + break; + case XFS_BLFT_DIR_LEAFN_BUF: + if (magicda != XFS_DIR2_LEAFN_MAGIC && + magicda != XFS_DIR3_LEAFN_MAGIC) { + xfs_warn(mp, "Bad dir leafn magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leafn_buf_ops; + break; + case XFS_BLFT_DA_NODE_BUF: + if (magicda != XFS_DA_NODE_MAGIC && + magicda != XFS_DA3_NODE_MAGIC) { + xfs_warn(mp, "Bad da node magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_da3_node_buf_ops; + break; + case XFS_BLFT_ATTR_LEAF_BUF: + if (magicda != XFS_ATTR_LEAF_MAGIC && + magicda != XFS_ATTR3_LEAF_MAGIC) { + xfs_warn(mp, "Bad attr leaf magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_leaf_buf_ops; + break; + case XFS_BLFT_ATTR_RMT_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_ATTR3_RMT_MAGIC) { + xfs_warn(mp, "Bad attr remote magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_rmt_buf_ops; + break; + case XFS_BLFT_SB_BUF: + if (magic32 != XFS_SB_MAGIC) { + xfs_warn(mp, "Bad SB block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_sb_buf_ops; + break; + default: + xfs_warn(mp, "Unknown buffer type %d!", + xfs_blft_from_flags(buf_f)); + break; + } +} + +/* * Perform a 'normal' buffer recovery. Each logged region of the * buffer should be copied over the corresponding region in the * given buffer. The bitmap in the buf log format structure indicates * where to place the logged data. 
*/ -/*ARGSUSED*/ STATIC void xlog_recover_do_reg_buffer( struct xfs_mount *mp, xlog_recover_item_t *item, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { int i; int bit; int nbits; - unsigned int *data_map = NULL; - unsigned int map_size = 0; int error; trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); - switch (buf_f->blf_type) { - case XFS_LI_BUF: - data_map = buf_f->blf_data_map; - map_size = buf_f->blf_map_size; - break; - } bit = 0; i = 1; /* 0 is the buf format structure */ while (1) { - bit = xfs_next_bit(data_map, map_size, bit); + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); if (bit == -1) break; - nbits = xfs_contig_bits(data_map, map_size, bit); + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); - ASSERT(XFS_BUF_COUNT(bp) >= - ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); + ASSERT(BBTOB(bp->b_io_length) >= + ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); + + /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; /* * Do a sanity check if this is a dquot buffer. Just checking @@ -1958,17 +2358,17 @@ xlog_recover_do_reg_buffer( if (buf_f->blf_flags & (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { if (item->ri_buf[i].i_addr == NULL) { - cmn_err(CE_ALERT, + xfs_alert(mp, "XFS: NULL dquot in %s.", __func__); goto next; } if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { - cmn_err(CE_ALERT, + xfs_alert(mp, "XFS: dquot too small (%d) in %s.", item->ri_buf[i].i_len, __func__); goto next; } - error = xfs_qm_dqcheck(item->ri_buf[i].i_addr, + error = xfs_dqcheck(mp, item->ri_buf[i].i_addr, -1, 0, XFS_QMOPT_DOWARN, "dquot_buf_recover"); if (error) @@ -1986,144 +2386,32 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); -} - -/* - * Do some primitive error checking on ondisk dquot data structures. - */ -int -xfs_qm_dqcheck( - xfs_disk_dquot_t *ddq, - xfs_dqid_t id, - uint type, /* used only when IO_dorepair is true */ - uint flags, - char *str) -{ - xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; - int errs = 0; - - /* - * We can encounter an uninitialized dquot buffer for 2 reasons: - * 1. If we crash while deleting the quotainode(s), and those blks got - * used for user data. This is because we take the path of regular - * file deletion; however, the size field of quotainodes is never - * updated, so all the tricks that we play in itruncate_finish - * don't quite matter. - * - * 2. We don't play the quota buffers when there's a quotaoff logitem. - * But the allocation will be replayed so we'll end up with an - * uninitialized quota block. - * - * This is all fine; things are still consistent, and we haven't lost - * any quota information. Just don't complain about bad dquot blks. 
- */ - if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) { - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_ALERT, - "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", - str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); - errs++; - } - if (ddq->d_version != XFS_DQUOT_VERSION) { - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_ALERT, - "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", - str, id, ddq->d_version, XFS_DQUOT_VERSION); - errs++; - } - - if (ddq->d_flags != XFS_DQ_USER && - ddq->d_flags != XFS_DQ_PROJ && - ddq->d_flags != XFS_DQ_GROUP) { - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_ALERT, - "%s : XFS dquot ID 0x%x, unknown flags 0x%x", - str, id, ddq->d_flags); - errs++; - } - - if (id != -1 && id != be32_to_cpu(ddq->d_id)) { - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_ALERT, - "%s : ondisk-dquot 0x%p, ID mismatch: " - "0x%x expected, found id 0x%x", - str, ddq, id, be32_to_cpu(ddq->d_id)); - errs++; - } - - if (!errs && ddq->d_id) { - if (ddq->d_blk_softlimit && - be64_to_cpu(ddq->d_bcount) >= - be64_to_cpu(ddq->d_blk_softlimit)) { - if (!ddq->d_btimer) { - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_ALERT, - "%s : Dquot ID 0x%x (0x%p) " - "BLK TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - if (ddq->d_ino_softlimit && - be64_to_cpu(ddq->d_icount) >= - be64_to_cpu(ddq->d_ino_softlimit)) { - if (!ddq->d_itimer) { - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_ALERT, - "%s : Dquot ID 0x%x (0x%p) " - "INODE TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - if (ddq->d_rtb_softlimit && - be64_to_cpu(ddq->d_rtbcount) >= - be64_to_cpu(ddq->d_rtb_softlimit)) { - if (!ddq->d_rtbtimer) { - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_ALERT, - "%s : Dquot ID 0x%x (0x%p) " - "RTBLK TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - } - - if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) - return errs; - - if (flags & XFS_QMOPT_DOWARN) - cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id); /* - * Typically, a repair is only requested by quotacheck. + * We can only do post recovery validation on items on CRC enabled + * fielsystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. Hence for now + * just avoid the verification stage for non-crc filesystems */ - ASSERT(id != -1); - ASSERT(flags & XFS_QMOPT_DQREPAIR); - memset(d, 0, sizeof(xfs_dqblk_t)); - - d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); - d->dd_diskdq.d_version = XFS_DQUOT_VERSION; - d->dd_diskdq.d_flags = type; - d->dd_diskdq.d_id = cpu_to_be32(id); - - return errs; + if (xfs_sb_version_hascrc(&mp->m_sb)) + xlog_recover_validate_buf_type(mp, bp, buf_f); } /* * Perform a dquot buffer recovery. - * Simple algorithm: if we have found a QUOTAOFF logitem of the same type + * Simple algorithm: if we have found a QUOTAOFF log item of the same type * (ie. USR or GRP), then just toss this buffer away; don't recover it. * Else, treat it as a regular buffer and do recovery. 
*/ STATIC void xlog_recover_do_dquot_buffer( - xfs_mount_t *mp, - xlog_t *log, - xlog_recover_item_t *item, - xfs_buf_t *bp, - xfs_buf_log_format_t *buf_f) + struct xfs_mount *mp, + struct xlog *log, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) { uint type; @@ -2172,88 +2460,67 @@ xlog_recover_do_dquot_buffer( * over the log during recovery. During the first we build a table of * those buffers which have been cancelled, and during the second we * only replay those buffers which do not have corresponding cancel - * records in the table. See xlog_recover_do_buffer_pass[1,2] above + * records in the table. See xlog_recover_buffer_pass[1,2] above * for more details on the implementation of the table of cancel records. */ STATIC int -xlog_recover_do_buffer_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +xlog_recover_buffer_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; int error; - int cancel; - xfs_daddr_t blkno; - int len; - ushort flags; uint buf_flags; + xfs_lsn_t lsn; - if (pass == XLOG_RECOVER_PASS1) { - /* - * In this pass we're only looking for buf items - * with the XFS_BLF_CANCEL bit set. - */ - xlog_recover_do_buffer_pass1(log, buf_f); + /* + * In this pass we only want to recover all the buffers which have + * not been cancelled and are not cancellation buffers themselves. + */ + if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + trace_xfs_log_recover_buf_cancel(log, buf_f); return 0; - } else { - /* - * In this pass we want to recover all the buffers - * which have not been cancelled and are not - * cancellation buffers themselves. The routine - * we call here will tell us whether or not to - * continue with the replay of this buffer. - */ - cancel = xlog_recover_do_buffer_pass2(log, buf_f); - if (cancel) { - trace_xfs_log_recover_buf_cancel(log, buf_f); - return 0; - } } + trace_xfs_log_recover_buf_recover(log, buf_f); - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - len = buf_f->blf_len; - flags = buf_f->blf_flags; - break; - default: - xfs_fs_cmn_err(CE_ALERT, log->l_mp, - "xfs_log_recover: unknown buffer type 0x%x, logdev %s", - buf_f->blf_type, log->l_mp->m_logname ? - log->l_mp->m_logname : "internal"); - XFS_ERROR_REPORT("xlog_recover_do_buffer_trans", - XFS_ERRLEVEL_LOW, log->l_mp); - return XFS_ERROR(EFSCORRUPTED); - } - mp = log->l_mp; - buf_flags = XBF_LOCK; - if (!(flags & XFS_BLF_INODE_BUF)) - buf_flags |= XBF_MAPPED; - - bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); - if (XFS_BUF_ISERROR(bp)) { - xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, - bp, blkno); - error = XFS_BUF_GETERROR(bp); - xfs_buf_relse(bp); - return error; + buf_flags = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + buf_flags |= XBF_UNMAPPED; + + bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags, NULL); + if (!bp) + return XFS_ERROR(ENOMEM); + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); + goto out_release; } - error = 0; - if (flags & XFS_BLF_INODE_BUF) { + /* + * recover the buffer only if we get an LSN from it and it's less than + * the lsn of the transaction we are replaying. 
+ */ + lsn = xlog_recover_get_buf_lsn(mp, bp); + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) + goto out_release; + + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); - } else if (flags & + } else if (buf_f->blf_flags & (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } if (error) - return XFS_ERROR(error); + goto out_release; /* * Perform delayed write on the buffer. Asynchronous writes will be @@ -2261,41 +2528,119 @@ xlog_recover_do_buffer_trans( * * Also make sure that only inode buffers with good sizes stay in * the buffer cache. The kernel moves inodes in buffers of 1 block - * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode + * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode * buffers in the log can be a different size if the log was generated * by an older kernel using unclustered inode buffers or a newer kernel * running with a different inode cluster size. Regardless, if the - * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) - * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep + * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size) + * for *our* value of mp->m_inode_cluster_size, then we need to keep * the buffer out of the buffer cache so that the buffer won't * overlap with future reads of those inodes. */ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && - (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, - (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { - XFS_BUF_STALE(bp); - error = xfs_bwrite(mp, bp); + (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, + (__uint32_t)log->l_mp->m_inode_cluster_size))) { + xfs_buf_stale(bp); + error = xfs_bwrite(bp); } else { ASSERT(bp->b_target->bt_mount == mp); - XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); - xfs_bdwrite(mp, bp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); } - return (error); +out_release: + xfs_buf_relse(bp); + return error; } +/* + * Inode fork owner changes + * + * If we have been told that we have to reparent the inode fork, it's because an + * extent swap operation on a CRC enabled filesystem has been done and we are + * replaying it. We need to walk the BMBT of the appropriate fork and change the + * owners of it. + * + * The complexity here is that we don't have an inode context to work with, so + * after we've replayed the inode we need to instantiate one. This is where the + * fun begins. + * + * We are in the middle of log recovery, so we can't run transactions. That + * means we cannot use cache coherent inode instantiation via xfs_iget(), as + * that will result in the corresponding iput() running the inode through + * xfs_inactive(). If we've just replayed an inode core that changes the link + * count to zero (i.e. it's been unlinked), then xfs_inactive() will run + * transactions (bad!). + * + * So, to avoid this, we instantiate an inode directly from the inode core we've + * just recovered. We have the buffer still locked, and all we really need to + * instantiate is the inode core and the forks being modified. We can do this + * manually, then run the inode btree owner change, and then tear down the + * xfs_inode without having to run any transactions at all. 
+ * + * Also, because we don't have a transaction context available here but need to + * gather all the buffers we modify for writeback so we pass the buffer_list + * instead for the operation to use. + */ + STATIC int -xlog_recover_do_inode_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +xfs_recover_inode_owner_change( + struct xfs_mount *mp, + struct xfs_dinode *dip, + struct xfs_inode_log_format *in_f, + struct list_head *buffer_list) +{ + struct xfs_inode *ip; + int error; + + ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); + + ip = xfs_inode_alloc(mp, in_f->ilf_ino); + if (!ip) + return ENOMEM; + + /* instantiate the inode */ + xfs_dinode_from_disk(&ip->i_d, dip); + ASSERT(ip->i_d.di_version >= 3); + + error = xfs_iformat_fork(ip, dip); + if (error) + goto out_free_ip; + + + if (in_f->ilf_fields & XFS_ILOG_DOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + + if (in_f->ilf_fields & XFS_ILOG_AOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + +out_free_ip: + xfs_inode_free(ip); + return error; +} + +STATIC int +xlog_recover_inode_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { xfs_inode_log_format_t *in_f; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; xfs_dinode_t *dip; - xfs_ino_t ino; int len; xfs_caddr_t src; xfs_caddr_t dest; @@ -2303,12 +2648,9 @@ xlog_recover_do_inode_trans( int attr_index; uint fields; xfs_icdinode_t *dicp; + uint isize; int need_free = 0; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { in_f = item->ri_buf[0].i_addr; } else { @@ -2318,8 +2660,6 @@ xlog_recover_do_inode_trans( if (error) goto error; } - ino = in_f->ilf_ino; - mp = log->l_mp; /* * Inode buffers can be freed, look out for it, @@ -2333,16 +2673,17 @@ xlog_recover_do_inode_trans( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, - XBF_LOCK); - if (XFS_BUF_ISERROR(bp)) { - xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, - bp, in_f->ilf_blkno); - error = XFS_BUF_GETERROR(bp); - xfs_buf_relse(bp); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, + &xfs_inode_buf_ops); + if (!bp) { + error = ENOMEM; goto error; } - error = 0; + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); + goto out_release; + } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); @@ -2350,30 +2691,53 @@ xlog_recover_do_inode_trans( * Make sure the place we're flushing out to really looks * like an inode! 
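[Editor's sketch, not part of the patch: the "does this really look like an inode?" test boils down to checking the big-endian magic number (0x494e, "IN") and a plausible version in the on-disk inode core before anything is written over it. The struct below shows only the first few fields and is a simplified stand-in for the on-disk layout.]

/* Illustrative sketch of sanity-checking an on-disk inode before replaying over it. */
#include <stdint.h>
#include <stdbool.h>
#include <arpa/inet.h>  /* ntohs() stands in for be16_to_cpu() */

#define DINODE_MAGIC    0x494e  /* "IN" */

struct disk_inode_hdr {
        uint16_t        di_magic;       /* big-endian on disk */
        uint16_t        di_mode;
        uint8_t         di_version;
        /* ... remainder of the on-disk inode core ... */
};

static bool looks_like_inode(const struct disk_inode_hdr *dip)
{
        if (ntohs(dip->di_magic) != DINODE_MAGIC)
                return false;
        /* v1/v2 inodes on older filesystems, v3 on CRC-enabled ones */
        return dip->di_version >= 1 && dip->di_version <= 3;
}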
*/ - if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", - dip, bp, ino); - XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", + if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { + xfs_alert(mp, + "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", + __func__, dip, bp, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; - goto error; + goto out_release; } dicp = item->ri_buf[1].i_addr; if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", - item, ino); - XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", + __func__, item, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; - goto error; + goto out_release; } - /* Skip replay when the on disk inode is newer than the log one */ - if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * If the inode has an LSN in it, recover the inode only if it's less + * than the lsn of the transaction we are replaying. Note: we still + * need to replay an owner change even though the inode is more recent + * than the transaction as there is no guarantee that all the btree + * blocks are more recent than this transaction, too. + */ + if (dip->di_version >= 3) { + xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_owner_change; + } + } + + /* + * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes + * are transactional and if ordering is necessary we can determine that + * more accurately by the LSN field in the V3 inode core. 
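[Editor's sketch, not part of the patch: the di_flushiter ordering test that follows only applies to v1/v2 inodes, and it has to cope with the 16-bit counter wrapping at DI_MAX_FLUSH (0xffff). A standalone restatement of that decision, with invented helper and parameter names:]

/* Illustrative sketch of the v1/v2 flush-iteration ordering test with wrap handling. */
#include <stdint.h>
#include <stdbool.h>

#define DI_MAX_FLUSH    0xffff

/*
 * Return true if the log copy of the inode is older than the on-disk copy
 * and replay should therefore be skipped.  An on-disk value of DI_MAX_FLUSH
 * together with a small log value means the 16-bit counter has wrapped and
 * the log copy is in fact newer.
 */
static bool skip_v2_inode_replay(uint16_t log_flushiter, uint16_t disk_flushiter,
                                 bool fs_has_crc)
{
        if (fs_has_crc)
                return false;   /* v3 inodes are ordered by LSN instead */
        if (log_flushiter >= disk_flushiter)
                return false;
        if (disk_flushiter == DI_MAX_FLUSH && log_flushiter < (DI_MAX_FLUSH >> 1))
                return false;   /* wrap case: the log copy is actually newer */
        return true;
}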
Don't trust + * the inode versions we might be changing them here - use the + * superblock flag to determine whether we need to look at di_flushiter + * to skip replay when the on disk inode is newer than the log one + */ + if (!xfs_sb_version_hascrc(&mp->m_sb) && + dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less * than smaller numbers @@ -2382,82 +2746,82 @@ xlog_recover_do_inode_trans( dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { /* do nothing */ } else { - xfs_buf_relse(bp); trace_xfs_log_recover_inode_skip(log, in_f); error = 0; - goto error; + goto out_release; } } + /* Take the opportunity to reset the flush iteration count */ dicp->di_flushiter = 0; - if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) { + if (unlikely(S_ISREG(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", - item, dip, bp, ino); + xfs_alert(mp, + "%s: Bad regular inode log record, rec ptr 0x%p, " + "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; - goto error; + goto out_release; } - } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) { + } else if (unlikely(S_ISDIR(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && (dicp->di_format != XFS_DINODE_FMT_BTREE) && (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", - item, dip, bp, ino); + xfs_alert(mp, + "%s: Bad dir inode log record, rec ptr 0x%p, " + "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", + __func__, item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; - goto error; + goto out_release; } } if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", - item, dip, bp, ino, + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " + "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", + __func__, item, dip, bp, in_f->ilf_ino, dicp->di_nextents + dicp->di_anextents, dicp->di_nblocks); error = EFSCORRUPTED; - goto error; + goto out_release; } if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)", + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x", - item, dip, bp, ino, dicp->di_forkoff); + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " + "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, + item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); error = EFSCORRUPTED; - goto 
error; + goto out_release; } - if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", + isize = xfs_icdinode_size(dicp->di_version); + if (unlikely(item->ri_buf[1].i_len > isize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p", - item->ri_buf[1].i_len, item); + xfs_alert(mp, + "%s: Bad inode log record length %d, rec ptr 0x%p", + __func__, item->ri_buf[1].i_len, item); error = EFSCORRUPTED; - goto error; + goto out_release; } /* The core is in in-core format */ - xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); + xfs_dinode_to_disk(dip, dicp); /* the rest is in on-disk format */ - if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { - memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); + if (item->ri_buf[1].i_len > isize) { + memcpy((char *)dip + isize, + item->ri_buf[1].i_addr + isize, + item->ri_buf[1].i_len - isize); } fields = in_f->ilf_fields; @@ -2473,7 +2837,7 @@ xlog_recover_do_inode_trans( } if (in_f->ilf_size == 2) - goto write_inode_buffer; + goto out_owner_change; len = item->ri_buf[2].i_len; src = item->ri_buf[2].i_addr; ASSERT(in_f->ilf_size <= 4); @@ -2532,18 +2896,26 @@ xlog_recover_do_inode_trans( break; default: - xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); + xfs_warn(log->l_mp, "%s: Invalid flag", __func__); ASSERT(0); - xfs_buf_relse(bp); error = EIO; - goto error; + goto out_release; } } -write_inode_buffer: +out_owner_change: + if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) + error = xfs_recover_inode_owner_change(mp, dip, in_f, + buffer_list); + /* re-generate the checksum. */ + xfs_dinode_calc_crc(log->l_mp, dip); + ASSERT(bp->b_target->bt_mount == mp); - XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); - xfs_bdwrite(mp, bp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + +out_release: + xfs_buf_relse(bp); error: if (need_free) kmem_free(in_f); @@ -2551,23 +2923,16 @@ error: } /* - * Recover QUOTAOFF records. We simply make a note of it in the xlog_t + * Recover QUOTAOFF records. We simply make a note of it in the xlog * structure, so that we know not to do any dquot item or dquot buffer recovery, * of that type. */ STATIC int -xlog_recover_do_quotaoff_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +xlog_recover_quotaoff_pass1( + struct xlog *log, + struct xlog_recover_item *item) { - xfs_qoff_logformat_t *qoff_f; - - if (pass == XLOG_RECOVER_PASS2) { - return (0); - } - - qoff_f = item->ri_buf[0].i_addr; + xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; ASSERT(qoff_f); /* @@ -2588,22 +2953,19 @@ xlog_recover_do_quotaoff_trans( * Recover a dquot record */ STATIC int -xlog_recover_do_dquot_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +xlog_recover_dquot_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; struct xfs_disk_dquot *ddq, *recddq; int error; xfs_dq_logformat_t *dq_f; uint type; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - mp = log->l_mp; /* * Filesystems are required to send in quota flags at mount time. 
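[Editor's sketch, not part of the patch: the quotaoff handling above spans both passes. Pass 1 notes which quota types were switched off in the log, and pass 2 then refuses to replay dquot items of those types. A minimal restatement with invented names:]

/* Illustrative sketch of quotaoff tracking across the two recovery passes. */
#include <stdbool.h>

#define DQ_USER         0x1
#define DQ_PROJ         0x2
#define DQ_GROUP        0x4

static unsigned int quotaoffs_flag;     /* accumulated during pass 1 */

/* pass 1: note which quota types were turned off in the log */
static void record_quotaoff(unsigned int types)
{
        quotaoffs_flag |= types;
}

/* pass 2: dquots of a type that was quota-off'd must not be replayed */
static bool skip_dquot_replay(unsigned int dquot_type)
{
        return (quotaoffs_flag & dquot_type) != 0;
}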
@@ -2613,13 +2975,11 @@ xlog_recover_do_dquot_trans( recddq = item->ri_buf[1].i_addr; if (recddq == NULL) { - cmn_err(CE_ALERT, - "XFS: NULL dquot in %s.", __func__); + xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); return XFS_ERROR(EIO); } if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { - cmn_err(CE_ALERT, - "XFS: dquot too small (%d) in %s.", + xfs_alert(log->l_mp, "dquot too small (%d) in %s.", item->ri_buf[1].i_len, __func__); return XFS_ERROR(EIO); } @@ -2644,23 +3004,18 @@ xlog_recover_do_dquot_trans( */ dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); - if ((error = xfs_qm_dqcheck(recddq, - dq_f->qlf_id, - 0, XFS_QMOPT_DOWARN, - "xlog_recover_do_dquot_trans (log copy)"))) { + error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + "xlog_recover_dquot_pass2 (log copy)"); + if (error) return XFS_ERROR(EIO); - } ASSERT(dq_f->qlf_len == 1); - error = xfs_read_buf(mp, mp->m_ddev_targp, - dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), - 0, &bp); - if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, - bp, dq_f->qlf_blkno); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + NULL); + if (error) return error; - } + ASSERT(bp); ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); @@ -2669,20 +3024,40 @@ xlog_recover_do_dquot_trans( * was among a chunk of dquots created earlier, and we did some * minimal initialization then. */ - if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_do_dquot_trans")) { + error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + "xlog_recover_dquot_pass2"); + if (error) { xfs_buf_relse(bp); return XFS_ERROR(EIO); } + /* + * If the dquot has an LSN in it, recover the dquot only if it's less + * than the lsn of the transaction we are replaying. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; + xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + goto out_release; + } + } + memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); - XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); - xfs_bdwrite(mp, bp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); - return (0); +out_release: + xfs_buf_relse(bp); + return 0; } /* @@ -2693,38 +3068,31 @@ xlog_recover_do_dquot_trans( * LSN. */ STATIC int -xlog_recover_do_efi_trans( - xlog_t *log, - xlog_recover_item_t *item, - xfs_lsn_t lsn, - int pass) +xlog_recover_efi_pass2( + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) { int error; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_efi_log_item_t *efip; xfs_efi_log_format_t *efi_formatp; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - efi_formatp = item->ri_buf[0].i_addr; - mp = log->l_mp; efip = xfs_efi_init(mp, efi_formatp->efi_nextents); if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), &(efip->efi_format)))) { xfs_efi_item_free(efip); return error; } - efip->efi_next_extent = efi_formatp->efi_nextents; - efip->efi_flags |= XFS_EFI_COMMITTED; + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); spin_lock(&log->l_ailp->xa_lock); /* * xfs_trans_ail_update() drops the AIL lock. 
*/ - xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); + xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); return 0; } @@ -2737,11 +3105,10 @@ xlog_recover_do_efi_trans( * efd format structure. If we find it, we remove the efi from the * AIL and free it. */ -STATIC void -xlog_recover_do_efd_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +STATIC int +xlog_recover_efd_pass2( + struct xlog *log, + struct xlog_recover_item *item) { xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; @@ -2750,10 +3117,6 @@ xlog_recover_do_efd_trans( struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; - if (pass == XLOG_RECOVER_PASS1) { - return; - } - efd_formatp = item->ri_buf[0].i_addr; ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || @@ -2775,7 +3138,8 @@ xlog_recover_do_efd_trans( * xfs_trans_ail_delete() drops the * AIL lock. */ - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, + SHUTDOWN_CORRUPT_INCORE); xfs_efi_item_free(efip); spin_lock(&ailp->xa_lock); break; @@ -2783,65 +3147,96 @@ xlog_recover_do_efd_trans( } lip = xfs_trans_ail_cursor_next(ailp, &cur); } - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); + + return 0; } /* - * Perform the transaction - * - * If the transaction modifies a buffer or inode, do it now. Otherwise, - * EFIs and EFDs get queued up by adding entries into the AIL for them. + * This routine is called when an inode create format structure is found in a + * committed transaction in the log. It's purpose is to initialise the inodes + * being allocated on disk. This requires us to get inode cluster buffers that + * match the range to be intialised, stamped with inode templates and written + * by delayed write so that subsequent modifications will hit the cached buffer + * and only need writing out at the end of recovery. 
*/ STATIC int -xlog_recover_do_trans( - xlog_t *log, - xlog_recover_t *trans, - int pass) +xlog_recover_do_icreate_pass2( + struct xlog *log, + struct list_head *buffer_list, + xlog_recover_item_t *item) { - int error = 0; - xlog_recover_item_t *item; + struct xfs_mount *mp = log->l_mp; + struct xfs_icreate_log *icl; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + unsigned int count; + unsigned int isize; + xfs_agblock_t length; + + icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; + if (icl->icl_type != XFS_LI_ICREATE) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); + return EINVAL; + } - error = xlog_recover_reorder_trans(log, trans, pass); - if (error) - return error; + if (icl->icl_size != 1) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); + return EINVAL; + } - list_for_each_entry(item, &trans->r_itemq, ri_list) { - trace_xfs_log_recover_item_recover(log, trans, item, pass); - switch (ITEM_TYPE(item)) { - case XFS_LI_BUF: - error = xlog_recover_do_buffer_trans(log, item, pass); - break; - case XFS_LI_INODE: - error = xlog_recover_do_inode_trans(log, item, pass); - break; - case XFS_LI_EFI: - error = xlog_recover_do_efi_trans(log, item, - trans->r_lsn, pass); - break; - case XFS_LI_EFD: - xlog_recover_do_efd_trans(log, item, pass); - error = 0; - break; - case XFS_LI_DQUOT: - error = xlog_recover_do_dquot_trans(log, item, pass); - break; - case XFS_LI_QUOTAOFF: - error = xlog_recover_do_quotaoff_trans(log, item, - pass); - break; - default: - xlog_warn( - "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item)); - ASSERT(0); - error = XFS_ERROR(EIO); - break; - } + agno = be32_to_cpu(icl->icl_ag); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); + return EINVAL; + } + agbno = be32_to_cpu(icl->icl_agbno); + if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); + return EINVAL; + } + isize = be32_to_cpu(icl->icl_isize); + if (isize != mp->m_sb.sb_inodesize) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); + return EINVAL; + } + count = be32_to_cpu(icl->icl_count); + if (!count) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); + return EINVAL; + } + length = be32_to_cpu(icl->icl_length); + if (!length || length >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); + return EINVAL; + } - if (error) - return error; + /* existing allocation is fixed value */ + ASSERT(count == mp->m_ialloc_inos); + ASSERT(length == mp->m_ialloc_blks); + if (count != mp->m_ialloc_inos || + length != mp->m_ialloc_blks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); + return EINVAL; } + /* + * Inode buffers can be freed. Do not replay the inode initialisation as + * we could be overwriting something written after this inode buffer was + * cancelled. + * + * XXX: we need to iterate all buffers and only init those that are not + * cancelled. I think that a more fine grained factoring of + * xfs_ialloc_inode_init may be appropriate here to enable this to be + * done easily. 
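[Editor's sketch, not part of the patch: the icreate checks that follow simply validate the logged inode-chunk description against the filesystem geometry before any inode buffers are initialised. The struct and field names below are host-endian stand-ins invented for illustration.]

/* Illustrative sketch of the sanity checks applied to an inode-create (icreate) record. */
#include <stdint.h>
#include <stdbool.h>

struct icreate_rec {            /* fields already converted to host endianness */
        uint32_t        ag;     /* allocation group number */
        uint32_t        agbno;  /* first block of the chunk within the AG */
        uint32_t        count;  /* number of inodes being initialised */
        uint32_t        isize;  /* inode size being written */
        uint32_t        length; /* length of the chunk in filesystem blocks */
};

struct fs_geometry {
        uint32_t        agcount;
        uint32_t        agblocks;
        uint32_t        inodesize;
        uint32_t        ialloc_inos;    /* inodes per allocation chunk */
        uint32_t        ialloc_blks;    /* blocks per allocation chunk */
};

static bool icreate_rec_valid(const struct icreate_rec *icl, const struct fs_geometry *geo)
{
        if (icl->ag >= geo->agcount)
                return false;
        if (icl->agbno == 0 || icl->agbno >= geo->agblocks)
                return false;
        if (icl->isize != geo->inodesize)
                return false;
        if (icl->count == 0 || icl->length == 0 || icl->length >= geo->agblocks)
                return false;
        /* current kernels only ever log fixed-size allocation chunks */
        return icl->count == geo->ialloc_inos && icl->length == geo->ialloc_blks;
}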
+ */ + if (xlog_check_buffer_cancelled(log, + XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) + return 0; + + xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, + be32_to_cpu(icl->icl_gen)); return 0; } @@ -2852,7 +3247,7 @@ xlog_recover_do_trans( */ STATIC void xlog_recover_free_trans( - xlog_recover_t *trans) + struct xlog_recover *trans) { xlog_recover_item_t *item, *n; int i; @@ -2870,27 +3265,267 @@ xlog_recover_free_trans( kmem_free(trans); } +STATIC void +xlog_recover_buffer_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_mount *mp = log->l_mp; + + if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + return; + } + + xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno, + buf_f->blf_len, NULL); +} + +STATIC void +xlog_recover_inode_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_inode_log_format ilf_buf; + struct xfs_inode_log_format *ilfp; + struct xfs_mount *mp = log->l_mp; + int error; + + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { + ilfp = item->ri_buf[0].i_addr; + } else { + ilfp = &ilf_buf; + memset(ilfp, 0, sizeof(*ilfp)); + error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp); + if (error) + return; + } + + if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0)) + return; + + xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno, + ilfp->ilf_len, &xfs_inode_buf_ra_ops); +} + +STATIC void +xlog_recover_dquot_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_disk_dquot *recddq; + struct xfs_dq_logformat *dq_f; + uint type; + + + if (mp->m_qflags == 0) + return; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) + return; + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) + return; + + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return; + + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + ASSERT(dq_f->qlf_len == 1); + + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); +} + +STATIC void +xlog_recover_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + xlog_recover_buffer_ra_pass2(log, item); + break; + case XFS_LI_INODE: + xlog_recover_inode_ra_pass2(log, item); + break; + case XFS_LI_DQUOT: + xlog_recover_dquot_ra_pass2(log, item); + break; + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_QUOTAOFF: + default: + break; + } +} + +STATIC int +xlog_recover_commit_pass1( + struct xlog *log, + struct xlog_recover *trans, + struct xlog_recover_item *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass1(log, item); + case XFS_LI_QUOTAOFF: + return xlog_recover_quotaoff_pass1(log, item); + case XFS_LI_INODE: + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_DQUOT: + case XFS_LI_ICREATE: + /* nothing to do in pass 1 */ + return 0; + default: + xfs_warn(log->l_mp, "%s: invalid item type (%d)", + __func__, ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +STATIC int +xlog_recover_commit_pass2( + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct xlog_recover_item *item) +{ + trace_xfs_log_recover_item_recover(log, trans, 
item, XLOG_RECOVER_PASS2); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_INODE: + return xlog_recover_inode_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_EFI: + return xlog_recover_efi_pass2(log, item, trans->r_lsn); + case XFS_LI_EFD: + return xlog_recover_efd_pass2(log, item); + case XFS_LI_DQUOT: + return xlog_recover_dquot_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_ICREATE: + return xlog_recover_do_icreate_pass2(log, buffer_list, item); + case XFS_LI_QUOTAOFF: + /* nothing to do in pass2 */ + return 0; + default: + xfs_warn(log->l_mp, "%s: invalid item type (%d)", + __func__, ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +STATIC int +xlog_recover_items_pass2( + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct list_head *item_list) +{ + struct xlog_recover_item *item; + int error = 0; + + list_for_each_entry(item, item_list, ri_list) { + error = xlog_recover_commit_pass2(log, trans, + buffer_list, item); + if (error) + return error; + } + + return error; +} + +/* + * Perform the transaction. + * + * If the transaction modifies a buffer or inode, do it now. Otherwise, + * EFIs and EFDs get queued up by adding entries into the AIL for them. + */ STATIC int xlog_recover_commit_trans( - xlog_t *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, int pass) { - int error; + int error = 0; + int error2; + int items_queued = 0; + struct xlog_recover_item *item; + struct xlog_recover_item *next; + LIST_HEAD (buffer_list); + LIST_HEAD (ra_list); + LIST_HEAD (done_list); + + #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 hlist_del(&trans->r_list); - if ((error = xlog_recover_do_trans(log, trans, pass))) + + error = xlog_recover_reorder_trans(log, trans, pass); + if (error) return error; - xlog_recover_free_trans(trans); /* no error */ - return 0; + + list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) { + switch (pass) { + case XLOG_RECOVER_PASS1: + error = xlog_recover_commit_pass1(log, trans, item); + break; + case XLOG_RECOVER_PASS2: + xlog_recover_ra_pass2(log, item); + list_move_tail(&item->ri_list, &ra_list); + items_queued++; + if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + items_queued = 0; + } + + break; + default: + ASSERT(0); + } + + if (error) + goto out; + } + +out: + if (!list_empty(&ra_list)) { + if (!error) + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + } + + if (!list_empty(&done_list)) + list_splice_init(&done_list, &trans->r_itemq); + + xlog_recover_free_trans(trans); + + error2 = xfs_buf_delwri_submit(&buffer_list); + return error ? 
error : error2; } STATIC int xlog_recover_unmount_trans( - xlog_recover_t *trans) + struct xlog *log) { /* Do nothing now */ - xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); + xfs_warn(log->l_mp, "%s: Unmount LR", __func__); return 0; } @@ -2905,9 +3540,9 @@ xlog_recover_unmount_trans( */ STATIC int xlog_recover_process_data( - xlog_t *log, + struct xlog *log, struct hlist_head rhash[], - xlog_rec_header_t *rhead, + struct xlog_rec_header *rhead, xfs_caddr_t dp, int pass) { @@ -2933,8 +3568,8 @@ xlog_recover_process_data( dp += sizeof(xlog_op_header_t); if (ohead->oh_clientid != XFS_TRANSACTION && ohead->oh_clientid != XFS_LOG) { - xlog_warn( - "XFS: xlog_recover_process_data: bad clientid"); + xfs_warn(log->l_mp, "%s: bad clientid 0x%x", + __func__, ohead->oh_clientid); ASSERT(0); return (XFS_ERROR(EIO)); } @@ -2947,8 +3582,8 @@ xlog_recover_process_data( be64_to_cpu(rhead->h_lsn)); } else { if (dp + be32_to_cpu(ohead->oh_len) > lp) { - xlog_warn( - "XFS: xlog_recover_process_data: bad length"); + xfs_warn(log->l_mp, "%s: bad length 0x%x", + __func__, be32_to_cpu(ohead->oh_len)); WARN_ON(1); return (XFS_ERROR(EIO)); } @@ -2961,7 +3596,7 @@ xlog_recover_process_data( trans, pass); break; case XLOG_UNMOUNT_TRANS: - error = xlog_recover_unmount_trans(trans); + error = xlog_recover_unmount_trans(log); break; case XLOG_WAS_CONT_TRANS: error = xlog_recover_add_to_cont_trans(log, @@ -2969,8 +3604,8 @@ xlog_recover_process_data( be32_to_cpu(ohead->oh_len)); break; case XLOG_START_TRANS: - xlog_warn( - "XFS: xlog_recover_process_data: bad transaction"); + xfs_warn(log->l_mp, "%s: bad transaction", + __func__); ASSERT(0); error = XFS_ERROR(EIO); break; @@ -2980,14 +3615,16 @@ xlog_recover_process_data( dp, be32_to_cpu(ohead->oh_len)); break; default: - xlog_warn( - "XFS: xlog_recover_process_data: bad flag"); + xfs_warn(log->l_mp, "%s: bad flag 0x%x", + __func__, flags); ASSERT(0); error = XFS_ERROR(EIO); break; } - if (error) + if (error) { + xlog_recover_free_trans(trans); return error; + } } dp += be32_to_cpu(ohead->oh_len); num_logops--; @@ -3011,7 +3648,7 @@ xlog_recover_process_efi( xfs_extent_t *extp; xfs_fsblock_t startblock_fsb; - ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); + ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); /* * First check the validity of the extents described by the @@ -3030,13 +3667,14 @@ xlog_recover_process_efi( * This will pull the EFI from the AIL and * free the memory associated with it. */ + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip, efip->efi_format.efi_nextents); return XFS_ERROR(EIO); } } tp = xfs_trans_alloc(mp, 0); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto abort_error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); @@ -3050,7 +3688,7 @@ xlog_recover_process_efi( extp->ext_len); } - efip->efi_flags |= XFS_EFI_RECOVERED; + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); error = xfs_trans_commit(tp, 0); return error; @@ -3079,7 +3717,7 @@ abort_error: */ STATIC int xlog_recover_process_efis( - xlog_t *log) + struct xlog *log) { xfs_log_item_t *lip; xfs_efi_log_item_t *efip; @@ -3107,7 +3745,7 @@ xlog_recover_process_efis( * Skip EFIs that we've already processed. 
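[Editor's sketch, not part of the patch: xlog_recover_commit_trans above batches pass-2 work so that readahead can be issued for up to XLOG_RECOVER_COMMIT_QUEUE_MAX (100) items before each batch is replayed. The outline below models that flow; item_readahead() and replay_items() are invented stand-ins for xlog_recover_ra_pass2() and xlog_recover_items_pass2().]

/* Illustrative sketch of the pass-2 readahead batching scheme. */
#include <stddef.h>

#define COMMIT_QUEUE_MAX        100

struct recover_item;                    /* opaque here */

extern void item_readahead(struct recover_item *item);
extern int  replay_items(struct recover_item **batch, size_t nr);

static int replay_transaction(struct recover_item **items, size_t nr_items)
{
        struct recover_item *batch[COMMIT_QUEUE_MAX];
        size_t queued = 0;
        int error = 0;

        for (size_t i = 0; i < nr_items && !error; i++) {
                item_readahead(items[i]);       /* start the I/O early */
                batch[queued++] = items[i];
                if (queued == COMMIT_QUEUE_MAX) {
                        error = replay_items(batch, queued);
                        queued = 0;
                }
        }
        if (!error && queued)
                error = replay_items(batch, queued);
        return error;
}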
*/ efip = (xfs_efi_log_item_t *)lip; - if (efip->efi_flags & XFS_EFI_RECOVERED) { + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { lip = xfs_trans_ail_cursor_next(ailp, &cur); continue; } @@ -3120,7 +3758,7 @@ xlog_recover_process_efis( lip = xfs_trans_ail_cursor_next(ailp, &cur); } out: - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); return error; } @@ -3142,8 +3780,7 @@ xlog_recover_clear_agi_bucket( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); - error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), - 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0); if (error) goto out_abort; @@ -3166,8 +3803,7 @@ xlog_recover_clear_agi_bucket( out_abort: xfs_trans_cancel(tp, XFS_TRANS_ABORT); out_error: - xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " - "failed to clear agi %d. Continuing.", agno); + xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); return; } @@ -3192,7 +3828,7 @@ xlog_recover_process_one_iunlink( /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0); if (error) goto fail_iput; @@ -3241,7 +3877,7 @@ xlog_recover_process_one_iunlink( */ STATIC void xlog_recover_process_iunlinks( - xlog_t *log) + struct xlog *log) { xfs_mount_t *mp; xfs_agnumber_t agno; @@ -3274,116 +3910,83 @@ xlog_recover_process_iunlinks( */ continue; } + /* + * Unlock the buffer so that it can be acquired in the normal + * course of the transaction to truncate and free each inode. + * Because we are not racing with anyone else here for the AGI + * buffer, we don't even need to hold it locked to read the + * initial unlinked bucket entries out of the buffer. We keep + * buffer reference though, so that it stays pinned in memory + * while we need the buffer. + */ agi = XFS_BUF_TO_AGI(agibp); + xfs_buf_unlock(agibp); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { - /* - * Release the agi buffer so that it can - * be acquired in the normal course of the - * transaction to truncate and free the inode. - */ - xfs_buf_relse(agibp); - agino = xlog_recover_process_one_iunlink(mp, agno, agino, bucket); - - /* - * Reacquire the agibuffer and continue around - * the loop. This should never fail as we know - * the buffer was good earlier on. - */ - error = xfs_read_agi(mp, NULL, agno, &agibp); - ASSERT(error == 0); - agi = XFS_BUF_TO_AGI(agibp); } } - - /* - * Release the buffer for the current agi so we can - * go on to the next one. - */ - xfs_buf_relse(agibp); + xfs_buf_rele(agibp); } mp->m_dmevmask = mp_dmevmask; } - -#ifdef DEBUG -STATIC void -xlog_pack_data_checksum( - xlog_t *log, - xlog_in_core_t *iclog, - int size) -{ - int i; - __be32 *up; - uint chksum = 0; - - up = (__be32 *)iclog->ic_datap; - /* divide length by 4 to get # words */ - for (i = 0; i < (size >> 2); i++) { - chksum ^= be32_to_cpu(*up); - up++; - } - iclog->ic_header.h_chksum = cpu_to_be32(chksum); -} -#else -#define xlog_pack_data_checksum(log, iclog, size) -#endif - /* - * Stamp cycle number in every block + * Upack the log buffer data and crc check it. If the check fails, issue a + * warning if and only if the CRC in the header is non-zero. 
This makes the + * check an advisory warning, and the zero CRC check will prevent failure + * warnings from being emitted when upgrading the kernel from one that does not + * add CRCs by default. + * + * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log + * corruption failure */ -void -xlog_pack_data( - xlog_t *log, - xlog_in_core_t *iclog, - int roundoff) +STATIC int +xlog_unpack_data_crc( + struct xlog_rec_header *rhead, + xfs_caddr_t dp, + struct xlog *log) { - int i, j, k; - int size = iclog->ic_offset + roundoff; - __be32 cycle_lsn; - xfs_caddr_t dp; - - xlog_pack_data_checksum(log, iclog, size); - - cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); - - dp = iclog->ic_datap; - for (i = 0; i < BTOBB(size) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; - } - - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xlog_in_core_2_t *xhdr = iclog->ic_data; - - for ( ; i < BTOBB(size); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; + __le32 crc; + + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + if (crc != rhead->h_crc) { + if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + xfs_alert(log->l_mp, + "log record CRC mismatch: found 0x%x, expected 0x%x.", + le32_to_cpu(rhead->h_crc), + le32_to_cpu(crc)); + xfs_hex_dump(dp, 32); } - for (i = 1; i < log->l_iclog_heads; i++) { - xhdr[i].hic_xheader.xh_cycle = cycle_lsn; - } + /* + * If we've detected a log record corruption, then we can't + * recover past this point. Abort recovery if we are enforcing + * CRC protection by punting an error back up the stack. 
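[Editor's sketch, not part of the patch: the advisory-versus-fatal CRC policy just described can be stated compactly. log_record_cksum() below is an invented stand-in for xlog_cksum(); any CRC32c implementation would fill that role in a real tool.]

/* Illustrative sketch of the log record CRC policy: warn, but only fail on v5. */
#include <stdint.h>
#include <stdio.h>

extern uint32_t log_record_cksum(const void *hdr, const void *data, int len);

static int check_record_crc(uint32_t stored_crc, const void *hdr, const void *data,
                            int len, int fs_has_crc)
{
        uint32_t crc = log_record_cksum(hdr, data, len);

        if (crc == stored_crc)
                return 0;

        /*
         * A zero stored CRC means the record was written by a kernel that
         * never computed one, so stay quiet unless CRCs are expected.
         */
        if (stored_crc || fs_has_crc)
                fprintf(stderr, "log record CRC mismatch: found 0x%x, expected 0x%x\n",
                        stored_crc, crc);

        /* only CRC-enabled (v5) filesystems treat the mismatch as fatal */
        return fs_has_crc ? -1 : 0;
}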
+ */ + if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) + return EFSCORRUPTED; } + + return 0; } -STATIC void +STATIC int xlog_unpack_data( - xlog_rec_header_t *rhead, + struct xlog_rec_header *rhead, xfs_caddr_t dp, - xlog_t *log) + struct xlog *log) { int i, j, k; + int error; + + error = xlog_unpack_data_crc(rhead, dp, log); + if (error) + return error; for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { @@ -3400,17 +4003,19 @@ xlog_unpack_data( dp += BBSIZE; } } + + return 0; } STATIC int xlog_valid_rec_header( - xlog_t *log, - xlog_rec_header_t *rhead, + struct xlog *log, + struct xlog_rec_header *rhead, xfs_daddr_t blkno) { int hlen; - if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) { + if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) { XFS_ERROR_REPORT("xlog_valid_rec_header(1)", XFS_ERRLEVEL_LOW, log->l_mp); return XFS_ERROR(EFSCORRUPTED); @@ -3418,7 +4023,7 @@ xlog_valid_rec_header( if (unlikely( (!rhead->h_version || (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { - xlog_warn("XFS: %s: unrecognised log version (%d).", + xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", __func__, be32_to_cpu(rhead->h_version)); return XFS_ERROR(EIO); } @@ -3448,7 +4053,7 @@ xlog_valid_rec_header( */ STATIC int xlog_do_recovery_pass( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, int pass) @@ -3531,9 +4136,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, - rhash, rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, + rhash, rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3548,7 +4157,7 @@ xlog_do_recovery_pass( /* * Check for header wrapping around physical end-of-log */ - offset = XFS_BUF_PTR(hbp); + offset = hbp->b_addr; split_hblks = 0; wrapped_hblks = 0; if (blk_no + hblks <= log->l_logBBsize) { @@ -3584,19 +4193,9 @@ xlog_do_recovery_pass( * - order is important. */ wrapped_hblks = hblks - split_hblks; - error = XFS_BUF_SET_PTR(hbp, - offset + BBTOB(split_hblks), - BBTOB(hblks - split_hblks)); - if (error) - goto bread_err2; - - error = xlog_bread_noalign(log, 0, - wrapped_hblks, hbp); - if (error) - goto bread_err2; - - error = XFS_BUF_SET_PTR(hbp, offset, - BBTOB(hblks)); + error = xlog_bread_offset(log, 0, + wrapped_hblks, hbp, + offset + BBTOB(split_hblks)); if (error) goto bread_err2; } @@ -3618,7 +4217,7 @@ xlog_do_recovery_pass( } else { /* This log record is split across the * physical end of log */ - offset = XFS_BUF_PTR(dbp); + offset = dbp->b_addr; split_bblks = 0; if (blk_no != log->l_logBBsize) { /* some data is before the physical @@ -3647,25 +4246,20 @@ xlog_do_recovery_pass( * _first_, then the log start (LR header end) * - order is important. 
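[Editor's sketch, not part of the patch: the split-read handling in the recovery pass deals with a log record that wraps the physical end of the log by reading the tail of the device first and then continuing from block 0. A simplified restatement, assuming 512-byte basic blocks; log_read_blocks() is an invented stand-in for xlog_bread()/xlog_bread_offset().]

/* Illustrative sketch of reading a run of log blocks that may wrap the physical end of the log. */
#include <stdint.h>

typedef int64_t logblk_t;

extern int log_read_blocks(logblk_t blk_no, int nbblks, char *dst);

static int log_read_wrapped(logblk_t blk_no, int nbblks, int log_size_bb, char *dst)
{
        int split, error;

        if (blk_no + nbblks <= log_size_bb)
                return log_read_blocks(blk_no, nbblks, dst);

        /* split read: tail of the physical log first, then wrap to block 0 */
        split = (int)(log_size_bb - blk_no);
        error = log_read_blocks(blk_no, split, dst);
        if (error)
                return error;
        return log_read_blocks(0, nbblks - split, dst + 512L * split);
}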
*/ - error = XFS_BUF_SET_PTR(dbp, - offset + BBTOB(split_bblks), - BBTOB(bblks - split_bblks)); + error = xlog_bread_offset(log, 0, + bblks - split_bblks, dbp, + offset + BBTOB(split_bblks)); if (error) goto bread_err2; + } - error = xlog_bread_noalign(log, wrapped_hblks, - bblks - split_bblks, - dbp); - if (error) - goto bread_err2; + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; - error = XFS_BUF_SET_PTR(dbp, offset, h_size); - if (error) - goto bread_err2; - } - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks; } @@ -3690,9 +4284,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3720,11 +4318,11 @@ xlog_do_recovery_pass( */ STATIC int xlog_do_log_recovery( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { - int error; + int error, i; ASSERT(head_blk != tail_blk); @@ -3732,10 +4330,12 @@ xlog_do_log_recovery( * First do a pass to find all of the cancelled buf log items. * Store them in the buf_cancel_table for use in the second pass. */ - log->l_buf_cancel_table = - (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE * - sizeof(xfs_buf_cancel_t*), + log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE * + sizeof(struct list_head), KM_SLEEP); + for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) + INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); + error = xlog_do_recovery_pass(log, head_blk, tail_blk, XLOG_RECOVER_PASS1); if (error != 0) { @@ -3754,7 +4354,7 @@ xlog_do_log_recovery( int i; for (i = 0; i < XLOG_BC_TABLE_SIZE; i++) - ASSERT(log->l_buf_cancel_table[i] == NULL); + ASSERT(list_empty(&log->l_buf_cancel_table[i])); } #endif /* DEBUG */ @@ -3769,7 +4369,7 @@ xlog_do_log_recovery( */ STATIC int xlog_do_recover( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { @@ -3781,11 +4381,8 @@ xlog_do_recover( * First replay the images in the log. */ error = xlog_do_log_recovery(log, head_blk, tail_blk); - if (error) { + if (error) return error; - } - - XFS_bflush(log->l_mp->m_ddev_targp); /* * If IO errors happened during recovery, bail out. @@ -3807,19 +4404,24 @@ xlog_do_recover( /* * Now that we've finished replaying all buffer and inode - * updates, re-read in the superblock. + * updates, re-read in the superblock and reverify it. 
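[Editor's sketch, not part of the patch: xlog_do_log_recovery above allocates the buffer-cancel table as an array of list heads, runs pass 1 to populate it, runs pass 2 to replay, and expects the table to be empty again afterwards. The outline below models that lifecycle; the bucket count and run_recovery_pass() are invented stand-ins.]

/* Illustrative sketch of the two-pass driver and the cancel-table lifecycle. */
#include <stdlib.h>
#include <assert.h>

#define BC_TABLE_SIZE   64      /* bucket count; the real value is a kernel constant */

struct bucket {
        struct cancel_entry *head;      /* cancelled-buffer records hashing to this bucket */
};

extern int run_recovery_pass(int pass, struct bucket *table);

static int do_log_recovery(void)
{
        struct bucket *table = calloc(BC_TABLE_SIZE, sizeof(*table));
        int error;

        if (!table)
                return -1;

        error = run_recovery_pass(1, table);            /* build the cancel table */
        if (!error)
                error = run_recovery_pass(2, table);    /* replay everything not cancelled */

        /* every cancel record should have been consumed by pass 2 */
        for (int i = 0; i < BC_TABLE_SIZE; i++)
                assert(table[i].head == NULL);

        free(table);
        return error;
}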
*/ bp = xfs_getsb(log->l_mp, 0); XFS_BUF_UNDONE(bp); ASSERT(!(XFS_BUF_ISWRITE(bp))); - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); - xfsbdstrat(log->l_mp, bp); + bp->b_ops = &xfs_sb_buf_ops; + + if (XFS_FORCED_SHUTDOWN(log->l_mp)) { + xfs_buf_relse(bp); + return XFS_ERROR(EIO); + } + + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { - xfs_ioerror_alert("xlog_do_recover", - log->l_mp, bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); ASSERT(0); xfs_buf_relse(bp); return error; @@ -3849,7 +4451,7 @@ xlog_do_recover( */ int xlog_recover( - xlog_t *log) + struct xlog *log) { xfs_daddr_t head_blk, tail_blk; int error; @@ -3874,10 +4476,28 @@ xlog_recover( return error; } - cmn_err(CE_NOTE, - "Starting XFS recovery on filesystem: %s (logdev: %s)", - log->l_mp->m_fsname, log->l_mp->m_logname ? - log->l_mp->m_logname : "internal"); + /* + * Version 5 superblock log feature mask validation. We know the + * log is dirty so check if there are any unknown log features + * in what we need to recover. If there are unknown features + * (e.g. unsupported transactions, then simply reject the + * attempt at recovery before touching anything. + */ + if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { + xfs_warn(log->l_mp, +"Superblock has unknown incompatible log features (0x%x) enabled.\n" +"The log can not be fully and/or safely recovered by this kernel.\n" +"Please recover the log on a kernel that supports the unknown features.", + (log->l_mp->m_sb.sb_features_log_incompat & + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); + return EINVAL; + } + + xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", + log->l_mp->m_logname ? log->l_mp->m_logname + : "internal"); error = xlog_do_recover(log, head_blk, tail_blk); log->l_flags |= XLOG_RECOVERY_NEEDED; @@ -3896,7 +4516,7 @@ xlog_recover( */ int xlog_recover_finish( - xlog_t *log) + struct xlog *log) { /* * Now we're ready to do the transactions needed for the @@ -3910,9 +4530,7 @@ xlog_recover_finish( int error; error = xlog_recover_process_efis(log); if (error) { - cmn_err(CE_ALERT, - "Failed to recover EFIs on filesystem: %s", - log->l_mp->m_fsname); + xfs_alert(log->l_mp, "Failed to recover EFIs"); return error; } /* @@ -3927,15 +4545,12 @@ xlog_recover_finish( xlog_recover_check_summary(log); - cmn_err(CE_NOTE, - "Ending XFS recovery on filesystem: %s (logdev: %s)", - log->l_mp->m_fsname, log->l_mp->m_logname ? - log->l_mp->m_logname : "internal"); + xfs_notice(log->l_mp, "Ending recovery (logdev: %s)", + log->l_mp->m_logname ? 
log->l_mp->m_logname + : "internal"); log->l_flags &= ~XLOG_RECOVERY_NEEDED; } else { - cmn_err(CE_DEBUG, - "!Ending clean XFS mount for filesystem: %s\n", - log->l_mp->m_fsname); + xfs_info(log->l_mp, "Ending clean mount"); } return 0; } @@ -3948,7 +4563,7 @@ xlog_recover_finish( */ void xlog_recover_check_summary( - xlog_t *log) + struct xlog *log) { xfs_mount_t *mp; xfs_agf_t *agfp; @@ -3968,10 +4583,8 @@ xlog_recover_check_summary( for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); if (error) { - xfs_fs_cmn_err(CE_ALERT, mp, - "xlog_recover_check_summary(agf)" - "agf read failed agno %d error %d", - agno, error); + xfs_alert(mp, "%s agf read failed agno %d error %d", + __func__, agno, error); } else { agfp = XFS_BUF_TO_AGF(agfbp); freeblks += be32_to_cpu(agfp->agf_freeblks) + @@ -3980,7 +4593,10 @@ xlog_recover_check_summary( } error = xfs_read_agi(mp, NULL, agno, &agibp); - if (!error) { + if (error) { + xfs_alert(mp, "%s agi read failed agno %d error %d", + __func__, agno, error); + } else { struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); itotal += be32_to_cpu(agi->agi_count); |
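[Editor's sketch, not part of the patch: the debug-only summary check above walks every allocation group, accumulating free blocks from the AGF (free space plus AGFL entries) and allocated inodes from the AGI, so the totals can be compared against the superblock counters after recovery. A minimal restatement over host-endian copies of those headers, with invented struct names:]

/* Illustrative sketch of the post-recovery per-AG summary accumulation. */
#include <stdint.h>

struct agf_summary {            /* per-AG free space header fields, host endian */
        uint32_t        freeblks;       /* free blocks in the AG */
        uint32_t        flcount;        /* blocks held on the AG free list */
};

struct agi_summary {            /* per-AG inode header fields, host endian */
        uint32_t        count;          /* allocated inodes in the AG */
};

static void sum_ag_counters(const struct agf_summary *agf, const struct agi_summary *agi,
                            unsigned int agcount, uint64_t *freeblks, uint64_t *itotal)
{
        *freeblks = 0;
        *itotal = 0;
        for (unsigned int agno = 0; agno < agcount; agno++) {
                *freeblks += agf[agno].freeblks + agf[agno].flcount;
                *itotal += agi[agno].count;
        }
        /* the totals can then be checked against the superblock counters */
}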
