Diffstat (limited to 'fs/xfs/xfs_log_recover.c')
-rw-r--r--  fs/xfs/xfs_log_recover.c | 3019
1 file changed, 1774 insertions(+), 1245 deletions(-)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 69ac2e5ef20..981af0f6504 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -17,100 +17,149 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" -#include "xfs_dir2.h" -#include "xfs_dmapi.h" #include "xfs_mount.h" -#include "xfs_error.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dir2_sf.h" -#include "xfs_attr_sf.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" +#include "xfs_trans.h" +#include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_buf_item.h" #include "xfs_log_recover.h" +#include "xfs_inode_item.h" #include "xfs_extfree_item.h" #include "xfs_trans_priv.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" #include "xfs_quota.h" -#include "xfs_rw.h" -#include "xfs_utils.h" +#include "xfs_cksum.h" #include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_error.h" +#include "xfs_dir2.h" + +#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) -STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); -STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); -STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q, - xlog_recover_item_t *item); +STATIC int +xlog_find_zeroed( + struct xlog *, + xfs_daddr_t *); +STATIC int +xlog_clear_stale_blocks( + struct xlog *, + xfs_lsn_t); #if defined(DEBUG) -STATIC void xlog_recover_check_summary(xlog_t *); +STATIC void +xlog_recover_check_summary( + struct xlog *); #else #define xlog_recover_check_summary(log) #endif +/* + * This structure is used during recovery to record the buf log items which + * have been canceled and should not be replayed. + */ +struct xfs_buf_cancel { + xfs_daddr_t bc_blkno; + uint bc_len; + int bc_refcount; + struct list_head bc_list; +}; /* * Sector aligned buffer routines for buffer create/read/write/access */ -#define XLOG_SECTOR_ROUNDUP_BBCOUNT(log, bbs) \ - ( ((log)->l_sectbb_mask && (bbs & (log)->l_sectbb_mask)) ? \ - ((bbs + (log)->l_sectbb_mask + 1) & ~(log)->l_sectbb_mask) : (bbs) ) -#define XLOG_SECTOR_ROUNDDOWN_BLKNO(log, bno) ((bno) & ~(log)->l_sectbb_mask) +/* + * Verify the given count of basic blocks is valid number of blocks + * to specify for an operation involving the given XFS log buffer. + * Returns nonzero if the count is valid, 0 otherwise. + */ -xfs_buf_t * +static inline int +xlog_buf_bbcount_valid( + struct xlog *log, + int bbcount) +{ + return bbcount > 0 && bbcount <= log->l_logBBsize; +} + +/* + * Allocate a buffer to hold log data. The buffer needs to be able + * to map to a range of nbblks basic blocks at any valid (basic + * block) offset within the log. 
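
A minimal userspace sketch of the sector-rounding arithmetic described above, assuming a power-of-2 sector size; fake_log and round_up_pow2 are illustrative stand-ins, not kernel API:

#include <stdio.h>

/* Illustrative stand-in for the two log geometry fields used here. */
struct fake_log {
	int l_sectBBsize;	/* log sector size in basic blocks (power of 2) */
	int l_logBBsize;	/* log size in basic blocks */
};

/* Same arithmetic as the kernel's round_up() for power-of-2 multiples. */
static int round_up_pow2(int v, int mult)
{
	return (v + mult - 1) & ~(mult - 1);
}

int main(void)
{
	struct fake_log log = { .l_sectBBsize = 8, .l_logBBsize = 4096 };
	int nbblks = 13;	/* caller asks for 13 basic blocks */

	/* A multi-block read at a non-sector-aligned offset can spill into
	 * one extra sector, so reserve room for it before rounding up. */
	if (nbblks > 1 && log.l_sectBBsize > 1)
		nbblks += log.l_sectBBsize;
	nbblks = round_up_pow2(nbblks, log.l_sectBBsize);

	printf("allocate %d basic blocks\n", nbblks);	/* prints 24 */
	return 0;
}
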
+ */ +STATIC xfs_buf_t * xlog_get_bp( - xlog_t *log, + struct xlog *log, int nbblks) { - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_get_bp(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + struct xfs_buf *bp; + + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return NULL; } - if (log->l_sectbb_log) { - if (nbblks > 1) - nbblks += XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } - return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp); + /* + * We do log I/O in units of log sectors (a power-of-2 + * multiple of the basic block size), so we round up the + * requested size to accommodate the basic blocks required + * for complete log sectors. + * + * In addition, the buffer may be used for a non-sector- + * aligned block offset, in which case an I/O of the + * requested size could extend beyond the end of the + * buffer. If the requested size is only 1 basic block it + * will never straddle a sector boundary, so this won't be + * an issue. Nor will this be a problem if the log I/O is + * done in basic blocks (sector size 1). But otherwise we + * extend the buffer by one extra log sector to ensure + * there's space to accommodate this possibility. + */ + if (nbblks > 1 && log->l_sectBBsize > 1) + nbblks += log->l_sectBBsize; + nbblks = round_up(nbblks, log->l_sectBBsize); + + bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0); + if (bp) + xfs_buf_unlock(bp); + return bp; } -void +STATIC void xlog_put_bp( xfs_buf_t *bp) { xfs_buf_free(bp); } +/* + * Return the address of the start of the given block number's data + * in a log buffer. The buffer covers a log sector-aligned region. 
+ */ STATIC xfs_caddr_t xlog_align( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { - xfs_caddr_t ptr; + xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); - if (!log->l_sectbb_log) - return XFS_BUF_PTR(bp); - - ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask); - ASSERT(XFS_BUF_SIZE(bp) >= - BBTOB(nbblks + (blk_no & log->l_sectbb_mask))); - return ptr; + ASSERT(offset + nbblks <= bp->b_length); + return bp->b_addr + BBTOB(offset); } @@ -119,49 +168,47 @@ xlog_align( */ STATIC int xlog_bread_noalign( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { int error; - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_bread(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return EFSCORRUPTED; } - if (log->l_sectbb_log) { - blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); - ASSERT(bp); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); - XFS_BUF_BUSY(bp); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); + bp->b_io_length = nbblks; + bp->b_error = 0; - xfsbdstrat(log->l_mp, bp); - error = xfs_iowait(bp); + if (XFS_FORCED_SHUTDOWN(log->l_mp)) + return XFS_ERROR(EIO); + + xfs_buf_iorequest(bp); + error = xfs_buf_iowait(bp); if (error) - xfs_ioerror_alert("xlog_bread", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + xfs_buf_ioerror_alert(bp, __func__); return error; } STATIC int xlog_bread( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_caddr_t *offset) { int error; @@ -175,45 +222,72 @@ xlog_bread( } /* + * Read at an offset into the buffer. Returns with the buffer in it's original + * state regardless of the result of the read. + */ +STATIC int +xlog_bread_offset( + struct xlog *log, + xfs_daddr_t blk_no, /* block to read from */ + int nbblks, /* blocks to read */ + struct xfs_buf *bp, + xfs_caddr_t offset) +{ + xfs_caddr_t orig_offset = bp->b_addr; + int orig_len = BBTOB(bp->b_length); + int error, error2; + + error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); + if (error) + return error; + + error = xlog_bread_noalign(log, blk_no, nbblks, bp); + + /* must reset buffer pointer even on error */ + error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len); + if (error) + return error; + return error2; +} + +/* * Write out the buffer at the given block for the given number of blocks. * The buffer is kept locked across the write and is returned locked. * This can only be used for synchronous log writes. 
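
The interplay between xlog_bread_noalign() and xlog_align() above reduces to a little alignment arithmetic. A standalone sketch with hard-coded example values:

#include <stdio.h>

int main(void)
{
	long sectBB = 8;		/* log sector size in basic blocks */
	long blk_no = 21, nbblks = 3;	/* caller's unaligned request */

	/* xlog_bread_noalign(): widen the I/O to whole sectors */
	long io_blk = blk_no & ~(sectBB - 1);			/* 16 */
	long io_len = (nbblks + sectBB - 1) & ~(sectBB - 1);	/* 8 */

	/* xlog_align(): the caller's data sits this far into the buffer */
	long offset = blk_no & (sectBB - 1);			/* 5 */

	printf("read %ld blocks at %ld; data starts %ld blocks in\n",
	       io_len, io_blk, offset);
	return 0;
}

This is also why xlog_get_bp() pads multi-block buffers by one extra sector: rounding only nbblks up can leave too little room when an unaligned start pushes the tail of the data past the rounded length.
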
*/ STATIC int xlog_bwrite( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { int error; - if (nbblks <= 0 || nbblks > log->l_logBBsize) { - xlog_warn("XFS: Invalid block length (0x%x) given for buffer", nbblks); - XFS_ERROR_REPORT("xlog_bwrite(1)", - XFS_ERRLEVEL_HIGH, log->l_mp); + if (!xlog_buf_bbcount_valid(log, nbblks)) { + xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer", + nbblks); + XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp); return EFSCORRUPTED; } - if (log->l_sectbb_log) { - blk_no = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, blk_no); - nbblks = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, nbblks); - } + blk_no = round_down(blk_no, log->l_sectBBsize); + nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_ZEROFLAGS(bp); - XFS_BUF_BUSY(bp); - XFS_BUF_HOLD(bp); - XFS_BUF_PSEMA(bp, PRIBIO); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); - XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); - - if ((error = xfs_bwrite(log->l_mp, bp))) - xfs_ioerror_alert("xlog_bwrite", log->l_mp, - bp, XFS_BUF_ADDR(bp)); + xfs_buf_hold(bp); + xfs_buf_lock(bp); + bp->b_io_length = nbblks; + bp->b_error = 0; + + error = xfs_bwrite(bp); + if (error) + xfs_buf_ioerror_alert(bp, __func__); + xfs_buf_relse(bp); return error; } @@ -226,9 +300,9 @@ xlog_header_check_dump( xfs_mount_t *mp, xlog_rec_header_t *head) { - cmn_err(CE_DEBUG, "%s: SB : uuid = %pU, fmt = %d\n", + xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", __func__, &mp->m_sb.sb_uuid, XLOG_FMT); - cmn_err(CE_DEBUG, " log : uuid = %pU, fmt = %d\n", + xfs_debug(mp, " log : uuid = %pU, fmt = %d", &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); } #else @@ -243,23 +317,23 @@ xlog_header_check_recover( xfs_mount_t *mp, xlog_rec_header_t *head) { - ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); + ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); /* * IRIX doesn't write the h_fmt field and leaves it zeroed * (XLOG_FMT_UNKNOWN). This stops us from trying to recover * a dirty log created in IRIX. */ - if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) { - xlog_warn( - "XFS: dirty log written in incompatible format - can't recover"); + if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) { + xfs_warn(mp, + "dirty log written in incompatible format - can't recover"); xlog_header_check_dump(mp, head); XFS_ERROR_REPORT("xlog_header_check_recover(1)", XFS_ERRLEVEL_HIGH, mp); return XFS_ERROR(EFSCORRUPTED); } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { - xlog_warn( - "XFS: dirty log entry has mismatched uuid - can't recover"); + xfs_warn(mp, + "dirty log entry has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); XFS_ERROR_REPORT("xlog_header_check_recover(2)", XFS_ERRLEVEL_HIGH, mp); @@ -276,7 +350,7 @@ xlog_header_check_mount( xfs_mount_t *mp, xlog_rec_header_t *head) { - ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM); + ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)); if (uuid_is_nil(&head->h_fs_uuid)) { /* @@ -284,9 +358,9 @@ xlog_header_check_mount( * h_fs_uuid is nil, we assume this log was last mounted * by IRIX and continue. 
*/ - xlog_warn("XFS: nil uuid in log - IRIX style log"); + xfs_warn(mp, "nil uuid in log - IRIX style log"); } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) { - xlog_warn("XFS: log has mismatched uuid - can't recover"); + xfs_warn(mp, "log has mismatched uuid - can't recover"); xlog_header_check_dump(mp, head); XFS_ERROR_REPORT("xlog_header_check_mount", XFS_ERRLEVEL_HIGH, mp); @@ -299,18 +373,17 @@ STATIC void xlog_recover_iodone( struct xfs_buf *bp) { - if (XFS_BUF_GETERROR(bp)) { + if (bp->b_error) { /* * We're not going to bother about retrying * this during recovery. One strike! */ - xfs_ioerror_alert("xlog_recover_iodone", - bp->b_mount, bp, XFS_BUF_ADDR(bp)); - xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR); + xfs_buf_ioerror_alert(bp, __func__); + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); } - bp->b_mount = NULL; - XFS_BUF_CLR_IODONE_FUNC(bp); - xfs_biodone(bp); + bp->b_iodone = NULL; + xfs_buf_ioend(bp, 0); } /* @@ -321,51 +394,50 @@ xlog_recover_iodone( */ STATIC int xlog_find_cycle_start( - xlog_t *log, - xfs_buf_t *bp, + struct xlog *log, + struct xfs_buf *bp, xfs_daddr_t first_blk, xfs_daddr_t *last_blk, uint cycle) { xfs_caddr_t offset; xfs_daddr_t mid_blk; + xfs_daddr_t end_blk; uint mid_cycle; int error; - mid_blk = BLK_AVG(first_blk, *last_blk); - while (mid_blk != first_blk && mid_blk != *last_blk) { + end_blk = *last_blk; + mid_blk = BLK_AVG(first_blk, end_blk); + while (mid_blk != first_blk && mid_blk != end_blk) { error = xlog_bread(log, mid_blk, 1, bp, &offset); if (error) return error; mid_cycle = xlog_get_cycle(offset); - if (mid_cycle == cycle) { - *last_blk = mid_blk; - /* last_half_cycle == mid_cycle */ - } else { - first_blk = mid_blk; - /* first_half_cycle == mid_cycle */ - } - mid_blk = BLK_AVG(first_blk, *last_blk); + if (mid_cycle == cycle) + end_blk = mid_blk; /* last_half_cycle == mid_cycle */ + else + first_blk = mid_blk; /* first_half_cycle == mid_cycle */ + mid_blk = BLK_AVG(first_blk, end_blk); } - ASSERT((mid_blk == first_blk && mid_blk+1 == *last_blk) || - (mid_blk == *last_blk && mid_blk-1 == first_blk)); + ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) || + (mid_blk == end_blk && mid_blk-1 == first_blk)); + + *last_blk = end_blk; return 0; } /* - * Check that the range of blocks does not contain the cycle number - * given. The scan needs to occur from front to back and the ptr into the - * region must be updated since a later routine will need to perform another - * test. If the region is completely good, we end up returning the same - * last block number. - * - * Set blkno to -1 if we encounter no errors. This is an invalid block number - * since we don't ever expect logs to get this large. + * Check that a range of blocks does not contain stop_on_cycle_no. + * Fill in *new_blk with the block offset where such a block is + * found, or with -1 (an invalid block number) if there is no such + * block in the range. The scan needs to occur from front to back + * and the pointer into the region must be updated since a later + * routine will need to perform another test. */ STATIC int xlog_find_verify_cycle( - xlog_t *log, + struct xlog *log, xfs_daddr_t start_blk, int nbblks, uint stop_on_cycle_no, @@ -378,12 +450,18 @@ xlog_find_verify_cycle( xfs_caddr_t buf = NULL; int error = 0; + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks we'll be examining. If that fails, + * try a smaller size. 
We need to be able to read at least + * a log sector, or we're out of luck. + */ bufblks = 1 << ffs(nbblks); - + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { - /* can't get enough memory to do everything in one big buffer */ bufblks >>= 1; - if (bufblks <= log->l_sectbb_log) + if (bufblks < log->l_sectBBsize) return ENOMEM; } @@ -428,7 +506,7 @@ out: */ STATIC int xlog_find_verify_log_record( - xlog_t *log, + struct xlog *log, xfs_daddr_t start_blk, xfs_daddr_t *last_blk, int extra_bblks) @@ -458,8 +536,8 @@ xlog_find_verify_log_record( for (i = (*last_blk) - 1; i >= 0; i--) { if (i < start_blk) { /* valid log record not found */ - xlog_warn( - "XFS: Log inconsistent (didn't find previous header)"); + xfs_warn(log->l_mp, + "Log inconsistent (didn't find previous header)"); ASSERT(0); error = XFS_ERROR(EIO); goto out; @@ -473,7 +551,7 @@ xlog_find_verify_log_record( head = (xlog_rec_header_t *)offset; - if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno)) + if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) break; if (!smallmem) @@ -525,7 +603,7 @@ out: /* * Head is defined to be the point of the log where the next log write - * write could go. This means that incomplete LR writes at the end are + * could go. This means that incomplete LR writes at the end are * eliminated when calculating the head. We aren't guaranteed that previous * LR have complete transactions. We only know that a cycle number of * current cycle number -1 won't be present in the log if we start writing @@ -538,7 +616,7 @@ out: */ STATIC int xlog_find_head( - xlog_t *log, + struct xlog *log, xfs_daddr_t *return_head_blk) { xfs_buf_t *bp; @@ -559,12 +637,12 @@ xlog_find_head( * mkfs etc write a dummy unmount record to a fresh * log so we can store the uuid in there */ - xlog_warn("XFS: totally zeroed log"); + xfs_warn(log->l_mp, "totally zeroed log"); } return 0; } else if (error) { - xlog_warn("XFS: empty log check failed"); + xfs_warn(log->l_mp, "empty log check failed"); return error; } @@ -631,7 +709,7 @@ xlog_find_head( * In this case we want to find the first block with cycle * number matching last_half_cycle. We expect the log to be * some variation on - * x + 1 ... | x ... + * x + 1 ... | x ... | x * The first block with cycle number x (last_half_cycle) will * be where the new head belongs. First we do a binary search * for the first occurrence of last_half_cycle. The binary @@ -641,11 +719,13 @@ xlog_find_head( * the log, then we look for occurrences of last_half_cycle - 1 * at the end of the log. The cases we're looking for look * like - * x + 1 ... | x | x + 1 | x ... - * ^ binary search stopped here + * v binary search stopped here + * x + 1 ... | x | x + 1 | x ... | x + * ^ but we want to locate this spot * or - * x + 1 ... | x ... | x - 1 | x * <---------> less than scan distance + * x + 1 ... | x ... | x - 1 | x + * ^ we want to locate this spot */ stop_on_cycle = last_half_cycle; if ((error = xlog_find_cycle_start(log, bp, first_blk, @@ -701,16 +781,16 @@ xlog_find_head( * certainly not the head of the log. By searching for * last_half_cycle-1 we accomplish that. 
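
The search the comments above describe is an ordinary binary search over cycle numbers. A self-contained sketch, using an in-memory array in place of xlog_bread() (layout and values invented for illustration):

#include <stdio.h>

#define BLK_AVG(b1, b2)	(((b1) + (b2)) >> 1)

static unsigned cycle_of(const unsigned *log_blocks, long blk)
{
	return log_blocks[blk];	/* stands in for xlog_bread() + xlog_get_cycle() */
}

int main(void)
{
	/* cycle numbers laid out as "x+1 ... | x ...": head is at index 4 */
	unsigned log_blocks[] = { 3, 3, 3, 3, 2, 2, 2, 2 };
	long first_blk = 0, end_blk = 7, mid_blk;
	unsigned cycle = 2;	/* last_half_cycle */

	mid_blk = BLK_AVG(first_blk, end_blk);
	while (mid_blk != first_blk && mid_blk != end_blk) {
		if (cycle_of(log_blocks, mid_blk) == cycle)
			end_blk = mid_blk;	/* first match is at or before mid */
		else
			first_blk = mid_blk;	/* first match is after mid */
		mid_blk = BLK_AVG(first_blk, end_blk);
	}
	printf("cycle %u starts at block %ld\n", cycle, end_blk);	/* 4 */
	return 0;
}
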
*/ - start_blk = log_bbnum - num_scan_bblks + head_blk; ASSERT(head_blk <= INT_MAX && - (xfs_daddr_t) num_scan_bblks - head_blk >= 0); + (xfs_daddr_t) num_scan_bblks >= head_blk); + start_blk = log_bbnum - (num_scan_bblks - head_blk); if ((error = xlog_find_verify_cycle(log, start_blk, num_scan_bblks - (int)head_blk, (stop_on_cycle - 1), &new_blk))) goto bp_err; if (new_blk != -1) { head_blk = new_blk; - goto bad_blk; + goto validate_head; } /* @@ -728,7 +808,7 @@ xlog_find_head( head_blk = new_blk; } - bad_blk: +validate_head: /* * Now we need to make sure head_blk is not pointing to a block in * the middle of a log record. @@ -750,7 +830,7 @@ xlog_find_head( if ((error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0)) == -1) { /* We hit the beginning of the log during our search */ - start_blk = log_bbnum - num_scan_bblks + head_blk; + start_blk = log_bbnum - (num_scan_bblks - head_blk); new_blk = log_bbnum; ASSERT(start_blk <= INT_MAX && (xfs_daddr_t) log_bbnum-start_blk >= 0); @@ -785,7 +865,7 @@ xlog_find_head( xlog_put_bp(bp); if (error) - xlog_warn("XFS: failed to find log head"); + xfs_warn(log->l_mp, "failed to find log head"); return error; } @@ -805,9 +885,9 @@ xlog_find_head( * We could speed up search by using current head_blk buffer, but it is not * available. */ -int +STATIC int xlog_find_tail( - xlog_t *log, + struct xlog *log, xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk) { @@ -835,12 +915,12 @@ xlog_find_tail( if (*head_blk == 0) { /* special case */ error = xlog_bread(log, 0, 1, bp, &offset); if (error) - goto bread_err; + goto done; if (xlog_get_cycle(offset) == 0) { *tail_blk = 0; /* leave all other log inited values alone */ - goto exit; + goto done; } } @@ -851,9 +931,9 @@ xlog_find_tail( for (i = (int)(*head_blk) - 1; i >= 0; i--) { error = xlog_bread(log, i, 1, bp, &offset); if (error) - goto bread_err; + goto done; - if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) { + if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { found = 1; break; } @@ -868,17 +948,18 @@ xlog_find_tail( for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { error = xlog_bread(log, i, 1, bp, &offset); if (error) - goto bread_err; + goto done; - if (XLOG_HEADER_MAGIC_NUM == - be32_to_cpu(*(__be32 *)offset)) { + if (*(__be32 *)offset == + cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { found = 2; break; } } } if (!found) { - xlog_warn("XFS: xlog_find_tail: couldn't find sync record"); + xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + xlog_put_bp(bp); ASSERT(0); return XFS_ERROR(EIO); } @@ -902,12 +983,12 @@ xlog_find_tail( log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); if (found == 2) log->l_curr_cycle++; - log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn); - log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn); - log->l_grant_reserve_cycle = log->l_curr_cycle; - log->l_grant_reserve_bytes = BBTOB(log->l_curr_block); - log->l_grant_write_cycle = log->l_curr_cycle; - log->l_grant_write_bytes = BBTOB(log->l_curr_block); + atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); + atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); /* * Look for unmount record. 
If we find it, then we know there @@ -937,13 +1018,13 @@ xlog_find_tail( } after_umount_blk = (i + hblks + (int) BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; - tail_lsn = log->l_tail_lsn; + tail_lsn = atomic64_read(&log->l_tail_lsn); if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { umount_data_blk = (i + hblks) % log->l_logBBsize; error = xlog_bread(log, umount_data_blk, 1, bp, &offset); if (error) - goto bread_err; + goto done; op_head = (xlog_op_header_t *)offset; if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { @@ -952,12 +1033,10 @@ xlog_find_tail( * log records will point recovery to after the * current unmount record. */ - log->l_tail_lsn = - xlog_assign_lsn(log->l_curr_cycle, - after_umount_blk); - log->l_last_sync_lsn = - xlog_assign_lsn(log->l_curr_cycle, - after_umount_blk); + xlog_assign_atomic_lsn(&log->l_tail_lsn, + log->l_curr_cycle, after_umount_blk); + xlog_assign_atomic_lsn(&log->l_last_sync_lsn, + log->l_curr_cycle, after_umount_blk); *tail_blk = after_umount_blk; /* @@ -989,16 +1068,14 @@ xlog_find_tail( * But... if the -device- itself is readonly, just skip this. * We can't recover this device anyway, so it won't matter. */ - if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) { + if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp)) error = xlog_clear_stale_blocks(log, tail_lsn); - } -bread_err: -exit: +done: xlog_put_bp(bp); if (error) - xlog_warn("XFS: failed to locate log tail"); + xfs_warn(log->l_mp, "failed to locate log tail"); return error; } @@ -1020,7 +1097,7 @@ exit: */ STATIC int xlog_find_zeroed( - xlog_t *log, + struct xlog *log, xfs_daddr_t *blk_no) { xfs_buf_t *bp; @@ -1062,8 +1139,10 @@ xlog_find_zeroed( * the first block must be 1. If it's not, maybe we're * not looking at a log... Bail out. */ - xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)"); - return XFS_ERROR(EINVAL); + xfs_warn(log->l_mp, + "Log inconsistent or not a log (last==0, first!=1)"); + error = XFS_ERROR(EINVAL); + goto bp_err; } /* we have a partially zeroed log */ @@ -1122,7 +1201,7 @@ bp_err: */ STATIC void xlog_add_record( - xlog_t *log, + struct xlog *log, xfs_caddr_t buf, int cycle, int block, @@ -1144,7 +1223,7 @@ xlog_add_record( STATIC int xlog_write_log_records( - xlog_t *log, + struct xlog *log, int cycle, int start_block, int blocks, @@ -1154,16 +1233,24 @@ xlog_write_log_records( xfs_caddr_t offset; xfs_buf_t *bp; int balign, ealign; - int sectbb = XLOG_SECTOR_ROUNDUP_BBCOUNT(log, 1); + int sectbb = log->l_sectBBsize; int end_block = start_block + blocks; int bufblks; int error = 0; int i, j = 0; + /* + * Greedily allocate a buffer big enough to handle the full + * range of basic blocks to be written. If that fails, try + * a smaller size. We need to be able to write at least a + * log sector, or we're out of luck. + */ bufblks = 1 << ffs(blocks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; - if (bufblks <= log->l_sectbb_log) + if (bufblks < sectbb) return ENOMEM; } @@ -1171,7 +1258,7 @@ xlog_write_log_records( * the buffer in the starting sector not covered by the first * write below. */ - balign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, start_block); + balign = round_down(start_block, sectbb); if (balign != start_block) { error = xlog_bread_noalign(log, start_block, 1, bp); if (error) @@ -1190,22 +1277,14 @@ xlog_write_log_records( * the buffer in the final sector not covered by the write. * If this is the same sector as the above read, skip it. 
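
The head/tail handling in xlog_write_log_records() is a classic read-modify-write of partial sectors. A sketch of just the boundary arithmetic, with illustrative numbers:

#include <stdio.h>

int main(void)
{
	int sectbb = 8;			/* sector size in basic blocks */
	int start_block = 21, blocks = 10;
	int end_block = start_block + blocks;		/* 31 */

	int balign = start_block & ~(sectbb - 1);	/* round_down -> 16 */
	int ealign = end_block & ~(sectbb - 1);		/* round_down -> 24 */

	/* head sector is shared with existing data: read it first */
	if (balign != start_block)
		printf("pre-read head sector at block %d\n", balign);

	/* tail sector is shared too, unless the write ends on a boundary */
	if (ealign != end_block)
		printf("pre-read tail sector at block %d\n", ealign);
	return 0;
}
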
*/ - ealign = XLOG_SECTOR_ROUNDDOWN_BLKNO(log, end_block); + ealign = round_down(end_block, sectbb); if (j == 0 && (start_block + endcount > ealign)) { - offset = XFS_BUF_PTR(bp); - balign = BBTOB(ealign - start_block); - error = XFS_BUF_SET_PTR(bp, offset + balign, - BBTOB(sectbb)); - if (error) - break; - - error = xlog_bread_noalign(log, ealign, sectbb, bp); + offset = bp->b_addr + BBTOB(ealign - start_block); + error = xlog_bread_offset(log, ealign, sectbb, + bp, offset); if (error) break; - error = XFS_BUF_SET_PTR(bp, offset, bufblks); - if (error) - break; } offset = xlog_align(log, start_block, endcount, bp); @@ -1244,7 +1323,7 @@ xlog_write_log_records( */ STATIC int xlog_clear_stale_blocks( - xlog_t *log, + struct xlog *log, xfs_lsn_t tail_lsn) { int tail_cycle, head_cycle; @@ -1367,41 +1446,50 @@ xlog_clear_stale_blocks( STATIC xlog_recover_t * xlog_recover_find_tid( - xlog_recover_t *q, + struct hlist_head *head, xlog_tid_t tid) { - xlog_recover_t *p = q; + xlog_recover_t *trans; - while (p != NULL) { - if (p->r_log_tid == tid) - break; - p = p->r_next; + hlist_for_each_entry(trans, head, r_list) { + if (trans->r_log_tid == tid) + return trans; } - return p; + return NULL; } STATIC void -xlog_recover_put_hashq( - xlog_recover_t **q, - xlog_recover_t *trans) +xlog_recover_new_tid( + struct hlist_head *head, + xlog_tid_t tid, + xfs_lsn_t lsn) { - trans->r_next = *q; - *q = trans; + xlog_recover_t *trans; + + trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); + trans->r_log_tid = tid; + trans->r_lsn = lsn; + INIT_LIST_HEAD(&trans->r_itemq); + + INIT_HLIST_NODE(&trans->r_list); + hlist_add_head(&trans->r_list, head); } STATIC void xlog_recover_add_item( - xlog_recover_item_t **itemq) + struct list_head *head) { xlog_recover_item_t *item; item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); - xlog_recover_insert_item_backq(itemq, item); + INIT_LIST_HEAD(&item->ri_list); + list_add_tail(&item->ri_list, head); } STATIC int xlog_recover_add_to_cont_trans( - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, xfs_caddr_t dp, int len) { @@ -1409,8 +1497,7 @@ xlog_recover_add_to_cont_trans( xfs_caddr_t ptr, old_ptr; int old_len; - item = trans->r_itemq; - if (item == NULL) { + if (list_empty(&trans->r_itemq)) { /* finish copying rest of trans header */ xlog_recover_add_item(&trans->r_itemq); ptr = (xfs_caddr_t) &trans->r_theader + @@ -1418,15 +1505,17 @@ xlog_recover_add_to_cont_trans( memcpy(ptr, dp, len); /* d, s, l */ return 0; } - item = item->ri_prev; + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; old_len = item->ri_buf[item->ri_cnt-1].i_len; - ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u); + ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); memcpy(&ptr[old_len], dp, len); /* d, s, l */ item->ri_buf[item->ri_cnt-1].i_len += len; item->ri_buf[item->ri_cnt-1].i_addr = ptr; + trace_xfs_log_recover_item_add_cont(log, trans, item, 0); return 0; } @@ -1445,7 +1534,8 @@ xlog_recover_add_to_cont_trans( */ STATIC int xlog_recover_add_to_trans( - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, xfs_caddr_t dp, int len) { @@ -1455,12 +1545,11 @@ xlog_recover_add_to_trans( if (!len) return 0; - item = trans->r_itemq; - if (item == NULL) { + if (list_empty(&trans->r_itemq)) { /* we need to catch log corruptions here */ if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { - xlog_warn("XFS: xlog_recover_add_to_trans: " - "bad header 
magic number"); + xfs_warn(log->l_mp, "%s: bad header magic number", + __func__); ASSERT(0); return XFS_ERROR(EIO); } @@ -1474,20 +1563,24 @@ xlog_recover_add_to_trans( memcpy(ptr, dp, len); in_f = (xfs_inode_log_format_t *)ptr; - if (item->ri_prev->ri_total != 0 && - item->ri_prev->ri_total == item->ri_prev->ri_cnt) { + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + if (item->ri_total != 0 && + item->ri_total == item->ri_cnt) { + /* tail item is in use, get a new one */ xlog_recover_add_item(&trans->r_itemq); + item = list_entry(trans->r_itemq.prev, + xlog_recover_item_t, ri_list); } - item = trans->r_itemq; - item = item->ri_prev; if (item->ri_total == 0) { /* first region to be added */ if (in_f->ilf_size == 0 || in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { - xlog_warn( - "XFS: bad number of regions (%d) in inode log format", + xfs_warn(log->l_mp, + "bad number of regions (%d) in inode log format", in_f->ilf_size); ASSERT(0); + kmem_free(ptr); return XFS_ERROR(EIO); } @@ -1501,117 +1594,129 @@ xlog_recover_add_to_trans( item->ri_buf[item->ri_cnt].i_addr = ptr; item->ri_buf[item->ri_cnt].i_len = len; item->ri_cnt++; + trace_xfs_log_recover_item_add(log, trans, item, 0); return 0; } -STATIC void -xlog_recover_new_tid( - xlog_recover_t **q, - xlog_tid_t tid, - xfs_lsn_t lsn) -{ - xlog_recover_t *trans; - - trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); - trans->r_log_tid = tid; - trans->r_lsn = lsn; - xlog_recover_put_hashq(q, trans); -} - -STATIC int -xlog_recover_unlink_tid( - xlog_recover_t **q, - xlog_recover_t *trans) -{ - xlog_recover_t *tp; - int found = 0; - - ASSERT(trans != NULL); - if (trans == *q) { - *q = (*q)->r_next; - } else { - tp = *q; - while (tp) { - if (tp->r_next == trans) { - found = 1; - break; - } - tp = tp->r_next; - } - if (!found) { - xlog_warn( - "XFS: xlog_recover_unlink_tid: trans not found"); - ASSERT(0); - return XFS_ERROR(EIO); - } - tp->r_next = tp->r_next->r_next; - } - return 0; -} - -STATIC void -xlog_recover_insert_item_backq( - xlog_recover_item_t **q, - xlog_recover_item_t *item) -{ - if (*q == NULL) { - item->ri_prev = item->ri_next = item; - *q = item; - } else { - item->ri_next = *q; - item->ri_prev = (*q)->ri_prev; - (*q)->ri_prev = item; - item->ri_prev->ri_next = item; - } -} - -STATIC void -xlog_recover_insert_item_frontq( - xlog_recover_item_t **q, - xlog_recover_item_t *item) -{ - xlog_recover_insert_item_backq(q, item); - *q = item; -} - +/* + * Sort the log items in the transaction. + * + * The ordering constraints are defined by the inode allocation and unlink + * behaviour. The rules are: + * + * 1. Every item is only logged once in a given transaction. Hence it + * represents the last logged state of the item. Hence ordering is + * dependent on the order in which operations need to be performed so + * required initial conditions are always met. + * + * 2. Cancelled buffers are recorded in pass 1 in a separate table and + * there's nothing to replay from them so we can simply cull them + * from the transaction. However, we can't do that until after we've + * replayed all the other items because they may be dependent on the + * cancelled buffer and replaying the cancelled buffer can remove it + * form the cancelled buffer table. Hence they have tobe done last. + * + * 3. Inode allocation buffers must be replayed before inode items that + * read the buffer and replay changes into it. 
For filesystems using the + * ICREATE transactions, this means XFS_LI_ICREATE objects need to get + * treated the same as inode allocation buffers as they create and + * initialise the buffers directly. + * + * 4. Inode unlink buffers must be replayed after inode items are replayed. + * This ensures that inodes are completely flushed to the inode buffer + * in a "free" state before we remove the unlinked inode list pointer. + * + * Hence the ordering needs to be inode allocation buffers first, inode items + * second, inode unlink buffers third and cancelled buffers last. + * + * But there's a problem with that - we can't tell an inode allocation buffer + * apart from a regular buffer, so we can't separate them. We can, however, + * tell an inode unlink buffer from the others, and so we can separate them out + * from all the other buffers and move them to last. + * + * Hence, 4 lists, in order from head to tail: + * - buffer_list for all buffers except cancelled/inode unlink buffers + * - item_list for all non-buffer items + * - inode_buffer_list for inode unlink buffers + * - cancel_list for the cancelled buffers + * + * Note that we add objects to the tail of the lists so that first-to-last + * ordering is preserved within the lists. Adding objects to the head of the + * list means when we traverse from the head we walk them in last-to-first + * order. For cancelled buffers and inode unlink buffers this doesn't matter, + * but for all other items there may be specific ordering that we need to + * preserve. + */ STATIC int xlog_recover_reorder_trans( - xlog_recover_t *trans) + struct xlog *log, + struct xlog_recover *trans, + int pass) { - xlog_recover_item_t *first_item, *itemq, *itemq_next; - xfs_buf_log_format_t *buf_f; - ushort flags = 0; + xlog_recover_item_t *item, *n; + int error = 0; + LIST_HEAD(sort_list); + LIST_HEAD(cancel_list); + LIST_HEAD(buffer_list); + LIST_HEAD(inode_buffer_list); + LIST_HEAD(inode_list); - first_item = itemq = trans->r_itemq; - trans->r_itemq = NULL; - do { - itemq_next = itemq->ri_next; - buf_f = (xfs_buf_log_format_t *)itemq->ri_buf[0].i_addr; + list_splice_init(&trans->r_itemq, &sort_list); + list_for_each_entry_safe(item, n, &sort_list, ri_list) { + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; - switch (ITEM_TYPE(itemq)) { + switch (ITEM_TYPE(item)) { + case XFS_LI_ICREATE: + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_BUF: - flags = buf_f->blf_flags; - if (!(flags & XFS_BLI_CANCEL)) { - xlog_recover_insert_item_frontq(&trans->r_itemq, - itemq); + if (buf_f->blf_flags & XFS_BLF_CANCEL) { + trace_xfs_log_recover_item_reorder_head(log, + trans, item, pass); + list_move(&item->ri_list, &cancel_list); + break; + } + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + list_move(&item->ri_list, &inode_buffer_list); break; } + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_INODE: case XFS_LI_DQUOT: case XFS_LI_QUOTAOFF: case XFS_LI_EFD: case XFS_LI_EFI: - xlog_recover_insert_item_backq(&trans->r_itemq, itemq); + trace_xfs_log_recover_item_reorder_tail(log, + trans, item, pass); + list_move_tail(&item->ri_list, &inode_list); break; default: - xlog_warn( - "XFS: xlog_recover_reorder_trans: unrecognized type of log operation"); + xfs_warn(log->l_mp, + "%s: unrecognized type of log operation", + __func__); ASSERT(0); - return XFS_ERROR(EIO); + /* + * return the remaining items back to the transaction + * item list so they can be freed in caller. 
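
The net effect of this sort is a stable four-way partition. A toy sketch of the same idea without the kernel list machinery (item names and classes are invented):

#include <stdio.h>

/* Stable four-way partition in the spirit of the comment above:
 * buffers, then non-buffer items, then inode-unlink buffers, then
 * cancelled buffers, keeping log order within each class. */
enum klass { BUFFER_LIST, ITEM_LIST, INODE_BUFFER_LIST, CANCEL_LIST, NCLASS };

int main(void)
{
	const char *name[] = { "buf A", "inode 1", "unlink-buf B",
			       "cancel C", "buf D", "inode 2" };
	enum klass k[] = { BUFFER_LIST, ITEM_LIST, INODE_BUFFER_LIST,
			   CANCEL_LIST, BUFFER_LIST, ITEM_LIST };
	int n = 6;

	for (int c = 0; c < NCLASS; c++)	/* emit class by class */
		for (int i = 0; i < n; i++)
			if (k[i] == (enum klass)c)
				printf("%s\n", name[i]);
	return 0;
}
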
+ */ + if (!list_empty(&sort_list)) + list_splice_init(&sort_list, &trans->r_itemq); + error = XFS_ERROR(EIO); + goto out; } - itemq = itemq_next; - } while (first_item != itemq); - return 0; + } +out: + ASSERT(list_empty(&sort_list)); + if (!list_empty(&buffer_list)) + list_splice(&buffer_list, &trans->r_itemq); + if (!list_empty(&inode_list)) + list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&inode_buffer_list)) + list_splice_tail(&inode_buffer_list, &trans->r_itemq); + if (!list_empty(&cancel_list)) + list_splice_tail(&cancel_list, &trans->r_itemq); + return error; } /* @@ -1626,234 +1731,160 @@ xlog_recover_reorder_trans( * record in the table to tell us how many times we expect to see this * record during the second pass. */ -STATIC void -xlog_recover_do_buffer_pass1( - xlog_t *log, - xfs_buf_log_format_t *buf_f) +STATIC int +xlog_recover_buffer_pass1( + struct xlog *log, + struct xlog_recover_item *item) { - xfs_buf_cancel_t *bcp; - xfs_buf_cancel_t *nextp; - xfs_buf_cancel_t *prevp; - xfs_buf_cancel_t **bucket; - xfs_daddr_t blkno = 0; - uint len = 0; - ushort flags = 0; - - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - len = buf_f->blf_len; - flags = buf_f->blf_flags; - break; - } + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; /* * If this isn't a cancel buffer item, then just return. */ - if (!(flags & XFS_BLI_CANCEL)) - return; - - /* - * Insert an xfs_buf_cancel record into the hash table of - * them. If there is already an identical record, bump - * its reference count. - */ - bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % - XLOG_BC_TABLE_SIZE]; - /* - * If the hash bucket is empty then just insert a new record into - * the bucket. - */ - if (*bucket == NULL) { - bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), - KM_SLEEP); - bcp->bc_blkno = blkno; - bcp->bc_len = len; - bcp->bc_refcount = 1; - bcp->bc_next = NULL; - *bucket = bcp; - return; + if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + trace_xfs_log_recover_buf_not_cancel(log, buf_f); + return 0; } /* - * The hash bucket is not empty, so search for duplicates of our - * record. If we find one them just bump its refcount. If not - * then add us at the end of the list. + * Insert an xfs_buf_cancel record into the hash table of them. + * If there is already an identical record, bump its reference count. */ - prevp = NULL; - nextp = *bucket; - while (nextp != NULL) { - if (nextp->bc_blkno == blkno && nextp->bc_len == len) { - nextp->bc_refcount++; - return; + bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == buf_f->blf_blkno && + bcp->bc_len == buf_f->blf_len) { + bcp->bc_refcount++; + trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f); + return 0; } - prevp = nextp; - nextp = nextp->bc_next; } - ASSERT(prevp != NULL); - bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t), - KM_SLEEP); - bcp->bc_blkno = blkno; - bcp->bc_len = len; + + bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP); + bcp->bc_blkno = buf_f->blf_blkno; + bcp->bc_len = buf_f->blf_len; bcp->bc_refcount = 1; - bcp->bc_next = NULL; - prevp->bc_next = bcp; + list_add_tail(&bcp->bc_list, bucket); + + trace_xfs_log_recover_buf_cancel_add(log, buf_f); + return 0; } /* * Check to see whether the buffer being recovered has a corresponding - * entry in the buffer cancel record table. 
If it does then return 1 - * so that it will be cancelled, otherwise return 0. If the buffer is - * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement - * the refcount on the entry in the table and remove it from the table - * if this is the last reference. - * - * We remove the cancel record from the table when we encounter its - * last occurrence in the log so that if the same buffer is re-used - * again after its last cancellation we actually replay the changes - * made at that point. + * entry in the buffer cancel record table. If it is, return the cancel + * buffer structure to the caller. */ -STATIC int -xlog_check_buffer_cancelled( - xlog_t *log, +STATIC struct xfs_buf_cancel * +xlog_peek_buffer_cancelled( + struct xlog *log, xfs_daddr_t blkno, uint len, ushort flags) { - xfs_buf_cancel_t *bcp; - xfs_buf_cancel_t *prevp; - xfs_buf_cancel_t **bucket; + struct list_head *bucket; + struct xfs_buf_cancel *bcp; - if (log->l_buf_cancel_table == NULL) { - /* - * There is nothing in the table built in pass one, - * so this buffer must not be cancelled. - */ - ASSERT(!(flags & XFS_BLI_CANCEL)); - return 0; + if (!log->l_buf_cancel_table) { + /* empty table means no cancelled buffers in the log */ + ASSERT(!(flags & XFS_BLF_CANCEL)); + return NULL; } - bucket = &log->l_buf_cancel_table[(__uint64_t)blkno % - XLOG_BC_TABLE_SIZE]; - bcp = *bucket; - if (bcp == NULL) { - /* - * There is no corresponding entry in the table built - * in pass one, so this buffer has not been cancelled. - */ - ASSERT(!(flags & XFS_BLI_CANCEL)); - return 0; + bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); + list_for_each_entry(bcp, bucket, bc_list) { + if (bcp->bc_blkno == blkno && bcp->bc_len == len) + return bcp; } /* - * Search for an entry in the buffer cancel table that - * matches our buffer. + * We didn't find a corresponding entry in the table, so return 0 so + * that the buffer is NOT cancelled. */ - prevp = NULL; - while (bcp != NULL) { - if (bcp->bc_blkno == blkno && bcp->bc_len == len) { - /* - * We've go a match, so return 1 so that the - * recovery of this buffer is cancelled. - * If this buffer is actually a buffer cancel - * log item, then decrement the refcount on the - * one in the table and remove it if this is the - * last reference. - */ - if (flags & XFS_BLI_CANCEL) { - bcp->bc_refcount--; - if (bcp->bc_refcount == 0) { - if (prevp == NULL) { - *bucket = bcp->bc_next; - } else { - prevp->bc_next = bcp->bc_next; - } - kmem_free(bcp); - } - } - return 1; - } - prevp = bcp; - bcp = bcp->bc_next; - } - /* - * We didn't find a corresponding entry in the table, so - * return 0 so that the buffer is NOT cancelled. - */ - ASSERT(!(flags & XFS_BLI_CANCEL)); - return 0; + ASSERT(!(flags & XFS_BLF_CANCEL)); + return NULL; } +/* + * If the buffer is being cancelled then return 1 so that it will be cancelled, + * otherwise return 0. If the buffer is actually a buffer cancel item + * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the + * table and remove it from the table if this is the last reference. + * + * We remove the cancel record from the table when we encounter its last + * occurrence in the log so that if the same buffer is re-used again after its + * last cancellation we actually replay the changes made at that point. 
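
A compact model of the two-pass protocol described above; a flat array stands in for the hash buckets, and the function names are invented:

#include <stdio.h>

/* Pass 1 counts cancel records per (blkno, len); pass 2 consumes one
 * reference per matching cancel item and skips replay of the buffer. */
struct cancel { long blkno; unsigned len; int refcount; };

static struct cancel table[16];
static int ntable;

static void pass1_add(long blkno, unsigned len)
{
	for (int i = 0; i < ntable; i++)
		if (table[i].blkno == blkno && table[i].len == len) {
			table[i].refcount++;
			return;
		}
	table[ntable++] = (struct cancel){ blkno, len, 1 };
}

static int pass2_cancelled(long blkno, unsigned len, int is_cancel_item)
{
	for (int i = 0; i < ntable; i++)
		if (table[i].blkno == blkno && table[i].len == len) {
			if (is_cancel_item && --table[i].refcount == 0)
				table[i] = table[--ntable]; /* drop entry
						(order not kept in this toy) */
			return 1;	/* skip replay of this buffer */
		}
	return 0;		/* replay normally */
}

int main(void)
{
	pass1_add(1024, 16);				/* cancel record seen */
	printf("%d\n", pass2_cancelled(1024, 16, 0));	/* 1: replay skipped */
	printf("%d\n", pass2_cancelled(1024, 16, 1));	/* 1: last ref, dropped */
	printf("%d\n", pass2_cancelled(1024, 16, 0));	/* 0: table now empty */
	return 0;
}
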
+ */ STATIC int -xlog_recover_do_buffer_pass2( - xlog_t *log, - xfs_buf_log_format_t *buf_f) +xlog_check_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len, + ushort flags) { - xfs_daddr_t blkno = 0; - ushort flags = 0; - uint len = 0; + struct xfs_buf_cancel *bcp; - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - flags = buf_f->blf_flags; - len = buf_f->blf_len; - break; - } + bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags); + if (!bcp) + return 0; - return xlog_check_buffer_cancelled(log, blkno, len, flags); + /* + * We've go a match, so return 1 so that the recovery of this buffer + * is cancelled. If this buffer is actually a buffer cancel log + * item, then decrement the refcount on the one in the table and + * remove it if this is the last reference. + */ + if (flags & XFS_BLF_CANCEL) { + if (--bcp->bc_refcount == 0) { + list_del(&bcp->bc_list); + kmem_free(bcp); + } + } + return 1; } /* - * Perform recovery for a buffer full of inodes. In these buffers, - * the only data which should be recovered is that which corresponds - * to the di_next_unlinked pointers in the on disk inode structures. - * The rest of the data for the inodes is always logged through the - * inodes themselves rather than the inode buffer and is recovered - * in xlog_recover_do_inode_trans(). + * Perform recovery for a buffer full of inodes. In these buffers, the only + * data which should be recovered is that which corresponds to the + * di_next_unlinked pointers in the on disk inode structures. The rest of the + * data for the inodes is always logged through the inodes themselves rather + * than the inode buffer and is recovered in xlog_recover_inode_pass2(). * - * The only time when buffers full of inodes are fully recovered is - * when the buffer is full of newly allocated inodes. In this case - * the buffer will not be marked as an inode buffer and so will be - * sent to xlog_recover_do_reg_buffer() below during recovery. + * The only time when buffers full of inodes are fully recovered is when the + * buffer is full of newly allocated inodes. In this case the buffer will + * not be marked as an inode buffer and so will be sent to + * xlog_recover_do_reg_buffer() below during recovery. */ STATIC int xlog_recover_do_inode_buffer( - xfs_mount_t *mp, + struct xfs_mount *mp, xlog_recover_item_t *item, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { int i; - int item_index; - int bit; - int nbits; - int reg_buf_offset; - int reg_buf_bytes; + int item_index = 0; + int bit = 0; + int nbits = 0; + int reg_buf_offset = 0; + int reg_buf_bytes = 0; int next_unlinked_offset; int inodes_per_buf; xfs_agino_t *logged_nextp; xfs_agino_t *buffer_nextp; - unsigned int *data_map = NULL; - unsigned int map_size = 0; - switch (buf_f->blf_type) { - case XFS_LI_BUF: - data_map = buf_f->blf_data_map; - map_size = buf_f->blf_map_size; - break; - } + trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); + /* - * Set the variables corresponding to the current region to - * 0 so that we'll initialize them on the first pass through - * the loop. + * Post recovery validation only works properly on CRC enabled + * filesystems. 
*/ - reg_buf_offset = 0; - reg_buf_bytes = 0; - bit = 0; - nbits = 0; - item_index = 0; - inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; + if (xfs_sb_version_hascrc(&mp->m_sb)) + bp->b_ops = &xfs_inode_buf_ops; + + inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + offsetof(xfs_dinode_t, di_next_unlinked); @@ -1867,21 +1898,21 @@ xlog_recover_do_inode_buffer( * the current di_next_unlinked field. */ bit += nbits; - bit = xfs_next_bit(data_map, map_size, bit); + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); /* * If there are no more logged regions in the * buffer, then we're done. */ - if (bit == -1) { + if (bit == -1) return 0; - } - nbits = xfs_contig_bits(data_map, map_size, - bit); + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); ASSERT(nbits > 0); - reg_buf_offset = bit << XFS_BLI_SHIFT; - reg_buf_bytes = nbits << XFS_BLI_SHIFT; + reg_buf_offset = bit << XFS_BLF_SHIFT; + reg_buf_bytes = nbits << XFS_BLF_SHIFT; item_index++; } @@ -1890,25 +1921,25 @@ xlog_recover_do_inode_buffer( * di_next_unlinked field, then move on to the next * di_next_unlinked field. */ - if (next_unlinked_offset < reg_buf_offset) { + if (next_unlinked_offset < reg_buf_offset) continue; - } ASSERT(item->ri_buf[item_index].i_addr != NULL); - ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0); - ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); + ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); + ASSERT((reg_buf_offset + reg_buf_bytes) <= + BBTOB(bp->b_io_length)); /* * The current logged region contains a copy of the * current di_next_unlinked field. Extract its value * and copy it to the buffer copy. */ - logged_nextp = (xfs_agino_t *) - ((char *)(item->ri_buf[item_index].i_addr) + - (next_unlinked_offset - reg_buf_offset)); + logged_nextp = item->ri_buf[item_index].i_addr + + next_unlinked_offset - reg_buf_offset; if (unlikely(*logged_nextp == 0)) { - xfs_fs_cmn_err(CE_ALERT, mp, - "bad inode buffer log record (ptr = 0x%p, bp = 0x%p). XFS trying to replay bad (0) inode di_next_unlinked field", + xfs_alert(mp, + "Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). " + "Trying to replay bad (0) inode di_next_unlinked field.", item, bp); XFS_ERROR_REPORT("xlog_recover_do_inode_buf", XFS_ERRLEVEL_LOW, mp); @@ -1918,49 +1949,405 @@ xlog_recover_do_inode_buffer( buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; + + /* + * If necessary, recalculate the CRC in the on-disk inode. We + * have to leave the inode in a consistent state for whoever + * reads it next.... + */ + xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + } return 0; } /* + * V5 filesystems know the age of the buffer on disk being recovered. We can + * have newer objects on disk than we are replaying, and so for these cases we + * don't want to replay the current change as that will make the buffer contents + * temporarily invalid on disk. + * + * The magic number might not match the buffer type we are going to recover + * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence + * extract the LSN of the existing object in the buffer based on it's current + * magic number. 
If we don't recognise the magic number in the buffer, then + * return a LSN of -1 so that the caller knows it was an unrecognised block and + * so can recover the buffer. + * + * Note: we cannot rely solely on magic number matches to determine that the + * buffer has a valid LSN - we also need to verify that it belongs to this + * filesystem, so we need to extract the object's LSN and compare it to that + * which we read from the superblock. If the UUIDs don't match, then we've got a + * stale metadata block from an old filesystem instance that we need to recover + * over the top of. + */ +static xfs_lsn_t +xlog_recover_get_buf_lsn( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + void *blk = bp->b_addr; + uuid_t *uuid; + xfs_lsn_t lsn = -1; + + /* v4 filesystems always recover immediately */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + goto recover_immediately; + + magic32 = be32_to_cpu(*(__be32 *)blk); + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); + uuid = &btb->bb_u.s.bb_uuid; + break; + } + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); + uuid = &btb->bb_u.l.bb_uuid; + break; + } + case XFS_AGF_MAGIC: + lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); + uuid = &((struct xfs_agf *)blk)->agf_uuid; + break; + case XFS_AGFL_MAGIC: + lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); + uuid = &((struct xfs_agfl *)blk)->agfl_uuid; + break; + case XFS_AGI_MAGIC: + lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); + uuid = &((struct xfs_agi *)blk)->agi_uuid; + break; + case XFS_SYMLINK_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); + uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; + break; + case XFS_DIR3_BLOCK_MAGIC: + case XFS_DIR3_DATA_MAGIC: + case XFS_DIR3_FREE_MAGIC: + lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); + uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; + break; + case XFS_ATTR3_RMT_MAGIC: + lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); + uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; + break; + case XFS_SB_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); + uuid = &((struct xfs_dsb *)blk)->sb_uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); + switch (magicda) { + case XFS_DIR3_LEAF1_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + case XFS_DA3_NODE_MAGIC: + lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); + uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + /* + * We do individual object checks on dquot and inode buffers as they + * have their own individual LSN records. Also, we could have a stale + * buffer here, so we have to at least recognise these buffer types. + * + * A notd complexity here is inode unlinked list processing - it logs + * the inode directly in the buffer, but we don't know which inodes have + * been modified, and there is no global buffer LSN. 
Hence we need to + * recover all inode buffer types immediately. This problem will be + * fixed by logical logging of the unlinked list modifications. + */ + magic16 = be16_to_cpu(*(__be16 *)blk); + switch (magic16) { + case XFS_DQUOT_MAGIC: + case XFS_DINODE_MAGIC: + goto recover_immediately; + default: + break; + } + + /* unknown buffer contents, recover immediately */ + +recover_immediately: + return (xfs_lsn_t)-1; + +} + +/* + * Validate the recovered buffer is of the correct type and attach the + * appropriate buffer operations to them for writeback. Magic numbers are in a + * few places: + * the first 16 bits of the buffer (inode buffer, dquot buffer), + * the first 32 bits of the buffer (most blocks), + * inside a struct xfs_da_blkinfo at the start of the buffer. + */ +static void +xlog_recover_validate_buf_type( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); + magic16 = be16_to_cpu(*(__be16*)bp->b_addr); + magicda = be16_to_cpu(info->magic); + switch (xfs_blft_from_flags(buf_f)) { + case XFS_BLFT_BTREE_BUF: + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + bp->b_ops = &xfs_allocbt_buf_ops; + break; + case XFS_IBT_CRC_MAGIC: + case XFS_FIBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_inobt_buf_ops; + break; + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + bp->b_ops = &xfs_bmbt_buf_ops; + break; + default: + xfs_warn(mp, "Bad btree block magic!"); + ASSERT(0); + break; + } + break; + case XFS_BLFT_AGF_BUF: + if (magic32 != XFS_AGF_MAGIC) { + xfs_warn(mp, "Bad AGF block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agf_buf_ops; + break; + case XFS_BLFT_AGFL_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_AGFL_MAGIC) { + xfs_warn(mp, "Bad AGFL block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agfl_buf_ops; + break; + case XFS_BLFT_AGI_BUF: + if (magic32 != XFS_AGI_MAGIC) { + xfs_warn(mp, "Bad AGI block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agi_buf_ops; + break; + case XFS_BLFT_UDQUOT_BUF: + case XFS_BLFT_PDQUOT_BUF: + case XFS_BLFT_GDQUOT_BUF: +#ifdef CONFIG_XFS_QUOTA + if (magic16 != XFS_DQUOT_MAGIC) { + xfs_warn(mp, "Bad DQUOT block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dquot_buf_ops; +#else + xfs_alert(mp, + "Trying to recover dquots without QUOTA support built in!"); + ASSERT(0); +#endif + break; + case XFS_BLFT_DINO_BUF: + /* + * we get here with inode allocation buffers, not buffers that + * track unlinked list changes. 
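
Stepping back to xlog_recover_get_buf_lsn() above: the LSN it extracts feeds a simple replay decision. The actual comparison lives in the pass-2 buffer code outside this hunk, so this is a hedged sketch of the implied logic only:

#include <stdio.h>

typedef long long lsn_t;

/* If the LSN stamped in the on-disk object is at or beyond the LSN of
 * the change being replayed, the disk copy already contains it. */
static int should_replay(lsn_t buf_lsn, lsn_t item_lsn)
{
	if (buf_lsn == -1)		/* unrecognised block: recover */
		return 1;
	return buf_lsn < item_lsn;	/* replay only older buffers */
}

int main(void)
{
	printf("%d\n", should_replay(-1, 100));		/* 1 */
	printf("%d\n", should_replay(90, 100));		/* 1 */
	printf("%d\n", should_replay(110, 100));	/* 0: skip replay */
	return 0;
}
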
+ */ + if (magic16 != XFS_DINODE_MAGIC) { + xfs_warn(mp, "Bad INODE block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_inode_buf_ops; + break; + case XFS_BLFT_SYMLINK_BUF: + if (magic32 != XFS_SYMLINK_MAGIC) { + xfs_warn(mp, "Bad symlink block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_symlink_buf_ops; + break; + case XFS_BLFT_DIR_BLOCK_BUF: + if (magic32 != XFS_DIR2_BLOCK_MAGIC && + magic32 != XFS_DIR3_BLOCK_MAGIC) { + xfs_warn(mp, "Bad dir block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_block_buf_ops; + break; + case XFS_BLFT_DIR_DATA_BUF: + if (magic32 != XFS_DIR2_DATA_MAGIC && + magic32 != XFS_DIR3_DATA_MAGIC) { + xfs_warn(mp, "Bad dir data magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_data_buf_ops; + break; + case XFS_BLFT_DIR_FREE_BUF: + if (magic32 != XFS_DIR2_FREE_MAGIC && + magic32 != XFS_DIR3_FREE_MAGIC) { + xfs_warn(mp, "Bad dir3 free magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_free_buf_ops; + break; + case XFS_BLFT_DIR_LEAF1_BUF: + if (magicda != XFS_DIR2_LEAF1_MAGIC && + magicda != XFS_DIR3_LEAF1_MAGIC) { + xfs_warn(mp, "Bad dir leaf1 magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + break; + case XFS_BLFT_DIR_LEAFN_BUF: + if (magicda != XFS_DIR2_LEAFN_MAGIC && + magicda != XFS_DIR3_LEAFN_MAGIC) { + xfs_warn(mp, "Bad dir leafn magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leafn_buf_ops; + break; + case XFS_BLFT_DA_NODE_BUF: + if (magicda != XFS_DA_NODE_MAGIC && + magicda != XFS_DA3_NODE_MAGIC) { + xfs_warn(mp, "Bad da node magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_da3_node_buf_ops; + break; + case XFS_BLFT_ATTR_LEAF_BUF: + if (magicda != XFS_ATTR_LEAF_MAGIC && + magicda != XFS_ATTR3_LEAF_MAGIC) { + xfs_warn(mp, "Bad attr leaf magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_leaf_buf_ops; + break; + case XFS_BLFT_ATTR_RMT_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_ATTR3_RMT_MAGIC) { + xfs_warn(mp, "Bad attr remote magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_rmt_buf_ops; + break; + case XFS_BLFT_SB_BUF: + if (magic32 != XFS_SB_MAGIC) { + xfs_warn(mp, "Bad SB block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_sb_buf_ops; + break; + default: + xfs_warn(mp, "Unknown buffer type %d!", + xfs_blft_from_flags(buf_f)); + break; + } +} + +/* * Perform a 'normal' buffer recovery. Each logged region of the * buffer should be copied over the corresponding region in the * given buffer. The bitmap in the buf log format structure indicates * where to place the logged data. 
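
The copy loop below walks that bitmap in runs of contiguous set bits. A self-contained sketch of the same walk over a toy 8-bit map, assuming the usual 128-byte chunk (XFS_BLF_SHIFT is 7):

#include <stdio.h>
#include <string.h>

#define CHUNK	128	/* bytes per bitmap bit, as XFS_BLF_CHUNK */

int main(void)
{
	unsigned map = 0x65;		/* toy bitmap: bits 0, 2, 5, 6 set */
	char buf[8 * CHUNK] = { 0 };	/* the buffer being recovered */
	char logged[8 * CHUNK];		/* regions as captured in the log */
	memset(logged, 'L', sizeof(logged));

	for (int bit = 0; bit < 8; ) {
		if (!(map & (1u << bit))) {	/* like xfs_next_bit() */
			bit++;
			continue;
		}
		int nbits = 0;			/* like xfs_contig_bits() */
		while (bit + nbits < 8 && (map & (1u << (bit + nbits))))
			nbits++;
		memcpy(buf + bit * CHUNK, logged + bit * CHUNK,
		       nbits * CHUNK);
		printf("copied %d chunk(s) at chunk %d\n", nbits, bit);
		bit += nbits;
	}
	return 0;
}
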
*/ -/*ARGSUSED*/ STATIC void xlog_recover_do_reg_buffer( + struct xfs_mount *mp, xlog_recover_item_t *item, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_buf_log_format_t *buf_f) { int i; int bit; int nbits; - unsigned int *data_map = NULL; - unsigned int map_size = 0; int error; - switch (buf_f->blf_type) { - case XFS_LI_BUF: - data_map = buf_f->blf_data_map; - map_size = buf_f->blf_map_size; - break; - } + trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f); + bit = 0; i = 1; /* 0 is the buf format structure */ while (1) { - bit = xfs_next_bit(data_map, map_size, bit); + bit = xfs_next_bit(buf_f->blf_data_map, + buf_f->blf_map_size, bit); if (bit == -1) break; - nbits = xfs_contig_bits(data_map, map_size, bit); + nbits = xfs_contig_bits(buf_f->blf_data_map, + buf_f->blf_map_size, bit); ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); - ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0); - ASSERT(XFS_BUF_COUNT(bp) >= - ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT)); + ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); + ASSERT(BBTOB(bp->b_io_length) >= + ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); + + /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. + */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; /* * Do a sanity check if this is a dquot buffer. Just checking @@ -1969,20 +2356,19 @@ xlog_recover_do_reg_buffer( */ error = 0; if (buf_f->blf_flags & - (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { if (item->ri_buf[i].i_addr == NULL) { - cmn_err(CE_ALERT, + xfs_alert(mp, "XFS: NULL dquot in %s.", __func__); goto next; } if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) { - cmn_err(CE_ALERT, + xfs_alert(mp, "XFS: dquot too small (%d) in %s.", item->ri_buf[i].i_len, __func__); goto next; } - error = xfs_qm_dqcheck((xfs_disk_dquot_t *) - item->ri_buf[i].i_addr, + error = xfs_dqcheck(mp, item->ri_buf[i].i_addr, -1, 0, XFS_QMOPT_DOWARN, "dquot_buf_recover"); if (error) @@ -1990,9 +2376,9 @@ xlog_recover_do_reg_buffer( } memcpy(xfs_buf_offset(bp, - (uint)bit << XFS_BLI_SHIFT), /* dest */ + (uint)bit << XFS_BLF_SHIFT), /* dest */ item->ri_buf[i].i_addr, /* source */ - nbits<<XFS_BLI_SHIFT); /* length */ + nbits<<XFS_BLF_SHIFT); /* length */ next: i++; bit += nbits; @@ -2000,147 +2386,37 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); -} - -/* - * Do some primitive error checking on ondisk dquot data structures. - */ -int -xfs_qm_dqcheck( - xfs_disk_dquot_t *ddq, - xfs_dqid_t id, - uint type, /* used only when IO_dorepair is true */ - uint flags, - char *str) -{ - xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; - int errs = 0; /* - * We can encounter an uninitialized dquot buffer for 2 reasons: - * 1. If we crash while deleting the quotainode(s), and those blks got - * used for user data. This is because we take the path of regular - * file deletion; however, the size field of quotainodes is never - * updated, so all the tricks that we play in itruncate_finish - * don't quite matter. - * - * 2. We don't play the quota buffers when there's a quotaoff logitem. 
-	 * But the allocation will be replayed so we'll end up with an
-	 * uninitialized quota block.
-	 *
-	 * This is all fine; things are still consistent, and we haven't lost
-	 * any quota information. Just don't complain about bad dquot blks.
+	 * We can only do post-recovery validation on items on CRC-enabled
+	 * filesystems as we need to know when the buffer was written to be
+	 * able to determine if we should have replayed the item. If we replay
+	 * old metadata over a newer buffer, then it will enter a temporarily
+	 * inconsistent state resulting in verification failures. Hence for now
+	 * just avoid the verification stage for non-CRC filesystems.
	 */
-	if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
-		if (flags & XFS_QMOPT_DOWARN)
-			cmn_err(CE_ALERT,
-			"%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
-			str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
-		errs++;
-	}
-	if (ddq->d_version != XFS_DQUOT_VERSION) {
-		if (flags & XFS_QMOPT_DOWARN)
-			cmn_err(CE_ALERT,
-			"%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
-			str, id, ddq->d_version, XFS_DQUOT_VERSION);
-		errs++;
-	}
-
-	if (ddq->d_flags != XFS_DQ_USER &&
-	    ddq->d_flags != XFS_DQ_PROJ &&
-	    ddq->d_flags != XFS_DQ_GROUP) {
-		if (flags & XFS_QMOPT_DOWARN)
-			cmn_err(CE_ALERT,
-			"%s : XFS dquot ID 0x%x, unknown flags 0x%x",
-			str, id, ddq->d_flags);
-		errs++;
-	}
-
-	if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
-		if (flags & XFS_QMOPT_DOWARN)
-			cmn_err(CE_ALERT,
-			"%s : ondisk-dquot 0x%p, ID mismatch: "
-			"0x%x expected, found id 0x%x",
-			str, ddq, id, be32_to_cpu(ddq->d_id));
-		errs++;
-	}
-
-	if (!errs && ddq->d_id) {
-		if (ddq->d_blk_softlimit &&
-		    be64_to_cpu(ddq->d_bcount) >=
-				be64_to_cpu(ddq->d_blk_softlimit)) {
-			if (!ddq->d_btimer) {
-				if (flags & XFS_QMOPT_DOWARN)
-					cmn_err(CE_ALERT,
-					"%s : Dquot ID 0x%x (0x%p) "
-					"BLK TIMER NOT STARTED",
-					str, (int)be32_to_cpu(ddq->d_id), ddq);
-				errs++;
-			}
-		}
-		if (ddq->d_ino_softlimit &&
-		    be64_to_cpu(ddq->d_icount) >=
-				be64_to_cpu(ddq->d_ino_softlimit)) {
-			if (!ddq->d_itimer) {
-				if (flags & XFS_QMOPT_DOWARN)
-					cmn_err(CE_ALERT,
-					"%s : Dquot ID 0x%x (0x%p) "
-					"INODE TIMER NOT STARTED",
-					str, (int)be32_to_cpu(ddq->d_id), ddq);
-				errs++;
-			}
-		}
-		if (ddq->d_rtb_softlimit &&
-		    be64_to_cpu(ddq->d_rtbcount) >=
-				be64_to_cpu(ddq->d_rtb_softlimit)) {
-			if (!ddq->d_rtbtimer) {
-				if (flags & XFS_QMOPT_DOWARN)
-					cmn_err(CE_ALERT,
-					"%s : Dquot ID 0x%x (0x%p) "
-					"RTBLK TIMER NOT STARTED",
-					str, (int)be32_to_cpu(ddq->d_id), ddq);
-				errs++;
-			}
-		}
-	}
-
-	if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
-		return errs;
-
-	if (flags & XFS_QMOPT_DOWARN)
-		cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);
-
-	/*
-	 * Typically, a repair is only requested by quotacheck.
-	 */
-	ASSERT(id != -1);
-	ASSERT(flags & XFS_QMOPT_DQREPAIR);
-	memset(d, 0, sizeof(xfs_dqblk_t));
-
-	d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
-	d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
-	d->dd_diskdq.d_flags = type;
-	d->dd_diskdq.d_id = cpu_to_be32(id);
-
-	return errs;
+	if (xfs_sb_version_hascrc(&mp->m_sb))
+		xlog_recover_validate_buf_type(mp, bp, buf_f);
 }

 /*
  * Perform a dquot buffer recovery.
- * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
+ * Simple algorithm: if we have found a QUOTAOFF log item of the same type
  * (ie. USR or GRP), then just toss this buffer away; don't recover it.
  * Else, treat it as a regular buffer and do recovery.
*/ STATIC void xlog_recover_do_dquot_buffer( - xfs_mount_t *mp, - xlog_t *log, - xlog_recover_item_t *item, - xfs_buf_t *bp, - xfs_buf_log_format_t *buf_f) + struct xfs_mount *mp, + struct xlog *log, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) { uint type; + trace_xfs_log_recover_buf_dquot_buf(log, buf_f); + /* * Filesystems are required to send in quota flags at mount time. */ @@ -2149,11 +2425,11 @@ xlog_recover_do_dquot_buffer( } type = 0; - if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF) type |= XFS_DQ_USER; - if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF) type |= XFS_DQ_PROJ; - if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF) + if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF) type |= XFS_DQ_GROUP; /* * This type of quotas was turned off, so ignore this buffer @@ -2161,7 +2437,7 @@ xlog_recover_do_dquot_buffer( if (log->l_quotaoffs_flag & type) return; - xlog_recover_do_reg_buffer(item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } /* @@ -2174,7 +2450,7 @@ xlog_recover_do_dquot_buffer( * here which overlaps that may be stale. * * When meta-data buffers are freed at run time we log a buffer item - * with the XFS_BLI_CANCEL bit set to indicate that previous copies + * with the XFS_BLF_CANCEL bit set to indicate that previous copies * of the buffer in the log should not be replayed at recovery time. * This is so that if the blocks covered by the buffer are reused for * file data before we crash we don't end up replaying old, freed @@ -2184,88 +2460,67 @@ xlog_recover_do_dquot_buffer( * over the log during recovery. During the first we build a table of * those buffers which have been cancelled, and during the second we * only replay those buffers which do not have corresponding cancel - * records in the table. See xlog_recover_do_buffer_pass[1,2] above + * records in the table. See xlog_recover_buffer_pass[1,2] above * for more details on the implementation of the table of cancel records. */ STATIC int -xlog_recover_do_buffer_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +xlog_recover_buffer_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { - xfs_buf_log_format_t *buf_f; - xfs_mount_t *mp; + xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; int error; - int cancel; - xfs_daddr_t blkno; - int len; - ushort flags; uint buf_flags; + xfs_lsn_t lsn; - buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr; - - if (pass == XLOG_RECOVER_PASS1) { - /* - * In this pass we're only looking for buf items - * with the XFS_BLI_CANCEL bit set. - */ - xlog_recover_do_buffer_pass1(log, buf_f); + /* + * In this pass we only want to recover all the buffers which have + * not been cancelled and are not cancellation buffers themselves. + */ + if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + trace_xfs_log_recover_buf_cancel(log, buf_f); return 0; - } else { - /* - * In this pass we want to recover all the buffers - * which have not been cancelled and are not - * cancellation buffers themselves. The routine - * we call here will tell us whether or not to - * continue with the replay of this buffer. 
- */ - cancel = xlog_recover_do_buffer_pass2(log, buf_f); - if (cancel) { - return 0; - } - } - switch (buf_f->blf_type) { - case XFS_LI_BUF: - blkno = buf_f->blf_blkno; - len = buf_f->blf_len; - flags = buf_f->blf_flags; - break; - default: - xfs_fs_cmn_err(CE_ALERT, log->l_mp, - "xfs_log_recover: unknown buffer type 0x%x, logdev %s", - buf_f->blf_type, log->l_mp->m_logname ? - log->l_mp->m_logname : "internal"); - XFS_ERROR_REPORT("xlog_recover_do_buffer_trans", - XFS_ERRLEVEL_LOW, log->l_mp); - return XFS_ERROR(EFSCORRUPTED); } - mp = log->l_mp; - buf_flags = XFS_BUF_LOCK; - if (!(flags & XFS_BLI_INODE_BUF)) - buf_flags |= XFS_BUF_MAPPED; - - bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags); - if (XFS_BUF_ISERROR(bp)) { - xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp, - bp, blkno); - error = XFS_BUF_GETERROR(bp); - xfs_buf_relse(bp); - return error; + trace_xfs_log_recover_buf_recover(log, buf_f); + + buf_flags = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + buf_flags |= XBF_UNMAPPED; + + bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, + buf_flags, NULL); + if (!bp) + return XFS_ERROR(ENOMEM); + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); + goto out_release; } - error = 0; - if (flags & XFS_BLI_INODE_BUF) { + /* + * recover the buffer only if we get an LSN from it and it's less than + * the lsn of the transaction we are replaying. + */ + lsn = xlog_recover_get_buf_lsn(mp, bp); + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) + goto out_release; + + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); - } else if (flags & - (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) { + } else if (buf_f->blf_flags & + (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) { xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f); } else { - xlog_recover_do_reg_buffer(item, bp, buf_f); + xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } if (error) - return XFS_ERROR(error); + goto out_release; /* * Perform delayed write on the buffer. Asynchronous writes will be @@ -2273,42 +2528,119 @@ xlog_recover_do_buffer_trans( * * Also make sure that only inode buffers with good sizes stay in * the buffer cache. The kernel moves inodes in buffers of 1 block - * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode + * or mp->m_inode_cluster_size bytes, whichever is bigger. The inode * buffers in the log can be a different size if the log was generated * by an older kernel using unclustered inode buffers or a newer kernel * running with a different inode cluster size. Regardless, if the - * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) - * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep + * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size) + * for *our* value of mp->m_inode_cluster_size, then we need to keep * the buffer out of the buffer cache so that the buffer won't * overlap with future reads of those inodes. 
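	 * For example, on a filesystem with 4k blocks and the common 8k inode
	 * cluster size, only 8k inode buffers are queued for delayed write
	 * here; any other inode buffer size is staled and written out
	 * synchronously instead.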
*/ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && - (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, - (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { - XFS_BUF_STALE(bp); - error = xfs_bwrite(mp, bp); + (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, + (__uint32_t)log->l_mp->m_inode_cluster_size))) { + xfs_buf_stale(bp); + error = xfs_bwrite(bp); } else { - ASSERT(bp->b_mount == NULL || bp->b_mount == mp); - bp->b_mount = mp; - XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); - xfs_bdwrite(mp, bp); + ASSERT(bp->b_target->bt_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); } - return (error); +out_release: + xfs_buf_relse(bp); + return error; } +/* + * Inode fork owner changes + * + * If we have been told that we have to reparent the inode fork, it's because an + * extent swap operation on a CRC enabled filesystem has been done and we are + * replaying it. We need to walk the BMBT of the appropriate fork and change the + * owners of it. + * + * The complexity here is that we don't have an inode context to work with, so + * after we've replayed the inode we need to instantiate one. This is where the + * fun begins. + * + * We are in the middle of log recovery, so we can't run transactions. That + * means we cannot use cache coherent inode instantiation via xfs_iget(), as + * that will result in the corresponding iput() running the inode through + * xfs_inactive(). If we've just replayed an inode core that changes the link + * count to zero (i.e. it's been unlinked), then xfs_inactive() will run + * transactions (bad!). + * + * So, to avoid this, we instantiate an inode directly from the inode core we've + * just recovered. We have the buffer still locked, and all we really need to + * instantiate is the inode core and the forks being modified. We can do this + * manually, then run the inode btree owner change, and then tear down the + * xfs_inode without having to run any transactions at all. + * + * Also, because we don't have a transaction context available here but need to + * gather all the buffers we modify for writeback so we pass the buffer_list + * instead for the operation to use. 
+ */ + STATIC int -xlog_recover_do_inode_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +xfs_recover_inode_owner_change( + struct xfs_mount *mp, + struct xfs_dinode *dip, + struct xfs_inode_log_format *in_f, + struct list_head *buffer_list) +{ + struct xfs_inode *ip; + int error; + + ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); + + ip = xfs_inode_alloc(mp, in_f->ilf_ino); + if (!ip) + return ENOMEM; + + /* instantiate the inode */ + xfs_dinode_from_disk(&ip->i_d, dip); + ASSERT(ip->i_d.di_version >= 3); + + error = xfs_iformat_fork(ip, dip); + if (error) + goto out_free_ip; + + + if (in_f->ilf_fields & XFS_ILOG_DOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + + if (in_f->ilf_fields & XFS_ILOG_AOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + +out_free_ip: + xfs_inode_free(ip); + return error; +} + +STATIC int +xlog_recover_inode_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { xfs_inode_log_format_t *in_f; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; xfs_dinode_t *dip; - xfs_ino_t ino; int len; xfs_caddr_t src; xfs_caddr_t dest; @@ -2316,24 +2648,18 @@ xlog_recover_do_inode_trans( int attr_index; uint fields; xfs_icdinode_t *dicp; + uint isize; int need_free = 0; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { - in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr; + in_f = item->ri_buf[0].i_addr; } else { - in_f = (xfs_inode_log_format_t *)kmem_alloc( - sizeof(xfs_inode_log_format_t), KM_SLEEP); + in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP); need_free = 1; error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); if (error) goto error; } - ino = in_f->ilf_ino; - mp = log->l_mp; /* * Inode buffers can be freed, look out for it, @@ -2342,19 +2668,22 @@ xlog_recover_do_inode_trans( if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len, 0)) { error = 0; + trace_xfs_log_recover_inode_cancel(log, in_f); goto error; } + trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, - XFS_BUF_LOCK); - if (XFS_BUF_ISERROR(bp)) { - xfs_ioerror_alert("xlog_recover_do..(read#2)", mp, - bp, in_f->ilf_blkno); - error = XFS_BUF_GETERROR(bp); - xfs_buf_relse(bp); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, + &xfs_inode_buf_ops); + if (!bp) { + error = ENOMEM; goto error; } - error = 0; + error = bp->b_error; + if (error) { + xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); + goto out_release; + } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); @@ -2362,30 +2691,53 @@ xlog_recover_do_inode_trans( * Make sure the place we're flushing out to really looks * like an inode! 
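	 * Both copies get checked: the magic in the on-disk buffer just read
	 * and the magic in the logged inode core, since either being wrong
	 * means replay would copy an inode core over something that isn't one.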
*/ - if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) { - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld", - dip, bp, ino); - XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)", + if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { + xfs_alert(mp, + "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", + __func__, dip, bp, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; - goto error; + goto out_release; } - dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr); + dicp = item->ri_buf[1].i_addr; if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld", - item, ino); - XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)", + xfs_alert(mp, + "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", + __func__, item, in_f->ilf_ino); + XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; - goto error; + goto out_release; + } + + /* + * If the inode has an LSN in it, recover the inode only if it's less + * than the lsn of the transaction we are replaying. Note: we still + * need to replay an owner change even though the inode is more recent + * than the transaction as there is no guarantee that all the btree + * blocks are more recent than this transaction, too. + */ + if (dip->di_version >= 3) { + xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_owner_change; + } } - /* Skip replay when the on disk inode is newer than the log one */ - if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes + * are transactional and if ordering is necessary we can determine that + * more accurately by the LSN field in the V3 inode core. 
Don't trust
+	 * the inode versions, as we might be changing them here - use the
+	 * superblock flag to determine whether we need to look at di_flushiter
+	 * to skip replay when the on-disk inode is newer than the log one.
+	 */
+	if (!xfs_sb_version_hascrc(&mp->m_sb) &&
+	    dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
 		/*
 		 * Deal with the wrap case, DI_MAX_FLUSH is less
 		 * than smaller numbers
@@ -2394,81 +2746,82 @@ xlog_recover_do_inode_trans(
 		 */
 		    dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
 			/* do nothing */
 		} else {
-			xfs_buf_relse(bp);
+			trace_xfs_log_recover_inode_skip(log, in_f);
 			error = 0;
-			goto error;
+			goto out_release;
 		}
 	}
+
 	/* Take the opportunity to reset the flush iteration count */
 	dicp->di_flushiter = 0;

-	if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
+	if (unlikely(S_ISREG(dicp->di_mode))) {
 		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
-			XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
+			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
 					 XFS_ERRLEVEL_LOW, mp, dicp);
-			xfs_buf_relse(bp);
-			xfs_fs_cmn_err(CE_ALERT, mp,
-	"xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
-				item, dip, bp, ino);
+			xfs_alert(mp,
+		"%s: Bad regular inode log record, rec ptr 0x%p, "
+		"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
+				__func__, item, dip, bp, in_f->ilf_ino);
 			error = EFSCORRUPTED;
-			goto error;
+			goto out_release;
 		}
-	} else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
+	} else if (unlikely(S_ISDIR(dicp->di_mode))) {
 		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
 		    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
 		    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
-			XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
+			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
 					     XFS_ERRLEVEL_LOW, mp, dicp);
-			xfs_buf_relse(bp);
-			xfs_fs_cmn_err(CE_ALERT, mp,
-	"xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
-				item, dip, bp, ino);
+			xfs_alert(mp,
+		"%s: Bad dir inode log record, rec ptr 0x%p, "
+		"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
+				__func__, item, dip, bp, in_f->ilf_ino);
 			error = EFSCORRUPTED;
-			goto error;
+			goto out_release;
 		}
 	}
 	if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
-		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
+		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
 				     XFS_ERRLEVEL_LOW, mp, dicp);
-		xfs_buf_relse(bp);
-		xfs_fs_cmn_err(CE_ALERT, mp,
-	"xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
-			item, dip, bp, ino,
+		xfs_alert(mp,
+	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
+	"dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
+			__func__, item, dip, bp, in_f->ilf_ino,
 			dicp->di_nextents + dicp->di_anextents,
 			dicp->di_nblocks);
 		error = EFSCORRUPTED;
-		goto error;
+		goto out_release;
 	}
 	if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
-		XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
+		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
 				     XFS_ERRLEVEL_LOW, mp, dicp);
-		xfs_buf_relse(bp);
-		xfs_fs_cmn_err(CE_ALERT, mp,
-	"xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
-			item, dip, bp, ino, dicp->di_forkoff);
+		xfs_alert(mp,
+	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
+	"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
+			item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
 		error = EFSCORRUPTED;
-		goto
error; + goto out_release; } - if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { - XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)", + isize = xfs_icdinode_size(dicp->di_version); + if (unlikely(item->ri_buf[1].i_len > isize)) { + XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); - xfs_fs_cmn_err(CE_ALERT, mp, - "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p", - item->ri_buf[1].i_len, item); + xfs_alert(mp, + "%s: Bad inode log record length %d, rec ptr 0x%p", + __func__, item->ri_buf[1].i_len, item); error = EFSCORRUPTED; - goto error; + goto out_release; } /* The core is in in-core format */ - xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr); + xfs_dinode_to_disk(dip, dicp); /* the rest is in on-disk format */ - if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { - memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); + if (item->ri_buf[1].i_len > isize) { + memcpy((char *)dip + isize, + item->ri_buf[1].i_addr + isize, + item->ri_buf[1].i_len - isize); } fields = in_f->ilf_fields; @@ -2484,7 +2837,7 @@ xlog_recover_do_inode_trans( } if (in_f->ilf_size == 2) - goto write_inode_buffer; + goto out_owner_change; len = item->ri_buf[2].i_len; src = item->ri_buf[2].i_addr; ASSERT(in_f->ilf_size <= 4); @@ -2543,19 +2896,26 @@ xlog_recover_do_inode_trans( break; default: - xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag"); + xfs_warn(log->l_mp, "%s: Invalid flag", __func__); ASSERT(0); - xfs_buf_relse(bp); error = EIO; - goto error; + goto out_release; } } -write_inode_buffer: - ASSERT(bp->b_mount == NULL || bp->b_mount == mp); - bp->b_mount = mp; - XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); - xfs_bdwrite(mp, bp); +out_owner_change: + if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) + error = xfs_recover_inode_owner_change(mp, dip, in_f, + buffer_list); + /* re-generate the checksum. */ + xfs_dinode_calc_crc(log->l_mp, dip); + + ASSERT(bp->b_target->bt_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); + +out_release: + xfs_buf_relse(bp); error: if (need_free) kmem_free(in_f); @@ -2563,23 +2923,16 @@ error: } /* - * Recover QUOTAOFF records. We simply make a note of it in the xlog_t + * Recover QUOTAOFF records. We simply make a note of it in the xlog * structure, so that we know not to do any dquot item or dquot buffer recovery, * of that type. 
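 * For example, once a user quotaoff is noted here in pass 1, pass 2 skips
 * XFS_DQ_USER dquot items and user dquot buffers by checking the type
 * against log->l_quotaoffs_flag.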
*/ STATIC int -xlog_recover_do_quotaoff_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +xlog_recover_quotaoff_pass1( + struct xlog *log, + struct xlog_recover_item *item) { - xfs_qoff_logformat_t *qoff_f; - - if (pass == XLOG_RECOVER_PASS2) { - return (0); - } - - qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr; + xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; ASSERT(qoff_f); /* @@ -2600,22 +2953,19 @@ xlog_recover_do_quotaoff_trans( * Recover a dquot record */ STATIC int -xlog_recover_do_dquot_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +xlog_recover_dquot_pass2( + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; struct xfs_disk_dquot *ddq, *recddq; int error; xfs_dq_logformat_t *dq_f; uint type; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - mp = log->l_mp; /* * Filesystems are required to send in quota flags at mount time. @@ -2623,16 +2973,13 @@ xlog_recover_do_dquot_trans( if (mp->m_qflags == 0) return (0); - recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr; - - if (item->ri_buf[1].i_addr == NULL) { - cmn_err(CE_ALERT, - "XFS: NULL dquot in %s.", __func__); + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) { + xfs_alert(log->l_mp, "NULL dquot in %s.", __func__); return XFS_ERROR(EIO); } if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) { - cmn_err(CE_ALERT, - "XFS: dquot too small (%d) in %s.", + xfs_alert(log->l_mp, "dquot too small (%d) in %s.", item->ri_buf[1].i_len, __func__); return XFS_ERROR(EIO); } @@ -2655,25 +3002,20 @@ xlog_recover_do_dquot_trans( * The other possibility, of course, is that the quota subsystem was * removed since the last mount - ENOSYS. */ - dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr; + dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); - if ((error = xfs_qm_dqcheck(recddq, - dq_f->qlf_id, - 0, XFS_QMOPT_DOWARN, - "xlog_recover_do_dquot_trans (log copy)"))) { + error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + "xlog_recover_dquot_pass2 (log copy)"); + if (error) return XFS_ERROR(EIO); - } ASSERT(dq_f->qlf_len == 1); - error = xfs_read_buf(mp, mp->m_ddev_targp, - dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), - 0, &bp); - if (error) { - xfs_ioerror_alert("xlog_recover_do..(read#3)", mp, - bp, dq_f->qlf_blkno); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + NULL); + if (error) return error; - } + ASSERT(bp); ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); @@ -2682,21 +3024,40 @@ xlog_recover_do_dquot_trans( * was among a chunk of dquots created earlier, and we did some * minimal initialization then. */ - if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, - "xlog_recover_do_dquot_trans")) { + error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + "xlog_recover_dquot_pass2"); + if (error) { xfs_buf_relse(bp); return XFS_ERROR(EIO); } + /* + * If the dquot has an LSN in it, recover the dquot only if it's less + * than the lsn of the transaction we are replaying. 
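+	 * The test mirrors the buffer and inode cases above:
+	 *
+	 *	if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0)
+	 *		skip the copy
+	 *
+	 * so only strictly older on-disk contents are overwritten.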
+ */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; + xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + goto out_release; + } + } + memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } ASSERT(dq_f->qlf_size == 2); - ASSERT(bp->b_mount == NULL || bp->b_mount == mp); - bp->b_mount = mp; - XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone); - xfs_bdwrite(mp, bp); + ASSERT(bp->b_target->bt_mount == mp); + bp->b_iodone = xlog_recover_iodone; + xfs_buf_delwri_queue(bp, buffer_list); - return (0); +out_release: + xfs_buf_relse(bp); + return 0; } /* @@ -2707,38 +3068,31 @@ xlog_recover_do_dquot_trans( * LSN. */ STATIC int -xlog_recover_do_efi_trans( - xlog_t *log, - xlog_recover_item_t *item, - xfs_lsn_t lsn, - int pass) +xlog_recover_efi_pass2( + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) { int error; - xfs_mount_t *mp; + xfs_mount_t *mp = log->l_mp; xfs_efi_log_item_t *efip; xfs_efi_log_format_t *efi_formatp; - if (pass == XLOG_RECOVER_PASS1) { - return 0; - } - - efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr; + efi_formatp = item->ri_buf[0].i_addr; - mp = log->l_mp; efip = xfs_efi_init(mp, efi_formatp->efi_nextents); if ((error = xfs_efi_copy_format(&(item->ri_buf[0]), &(efip->efi_format)))) { xfs_efi_item_free(efip); return error; } - efip->efi_next_extent = efi_formatp->efi_nextents; - efip->efi_flags |= XFS_EFI_COMMITTED; + atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents); spin_lock(&log->l_ailp->xa_lock); /* * xfs_trans_ail_update() drops the AIL lock. */ - xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn); + xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn); return 0; } @@ -2751,11 +3105,10 @@ xlog_recover_do_efi_trans( * efd format structure. If we find it, we remove the efi from the * AIL and free it. */ -STATIC void -xlog_recover_do_efd_trans( - xlog_t *log, - xlog_recover_item_t *item, - int pass) +STATIC int +xlog_recover_efd_pass2( + struct xlog *log, + struct xlog_recover_item *item) { xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; @@ -2764,11 +3117,7 @@ xlog_recover_do_efd_trans( struct xfs_ail_cursor cur; struct xfs_ail *ailp = log->l_ailp; - if (pass == XLOG_RECOVER_PASS1) { - return; - } - - efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr; + efd_formatp = item->ri_buf[0].i_addr; ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) + ((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) || (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) + @@ -2789,7 +3138,8 @@ xlog_recover_do_efd_trans( * xfs_trans_ail_delete() drops the * AIL lock. */ - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, + SHUTDOWN_CORRUPT_INCORE); xfs_efi_item_free(efip); spin_lock(&ailp->xa_lock); break; @@ -2797,66 +3147,96 @@ xlog_recover_do_efd_trans( } lip = xfs_trans_ail_cursor_next(ailp, &cur); } - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); + + return 0; } /* - * Perform the transaction - * - * If the transaction modifies a buffer or inode, do it now. Otherwise, - * EFIs and EFDs get queued up by adding entries into the AIL for them. + * This routine is called when an inode create format structure is found in a + * committed transaction in the log. 
Its purpose is to initialise the inodes
+ * being allocated on disk. This requires us to get inode cluster buffers that
+ * match the range to be initialised, stamped with inode templates and written
+ * by delayed write so that subsequent modifications will hit the cached buffer
+ * and only need writing out at the end of recovery.
 */
 STATIC int
-xlog_recover_do_trans(
-	xlog_t			*log,
-	xlog_recover_t		*trans,
-	int			pass)
+xlog_recover_do_icreate_pass2(
+	struct xlog		*log,
+	struct list_head	*buffer_list,
+	xlog_recover_item_t	*item)
 {
-	int			error = 0;
-	xlog_recover_item_t	*item, *first_item;
+	struct xfs_mount	*mp = log->l_mp;
+	struct xfs_icreate_log	*icl;
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+	unsigned int		count;
+	unsigned int		isize;
+	xfs_agblock_t		length;
+
+	icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr;
+	if (icl->icl_type != XFS_LI_ICREATE) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type");
+		return EINVAL;
+	}

-	error = xlog_recover_reorder_trans(trans);
-	if (error)
-		return error;
+	if (icl->icl_size != 1) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size");
+		return EINVAL;
+	}

-	first_item = item = trans->r_itemq;
-	do {
-		switch (ITEM_TYPE(item)) {
-		case XFS_LI_BUF:
-			error = xlog_recover_do_buffer_trans(log, item, pass);
-			break;
-		case XFS_LI_INODE:
-			error = xlog_recover_do_inode_trans(log, item, pass);
-			break;
-		case XFS_LI_EFI:
-			error = xlog_recover_do_efi_trans(log, item,
-							  trans->r_lsn, pass);
-			break;
-		case XFS_LI_EFD:
-			xlog_recover_do_efd_trans(log, item, pass);
-			error = 0;
-			break;
-		case XFS_LI_DQUOT:
-			error = xlog_recover_do_dquot_trans(log, item, pass);
-			break;
-		case XFS_LI_QUOTAOFF:
-			error = xlog_recover_do_quotaoff_trans(log, item,
-							       pass);
-			break;
-		default:
-			xlog_warn(
-	"XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
-			ASSERT(0);
-			error = XFS_ERROR(EIO);
-			break;
-		}
+	agno = be32_to_cpu(icl->icl_ag);
+	if (agno >= mp->m_sb.sb_agcount) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno");
+		return EINVAL;
+	}
+	agbno = be32_to_cpu(icl->icl_agbno);
+	if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno");
+		return EINVAL;
+	}
+	isize = be32_to_cpu(icl->icl_isize);
+	if (isize != mp->m_sb.sb_inodesize) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize");
+		return EINVAL;
+	}
+	count = be32_to_cpu(icl->icl_count);
+	if (!count) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count");
+		return EINVAL;
+	}
+	length = be32_to_cpu(icl->icl_length);
+	if (!length || length >= mp->m_sb.sb_agblocks) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length");
+		return EINVAL;
+	}

-		if (error)
-			return error;
-		item = item->ri_next;
-	} while (first_item != item);
+	/* existing allocation is fixed value */
+	ASSERT(count == mp->m_ialloc_inos);
+	ASSERT(length == mp->m_ialloc_blks);
+	if (count != mp->m_ialloc_inos ||
+	     length != mp->m_ialloc_blks) {
+		xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2");
+		return EINVAL;
+	}
+
+	/*
+	 * Inode buffers can be freed. Do not replay the inode initialisation
+	 * as we could be overwriting something written after this inode
+	 * buffer was cancelled.
+	 *
+	 * XXX: we need to iterate all buffers and only init those that are not
+	 * cancelled. I think that a more fine-grained factoring of
+	 * xfs_ialloc_inode_init may be appropriate here to enable this to be
+	 * done easily.
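+	 *
+	 * In outline - a sketch of the XXX above, not what this code does -
+	 * such a finer-grained version would walk each inode cluster in the
+	 * [agbno, agbno + length) range, test xlog_check_buffer_cancelled()
+	 * on just that cluster's daddr range, and initialise only the
+	 * clusters that are not cancelled, so one cancelled cluster would
+	 * not suppress the whole initialisation.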
+ */ + if (xlog_check_buffer_cancelled(log, + XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) + return 0; + xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, + be32_to_cpu(icl->icl_gen)); return 0; } @@ -2867,50 +3247,285 @@ xlog_recover_do_trans( */ STATIC void xlog_recover_free_trans( - xlog_recover_t *trans) + struct xlog_recover *trans) { - xlog_recover_item_t *first_item, *item, *free_item; + xlog_recover_item_t *item, *n; int i; - item = first_item = trans->r_itemq; - do { - free_item = item; - item = item->ri_next; - /* Free the regions in the item. */ - for (i = 0; i < free_item->ri_cnt; i++) { - kmem_free(free_item->ri_buf[i].i_addr); - } + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { + /* Free the regions in the item. */ + list_del(&item->ri_list); + for (i = 0; i < item->ri_cnt; i++) + kmem_free(item->ri_buf[i].i_addr); /* Free the item itself */ - kmem_free(free_item->ri_buf); - kmem_free(free_item); - } while (first_item != item); + kmem_free(item->ri_buf); + kmem_free(item); + } /* Free the transaction recover structure */ kmem_free(trans); } +STATIC void +xlog_recover_buffer_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_mount *mp = log->l_mp; + + if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + return; + } + + xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno, + buf_f->blf_len, NULL); +} + +STATIC void +xlog_recover_inode_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_inode_log_format ilf_buf; + struct xfs_inode_log_format *ilfp; + struct xfs_mount *mp = log->l_mp; + int error; + + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { + ilfp = item->ri_buf[0].i_addr; + } else { + ilfp = &ilf_buf; + memset(ilfp, 0, sizeof(*ilfp)); + error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp); + if (error) + return; + } + + if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0)) + return; + + xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno, + ilfp->ilf_len, &xfs_inode_buf_ra_ops); +} + +STATIC void +xlog_recover_dquot_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_disk_dquot *recddq; + struct xfs_dq_logformat *dq_f; + uint type; + + + if (mp->m_qflags == 0) + return; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) + return; + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) + return; + + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return; + + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + ASSERT(dq_f->qlf_len == 1); + + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); +} + +STATIC void +xlog_recover_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + xlog_recover_buffer_ra_pass2(log, item); + break; + case XFS_LI_INODE: + xlog_recover_inode_ra_pass2(log, item); + break; + case XFS_LI_DQUOT: + xlog_recover_dquot_ra_pass2(log, item); + break; + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_QUOTAOFF: + default: + break; + } +} + +STATIC int +xlog_recover_commit_pass1( + struct xlog *log, + struct xlog_recover *trans, + struct xlog_recover_item *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); + + switch (ITEM_TYPE(item)) { 
+ case XFS_LI_BUF: + return xlog_recover_buffer_pass1(log, item); + case XFS_LI_QUOTAOFF: + return xlog_recover_quotaoff_pass1(log, item); + case XFS_LI_INODE: + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_DQUOT: + case XFS_LI_ICREATE: + /* nothing to do in pass 1 */ + return 0; + default: + xfs_warn(log->l_mp, "%s: invalid item type (%d)", + __func__, ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +STATIC int +xlog_recover_commit_pass2( + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct xlog_recover_item *item) +{ + trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); + + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + return xlog_recover_buffer_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_INODE: + return xlog_recover_inode_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_EFI: + return xlog_recover_efi_pass2(log, item, trans->r_lsn); + case XFS_LI_EFD: + return xlog_recover_efd_pass2(log, item); + case XFS_LI_DQUOT: + return xlog_recover_dquot_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_ICREATE: + return xlog_recover_do_icreate_pass2(log, buffer_list, item); + case XFS_LI_QUOTAOFF: + /* nothing to do in pass2 */ + return 0; + default: + xfs_warn(log->l_mp, "%s: invalid item type (%d)", + __func__, ITEM_TYPE(item)); + ASSERT(0); + return XFS_ERROR(EIO); + } +} + +STATIC int +xlog_recover_items_pass2( + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct list_head *item_list) +{ + struct xlog_recover_item *item; + int error = 0; + + list_for_each_entry(item, item_list, ri_list) { + error = xlog_recover_commit_pass2(log, trans, + buffer_list, item); + if (error) + return error; + } + + return error; +} + +/* + * Perform the transaction. + * + * If the transaction modifies a buffer or inode, do it now. Otherwise, + * EFIs and EFDs get queued up by adding entries into the AIL for them. 
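+ *
+ * In pass 2, items are moved to a local list in batches of up to
+ * XLOG_RECOVER_COMMIT_QUEUE_MAX, with buffer readahead issued for each
+ * item first, so the metadata is likely to be cached by the time the
+ * batch is replayed.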
+ */ STATIC int xlog_recover_commit_trans( - xlog_t *log, - xlog_recover_t **q, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, int pass) { - int error; + int error = 0; + int error2; + int items_queued = 0; + struct xlog_recover_item *item; + struct xlog_recover_item *next; + LIST_HEAD (buffer_list); + LIST_HEAD (ra_list); + LIST_HEAD (done_list); - if ((error = xlog_recover_unlink_tid(q, trans))) - return error; - if ((error = xlog_recover_do_trans(log, trans, pass))) + #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 + + hlist_del(&trans->r_list); + + error = xlog_recover_reorder_trans(log, trans, pass); + if (error) return error; - xlog_recover_free_trans(trans); /* no error */ - return 0; + + list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) { + switch (pass) { + case XLOG_RECOVER_PASS1: + error = xlog_recover_commit_pass1(log, trans, item); + break; + case XLOG_RECOVER_PASS2: + xlog_recover_ra_pass2(log, item); + list_move_tail(&item->ri_list, &ra_list); + items_queued++; + if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + items_queued = 0; + } + + break; + default: + ASSERT(0); + } + + if (error) + goto out; + } + +out: + if (!list_empty(&ra_list)) { + if (!error) + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + } + + if (!list_empty(&done_list)) + list_splice_init(&done_list, &trans->r_itemq); + + xlog_recover_free_trans(trans); + + error2 = xfs_buf_delwri_submit(&buffer_list); + return error ? error : error2; } STATIC int xlog_recover_unmount_trans( - xlog_recover_t *trans) + struct xlog *log) { /* Do nothing now */ - xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR"); + xfs_warn(log->l_mp, "%s: Unmount LR", __func__); return 0; } @@ -2925,9 +3540,9 @@ xlog_recover_unmount_trans( */ STATIC int xlog_recover_process_data( - xlog_t *log, - xlog_recover_t *rhash[], - xlog_rec_header_t *rhead, + struct xlog *log, + struct hlist_head rhash[], + struct xlog_rec_header *rhead, xfs_caddr_t dp, int pass) { @@ -2953,22 +3568,22 @@ xlog_recover_process_data( dp += sizeof(xlog_op_header_t); if (ohead->oh_clientid != XFS_TRANSACTION && ohead->oh_clientid != XFS_LOG) { - xlog_warn( - "XFS: xlog_recover_process_data: bad clientid"); + xfs_warn(log->l_mp, "%s: bad clientid 0x%x", + __func__, ohead->oh_clientid); ASSERT(0); return (XFS_ERROR(EIO)); } tid = be32_to_cpu(ohead->oh_tid); hash = XLOG_RHASH(tid); - trans = xlog_recover_find_tid(rhash[hash], tid); + trans = xlog_recover_find_tid(&rhash[hash], tid); if (trans == NULL) { /* not found; add new tid */ if (ohead->oh_flags & XLOG_START_TRANS) xlog_recover_new_tid(&rhash[hash], tid, be64_to_cpu(rhead->h_lsn)); } else { if (dp + be32_to_cpu(ohead->oh_len) > lp) { - xlog_warn( - "XFS: xlog_recover_process_data: bad length"); + xfs_warn(log->l_mp, "%s: bad length 0x%x", + __func__, be32_to_cpu(ohead->oh_len)); WARN_ON(1); return (XFS_ERROR(EIO)); } @@ -2978,35 +3593,38 @@ xlog_recover_process_data( switch (flags) { case XLOG_COMMIT_TRANS: error = xlog_recover_commit_trans(log, - &rhash[hash], trans, pass); + trans, pass); break; case XLOG_UNMOUNT_TRANS: - error = xlog_recover_unmount_trans(trans); + error = xlog_recover_unmount_trans(log); break; case XLOG_WAS_CONT_TRANS: - error = xlog_recover_add_to_cont_trans(trans, - dp, be32_to_cpu(ohead->oh_len)); + error = xlog_recover_add_to_cont_trans(log, + trans, dp, + 
be32_to_cpu(ohead->oh_len)); break; case XLOG_START_TRANS: - xlog_warn( - "XFS: xlog_recover_process_data: bad transaction"); + xfs_warn(log->l_mp, "%s: bad transaction", + __func__); ASSERT(0); error = XFS_ERROR(EIO); break; case 0: case XLOG_CONTINUE_TRANS: - error = xlog_recover_add_to_trans(trans, + error = xlog_recover_add_to_trans(log, trans, dp, be32_to_cpu(ohead->oh_len)); break; default: - xlog_warn( - "XFS: xlog_recover_process_data: bad flag"); + xfs_warn(log->l_mp, "%s: bad flag 0x%x", + __func__, flags); ASSERT(0); error = XFS_ERROR(EIO); break; } - if (error) + if (error) { + xlog_recover_free_trans(trans); return error; + } } dp += be32_to_cpu(ohead->oh_len); num_logops--; @@ -3030,7 +3648,7 @@ xlog_recover_process_efi( xfs_extent_t *extp; xfs_fsblock_t startblock_fsb; - ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED)); + ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)); /* * First check the validity of the extents described by the @@ -3049,13 +3667,14 @@ xlog_recover_process_efi( * This will pull the EFI from the AIL and * free the memory associated with it. */ + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip, efip->efi_format.efi_nextents); return XFS_ERROR(EIO); } } tp = xfs_trans_alloc(mp, 0); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto abort_error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); @@ -3069,7 +3688,7 @@ xlog_recover_process_efi( extp->ext_len); } - efip->efi_flags |= XFS_EFI_RECOVERED; + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); error = xfs_trans_commit(tp, 0); return error; @@ -3098,7 +3717,7 @@ abort_error: */ STATIC int xlog_recover_process_efis( - xlog_t *log) + struct xlog *log) { xfs_log_item_t *lip; xfs_efi_log_item_t *efip; @@ -3126,7 +3745,7 @@ xlog_recover_process_efis( * Skip EFIs that we've already processed. */ efip = (xfs_efi_log_item_t *)lip; - if (efip->efi_flags & XFS_EFI_RECOVERED) { + if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) { lip = xfs_trans_ail_cursor_next(ailp, &cur); continue; } @@ -3139,7 +3758,7 @@ xlog_recover_process_efis( lip = xfs_trans_ail_cursor_next(ailp, &cur); } out: - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); return error; } @@ -3161,8 +3780,7 @@ xlog_recover_clear_agi_bucket( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); - error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), - 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0); if (error) goto out_abort; @@ -3185,8 +3803,7 @@ xlog_recover_clear_agi_bucket( out_abort: xfs_trans_cancel(tp, XFS_TRANS_ABORT); out_error: - xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: " - "failed to clear agi %d. Continuing.", agno); + xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); return; } @@ -3204,14 +3821,14 @@ xlog_recover_process_one_iunlink( int error; ino = XFS_AGINO_TO_INO(mp, agno, agino); - error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0); + error = xfs_iget(mp, NULL, ino, 0, 0, &ip); if (error) goto fail; /* * Get the on disk inode to find the next inode in the bucket. 
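	 * (the unlinked chain runs through each on-disk inode's
	 * di_next_unlinked field)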
 */
-	error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK);
+	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0);
 	if (error)
 		goto fail_iput;
@@ -3260,7 +3877,7 @@ xlog_recover_process_one_iunlink(
  */
 STATIC void
 xlog_recover_process_iunlinks(
-	xlog_t	*log)
+	struct xlog	*log)
 {
 	xfs_mount_t	*mp;
 	xfs_agnumber_t	agno;
@@ -3293,152 +3910,83 @@ xlog_recover_process_iunlinks(
 			 */
 			continue;
 		}
+		/*
+		 * Unlock the buffer so that it can be acquired in the normal
+		 * course of the transaction to truncate and free each inode.
+		 * Because we are not racing with anyone else here for the AGI
+		 * buffer, we don't even need to hold it locked to read the
+		 * initial unlinked bucket entries out of the buffer. We keep
+		 * the buffer reference, though, so that it stays pinned in
+		 * memory while we need the buffer.
+		 */
 		agi = XFS_BUF_TO_AGI(agibp);
+		xfs_buf_unlock(agibp);

 		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
 			agino = be32_to_cpu(agi->agi_unlinked[bucket]);
 			while (agino != NULLAGINO) {
-				/*
-				 * Release the agi buffer so that it can
-				 * be acquired in the normal course of the
-				 * transaction to truncate and free the inode.
-				 */
-				xfs_buf_relse(agibp);
-
 				agino = xlog_recover_process_one_iunlink(mp,
 							agno, agino, bucket);
-
-				/*
-				 * Reacquire the agibuffer and continue around
-				 * the loop. This should never fail as we know
-				 * the buffer was good earlier on.
-				 */
-				error = xfs_read_agi(mp, NULL, agno, &agibp);
-				ASSERT(error == 0);
-				agi = XFS_BUF_TO_AGI(agibp);
 			}
 		}
-
-		/*
-		 * Release the buffer for the current agi so we can
-		 * go on to the next one.
-		 */
-		xfs_buf_relse(agibp);
+		xfs_buf_rele(agibp);
 	}

 	mp->m_dmevmask = mp_dmevmask;
 }
-
-#ifdef DEBUG
-STATIC void
-xlog_pack_data_checksum(
-	xlog_t		*log,
-	xlog_in_core_t	*iclog,
-	int		size)
-{
-	int		i;
-	__be32		*up;
-	uint		chksum = 0;
-
-	up = (__be32 *)iclog->ic_datap;
-	/* divide length by 4 to get # words */
-	for (i = 0; i < (size >> 2); i++) {
-		chksum ^= be32_to_cpu(*up);
-		up++;
-	}
-	iclog->ic_header.h_chksum = cpu_to_be32(chksum);
-}
-#else
-#define xlog_pack_data_checksum(log, iclog, size)
-#endif
-
 /*
- * Stamp cycle number in every block
+ * Unpack the log buffer data and CRC check it. If the check fails, issue a
+ * warning if and only if the CRC in the header is non-zero. This makes the
+ * check an advisory warning, and the zero CRC check will prevent failure
+ * warnings from being emitted when upgrading the kernel from one that does not
+ * add CRCs by default.
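+ * (A zero h_crc almost certainly means the record was written by a kernel
+ * that never computed one, not that the record is corrupt.)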
+ * + * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log + * corruption failure */ -void -xlog_pack_data( - xlog_t *log, - xlog_in_core_t *iclog, - int roundoff) +STATIC int +xlog_unpack_data_crc( + struct xlog_rec_header *rhead, + xfs_caddr_t dp, + struct xlog *log) { - int i, j, k; - int size = iclog->ic_offset + roundoff; - __be32 cycle_lsn; - xfs_caddr_t dp; - - xlog_pack_data_checksum(log, iclog, size); - - cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); - - dp = iclog->ic_datap; - for (i = 0; i < BTOBB(size) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; - } - - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xlog_in_core_2_t *xhdr = iclog->ic_data; - - for ( ; i < BTOBB(size); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; + __le32 crc; + + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + if (crc != rhead->h_crc) { + if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + xfs_alert(log->l_mp, + "log record CRC mismatch: found 0x%x, expected 0x%x.", + le32_to_cpu(rhead->h_crc), + le32_to_cpu(crc)); + xfs_hex_dump(dp, 32); } - for (i = 1; i < log->l_iclog_heads; i++) { - xhdr[i].hic_xheader.xh_cycle = cycle_lsn; - } + /* + * If we've detected a log record corruption, then we can't + * recover past this point. Abort recovery if we are enforcing + * CRC protection by punting an error back up the stack. + */ + if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) + return EFSCORRUPTED; } -} -#if defined(DEBUG) && defined(XFS_LOUD_RECOVERY) -STATIC void -xlog_unpack_data_checksum( - xlog_rec_header_t *rhead, - xfs_caddr_t dp, - xlog_t *log) -{ - __be32 *up = (__be32 *)dp; - uint chksum = 0; - int i; - - /* divide length by 4 to get # words */ - for (i=0; i < be32_to_cpu(rhead->h_len) >> 2; i++) { - chksum ^= be32_to_cpu(*up); - up++; - } - if (chksum != be32_to_cpu(rhead->h_chksum)) { - if (rhead->h_chksum || - ((log->l_flags & XLOG_CHKSUM_MISMATCH) == 0)) { - cmn_err(CE_DEBUG, - "XFS: LogR chksum mismatch: was (0x%x) is (0x%x)\n", - be32_to_cpu(rhead->h_chksum), chksum); - cmn_err(CE_DEBUG, -"XFS: Disregard message if filesystem was created with non-DEBUG kernel"); - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - cmn_err(CE_DEBUG, - "XFS: LogR this is a LogV2 filesystem\n"); - } - log->l_flags |= XLOG_CHKSUM_MISMATCH; - } - } + return 0; } -#else -#define xlog_unpack_data_checksum(rhead, dp, log) -#endif -STATIC void +STATIC int xlog_unpack_data( - xlog_rec_header_t *rhead, + struct xlog_rec_header *rhead, xfs_caddr_t dp, - xlog_t *log) + struct xlog *log) { int i, j, k; + int error; + + error = xlog_unpack_data_crc(rhead, dp, log); + if (error) + return error; for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { @@ -3456,18 +4004,18 @@ xlog_unpack_data( } } - xlog_unpack_data_checksum(rhead, dp, log); + return 0; } STATIC int xlog_valid_rec_header( - xlog_t *log, - xlog_rec_header_t *rhead, + struct xlog *log, + struct xlog_rec_header *rhead, xfs_daddr_t blkno) { int hlen; - if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) { + if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) { XFS_ERROR_REPORT("xlog_valid_rec_header(1)", XFS_ERRLEVEL_LOW, log->l_mp); return XFS_ERROR(EFSCORRUPTED); @@ -3475,7 +4023,7 @@ 
xlog_valid_rec_header( if (unlikely( (!rhead->h_version || (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { - xlog_warn("XFS: %s: unrecognised log version (%d).", + xfs_warn(log->l_mp, "%s: unrecognised log version (%d).", __func__, be32_to_cpu(rhead->h_version)); return XFS_ERROR(EIO); } @@ -3505,7 +4053,7 @@ xlog_valid_rec_header( */ STATIC int xlog_do_recovery_pass( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, int pass) @@ -3517,7 +4065,7 @@ xlog_do_recovery_pass( int error = 0, h_size; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; - xlog_recover_t *rhash[XLOG_RHASH_SIZE]; + struct hlist_head rhash[XLOG_RHASH_SIZE]; ASSERT(head_blk != tail_blk); @@ -3555,7 +4103,7 @@ xlog_do_recovery_pass( hblks = 1; } } else { - ASSERT(log->l_sectbb_log == 0); + ASSERT(log->l_sectBBsize == 1); hblks = 1; hbp = xlog_get_bp(log, 1); h_size = XLOG_BIG_RECORD_BSIZE; @@ -3588,9 +4136,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, - rhash, rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, + rhash, rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3605,7 +4157,7 @@ xlog_do_recovery_pass( /* * Check for header wrapping around physical end-of-log */ - offset = XFS_BUF_PTR(hbp); + offset = hbp->b_addr; split_hblks = 0; wrapped_hblks = 0; if (blk_no + hblks <= log->l_logBBsize) { @@ -3641,19 +4193,9 @@ xlog_do_recovery_pass( * - order is important. */ wrapped_hblks = hblks - split_hblks; - error = XFS_BUF_SET_PTR(hbp, - offset + BBTOB(split_hblks), - BBTOB(hblks - split_hblks)); - if (error) - goto bread_err2; - - error = xlog_bread_noalign(log, 0, - wrapped_hblks, hbp); - if (error) - goto bread_err2; - - error = XFS_BUF_SET_PTR(hbp, offset, - BBTOB(hblks)); + error = xlog_bread_offset(log, 0, + wrapped_hblks, hbp, + offset + BBTOB(split_hblks)); if (error) goto bread_err2; } @@ -3675,7 +4217,7 @@ xlog_do_recovery_pass( } else { /* This log record is split across the * physical end of log */ - offset = XFS_BUF_PTR(dbp); + offset = dbp->b_addr; split_bblks = 0; if (blk_no != log->l_logBBsize) { /* some data is before the physical @@ -3704,25 +4246,20 @@ xlog_do_recovery_pass( * _first_, then the log start (LR header end) * - order is important. 
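				 * (here that means the xlog_bread_offset() call
				 * below pulls the wrapped portion from block 0
				 * of the log into the buffer at
				 * offset + BBTOB(split_bblks))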
@@ -3675,7 +4217,7 @@
 			} else {
 				/* This log record is split across the
 				 * physical end of log */
-				offset = XFS_BUF_PTR(dbp);
+				offset = dbp->b_addr;
 				split_bblks = 0;
 				if (blk_no != log->l_logBBsize) {
 					/* some data is before the physical
@@ -3704,25 +4246,20 @@
 				 * _first_, then the log start (LR header end)
 				 *   - order is important.
 				 */
-				error = XFS_BUF_SET_PTR(dbp,
-						offset + BBTOB(split_bblks),
-						BBTOB(bblks - split_bblks));
+				error = xlog_bread_offset(log, 0,
+						bblks - split_bblks, dbp,
+						offset + BBTOB(split_bblks));
 				if (error)
 					goto bread_err2;
+			}
 
-				error = xlog_bread_noalign(log, wrapped_hblks,
-						bblks - split_bblks,
-						dbp);
-				if (error)
-					goto bread_err2;
+			error = xlog_unpack_data(rhead, offset, log);
+			if (error)
+				goto bread_err2;
 
-				error = XFS_BUF_SET_PTR(dbp, offset, h_size);
-				if (error)
-					goto bread_err2;
-			}
-			xlog_unpack_data(rhead, offset, log);
-			if ((error = xlog_recover_process_data(log, rhash,
-							rhead, offset, pass)))
+			error = xlog_recover_process_data(log, rhash,
+							rhead, offset, pass);
+			if (error)
 				goto bread_err2;
 			blk_no += bblks;
 		}
@@ -3747,9 +4284,13 @@
 			if (error)
 				goto bread_err2;
 
-			xlog_unpack_data(rhead, offset, log);
-			if ((error = xlog_recover_process_data(log, rhash,
-							rhead, offset, pass)))
+			error = xlog_unpack_data(rhead, offset, log);
+			if (error)
+				goto bread_err2;
+
+			error = xlog_recover_process_data(log, rhash,
+							rhead, offset, pass);
+			if (error)
 				goto bread_err2;
 			blk_no += bblks + hblks;
 		}
@@ -3777,11 +4318,11 @@
  */
 STATIC int
 xlog_do_log_recovery(
-	xlog_t		*log,
+	struct xlog	*log,
 	xfs_daddr_t	head_blk,
 	xfs_daddr_t	tail_blk)
 {
-	int		error;
+	int		error, i;
 
 	ASSERT(head_blk != tail_blk);
 
@@ -3789,10 +4330,12 @@ xlog_do_log_recovery(
 	 * First do a pass to find all of the cancelled buf log items.
 	 * Store them in the buf_cancel_table for use in the second pass.
 	 */
-	log->l_buf_cancel_table =
-	    (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
-						 sizeof(xfs_buf_cancel_t*),
+	log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
+						 sizeof(struct list_head),
 						 KM_SLEEP);
+	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
+		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
+
 	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
 				      XLOG_RECOVER_PASS1);
 	if (error != 0) {
@@ -3811,7 +4354,7 @@ xlog_do_log_recovery(
 		int	i;
 
 		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
-			ASSERT(log->l_buf_cancel_table[i] == NULL);
+			ASSERT(list_empty(&log->l_buf_cancel_table[i]));
 	}
 #endif	/* DEBUG */
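The cancel-table rework above (a flat pointer array becomes an array of list heads, one per hash bucket) serves the two-pass scheme spelled out in the comment: pass 1 records every cancelled buffer log item, and pass 2 consults the table so a stale buffer image is never replayed over a block that was later freed and reused. A compact userspace sketch of such a table, assuming a singly-linked bucket chain and a modulo hash; BC_TABLE_SIZE, bc_add() and bc_cancelled() are illustrative names, not the kernel's.

#include <stdint.h>
#include <stdlib.h>

#define BC_TABLE_SIZE	64	/* stand-in for XLOG_BC_TABLE_SIZE */

struct buf_cancel {
	uint64_t		blkno;	/* disk address of the buffer */
	unsigned int		len;	/* length of the buffer */
	struct buf_cancel	*next;	/* hash bucket chain */
};

static struct buf_cancel *bc_table[BC_TABLE_SIZE];

/* pass 1: remember a cancelled buffer so pass 2 can skip replaying it */
static int
bc_add(uint64_t blkno, unsigned int len)
{
	struct buf_cancel *bc = calloc(1, sizeof(*bc));

	if (!bc)
		return -1;
	bc->blkno = blkno;
	bc->len = len;
	bc->next = bc_table[blkno % BC_TABLE_SIZE];
	bc_table[blkno % BC_TABLE_SIZE] = bc;
	return 0;
}

/* pass 2: should the buffer log item at (blkno, len) be skipped? */
static int
bc_cancelled(uint64_t blkno, unsigned int len)
{
	struct buf_cancel *bc;

	for (bc = bc_table[blkno % BC_TABLE_SIZE]; bc != NULL; bc = bc->next)
		if (bc->blkno == blkno && bc->len == len)
			return 1;
	return 0;
}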
@@ -3826,7 +4369,7 @@ xlog_do_log_recovery(
  */
 STATIC int
 xlog_do_recover(
-	xlog_t		*log,
+	struct xlog	*log,
 	xfs_daddr_t	head_blk,
 	xfs_daddr_t	tail_blk)
 {
@@ -3838,11 +4381,8 @@ xlog_do_recover(
 	 * First replay the images in the log.
 	 */
 	error = xlog_do_log_recovery(log, head_blk, tail_blk);
-	if (error) {
+	if (error)
 		return error;
-	}
-
-	XFS_bflush(log->l_mp->m_ddev_targp);
 
 	/*
 	 * If IO errors happened during recovery, bail out.
@@ -3864,19 +4404,24 @@
 	/*
 	 * Now that we've finished replaying all buffer and inode
-	 * updates, re-read in the superblock.
+	 * updates, re-read in the superblock and reverify it.
 	 */
 	bp = xfs_getsb(log->l_mp, 0);
 	XFS_BUF_UNDONE(bp);
 	ASSERT(!(XFS_BUF_ISWRITE(bp)));
-	ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
 	XFS_BUF_READ(bp);
 	XFS_BUF_UNASYNC(bp);
-	xfsbdstrat(log->l_mp, bp);
-	error = xfs_iowait(bp);
+	bp->b_ops = &xfs_sb_buf_ops;
+
+	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
+		xfs_buf_relse(bp);
+		return XFS_ERROR(EIO);
+	}
+
+	xfs_buf_iorequest(bp);
+	error = xfs_buf_iowait(bp);
 	if (error) {
-		xfs_ioerror_alert("xlog_do_recover",
-				log->l_mp, bp, XFS_BUF_ADDR(bp));
+		xfs_buf_ioerror_alert(bp, __func__);
 		ASSERT(0);
 		xfs_buf_relse(bp);
 		return error;
 	}
@@ -3906,7 +4451,7 @@ xlog_do_recover(
  */
 int
 xlog_recover(
-	xlog_t		*log)
+	struct xlog	*log)
 {
 	xfs_daddr_t	head_blk, tail_blk;
 	int		error;
@@ -3931,10 +4476,28 @@ xlog_recover(
 		return error;
 	}
 
-	cmn_err(CE_NOTE,
-		"Starting XFS recovery on filesystem: %s (logdev: %s)",
-		log->l_mp->m_fsname, log->l_mp->m_logname ?
-		log->l_mp->m_logname : "internal");
+	/*
+	 * Version 5 superblock log feature mask validation. We know the
+	 * log is dirty so check if there are any unknown log features
+	 * in what we need to recover. If there are unknown features
+	 * (e.g. unsupported transactions), then simply reject the
+	 * attempt at recovery before touching anything.
+	 */
+	if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 &&
+	    xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
+					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
+		xfs_warn(log->l_mp,
+"Superblock has unknown incompatible log features (0x%x) enabled.\n"
+"The log can not be fully and/or safely recovered by this kernel.\n"
+"Please recover the log on a kernel that supports the unknown features.",
+			(log->l_mp->m_sb.sb_features_log_incompat &
+			 XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
+		return EINVAL;
+	}
+
+	xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
+			log->l_mp->m_logname ? log->l_mp->m_logname
+					     : "internal");
 
 	error = xlog_do_recover(log, head_blk, tail_blk);
 	log->l_flags |= XLOG_RECOVERY_NEEDED;
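The new v5 gate reduces to a single mask test: if any incompat-log bit outside the set this kernel understands is lit, recovery is refused up front rather than risking replay of transaction types it cannot parse. A hedged sketch of that test, mirroring the way XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN is defined as the complement of the known XFS_SB_FEAT_INCOMPAT_LOG_ALL bits; the struct and constants here are illustrative, not the on-disk format.

#include <stdint.h>

#define SB_VERSION_5			5
#define FEAT_INCOMPAT_LOG_ALL		0x0u	/* bits this code knows */
#define FEAT_INCOMPAT_LOG_UNKNOWN	(~FEAT_INCOMPAT_LOG_ALL)

struct sb_features {
	int		sb_version;
	uint32_t	sb_features_log_incompat;
};

/*
 * A dirty log may hold transaction types this code cannot replay.
 * Refuse recovery if any unknown incompat-log bit is set rather than
 * guess and corrupt the filesystem.
 */
static int
log_recovery_allowed(const struct sb_features *sb)
{
	if (sb->sb_version != SB_VERSION_5)
		return 1;	/* pre-v5: no feature mask to honour */
	return !(sb->sb_features_log_incompat & FEAT_INCOMPAT_LOG_UNKNOWN);
}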
log->l_mp->m_logname + : "internal"); log->l_flags &= ~XLOG_RECOVERY_NEEDED; } else { - cmn_err(CE_DEBUG, - "!Ending clean XFS mount for filesystem: %s\n", - log->l_mp->m_fsname); + xfs_info(log->l_mp, "Ending clean mount"); } return 0; } @@ -4006,16 +4563,12 @@ xlog_recover_finish( */ void xlog_recover_check_summary( - xlog_t *log) + struct xlog *log) { xfs_mount_t *mp; xfs_agf_t *agfp; xfs_buf_t *agfbp; xfs_buf_t *agibp; - xfs_buf_t *sbbp; -#ifdef XFS_LOUD_RECOVERY - xfs_sb_t *sbp; -#endif xfs_agnumber_t agno; __uint64_t freeblks; __uint64_t itotal; @@ -4030,10 +4583,8 @@ xlog_recover_check_summary( for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { error = xfs_read_agf(mp, NULL, agno, 0, &agfbp); if (error) { - xfs_fs_cmn_err(CE_ALERT, mp, - "xlog_recover_check_summary(agf)" - "agf read failed agno %d error %d", - agno, error); + xfs_alert(mp, "%s agf read failed agno %d error %d", + __func__, agno, error); } else { agfp = XFS_BUF_TO_AGF(agfbp); freeblks += be32_to_cpu(agfp->agf_freeblks) + @@ -4042,7 +4593,10 @@ xlog_recover_check_summary( } error = xfs_read_agi(mp, NULL, agno, &agibp); - if (!error) { + if (error) { + xfs_alert(mp, "%s agi read failed agno %d error %d", + __func__, agno, error); + } else { struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp); itotal += be32_to_cpu(agi->agi_count); @@ -4050,30 +4604,5 @@ xlog_recover_check_summary( xfs_buf_relse(agibp); } } - - sbbp = xfs_getsb(mp, 0); -#ifdef XFS_LOUD_RECOVERY - sbp = &mp->m_sb; - xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(sbbp)); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_icount %Lu itotal %Lu", - sbp->sb_icount, itotal); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_ifree %Lu itotal %Lu", - sbp->sb_ifree, ifree); - cmn_err(CE_NOTE, - "xlog_recover_check_summary: sb_fdblocks %Lu freeblks %Lu", - sbp->sb_fdblocks, freeblks); -#if 0 - /* - * This is turned off until I account for the allocation - * btree blocks which live in free space. - */ - ASSERT(sbp->sb_icount == itotal); - ASSERT(sbp->sb_ifree == ifree); - ASSERT(sbp->sb_fdblocks == freeblks); -#endif -#endif - xfs_buf_relse(sbbp); } #endif /* DEBUG */ |