diff options
Diffstat (limited to 'fs/xfs/xfs_log_recover.c')
| -rw-r--r-- | fs/xfs/xfs_log_recover.c | 1635 |
1 files changed, 1194 insertions, 441 deletions
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 15ff5392fb6..981af0f6504 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -17,37 +17,49 @@ */ #include "xfs.h" #include "xfs_fs.h" -#include "xfs_types.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" #include "xfs_bit.h" -#include "xfs_log.h" #include "xfs_inum.h" -#include "xfs_trans.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" -#include "xfs_error.h" -#include "xfs_bmap_btree.h" -#include "xfs_alloc_btree.h" -#include "xfs_ialloc_btree.h" -#include "xfs_dinode.h" +#include "xfs_da_format.h" #include "xfs_inode.h" -#include "xfs_inode_item.h" -#include "xfs_alloc.h" -#include "xfs_ialloc.h" +#include "xfs_trans.h" +#include "xfs_log.h" #include "xfs_log_priv.h" -#include "xfs_buf_item.h" #include "xfs_log_recover.h" +#include "xfs_inode_item.h" #include "xfs_extfree_item.h" #include "xfs_trans_priv.h" +#include "xfs_alloc.h" +#include "xfs_ialloc.h" #include "xfs_quota.h" -#include "xfs_rw.h" -#include "xfs_utils.h" +#include "xfs_cksum.h" #include "xfs_trace.h" +#include "xfs_icache.h" +#include "xfs_bmap_btree.h" +#include "xfs_dinode.h" +#include "xfs_error.h" +#include "xfs_dir2.h" -STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); -STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); +#define BLK_AVG(blk1, blk2) ((blk1+blk2) >> 1) + +STATIC int +xlog_find_zeroed( + struct xlog *, + xfs_daddr_t *); +STATIC int +xlog_clear_stale_blocks( + struct xlog *, + xfs_lsn_t); #if defined(DEBUG) -STATIC void xlog_recover_check_summary(xlog_t *); +STATIC void +xlog_recover_check_summary( + struct xlog *); #else #define xlog_recover_check_summary(log) #endif @@ -75,7 +87,7 @@ struct xfs_buf_cancel { static inline int xlog_buf_bbcount_valid( - xlog_t *log, + struct xlog *log, int bbcount) { return bbcount > 0 && bbcount <= log->l_logBBsize; @@ -88,7 +100,7 @@ xlog_buf_bbcount_valid( */ 
STATIC xfs_buf_t * xlog_get_bp( - xlog_t *log, + struct xlog *log, int nbblks) { struct xfs_buf *bp; @@ -120,7 +132,7 @@ xlog_get_bp( nbblks += log->l_sectBBsize; nbblks = round_up(nbblks, log->l_sectBBsize); - bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, BBTOB(nbblks), 0); + bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, nbblks, 0); if (bp) xfs_buf_unlock(bp); return bp; @@ -139,14 +151,14 @@ xlog_put_bp( */ STATIC xfs_caddr_t xlog_align( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1); - ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(offset + nbblks <= bp->b_length); return bp->b_addr + BBTOB(offset); } @@ -156,10 +168,10 @@ xlog_align( */ STATIC int xlog_bread_noalign( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { int error; @@ -174,13 +186,17 @@ xlog_bread_noalign( nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_READ(bp); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + bp->b_io_length = nbblks; + bp->b_error = 0; - xfsbdstrat(log->l_mp, bp); + if (XFS_FORCED_SHUTDOWN(log->l_mp)) + return XFS_ERROR(EIO); + + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) xfs_buf_ioerror_alert(bp, __func__); @@ -189,10 +205,10 @@ xlog_bread_noalign( STATIC int xlog_bread( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_caddr_t *offset) { int error; @@ -211,14 +227,14 @@ xlog_bread( */ STATIC int xlog_bread_offset( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, /* block to read from */ int nbblks, /* blocks to read */ - xfs_buf_t *bp, + struct xfs_buf *bp, xfs_caddr_t offset) { xfs_caddr_t orig_offset = bp->b_addr; - int orig_len = 
bp->b_buffer_length; + int orig_len = BBTOB(bp->b_length); int error, error2; error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks)); @@ -241,10 +257,10 @@ xlog_bread_offset( */ STATIC int xlog_bwrite( - xlog_t *log, + struct xlog *log, xfs_daddr_t blk_no, int nbblks, - xfs_buf_t *bp) + struct xfs_buf *bp) { int error; @@ -259,13 +275,14 @@ xlog_bwrite( nbblks = round_up(nbblks, log->l_sectBBsize); ASSERT(nbblks > 0); - ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp)); + ASSERT(nbblks <= bp->b_length); XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no); XFS_BUF_ZEROFLAGS(bp); xfs_buf_hold(bp); xfs_buf_lock(bp); - XFS_BUF_SET_COUNT(bp, BBTOB(nbblks)); + bp->b_io_length = nbblks; + bp->b_error = 0; error = xfs_bwrite(bp); if (error) @@ -283,9 +300,9 @@ xlog_header_check_dump( xfs_mount_t *mp, xlog_rec_header_t *head) { - xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d\n", + xfs_debug(mp, "%s: SB : uuid = %pU, fmt = %d", __func__, &mp->m_sb.sb_uuid, XLOG_FMT); - xfs_debug(mp, " log : uuid = %pU, fmt = %d\n", + xfs_debug(mp, " log : uuid = %pU, fmt = %d", &head->h_fs_uuid, be32_to_cpu(head->h_fmt)); } #else @@ -377,8 +394,8 @@ xlog_recover_iodone( */ STATIC int xlog_find_cycle_start( - xlog_t *log, - xfs_buf_t *bp, + struct xlog *log, + struct xfs_buf *bp, xfs_daddr_t first_blk, xfs_daddr_t *last_blk, uint cycle) @@ -420,7 +437,7 @@ xlog_find_cycle_start( */ STATIC int xlog_find_verify_cycle( - xlog_t *log, + struct xlog *log, xfs_daddr_t start_blk, int nbblks, uint stop_on_cycle_no, @@ -440,6 +457,8 @@ xlog_find_verify_cycle( * a log sector, or we're out of luck. 
*/ bufblks = 1 << ffs(nbblks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < log->l_sectBBsize) @@ -487,7 +506,7 @@ out: */ STATIC int xlog_find_verify_log_record( - xlog_t *log, + struct xlog *log, xfs_daddr_t start_blk, xfs_daddr_t *last_blk, int extra_bblks) @@ -584,7 +603,7 @@ out: /* * Head is defined to be the point of the log where the next log write - * write could go. This means that incomplete LR writes at the end are + * could go. This means that incomplete LR writes at the end are * eliminated when calculating the head. We aren't guaranteed that previous * LR have complete transactions. We only know that a cycle number of * current cycle number -1 won't be present in the log if we start writing @@ -597,7 +616,7 @@ out: */ STATIC int xlog_find_head( - xlog_t *log, + struct xlog *log, xfs_daddr_t *return_head_blk) { xfs_buf_t *bp; @@ -868,7 +887,7 @@ validate_head: */ STATIC int xlog_find_tail( - xlog_t *log, + struct xlog *log, xfs_daddr_t *head_blk, xfs_daddr_t *tail_blk) { @@ -940,6 +959,7 @@ xlog_find_tail( } if (!found) { xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + xlog_put_bp(bp); ASSERT(0); return XFS_ERROR(EIO); } @@ -965,9 +985,9 @@ xlog_find_tail( log->l_curr_cycle++; atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); - xlog_assign_grant_head(&log->l_grant_reserve_head, log->l_curr_cycle, + xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, BBTOB(log->l_curr_block)); - xlog_assign_grant_head(&log->l_grant_write_head, log->l_curr_cycle, + xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, BBTOB(log->l_curr_block)); /* @@ -1077,7 +1097,7 @@ done: */ STATIC int xlog_find_zeroed( - xlog_t *log, + struct xlog *log, xfs_daddr_t *blk_no) { xfs_buf_t *bp; @@ -1121,7 +1141,8 @@ xlog_find_zeroed( */ xfs_warn(log->l_mp, "Log inconsistent or not 
a log (last==0, first!=1)"); - return XFS_ERROR(EINVAL); + error = XFS_ERROR(EINVAL); + goto bp_err; } /* we have a partially zeroed log */ @@ -1180,7 +1201,7 @@ bp_err: */ STATIC void xlog_add_record( - xlog_t *log, + struct xlog *log, xfs_caddr_t buf, int cycle, int block, @@ -1202,7 +1223,7 @@ xlog_add_record( STATIC int xlog_write_log_records( - xlog_t *log, + struct xlog *log, int cycle, int start_block, int blocks, @@ -1225,6 +1246,8 @@ xlog_write_log_records( * log sector, or we're out of luck. */ bufblks = 1 << ffs(blocks); + while (bufblks > log->l_logBBsize) + bufblks >>= 1; while (!(bp = xlog_get_bp(log, bufblks))) { bufblks >>= 1; if (bufblks < sectbb) @@ -1300,7 +1323,7 @@ xlog_write_log_records( */ STATIC int xlog_clear_stale_blocks( - xlog_t *log, + struct xlog *log, xfs_lsn_t tail_lsn) { int tail_cycle, head_cycle; @@ -1427,9 +1450,8 @@ xlog_recover_find_tid( xlog_tid_t tid) { xlog_recover_t *trans; - struct hlist_node *n; - hlist_for_each_entry(trans, n, head, r_list) { + hlist_for_each_entry(trans, head, r_list) { if (trans->r_log_tid == tid) return trans; } @@ -1466,8 +1488,8 @@ xlog_recover_add_item( STATIC int xlog_recover_add_to_cont_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, xfs_caddr_t dp, int len) { @@ -1512,8 +1534,8 @@ xlog_recover_add_to_cont_trans( */ STATIC int xlog_recover_add_to_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, xfs_caddr_t dp, int len) { @@ -1558,6 +1580,7 @@ xlog_recover_add_to_trans( "bad number of regions (%d) in inode log format", in_f->ilf_size); ASSERT(0); + kmem_free(ptr); return XFS_ERROR(EIO); } @@ -1576,32 +1599,89 @@ xlog_recover_add_to_trans( } /* - * Sort the log items in the transaction. Cancelled buffers need - * to be put first so they are processed before any items that might - * modify the buffers. If they are cancelled, then the modifications - * don't need to be replayed. 
+ * Sort the log items in the transaction. + * + * The ordering constraints are defined by the inode allocation and unlink + * behaviour. The rules are: + * + * 1. Every item is only logged once in a given transaction. Hence it + * represents the last logged state of the item. Hence ordering is + * dependent on the order in which operations need to be performed so + * required initial conditions are always met. + * + * 2. Cancelled buffers are recorded in pass 1 in a separate table and + * there's nothing to replay from them so we can simply cull them + * from the transaction. However, we can't do that until after we've + * replayed all the other items because they may be dependent on the + * cancelled buffer and replaying the cancelled buffer can remove it + * from the cancelled buffer table. Hence they have to be done last. + * + * 3. Inode allocation buffers must be replayed before inode items that + * read the buffer and replay changes into it. For filesystems using the + * ICREATE transactions, this means XFS_LI_ICREATE objects need to get + * treated the same as inode allocation buffers as they create and + * initialise the buffers directly. + * + * 4. Inode unlink buffers must be replayed after inode items are replayed. + * This ensures that inodes are completely flushed to the inode buffer + * in a "free" state before we remove the unlinked inode list pointer. + * + * Hence the ordering needs to be inode allocation buffers first, inode items + * second, inode unlink buffers third and cancelled buffers last. + * + * But there's a problem with that - we can't tell an inode allocation buffer + * apart from a regular buffer, so we can't separate them. We can, however, + * tell an inode unlink buffer from the others, and so we can separate them out + * from all the other buffers and move them to last. 
+ * + * Hence, 4 lists, in order from head to tail: + * - buffer_list for all buffers except cancelled/inode unlink buffers + * - item_list for all non-buffer items + * - inode_buffer_list for inode unlink buffers + * - cancel_list for the cancelled buffers + * + * Note that we add objects to the tail of the lists so that first-to-last + * ordering is preserved within the lists. Adding objects to the head of the + * list means when we traverse from the head we walk them in last-to-first + * order. For cancelled buffers and inode unlink buffers this doesn't matter, + * but for all other items there may be specific ordering that we need to + * preserve. */ STATIC int xlog_recover_reorder_trans( - struct log *log, - xlog_recover_t *trans, + struct xlog *log, + struct xlog_recover *trans, int pass) { xlog_recover_item_t *item, *n; + int error = 0; LIST_HEAD(sort_list); + LIST_HEAD(cancel_list); + LIST_HEAD(buffer_list); + LIST_HEAD(inode_buffer_list); + LIST_HEAD(inode_list); list_splice_init(&trans->r_itemq, &sort_list); list_for_each_entry_safe(item, n, &sort_list, ri_list) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; switch (ITEM_TYPE(item)) { + case XFS_LI_ICREATE: + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_BUF: - if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) { + if (buf_f->blf_flags & XFS_BLF_CANCEL) { trace_xfs_log_recover_item_reorder_head(log, trans, item, pass); - list_move(&item->ri_list, &trans->r_itemq); + list_move(&item->ri_list, &cancel_list); + break; + } + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { + list_move(&item->ri_list, &inode_buffer_list); break; } + list_move_tail(&item->ri_list, &buffer_list); + break; case XFS_LI_INODE: case XFS_LI_DQUOT: case XFS_LI_QUOTAOFF: @@ -1609,18 +1689,34 @@ xlog_recover_reorder_trans( case XFS_LI_EFI: trace_xfs_log_recover_item_reorder_tail(log, trans, item, pass); - list_move_tail(&item->ri_list, &trans->r_itemq); + list_move_tail(&item->ri_list, &inode_list); break; default: 
xfs_warn(log->l_mp, "%s: unrecognized type of log operation", __func__); ASSERT(0); - return XFS_ERROR(EIO); + /* + * return the remaining items back to the transaction + * item list so they can be freed in caller. + */ + if (!list_empty(&sort_list)) + list_splice_init(&sort_list, &trans->r_itemq); + error = XFS_ERROR(EIO); + goto out; } } +out: ASSERT(list_empty(&sort_list)); - return 0; + if (!list_empty(&buffer_list)) + list_splice(&buffer_list, &trans->r_itemq); + if (!list_empty(&inode_list)) + list_splice_tail(&inode_list, &trans->r_itemq); + if (!list_empty(&inode_buffer_list)) + list_splice_tail(&inode_buffer_list, &trans->r_itemq); + if (!list_empty(&cancel_list)) + list_splice_tail(&cancel_list, &trans->r_itemq); + return error; } /* @@ -1637,8 +1733,8 @@ xlog_recover_reorder_trans( */ STATIC int xlog_recover_buffer_pass1( - struct log *log, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover_item *item) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; struct list_head *bucket; @@ -1678,20 +1774,12 @@ xlog_recover_buffer_pass1( /* * Check to see whether the buffer being recovered has a corresponding - * entry in the buffer cancel record table. If it does then return 1 - * so that it will be cancelled, otherwise return 0. If the buffer is - * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement - * the refcount on the entry in the table and remove it from the table - * if this is the last reference. - * - * We remove the cancel record from the table when we encounter its - * last occurrence in the log so that if the same buffer is re-used - * again after its last cancellation we actually replay the changes - * made at that point. + * entry in the buffer cancel record table. If it is, return the cancel + * buffer structure to the caller. 
*/ -STATIC int -xlog_check_buffer_cancelled( - struct log *log, +STATIC struct xfs_buf_cancel * +xlog_peek_buffer_cancelled( + struct xlog *log, xfs_daddr_t blkno, uint len, ushort flags) @@ -1699,22 +1787,16 @@ xlog_check_buffer_cancelled( struct list_head *bucket; struct xfs_buf_cancel *bcp; - if (log->l_buf_cancel_table == NULL) { - /* - * There is nothing in the table built in pass one, - * so this buffer must not be cancelled. - */ + if (!log->l_buf_cancel_table) { + /* empty table means no cancelled buffers in the log */ ASSERT(!(flags & XFS_BLF_CANCEL)); - return 0; + return NULL; } - /* - * Search for an entry in the cancel table that matches our buffer. - */ bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno); list_for_each_entry(bcp, bucket, bc_list) { if (bcp->bc_blkno == blkno && bcp->bc_len == len) - goto found; + return bcp; } /* @@ -1722,9 +1804,32 @@ xlog_check_buffer_cancelled( * that the buffer is NOT cancelled. */ ASSERT(!(flags & XFS_BLF_CANCEL)); - return 0; + return NULL; +} + +/* + * If the buffer is being cancelled then return 1 so that it will be cancelled, + * otherwise return 0. If the buffer is actually a buffer cancel item + * (XFS_BLF_CANCEL is set), then decrement the refcount on the entry in the + * table and remove it from the table if this is the last reference. + * + * We remove the cancel record from the table when we encounter its last + * occurrence in the log so that if the same buffer is re-used again after its + * last cancellation we actually replay the changes made at that point. + */ +STATIC int +xlog_check_buffer_cancelled( + struct xlog *log, + xfs_daddr_t blkno, + uint len, + ushort flags) +{ + struct xfs_buf_cancel *bcp; + + bcp = xlog_peek_buffer_cancelled(log, blkno, len, flags); + if (!bcp) + return 0; -found: /* * We've go a match, so return 1 so that the recovery of this buffer * is cancelled. 
If this buffer is actually a buffer cancel log @@ -1772,7 +1877,14 @@ xlog_recover_do_inode_buffer( trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f); - inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog; + /* + * Post recovery validation only works properly on CRC enabled + * filesystems. + */ + if (xfs_sb_version_hascrc(&mp->m_sb)) + bp->b_ops = &xfs_inode_buf_ops; + + inodes_per_buf = BBTOB(bp->b_io_length) >> mp->m_sb.sb_inodelog; for (i = 0; i < inodes_per_buf; i++) { next_unlinked_offset = (i * mp->m_sb.sb_inodesize) + offsetof(xfs_dinode_t, di_next_unlinked); @@ -1814,7 +1926,8 @@ xlog_recover_do_inode_buffer( ASSERT(item->ri_buf[item_index].i_addr != NULL); ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0); - ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp)); + ASSERT((reg_buf_offset + reg_buf_bytes) <= + BBTOB(bp->b_io_length)); /* * The current logged region contains a copy of the @@ -1836,12 +1949,361 @@ xlog_recover_do_inode_buffer( buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; + + /* + * If necessary, recalculate the CRC in the on-disk inode. We + * have to leave the inode in a consistent state for whoever + * reads it next.... + */ + xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); + } return 0; } /* + * V5 filesystems know the age of the buffer on disk being recovered. We can + * have newer objects on disk than we are replaying, and so for these cases we + * don't want to replay the current change as that will make the buffer contents + * temporarily invalid on disk. + * + * The magic number might not match the buffer type we are going to recover + * (e.g. reallocated blocks), so we ignore the xfs_buf_log_format flags. Hence + * extract the LSN of the existing object in the buffer based on it's current + * magic number. 
If we don't recognise the magic number in the buffer, then + * return a LSN of -1 so that the caller knows it was an unrecognised block and + * so can recover the buffer. + * + * Note: we cannot rely solely on magic number matches to determine that the + * buffer has a valid LSN - we also need to verify that it belongs to this + * filesystem, so we need to extract the object's UUID and compare it to that + * which we read from the superblock. If the UUIDs don't match, then we've got a + * stale metadata block from an old filesystem instance that we need to recover + * over the top of. + */ +static xfs_lsn_t +xlog_recover_get_buf_lsn( + struct xfs_mount *mp, + struct xfs_buf *bp) +{ + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + void *blk = bp->b_addr; + uuid_t *uuid; + xfs_lsn_t lsn = -1; + + /* v4 filesystems always recover immediately */ + if (!xfs_sb_version_hascrc(&mp->m_sb)) + goto recover_immediately; + + magic32 = be32_to_cpu(*(__be32 *)blk); + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + case XFS_IBT_CRC_MAGIC: + case XFS_IBT_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.s.bb_lsn); + uuid = &btb->bb_u.s.bb_uuid; + break; + } + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: { + struct xfs_btree_block *btb = blk; + + lsn = be64_to_cpu(btb->bb_u.l.bb_lsn); + uuid = &btb->bb_u.l.bb_uuid; + break; + } + case XFS_AGF_MAGIC: + lsn = be64_to_cpu(((struct xfs_agf *)blk)->agf_lsn); + uuid = &((struct xfs_agf *)blk)->agf_uuid; + break; + case XFS_AGFL_MAGIC: + lsn = be64_to_cpu(((struct xfs_agfl *)blk)->agfl_lsn); + uuid = &((struct xfs_agfl *)blk)->agfl_uuid; + break; + case XFS_AGI_MAGIC: + lsn = be64_to_cpu(((struct xfs_agi *)blk)->agi_lsn); + uuid = &((struct xfs_agi *)blk)->agi_uuid; + break; + case XFS_SYMLINK_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsymlink_hdr *)blk)->sl_lsn); + uuid = &((struct xfs_dsymlink_hdr *)blk)->sl_uuid; + 
break; + case XFS_DIR3_BLOCK_MAGIC: + case XFS_DIR3_DATA_MAGIC: + case XFS_DIR3_FREE_MAGIC: + lsn = be64_to_cpu(((struct xfs_dir3_blk_hdr *)blk)->lsn); + uuid = &((struct xfs_dir3_blk_hdr *)blk)->uuid; + break; + case XFS_ATTR3_RMT_MAGIC: + lsn = be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn); + uuid = &((struct xfs_attr3_rmt_hdr *)blk)->rm_uuid; + break; + case XFS_SB_MAGIC: + lsn = be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn); + uuid = &((struct xfs_dsb *)blk)->sb_uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + magicda = be16_to_cpu(((struct xfs_da_blkinfo *)blk)->magic); + switch (magicda) { + case XFS_DIR3_LEAF1_MAGIC: + case XFS_DIR3_LEAFN_MAGIC: + case XFS_DA3_NODE_MAGIC: + lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn); + uuid = &((struct xfs_da3_blkinfo *)blk)->uuid; + break; + default: + break; + } + + if (lsn != (xfs_lsn_t)-1) { + if (!uuid_equal(&mp->m_sb.sb_uuid, uuid)) + goto recover_immediately; + return lsn; + } + + /* + * We do individual object checks on dquot and inode buffers as they + * have their own individual LSN records. Also, we could have a stale + * buffer here, so we have to at least recognise these buffer types. + * + * A noted complexity here is inode unlinked list processing - it logs + * the inode directly in the buffer, but we don't know which inodes have + * been modified, and there is no global buffer LSN. Hence we need to + * recover all inode buffer types immediately. This problem will be + * fixed by logical logging of the unlinked list modifications. 
+ */ + magic16 = be16_to_cpu(*(__be16 *)blk); + switch (magic16) { + case XFS_DQUOT_MAGIC: + case XFS_DINODE_MAGIC: + goto recover_immediately; + default: + break; + } + + /* unknown buffer contents, recover immediately */ + +recover_immediately: + return (xfs_lsn_t)-1; + +} + +/* + * Validate the recovered buffer is of the correct type and attach the + * appropriate buffer operations to them for writeback. Magic numbers are in a + * few places: + * the first 16 bits of the buffer (inode buffer, dquot buffer), + * the first 32 bits of the buffer (most blocks), + * inside a struct xfs_da_blkinfo at the start of the buffer. + */ +static void +xlog_recover_validate_buf_type( + struct xfs_mount *mp, + struct xfs_buf *bp, + xfs_buf_log_format_t *buf_f) +{ + struct xfs_da_blkinfo *info = bp->b_addr; + __uint32_t magic32; + __uint16_t magic16; + __uint16_t magicda; + + magic32 = be32_to_cpu(*(__be32 *)bp->b_addr); + magic16 = be16_to_cpu(*(__be16*)bp->b_addr); + magicda = be16_to_cpu(info->magic); + switch (xfs_blft_from_flags(buf_f)) { + case XFS_BLFT_BTREE_BUF: + switch (magic32) { + case XFS_ABTB_CRC_MAGIC: + case XFS_ABTC_CRC_MAGIC: + case XFS_ABTB_MAGIC: + case XFS_ABTC_MAGIC: + bp->b_ops = &xfs_allocbt_buf_ops; + break; + case XFS_IBT_CRC_MAGIC: + case XFS_FIBT_CRC_MAGIC: + case XFS_IBT_MAGIC: + case XFS_FIBT_MAGIC: + bp->b_ops = &xfs_inobt_buf_ops; + break; + case XFS_BMAP_CRC_MAGIC: + case XFS_BMAP_MAGIC: + bp->b_ops = &xfs_bmbt_buf_ops; + break; + default: + xfs_warn(mp, "Bad btree block magic!"); + ASSERT(0); + break; + } + break; + case XFS_BLFT_AGF_BUF: + if (magic32 != XFS_AGF_MAGIC) { + xfs_warn(mp, "Bad AGF block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agf_buf_ops; + break; + case XFS_BLFT_AGFL_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_AGFL_MAGIC) { + xfs_warn(mp, "Bad AGFL block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agfl_buf_ops; + break; + case XFS_BLFT_AGI_BUF: + if (magic32 != 
XFS_AGI_MAGIC) { + xfs_warn(mp, "Bad AGI block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_agi_buf_ops; + break; + case XFS_BLFT_UDQUOT_BUF: + case XFS_BLFT_PDQUOT_BUF: + case XFS_BLFT_GDQUOT_BUF: +#ifdef CONFIG_XFS_QUOTA + if (magic16 != XFS_DQUOT_MAGIC) { + xfs_warn(mp, "Bad DQUOT block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dquot_buf_ops; +#else + xfs_alert(mp, + "Trying to recover dquots without QUOTA support built in!"); + ASSERT(0); +#endif + break; + case XFS_BLFT_DINO_BUF: + /* + * we get here with inode allocation buffers, not buffers that + * track unlinked list changes. + */ + if (magic16 != XFS_DINODE_MAGIC) { + xfs_warn(mp, "Bad INODE block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_inode_buf_ops; + break; + case XFS_BLFT_SYMLINK_BUF: + if (magic32 != XFS_SYMLINK_MAGIC) { + xfs_warn(mp, "Bad symlink block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_symlink_buf_ops; + break; + case XFS_BLFT_DIR_BLOCK_BUF: + if (magic32 != XFS_DIR2_BLOCK_MAGIC && + magic32 != XFS_DIR3_BLOCK_MAGIC) { + xfs_warn(mp, "Bad dir block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_block_buf_ops; + break; + case XFS_BLFT_DIR_DATA_BUF: + if (magic32 != XFS_DIR2_DATA_MAGIC && + magic32 != XFS_DIR3_DATA_MAGIC) { + xfs_warn(mp, "Bad dir data magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_data_buf_ops; + break; + case XFS_BLFT_DIR_FREE_BUF: + if (magic32 != XFS_DIR2_FREE_MAGIC && + magic32 != XFS_DIR3_FREE_MAGIC) { + xfs_warn(mp, "Bad dir3 free magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_free_buf_ops; + break; + case XFS_BLFT_DIR_LEAF1_BUF: + if (magicda != XFS_DIR2_LEAF1_MAGIC && + magicda != XFS_DIR3_LEAF1_MAGIC) { + xfs_warn(mp, "Bad dir leaf1 magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_dir3_leaf1_buf_ops; + break; + case XFS_BLFT_DIR_LEAFN_BUF: + if (magicda != XFS_DIR2_LEAFN_MAGIC && + magicda != XFS_DIR3_LEAFN_MAGIC) { + xfs_warn(mp, "Bad dir leafn magic!"); + ASSERT(0); + 
break; + } + bp->b_ops = &xfs_dir3_leafn_buf_ops; + break; + case XFS_BLFT_DA_NODE_BUF: + if (magicda != XFS_DA_NODE_MAGIC && + magicda != XFS_DA3_NODE_MAGIC) { + xfs_warn(mp, "Bad da node magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_da3_node_buf_ops; + break; + case XFS_BLFT_ATTR_LEAF_BUF: + if (magicda != XFS_ATTR_LEAF_MAGIC && + magicda != XFS_ATTR3_LEAF_MAGIC) { + xfs_warn(mp, "Bad attr leaf magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_leaf_buf_ops; + break; + case XFS_BLFT_ATTR_RMT_BUF: + if (!xfs_sb_version_hascrc(&mp->m_sb)) + break; + if (magic32 != XFS_ATTR3_RMT_MAGIC) { + xfs_warn(mp, "Bad attr remote magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_attr3_rmt_buf_ops; + break; + case XFS_BLFT_SB_BUF: + if (magic32 != XFS_SB_MAGIC) { + xfs_warn(mp, "Bad SB block magic!"); + ASSERT(0); + break; + } + bp->b_ops = &xfs_sb_buf_ops; + break; + default: + xfs_warn(mp, "Unknown buffer type %d!", + xfs_blft_from_flags(buf_f)); + break; + } +} + +/* * Perform a 'normal' buffer recovery. Each logged region of the * buffer should be copied over the corresponding region in the * given buffer. The bitmap in the buf log format structure indicates @@ -1873,8 +2335,19 @@ xlog_recover_do_reg_buffer( ASSERT(nbits > 0); ASSERT(item->ri_buf[i].i_addr != NULL); ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0); - ASSERT(XFS_BUF_COUNT(bp) >= - ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT)); + ASSERT(BBTOB(bp->b_io_length) >= + ((uint)bit << XFS_BLF_SHIFT) + (nbits << XFS_BLF_SHIFT)); + + /* + * The dirty regions logged in the buffer, even though + * contiguous, may span multiple chunks. This is because the + * dirty region may span a physical page boundary in a buffer + * and hence be split into two separate vectors for writing into + * the log. Hence we need to trim nbits back to the length of + * the current region being copied out of the log. 
+ */ + if (item->ri_buf[i].i_len < (nbits << XFS_BLF_SHIFT)) + nbits = item->ri_buf[i].i_len >> XFS_BLF_SHIFT; /* * Do a sanity check if this is a dquot buffer. Just checking @@ -1895,7 +2368,7 @@ xlog_recover_do_reg_buffer( item->ri_buf[i].i_len, __func__); goto next; } - error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr, + error = xfs_dqcheck(mp, item->ri_buf[i].i_addr, -1, 0, XFS_QMOPT_DOWARN, "dquot_buf_recover"); if (error) @@ -1913,142 +2386,32 @@ xlog_recover_do_reg_buffer( /* Shouldn't be any more regions */ ASSERT(i == item->ri_total); -} - -/* - * Do some primitive error checking on ondisk dquot data structures. - */ -int -xfs_qm_dqcheck( - struct xfs_mount *mp, - xfs_disk_dquot_t *ddq, - xfs_dqid_t id, - uint type, /* used only when IO_dorepair is true */ - uint flags, - char *str) -{ - xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; - int errs = 0; - - /* - * We can encounter an uninitialized dquot buffer for 2 reasons: - * 1. If we crash while deleting the quotainode(s), and those blks got - * used for user data. This is because we take the path of regular - * file deletion; however, the size field of quotainodes is never - * updated, so all the tricks that we play in itruncate_finish - * don't quite matter. - * - * 2. We don't play the quota buffers when there's a quotaoff logitem. - * But the allocation will be replayed so we'll end up with an - * uninitialized quota block. - * - * This is all fine; things are still consistent, and we haven't lost - * any quota information. Just don't complain about bad dquot blks. 
- */ - if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x", - str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC); - errs++; - } - if (ddq->d_version != XFS_DQUOT_VERSION) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, version 0x%x != 0x%x", - str, id, ddq->d_version, XFS_DQUOT_VERSION); - errs++; - } - - if (ddq->d_flags != XFS_DQ_USER && - ddq->d_flags != XFS_DQ_PROJ && - ddq->d_flags != XFS_DQ_GROUP) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : XFS dquot ID 0x%x, unknown flags 0x%x", - str, id, ddq->d_flags); - errs++; - } - - if (id != -1 && id != be32_to_cpu(ddq->d_id)) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : ondisk-dquot 0x%p, ID mismatch: " - "0x%x expected, found id 0x%x", - str, ddq, id, be32_to_cpu(ddq->d_id)); - errs++; - } - - if (!errs && ddq->d_id) { - if (ddq->d_blk_softlimit && - be64_to_cpu(ddq->d_bcount) >= - be64_to_cpu(ddq->d_blk_softlimit)) { - if (!ddq->d_btimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - if (ddq->d_ino_softlimit && - be64_to_cpu(ddq->d_icount) >= - be64_to_cpu(ddq->d_ino_softlimit)) { - if (!ddq->d_itimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - if (ddq->d_rtb_softlimit && - be64_to_cpu(ddq->d_rtbcount) >= - be64_to_cpu(ddq->d_rtb_softlimit)) { - if (!ddq->d_rtbtimer) { - if (flags & XFS_QMOPT_DOWARN) - xfs_alert(mp, - "%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED", - str, (int)be32_to_cpu(ddq->d_id), ddq); - errs++; - } - } - } - - if (!errs || !(flags & XFS_QMOPT_DQREPAIR)) - return errs; - - if (flags & XFS_QMOPT_DOWARN) - xfs_notice(mp, "Re-initializing dquot ID 0x%x", id); /* - * Typically, a repair is only requested 
by quotacheck. + * We can only do post recovery validation on items on CRC enabled + * filesystems as we need to know when the buffer was written to be able + * to determine if we should have replayed the item. If we replay old + * metadata over a newer buffer, then it will enter a temporarily + * inconsistent state resulting in verification failures. Hence for now + * just avoid the verification stage for non-crc filesystems */ - ASSERT(id != -1); - ASSERT(flags & XFS_QMOPT_DQREPAIR); - memset(d, 0, sizeof(xfs_dqblk_t)); - - d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC); - d->dd_diskdq.d_version = XFS_DQUOT_VERSION; - d->dd_diskdq.d_flags = type; - d->dd_diskdq.d_id = cpu_to_be32(id); - - return errs; + if (xfs_sb_version_hascrc(&mp->m_sb)) + xlog_recover_validate_buf_type(mp, bp, buf_f); } /* * Perform a dquot buffer recovery. - * Simple algorithm: if we have found a QUOTAOFF logitem of the same type + * Simple algorithm: if we have found a QUOTAOFF log item of the same type * (ie. USR or GRP), then just toss this buffer away; don't recover it. * Else, treat it as a regular buffer and do recovery. */ STATIC void xlog_recover_do_dquot_buffer( - xfs_mount_t *mp, - xlog_t *log, - xlog_recover_item_t *item, - xfs_buf_t *bp, - xfs_buf_log_format_t *buf_f) + struct xfs_mount *mp, + struct xlog *log, + struct xlog_recover_item *item, + struct xfs_buf *bp, + struct xfs_buf_log_format *buf_f) { uint type; @@ -2097,19 +2460,22 @@ xlog_recover_do_dquot_buffer( * over the log during recovery. During the first we build a table of * those buffers which have been cancelled, and during the second we * only replay those buffers which do not have corresponding cancel - * records in the table. See xlog_recover_do_buffer_pass[1,2] above + * records in the table. See xlog_recover_buffer_pass[1,2] above * for more details on the implementation of the table of cancel records.
*/ STATIC int xlog_recover_buffer_pass2( - xlog_t *log, - xlog_recover_item_t *item) + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; int error; uint buf_flags; + xfs_lsn_t lsn; /* * In this pass we only want to recover all the buffers which have @@ -2123,21 +2489,28 @@ xlog_recover_buffer_pass2( trace_xfs_log_recover_buf_recover(log, buf_f); - buf_flags = XBF_LOCK; - if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF)) - buf_flags |= XBF_MAPPED; + buf_flags = 0; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) + buf_flags |= XBF_UNMAPPED; bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags); + buf_flags, NULL); if (!bp) return XFS_ERROR(ENOMEM); error = bp->b_error; if (error) { xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)"); - xfs_buf_relse(bp); - return error; + goto out_release; } + /* + * recover the buffer only if we get an LSN from it and it's less than + * the lsn of the transaction we are replaying. + */ + lsn = xlog_recover_get_buf_lsn(mp, bp); + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) + goto out_release; + if (buf_f->blf_flags & XFS_BLF_INODE_BUF) { error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f); } else if (buf_f->blf_flags & @@ -2147,7 +2520,7 @@ xlog_recover_buffer_pass2( xlog_recover_do_reg_buffer(mp, item, bp, buf_f); } if (error) - return XFS_ERROR(error); + goto out_release; /* * Perform delayed write on the buffer. Asynchronous writes will be @@ -2155,35 +2528,114 @@ xlog_recover_buffer_pass2( * * Also make sure that only inode buffers with good sizes stay in * the buffer cache. The kernel moves inodes in buffers of 1 block - * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode + * or mp->m_inode_cluster_size bytes, whichever is bigger. 
The inode * buffers in the log can be a different size if the log was generated * by an older kernel using unclustered inode buffers or a newer kernel * running with a different inode cluster size. Regardless, if the - * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE) - * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep + * the inode buffer size isn't MAX(blocksize, mp->m_inode_cluster_size) + * for *our* value of mp->m_inode_cluster_size, then we need to keep * the buffer out of the buffer cache so that the buffer won't * overlap with future reads of those inodes. */ if (XFS_DINODE_MAGIC == be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) && - (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize, - (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) { + (BBTOB(bp->b_io_length) != MAX(log->l_mp->m_sb.sb_blocksize, + (__uint32_t)log->l_mp->m_inode_cluster_size))) { xfs_buf_stale(bp); error = xfs_bwrite(bp); } else { ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); } +out_release: xfs_buf_relse(bp); return error; } +/* + * Inode fork owner changes + * + * If we have been told that we have to reparent the inode fork, it's because an + * extent swap operation on a CRC enabled filesystem has been done and we are + * replaying it. We need to walk the BMBT of the appropriate fork and change the + * owners of it. + * + * The complexity here is that we don't have an inode context to work with, so + * after we've replayed the inode we need to instantiate one. This is where the + * fun begins. + * + * We are in the middle of log recovery, so we can't run transactions. That + * means we cannot use cache coherent inode instantiation via xfs_iget(), as + * that will result in the corresponding iput() running the inode through + * xfs_inactive(). If we've just replayed an inode core that changes the link + * count to zero (i.e. 
it's been unlinked), then xfs_inactive() will run + * transactions (bad!). + * + * So, to avoid this, we instantiate an inode directly from the inode core we've + * just recovered. We have the buffer still locked, and all we really need to + * instantiate is the inode core and the forks being modified. We can do this + * manually, then run the inode btree owner change, and then tear down the + * xfs_inode without having to run any transactions at all. + * + * Also, because we don't have a transaction context available here but need to + * gather all the buffers we modify for writeback so we pass the buffer_list + * instead for the operation to use. + */ + +STATIC int +xfs_recover_inode_owner_change( + struct xfs_mount *mp, + struct xfs_dinode *dip, + struct xfs_inode_log_format *in_f, + struct list_head *buffer_list) +{ + struct xfs_inode *ip; + int error; + + ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); + + ip = xfs_inode_alloc(mp, in_f->ilf_ino); + if (!ip) + return ENOMEM; + + /* instantiate the inode */ + xfs_dinode_from_disk(&ip->i_d, dip); + ASSERT(ip->i_d.di_version >= 3); + + error = xfs_iformat_fork(ip, dip); + if (error) + goto out_free_ip; + + + if (in_f->ilf_fields & XFS_ILOG_DOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + + if (in_f->ilf_fields & XFS_ILOG_AOWNER) { + ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); + error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, + ip->i_ino, buffer_list); + if (error) + goto out_free_ip; + } + +out_free_ip: + xfs_inode_free(ip); + return error; +} + STATIC int xlog_recover_inode_pass2( - xlog_t *log, - xlog_recover_item_t *item) + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { xfs_inode_log_format_t *in_f; xfs_mount_t *mp = log->l_mp; @@ -2196,6 +2648,7 @@ xlog_recover_inode_pass2( int attr_index; uint 
fields; xfs_icdinode_t *dicp; + uint isize; int need_free = 0; if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) { @@ -2220,8 +2673,8 @@ xlog_recover_inode_pass2( } trace_xfs_log_recover_inode_recover(log, in_f); - bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, - XBF_LOCK); + bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, + &xfs_inode_buf_ops); if (!bp) { error = ENOMEM; goto error; @@ -2229,8 +2682,7 @@ xlog_recover_inode_pass2( error = bp->b_error; if (error) { xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)"); - xfs_buf_relse(bp); - goto error; + goto out_release; } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); @@ -2240,29 +2692,52 @@ xlog_recover_inode_pass2( * like an inode! */ if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) { - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld", __func__, dip, bp, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; - goto error; + goto out_release; } dicp = item->ri_buf[1].i_addr; if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) { - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, ino %Ld", __func__, item, in_f->ilf_ino); XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)", XFS_ERRLEVEL_LOW, mp); error = EFSCORRUPTED; - goto error; + goto out_release; + } + + /* + * If the inode has an LSN in it, recover the inode only if it's less + * than the lsn of the transaction we are replaying. Note: we still + * need to replay an owner change even though the inode is more recent + * than the transaction as there is no guarantee that all the btree + * blocks are more recent than this transaction, too. 
+ */ + if (dip->di_version >= 3) { + xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + trace_xfs_log_recover_inode_skip(log, in_f); + error = 0; + goto out_owner_change; + } } - /* Skip replay when the on disk inode is newer than the log one */ - if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { + /* + * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes + * are transactional and if ordering is necessary we can determine that + * more accurately by the LSN field in the V3 inode core. Don't trust + * the inode versions we might be changing them here - use the + * superblock flag to determine whether we need to look at di_flushiter + * to skip replay when the on disk inode is newer than the log one + */ + if (!xfs_sb_version_hascrc(&mp->m_sb) && + dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) { /* * Deal with the wrap case, DI_MAX_FLUSH is less * than smaller numbers @@ -2271,12 +2746,12 @@ xlog_recover_inode_pass2( dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) { /* do nothing */ } else { - xfs_buf_relse(bp); trace_xfs_log_recover_inode_skip(log, in_f); error = 0; - goto error; + goto out_release; } } + /* Take the opportunity to reset the flush iteration count */ dicp->di_flushiter = 0; @@ -2285,13 +2760,12 @@ xlog_recover_inode_pass2( (dicp->di_format != XFS_DINODE_FMT_BTREE)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad regular inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; - goto error; + goto out_release; } } else if (unlikely(S_ISDIR(dicp->di_mode))) { if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) && @@ -2299,19 +2773,17 @@ xlog_recover_inode_pass2( (dicp->di_format != XFS_DINODE_FMT_LOCAL)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); 
xfs_alert(mp, "%s: Bad dir inode log record, rec ptr 0x%p, " "ino ptr = 0x%p, ino bp = 0x%p, ino %Ld", __func__, item, dip, bp, in_f->ilf_ino); error = EFSCORRUPTED; - goto error; + goto out_release; } } if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){ XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld", @@ -2319,38 +2791,37 @@ xlog_recover_inode_pass2( dicp->di_nextents + dicp->di_anextents, dicp->di_nblocks); error = EFSCORRUPTED; - goto error; + goto out_release; } if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, " "dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__, item, dip, bp, in_f->ilf_ino, dicp->di_forkoff); error = EFSCORRUPTED; - goto error; + goto out_release; } - if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) { + isize = xfs_icdinode_size(dicp->di_version); + if (unlikely(item->ri_buf[1].i_len > isize)) { XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)", XFS_ERRLEVEL_LOW, mp, dicp); - xfs_buf_relse(bp); xfs_alert(mp, "%s: Bad inode log record length %d, rec ptr 0x%p", __func__, item->ri_buf[1].i_len, item); error = EFSCORRUPTED; - goto error; + goto out_release; } /* The core is in in-core format */ - xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr); + xfs_dinode_to_disk(dip, dicp); /* the rest is in on-disk format */ - if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) { - memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode), - item->ri_buf[1].i_len - sizeof(struct xfs_icdinode)); + if (item->ri_buf[1].i_len > isize) { + memcpy((char *)dip + isize, + item->ri_buf[1].i_addr + isize, + 
item->ri_buf[1].i_len - isize); } fields = in_f->ilf_fields; @@ -2366,7 +2837,7 @@ xlog_recover_inode_pass2( } if (in_f->ilf_size == 2) - goto write_inode_buffer; + goto out_owner_change; len = item->ri_buf[2].i_len; src = item->ri_buf[2].i_addr; ASSERT(in_f->ilf_size <= 4); @@ -2427,16 +2898,23 @@ xlog_recover_inode_pass2( default: xfs_warn(log->l_mp, "%s: Invalid flag", __func__); ASSERT(0); - xfs_buf_relse(bp); error = EIO; - goto error; + goto out_release; } } -write_inode_buffer: +out_owner_change: + if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) + error = xfs_recover_inode_owner_change(mp, dip, in_f, + buffer_list); + /* re-generate the checksum. */ + xfs_dinode_calc_crc(log->l_mp, dip); + ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); + xfs_buf_delwri_queue(bp, buffer_list); + +out_release: xfs_buf_relse(bp); error: if (need_free) @@ -2445,14 +2923,14 @@ error: } /* - * Recover QUOTAOFF records. We simply make a note of it in the xlog_t + * Recover QUOTAOFF records. We simply make a note of it in the xlog * structure, so that we know not to do any dquot item or dquot buffer recovery, * of that type. 
*/ STATIC int xlog_recover_quotaoff_pass1( - xlog_t *log, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover_item *item) { xfs_qoff_logformat_t *qoff_f = item->ri_buf[0].i_addr; ASSERT(qoff_f); @@ -2476,8 +2954,10 @@ xlog_recover_quotaoff_pass1( */ STATIC int xlog_recover_dquot_pass2( - xlog_t *log, - xlog_recover_item_t *item) + struct xlog *log, + struct list_head *buffer_list, + struct xlog_recover_item *item, + xfs_lsn_t current_lsn) { xfs_mount_t *mp = log->l_mp; xfs_buf_t *bp; @@ -2524,20 +3004,18 @@ xlog_recover_dquot_pass2( */ dq_f = item->ri_buf[0].i_addr; ASSERT(dq_f); - error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + error = xfs_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, "xlog_recover_dquot_pass2 (log copy)"); if (error) return XFS_ERROR(EIO); ASSERT(dq_f->qlf_len == 1); - error = xfs_read_buf(mp, mp->m_ddev_targp, - dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), - 0, &bp); - if (error) { - xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)"); + error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, + NULL); + if (error) return error; - } + ASSERT(bp); ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); @@ -2546,22 +3024,40 @@ xlog_recover_dquot_pass2( * was among a chunk of dquots created earlier, and we did some * minimal initialization then. */ - error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, + error = xfs_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN, "xlog_recover_dquot_pass2"); if (error) { xfs_buf_relse(bp); return XFS_ERROR(EIO); } + /* + * If the dquot has an LSN in it, recover the dquot only if it's less + * than the lsn of the transaction we are replaying. 
+ */ + if (xfs_sb_version_hascrc(&mp->m_sb)) { + struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; + xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); + + if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { + goto out_release; + } + } + memcpy(ddq, recddq, item->ri_buf[1].i_len); + if (xfs_sb_version_hascrc(&mp->m_sb)) { + xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + XFS_DQUOT_CRC_OFF); + } ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_target->bt_mount == mp); bp->b_iodone = xlog_recover_iodone; - xfs_buf_delwri_queue(bp); - xfs_buf_relse(bp); + xfs_buf_delwri_queue(bp, buffer_list); - return (0); +out_release: + xfs_buf_relse(bp); + return 0; } /* @@ -2573,9 +3069,9 @@ xlog_recover_dquot_pass2( */ STATIC int xlog_recover_efi_pass2( - xlog_t *log, - xlog_recover_item_t *item, - xfs_lsn_t lsn) + struct xlog *log, + struct xlog_recover_item *item, + xfs_lsn_t lsn) { int error; xfs_mount_t *mp = log->l_mp; @@ -2611,8 +3107,8 @@ xlog_recover_efi_pass2( */ STATIC int xlog_recover_efd_pass2( - xlog_t *log, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover_item *item) { xfs_efd_log_format_t *efd_formatp; xfs_efi_log_item_t *efip = NULL; @@ -2642,7 +3138,8 @@ xlog_recover_efd_pass2( * xfs_trans_ail_delete() drops the * AIL lock. */ - xfs_trans_ail_delete(ailp, lip); + xfs_trans_ail_delete(ailp, lip, + SHUTDOWN_CORRUPT_INCORE); xfs_efi_item_free(efip); spin_lock(&ailp->xa_lock); break; @@ -2650,13 +3147,100 @@ xlog_recover_efd_pass2( } lip = xfs_trans_ail_cursor_next(ailp, &cur); } - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); return 0; } /* + * This routine is called when an inode create format structure is found in a + * committed transaction in the log. Its purpose is to initialise the inodes + * being allocated on disk.
This requires us to get inode cluster buffers that + * match the range to be initialised, stamped with inode templates and written + * by delayed write so that subsequent modifications will hit the cached buffer + * and only need writing out at the end of recovery. + */ +STATIC int +xlog_recover_do_icreate_pass2( + struct xlog *log, + struct list_head *buffer_list, + xlog_recover_item_t *item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_icreate_log *icl; + xfs_agnumber_t agno; + xfs_agblock_t agbno; + unsigned int count; + unsigned int isize; + xfs_agblock_t length; + + icl = (struct xfs_icreate_log *)item->ri_buf[0].i_addr; + if (icl->icl_type != XFS_LI_ICREATE) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad type"); + return EINVAL; + } + + if (icl->icl_size != 1) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad icl size"); + return EINVAL; + } + + agno = be32_to_cpu(icl->icl_ag); + if (agno >= mp->m_sb.sb_agcount) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agno"); + return EINVAL; + } + agbno = be32_to_cpu(icl->icl_agbno); + if (!agbno || agbno == NULLAGBLOCK || agbno >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad agbno"); + return EINVAL; + } + isize = be32_to_cpu(icl->icl_isize); + if (isize != mp->m_sb.sb_inodesize) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad isize"); + return EINVAL; + } + count = be32_to_cpu(icl->icl_count); + if (!count) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count"); + return EINVAL; + } + length = be32_to_cpu(icl->icl_length); + if (!length || length >= mp->m_sb.sb_agblocks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad length"); + return EINVAL; + } + + /* existing allocation is fixed value */ + ASSERT(count == mp->m_ialloc_inos); + ASSERT(length == mp->m_ialloc_blks); + if (count != mp->m_ialloc_inos || + length != mp->m_ialloc_blks) { + xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count
2"); + return EINVAL; + } + + /* + * Inode buffers can be freed. Do not replay the inode initialisation as + * we could be overwriting something written after this inode buffer was + * cancelled. + * + * XXX: we need to iterate all buffers and only init those that are not + * cancelled. I think that a more fine grained factoring of + * xfs_ialloc_inode_init may be appropriate here to enable this to be + * done easily. + */ + if (xlog_check_buffer_cancelled(log, + XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) + return 0; + + xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, + be32_to_cpu(icl->icl_gen)); + return 0; +} + +/* * Free up any resources allocated by the transaction * * Remember that EFIs, EFDs, and IUNLINKs are handled later. @@ -2681,11 +3265,111 @@ xlog_recover_free_trans( kmem_free(trans); } +STATIC void +xlog_recover_buffer_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_buf_log_format *buf_f = item->ri_buf[0].i_addr; + struct xfs_mount *mp = log->l_mp; + + if (xlog_peek_buffer_cancelled(log, buf_f->blf_blkno, + buf_f->blf_len, buf_f->blf_flags)) { + return; + } + + xfs_buf_readahead(mp->m_ddev_targp, buf_f->blf_blkno, + buf_f->blf_len, NULL); +} + +STATIC void +xlog_recover_inode_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + struct xfs_inode_log_format ilf_buf; + struct xfs_inode_log_format *ilfp; + struct xfs_mount *mp = log->l_mp; + int error; + + if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { + ilfp = item->ri_buf[0].i_addr; + } else { + ilfp = &ilf_buf; + memset(ilfp, 0, sizeof(*ilfp)); + error = xfs_inode_item_format_convert(&item->ri_buf[0], ilfp); + if (error) + return; + } + + if (xlog_peek_buffer_cancelled(log, ilfp->ilf_blkno, ilfp->ilf_len, 0)) + return; + + xfs_buf_readahead(mp->m_ddev_targp, ilfp->ilf_blkno, + ilfp->ilf_len, &xfs_inode_buf_ra_ops); +} + +STATIC void +xlog_recover_dquot_ra_pass2( + struct xlog *log, + struct xlog_recover_item 
*item) +{ + struct xfs_mount *mp = log->l_mp; + struct xfs_disk_dquot *recddq; + struct xfs_dq_logformat *dq_f; + uint type; + + + if (mp->m_qflags == 0) + return; + + recddq = item->ri_buf[1].i_addr; + if (recddq == NULL) + return; + if (item->ri_buf[1].i_len < sizeof(struct xfs_disk_dquot)) + return; + + type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP); + ASSERT(type); + if (log->l_quotaoffs_flag & type) + return; + + dq_f = item->ri_buf[0].i_addr; + ASSERT(dq_f); + ASSERT(dq_f->qlf_len == 1); + + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, + XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); +} + +STATIC void +xlog_recover_ra_pass2( + struct xlog *log, + struct xlog_recover_item *item) +{ + switch (ITEM_TYPE(item)) { + case XFS_LI_BUF: + xlog_recover_buffer_ra_pass2(log, item); + break; + case XFS_LI_INODE: + xlog_recover_inode_ra_pass2(log, item); + break; + case XFS_LI_DQUOT: + xlog_recover_dquot_ra_pass2(log, item); + break; + case XFS_LI_EFI: + case XFS_LI_EFD: + case XFS_LI_QUOTAOFF: + default: + break; + } +} + STATIC int xlog_recover_commit_pass1( - struct log *log, - struct xlog_recover *trans, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover *trans, + struct xlog_recover_item *item) { trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1); @@ -2698,6 +3382,7 @@ xlog_recover_commit_pass1( case XFS_LI_EFI: case XFS_LI_EFD: case XFS_LI_DQUOT: + case XFS_LI_ICREATE: /* nothing to do in pass 1 */ return 0; default: @@ -2710,23 +3395,29 @@ xlog_recover_commit_pass1( STATIC int xlog_recover_commit_pass2( - struct log *log, - struct xlog_recover *trans, - xlog_recover_item_t *item) + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct xlog_recover_item *item) { trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); switch (ITEM_TYPE(item)) { case XFS_LI_BUF: - return xlog_recover_buffer_pass2(log, item); + return xlog_recover_buffer_pass2(log, 
buffer_list, item, + trans->r_lsn); case XFS_LI_INODE: - return xlog_recover_inode_pass2(log, item); + return xlog_recover_inode_pass2(log, buffer_list, item, + trans->r_lsn); case XFS_LI_EFI: return xlog_recover_efi_pass2(log, item, trans->r_lsn); case XFS_LI_EFD: return xlog_recover_efd_pass2(log, item); case XFS_LI_DQUOT: - return xlog_recover_dquot_pass2(log, item); + return xlog_recover_dquot_pass2(log, buffer_list, item, + trans->r_lsn); + case XFS_LI_ICREATE: + return xlog_recover_do_icreate_pass2(log, buffer_list, item); case XFS_LI_QUOTAOFF: /* nothing to do in pass2 */ return 0; @@ -2738,6 +3429,26 @@ xlog_recover_commit_pass2( } } +STATIC int +xlog_recover_items_pass2( + struct xlog *log, + struct xlog_recover *trans, + struct list_head *buffer_list, + struct list_head *item_list) +{ + struct xlog_recover_item *item; + int error = 0; + + list_for_each_entry(item, item_list, ri_list) { + error = xlog_recover_commit_pass2(log, trans, + buffer_list, item); + if (error) + return error; + } + + return error; +} + /* * Perform the transaction. 
* @@ -2746,12 +3457,20 @@ xlog_recover_commit_pass2( */ STATIC int xlog_recover_commit_trans( - struct log *log, + struct xlog *log, struct xlog_recover *trans, int pass) { - int error = 0; - xlog_recover_item_t *item; + int error = 0; + int error2; + int items_queued = 0; + struct xlog_recover_item *item; + struct xlog_recover_item *next; + LIST_HEAD (buffer_list); + LIST_HEAD (ra_list); + LIST_HEAD (done_list); + + #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100 hlist_del(&trans->r_list); @@ -2759,23 +3478,51 @@ xlog_recover_commit_trans( if (error) return error; - list_for_each_entry(item, &trans->r_itemq, ri_list) { - if (pass == XLOG_RECOVER_PASS1) + list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) { + switch (pass) { + case XLOG_RECOVER_PASS1: error = xlog_recover_commit_pass1(log, trans, item); - else - error = xlog_recover_commit_pass2(log, trans, item); + break; + case XLOG_RECOVER_PASS2: + xlog_recover_ra_pass2(log, item); + list_move_tail(&item->ri_list, &ra_list); + items_queued++; + if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) { + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); + items_queued = 0; + } + + break; + default: + ASSERT(0); + } + if (error) - return error; + goto out; + } + +out: + if (!list_empty(&ra_list)) { + if (!error) + error = xlog_recover_items_pass2(log, trans, + &buffer_list, &ra_list); + list_splice_tail_init(&ra_list, &done_list); } + if (!list_empty(&done_list)) + list_splice_init(&done_list, &trans->r_itemq); + xlog_recover_free_trans(trans); - return 0; + + error2 = xfs_buf_delwri_submit(&buffer_list); + return error ? 
error : error2; } STATIC int xlog_recover_unmount_trans( - struct log *log, - xlog_recover_t *trans) + struct xlog *log) { /* Do nothing now */ xfs_warn(log->l_mp, "%s: Unmount LR", __func__); @@ -2793,9 +3540,9 @@ xlog_recover_unmount_trans( */ STATIC int xlog_recover_process_data( - xlog_t *log, + struct xlog *log, struct hlist_head rhash[], - xlog_rec_header_t *rhead, + struct xlog_rec_header *rhead, xfs_caddr_t dp, int pass) { @@ -2849,7 +3596,7 @@ xlog_recover_process_data( trans, pass); break; case XLOG_UNMOUNT_TRANS: - error = xlog_recover_unmount_trans(log, trans); + error = xlog_recover_unmount_trans(log); break; case XLOG_WAS_CONT_TRANS: error = xlog_recover_add_to_cont_trans(log, @@ -2874,8 +3621,10 @@ xlog_recover_process_data( error = XFS_ERROR(EIO); break; } - if (error) + if (error) { + xlog_recover_free_trans(trans); return error; + } } dp += be32_to_cpu(ohead->oh_len); num_logops--; @@ -2918,13 +3667,14 @@ xlog_recover_process_efi( * This will pull the EFI from the AIL and * free the memory associated with it. 
*/ + set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); xfs_efi_release(efip, efip->efi_format.efi_nextents); return XFS_ERROR(EIO); } } tp = xfs_trans_alloc(mp, 0); - error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) goto abort_error; efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); @@ -2967,7 +3717,7 @@ abort_error: */ STATIC int xlog_recover_process_efis( - xlog_t *log) + struct xlog *log) { xfs_log_item_t *lip; xfs_efi_log_item_t *efip; @@ -3008,7 +3758,7 @@ xlog_recover_process_efis( lip = xfs_trans_ail_cursor_next(ailp, &cur); } out: - xfs_trans_ail_cursor_done(ailp, &cur); + xfs_trans_ail_cursor_done(&cur); spin_unlock(&ailp->xa_lock); return error; } @@ -3030,8 +3780,7 @@ xlog_recover_clear_agi_bucket( int error; tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); - error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), - 0, 0, 0); + error = xfs_trans_reserve(tp, &M_RES(mp)->tr_clearagi, 0, 0); if (error) goto out_abort; @@ -3079,7 +3828,7 @@ xlog_recover_process_one_iunlink( /* * Get the on disk inode to find the next inode in the bucket. */ - error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK); + error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &ibp, 0, 0); if (error) goto fail_iput; @@ -3128,7 +3877,7 @@ xlog_recover_process_one_iunlink( */ STATIC void xlog_recover_process_iunlinks( - xlog_t *log) + struct xlog *log) { xfs_mount_t *mp; xfs_agnumber_t agno; @@ -3161,116 +3910,83 @@ xlog_recover_process_iunlinks( */ continue; } + /* + * Unlock the buffer so that it can be acquired in the normal + * course of the transaction to truncate and free each inode. + * Because we are not racing with anyone else here for the AGI + * buffer, we don't even need to hold it locked to read the + * initial unlinked bucket entries out of the buffer. 
We keep + * buffer reference though, so that it stays pinned in memory + * while we need the buffer. + */ agi = XFS_BUF_TO_AGI(agibp); + xfs_buf_unlock(agibp); for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) { agino = be32_to_cpu(agi->agi_unlinked[bucket]); while (agino != NULLAGINO) { - /* - * Release the agi buffer so that it can - * be acquired in the normal course of the - * transaction to truncate and free the inode. - */ - xfs_buf_relse(agibp); - agino = xlog_recover_process_one_iunlink(mp, agno, agino, bucket); - - /* - * Reacquire the agibuffer and continue around - * the loop. This should never fail as we know - * the buffer was good earlier on. - */ - error = xfs_read_agi(mp, NULL, agno, &agibp); - ASSERT(error == 0); - agi = XFS_BUF_TO_AGI(agibp); } } - - /* - * Release the buffer for the current agi so we can - * go on to the next one. - */ - xfs_buf_relse(agibp); + xfs_buf_rele(agibp); } mp->m_dmevmask = mp_dmevmask; } - -#ifdef DEBUG -STATIC void -xlog_pack_data_checksum( - xlog_t *log, - xlog_in_core_t *iclog, - int size) -{ - int i; - __be32 *up; - uint chksum = 0; - - up = (__be32 *)iclog->ic_datap; - /* divide length by 4 to get # words */ - for (i = 0; i < (size >> 2); i++) { - chksum ^= be32_to_cpu(*up); - up++; - } - iclog->ic_header.h_chksum = cpu_to_be32(chksum); -} -#else -#define xlog_pack_data_checksum(log, iclog, size) -#endif - /* - * Stamp cycle number in every block + * Unpack the log buffer data and crc check it. If the check fails, issue a + * warning if and only if the CRC in the header is non-zero. This makes the + * check an advisory warning, and the zero CRC check will prevent failure + * warnings from being emitted when upgrading the kernel from one that does not + * add CRCs by default.
+ * + * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log + * corruption failure */ -void -xlog_pack_data( - xlog_t *log, - xlog_in_core_t *iclog, - int roundoff) +STATIC int +xlog_unpack_data_crc( + struct xlog_rec_header *rhead, + xfs_caddr_t dp, + struct xlog *log) { - int i, j, k; - int size = iclog->ic_offset + roundoff; - __be32 cycle_lsn; - xfs_caddr_t dp; - - xlog_pack_data_checksum(log, iclog, size); - - cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); - - dp = iclog->ic_datap; - for (i = 0; i < BTOBB(size) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; - } - - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xlog_in_core_2_t *xhdr = iclog->ic_data; - - for ( ; i < BTOBB(size); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; - *(__be32 *)dp = cycle_lsn; - dp += BBSIZE; + __le32 crc; + + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + if (crc != rhead->h_crc) { + if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + xfs_alert(log->l_mp, + "log record CRC mismatch: found 0x%x, expected 0x%x.", + le32_to_cpu(rhead->h_crc), + le32_to_cpu(crc)); + xfs_hex_dump(dp, 32); } - for (i = 1; i < log->l_iclog_heads; i++) { - xhdr[i].hic_xheader.xh_cycle = cycle_lsn; - } + /* + * If we've detected a log record corruption, then we can't + * recover past this point. Abort recovery if we are enforcing + * CRC protection by punting an error back up the stack. 
+ */ + if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) + return EFSCORRUPTED; } + + return 0; } -STATIC void +STATIC int xlog_unpack_data( - xlog_rec_header_t *rhead, + struct xlog_rec_header *rhead, xfs_caddr_t dp, - xlog_t *log) + struct xlog *log) { int i, j, k; + int error; + + error = xlog_unpack_data_crc(rhead, dp, log); + if (error) + return error; for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { @@ -3287,12 +4003,14 @@ xlog_unpack_data( dp += BBSIZE; } } + + return 0; } STATIC int xlog_valid_rec_header( - xlog_t *log, - xlog_rec_header_t *rhead, + struct xlog *log, + struct xlog_rec_header *rhead, xfs_daddr_t blkno) { int hlen; @@ -3335,7 +4053,7 @@ xlog_valid_rec_header( */ STATIC int xlog_do_recovery_pass( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, int pass) @@ -3418,9 +4136,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, - rhash, rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, + rhash, rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3525,14 +4247,19 @@ xlog_do_recovery_pass( * - order is important. 
*/ error = xlog_bread_offset(log, 0, - bblks - split_bblks, hbp, + bblks - split_bblks, dbp, offset + BBTOB(split_bblks)); if (error) goto bread_err2; } - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks; } @@ -3557,9 +4284,13 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - xlog_unpack_data(rhead, offset, log); - if ((error = xlog_recover_process_data(log, rhash, - rhead, offset, pass))) + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; + + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) goto bread_err2; blk_no += bblks + hblks; } @@ -3587,7 +4318,7 @@ xlog_do_recovery_pass( */ STATIC int xlog_do_log_recovery( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { @@ -3638,7 +4369,7 @@ xlog_do_log_recovery( */ STATIC int xlog_do_recover( - xlog_t *log, + struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk) { @@ -3650,11 +4381,8 @@ xlog_do_recover( * First replay the images in the log. */ error = xlog_do_log_recovery(log, head_blk, tail_blk); - if (error) { + if (error) return error; - } - - xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1); /* * If IO errors happened during recovery, bail out. @@ -3676,15 +4404,21 @@ xlog_do_recover( /* * Now that we've finished replaying all buffer and inode - * updates, re-read in the superblock. + * updates, re-read in the superblock and reverify it. 
*/ bp = xfs_getsb(log->l_mp, 0); XFS_BUF_UNDONE(bp); ASSERT(!(XFS_BUF_ISWRITE(bp))); - ASSERT(!(XFS_BUF_ISDELAYWRITE(bp))); XFS_BUF_READ(bp); XFS_BUF_UNASYNC(bp); - xfsbdstrat(log->l_mp, bp); + bp->b_ops = &xfs_sb_buf_ops; + + if (XFS_FORCED_SHUTDOWN(log->l_mp)) { + xfs_buf_relse(bp); + return XFS_ERROR(EIO); + } + + xfs_buf_iorequest(bp); error = xfs_buf_iowait(bp); if (error) { xfs_buf_ioerror_alert(bp, __func__); @@ -3717,7 +4451,7 @@ xlog_do_recover( */ int xlog_recover( - xlog_t *log) + struct xlog *log) { xfs_daddr_t head_blk, tail_blk; int error; @@ -3742,6 +4476,25 @@ xlog_recover( return error; } + /* + * Version 5 superblock log feature mask validation. We know the + * log is dirty so check if there are any unknown log features + * in what we need to recover. If there are unknown features + * (e.g. unsupported transactions), then simply reject the + * attempt at recovery before touching anything. + */ + if (XFS_SB_VERSION_NUM(&log->l_mp->m_sb) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb, + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) { + xfs_warn(log->l_mp, +"Superblock has unknown incompatible log features (0x%x) enabled.\n" +"The log can not be fully and/or safely recovered by this kernel.\n" +"Please recover the log on a kernel that supports the unknown features.", + (log->l_mp->m_sb.sb_features_log_incompat & + XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)); + return EINVAL; + } + xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", log->l_mp->m_logname ? log->l_mp->m_logname : "internal"); @@ -3763,7 +4516,7 @@ xlog_recover( */ int xlog_recover_finish( - xlog_t *log) + struct xlog *log) { /* * Now we're ready to do the transactions needed for the @@ -3810,7 +4563,7 @@ xlog_recover_finish( */ void xlog_recover_check_summary( - xlog_t *log) + struct xlog *log) { xfs_mount_t *mp; xfs_agf_t *agfp; |
